#!/usr/bin/env python3
# coding: utf-8

########################################################################################################################
# This script produces a fasta file with scrambled sequences.
# Input:
# -fasta file with IDs and sequences (its path must contain "/derived/")
# Output:
# -fasta file with the same IDs as the input, each prefixed with "dumb_", and with each sequence randomly scrambled

########################################################################################################################
# Import of all packages needed
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import argparse
import random
import sys
from Bio import SeqIO
import copy
import os.path
from datetime import datetime

########################################################################################################################
# Command-line interface: a single positional argument, the path to the input fasta file.
parser = argparse.ArgumentParser(
    prog="scrambling_fasta_sequences.py",
    description="scrambles sequences in a fasta file and adds 'dumb' to the ID",
)
parser.add_argument(
    "fasta",
    type=str,
    help="any fasta file with at least one sequence",
)
# Reads sys.argv; argparse exits with a usage message if the argument is missing.
args = parser.parse_args()

# Extract the fasta naming convention with regex from the passed argument:
# fasta_name is everything that follows the first "/derived/" in the path.
# A raw string is used so the pattern contains no invalid escape sequences.
m = re.search(r"/derived/(.*)", args.fasta)
if m is None:
    # Without this guard a path lacking "/derived/" fails with an opaque
    # AttributeError on m.group(1); report a proper usage error instead.
    parser.error(f"the fasta path must contain '/derived/', got: {args.fasta}")
fasta_name = m.group(1)

# Open the log file and write the header for this run.
# The file is opened in append mode, so each run adds its entry after the existing content.
out_file = open("../../Data/derived/log.log", "a")
dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
run_header = (
    "##########",
    f"program {parser.prog} was executed at {dt_string}",
    f"argument passed: {args.fasta}",
    "number of required arguments: 1",
    "argument should contain: a fasta file with at least one sequence and the corresponding ID",
)
for header_line in run_header:
    print(header_line, file=out_file)

# Open the output fasta that will receive the scrambled sequences.
# Opening with "w" creates the file, or truncates an existing one, before the input is parsed.
fasta_file_with_scrambled_seqs = open(f"../../Data/derived/scrambled_{fasta_name}", "w")

# The original, unscrambled fasta is the path passed on the command line.
fasta_file = args.fasta

# Parse the fasta file into a dictionary mapping each record's description to its sequence.
# NOTE(review): records with an identical description overwrite one another here,
# so only the last sequence for such a description is kept.
fasta_dict = {
    entry.description: entry.seq
    for entry in SeqIO.parse(fasta_file, "fasta")
}


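## Debugging aid: uncomment the two lines below to replace the parsed input with a
## hard-coded test record, and comment them out again afterwards.
#fasta_dict={}
#fasta_dict["id1"]="MKKA VIVEN KGCAT CSI GAAC"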
# Define a function that scrambles one sequence at a time.
def seq_scrambler(seq) -> str:
    """Return the characters of *seq* in a random order.

    The argument is converted with ``str()`` first, so any object whose string
    form is the sequence can be passed. The result is a string with the same
    length and the same characters as that string form.
    """
    residues = str(seq)
    # Sampling every position without replacement yields a random permutation.
    return "".join(random.sample(residues, k=len(residues)))

# Build a dictionary of scrambled sequences keyed by the original description prefixed with "dumb_".
scrambled_fasta_dict = {
    "dumb_" + str(description): seq_scrambler(sequence)
    for description, sequence in fasta_dict.items()
}

# Write the scrambled fasta file: one ">" header line followed by one sequence line per record.
for header, scrambled_seq in scrambled_fasta_dict.items():
    fasta_file_with_scrambled_seqs.write(f">{header}\n{scrambled_seq}\n")

# Logs: record which output file was produced, then close both file handles.
out_file.write(f"scrambled_{fasta_name}: was stored under derived data\n")
out_file.close()
fasta_file_with_scrambled_seqs.close()


