#!/usr/bin/env python3
# coding: utf-8

#######################################################################################################################
# Author: Tanja Krüger
# Aim: This file removes duplicate IDs and fragments from a multi-FASTA file
# Input: fasta file with potential duplicates.
# Output: a fasta file without any duplicates.

#######################################################################################################################
# Import libraries needed.
import pandas as pd
import re
import argparse
from datetime import datetime
from Bio import SeqIO

#######################################################################################################################
# Parse the command line: one positional argument, the path to the toxin FASTA file.
parser = argparse.ArgumentParser(
    prog="deletion_duplicates_fragments.py",
    description="removal of duplicated IDs in a fasta file",
)
parser.add_argument("tox", type=str, help="fasta with toxins dataset")
args = parser.parse_args()

# #################################################################################################
# Path prefix that depends on where the user wants to run the code from. The default (empty
# prefix) is for running the code with make from the project folder.
cl = ""
# To execute this file from the Code/python folder instead, uncomment the next line.
# cl="../../"

# #################################################################################################
# Append a run header to the log file: a separator, the program name with a
# timestamp, the argument received and the expected argument count.
# The log handle is left open here so later steps can append to it.
dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
out_file = open(f"{cl}Data/derived/log.log", "a")
log_header = (
    "##########",
    f"program {parser.prog} was executed at {dt_string}",
    f"argument passed: {args.tox}",
    "number of required arguments: 1",
)
for header_line in log_header:
    print(header_line, file=out_file)
 
########################################################################################################################
# Read the FASTA file given on the command line into a mapping keyed by record ID.
# Keying on record.id collapses duplicated IDs into a single entry: the last
# record seen with a given ID overwrites the earlier ones.
with open(args.tox) as handle:
    records = {
        record.id: [record.description, str(record.seq)]
        for record in SeqIO.parse(handle, "fasta")
    }

# orient="index" yields one row per ID. Naming the columns at construction keeps
# the table well-formed even when the input contains no records (assigning two
# column names to a transposed empty frame raises a length-mismatch error).
df_tox = pd.DataFrame.from_dict(records, orient="index", columns=["info", "seq"])

# Delete the proteins labeled as fragments.
df_tox = df_tox.loc[~df_tox["info"].str.contains("Fragment")]

def _derive_savename(path):
    """Return the output base name derived from the input FASTA path.

    Two layouts are recognised, tried in this order:
    ``.../animal/<name>.fasta`` and ``...derived/<name>_raw.fasta``.

    Args:
        path: Path of the input FASTA file as passed on the command line.

    Returns:
        The ``<name>`` part captured from the first layout that matches.

    Raises:
        ValueError: If ``path`` matches neither layout.
    """
    # Raw strings avoid the invalid "\/" escape sequence, and the escaped dot
    # matches only the literal extension separator.
    for pattern in (r"/animal/(.*)\.fasta", r"derived/(.*)_raw\.fasta"):
        match = re.search(pattern, path)
        if match is not None:
            return match.group(1)
    raise ValueError(
        f"cannot derive an output name from {path!r}: expected "
        "'.../animal/<name>.fasta' or '...derived/<name>_raw.fasta'"
    )


try:
    savename = _derive_savename(args.tox)

    # Write the filtered records to a new FASTA file. The context manager
    # closes the file even if writing fails part-way through.
    with open(f"{cl}Data/derived/{savename}.fasta", "w") as fasta_file:
        for info, seq in zip(df_tox["info"], df_tox["seq"]):
            print(f">{info}", file=fasta_file)
            print(seq, file=fasta_file)

    # Log entry.
    print(f"the file {savename} was created an stored under derived data",
          file=out_file)
finally:
    # Close the log file whether or not the steps above succeeded.
    out_file.close()
