#!/usr/bin/env python3
# coding: utf-8

#######################################################################################################################

#######################################################################################################################
# Import libraries needed.
import requests
from requests import Session
import pandas as pd
import numpy as np
import re
import argparse
import random
from datetime import datetime
import csv
from pandas.io.json import json_normalize
from Bio import SeqIO

#######################################################################################################################
# Command-line interface: one positional argument, "tox", the FASTA file
# holding the toxins dataset.
parser = argparse.ArgumentParser(
    prog="deletion_duplicates.py",
    description="removal of duplicated IDs in a fasta file",
)
parser.add_argument("tox", type=str, help="fasta with toxins dataset")
args = parser.parse_args()

####################################################################################################################
# Setting up the output and the log file.
# Prefix prepended to every "Data/derived/..." path used by this script.
cl = "../../"

# Date and time of this run, recorded in the log.
dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")

# Append the details of this run to the log file. The context manager closes
# the handle (and therefore flushes the entries) as soon as they are written,
# instead of leaving the file open for the rest of the script.
with open(f"{cl}Data/derived/log.log", "a") as out_file:
    print("##########", file=out_file)
    print(f"program {parser.prog} was executed at {dt_string}", file=out_file)
    print(f"argument passed: {args.tox}", file=out_file)
    print("number of required arguments: 1", file=out_file)


########################################################################################################################
## Load the records from the provided FASTA file.
# The file path comes from the command line.
# NOTE: for some reason SeqIO does not recognize some of the sequences, so the file is read with SeqIO here and
# read again further below without it.
# Read the FASTA file with Biopython. Records are collected in a dict keyed
# by record id, so a repeated id keeps only its last record. The resulting
# frame has one row per id with the full description and the sequence.
with open(args.tox) as handle:
    seqio_records = {}
    for rec in SeqIO.parse(handle, "fasta"):
        seqio_records[rec.id] = [rec.description, str(rec.seq)]
df_at1 = pd.DataFrame(seqio_records).T
df_at1.columns = ["info", "seq"]

# Alternative reading of the FASTA file without using SeqIO.
#
# Every line starting with ">" opens a new record whose key is the full header
# text (without the ">"); the lines that follow are its sequence. Lines
# starting with "#" and blank lines are skipped. A multi-line sequence keeps
# its line breaks, so it can be written back out in the same layout.
records = {}
current = None  # sequence lines of the record being read; None before the first header

with open(args.tox) as f:
    for raw_line in f:
        # Remove only the line terminator. Slicing off the last character
        # unconditionally would eat a real character when the file does not
        # end with a newline.
        line = raw_line.rstrip("\n")
        if not line or line.startswith("#"):
            continue
        if line.startswith(">"):
            # A repeated header replaces the earlier record, which prevents
            # duplicated keys.
            current = []
            records[line[1:]] = current
        elif current is not None:
            current.append(line)
        # Sequence lines that appear before any header belong to no record
        # and are ignored (they used to raise a NameError).

tox_dict = {header: "\n".join(parts) for header, parts in records.items()}
df_tox2 = pd.DataFrame.from_dict(tox_dict, orient="index")

# Number of records found by each of the two readers.
print(len(df_at1))
print(len(df_tox2))

# Headers found by the manual reader that are absent from the descriptions
# returned by SeqIO. The description set is built once instead of once per
# header, and the headers are visited in the order they appear in df_tox2 so
# the selection (and everything derived from it) is the same on every run.
seqio_descriptions = set(df_at1.loc[:, "info"])
indexes = [header for header in df_tox2.index if header not in seqio_descriptions]
c = len(indexes)
print(c)
unrec_data = df_tox2.loc[indexes]


# Write the records SeqIO did not return to their own FASTA file. The context
# manager closes the file, so all records are flushed to disk instead of
# depending on interpreter shutdown to release the handle.
with open(f"{cl}Data/derived/unopenable_sequences.fasta", "w") as unrec_fasta_file:
    for ind in unrec_data.index:
        print(f">{ind}", file=unrec_fasta_file)
        # Column 0 is the single column of the manually parsed frame: the sequence text.
        print(unrec_data.loc[ind, 0], file=unrec_fasta_file)
