#!/usr/bin/env python3
# coding: utf-8

# Author: Tanja Krueger
# Aim: crossreferencing of the intermediate clean fasta file with the internal and external CD 100 reduced files
# Input: a fasta file with clean sequences
# Input: a tsv file with the inner CDHIT reduction and the outer CDHIT reduction
# Output: a fasta file that contains only the clean fasta sequences that overlap between the two input files
#######################################################################################################################
# Import libraries needed.
import os
import re
import argparse
from datetime import datetime
from Bio import SeqIO
import pandas as pd

####################################################################################################################
# Prefix prepended to the "Data/derived/..." paths used below; it depends on where the user runs the code from.
# Default (running the code with make from the project folder): use the empty prefix.
#cl=""
# If one wants to execute this file from the Code/python folder, use the next line instead.
# NOTE(review): the Code/python variant is the one currently active, not the default described above.
cl="../../"

#######################################################################################################################
# Read the two required positional arguments from the command line.
parser = argparse.ArgumentParser(
    prog="data_overlap_part2.py",
    description="this script filters clean fastas ",
)

# (argument name, help text) pairs, registered in the order they must be passed.
for _arg_name, _arg_help in (
    ("cf", " clean Fasta"),
    ("cdhit", "  double cdhit (inner and outer redundancy reduction applied) "),
):
    parser.add_argument(_arg_name, type=str, help=_arg_help)

args = parser.parse_args()

####################################################################################################################
# Open the log file in append mode and record this run.
# NOTE(review): the handle is kept open at module level; nothing in this section closes it.
out_file = open(f"{cl}Data/derived/log.log", "a")
dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
run_banner = f"program {parser.prog} was executed at {dt_string}"

# Write the run header to the log file and echo the banner to the console.
print("##########", file=out_file)
print(run_banner, file=out_file)
print(run_banner)
# Separate the two paths with a space; previously they were fused into one
# unreadable token in the log.
print(f"argument passed: {args.cf} {args.cdhit}", file=out_file)
print("number of required arguments: 2", file=out_file)

########################################################################################################################
# Column names chosen for better human handling.
head_cf = ["info", "seq"]
head_cdh = ["spec", "type"]

# Read the clean fasta file: one entry per record id holding [description, sequence].
with open(args.cf) as handle:
    records_by_id = {
        record.id: [record.description, str(record.seq)]
        for record in SeqIO.parse(handle, "fasta")
    }

# The dict keys become columns, so transpose to get one row per record id.
df_cl = pd.DataFrame(records_by_id).T
df_cl.columns = head_cf

# Read the double reduced CDHIT result file (tab separated, no header row)
# and use its first column as the index.
df_cdhit = pd.read_csv(args.cdhit, sep="\t", header=None).set_index(0, drop=True)
df_cdhit.columns = head_cdh
#print(f"the cdhit data is:\n{df_cdhit}")
#print(f"the clean fasta is:\n{df_cl}")


# Check that both input paths follow the naming convention the output name is built from.
# The patterns are raw strings so the backslashes reach the regex engine unchanged
# and Python does not warn about the invalid string escape "\/".
dataset1 = re.search(r"\/derived\/(.*)_clean.fasta", args.cf)  # dataset name of the clean fasta
dataset2 = re.search(r"\/derived\/(.*)_ID_Type_CDHIT", args.cdhit)  # dataset name of the CDHIT file
if dataset1 is None or dataset2 is None:
    # re.search returns None on no match; stop with a readable message instead of
    # failing later with an AttributeError on .group().
    parser.error(
        "the provided arguments do not follow the expected naming convention "
        "(.../derived/<dataset>_clean.fasta and .../derived/<dataset>_ID_Type_CDHIT...)"
    )
print(dataset1.group(1))
print(dataset2.group(1))
# NOTE(review): the two dataset names are only printed, not compared - confirm
# whether a mismatch should abort the run.

# Construct the new reduced fasta file name from the CDHIT file name.
name = re.search(r"\/derived\/(.*)_ID_Type_(.*).tsv", args.cdhit)
print(name)
if name is None:
    parser.error(
        "the cdhit argument does not match .../derived/<dataset>_ID_Type_<suffix>.tsv"
    )
savename = f"{cl}Data/derived/{name.group(1)}_{name.group(2)}_clean2.fasta"
print(savename)
# Opening in "w" mode creates (or truncates) the output file right away.
fasta_file = open(savename, "w")
    # Get the subset of df_cl by filtering by the accession numbers.

# Check how many of the CDHIT accession numbers are missing from the clean fasta
# (i.e. have "leaked"). Each id is tested individually: the previous version looked
# up the whole CDHIT index on every iteration, so a single missing id made every
# iteration fail and every id was reported as missing.
missing_ids = df_cdhit.index[~df_cdhit.index.isin(df_cl.index)]
c = 0  # running count of missing ids; stays 0 when nothing is missing
for c, i in enumerate(missing_ids, start=1):
    print(c)
    print(i)

# NOTE(review): the subset-and-save step below is disabled, so the file opened as
# fasta_file is left empty. Re-enabling it as written raises a KeyError whenever
# ids are missing from df_cl.
#df_cl = df_cl.loc[df_cdhit.index]
    # Save.
# for i in df_cl.index:
#     print(f">{df_cl.loc[i,'info']}",file=fasta_file)
#     print(df_cl.loc[i,"seq"],file=fasta_file)
#     # Log entry.
# print(f"the file {savename} was created an stored under derived data",
#         file=out_file)

#except:
    # If the passed arguments are not based on the same dataset, the underlying naming convention is wrong
    #print("the two provided arguments are not based on the same underlying dataset")

########################################################################################################################
# Release both file handles so buffered log output is flushed deterministically.
fasta_file.close()
out_file.close()
