#!/usr/bin/env python3
# coding: utf-8

# Author: Tanja Krueger
# Aim: cross-referencing of the intermediate CDHIT results with the existing tables
# Input: a _listReps.csv file containing the sequences that are kept after a certain level of reduction
# Input: an _ID_Type.tsv file prior to reduction
# Output: a reduced _ID_Type_CDHIT<level>.tsv file

#######################################################################################################################
# Import libraries needed.
import os
import re
import argparse
from datetime import datetime
from Bio import SeqIO
import pandas as pd
#######################################################################################################################
# Get the arguments from the command line
parser = argparse.ArgumentParser(prog="CDHIT_crossref_part1",
                                 description="this script parses the CDHIT results")
parser.add_argument("listReps",
                    type=str,
                    help=" the _listReps.csv from the CDHIT reduction")
parser.add_argument("IDType",
                    type=str,
                    help="  _ID_Type_file ")

args = parser.parse_args()

####################################################################################################################
# Open the log file in append mode. The context manager guarantees that the
# file is flushed and closed even if one of the steps below raises.
with open("Data/derived/log.log", "a") as out_file:
    dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    # Write the run header to the log file (and echo the start to stdout).
    print("##########", file=out_file)
    print(f"program {parser.prog} was executed at {dt_string}", file=out_file)
    print(f"program {parser.prog} was executed at {dt_string}")
    # Separate the two paths with a space so they stay readable in the log.
    print(f"argument passed: {args.listReps} {args.IDType}", file=out_file)
    print("number of required arguments: 2", file=out_file)

    ####################################################################################################################
    # Extract the naming convention from the input paths BEFORE doing any work,
    # so that a wrongly named input fails fast with a clear message instead of
    # an AttributeError on a failed regex match after the merge.
    # Raw strings avoid invalid escape sequences; the dots are escaped so they
    # only match a literal ".".
    m = re.search(r"/derived/(.*)\.tsv", args.IDType)  # input name
    lev = re.search(r"CDHITintermediateResults/CDHIT(\d+)_",
                    args.listReps)  # level of CDHIT reduction
    # The two matches below extract the dataset name from each path.
    n1 = re.search(r"CDHITintermediateResults/CDHIT(\d+)_(.*)_listReps\.csv",
                   args.listReps)
    n2 = re.search(r"/derived/(.*)_ID_Type\.tsv", args.IDType)
    if lev is None or n1 is None:
        print(f"invalid listReps path: {args.listReps}", file=out_file)
        parser.error(f"listReps path '{args.listReps}' does not match "
                     "'CDHITintermediateResults/CDHIT<level>_<dataset>_listReps.csv'")
    if m is None or n2 is None:
        print(f"invalid IDType path: {args.IDType}", file=out_file)
        parser.error(f"IDType path '{args.IDType}' does not match "
                     "'/derived/<dataset>_ID_Type.tsv'")

    ####################################################################################################################
    # Open the Reps result file after CD reduction (tab separated, no header).
    df_reduced = pd.read_csv(args.listReps, sep='\t', header=None)
    # Keep only the first column and drop the CDHIT identity suffix.
    # Don't use rstrip here: it removes characters independent of position.
    # NOTE(review): any whitespace preceding the suffix is kept as-is - confirm
    # that the accessions in the listReps file carry no trailing blanks.
    df_reduced = df_reduced[0].str.removesuffix("at 100.00%")
    # Open the toxins set before reduction.
    df_original = pd.read_csv(args.IDType, sep='\t', header=None)
    # Join the two datasets on the accession number (first column) and keep
    # only the inner overlap; what is lost are the reduced IDs.
    df_original_reduced = pd.merge(df_original,
                                   df_reduced,
                                   left_on=0,
                                   right_on=0,
                                   how="inner")

    # Check that both inputs are based on the same dataset. A mismatch is only
    # reported, the output is still written.
    if n1.group(2) != n2.group(1):
        print("the two files that are crossreferenced are based on diffrent datasets")
    else:
        print("the sets match")
    print(n1.group(2))
    print(n2.group(1))

    savename = f"Data/derived/{m.group(1)}_CDHIT{lev.group(1)}.tsv"
    # Save the dataframe to a tsv without index and header.
    df_original_reduced.to_csv(savename,
                               sep="\t",
                               index=False,
                               header=False)

    ####################################################################################################################
    print(f"the file {savename} was created an stored under derived data",
          file=out_file)
