#!/usr/bin/env python3
# coding: utf-8

# Author: Tanja Krueger
# Aim: this file checks the external redundancy between two files. The internal redundancy was already
#       established with CDHIT, but sequences that pop up in the toxic set must not be in the control set simultaneously

#Input: the CDHIT100 reduced file of all_animal_proteins
#Input: the CDHIT100 reduced file of animal toxins
#Input: the CDHIT100 reduced file of control proteins

#Output: the external reduced animal toxins and control proteins
#         Meaning: none of the sequences inside the toxins and control set pop up in the other


# This script opens both the internal reduced combined sets (all_animal_proteins_...CDHIT100)
# and the toxic (animal_toxins_combined...CDHIT100)
# the unique toxic IDs are subtracted from the combined.
# Ideally the leftover sequences are EXACTLY the same as the animal control proteins.

#######################################################################################################################
# Import libraries needed.
import os
import re
import argparse
from datetime import datetime
from Bio import SeqIO
import pandas as pd
#######################################################################################################################
# Command-line interface: three required positional file paths.
# Each entry is (argument name, help text shown by --help).
_POSITIONAL_HELP = (
    ("CD100allprots", "  internal reduced combined sets (all_animal_proteins_...CDHIT100)"),
    ("CD100toxins", "  internal reduced toxins (animal_toxins_combined...CDHIT100)"),
    ("CD100control", " internal reduced control poroteins(animal_control_proteins.. CDHIT100"),
)
parser = argparse.ArgumentParser(
    description="this python script checks the external redundancy between two files",
    prog="data_overlap_part1",
)
# Register the positionals in declaration order, which is also the order expected on the command line.
for _arg_name, _arg_help in _POSITIONAL_HELP:
    parser.add_argument(_arg_name, help=_arg_help, type=str)
args = parser.parse_args()

####################################################################################################################
# Path prefix pointing to the project root. The default ("") assumes that the script is started
# with make from the project folder.
cl = ""
# If one wants to execute this file from the Code/python folder uncomment the next line
#cl="../../" #option for running the code  under the Code/python folder
#####################################################################################################################

# Open the shared log file in append mode. The handle stays open for the rest of the script
# and is closed by the final statement of the file.
out_file = open(f"{cl}Data/derived/log.log", "a")
dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
_run_banner = f"program {parser.prog} was executed at {dt_string}"
# The run banner goes both to the log file and to stdout.
print("##########", file=out_file)
print(_run_banner, file=out_file)
print(_run_banner)
# The three paths are separated by a blank; without a separator they were fused into a single
# unreadable string in the log.
print(f"argument passed: {args.CD100allprots} {args.CD100toxins} {args.CD100control}",
      file=out_file)
print("number of required arguments: 3", file=out_file)

########################################################################################################################
# Load the three tab-separated, header-less input tables.
def _load_indexed_table(path):
    """Read a header-less TSV file and return it with column 0 as the row index."""
    return pd.read_csv(path, sep='\t', header=None).set_index(0, drop=True)


# Combined set (toxins + control proteins) after CD100 reduction.
df_all = _load_indexed_table(args.CD100allprots)
print(df_all)
# Toxin set after CD100 reduction.
df_tox = _load_indexed_table(args.CD100toxins)
print(df_tox)
# Control protein set after CD100 reduction.
df_ctr = _load_indexed_table(args.CD100control)
print(df_ctr)




# Remove the control sequences from the combined set to obtain the toxin candidates.
# errors="ignore": a control ID that is absent from the combined set is skipped instead of raising
# a KeyError; such an absence is exactly the overlap situation this script is meant to handle.
df_tox_new = df_all.drop(df_ctr.index, axis=0, errors="ignore")

# Report every toxin ID that cannot be removed from the combined set because it is not
# (or no longer) part of it. A set gives constant-time lookups; an ID is discarded once it has
# been "removed", so a repeated toxin ID is reported just like a failed second drop would be.
_remaining_ids = set(df_all.index)
c = 0  # running number of toxin IDs that were not found in the combined set
for i in df_tox.index:
    if i in _remaining_ids:
        _remaining_ids.discard(i)
        continue
    c += 1
    print(c)
    print(f"ID {i} was not found probably because it was CD100 reduced form the shared set.")

# Control candidates = combined set minus all toxin IDs. Boolean masking builds a new frame,
# so df_all itself is not modified.
df_ctr_new = df_all.loc[~df_all.index.isin(df_tox.index)]

# Additional check against data leakage: every remaining ID must already be part of the original
# control set. IDs that are not are reported and removed.
_is_foreign = ~df_ctr_new.index.isin(df_ctr.index)
for i in df_ctr_new.index[_is_foreign]:
    print(f"the ID {i} was dropped")
df_ctr_new = df_ctr_new.loc[~_is_foreign]

# Compare the reduced control set with the original one and report the outcome.
# An overlap is reported on stdout, the "no overlap" outcome is written to the log file.
_kept_ids = set(df_ctr_new.index)
if set(df_ctr.index) != _kept_ids:
    print("the outer reduction showed that the toxins and control proteins overlapped")
    # Original control IDs that did not survive the reduction (in their original order).
    ID_removed = [i for i in df_ctr.index if i not in _kept_ids]
    print(f"the following sequences were subsequently removed from the control proteins: {ID_removed}")
else:
    print("the outer reduction showed that no sequences are both in toxin and the control", file=out_file)


#######################################################################################################################
# Save the results separately, independent of whether something was removed or not.
def _derived_stem(path):
    """Return the part of *path* between ``/derived/`` and the ``.tsv`` suffix.

    Raises:
        ValueError: if *path* does not have the form ``.../derived/<name>.tsv``.
    """
    # Raw string: "\/" is an invalid escape sequence in a normal string literal.
    # The dot before "tsv" is escaped so that it only matches a literal dot.
    match = re.search(r"/derived/(.*)\.tsv", path)
    if match is None:
        raise ValueError(
            f"cannot derive an output name from {path!r}: expected a path like '.../derived/<name>.tsv'"
        )
    return match.group(1)


# Extraction of the naming convention: the output keeps the input name plus the suffix "_outerCDHIT".
tox_savename = f"{cl}Data/derived/{_derived_stem(args.CD100toxins)}_outerCDHIT.tsv"
# The original toxins are kept; only the control protein set is reduced, because mislabelled
# sequences are more likely to be actual toxins.
df_tox.to_csv(tox_savename, sep="\t", index=True, header=False)

ctr_savename = f"{cl}Data/derived/{_derived_stem(args.CD100control)}_outerCDHIT.tsv"
print(f"##########################\n the new dataset is:\n{df_ctr_new}")
df_ctr_new.to_csv(ctr_savename, sep="\t", index=True, header=False)

#######################################################################################################################
print(f"the files {ctr_savename} and {tox_savename} were created and saved under Data/derived")
# Release the log file handle that was opened at the start of the script.
out_file.close()
