#!/usr/bin/env python3
# coding: utf-8

# Author: Tanja Krueger
# Aim: cross-referencing of the intermediate clean fasta file with the internal and external CD 40 reduced files in project 1
# Input: a fasta file with clean sequences
# Input: a tsv file with the inner CDHIT reduction and the outer CDHIT reduction
# Output: a fasta file that contains only the clean fasta sequences that overlap between the two input files
#######################################################################################################################
# Import libraries needed.
import os
import re
import argparse
from datetime import datetime
from Bio import SeqIO
import pandas as pd

####################################################################################################################
# Path prefix that is prepended to project-relative paths (e.g. the log file below).
# "" is meant for running the code with make from the project folder (the documented default).
cl=""
# Prefix for executing this file from the Code/python folder.
# NOTE(review): this assignment is active and overrides the "" above, although the
# original comment asked to "uncomment" it — confirm which working directory is intended.
cl="../../"

#######################################################################################################################
# Command line interface: two required positional arguments, both plain strings.
parser = argparse.ArgumentParser(
    prog="data_overlap_part2.py",
    description="this script filters clean fastas ",
)

# (name, help text) of every positional argument, in the order they are expected.
for arg_name, arg_help in (
    ("cf", " clean Fasta"),
    ("cdhit", "  double cdhit (inner and outer redundancy reduction applied) "),
):
    parser.add_argument(arg_name, type=str, help=arg_help)

# Parse sys.argv; argparse exits with a usage message when an argument is missing.
args = parser.parse_args()

########################################################################################################################

# Append a run header to the shared log file.
# NOTE(review): the handle is intentionally left open here because later parts of
# the script may keep writing to `out_file` — make sure it is closed once logging is done.
out_file = open(f"{cl}Data/derived/log.log", "a")
dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")

# Separator line marking the start of this run's entry.
print("##########", file=out_file)
# The execution notice goes to the log file and is echoed on stdout.
print(f"program {parser.prog} was executed at {dt_string}", file=out_file)
print(f"program {parser.prog} was executed at {dt_string}")
# Separate the two argument values with a space; they used to be written back to
# back, which made the two paths indistinguishable in the log.
print(f"argument passed: {args.cf} {args.cdhit}", file=out_file)
print("number of required arguments: 2", file=out_file)

########################################################################################################################
# Define headers for better human handling.
head_cf = ["info", "seq"]
head_cdh = ["spec", "type"]

# Read the clean fasta file: one entry per record id, holding the record
# description and the sequence as a plain string. Because a dict is used, a
# record whose id repeats replaces the earlier record with the same id.
with open(args.cf) as handle:
    cf_records = {
        record.id: [record.description, str(record.seq)]
        for record in SeqIO.parse(handle, "fasta")
    }

# One row per record id with the columns named directly. Unlike transposing and
# renaming afterwards, this also works for a fasta file without any records
# (the rename used to fail with a length-mismatch ValueError on the empty frame).
df_cf = pd.DataFrame.from_dict(cf_records, orient="index", columns=head_cf)
print(df_cf)