#! /usr/bin/env python3

#######################################################################################################################
"""

#author Tanja Krueger

all such code will be indicated
#Purpose of script: This script takes the Data/derived folder as input and runs the CDHIT reduction.
#Input: a directory with clean fasta files
#Output: for every input file, CDHIT writes two files: the reduced fasta file, and a cluster file with the Ids and the clusters they belong to.
"""
#######################################################################################################################
# Import libraries needed.
import os
import re
import argparse
from datetime import datetime
from Bio import SeqIO

#######################################################################################################################
# Get the arguments from the command line
# Build the command-line interface: exactly one required positional argument,
# the directory holding the cleaned fasta files.
parser = argparse.ArgumentParser(
    prog="callCDHIT",
    description="this python script runs multiple CDHIT reductions",
)
parser.add_argument(
    "DataDir",
    type=str,
    help="the data directory with clean fasta files (Data/derived) ",
)
# Parsing happens at module level; argparse prints a usage message and exits
# when the positional argument is missing.
args = parser.parse_args()

####################################################################################################################
# Open and write to the log file
# Open the project log in append mode. The handle is deliberately left open
# here because later parts of the script keep writing to it.
out_file = open("Data/derived/log.log", "a")
dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")

# The same banner goes to the log file and to the console.
run_banner = f"program {parser.prog} was executed at {dt_string}"
header_lines = [
    "##########",
    run_banner,
    f"argument passed: {args.DataDir}",
    "number of required arguments: 1",
    "argument should contain: the data/derived directory of the project",
]
# Write the run header to the log file, one entry per line.
for header_line in header_lines:
    print(header_line, file=out_file)
print(run_banner)
########################################################################################################################



def cdhitReduc(cdhit, file_in, file_out, c, n, T):
    """Run one CD-HIT reduction and fail loudly if it does not succeed.

    The program is started directly (no shell), so input and output paths
    containing spaces or shell metacharacters are passed through unchanged.

    Args:
        cdhit: Command used to start CD-HIT. It is split with shell-like
            quoting rules, so it may carry extra leading options.
        file_in: Path of the fasta file to reduce (passed as ``-i``).
        file_out: Path of the output file (passed as ``-o``).
        c: Sequence identity threshold (passed as ``-c``).
        n: Word size (passed as ``-n``).
        T: Number of threads (passed as ``-T``).

    Raises:
        AssertionError: If the program cannot be started or exits with a
            non-zero status.
    """
    import shlex
    import subprocess

    command = [
        *shlex.split(str(cdhit)),
        "-i", str(file_in),
        "-o", str(file_out),
        "-c", str(c),
        "-n", str(n),
        "-T", str(T),
    ]
    try:
        exit_status = subprocess.run(command, check=False).returncode
    except OSError as err:
        # A missing or non-executable binary used to surface as a non-zero
        # shell exit status; keep reporting it as the same failure.
        raise AssertionError("calling CDHit failed") from err
    # Raised explicitly instead of using ``assert`` so the check is not
    # stripped when Python runs with -O.
    if exit_status != 0:
        raise AssertionError("calling CDHit failed")



def checkFolder(dir_raw_data):
    """Return the entry names in *dir_raw_data* that contain "_clean.fasta".

    The match is a plain substring test on the entry name, so names that
    merely contain "_clean.fasta" (not only those ending with it) are kept.
    Names are returned in the order produced by ``os.listdir``.
    """
    import os

    clean_fasta_names = []
    for entry_name in os.listdir(dir_raw_data):
        if "_clean.fasta" in entry_name:
            clean_fasta_names.append(entry_name)
    return clean_fasta_names

#########################################################################################################################
# Iteration over the levels of CDHIT reduction to use
# Identity levels (in percent) at which every clean fasta file is reduced.
level_of_CDHIT = [100, 90, 40]
# All reduced files are written below this folder.
results_dir = "Data/derived/CDHITintermediateResults"

try:
    # Make sure the output folder exists before the first reduction writes to it.
    os.makedirs(results_dir, exist_ok=True)
    # Selection of the clean fasta files from the provided folder. The listing
    # does not depend on the level, so it is done once.
    files_to_run = checkFolder(args.DataDir)
    for lev in level_of_CDHIT:
        # Word size selection fitting to the level of CDHIT. Rules taken from CDHIT manual.
        if lev >= 70:
            word_size = 5
        elif lev >= 60:
            word_size = 4
        elif lev >= 50:
            word_size = 3
        else:
            word_size = 2
        # Iteration over the clean fasta files.
        for i in files_to_run:
            print("################################")
            print(f"level_of_CDHIT={lev}")
            print(f"used file {i}")
            # Extraction of the naming convention: everything before "_clean.fasta".
            m = re.search(r"(.*)_clean\.fasta", i)
            savename = f"{results_dir}/CDHIT{lev}_{m.group(1)}"
            # CDHIT reduction of one file and saving under the folder
            # CDHITintermediateResults. The input is read from the folder the
            # file names were listed from.
            cdhitReduc("cdhit", os.path.join(args.DataDir, i), savename, lev / 100, word_size, 8)
            # Logged only after the reduction succeeded, and written to the log
            # file rather than to stdout.
            print(f"the file {i} was CDHIT reduced at level{lev}", file=out_file)
finally:
    # Close the log even when a reduction fails, so buffered entries are flushed.
    out_file.close()
########################################################################################################################


