#!/usr/bin/env python3
# coding: utf-8

######################################################################################################################
# Author: Tanja Krüger
# Aim: this file visualizes the results of a fold cluster analysis
# Input: the file of the original IDs that belong to protein group1
# Input: the file of the clustered proteins as tsv at 100SST (Sequence similarity threshold)
# Input: the file of the clustered proteins as tsv at 75SST
# Input: the file of the clustered proteins as tsv at 50SST
# Input: the file of the clustered proteins as tsv at 25SST
# Output:  the protein cluster numbers of different thresholds

########################################################################################################################
import pandas as pd
import matplotlib.pyplot as plt
import argparse
from datetime import datetime

########################################################################################################################
# Command-line interface: four required positional arguments, one per
# clustering threshold (100, 75, 50 and 25). Each value is later passed to
# pd.read_csv as the path of a tab-separated file.
#
# argparse applies %-formatting to every help string, so a literal percent
# sign has to be written as "%%". With a bare "%" the sequence "% s" is read
# as a conversion specifier and --help prints garbage instead of the text.
parser = argparse.ArgumentParser(prog="fold_cluster_analysis",
                                 description="visualized the fold clustering results")

parser.add_argument("r100",
                    type=str,
                    help="  100%% sequence similarity  ")
parser.add_argument("r75",
                    type=str,
                    help="  75%% sequence similarity  ")
parser.add_argument("r50",
                    type=str,
                    help="  50%% sequence similarity  ")
parser.add_argument("r25",
                    type=str,
                    help="  25%% sequence similarity")

# Parse the four positional paths given on the command line.
args = parser.parse_args()
########################################################################################################################
# Path prefix for all project-relative files. The default (empty) assumes the
# script is started from the project folder, e.g. through make.
cl = ""
# To run this file from the Code/python folder instead, uncomment the next line.
# cl="../../"

########################################################################################################################
# Append a run record to the shared log file.
# Nothing is written to the log after this header, so the handle is closed by
# the with-block right away instead of staying open while pandas and
# matplotlib run. `out_file` stays bound to the (closed) file object, so the
# `out_file.close()` at the end of the script remains valid: closing an
# already closed file is a no-op.
with open(f"{cl}Data/derived/log.log", "a") as out_file:
    dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    print("##########",
          file=out_file)
    print(f"program {parser.prog} was executed at {dt_string}",
          file=out_file)
    # The same line is echoed to stdout for interactive runs.
    print(f"program {parser.prog} was executed at {dt_string}")
    # The parser declares four required positionals (r100, r75, r50, r25);
    # keep this number in sync with the parser definition.
    print("number of required arguments:4 ",
          file=out_file)

########################################################################################################################
# Derive the dataset label from the path given for the 100% table: the first
# keyword found in that path decides the label, with "all_toxins" as fallback.
# The label is used in the plot title and in the name of the saved figure.
dataset_labels = (
    ("bacterial", "bacterial_toxins"),
    ("animal", "animal_toxins"),
)
data_name = next(
    (label for keyword, label in dataset_labels if keyword in args.r100),
    "all_toxins",
)
print("###################################", data_name)

########################################################################################################################
# Load the cluster tables for the 100%, 75%, 50% and 25% thresholds.
# Every file is read as tab-separated text without a header row, so the
# columns are addressed by position (0, 1, ...) further down.
r100, r75, r50, r25 = (
    pd.read_csv(cluster_path, sep="\t", header=None)
    for cluster_path in (args.r100, args.r75, args.r50, args.r25)
)

# Count the distinct entries per threshold and draw them as a bar chart.
# The "unreduced" bar is the number of distinct values in column 1 of the
# 100% table; every other bar is the number of distinct values in column 0
# of the respective table.
# NOTE(review): presumably column 0 is the cluster representative and
# column 1 the cluster member (mmseqs-style TSV) - confirm against the
# clustering step.
count_unreduced = len(set(r100.iloc[:, 1]))
count100 = len(set(r100.iloc[:, 0]))
count75 = len(set(r75.iloc[:, 0]))
count50 = len(set(r50.iloc[:, 0]))
count25 = len(set(r25.iloc[:, 0]))

# matplotlib 3.6 renamed the bundled "seaborn" style to "seaborn-v0_8" and
# 3.8 removed the old name, where plt.style.use("seaborn") raises OSError.
# Prefer the new name when it exists and fall back to the legacy one on
# older matplotlib versions.
seaborn_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(seaborn_style)

fig, ax = plt.subplots()
bar_labels = [
    "unreduced",
    "100%\n sequence similarity",
    "75%\nsequence similarity",
    "50%\nsequence similarity",
    "25%\nsequence similarity",
]
bar_heights = [count_unreduced, count100, count75, count50, count25]
ax.bar(bar_labels, bar_heights)
ax.set_title(f"{data_name} sequence similarity clustering")
ax.set_ylabel("number of clusters")
ax.set_xlabel("sequence  similarity reduction ")
# The figure is written relative to the project folder (see `cl`).
fig.savefig(f"{cl}Figures/{data_name}_mmseqs_reduction_level.png")


#######################################################################################################################
# Final cleanup: release the log file handle, then discard every open
# matplotlib figure.
out_file.close()
plt.close("all")



