#!/usr/bin/env python3
# coding: utf-8


########################################################################################################################
# Author: Tanja Krüger
# Aim: This file plots the sequence similarity/diversity in a shared plot for animal toxins, bacterial toxins and both
#      combined.
# Input: animal toxins  100% sequence similarity
# Input: animal toxins  75% sequence similarity
# Input: animal toxins  50% sequence similarity
# Input: animal toxins  25% sequence similarity
# Input: bacterial toxins  100% sequence similarity
# Input: bacterial toxins  75% sequence similarity
# Input: bacterial toxins  50% sequence similarity
# Input: bacterial toxins  25% sequence similarity
# Input: total toxins  100% sequence similarity
# Input: total toxins  75% sequence similarity
# Input: total toxins  50% sequence similarity
# Input: total toxins  25% sequence similarity
# output: figure of sequence similarity in a shared plot for animal toxins, bacterial toxins and both combined


########################################################################################################################
# downloaded
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.transforms as mtransforms
import re, argparse, csv, collections, random
from datetime import datetime
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio import SeqIO
import logomaker

# #################################################################################################
# Path prefix that is prepended to the relative paths this script opens and writes
# (the log file under Data/derived and the figure under Figures).
# Default: empty, i.e. the code is run with make from the project folder.
cl = ""
# If one wants to execute this file from the Code/python folder, uncomment the next line.
#cl = "../../"

########################################################################################################################
# Step 1: Define the command line interface.
# The script takes twelve positional arguments: for each toxin set (animal "at", bacterial "bt",
# total "tt") one input file per sequence similarity level (100, 75, 50, 25), in exactly that order.
parser = argparse.ArgumentParser(prog="data_analysis_4.py",
                                 description="sequence similarity / diversity in shared plot")

for _prefix, _label in (("at", "animal"), ("bt", "bacterial"), ("tt", "total")):
    for _level in (100, 75, 50, 25):
        # argparse applies %-formatting to help strings, so a literal percent sign has to be
        # written as "%%"; a bare "% s" would be consumed as a format specifier and garble --help.
        parser.add_argument(f"{_prefix}{_level}",
                            type=str,
                            help=f"{_label} toxins {_level}%% sequence similarity")
args = parser.parse_args()

# Step 2: Log this run.
# Step 2.1: Get the date and time.
dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")

# Step 2.2: Append the run record to the general logfile.
# The with-block flushes and closes the file immediately, so the record is on disk even if a
# later step raises. out_file stays bound to the (closed) file object; calling close() on it
# again later is a harmless no-op.
with open(f"{cl}Data/derived/log.log", "a") as out_file:
    print(f"""########## \n
program {parser.prog} was executed at {dt_string} \n
arguments passed: the 100,75,50,25% reduced animal toxins {args.at100},{args.at75},{args.at50} and {args.at25} \n
                the 100,75,50,25% reduced bacterial toxins {args.bt100},{args.bt75},{args.bt50} and {args.bt25} \n
                the 100,75,50,25% reduced all toxins {args.tt100},{args.tt75},{args.tt50} and {args.tt25} \n
number of required arguments:12""", file=out_file)

# Step 3: Load the twelve input tables named on the command line.
def _load_table(path):
    """Read one headerless, tab-separated file into a DataFrame."""
    return pd.read_csv(path, sep="\t", header=None)


# Step 3.1: animal toxins (at)
at100, at75, at50, at25 = (
    _load_table(path) for path in (args.at100, args.at75, args.at50, args.at25)
)
# Step 3.2: bacterial toxins (bt)
bt100, bt75, bt50, bt25 = (
    _load_table(path) for path in (args.bt100, args.bt75, args.bt50, args.bt25)
)
# Step 3.3: total toxins (tt)
tt100, tt75, tt50, tt25 = (
    _load_table(path) for path in (args.tt100, args.tt75, args.tt50, args.tt25)
)


# Step 4: Count the sequences at the different reduction levels.
def _distinct(table, column):
    """Return how many distinct values the positional column `column` of `table` holds."""
    return len(set(table.iloc[:, column]))


# For every toxin set the "unreduced" count is the number of distinct values in the second
# column of the 100% table; the count at each level is the number of distinct values in the
# first column of that level's table.
at_count_unreduced = _distinct(at100, 1)
at_count100, at_count75, at_count50, at_count25 = (
    _distinct(table, 0) for table in (at100, at75, at50, at25)
)

bt_count_unreduced = _distinct(bt100, 1)
bt_count100, bt_count75, bt_count50, bt_count25 = (
    _distinct(table, 0) for table in (bt100, bt75, bt50, bt25)
)

tt_count_unreduced = _distinct(tt100, 1)
tt_count100, tt_count75, tt_count50, tt_count25 = (
    _distinct(table, 0) for table in (tt100, tt75, tt50, tt25)
)

# print("total toxins")
# print(tt_count_unreduced,tt_count100,tt_count75,tt_count50,tt_count25)
# print("animal toxins")
# print(at_count_unreduced,at_count100,at_count75,at_count50,at_count25)
# print("bacterial toxins")
# print(bt_count_unreduced,bt_count100,bt_count75,bt_count50,bt_count25)
# print("deltas total toxins")
# print(tt_count_unreduced-tt_count100,tt_count100-tt_count75,tt_count75-tt_count50,tt_count50-tt_count25)
# print("deltas animal toxins")
# print(at_count_unreduced-at_count100,at_count100-at_count75,at_count75-at_count50,at_count50-at_count25)
# print("deltas bacterial toxins")
# print(bt_count_unreduced-bt_count100,bt_count100-bt_count75,bt_count75-bt_count50,bt_count50-bt_count25)
# print("percentage total toxins")
# print((tt_count_unreduced-tt_count100)/tt_count_unreduced,(tt_count100-tt_count75)/tt_count100,(tt_count75-tt_count50)/tt_count75,(tt_count50-tt_count25)/tt_count50)
# print("percentage animal toxins")
# print((at_count_unreduced-at_count100)/at_count_unreduced,(at_count100-at_count75)/at_count100,(at_count75-at_count50)/at_count75,(at_count50-at_count25)/at_count50)
# print("percentage bacterial toxins")
# print((bt_count_unreduced-bt_count100)/bt_count_unreduced,(bt_count100-bt_count75)/bt_count100,(bt_count75-bt_count50)/bt_count75,(bt_count50-bt_count25)/bt_count50)

# Step 5: Plot the number of clusters found at each reduction level as a grouped bar chart
# (one bar group per level, one bar per toxin set) and save it as a PNG.
df = pd.DataFrame({
    "total toxins": [tt_count_unreduced, tt_count100, tt_count75, tt_count50, tt_count25],
    "animal toxins": [at_count_unreduced, at_count100, at_count75, at_count50, at_count25],
    "bacterial toxins": [bt_count_unreduced, bt_count100, bt_count75, bt_count50, bt_count25],
})
index = ["unreduced", "100%\n sequence similarity", "75%\nsequence similarity", "50%\nsequence similarity", "25%\nsequence similarity"]

# The "seaborn" style sheet was renamed to "seaborn-v0_8" in matplotlib 3.6 and the old name was
# dropped in later releases; use whichever name this matplotlib installation provides.
style_name = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(style_name)

fig, ax = plt.subplots()
# Draw into the axes created above. Without ax= pandas opens a second figure and the one from
# plt.subplots() stays empty.
df.plot.bar(ax=ax, rot=0, color={"total toxins": '#808080', "animal toxins": "#B1041B", "bacterial toxins": "#2156B5"})
ax.set_ylabel("number of clusters", fontsize=15)
ax.set_xlabel("sequence  similarity reduction ", fontsize=15)
fig.suptitle("sequence diversity", fontsize=20)
# Replace the default 0..4 tick labels with the reduction level names.
ax.set_xticklabels(index)
fig.tight_layout()
fig.savefig(f"{cl}Figures/shared_sequence_diversity_4.png", dpi=300, bbox_inches="tight")

# Close the figure and the log file.
plt.close("all")
out_file.close()
