#!/usr/bin/env python3
# coding: utf-8


########################################################################################################################
# Author: Tanja Krüger
# Aim: This file visualizes the amino acid composition, the average length, the aromaticity and the logos in joined
# graphics
# Input: four fasta files with the separate bacterial and animal toxins, and four matrices that hold the surprise of
# the amino
#       acids to occur in one of two sets
# Output: a series of files that show the length distribution, the pI, instability, aromaticity distribution and logos in
#       combined graphs


########################################################################################################################
# downloaded
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.transforms as mtransforms
import re, argparse, csv, collections, random
from datetime import datetime
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio import SeqIO
import logomaker

# #################################################################################################
# Path prefix that is prepended to project-relative paths (e.g. the log file below).
# The default (empty string) is for running the code with make from the project folder.
cl = ""
# To execute this file from the Code/python folder instead, uncomment the next line.
#cl = "../../"

########################################################################################################################
# Step 1: Define the command-line interface.
# argparse %-formats every help string before printing it, so a literal percent sign has to be written as "%%".
# An unescaped "100% sequence" is read as the conversion "% s" and dumps argparse's parameter dict into --help.
parser = argparse.ArgumentParser(prog="data_analysis_4.py",
                                 description="sequence similarity / diversity in shared plot")

# One positional input file per toxin set (at = animal, bt = bacterial, tt = total) and similarity level,
# listed in the order in which they must be given on the command line.
_POSITIONAL_ARGUMENTS = (
    ("at100", "animal toxins  100%% sequence similarity "),
    ("at75", "animal toxins  75%% sequence similarity "),
    ("at50", "animal toxins  50%% sequence similarity "),
    ("at25", "animal toxins  25%% sequence similarity"),
    ("bt100", "bacterial toxins  100%% sequence similarity "),
    ("bt75", "bacterial toxins  75%% sequence similarity "),
    ("bt50", "bacterial toxins  50%% sequence similarity "),
    ("bt25", "bacterial toxins  25%% sequence similarity"),
    ("tt100", "total toxins  100%% sequence similarity "),
    ("tt75", "total toxins  75%% sequence similarity "),
    ("tt50", "total toxins  50%% sequence similarity "),
    ("tt25", "totaltoxins  25%% sequence similarity"),
)
for _argument_name, _argument_help in _POSITIONAL_ARGUMENTS:
    parser.add_argument(_argument_name,
                        type=str,
                        help=_argument_help)
args = parser.parse_args()

# Step 2: Logging
# Step 2.1: Open the general log file in append mode so earlier entries are kept.
out_file = open(file=f"{cl}Data/derived/log.log", mode="a")

# Step 2.2: Remember the current date and time as a day/month/year hour:minute:second string.
dt_string = format(datetime.now(), "%d/%m/%Y %H:%M:%S")


# Step 3: Create reused functions
# Step 3.1: Define how to read in the data
def read_csv_files(file_dict, sep="\t", header=None):
    """Load every file named in *file_dict* into a DataFrame.

    Each value of *file_dict* is a tuple whose first entry is the file to read and whose second entry is the
    position of the column that is used later on. The result has the same keys; each value is a tuple of the
    loaded DataFrame and that unchanged column position. *sep* and *header* are passed on to ``pd.read_csv``.
    """
    loaded = {}
    for name, entry in file_dict.items():
        frame = pd.read_csv(entry[0], sep=sep, header=header)
        # Carry the column position along so that callers know which column to evaluate.
        loaded[name] = (frame, entry[1])
    return loaded

# Step 3.2: Count the unique values in one column of the dataframe
def count_unique_values(df: pd.DataFrame, col: int) -> int:
    """Return the number of distinct values in the column at position *col* of *df*.

    Missing values count as one distinct value. The earlier ``len(set(...))`` form counted every NaN of a
    numeric column separately, because iterating the column yields a new float object per row and NaN never
    compares equal to another NaN.
    """
    return int(df.iloc[:, col].nunique(dropna=False))


# Step 4: Describe the files to open. Keys are the dataset names, values are tuples of
# (file to read, position of the column that is evaluated).
def _similarity_level_files(file_100, file_75, file_50, file_25):
    """Build the name -> (file, column position) mapping for one toxin set.

    The 100% file is listed twice: column 1 is used for the 'raw' entry and column 0 for the '100' entry.
    """
    return {
        'raw': (file_100, 1),
        '100': (file_100, 0),
        '75': (file_75, 0),
        '50': (file_50, 0),
        '25': (file_25, 0),
    }


file_dict_tt = _similarity_level_files(args.tt100, args.tt75, args.tt50, args.tt25)
file_dict_at = _similarity_level_files(args.at100, args.at75, args.at50, args.at25)
file_dict_bt = _similarity_level_files(args.bt100, args.bt75, args.bt50, args.bt25)


# Step 5: Read the files; the values now hold the loaded data instead of the file names.
tt_data = read_csv_files(file_dict_tt)
at_data = read_csv_files(file_dict_at)
bt_data = read_csv_files(file_dict_bt)


# Step 6: Count the unique entries of every loaded table
count_data = {}
# Step 6.1: Walk over each plot label together with the data loaded for it
for label, data_dict in (('All toxins', tt_data), ('Animal toxins', at_data), ('Bacterial toxins', bt_data)):
    # Step 6.2: Per label, map each dataset name to the number of unique values in its selected column
    counts_per_level = {}
    for level, loaded in data_dict.items():
        counts_per_level[level] = count_unique_values(loaded[0], loaded[1])
    count_data[label] = counts_per_level


# Step 7: Printing and other operations
# for label, counts in count_data.items():
#     keys = list(counts.keys())
#     values = list(counts.values())
#     deltas = [values[i] - values[i + 1] for i in range(len(values) - 1)]
#     percentages = [deltas[i] / values[i] for i in range(len(deltas))]
#
#     print(f"{label} toxins")
#     print(*values)
#     print("deltas")
#     print(*deltas)
#     print("percentage")
#     print(*percentages)
#
# DataFrame creation


# Step 8: Build the DataFrame: one column per toxin set, one row per dataset ('raw', '100', '75', '50', '25').
df = pd.DataFrame(count_data)
print(df)
# Replace the short dataset names with the labels that are shown on the x axis.
index = ["raw", "100%\n sequence similarity", "75%\nsequence similarity", "50%\nsequence similarity",
         "25%\nsequence similarity"]
df.index = index
# Step 9: Calculate the overlap between the animal and the bacterial set.
# Inclusion-exclusion: |A and B| = |A| + |B| - |A or B|. The previous form (All - Animal - Bacterial) had the
# sign inverted, so the three stacked segments did not add up to the 'All toxins' count.
# NOTE(review): this assumes 'All toxins' is the union of the animal and bacterial sets - confirm.
df['delta'] = df['Animal toxins'] + df['Bacterial toxins'] - df['All toxins']
# Counts that belong to exactly one of the two sets.
df['Animal toxins only'] = df['Animal toxins'] - df['delta']
df['Bacterial toxins only'] = df['Bacterial toxins'] - df['delta']
# Plotting: one stacked bar per dataset, split into animal-only, overlap and bacterial-only counts.

# matplotlib 3.6 renamed the bundled "seaborn" style sheet to "seaborn-v0_8" and later releases dropped the
# old name, so use whichever name the installed matplotlib provides.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
fig, ax = plt.subplots()

print(df.to_string())
# The DataFrame index supplies the x positions and tick labels
x_labels = df.index

# Bottom segment: counts found only in the animal set
ax.bar(x_labels, df['Animal toxins only'], label='Animal toxins', color="#B1041B")

# Middle segment: the overlap, stacked on top of the animal-only segment
ax.bar(x_labels, df['delta'], bottom=df['Animal toxins only'], label='Overlap between the sets', color="yellow")

# Top segment: counts found only in the bacterial set, stacked on top of the two segments below
ax.bar(x_labels, df['Bacterial toxins only'], bottom=df['Animal toxins only'] + df['delta'],
       label="Bacterial toxins", color="#2156B5")

plt.ylabel("Number of clusters", fontsize=15)
plt.xlabel("sequence similarity reduction", fontsize=15)
plt.legend()
plt.suptitle("sequence diversity", fontsize=20)
ax.set_xticklabels(index)
plt.tight_layout()
# Figures saved after this point are written at 300 dpi.
plt.rcParams['savefig.dpi'] = 300
