#!/usr/bin/env python3
# coding: utf-8

#######################################################################################################################
# This file visualizes the protein lengths
# Input: animal toxins fasta file, animal control protein fasta file, bacterial toxins fasta file and bacterial control fasta file
# Output1: a plot showing the distribution of protein lengths for the four input sets
# Output2: two sets of data with roughly the same protein length, one with control proteins from animals, the other with animal toxins


#######################################################################################################################
# Import libraries needed.
import requests
from requests import Session
import pandas as pd
import numpy as np
import re
import argparse
import random
from datetime import datetime
import csv
from pandas.io.json import json_normalize
from Bio import SeqIO
import matplotlib.pyplot as plt
####################################################################################################################
# Prefix prepended to every project-relative path used below (Data/..., Figures/...).
# The empty default is for running the script with make from the project folder.
cl = ""
# To execute this file from inside the Code/python folder instead, uncomment the next line.
#cl="../../"
#######################################################################################################################
# Get the arguments from the command line
# Command-line interface: four positional FASTA paths, registered in the order
# in which they must be given on the command line.
parser = argparse.ArgumentParser(
    prog="prot_length.py",
    description="visualizes protein lengths and generates subsets of similar length",
)
for _arg_name, _arg_help in (
    ("an_tox", "animal toxins fasta file"),
    ("an_con", "animal control protein fasta file"),
    ("bac_tox", "bacterial toxins fasta file"),
    ("bac_con", "bacterial control fasta file"),
):
    parser.add_argument(_arg_name, type=str, help=_arg_help)

# Parsed arguments are available as args.an_tox, args.an_con, args.bac_tox, args.bac_con.
args = parser.parse_args()

####################################################################################################################
# Get the time and date for the log.
dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
# Append a run record to the log file. The context manager flushes and closes
# the file as soon as the record is written, so the entry is kept even if a
# later step of the script raises. `out_file` stays bound (as a closed file),
# so the `out_file.close()` at the end of the script is a harmless no-op.
with open(f"{cl}Data/derived/log.log", "a") as out_file:
    print("##########",
          file=out_file)
    print(f"program {parser.prog} was executed at {dt_string}",
          file=out_file)
    print(f"argument passed: {args.an_tox},{args.an_con} ,{args.bac_tox} and {args.bac_con}",
          file=out_file)
    print("number of required arguments: 4",
          file=out_file)


########################################################################################################################
def _load_fasta(path: str) -> pd.DataFrame:
    """Read a FASTA file into a DataFrame with one row per record.

    The frame is indexed by record id and has three columns:
    ``0`` (the record description), ``1`` (the sequence as a plain string)
    and ``"len"`` (the length of that sequence).

    Records are collected in a dict keyed by record id, so if an id occurs
    more than once only the last record with that id is kept.

    Raises:
        ValueError: if the file contains no FASTA records.
    """
    with open(path) as handle:
        records = {
            record.id: [record.description, str(record.seq)]
            for record in SeqIO.parse(handle, "fasta")
        }
    if not records:
        # Without this guard an empty file only fails later with an opaque
        # IndexError when the sequence column is accessed.
        raise ValueError(f"no FASTA records found in {path!r}")
    # The dict is column-oriented (one column per record), hence the transpose.
    frame = pd.DataFrame(records).T
    # Per-sequence length (not an average), stored as an extra column.
    frame["len"] = [len(sequence) for sequence in frame.iloc[:, 1]]
    return frame


# Step 1: Open the animal toxins (at)
df_at = _load_fasta(args.an_tox)

# Step 2: Open the animal control proteins (ac)
df_ac = _load_fasta(args.an_con)

# Step 3: Open the bacterial toxins (bt)
df_bt = _load_fasta(args.bac_tox)

# Step 4: Open the bacterial control proteins (bc)
df_bc = _load_fasta(args.bac_con)

# Visualize the length distribution of the four sets to get an overview.
# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8*"
# and later releases no longer accept the old name, so pick whichever exists.
_seaborn_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(_seaborn_style)
fig, ax = plt.subplots()
# Shared bin edges 0, 10, ..., 990; longer sequences fall outside the bins
# and are therefore not drawn.
length_bins = np.arange(0, 1000, 10)
for _lengths, _label, _color in (
    (df_at["len"], "animal toxins", "#B1041B"),
    (df_ac["len"], "animal control", "#EABA49"),
    (df_bt["len"], "bacterial toxins", "#2156B5"),
    (df_bc["len"], "bacterial control", "#61BDD2"),
):
    # density=True normalizes each histogram so sets of different size are comparable.
    ax.hist(_lengths, label=_label, bins=length_bins, alpha=0.5, density=True, color=_color)
ax.set_ylabel("relative frequency of the length")
plt.legend()
plt.savefig(f"{cl}Figures/protein_length")

# Histogram colours: animal control, animal toxins, bacterial control, bacterial toxins
CB_color_cycle = ['#EABA49', '#B1041B', '#61BDD2', '#2156B5']

# Build the animal subsets: keep only sequences whose length lies in
# [MIN_SUBSET_LEN, MAX_SUBSET_LEN], both ends inclusive.
# NOTE(review): this step was previously described as "between 50 and 200"
# while the upper bound in the code was 100 - confirm which one is intended.
MIN_SUBSET_LEN = 50
MAX_SUBSET_LEN = 100
df_ac_subset = df_ac[df_ac["len"].between(MIN_SUBSET_LEN, MAX_SUBSET_LEN)]
df_at_subset = df_at[df_at["len"].between(MIN_SUBSET_LEN, MAX_SUBSET_LEN)]

# Check that both subsets have a similar length distribution: print the mean
# and the standard deviation of each subset (toxins first, then controls).
print(df_at_subset["len"].mean(), df_at_subset["len"].std())
print(df_ac_subset["len"].mean(), df_ac_subset["len"].std())

# Sanity check: every row of a subset must come from its original set.
assert df_at_subset.index.isin(df_at.index).all(), "an additional index not original in the animal toxins poped up"
assert df_ac_subset.index.isin(df_ac.index).all(), "an additional index not original in the animal contorls poped up"

def _derived_suffix(pattern: str, path: str) -> str:
    """Return the part of *path* captured by group 1 of *pattern*.

    Raises:
        ValueError: if *path* does not match *pattern*, instead of the
            AttributeError that calling ``.group`` on ``None`` would give.
    """
    match = re.search(pattern, path)
    if match is None:
        raise ValueError(
            f"cannot derive an output name: {path!r} does not match {pattern!r}"
        )
    return match.group(1)


def _write_fasta(frame: pd.DataFrame, path: str) -> None:
    """Write *frame* to *path* as two-line FASTA records.

    For each row, column 0 is written after a ``>`` as the header line and
    column 1 is written on the following line.
    """
    with open(path, "w") as handle:
        for description, sequence in zip(frame.iloc[:, 0], frame.iloc[:, 1]):
            print(f">{description}", file=handle)
            print(sequence, file=handle)


# Derive both output paths from the input paths before any file is opened, so
# an unexpected input name fails without truncating an existing output file.
# NOTE(review): the control pattern ends in "clea" (the toxin one in "clean");
# it is kept as it was - confirm whether the missing "n" is intentional.
_at_suffix = _derived_suffix('derived(.*)_CDHIT100_outerCDHIT_clean', args.an_tox)
_ac_suffix = _derived_suffix('derived(.*)_CDHIT100_outerCDHIT_clea', args.an_con)
save_at = f"{cl}Data/derived{_at_suffix}_short_CDHIT100_clean2.fasta"
save_ac = f"{cl}Data/derived{_ac_suffix}_short_CDHIT100_clean2.fasta"

# Save the results; each output file is closed as soon as it has been written.
_write_fasta(df_at_subset, save_at)
_write_fasta(df_ac_subset, save_ac)

# Release the remaining resources: the log file handle and the current figure.
out_file.close()
plt.close()
