#!/usr/bin/env python3
# coding: utf-8


########################################################################################################################
# Author: Tanja Krüger
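# Aim: This file downloads the sequences of the bacterial and eukaryotic control proteins listed in the SignalP 6 training data
# Input: A truncated FASTA file containing all proteins used to train SignalP 6
# Output: a figure that shows the sequence-length distribution of the SignalP training data, and a CSV file with the individual sequence lengths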

########################################################################################################################
# downloaded
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re, argparse, csv, collections, random
from datetime import datetime
from Bio import SeqIO
from requests import Session
from scipy.stats import gaussian_kde
import csv


# #################################################################################################
# Path prefix that is prepended to every project-relative file used below (log file, figure, CSV output).
# The empty prefix is meant for the default case: running the code with make from the project folder.
cl = ""
# Prefix for executing this file from the Code/python folder.
# NOTE(review): this override is currently active (it is not commented out), so the effective
# prefix is "../../" — comment the next line out when running from the project folder.
cl = "../../"

########################################################################################################################
# Step1: Get the arguments from the command line.
# The script takes exactly one positional argument: the path to the SignalP6 FASTA file.
parser = argparse.ArgumentParser(prog="data_downloading_SignalP6.py",
                                 description="downloading non-toxins data of the signal P training")

parser.add_argument("SignalP",
                    type=str,
                    help="SignalP6 truncated training sequences")
args = parser.parse_args()

# Step2: Log the invocation.
# The log file is opened in append mode inside a context manager so that the handle is
# flushed and closed as soon as the entry is written (it used to stay open for the whole run).
dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
with open(f"{cl}Data/derived/log.log", "a") as out_file:
    print(f"""########## \n
program {parser.prog} was executed at {dt_string} \n
argments passed: the SingnalP6 training data {args.SignalP} 
number of required arguments:1""",file=out_file)

# Read the FASTA file into a table with one row per record id:
# column 0 holds the header description, column 1 the sequence.
with open(args.SignalP) as handle:
    fasta_entries = {
        entry.id: [entry.description, str(entry.seq)]
        for entry in SeqIO.parse(handle, "fasta")
    }
sig_raw = pd.DataFrame(fasta_entries).T

secreted_bac, nonsecreted_bac = [], []
secreted_euk, nonsecreted_euk = [], []

# Sort the identifier of every record (the header text before the first "|") into one of
# the four groups. Headers tagged NEGATIVE or POSITIVE count as bacterial, headers tagged
# EUKARYA as eukaryotic; a header without the NO_SP tag counts as secreted.
# Records that carry none of the three tags are skipped.
for description in sig_raw.iloc[:, 0]:
    is_bacterial = "NEGATIVE" in description or "POSITIVE" in description
    is_secreted = "NO_SP" not in description
    if is_bacterial:
        target = secreted_bac if is_secreted else nonsecreted_bac
    elif "EUKARYA" in description:
        target = secreted_euk if is_secreted else nonsecreted_euk
    else:
        continue
    target.append(re.match(r'([^|]+)', description).group(1))

def get_query(IDs, package_size=100):
    """Split a list of Uniprot accession numbers into comma-joined query packages.

    IDs: list of Uniprot accession numbers
    package_size: maximum number of accessions per package; the default of 100 is the
        per-query limit the original implementation was written for
    returns:
    List of packages. Every package is a one-element list holding a single string with
    up to `package_size` comma-separated accessions. An empty `IDs` gives an empty list.
    raises:
    ValueError if `package_size` is smaller than 1.
    """
    if package_size < 1:
        raise ValueError("package_size must be at least 1")
    # Stepping through the IDs in strides of package_size creates exactly as many packages
    # as are needed. The previous int(len/100)+1 formula produced an additional, empty
    # package whenever the number of IDs was a multiple of 100 (including zero IDs).
    all_iD_string = [[','.join(str(k) for k in IDs[j:j + package_size])]
                     for j in range(0, len(IDs), package_size)]
    print(f"number of packages needed: {len(all_iD_string)}")
    return all_iD_string

def get_seqs(all_iD_string):
    """Download the sequences for packages of accession numbers from the EBI Proteins API.

    all_iD_string: list of ID-packages as produced by get_query; every package is passed
        unchanged as the `accession` parameter of one request
    returns:
    Dataframe with the columns "id" (accession) and "seq" (sequence), built by concatenating
    the results of all successful requests (the row index restarts for every package).
    Packages whose request fails are reported on stdout and skipped. If no package could be
    downloaded at all, an empty dataframe with the columns "id" and "seq" is returned.
    """
    dfs = []
    if all_iD_string:
        # One session for all packages, so the connection to the server can be reused.
        with Session() as session:
            for i in all_iD_string:
                response = session.get(
                    url='https://www.ebi.ac.uk/proteins/api/proteins?',
                    params={'accession': i},  # i is one package of accession numbers
                    headers={"Accept": "application/json"})
                if response.ok:
                    # If the query for the package was successful build a dataframe.
                    print(f"downloading unique species  successful")
                    dfs.append(pd.DataFrame([{"id": r['accession'], 'seq': r['sequence']["sequence"]}
                                             for r in response.json()]))
                else:
                    # If the query was unsuccessful print the status code returned by the API.
                    print("An error occurred:", response.status_code)
    # Returning the last per-package dataframe failed with an unbound variable when the
    # only request was unsuccessful; an explicit empty result keeps the return type stable.
    if not dfs:
        return pd.DataFrame(columns=["id", "seq"])
    # Concatenate the per-package dataframes to one large dataframe.
    return pd.concat(dfs, axis=0)


def process_ids(id_list, label):
    """Package a list of accession numbers and download the corresponding sequences.

    id_list: list of accession numbers
    label: name of the dataset; accepted for the caller's convenience but not used here
    returns:
    Whatever get_seqs returns for the packages that get_query builds from `id_list`.
    """
    return get_seqs(get_query(id_list))

def calculate_kde(sequence_length,bw_method):
    """Build a Gaussian kernel density estimator for a collection of sequence lengths.

    sequence_length: the data points handed to scipy's gaussian_kde
    bw_method: bandwidth setting, forwarded unchanged to gaussian_kde
    returns:
    The gaussian_kde object created from the given data.
    """
    density_estimator = gaussian_kde(sequence_length, bw_method=bw_method)
    return density_estimator

# Map every dataset label to its list of accession numbers.
id_dict = {'secreted bacteria': secreted_bac,
    'non-secreted bacteria': nonsecreted_bac,
    'secreted eukarya': secreted_euk,
    'non-secreted eukarya': nonsecreted_euk}

# Download the sequences of every dataset and store the resulting dataframes under the
# same labels. The ID lists get their own variable name here: the former loop reused the
# name `id_dict` as loop variable and thereby replaced the mapping by its last ID list.
df_results = {label: process_ids(ids, label) for label, ids in id_dict.items()}

# Add a "length_seq" column to each downloaded dataframe that holds the length of every sequence.
for dataset in df_results.values():
    sequence_lengths = dataset.loc[:,"seq"].apply(len)
    dataset.loc[:,"length_seq"] = sequence_lengths

# Plot the results: one kernel density estimate of the sequence lengths per dataset,
# evaluated on 1000 evenly spaced points between 0 and 2000.
x_range = np.linspace(0,2000, 1000)
# matplotlib 3.6 renamed the "seaborn" style sheet to "seaborn-v0_8" and later releases
# removed the old name, so use whichever of the two this installation provides.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
fig, ax = plt.subplots(figsize=(9,6))
for label, dataset in df_results.items():
    kde = gaussian_kde(dataset["length_seq"], bw_method=0.075)
    ax.plot(x_range, kde(x_range), label=f"{label}: ({len(dataset)} sequences)")
ax.legend(loc='upper right', fontsize=16)
ax.set_xlabel("length", fontsize=18)
ax.set_ylabel("density", fontsize=18)
fig.savefig(f"{cl}Figures/SignalP6_lengths3.png", bbox_inches="tight")
# Release the figure once it has been written to disk.
plt.close(fig)




# Prepare the results for the csv file: one (length, origin) row per downloaded sequence.
def _length_frame(dataset_label, origin):
    """Return a dataframe with the length of every sequence of one dataset and its origin tag."""
    rows = [(len(sequence), origin) for sequence in df_results[dataset_label]["seq"]]
    return pd.DataFrame(rows, columns=["length","origin"])


lens_sb = _length_frame("secreted bacteria", "secreted\nbacteria")
lens_ub = _length_frame("non-secreted bacteria", "non-secreted\nbacteria")
lens_se = _length_frame("secreted eukarya", "secreted\neukarya")
lens_ue = _length_frame("non-secreted eukarya", "non-secreted\neukarya")
# Stack the four tables below each other with a fresh, continuous row index.
df_all_lens = pd.concat([lens_sb, lens_ub, lens_se, lens_ue], ignore_index=True)

# Open a new csv file in write mode
def writer(df, path=None):
    """Write every row of a dataframe to a csv file, without header and without index.

    df: dataframe whose rows are written in order, one csv record per row
    path: destination file; if omitted the file
        "{cl}Data/derived/length_output_SignalP.csv" is (over)written
    """
    if path is None:
        path = f"{cl}Data/derived/length_output_SignalP.csv"
    # The csv module requires files to be opened with newline="": otherwise line breaks
    # embedded in quoted fields are not handled correctly and platforms with \r\n line
    # endings get an additional \r per row.
    with open(path, "w", newline="") as f:
        # Create a csv writer object (named differently from this function on purpose).
        csv_writer = csv.writer(f)
        # Iterate over the rows by position, so the dataframe may carry any index.
        csv_writer.writerows(df.itertuples(index=False, name=None))
# Write the combined length table (df_all_lens) to the csv file.
writer(df_all_lens)



























