#!/usr/bin/env python3
# coding: utf-8

#######################################################################################################################
# This file creates a csv with columns for the protein ID, the sequence, whether a signal peptide exists, and the start and end of the signal peptide
# Input: a tsv or fasta file that contain protein IDs
# Output: a csv

# Steps:
# identify whether the input is fasta or tsv
# API request with the protein IDs in batches of 100
# Extract the signal sequence information
# then write the output file from here

#######################################################################################################################
# Import libraries needed.
import argparse
import csv
import random
import re
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from Bio import SeqIO
from requests import Session

#######################################################################################################################
# Command-line interface: one required positional argument, the path of the
# input file holding the protein IDs.
parser = argparse.ArgumentParser(
    prog="signal_p_extration_part1.py",
    description="gets the signal peptides for a list of proteins form uniprot",
)
parser.add_argument(
    "prots",
    type=str,
    help="tsv or fasta file that contains protein IDs ",
)
args = parser.parse_args()

####################################################################################################################
# Append a run record to the shared log file.
# The handle is deliberately left open: the last step of the script writes one
# more line to it and closes it.
out_file = open("../../Data/derived/log.log","a")
dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
# One log line per entry; print() joins them with newlines and ends with one.
log_header = (
    "##########",
    f"program {parser.prog} was executed at {dt_string}",
    f"argument passed: {args.prots}",
    "number of required arguments: 1",
    "argument should contain: a file with protein IDs, can be a tsv or a fasta file",
)
print(*log_header, sep="\n", file=out_file)

########################################################################################################################
## Open the provided file and collect the protein IDs.
tox_file = args.prots

# The input type is inferred from the path: a path containing "fasta" is parsed
# as FASTA, otherwise a path containing "tsv" is parsed as a header-less table.
# The same substring test is repeated further down when the results are joined,
# so both places must keep using the same rule.
if "fasta" in args.prots:
    # One row per record, indexed by the record id:
    # [full description line, sequence].
    with open(tox_file) as handle:
        df_at = pd.DataFrame(
            {
                record.id: [record.description, str(record.seq)]
                for record in SeqIO.parse(handle, "fasta")
            }
        ).T
    # Introduce column headers.
    df_at.columns = ["species", "sequence"]
    # Drop the first two space-separated tokens of the description; the
    # remainder is stored as the species.
    df_at["species"] = df_at["species"].str.split(" ", n=2, expand=True)[2]
    # Get the protein IDs.
    IDs = df_at.index

elif "tsv" in args.prots:
    # The table has no header row, so the column names are supplied here.
    header = ["id", "species", "genus"]
    df_at = pd.read_table(tox_file, header=None, names=header)
    # Set the protein IDs as the index.
    df_at.set_index("id", inplace=True, drop=True)
    # Get the protein IDs.
    IDs = df_at.index

else:
    # Neither a fasta nor a tsv file was provided. Nothing below can work
    # without `IDs` and `df_at`, so stop here with a non-zero exit status
    # instead of failing later with a NameError.
    print("error, the file type must either be tsv, or fasta")
    raise SystemExit(1)


# ####################################################################################################################
## Query the signal peptide position from UniProt with the use of an API.
## The API accepts a single string with the IDs separated by commas. The string may not contain more than
## 100 IDs at a time, therefore the IDs are split into packages of at most 100.

# Step through the IDs in strides of 100. This never produces an empty package,
# neither for an exact multiple of 100 nor for an empty input.
# Each package is a one-element list holding the comma-separated ID string,
# which is the shape the request loop below passes on as the query parameter.
all_iD_string = [
    [",".join(str(iD) for iD in IDs[start:start + 100])]
    for start in range(0, len(IDs), 100)
]

# Query the EBI Proteins API, one request per package of IDs.
# Every successful response becomes one dataframe with the columns
# "signal" (a feature dictionary), "id" and "sequence".
dfs = []
# A single session is shared by all requests so the connection can be reused.
with Session() as session:
    for i in all_iD_string:
        response = session.get(
            url='https://www.ebi.ac.uk/proteins/api/proteins?',
            params={'accession': i},  # i is one package of comma-separated IDs
            headers={"Accept": "application/json"},
            # Without a timeout a stalled connection would block forever.
            timeout=120,
        )
        if not response.ok:
            # The package is skipped; its proteins will be missing downstream.
            print("An error occurred:", response.status_code)
            continue
        print("it worked")

        rows = []
        for r in response.json():
            features = r.get("features") or []
            # Prefer the feature annotated as SIGNAL. Fall back to the first
            # feature (classified as "no signal peptide" later on), or to an
            # empty dictionary when the entry has no features at all.
            signal = next(
                (f for f in features if f.get("type") == "SIGNAL"),
                features[0] if features else {},
            )
            rows.append({
                "signal": signal,
                "id": r['accession'],
                "sequence": r['sequence']['sequence'],
            })
        dfs.append(pd.DataFrame(rows))

########################################################################################################################
## Extract the signal peptide coordinates from the downloaded JSON.

# Concatenate the per-package dataframes into one large dataframe.
df = pd.concat(dfs, axis=0, ignore_index=True)
# Unpack the feature dictionaries of the signal column into flat columns.
df2 = pd.json_normalize(df["signal"])
# Both frames carry the same 0..n-1 index, so the join pairs the rows one to one.
df = df.join(df2)
# Get rid of superfluous columns (columns that are absent are ignored).
df = df.drop(
    columns=["category", "molecule", "description", "ftId", "evidences", "signal"],
    errors='ignore',
)
# Show the distinct end positions so unexpected values are visible on the console.
print(list(set(df["end"])))
# Most signal peptides start at 1, a small subset is reported as starting at <1;
# use 1 throughout.
df["begin"] = 1
# Everything that is not a signal peptide gets 0 for both begin and end.
df.loc[df["type"] != "SIGNAL", ["begin", "end"]] = 0
# The same applies to every end position that is not a plain non-negative
# integer: "~", "?", other placeholders, and missing values. The final sanity
# check of this script expects every end position to be convertible to int.
end_is_integer = df["end"].astype(str).str.isdigit()
df.loc[~end_is_integer, ["begin", "end"]] = 0

########################################################################################################################
# Merge the downloaded signal peptide information with the input table in order
# a) to check that the protein IDs are the same, and
# b) to keep the curated species information of the input (the species
#    information from the API is not as curated).

# Index the API results by protein ID so they line up with the input table.
df = df.set_index("id")

# Columns present on both sides keep their name on the input side and get the
# suffix "l" on the API side.
input_is_fasta = "fasta" in args.prots
if input_is_fasta or "tsv" in args.prots:
    df = df.join(df_at, lsuffix="l")

# A fasta input also carries the sequences, which serve as an extra checkpoint:
# the downloaded copy is dropped only when it is identical to the input.
if input_is_fasta:
    if df["sequencel"].equals(df["sequence"]):
        df = df.drop(columns="sequencel")
    else:
        print("error: the sequences from input fasta file and the downlaoded seuqences are not the same")

#######################################################################################################################
## Write the results csv file.
# The output name is the part of the input path between "/derived/" and the
# last dot. For inputs that are not located under a "derived" directory the
# plain file name without its extension is used instead of crashing.
m = re.search(r"/derived/(.*)\.", args.prots)
savename = m.group(1) if m else Path(args.prots).stem
# Save the data in a csv file.
df.to_csv(path_or_buf=f'../../Data/derived/{savename}_signal_peptide.csv')

#######################################################################################################################
# Record the created file in the log and release the log file handle.
print(f"created file named:{savename}_signal_peptide.csv, file was saved under derived data",file=out_file)
out_file.close()



########################################################################################################################
# Test the program
# the following test were carried out
# general running:
# tsv input having one iD only : ok
# tsv input having multiple iDs : ok
# fasta input having one iD only : ok
# fasta input having multiple iDs : ok
# the length of the input and the output was compared
    #: ok

# Check that every end position of the signal peptide can be read as an integer
# (in the database some are "?" or similar placeholders).
# The conversion itself is the test: int() raises for anything that is not an
# integer, so failures are collected and reported instead of aborting the script.
non_integer_ends = []
for end_value in set(df["end"]):
    try:
        int(end_value)
    except (TypeError, ValueError):
        non_integer_ends.append(end_value)
if non_integer_ends:
    print(" error: end position of the signal peptide has to be an integer")
# randomly picking 5 results and checking by hand in uniprot if everything as it should

