#!/usr/bin/env python3
# coding: utf-8

#######################################################################################################################
# This file creates the non-toxic counterpart to a toxin dataset. The non-toxic dataset uses the same species and
# contains only secreted proteins that are not labelled as toxins.
# Input: a csv with the toxins and species
# Output: a fasta file with non-toxic sequences from the same species that are also secreted


#######################################################################################################################
# Import libraries needed.
import requests
from requests import Session
import pandas as pd
import numpy as np
import re
import argparse
import random
from datetime import datetime
import csv


#######################################################################################################################
# Get the arguments from the command line
# Command-line interface: a single positional argument "tox" naming the species table.
parser = argparse.ArgumentParser(
    prog="data_collection_part1.py",
    description="gets control proteins for a toxins dataset from the same species",
)
parser.add_argument("tox", type=str, help="csv with the species from the toxins dataset")
# Parsing happens at module level, so the arguments are read as soon as the script starts.
args = parser.parse_args()

####################################################################################################################
# Define the endresult fasta file
# Path of the resulting fasta file (if this file is to run with make, change the address).
fasta_path = "../../Data/derived/controlProteinsAnimals_thisisanexample.fasta"
# pipapo is the name written to the log once the fasta file has been created,
# so it must hold the real output path rather than a placeholder.
pipapo = fasta_path
# Both handles stay open for the rest of the script and are closed after the
# fasta file has been written.
fasta_file = open(fasta_path, "w")
# The log file is opened in append mode so earlier runs are preserved.
out_file = open("../../Data/derived/log.log", "a")
dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
# Write the run header to the log file.
print("##########", file=out_file)
print(f"program {parser.prog} was executed at {dt_string}", file=out_file)
print(f"argument passed: {args.tox}", file=out_file)
print("number of required arguments: 1", file=out_file)
print("argument should contain: a file with species names in csv format", file=out_file)

########################################################################################################################
## Get a list of unique species from the provided file.
# Get the file from the command line.
# Path of the species table, taken from the command line.
tox_file = args.tox
# The input file has no header row; these are the column names assigned to it.
header = ["iD", "spec", "genus"]
# Load the table into a DataFrame.
# NOTE(review): the file is described as csv but is parsed as tab-separated
# (the default of pd.read_table) - confirm the delimiter of the input file.
df_at = pd.read_csv(tox_file, sep="\t", header=None, names=header)
# Unique species, in order of first appearance.
uniq_spec = list(df_at["spec"].unique())

# ####################################################################################################################
## Query the taxonomy identification number from UniProt through an API.
## The API accepts a single string with the iDs separated by commas. The string may not contain more than
## 100 iDs at a time, therefore split the iDs into packages of at most 100.

all_iDs = []
# Split the iDs into packages of at most 100. Stepping through the column in
# strides of 100 never produces an empty trailing package, which the previous
# "len/100 + 1" count did whenever the number of iDs was a multiple of 100.
package_size = 100
id_column = df_at["iD"]
# Each package is a one-element list holding the comma-separated iDs, which is
# the shape the query loop passes on as the "accession" parameter.
all_iD_string = [
    [",".join(id_column.iloc[start:start + package_size])]
    for start in range(0, len(id_column), package_size)
]
n_packages = len(all_iD_string)

# Query using an API of uniprot and EMBL.
# Query the EBI proteins API once per package and collect one DataFrame per
# successful response.
dfs = []
# A single session is shared by all requests so the connection can be reused.
with Session() as session:
    for package in all_iD_string:
        response = session.get(
            url='https://www.ebi.ac.uk/proteins/api/proteins?',
            params={'accession': package},  # one package of up to 100 iDs
            headers={"Accept": "application/json"})
        if not response.ok:
            # Best effort: report the failed package and carry on with the rest.
            print("An error occurred:", response.status_code)
            continue
        print("it worked")
        dfs.append(pd.DataFrame([
            {
                "id": r['accession'],
                'name': r['id'],
                # dbReferences and keywords are treated as optional so that a
                # record lacking them does not abort the whole run.
                "dbReferences": r.get("dbReferences", []),
                "keywords": r.get('keywords', []),
                # organism is required: the taxonomy id is read from it later.
                "organism": r["organism"],
            }
            for r in response.json()
        ]))

# pd.concat raises an unhelpful error on an empty list, so stop with a clear message.
if not dfs:
    raise SystemExit("An error occurred: no protein records could be retrieved from the EBI proteins API")
# Concatenate the per-package dataframes into one large dataframe.
df = pd.concat(dfs, axis=0)

########################################################################################################################
# List the unique taxonomy identification numbers
# Collect every distinct taxonomy identification number found in the organism column.
uniq_tax = list({organism_entry["taxonomy"] for organism_entry in df["organism"]})

#######################################################################################################################
## Construct the url that is fed to the API in order to retrieve the control protein set.

# First part: base url, the format, the keyword filters (NOT KW-0800, AND KW-0964)
# and the opening brackets of the organism clause.
part1 = "https://rest.uniprot.org/uniprotkb/stream?format=json&query=%28NOT%20%28keyword%3AKW-0800%29%20AND%20%28keyword%3AKW-0964%29%20AND%28%20%28"
# Second part: one organism_id term per taxonomy number, joined by ") OR (" in
# url-encoded form. Joining places the separator only between the terms.
part2 = "%29%20OR%20%28".join("organism_id%3A" + str(tax_id) for tax_id in uniq_tax)
# Third part: the closing brackets.
part3 = "%29%29%29"
# Combine all parts of the url.
constructed_url = part1 + part2 + part3

#######################################################################################################################
## Query the Uniprot API with the constructed url
response = requests.get(constructed_url)
if response.status_code != 200:
    # Without results nothing below can run; stop with the status code instead
    # of failing later on an undefined variable.
    raise SystemExit(f"An error occurred: {response.status_code}")
# On success the entries are found under the "results" key of the json payload.
results = response.json()["results"]

# Create a dataframe that only contains the id, name, keywords, sequence and organism.
# The columns are named explicitly so that an empty result list still yields a
# frame with the expected columns.
df2 = pd.DataFrame(
    [
        {
            "id": r['primaryAccession'],
            'name': r['uniProtkbId'],
            "keywords": r['keywords'],
            "sequence": r['sequence']["value"],
            "organism": r['organism']["scientificName"],
        }
        for r in results
    ],
    columns=["id", "name", "keywords", "sequence", "organism"],
)
# Reduce each keyword entry to its name; every name is kept in its own
# one-element list, one list of those per protein.
keys_per_prot = [[[keyword["name"]] for keyword in protein_keywords] for protein_keywords in df2["keywords"]]
df2["keywords"] = keys_per_prot

########################################################################################################################
## Create the fasta file with the iDs, species and sequence only
# Create three lists and always take the same element from each list, as a simple list comprehension loop would iterate over the columns
# Pull the three columns needed for the fasta records out of the dataframe.
ids = df2["id"].values.tolist()
organism = df2["organism"].values.tolist()
sequence = df2["sequence"].values.tolist()
len_fasta = len(ids)

if not (len(ids) == len(organism) == len(sequence)):
    print("error: lists must be the same length")
else:
    # Write one record per protein: a header line with iD and species, then the sequence.
    # Two separate print statements are used instead of "\n" to avoid empty lines.
    for accession, species, residues in zip(ids, organism, sequence):
        print(f"> {accession} [{species}]", file=fasta_file)
        print(residues, file=fasta_file)





# logs:
# Record the name held in pipapo in the log file.
for log_line in (
    f"created file named:{pipapo} ",
    f"{pipapo} was stored under derived data",
):
    print(log_line, file=out_file)
# Close the log first, then the fasta file.
for handle in (out_file, fasta_file):
    handle.close()

########################################################################################################################
# Checking if the downloaded results have the right keywords
# Each entry of the keywords column is a list of one-element lists, hence the
# membership tests against ["Secreted"] and ["Toxin"].
for protein_keywords in df2["keywords"]:
    if ["Secreted"] not in protein_keywords:
        print("error: downloaded sequences must be secreted")
    if ["Toxin"] in protein_keywords:
        print("error: downloaded sequences must not contain any Toxin")


# Compare the species of the input file with the organisms of the downloaded
# proteins. Both sets are built once instead of once per loop iteration.
requested_species = set(uniq_spec)
retrieved_species = set(organism)

# Species from the input file for which no protein was downloaded.
missing = [species for species in requested_species if species not in retrieved_species]

# Organisms among the downloaded proteins that are not in the input file.
to_many = [species for species in retrieved_species if species not in requested_species]
print(missing)
print(to_many)