#!/usr/bin/env python3
# coding: utf-8

#######################################################################################################################
# This file creates the non-toxic counterpart of a toxin dataset. The non-toxic dataset uses the same species and
# contains only secreted proteins that are not labeled as toxins.
# Input: a fasta with the toxins accession number and species
# Output: a fasta file with non-toxic sequences from the same species that are also secreted


#######################################################################################################################
# Import libraries needed.
import requests
from requests import Session
import pandas as pd
import numpy as np
import re
import argparse
import random
from datetime import datetime
import csv
from pandas.io.json import json_normalize
from Bio import SeqIO

#######################################################################################################################
# Command-line interface: one required positional argument, the toxin fasta file.
parser = argparse.ArgumentParser(
    description="gets control proteins for a toxins dataset from the same species",
    prog="data_collection_part1_4.py",
)
parser.add_argument(
    "tox",
    help="fasta with the species from the toxins dataset",
    type=str,
)
# Parse sys.argv; argparse exits with a usage message when the argument is missing.
args = parser.parse_args()

####################################################################################################################
# Set up the output fasta file and the run log.
# All paths are relative to the directory the script is started from.
cl="../../"

# Output fasta for the control proteins.
# NOTE: mode "w" truncates an existing file as soon as the script starts.
fasta_file=open(f"{cl}Data/derived/experimental_animal_control_proteins.fasta","w")

# Time stamp of this run for the log.
dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
# Append a header describing this run to the shared log file.
out_file = open(f"{cl}Data/derived/log.log","a")
print("##########",
      file=out_file)
print(f"program {parser.prog} was executed at {dt_string}",
      file=out_file)
print(f"argument passed: {args.tox}",
      file=out_file)
print("number of required arguments: 1",
      file=out_file)
# The script parses its argument as fasta, so the log describes it as such.
print("argument should contain: a fasta file with the toxin accession numbers and species",
      file=out_file)

########################################################################################################################
## Collect the accession numbers from the provided fasta file.
# Parse the fasta given on the command line into a table indexed by record id,
# with the full header line ("info") and the sequence ("seq") as columns.
with open(args.tox) as handle:
    fasta_records = {record.id: [record.description, str(record.seq)]
                     for record in SeqIO.parse(handle, "fasta")}
# from_dict with explicit columns also works when the fasta holds no records.
df_at = pd.DataFrame.from_dict(fasta_records, orient="index", columns=["info", "seq"])

# The record ids live in the index of df_at (there is no "iD" column).
# NOTE(review): ids of the form "db|ACCESSION|NAME" (UniProt fasta headers) are
# reduced to the accession, assuming that is what the API expects - confirm
# against the actual input files. Other ids are used unchanged.
# dict.fromkeys removes duplicates while keeping the input order.
uniq_accession = list(dict.fromkeys(
    record_id.split("|")[1] if record_id.count("|") >= 2 else record_id
    for record_id in df_at.index
))
if not uniq_accession:
    raise SystemExit(f"no fasta records found in {args.tox}")


# ####################################################################################################################
## Query the taxonomy identification number from UniProt with the use of an API.
## The API accepts a single string with the IDs separated by commas. The string may not contain more than
## 100 IDs at a time, therefore the accessions are split into packages of up to 100 IDs.

# One comma-separated string per package; stepping through the list in strides of
# 100 never produces an empty trailing package.
all_iD_string = [
    ",".join(str(k) for k in uniq_accession[j:j + 100])
    for j in range(0, len(uniq_accession), 100)
]
n_packages = len(all_iD_string)
print(f"number of packages needed: {n_packages}")

# Query the EBI Proteins API once per package and collect one dataframe per answer.
dfs=[]
# A single HTTP session is reused for all packages.
with Session() as session:
    for package, i in enumerate(all_iD_string, start=1):
        # Progress indicator: number of the package being requested.
        print(package)
        response = session.get(
            url='https://www.ebi.ac.uk/proteins/api/proteins?',
            params={'accession': i},  # i is one package of up to 100 IDs
            headers={"Accept": "application/json"})
        if response.ok:
            # If the query for this package was successful, build a dataframe from the JSON entries.
            print(f"downloading unique species for file {args.tox} successful")
            # "keywords" is read with .get so an entry without keywords does not abort the run.
            df = pd.DataFrame([{"id": r['accession'],
                                'name': r['id'],
                                "keywords": r.get('keywords'),
                                "organism": r["organism"]}
                               for r in response.json()])
            dfs.append(df)
        else:
            # If the query was unsuccessful, print the status code returned by the API.
            print("An error occurred:", response.status_code)

# Without any successful package there is nothing to concatenate; stop with a clear message.
if not dfs:
    raise SystemExit("no package could be downloaded from the EBI Proteins API")
# Concatenate the per-package dataframes into one large dataframe.
df=pd.concat(dfs,axis=0)

########################################################################################################################
## Build the UniProt query URL from the unique taxonomy identification numbers queried above,
## together with the keyword filters "secreted" and "NOT toxin".

# Unique taxonomy identification numbers, read from the fourth column of df (the organism records).
uniq_tax = list(set(organism["taxonomy"] for organism in df.iloc[:, 3]))

# Fixed prefix: base URL, fasta format, the two keyword filters and the opening brackets.
part1="https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28NOT%20%28keyword%3AKW-0800%29%20AND%20%28keyword%3AKW-0964%29%20AND%28%20%28"
# One "organism_id:<taxonomy id>" term per species, chained with the URL-encoded ") OR (".
part2 = "%29%20OR%20%28".join("organism_id%3A" + str(tax_id) for tax_id in uniq_tax)
# Closing brackets matching the ones opened in the prefix.
part3="%29%29%29"
# Assemble the complete URL.
constructed_url = "".join((part1, part2, part3))

#######################################################################################################################
## Query the UniProt API with the constructed URL; the response is a fasta file.
response=requests.get(constructed_url)
if response.status_code == 200:
    # Label the output with the name of the input file, without directory and extension.
    # NOTE(review): the original referenced an undefined match object `m`; it is
    # presumably meant to be a regex match on the input filename - confirm.
    m = re.search(r"([^\\/]+?)(?:\.[^.\\/]*)?$", args.tox)
    tox_label = m.group(1) if m else args.tox
    # If the call is successful, the fasta text is written directly to the file in the derived folder.
    print(f"#{tox_label} control proteins",file=fasta_file)
    print(response.text,file=fasta_file)
else:
    # If the call is not successful, print the HTTP status code.
    print("An error occurred:", response.status_code)


#######################################################################################################################
# Final log entries, then release both file handles.
# Name of the output fasta without its directory part (the path is built with "/").
savename = fasta_file.name.rsplit("/", 1)[-1]
print(f"created file named:{savename} ",file=out_file)
print(f"created file named:{savename} ")
print(f"{savename} was stored under derived data",file=out_file)
out_file.close()
fasta_file.close()

