#!/usr/bin/env python3
# coding: utf-8

#######################################################################################################################
# This script downloads control proteins (secreted, non-toxin) from UniProt for the species that occur in a toxins
# dataset and stores them as a fasta file in the derived data directory.

#######################################################################################################################
# Import libraries needed.
import requests
from requests import Session
import pandas as pd
import numpy as np
import re
import argparse
import random
from datetime import datetime
import csv
from pandas.io.json import json_normalize
from Bio import SeqIO

#######################################################################################################################
# Parse the command line: a single positional argument naming the toxin tsv.
parser = argparse.ArgumentParser(
    prog="data_collection_part1.py",
    description="gets control proteins for a toxins dataset from the same species",
)
parser.add_argument(
    "tox",
    help="tsv with the species from the toxins dataset",
    type=str,
)
args = parser.parse_args()

####################################################################################################################

# Derive the output name from the input path. The path has to contain
# "/derived/" followed by a file name with at least three underscores and a
# ".tsv" suffix; the first (greedy) group, i.e. everything before the last
# three underscore-separated fields, becomes the stem of the output name.
# A raw string is used so that "\." reaches the regex engine unchanged and
# no invalid-escape warning is raised.
m = re.search(r"/derived/(.*)_(.*)_(.*)_(.*)\.tsv", args.tox)
if m is None:
    # Stop with a usage message instead of failing on m.group(1) below.
    parser.error(
        f"cannot derive an output name from {args.tox!r}: expected a path "
        "like .../derived/<name>_<x>_<y>_<z>.tsv"
    )
savename = m.group(1) + "_control_proteins"
print(f"the savename is {savename}")

# Output fasta file; written after the download and closed at the end of the
# script. NOTE: mode "w" truncates an existing file of the same name right here.
fasta_file = open(f"Data/derived/{savename}.fasta", "w")
# Log file, opened in append mode so earlier runs are kept.
out_file = open("Data/derived/log.log", "a")

# Timestamp of this run for the log entry.
dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")

# Write the header of this run's log entry.
print("##########", file=out_file)
print(f"program {parser.prog} was executed at {dt_string}", file=out_file)
print(f"argument passed: {args.tox}", file=out_file)
print("number of required arguments: 1", file=out_file)
print("argument should contain: a file with species names in scv format",
      file=out_file)

########################################################################################################################
## Load the accession table that was passed on the command line.
# Column names to assign; header=None below tells pandas not to take them
# from the file.
header = ["iD", "spec", "genus"]
tox_file = args.tox
# Tab-separated input (read_csv with sep="\t" is equivalent to read_table).
df_at = pd.read_csv(tox_file, sep="\t", header=None, names=header)
# Accessions from the "iD" column, in file order; duplicates are not removed.
uniq_accession = list(df_at["iD"])


# ####################################################################################################################
## Prepare the accession packages for the protein API that is queried below.
## The API accepts a single string with the iDs separated by commas. The string
## may not contain more than 100 iDs, therefore the accessions are split into
## consecutive packages of up to 100 iDs.
package_size = 100

# Without any accession there is nothing to query; stop before a request with
# an empty accession filter can be sent.
if not uniq_accession:
    raise SystemExit(f"no accessions found in {tox_file}")

# One entry per package. Each entry is a single-element list holding the
# comma-joined accessions of that package. Stepping through the list by
# package_size creates exactly as many packages as are needed, so a length
# that is an exact multiple of the package size does not produce a trailing
# empty package.
all_iD_string = [
    [",".join(str(k) for k in uniq_accession[start:start + package_size])]
    for start in range(0, len(uniq_accession), package_size)
]
# Number of packages that were created.
n_packages = len(all_iD_string)

# Query the EBI proteins API once per package of accessions.
dfs = []
# A single session is shared by all requests so the connection can be reused.
with Session() as session:
    for i in all_iD_string:
        response = session.get(
            url='https://www.ebi.ac.uk/proteins/api/proteins?',
            params={'accession': i},  # i is one package of up to 100 iDs
            headers={"Accept": "application/json"})
        if not response.ok:
            # Report the failed package and carry on with the next one.
            print("An error occurred:", response.status_code)
            continue
        print(f"downloading unique species for file {tox_file} successful")
        # One row per returned entry. "dbReferences" and "keywords" are read
        # with .get() so an entry lacking either section yields None instead
        # of aborting the run with a KeyError; "organism" stays mandatory
        # because the taxonomy ids are taken from it afterwards.
        dfs.append(pd.DataFrame([
            {"id": r['accession'],
             'name': r['id'],
             "dbReferences": r.get("dbReferences"),
             "keywords": r.get('keywords'),
             "organism": r["organism"]}
            for r in response.json()]))

# pd.concat raises an unspecific ValueError on an empty list; fail with a
# message that names the actual problem instead.
if not dfs:
    raise SystemExit(f"no protein entries could be downloaded for {tox_file}")
# Concatenate the per-package dataframes into one large dataframe.
df = pd.concat(dfs, axis=0)

########################################################################################################################
# Collect the distinct taxonomy identification numbers from the column at
# position 4 of the dataframe.
uniq_tax = list(set(organism["taxonomy"] for organism in df.iloc[:, 4]))

### Build a url that returns the fasta directly, so it does not have to be
### assembled from separate downloads.
# Prefix: the base url, the fasta format, the keyword filter (secreted and
# NOT toxin) and the opening brackets of the organism list.
part1 = "https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28NOT%20%28keyword%3AKW-0800%29%20AND%20%28keyword%3AKW-0964%29%20AND%28%20%28"
# Middle: one "organism_id%3A<taxonomy id>" term per taxonomy id, with the
# url-encoded ") OR (" between consecutive terms.
part2 = "%29%20OR%20%28".join("organism_id%3A" + str(tax_id) for tax_id in uniq_tax)
# Suffix: the closing brackets.
part3 = "%29%29%29"
# Assemble the complete url.
constructed_url = part1 + part2 + part3

#######################################################################################################################
## Request the fasta from the Uniprot API with the constructed url.
response = requests.get(constructed_url)
if response.status_code != 200:
    # Any status other than 200 is reported; nothing is written to the fasta file.
    print("An error occurred:", response.status_code)
else:
    # Success: store the returned text, followed by a newline, in the fasta file.
    fasta_file.write(response.text + "\n")


#######################################################################################################################
# Final log entries, echoed once on the console, then both files are closed.
# NOTE(review): these messages are written unconditionally, i.e. also when the
# download above reported an error.
created_msg = f"created file named:{savename} "
out_file.write(created_msg + "\n")
print(created_msg)
out_file.write(f"{savename} was stored under derived data" + "\n")
fasta_file.close()
out_file.close()

