# This file downloads all secreted AND non-toxic proteins
import atexit

import numpy as np
import pandas as pd
import requests
####################################################################################################################
# What you need to run this script:
# The results of the previous analysis: which species you have and what their unique species IDs are. Attention: not the protein ID but the
# identification number of the species. In the JSON file it is named taxonomy or similar.
# Then build the human readable search query based on the species IDs
# The result looks something like the second line:
# #NOT (keyword:KW-0800) AND (keyword:KW-0964) AND( (organism_id:42752) OR (organism_id:529024) OR (organism_id:8707) OR (organism_id:1437190) OR (organism_id:329991) OR (organism_id:130704) OR (organism_id:7446) OR (organism_id:7447) OR (organism_id:8729) OR (organism_id:8730) OR (organism_id:7452) OR (organism_id:8732) OR (organism_id:8736) OR (organism_id:8738) OR (organism_id:7460) OR (organism_id:8741) OR (organism_id:8742) OR (organism_id:8740) OR (organism_id:8744) OR (organism_id:9387) OR (organism_id:8620) OR (organism_id:31150) OR (organism_id:91439) OR (organism_id:31921) OR (organism_id:31155) OR (organism_id:128077) OR (organism_id:72279) OR (organism_id:1808362) OR (organism_id:30192) OR (organism_id:8697) )
# Use this human readable search term to get the URL from UniProt

####################################################################################################################
# The next code part defines where the log file and the fasta file are stored.
# Both handles stay open as module-level names so that later code can write to them.
# They are registered with atexit so they are flushed and closed when the interpreter
# exits, even if nothing closes them explicitly. Closing an already closed file is a
# no-op, so an explicit close() later on remains safe.

# Define where to write the log (append mode keeps the entries of earlier runs).
log_file = open("../../Data/derived/log.log", "a")  # if this file is to run with make, change the address
atexit.register(log_file.close)

# Define where to write the fasta_file
# NOTE(review): mode "w" truncates an existing file as soon as this line runs, even
# when no sequences are written afterwards -- confirm that this is intended.
fasta_file = open("../../Data/derived/fasta_clean_controlProteinsAnimals", "w")  # if this file is to run with make, change the address
atexit.register(fasta_file.close)

# log:
print("####\nthe program: api_uniprot_to_fasta was executed", file=log_file)


####################################################################################################################
# The next code section requests the non-toxic, secreted proteins from the same species as the toxin set.
# First download all the info in JSON format.
# The query part of the URL is the URL-encoded form of the human readable search term
# (NOT keyword KW-0800, AND keyword KW-0964, AND one of the listed organism IDs).
UNIPROT_STREAM_URL = "https://rest.uniprot.org/uniprotkb/stream?format=json&query=%28NOT%20%28keyword%3AKW-0800%29%20AND%20%28keyword%3AKW-0964%29%20AND%28%20%28organism_id%3A42752%29%20OR%20%28organism_id%3A529024%29%20OR%20%28organism_id%3A8707%29%20OR%20%28organism_id%3A1437190%29%20OR%20%28organism_id%3A329991%29%20OR%20%28organism_id%3A412038%29%20OR%20%28organism_id%3A130704%29%20OR%20%28organism_id%3A8724%29%20OR%20%28organism_id%3A537493%29%20OR%20%28organism_id%3A7446%29%20OR%20%28organism_id%3A7447%29%20OR%20%28organism_id%3A88087%29%20OR%20%28organism_id%3A8729%29%20OR%20%28organism_id%3A8730%29%20OR%20%28organism_id%3A96794%29%20OR%20%28organism_id%3A7452%29%20OR%20%28organism_id%3A8732%29%20OR%20%28organism_id%3A8605%29%20OR%20%28organism_id%3A8726%29%20OR%20%28organism_id%3A8736%29%20OR%20%28organism_id%3A8738%29%20OR%20%28organism_id%3A7460%29%20OR%20%28organism_id%3A8741%29%20OR%20%28organism_id%3A8742%29%20OR%20%28organism_id%3A8740%29%20OR%20%28organism_id%3A8744%29%20OR%20%28organism_id%3A9258%29%20OR%20%28organism_id%3A9387%29%20OR%20%28organism_id%3A8620%29%20OR%20%28organism_id%3A31150%29%20OR%20%28organism_id%3A91439%29%20OR%20%28organism_id%3A31921%29%20OR%20%28organism_id%3A31155%29%20OR%20%28organism_id%3A129461%29%20OR%20%28organism_id%3A8633%29%20OR%20%28organism_id%3A8637%29%20OR%20%28organism_id%3A196418%29%20OR%20%28organism_id%3A8649%29%20OR%20%28organism_id%3A111177%29%20OR%20%28organism_id%3A709964%29%20OR%20%28organism_id%3A128077%29%20OR%20%28organism_id%3A35670%29%20OR%20%28organism_id%3A72279%29%20OR%20%28organism_id%3A8665%29%20OR%20%28organism_id%3A292442%29%20OR%20%28organism_id%3A8667%29%20OR%20%28organism_id%3A8668%29%20OR%20%28organism_id%3A8670%29%20OR%20%28organism_id%3A132961%29%20OR%20%28organism_id%3A1808362%29%20OR%20%28organism_id%3A8554%29%20OR%20%28organism_id%3A30192%29%20OR%20%28organism_id%3A54390%29%20OR%20%28organism_id%3A8694%29%20OR%20%28organism_id%3A8697%29%29%29"
# (connect, read) timeouts in seconds: without them a stalled connection would make
# the script hang forever instead of failing.
response = requests.get(UNIPROT_STREAM_URL, timeout=(10, 300))
# Stop with a clear requests.HTTPError on a 4xx/5xx answer instead of failing later
# with a confusing JSON or KeyError while reading an error body.
response.raise_for_status()
# Transform the JSON to a dict and select the results
results = response.json()["results"]

# Make a simple dataframe that contains everything, just to check out what columns you have
# df=pd.json_normalize(results,meta=["primaryAccession","uniProtkbId",["sequence","value"]],record_path=["keywords"])
# # Check the columns for better accessibility
# print(df.columns)
# df.drop(["evidences","category","name"],axis=1,inplace=True)
# df.rename(columns={"id":"keyword"},inplace=True)
# print(df.head())
# # Create a second dataframe that only contains the relevant information for us
# Column order of the summary table. Passing it explicitly to the DataFrame keeps the
# columns present even when the query returned no entries (otherwise an empty result
# gives a frame without columns and any later column access raises a KeyError).
DF2_COLUMNS = ["id", "name", "keywords", "sequence", "organism"]

# Reduce the keyword records of every entry to the plain list of keyword names.
# Built with comprehensions instead of repeated `list = list + [item]`, which copies
# the whole list on every step.
keys_per_prot = [
    [keyword["name"] for keyword in entry["keywords"]]
    for entry in results
]

# One row per UniProt entry: accession, entry name, keyword names, sequence and species.
df2 = pd.DataFrame(
    [
        {
            "id": entry["primaryAccession"],
            "name": entry["uniProtkbId"],
            "keywords": keyword_names,
            "sequence": entry["sequence"]["value"],
            "organism": entry["organism"]["scientificName"],
        }
        for entry, keyword_names in zip(results, keys_per_prot)
    ],
    columns=DF2_COLUMNS,
)
########################################################################################################################
#
# ########################################################################################################################
# # The next code section created the fasta file and logs the information in the log file.
#
# # Create the fasta file with the iDs, species and sequence only
# #create three lists and always take the same element fromm the lists, as a simple list comprehension loop will interate over the columns
# ids=df2["id"].values.tolist()
# organism=df2["organism"].values.tolist()
# sequence=df2["sequence"].values.tolist()
# len_fasta=len(ids)
# for i in np.arange(len_fasta):
#     # Use two seperate print statements instead of \n to not get empty lines
#     print(f"> {ids[i]} [{organism[i]}]",file=fasta_file)
#     print(sequence[i],file=fasta_file)
# # logs:
# print(f"created file fasta_clean_controlProteinsAnimals: ",file=log_file)
# print(f"fasta_clean_controlProteinsAnimals was stored under derived data",file=log_file)
# log_file.close()
# fasta_file.close()
# m = re.search("\/derived\/(.*)", args.fasta)
# print(m)
# name_data = m.group(1)
# print(name_data)