# This file is meant to explore the animal toxins data.

# The file uses the following abbreviation:
# at = animal toxins
########################################################################################################################
# All imports needed are:
from requests import Session
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np
########################################################################################################################
# Load the TSV that lists, per animal toxin, its iD, species and genus.
at_file = "../../Data/derived/Animal_Toxins_Dataset_IDgenus.tsv"
# The TSV has no header row, so the column names are supplied here.
header = ["iD", "spec", "genus"]
# Parse the tab-separated file into a DataFrame using the names above.
df_at = pd.read_csv(at_file, sep="\t", header=None, names=header)
print(df_at)
# Collect every distinct value of the "spec" column as a plain list.
uniq_spec = list(df_at["spec"].unique())
# Debugging aid, uncomment to inspect the species:
# for i in uniq_spec:
#     print(i)
# print(f"the number of diffrent species are: {len(uniq_spec)}")

# Build comma-separated iD strings for the API query. A query string may not
# contain more than 100 iDs, therefore the iDs are split into packages of 100.
ids_per_package = 100
all_iDs = list(df_at["iD"])
# Stepping over the start offsets yields exactly ceil(n / 100) packages, so no
# empty trailing package (and no pointless empty request) is produced when the
# number of iDs is an exact multiple of 100.
# Each joined string stays wrapped in a one-element list, which is the shape
# the query loop iterates over and passes on as the "accession" parameter.
all_iD_string = [
    [",".join(all_iDs[start:start + ids_per_package])]
    for start in range(0, len(all_iDs), ids_per_package)
]
n_packages = len(all_iD_string)
print(f"new idea:{all_iD_string}")
# Query the EBI proteins API (UniProt data) once per package of iDs and collect
# one DataFrame per successful response.
dfs = []
# Packages whose request did not succeed; kept so an incomplete download is
# visible instead of being silently ignored.
failed_packages = []
# A single session is shared by all requests so connections can be reused.
with Session() as session:
    # Loop over all the packages of 100
    for i in all_iD_string:
        response = session.get(
            url='https://www.ebi.ac.uk/proteins/api/proteins?',
            params={'accession': i},
            headers={"Accept": "application/json"})
        if not response.ok:
            print(f"request failed with HTTP status {response.status_code}")
            failed_packages.append(i)
            continue
        print("it worked")
        # "dbReferences" and "keywords" fall back to an empty list in case a
        # record lacks them; the extraction code only iterates over them, so an
        # empty list simply contributes nothing instead of raising a KeyError.
        dfs.append(pd.DataFrame([
            {
                "id": r['accession'],
                'name': r['id'],
                "dbReferences": r.get("dbReferences", []),
                "keywords": r.get('keywords', []),
                "organism": r["organism"],
            }
            for r in response.json()
        ]))
if failed_packages:
    print(f"{len(failed_packages)} of {len(all_iD_string)} packages could not be downloaded")
if not dfs:
    # pd.concat would raise a less helpful ValueError on an empty list.
    raise ValueError("no protein records could be downloaded from the proteins API")
# Concatenate the dataframes so that one large dataframe is the result.
df = pd.concat(dfs, axis=0)  # TODO: save as csv
print(f"df is: {df}")

########################################################################################################################
# Extract the GO annotations, the species names, the taxonomy iDs and the
# keywords of the animal toxins from the queried dataframe.
# Columns are addressed by position: 2 = dbReferences, 3 = keywords, 4 = organism.
db_references_column = df.iloc[:, 2]
keywords_column = df.iloc[:, 3]
organism_column = df.iloc[:, 4]

# GO annotations: the "term" property of every database reference of type "GO".
all_go = [
    reference["properties"]["term"]
    for references in db_references_column
    for reference in references
    if reference["type"] == "GO"
]
unique_gos = list(set(all_go))
# One more than the number of unique terms: used as the number of bin edges in the histogram.
len_gos = len(unique_gos) + 1

# Species: only names of type "common" are collected (not "scientific").
all_species = [
    name["value"]
    for organism in organism_column
    for name in organism["names"]
    if name["type"] == "common"
]
uniq_species = list(set(all_species))
len_uniq_spec = len(uniq_species) + 1  # +1 for the use in the histogram

# Taxonomy identification numbers, one per record, then de-duplicated.
all_taxonomy = [organism["taxonomy"] for organism in organism_column]
uniq_tax = list(set(all_taxonomy))

# Keywords: the "value" of every keyword entry of every record.
all_keywords = [
    keyword["value"]
    for record_keywords in keywords_column
    for keyword in record_keywords
]
unique_keywords = list(set(all_keywords))
len_keywords = len(unique_keywords) + 1  # +1 for the use in the histogram

########################################################################################################################
# The next part visualizes the keywords, the GO annotations and the species.
# The bundled seaborn style sheets were renamed to "seaborn-v0_8..." in
# matplotlib 3.6 and the old "seaborn" name is not available in newer releases,
# so pick whichever name this installation provides.
seaborn_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"


def _plot_frequency(values, n_bin_edges, xlabel, title):
    """Draw one frequency histogram of *values* on a new 16x10 figure.

    values      -- list of labels to count (one entry per occurrence)
    n_bin_edges -- number of bin edges; the edges are placed at
                   -0.5, 0.5, ..., n_bin_edges - 1.5
    xlabel      -- label of the x axis
    title       -- title of the plot

    Returns the created (figure, axes) pair.
    """
    plt.style.use(seaborn_style)
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.hist(values, edgecolor='white', linewidth=1.2, bins=np.arange(n_bin_edges) - 0.5)
    # Rotate the tick labels so that long labels do not overlap.
    plt.xticks(rotation=90)
    ax.set_xlabel(xlabel, fontsize=20)
    ax.set_ylabel("frequency in the used dataset", fontsize=20)
    ax.set_title(title, fontsize=26)
    plt.tight_layout()
    return fig, ax


# Visualize the counts of each keyword for the animal toxins
fig, ax = _plot_frequency(
    all_keywords, len_keywords,
    "keywords in uniprot", "frequency of keywords in the animal toxins ")

# Repeat for the GO annotations
fig, ax = _plot_frequency(
    all_go, len_gos,
    "go annotation in uniprot", "frequency of go annotations in the animal toxins ")

# Repeat for the species
fig, ax = _plot_frequency(
    all_species, len_uniq_spec,
    "species in uniprot", "frequency of species in the animal toxins ")

# Show all three figures at once.
plt.show()

########################################################################################################################
# The next part generates the human readable search term for the uniprot website to generate the URL from.
# The search term contains two parts:
#   first the keywords: we decided on secreted proteins that are NO toxins
#   second the species: all the species are selected based on their taxonomy iD

keywords = "NOT (keyword:KW-0800) AND (keyword:KW-0964) AND "
# Joining with " OR " puts the separator only between the terms, so nothing has
# to be trimmed afterwards and no stray space is left before the closing ")".
taxonomy_ids = " OR ".join("(organism_id:" + str(tax_id) + ")" for tax_id in uniq_tax)
# Set everything together
human_readable_search = keywords + "(" + taxonomy_ids + ")"
# TODO: find out about pagination

# The human readable format is then used on the website to generate the URL.
print(human_readable_search)

