#!/usr/bin/env python3
# coding: utf-8

#######################################################################################################################
# This file visualizes the keywords and GO annotations of the proteins in a provided dataset
# Input: a fasta file with sequences and protein IDs
# Output: histogram of the keywords of the provided protein IDs
#         histogram of the GO annotations of the provided protein IDs

#######################################################################################################################
# Import libraries needed.
import requests
from requests import Session
import pandas as pd
import numpy as np
import re
import argparse
import random
from datetime import datetime
import csv
from Bio import SeqIO
import matplotlib.pyplot as plt

#######################################################################################################################
# Get the arguments from the command line
# Command-line interface: a single positional argument, the path to the fasta file.
# The description previously described a different program (signal peptide
# extraction); it now matches what this script does.
# NOTE(review): prog is hard-coded and is also written to the log file below;
# confirm that it matches this script's actual file name.
parser = argparse.ArgumentParser(
    prog="signal_p_extraction_part2.py",
    description="plot histograms of the UniProt keywords and GO annotations "
                "of the proteins listed in a fasta file")

parser.add_argument("fasta",
                    type=str,
                    help="a fasta file with protein IDs")
args = parser.parse_args()

####################################################################################################################
# Open and write to the log file
# The log file is opened in append mode and stays open until the end of the
# script, where the created figures are logged and the file is closed.
out_file = open("../../Data/derived/log.log", "a")
dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")

# Header of this run's log entry, one list item per log line.
log_header = [
    "##########",
    f"program {parser.prog} was executed at {dt_string}",
    f"argument passed:  {args.fasta}",
    "number of required arguments: 1",
    "argument should contain: a fasta file with protein IDs ",
]
for entry in log_header:
    print(entry, file=out_file)

########################################################################################################################
# Get the file from the command line.
## Open the provided files
tox_file = args.fasta

# Stop right away when the wrong file type is provided. Previously only a
# message was printed and the script continued until it crashed with a
# NameError on the undefined IDs.
if "fasta" not in tox_file:
    raise SystemExit(f"error, the file type must be fasta, got: {tox_file}")

# Open and read the fasta file: record ID -> [description, sequence]
with open(tox_file) as handle:
    records = {record.id: [record.description, str(record.seq)]
               for record in SeqIO.parse(handle, "fasta")}

# An empty file would otherwise fail below with a column-length mismatch.
if not records:
    raise SystemExit(f"error, no fasta records found in: {tox_file}")

# One row per protein ID
df_at = pd.DataFrame(records).T
# Introduce column headers
df_at.columns = ["species", "sequence"]
# Keep only the part of the description after the second space.
# NOTE(review): assumes the description looks like "<id> <field> <species ...>";
# if no record has a third field, column 2 does not exist and this raises a KeyError.
df_at["species"] = df_at["species"].str.split(" ", n=2, expand=True)[2]
# Get the protein IDs
IDs = df_at.index


# ####################################################################################################################
## Query additional information such as the keywords and the GO annotations from the UniProt API.
## The API accepts a single string with the IDs separated by commas. The string may not contain more than
## 100 IDs at a time, therefore make packages of 100 IDs at a time.

# Not used in the visible code; kept in case other code refers to it.
all_iDs = []

# Maximum number of IDs per request.
batch_size = 100

# Split the IDs into packages of at most batch_size and join each package into
# one comma-separated string. Stepping through the IDs in strides of batch_size
# never produces an empty package, unlike the previous int(len/100)+1 count,
# which added an empty one whenever the number of IDs was a multiple of 100.
# Each entry is a one-element list holding the joined string, the shape the
# request loop below passes on as the 'accession' parameter.
all_iD_string = [
    [",".join(IDs[start:start + batch_size])]
    for start in range(0, len(IDs), batch_size)
]
n_packages = len(all_iD_string)

# Query using an API of uniprot and EMBL.
dfs = []
# A single session is used for all packages so the connection can be reused.
with Session() as session:
    # Loop over all the packages of 100
    for accession_batch in all_iD_string:
        # A package without any ID has nothing to look up, so no request is sent.
        if not any(accession_batch):
            continue
        response = session.get(
            url='https://www.ebi.ac.uk/proteins/api/proteins?',
            params={'accession': accession_batch},  # one package of up to 100 IDs
            headers={"Accept": "application/json"},
            timeout=60)  # do not wait forever on an unresponsive server
        if not response.ok:
            print("An error occrred:", response.status_code)
            continue
        print("it worked")
        # The column order matters: later code selects dbReferences and
        # keywords by position (columns 2 and 3).
        # "dbReferences" and "keywords" default to an empty list so that an
        # entry lacking either field does not abort the whole run.
        dfs.append(pd.DataFrame([{"id": r['accession'],
                                  'name': r['id'],
                                  "dbReferences": r.get("dbReferences", []),
                                  "keywords": r.get('keywords', []),
                                  "organism": r["organism"]}
                                 for r in response.json()]))

# pd.concat fails with an unspecific error on an empty list, so stop with a
# clear message when no package could be retrieved.
if not dfs:
    raise SystemExit("error, no protein information could be retrieved from the API")

# Concatenate the dataframes so that one large dataframe is the result.
df = pd.concat(dfs, axis=0)
print(f"df is: {df}")


########################################################################################################################
# The next code part extracts the GO annotations and the keywords from the dataframe
# go annotations
# GO annotations: column 2 holds, per protein, a list of database references;
# keep the term of every reference whose type is "GO".
go_column = df.iloc[:, 2]
all_go = [
    reference["properties"]["term"]
    for references in go_column
    for reference in references
    if reference["type"] == "GO"
]
# The number of unique GO terms determines the number of bins in the histogram.
unique_gos = list(set(all_go))
len_gos = len(unique_gos) + 1  # +1 for the use in the histogram

# Keywords: column 3 holds, per protein, a list of keyword entries;
# keep the value of every entry.
keyword_column = df.iloc[:, 3]
all_keywords = [
    entry["value"]
    for entries in keyword_column
    for entry in entries
]
# Same as above: unique keywords determine the number of bins.
unique_keywords = list(set(all_keywords))
len_keywords = len(unique_keywords) + 1  # +1 for the use in the histogram

#######################################################################################################################
# Extract the name from the input for saving the figures
# Take everything between "/derived/" and the last dot of the input path.
# The pattern is a raw string because "\/" and "\." are not valid escape
# sequences in a normal Python string literal.
m = re.search(r"/derived/(.*)\.", args.fasta)
if m:
    savename = m.group(1)
else:
    # The input is not stored under ".../derived/": fall back to the file name
    # without its directory and last extension instead of failing on m being None.
    savename = args.fasta.replace("\\", "/").rsplit("/", 1)[-1].rsplit(".", 1)[0]

########################################################################################################################
# The next part visualizes the keywords and the go annotations
# keywords
def _save_histogram(values, n_bin_edges, xlabel, title, path):
    """Draw a histogram of the categorical *values* and save it to *path*.

    values: list of strings, one entry per occurrence.
    n_bin_edges: number of bin edges, i.e. the number of unique values + 1.
    xlabel, title: axis label and title of the figure.
    path: file the figure is written to.
    Returns the created figure and axes; the figure is left open.
    """
    fig, ax = plt.subplots(figsize=(16, 10))
    # Edges at -0.5, 0.5, 1.5, ... give one bin per distinct value.
    ax.hist(values, edgecolor='white', linewidth=1.2,
            bins=np.arange(n_bin_edges) - 0.5)
    plt.xticks(rotation=90)
    ax.set_xlabel(xlabel, fontsize=20)
    ax.set_ylabel("frequency in the used dataset", fontsize=20)
    ax.set_title(title, fontsize=26)
    fig.tight_layout()
    fig.savefig(path)
    return fig, ax


# The style "seaborn" was renamed to "seaborn-v0_8" in matplotlib 3.6 and the
# old name no longer works in newer releases; use whichever name is available.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")

# NOTE: the output paths are relative to the current working directory.
# keywords
fig, ax = _save_histogram(all_keywords,
                          len_keywords,
                          "keywords in uniprot",
                          f"frequency of keywords in file: {savename}  ",
                          f"../../Figures/{savename}_keywords_histogram.png")

# GO annotations
fig, ax = _save_histogram(all_go,
                          len_gos,
                          "go annotation in uniprot",
                          f"frequency of go annotations in file: {savename} ",
                          f"../../Figures/{savename}_go_histogram.png")
plt.show()

##################################################################################################################
# log what you created
# logs:
# Record the created figures and where they were stored, then close the log.
out_file.writelines([
    f"created fasta files: {savename}_go_histogram.png and {savename}_keywords_histogram.png  \n",
    "files were stored under Figures\n",
])
out_file.close()
# Do I have to close the figures as well?
#og_seq_file.close()
#new_seq_file.close()
