#!/usr/bin/env python3
# coding: utf-8

#######################################################################################################################
# This file downloads the AlphaFold pdb models for the sequences in a fasta file
# Input: a fasta file whose record IDs are (or contain, between "|" characters) the accession numbers
# Output: one pdb file per accession, plus a file listing the accessions that were requested


# Prefix prepended to every "Data/..." path opened by this script.
# NOTE(review): an empty prefix used to be assigned here first and was
# immediately overwritten by this value; presumably a manual toggle for
# running the script from the project root -- confirm before relying on it.
cl = "../../"

#######################################################################################################################
# Import libraries needed.
import requests
from requests import Session
import pandas as pd
import numpy as np
import re
import argparse
import random
from datetime import datetime
import csv
from pandas.io.json import json_normalize
from Bio import SeqIO
import os
from Bio.PDB import PDBIO
#######################################################################################################################
# Parse the command line: one positional argument, the path of the fasta file
# that is opened further down.
parser = argparse.ArgumentParser(
    prog="pdb_collection_part1",
    description="downloads the pdb with folding data ",
)
parser.add_argument("f", type=str, help="a fasta file")
args = parser.parse_args()

####################################################################################################################
# Timestamp (day/month/year hour:minute:second) recorded in the log entry.
dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
# Append this run's header to the shared log file. The context manager
# guarantees the entry is flushed and the handle released even if a later
# step of the script fails; the name `out_file` stays bound (to the closed
# file) so any later `out_file.close()` remains a harmless no-op.
with open(f"{cl}Data/derived/log.log", "a") as out_file:
    print("##########", file=out_file)
    print(f"program {parser.prog} was executed at {dt_string}", file=out_file)
    print(f"argument passed: {args.f}", file=out_file)
    print("number of required arguments: 1", file=out_file)


########################################################################################################################
# Derive the dataset label from the input path. Two substring checks on the
# path select one of four labels: "toxin" picks toxins vs. control, and
# "animal" picks animal vs. bacterial.
data_name1 = {
    (True, True): "animal_toxins",
    (True, False): "bacterial_toxins",
    (False, True): "animal_control",
    (False, False): "bacterial_control",
}[("toxin" in args.f, "animal" in args.f)]


# Open the fasta file and collect, for every record ID, its description line
# and its sequence as a plain string.
with open(args.f) as handle:
    fasta_records = {
        record.id: [record.description, str(record.seq)]
        for record in SeqIO.parse(handle, "fasta")
    }

# One row per record, indexed by record ID. Building the frame with explicit
# column names also works for an empty fasta file, where assigning the headers
# afterwards would fail with a length mismatch.
header = ["info", "seq"]  # define headers
dfa = pd.DataFrame.from_dict(fasta_records, orient="index", columns=header)

# Record IDs may be "clean" accessions or carry the accession between pipes
# (e.g. "sp|P12345|NAME"). Decide per ID, so a file mixing both styles no
# longer crashes on the IDs that have no pipes; IDs that do not match the
# pattern are used unchanged.
accession_pattern = re.compile(r"\|(.*)\|")
uniq_accession = []
for record_id in dfa.index:
    accession_match = accession_pattern.search(record_id)
    if accession_match:
        uniq_accession.append(accession_match.group(1))
    else:
        uniq_accession.append(record_id)

# Write the list of accession IDs, one per line. The context manager closes
# the file as soon as the list is written instead of keeping it open for the
# whole download phase; `file1` stays bound to the closed file, so a later
# `file1.close()` is a harmless no-op.
with open(f"{cl}Data/derived/folds/{data_name1}_indeces_file", "w") as file1:
    for accession in uniq_accession:
        print(accession, file=file1)



# Download the AlphaFold model of every accession into the raw folds folder.
#
# The download goes through `requests` rather than a shell `curl` command:
#   * a failed request is actually detected (a shell command run through
#     os.system never raises, so the failure message could never be printed),
#   * a missing model no longer leaves the server's error page saved as
#     "<accession>.pdb",
#   * accession text taken from the fasta file is never interpreted by a shell.
fold_dir = f"{cl}Data/raw/folds/{data_name1}_folds"
os.makedirs(fold_dir, exist_ok=True)  # make sure the target folder exists

try:
    # One session so the connection is reused across all downloads.
    with requests.Session() as session:
        for i in uniq_accession:
            # Construct the url to download the PDB
            model_url = f'https://alphafold.ebi.ac.uk/files/AF-{i}-F1-model_v4.pdb'
            try:
                response = session.get(model_url, timeout=60)
                response.raise_for_status()  # HTTP 4xx/5xx -> HTTPError
            except requests.RequestException:
                print(f"the accession {i} was not able to be linked to a Alphafold")
                continue
            # save the results
            with open(f"{fold_dir}/{i}.pdb", "wb") as pdb_file:
                pdb_file.write(response.content)
finally:
    # Release the handles opened earlier in the script, even if a download
    # step raised; closing an already-closed file is a no-op.
    file1.close()
    out_file.close()



