#! /usr/bin/env python3

import argparse
import re
import os.path
import myprint as p #The myprint.py script HAS to be in same folder where this script is stored.

#import taxonomy as ta # To activate as soon as the taxonomy in the NCBI has been updated.

'''
Purpose: read multifasta files and extract their information to produce cleaner, more understandable fasta files.
'''

#Add arguments parser
# in configurations:
    #For the baseDatabase.fasta
    #--file "/media/luisa/HDD_extension/databases/ExotoxinsBacteriaOnly/finalFastaCollection/baseDatabase.fasta"
    # for the baseControlToxins.fasta
    #--file "/media/luisa/HDD_extension/databases/ExotoxinsBacteriaOnly/NegativeControls/controlToxinsFiles/baseControlToxins.fasta"
    # for the baseControlProteins.fasta
    #--file "/media/luisa/HDD_extension/databases/ExotoxinsBacteriaOnly/NegativeControls/controlProteinsFiles/controlProteins.fasta"
    # For all fasta sequences, to create the list of all genus and accession numbers for analysis in R
    #--file "/media/luisa/HDD_extension/databases/ExotoxinsBacteriaOnly/finalFastaCollection/BaseAnalysis/mergeAll_fasta_clean.fasta"
    # For all fasta sequences (ORIGINAL FORM), to create the list of all genus and accession numbers for analysis in R
    #--file "/media/luisa/HDD_extension/databases/ExotoxinsBacteriaOnly/finalFastaCollection/BaseAnalysis/AllFastaDatasetsMerged.fasta"
    # For all fasta sequences filtered by CD-HIT
    #--file "/media/luisa/HDD_extension/databases/ExotoxinsBacteriaOnly/SecondProteinDownload/downloadsTemp/allSets_CDHIT100Filtered.fasta"



# parser = argparse.ArgumentParser()
# parser.add_argument("--directory", type=str, help="Location of directory with Data. Files to transform have to be a multi-sequence fasta format.", required=True)
# #In this case is not necessary, since the output is now defined to a folder called derivedData under the /Exotoxins/Data/derivedData
# #parser.add_argument("--output", type = str, required = True, help= "Location where output file as multiple fasta should be stored")
#
# #during debugging use the following path
#
# dir_rawData = "/home/"
#
#
# args = parser.parse_args()
# dir_rawData = args.directory + "/rawData"

# #search for raw
# fileFasta = args.file
# fileName = os.path.basename(fileFasta)
# print(fileName)

#fileOutput = args.output
#locationPattern = '.+\/'
#fileOut= re.search(locationPattern,fileFasta)
#fileOutput = fileFasta[fileOut.start():fileOut.end()]

#Option using os.path
#fileOutput = os.path.dirname(fileFasta)
#fileFasta = "/home/luisa/PycharmProjects/SignalSequences/FastaCleaning/testUniProtFasta10.fasta" # For test only


# Patterns that isolate the species name inside the species text of a header.
# They are applied to the text found by the format-specific species patterns.
_patternSpeciesOnly = re.compile(r'\w+\s\w+[\s|\.]')
_patternSpeciesPerfect = re.compile(r'\w+\s\w+')

# Patterns for sequences with the form >ge|<number>|Species_name from only partially annotated proteins
_patternAccNumGe = re.compile(r'\|\S+\|')
_patternSpeciesGe = re.compile(r'\|[A-Za-z]+_[A-Za-z]+')

# Patterns for sequences with the form >gi|<number>|AC| from the NCBI
_patternAccNumGi = re.compile(r'\|\w+\.\d+\|')
_patternSpeciesGi = re.compile(r'\[\w+\s\w+.+')
_patternGeneNameGi = re.compile(r'\s.+\[')

# Patterns for sequences with the form >AC [Species]
_patternAccNumNCBI = re.compile(r'\>\w+\.\d')
_patternSpeciesNCBI = re.compile(r'\[.\w+.+')
_patternGeneNameNCBI = re.compile(r'[.]\d.+\[')

# Patterns for sequences with the form >sp|AC|ShortProtName NameGene OS=<NAME SPECIES> OX=Number from UniProt
_patternAccUniProt = re.compile(r'\|\w+\|')
_patternSpeciesUniProt = re.compile(r'=.+\sO')
_patternGeneNameUniProt = re.compile(r'[A-Z0-9]\s.+\sOS')


def _searchOrFail(pattern, line, what):
    """Return the match of pattern in line, or raise ValueError naming the missing field."""
    found = pattern.search(line)
    if found is None:
        raise ValueError("Could not find the " + what + " in the fasta header: " + line.strip())
    return found


def _parseGeHeader(line):
    """Parse a >ge|<number>|Genus_species header into (accession, species, geneName, source)."""
    accMatch = _searchOrFail(_patternAccNumGe, line, "accession number")
    speciesMatch = _searchOrFail(_patternSpeciesGe, line, "species")
    accession = accMatch.group()[1:-1]
    # The species pattern is anchored on the preceding "|"; drop that delimiter.
    species = speciesMatch.group()[1:].rstrip()
    return accession, species, "unknown", "unknown"


def _parseGiHeader(line):
    """Parse an NCBI >gi|...|accession.version| header into (accession, species, geneName, source)."""
    accMatch = _searchOrFail(_patternAccNumGi, line, "accession number")
    geneMatch = _searchOrFail(_patternGeneNameGi, line, "gene name")

    species = ""
    speciesMatch = _patternSpeciesGi.search(line)
    if speciesMatch is not None:
        speciesText = speciesMatch.group()
        shortMatch = _patternSpeciesOnly.search(speciesText)
        if shortMatch is None:
            # Always matches here: the gi species pattern already requires two words.
            species = _patternSpeciesPerfect.search(speciesText).group()
        else:
            species = shortMatch.group().strip()

    accession = accMatch.group()[1:-1]
    # Drops the leading whitespace character and the closing "["; the space
    # in front of the "[" is kept in the gene name.
    geneName = line[geneMatch.start() + 1: geneMatch.end() - 1]
    return accession, species.rstrip(), geneName, "NCBI"


def _parseUniProtHeader(line):
    """Parse a UniProt >sp|AC|... or >tr|AC|... header into (accession, species, geneName, source)."""
    accMatch = _searchOrFail(_patternAccUniProt, line, "accession number")
    geneMatch = _searchOrFail(_patternGeneNameUniProt, line, "gene name")

    unrecognized = "There was an unrecognized species format in the fasta input at line: " + line
    speciesMatch = _patternSpeciesUniProt.search(line)
    if speciesMatch is None:
        species = ""
        print(unrecognized)
    else:
        # The species pattern ends on the "O" of the next field; leave it out.
        speciesText = line[speciesMatch.start():speciesMatch.end() - 1]
        shortMatch = _patternSpeciesOnly.search(speciesText)
        if shortMatch is None:
            # Fall back to the raw text after "OS=" instead of failing.
            species = speciesText.lstrip("=")
            print(unrecognized)
        else:
            species = shortMatch.group().strip()

    accession = accMatch.group()[1:-1]
    # Skips the matched character plus whitespace at the front and "OS" at the
    # end; the space in front of "OS" is kept in the gene name.
    geneName = line[geneMatch.start() + 2: geneMatch.end() - 2]
    return accession, species.rstrip(), geneName, "UniProt"


def _parseNcbiHeader(line):
    """Parse an NCBI >accession.version name [Species] header into (accession, species, geneName, source)."""
    accMatch = _searchOrFail(_patternAccNumNCBI, line, "accession number")
    geneMatch = _searchOrFail(_patternGeneNameNCBI, line, "gene name")

    species = ""
    speciesMatch = _patternSpeciesNCBI.search(line)
    if speciesMatch is not None:
        # Leaves out the last matched character (the closing "]" in a regular header).
        speciesText = line[speciesMatch.start():speciesMatch.end() - 1]
        shortMatch = _patternSpeciesOnly.search(speciesText)
        if shortMatch is None:
            species = speciesText.strip("[")
        else:
            species = shortMatch.group()

    # Skips ">" and takes one character past the first version digit; the
    # caller strips surrounding whitespace from the accession.
    accession = line[accMatch.start() + 1:accMatch.end() + 1]
    # Skips ".<digit><separator>" at the front and the " [" at the end.
    geneName = line[geneMatch.start() + 3: geneMatch.end() - 2]
    return accession, species.rstrip(), geneName, "NCBI"


def _storeRecord(fastaDict, record, chunks):
    """Store a finished record; records without any following line are not stored."""
    if record is None or not chunks:
        return
    accession, species, geneName, typeToxin, source = record
    fastaDict[accession] = (species, "".join(chunks), geneName, typeToxin, source)


def multiFastaRead(file):
    """
    Read a labelled multifasta protein file into a dictionary keyed by accession number.

    Lines whose first non-blank character is "#" carry the label (type of protein) that applies to the
    records that follow. Lines containing ">" are headers; every other line is sequence data of the record
    that is currently open. The supported header formats are UniProt (>sp| / >tr|), NCBI with pipes
    (>gi|...), NCBI without pipes (>accession.version ...) and the stand-in >ge|<number>|Genus_species form.
    It is for proteins, NOT for DNA.

    Handling of special cases:
      * Records whose header contains "pdb|" are counted and skipped together with their sequence lines.
      * Headers containing "|" that match none of the known formats are skipped with a printed warning
        (they still count towards the total number of sequences).
      * Lines found before the first header, or between a label line and the next header, are ignored.
      * A header that is directly followed by another header, a label or the end of the file (no line
        in between) is not stored.
      * If an accession occurs more than once, the last record wins.

    A summary with the number of sequences per format is printed at the end.

    :param file: global path/or relative path
    :return: dictionary with {accession: (species, sequence, geneName, typeToxin, source)}
    :raises ValueError: if the accession number or the gene name cannot be found in a header of a
        recognized format.
    """
    # Dict where information will be stored while parsing a multifasta file
    fasta_dict = {}

    counterSeqs = 0
    counterPDBseqs = 0
    counterUniProtseqs = 0
    counterGISeq = 0
    counterNCBIseq = 0
    typeToxin = ""
    record = None  # header fields of the record that is currently open
    chunks = []    # sequence lines collected for the open record

    with open(file) as fileIn:
        for line in fileIn:
            isLabel = line.lstrip().startswith("#")
            isHeader = not isLabel and ">" in line

            if not (isLabel or isHeader):
                # Sequence data: only meaningful while a record is open.
                if record is not None:
                    chunks.append(line.strip())
                continue

            # A label or a new header closes the record read so far.
            _storeRecord(fasta_dict, record, chunks)
            record, chunks = None, []

            if isLabel:
                typeToxin = line.strip().strip("#").strip()
                continue

            if "pdb|" in line:
                counterPDBseqs += 1
                continue

            if ">ge|" in line:  # stand-in format for UniProt accessions that don't follow the regular format
                fields = _parseGeHeader(line)
                counterGISeq += 1  # ge records are reported together with the GI records
            elif ("|" in line and "gi" in line) and ("sp|" not in line and "tr|" not in line):  # Only gi format
                fields = _parseGiHeader(line)
                counterGISeq += 1
            elif "|" in line and (">sp" in line or ">tr" in line):  # It is a UniProt sequence
                fields = _parseUniProtHeader(line)
                counterUniProtseqs += 1
            elif "|" not in line:
                fields = _parseNcbiHeader(line)
                counterNCBIseq += 1
            else:
                fields = None
                print("Skipping fasta header with an unrecognized format: " + line.strip())
            counterSeqs += 1

            if fields is not None:
                accession, species, geneName, source = fields
                record = (accession.strip(), species, geneName, typeToxin, source)

    # The last record of the file has no following header to close it.
    _storeRecord(fasta_dict, record, chunks)

    print("The total number of sequences are " + str(counterSeqs))
    print("The total number of PDB sequences are "+str(counterPDBseqs))
    print("The total number of GI sequences are "+str(counterGISeq))
    print("The total number of UniProt sequences are " + str(counterUniProtseqs))
    print("The total number of NCBI plain sequences are " + str(counterNCBIseq))

    return fasta_dict

# testing
#multiFastaRead("/home/agjimenez/Documents/02_analysis_animal_toxins/Data/raw/placeholder_animal_toxins.fasta")
#HERE WERE ORIGINALLY THE PRINT FUNCTIONS NOW IN MYPRINT.PY

######################################################################################################################
#################################  USAGE OF FUNCTIONS FOR READING FASTA FILES  #######################################
######################################################################################################################
#uniprotFasta = multiFastaRead(fileFasta)

#(The parameters used last time:
# --file "/media/luisa/HDD_extension/databases/ExotoxinsBacteriaOnly/SecondProteinDownload/downloadsTemp/allSets_CDHIT100Filtered.fasta")
#--file "/media/luisa/HDD_extension/databases/ExotoxinsBacteriaOnly/CleanUpForPublication/newTXUpdatedAndClassifiedAsFile/mergeNewToxinsAndTypes.fasta"
#--file "/media/luisa/HDD_extension/databases/ExotoxinsBacteriaOnly/Uniprot_Download/fullMergeExotoxinsUniprot.fasta"
#--file "/media/luisa/HDD_extension/databases/ExotoxinsBacteriaOnly/finalFastaCollection/baseDatabase.fasta"

#print (len(uniprotFasta))
#p.printFasta(uniprotFasta, fileOutput)

#Commands to create the file with the ID and type used later for classification table.
#outputIDType = fileOutput+"/ID_Type.tsv"
#p.printDict2(uniprotFasta, 0, 3, "\t", outputIDType)


#fastaDict = multiFastaRead(fileFasta, fileOutput)


#printDict(fastaDict,0)

#taxaDic = dictTaxa("/media/luisa/HDD_extension/databases/uniqueListBac_120Tax.txt")
#printDict(taxaDic, 0, 1, 2, 3, 4, "\t", "/media/luisa/HDD_extension/databases/listTotalTaxaBac_120.tab")


def tableAccessionAndTaxa(fastaDict, taxaDict, outputFile="speciesTaxa.txt"):
    """
    Write a tab separated table that links every accession number with its taxonomy.

    A fasta record is linked to a taxa entry when the genus of the taxa entry occurs as a substring of the
    species of the record. If several taxa entries match the same accession, the last one wins.

    :param fastaDict: dictionary {accession: values}; only values[0] (the species) is read.
    :param taxaDict: dictionary whose values are indexed as (subgroup, group, genus, taxID); the keys are
        not used.
    :param outputFile: path of the table to write. Defaults to "speciesTaxa.txt" in the working directory.
    :return: None. The table is written to outputFile.
    """
    accTaxaDict = {}
    for taxaValue in taxaDict.values():
        subgroup = taxaValue[0]
        group = taxaValue[1]
        genus = taxaValue[2]
        taxID = taxaValue[3]
        for accession, fastaValue in fastaDict.items():
            species = fastaValue[0]
            if genus in species:
                accTaxaDict[accession] = (species, genus, subgroup, group, taxID)

    # The context manager closes the file even if writing a row fails.
    with open(outputFile, "w") as tableFile:
        # printing the first line (header)
        print("AccessionNr" + "\t" + "Species" + "\t" + "genus" + "\t" + "subgroup" + "\t" + "group" + "\t" + "taxID",
              file=tableFile)
        # Printing the dictionary to file; str() lets non-string fields (e.g. a numeric taxID) be written.
        for accession, row in accTaxaDict.items():
            print("\t".join(str(field) for field in (accession,) + row), file=tableFile)



#gene="SecA"
#fastaDictbyGene= selectFastaAlignmentByGene(fastaDict,gene)

#selectFastaAlignmentByGenus(fastaDictbyGene)

# Genus at the start of a plain species name ("Genus species ...").
_taxaPatternGenus = re.compile(r'[A-Z][a-z]+\s')
# Genus written inside square brackets ("[Genus] species ...").
_taxaPatternGenusBracket = re.compile(r'\[[A-Z][a-z]+')


def createDictTaxa(list):
    """
    Build a genus -> (group, subgroup) dictionary from a tab separated list file.

    Lines containing "#" and blank lines are skipped, and so are lines whose first column contains a
    single quote. The genus is taken from the first column; when no genus can be recognized the entry is
    stored under the key "undetermined". If a genus occurs on several lines, the last line wins.

    :param list: path of the tab separated file. Column 0 holds the species name, column 4 the group and
        column 5 the subgroup. (The name shadows the builtin; it is kept so keyword calls keep working.)
    :return: dictionary {genus: (group, subgroup)}
    :raises IndexError: if a non-blank data line has fewer than six columns.
    """
    taxaDict = {}

    # Open the list file
    with open(list, "r") as listProk:
        for line in listProk:
            if "#" in line or not line.strip():
                continue

            columns = line.split("\t")
            species = columns[0]
            if "'" in species:
                continue

            genusSearch = _taxaPatternGenus.search(species)
            if genusSearch is not None:
                # The pattern includes the whitespace after the genus; leave it out.
                genusName = genusSearch.group()[:-1]
            else:
                bracketSearch = _taxaPatternGenusBracket.search(species)
                if bracketSearch is None:
                    genusName = "undetermined"
                else:
                    # Leave out the opening "[".
                    genusName = bracketSearch.group()[1:]

            taxaDict[genusName] = (columns[4], columns[5])

    return taxaDict

# HERE WAS ORIGINALLY  THE FUNCTION FOR THE EXTRACTION OF THE GENUS NAME FROM THE SPECIES (extractGenus)
# NOW PART OF THE taxonomy.py module

# taxaDictionary = createDictTaxa("prokaryotes.txt")
# taxaFile = open("./TaxaTable.tsv", "w+" )
# print ("Genus" +"\t" + "Subgroup" + "\t" + "Group", file=taxaFile)
# for key,value in taxaDictionary.items():
#     print(key +"\t" + value[0] +"\t"+ value[1], file=taxaFile)
# taxaFile.close()

############ Copied from taxonomy.py to be able to run it without conflicts ##################
# First capitalized word of a species name.
_genusNamePattern = re.compile(r'[A-Z][a-z]+')
# Capitalized word that follows the "Candidatus" prefix.
_candidatusGenusPattern = re.compile(r'[Cc]andidatus\s([A-Z][a-z]+)')


def extractGenus(species):
    """
    Extract the genus from the name of the species
    :param species: String representing the name of the species. Composed by genus and another name.
    Careful: Some genus are named as "Candidatus". For those the genus is the capitalized word that
    follows the "Candidatus" prefix, wherever the prefix is located in the name.
    :return: string corresponding to the genus, or "undetermined" when no genus can be recognized.
    """
    candidatusSearch = _candidatusGenusPattern.search(species)
    if candidatusSearch is not None:
        return candidatusSearch.group(1)

    if "Candidatus" in species:
        # The prefix is present but no genus follows it; the generic pattern
        # below would wrongly report "Candidatus" itself as the genus.
        return "undetermined"

    genusSearch = _genusNamePattern.search(species)
    if genusSearch is None:
        return "undetermined"
    return genusSearch.group()


def multiFastaReadGenus(file, fileOut):
    """
    Read a multifasta file and add the genus to every record.
    The file is parsed with multiFastaRead() of this module and the genus is derived from the species
    of each record with extractGenus().
    :param file: Multifasta file, passed on unchanged to multiFastaRead().
    :param fileOut: Not used inside this function; it is part of the signature so existing calls keep working.
    :return: a dictionary containing the accessionNr as key, and a tuple with species, geneName, and genus as value.
    """
    parsedRecords = multiFastaRead(file)

    # In the parsed tuples index 0 is the species and index 2 the gene name.
    return {
        accessionNr: (fields[0], fields[2], extractGenus(fields[0]))
        for accessionNr, fields in parsedRecords.items()
    }



# Commands to create an output file containing the ID, the species and the Genus as tabulated file for later
# filter of the secreted proteins.
#outputIDGenus = fileOutput+"/"+fileName+"_IDgenus.tsv"
#fastaDict = multiFastaReadGenus(fileFasta,outputIDGenus)
#p.printDict2(fastaDict,0,2,"\t", outputIDGenus)
