#! /usr/bin/env python3

#######################################################################################################################
"""
author: Luisa Jimenez Soto
adapted by: Tanja Krueger
Purpose of script:
This script takes fasta files and transforms them into a unified fasta format.
Input: a fasta file with a hashtag line describing the toxin type, and with format caveats that depend on the origin of the fasta sequence
Output: a clean fasta file with uniform format independent of the sequence origin
        # a tsv file including the protein accession and type of toxin, file ends with "_ID_Type.tsv"
        # a tsv file including the protein accession and the genus, file ends with "_IDgenus.tsv"
        # a tsv file that lists the source, file ends with "_source.tsv"
        """
########################################################################################################################
# All packages needed for the program.
import argparse, os, sys, subprocess
# Custom packages also in the provided Code folder.
import verifyingFasta
import multiFastaClean
import myprint

def main():
    """Reformat the verified raw FASTA files and write their companion TSV tables.

    The required ``--directory`` command-line option names the data directory.
    Input files are read from its ``raw`` sub-folder and every output file is
    written to its ``derived`` sub-folder, which is created here when it does
    not exist yet.

    For each file name returned by ``verifyingFasta.verifyingFasta`` the
    following outputs are requested (``<base>`` is the file name without its
    last extension):

    * the output of ``myprint.printFasta`` for the parsed records,
    * ``<base>_ID_Type.tsv``  -- protein ID and toxin type,
    * ``<base>_IDgenus.tsv``  -- protein ID and genus,
    * ``<base>_source.tsv``   -- protein ID and source.

    The column meanings above follow the original author's notes; the exact
    record layout is defined by ``multiFastaClean`` and ``myprint``.
    """
    # Establish and read the args added in the CLI when calling the program.
    parser = argparse.ArgumentParser()
    parser.add_argument("--directory", type=str,
                        help="Location of Data directory (one above the raw folder) contains multi-sequence fasta format files.",
                        required=True)
    # parser.add_argument("--outdir", type=str,
    #                     help="Optional directory to save processed data",
    #                     required=False) # default is "derived"
    args = parser.parse_args()

    # Input and output folders both live directly under the data directory.
    dir_rawData = f"{args.directory}/raw"
    dir_derivedData = f"{args.directory}/derived"

    # Verify that the fasta data for the type of toxins is usable. This includes
    # the lines with "#" declaring the type of toxin. Files that seem to be ok
    # are returned as the list of files to be processed.
    # NOTE(review): the data directory itself (not the raw folder) is handed to
    # the verifier, i.e. this assumes that the data is under the data folder and
    # not under raw -- confirm against verifyingFasta.
    list_files2Transform = verifyingFasta.verifyingFasta(args.directory)
    for i in list_files2Transform:
        print(i )

    # Make sure the output folder exists before anything is written into it;
    # exist_ok keeps this a no-op when the folder is already there.
    os.makedirs(dir_derivedData, exist_ok=True)

    # Translation of fasta files to clean fasta and creation of the tsv files.
    for i in list_files2Transform:
        datafile = f"{dir_rawData}/{i}"
        # Output names are derived from the bare file name without its last extension.
        base_wo_ext = os.path.basename(i).rsplit(".", 1)[0]
        out_prefix = f"{dir_derivedData}/{base_wo_ext}"

        dictFasta = multiFastaClean.multiFastaRead(datafile)
        myprint.printFasta(dictFasta, dir_derivedData, base_wo_ext)

        # File with the ID and type, used later for the classification table.
        outputIDType = out_prefix + "_ID_Type.tsv"
        myprint.printDict2(dictFasta, 0, 3, "\t", outputIDType)

        # Tabulated file with the ID and the genus, used later to filter the
        # secreted proteins.
        outputIDGenus = out_prefix + "_IDgenus.tsv"
        outputSource = out_prefix + "_source.tsv"
        fastaDict = multiFastaClean.multiFastaReadGenus(datafile, outputIDGenus)
        # Print the protein ID and the genus
        # (tuple per the original notes: species, sequence, geneName, typeToxin, source).
        myprint.printDict2(fastaDict, 0, 2, "\t", outputIDGenus)
        # Print the protein ID and the source (same tuple layout, field 4).
        myprint.printDict1(dictFasta, 4, "\t", outputSource)

        print(f"The file {i} has been formatted and the derived data (fasta, Type classification and Genus)"
              f" stored under the {dir_derivedData}")



# Run the conversion only when this file is executed as a script, so that
# importing the module does not trigger argument parsing and file processing.
if __name__ == "__main__":
    main()
