#! /usr/bin/env python3

#######################################################################################################################
"""
author: Luisa Jimenez Soto
adapted by: Tanja Krueger
Purpose of script:
This script takes one multi-fasta file and transforms it into a unified fasta format.
Input: a fasta file with a hashtag describing the toxin type, and with format caveats that depend on the origin of the fasta sequence
Output: a clean fasta file with uniform format independent of the sequence origin, file ends with "_clean.fasta"
        # a tsv file including the protein accession and type of toxin, file ends with "_ID_Type.tsv"
        # a tsv file including the protein accession and the genus, file ends with "_IDgenus.tsv"
        # a tsv file that list the source, file ends with  "_source.tsv"
        """
########################################################################################################################
# All packages needed for the program.
import argparse, os, re,  sys, subprocess
import multiFastaClean
import myprint

########################################################################################################################
# Definition of the function that cleans a single multi-fasta file and generates three tables.
def main():
    """Clean one multi-sequence fasta file and write the derived tables.

    The input path is read from the command line (positional argument
    ``toClean``). The file is parsed with the ``multiFastaClean`` helpers and
    four outputs are written into ``Data/derived``, each named after the input
    file's base name (the text before its last "."):

    * ``<base>_ID_Type.tsv`` -- protein accession and type of toxin
    * ``<base>_IDgenus.tsv`` -- protein accession and genus
    * ``<base>_source.tsv``  -- source of the protein sequences (e.g. uniprot)
    * the cleaned fasta file, whose name is chosen by ``myprint.printFasta3``

    The output directory is created if it does not exist yet.
    """
    # Add arguments from the command line.
    parser = argparse.ArgumentParser()
    parser.add_argument("toClean", type=str,
                        help="filename that should be cleaned, contains multi-sequence fasta format files.")
    args = parser.parse_args()
    file_to_transform = args.toClean

    # The directory where to store the created files. Create it up front so
    # that none of the writers below fails on a fresh checkout.
    dir_derivedData = "Data/derived"
    os.makedirs(dir_derivedData, exist_ok=True)

    # Base name of the input without its last extension; used as the stem of
    # every output file name.
    base_wo_ext = os.path.basename(file_to_transform).rsplit(".", 1)[0]
    output_prefix = dir_derivedData + "/" + base_wo_ext

    # Paths of the three tables that accompany the clean fasta file.
    outputIDType = output_prefix + "_ID_Type.tsv"    # accession + type of toxin
    outputIDGenus = output_prefix + "_IDgenus.tsv"   # accession + genus
    outputSource = output_prefix + "_source.tsv"     # source of the sequence, e.g. uniprot

    # Dict with the relevant information about each protein accession.
    records = multiFastaClean.multiFastaRead(file_to_transform)
    # Second pass over the same file to obtain the genus information.
    # NOTE(review): the helper also receives the genus output path; whether it
    # writes to it itself is not visible from here -- confirm in multiFastaClean.
    genus_records = multiFastaClean.multiFastaReadGenus(file_to_transform, outputIDGenus)

    # Print all four files into the subfolder derived.
    # NOTE(review): the integer arguments presumably select the fields that
    # are written -- confirm against myprint.
    myprint.printDict2(records, 0, 3, "\t", outputIDType)         # toxin type
    myprint.printDict2(genus_records, 0, 2, "\t", outputIDGenus)  # genus
    myprint.printDict1(records, 4, "\t", outputSource)            # source of sequence
    myprint.printFasta3(records, dir_derivedData, base_wo_ext)    # clean fasta file

########################################################################################################################
# Run the cleaning pipeline only when this file is executed as a script, so that
# importing the module does not parse sys.argv or write any files.
if __name__ == "__main__":
    main()

