#! /usr/bin/env python3
'''Created by: Luisa F. Jimenez-Soto
Date: January 20th 2022
Purpose: Read the lineage table that the ncbitax2lin program writes as a gzip-compressed
CSV file (*.csv.gz) and extract, for one kingdom, the rows that are resolved down to a
chosen taxonomic level (e.g. species), keeping one row per distinct name at that level.
'''

#import packages
import argparse
import csv
import gzip #Allows to work with compressed files
import os

import myprint

#Name of the directory (somewhere in the path of the input file) under which
#the per-kingdom output folders are created
ANCHOR_DIR = "Taxonomy"


def parse_arguments(argv=None):
    '''Parse the command-line options of the script.

    argv: list of option strings; None makes argparse read sys.argv[1:].
    Returns the argparse namespace with the attributes gzip, kingdom and level.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument("--gzip", type=str,
                        help="Absolute address of gzip file with extension *.csv.gz.",
                        required=True)
    parser.add_argument("--kingdom", type=str,
                        help="Kingdom to be selected (Bacteria, Archaea, Eukaryota, Viruses)",
                        required=True)
    parser.add_argument("--level", type=str,
                        help="Level of taxonomy to choose. It can be: class, order, family, genus, species",
                        required=True)
    return parser.parse_args(argv)


def find_output_dir(gzip_path, taxa, anchor=ANCHOR_DIR):
    '''Return the output directory <path up to and including anchor>/<taxa>.

    gzip_path: path of the input file; one of its parent directories must be
               named exactly like anchor (the first occurrence is used).
    taxa:      kingdom name, used as the name of the output folder.
    Raises ValueError when no parent directory is named like anchor.
    '''
    parts = os.path.dirname(gzip_path).split(os.sep)
    if anchor not in parts:
        raise ValueError('No directory named "' + anchor + '" in the path of ' + gzip_path)
    # Keep everything up to and including the anchor directory
    base_dir = os.sep.join(parts[:parts.index(anchor) + 1])
    return os.path.join(base_dir, taxa)


def ensure_directory(path):
    '''Create the directory path if it is missing and report what was done.'''
    already_there = os.path.isdir(path)
    # exist_ok avoids a crash if the folder appears between the check and the creation
    os.makedirs(path, exist_ok=True)
    if already_there:
        print("Folder " + path + " exist.")
    else:
        print("Folder " + path + " created.")


def select_lineages(rows, taxa, level):
    '''Select one row per distinct name found in the column called level.

    rows:  iterable of rows, each row being a list of strings (for example a
           csv.reader). A row with a field containing "tax_id" is the header.
    taxa:  text that must occur in at least one field of a row to select it.
    level: header name of the column that must be filled in.

    Returns a dict mapping the first field of each selected row (and of the
    header row) to the list of fields from column 1 up to and including the
    level column. Rows whose level field is empty, or whose level name was
    already stored, are skipped.
    Raises ValueError if level is not a header column, or if a matching data
    row is found before any header row.
    '''
    selected = {}
    seen_names = set()
    level_col = None
    for row in rows:
        if any("tax_id" in field for field in row):
            if level not in row:
                raise ValueError('Level "' + level + '" is not a column of the input file')
            level_col = row.index(level)
            # The header is stored too, so that it can be written to the output
            selected[row[0]] = row[1:level_col + 1]
        # NOTE(review): this matches the kingdom name in ANY column; the
        # original comment suggests only column 1 was meant -- confirm.
        elif any(taxa in field for field in row):
            if level_col is None:
                raise ValueError('No header row containing "tax_id" was found before the data rows')
            if len(row) <= level_col:
                # Truncated row: it has no field for the requested level
                continue
            name = row[level_col]
            if name != '' and name not in seen_names:
                selected[row[0]] = row[1:level_col + 1]
                seen_names.add(name)
    return selected


def main(argv=None):
    '''Filter the lineage file and write the selected rows as a tsv file.'''
    args = parse_arguments(argv)
    gzip_f = args.gzip
    taxa = args.kingdom
    level = args.level

    # The output goes to <.../Taxonomy>/<kingdom>/<kingdom>_<level>_DataFrame.tsv
    dirOutput = find_output_dir(gzip_f, taxa)
    ensure_directory(dirOutput)
    outFile = os.path.join(dirOutput, taxa + "_" + level + "_DataFrame.tsv")

    # newline='' lets the csv module handle line endings and quoted fields,
    # so names that contain commas are not split into several columns
    with gzip.open(gzip_f, 'rt', newline='') as f:
        dict_bactSpecies = select_lineages(csv.reader(f), taxa, level)

    #Print in a file the results
    # NOTE(review): the indices 0-5 are fixed, but every stored list has as
    # many entries as the column index of --level, i.e. fewer than six when
    # that column is among the first six -- confirm how myprint.printDict6
    # handles this.
    myprint.printDict6(dict_bactSpecies, 0,1,2,3,4,5, "\t", outFile)
    print ("The length of the dict is "+ str(len(dict_bactSpecies)))


if __name__ == "__main__":
    main()


