#! /usr/bin/env python3
import argparse
import multiFastaClean as mf
import myprint
import os
import re


'''
Created by: Luisa F. Jimenez Soto
Start date: 18.01.2022
The purpose of this script is to open FASTA sequences and keep only those whose identifiers are in an identifier list.
The inputs are the .fasta file and the *_listReps.csv file. The output is a FASTA file filtered by the list.
The output name should represent the filtered status and the CD-HIT level used for the filter.'''


# Command-line interface: both options are mandatory.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--fasta",
    required=True,
    type=str,
    help="Location of multiple sequence fasta file to be filtered.",
)
parser.add_argument(
    "--list",
    required=True,
    type=str,
    help="Location of file with list of protein IDs to be selected.",
)

# Example argument sets, kept for debugging only.
# TARGET TOXINS
#--fasta "/home/luisa/Documents/Databases/Exotoxins/Data/rawData/targetToxins_updated.fasta"
#--list "/home/luisa/Documents/Databases/Exotoxins/Data/derivedData/CD-HIT_Jan2022/targetToxins_Jan2022/targetToxinsCDHIT100_listReps.csv"

# CONTROL TOXINS
#--fasta "/home/luisa/Documents/Databases/Exotoxins/Data/rawData/controlToxins_updated.fasta"
#--list "/home/luisa/Documents/Databases/Exotoxins/Data/derivedData/CD-HIT_Jan2022/controlToxins_Jan2022/controlToxinsCDHIT100_listReps.csv"

# MERGED TOXINS
#--fasta "/home/luisa/Documents/Databases/Exotoxins/Data/rawData/mergedExotoxins_updated.fasta"
#--list "/home/luisa/Documents/Databases/Exotoxins/Data/derivedData/CD-HIT_Jan2022/mergedToxinsOnlyReductions2022/mergeToxinsCDHIT100_listReps.csv"

# Parse the command line and expose the two input paths under the names
# used by the rest of the script.
args = parser.parse_args()
fastFile, listRepresentative = args.fasta, args.list

# Derive the output base name and output directory from the two input paths.
# Base name = FASTA file name up to its first dot.
basename = os.path.basename(fastFile).split(".")[0]
# The output directory is the "derivedData" directory found in the path of
# the list file (everything below it is dropped).
dirList = os.path.dirname(listRepresentative)
listDirs = dirList.split(os.sep)
try:
    indexDerived = listDirs.index("derivedData")
except ValueError:
    # Same exception type as before, but with a message that names the cause.
    raise ValueError(
        'The --list path must contain a "derivedData" directory: '
        + listRepresentative
    ) from None
listDirs = listDirs[0:indexDerived + 1]
# Rejoin with the same separator that was used for splitting, so the path
# stays consistent on every platform. The result has NO trailing separator.
dirDerivedData = os.sep.join(listDirs)

# Build the output file suffix from the CD-HIT level encoded in the name of
# the list file, e.g. "targetToxinsCDHIT100_listReps.csv" gives
# "_CDHIT100_reduced.fasta".
basenameList = os.path.basename(listRepresentative)
# re.findall returns an empty list (it never raises) when nothing matches,
# so the fallback is applied with "or". The fallback keeps the trailing
# underscore of a real match, giving "_noCDHIT_reduced.fasta".
cdhit = re.findall(r"CDHIT[0-9]+_", basenameList) or ["noCDHIT_"]
extension = "_" + cdhit[0] + "reduced.fasta"



# Read the ORIGINAL (not the cleaned) FASTA file. Per the original author's
# note, the result maps sequence IDs to sequences -- confirm against
# multiFastaClean.multiFastaRead.
dictFasta = mf.multiFastaRead(fastFile)
# Read the list file; each line is treated as one ID.
with open(listRepresentative) as fList:
    tList = fList.readlines()

# Entries of dictFasta whose ID appears in the list file.
filtratedFasta = {}

for j in tList:
    strippedJ = j.strip()
    if not strippedJ:
        # Skip blank lines instead of looking up an empty ID.
        continue
    if strippedJ in dictFasta:
        filtratedFasta[strippedJ] = dictFasta[strippedJ]
    else:
        # dict.get never raises, so a missing ID has to be detected with a
        # membership test; otherwise it would be stored with a None value.
        print("not found: " + strippedJ)

# Write the filtered entries to the output file.
myprint.printFasta2(filtratedFasta, dirDerivedData, basename, extension)






