#!/bin/python3

import os

def _fastaHeaderProblem(filePath):
	"""
	Inspect the first three lines of a candidate file for the layout expected by the update task.
	:param filePath: path of the file to inspect
	:return: None when the layout is accepted, otherwise the message fragment describing the
	problem (meant to be appended to the file name when reporting)
	"""
	with open(filePath, "r") as file:
		first_line = file.readline()
		# The first line carries the toxin type and must contain a '#' marker.
		if "#" not in first_line:
			return " file does not start with '#' and Toxin type. Please verify."
		second_line = file.readline()
		third_line = file.readline()
		# A fasta identifier ('>') has to show up right below the toxin line;
		# one line in between is tolerated.
		if ">" not in second_line and ">" not in third_line:
			return " file does not contain a fasta identifier in the first three lines. Please verify."
	return None


def verifyingFasta(directoryWithData):
	"""
	Verify that the fasta files for the update of the database ExotoxinsDB are valid, in a
	form compatible with the task ahead in multiFastaRead.py.

	A file located in the "raw" sub-directory is accepted when:
	  - its name ends with '.txt', '.fa' or '.fasta',
	  - its first line contains '#',
	  - its second or third line contains '>'.
	The candidates and every rejected file are reported on standard output.

	:param directoryWithData: address for the Data directory (must contain a "raw" sub-directory)
	:return: list containing the names (not full paths) of the valid files
	"""
	dir_rawData = os.path.join(directoryWithData, "raw")
	# str.endswith accepts a tuple, so a single call covers every allowed extension.
	extensionsToCheck = ('.txt', '.fa', '.fasta')

	# Keep regular files only: a directory with a matching name cannot be opened for reading.
	list_Cand_Files = [
		name for name in os.listdir(dir_rawData)
		if name.endswith(extensionsToCheck)
		and os.path.isfile(os.path.join(dir_rawData, name))
	]

	print("The files found with the correct extensions are: ")
	for name in list_Cand_Files:
		print(name)

	print("Verifying the right format content in the files found...")
	# Build a new list rather than removing from the one being iterated:
	# removing during iteration skips the element that follows each removal.
	list_correct_files = []
	for name in list_Cand_Files:
		problem = _fastaHeaderProblem(os.path.join(dir_rawData, name))
		if problem is None:
			list_correct_files.append(name)
		else:
			print(name + problem)

	return list_correct_files

