#!/usr/bin/env python3
# coding: utf-8
# Author: Tanja Krueger & code from: Luisa Jimenez Soto

#######################################################################################################################
# This program adds a hashtag to the beginning of all raw fasta files in a directory that are missing a hash.
# Content of the hashtag is extracted from the title and identifies the type of toxin.

# Input: a directory; each fasta file in it is checked for a hashtag at the beginning of the file
# Output: any files with a missing # were prepended with the # + identification of the toxin

#######################################################################################################################
# Import libraries needed.
import os
import re
import argparse
from datetime import datetime
from Bio import SeqIO

#######################################################################################################################
# Get the arguments from the command line.
# The script takes exactly one positional argument: the project's data directory.
parser = argparse.ArgumentParser(
    prog="hashToDir.py",
    # Shown by -h/--help so the usage text says what the script actually does.
    description=("Add a '#<file name>' line to the top of every .fasta file in "
                 "<DataDir>/raw/animal that does not already have one."),
)
parser.add_argument(
    "DataDir",
    type=str,
    help="the data directory of the project ",
)
args = parser.parse_args()

####################################################################################################################
# Append a record of this invocation to the log file.
# The log path is relative to the current working directory.
# The `with` block makes sure the entry is flushed and the handle released
# before any later step of the script can fail. `out_file` stays bound
# afterwards (as a closed file), so a later out_file.close() is a harmless no-op.
dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
with open("Data/derived/log.log", "a") as out_file:
    print("##########", file=out_file)
    print(f"program {parser.prog} was executed at {dt_string}", file=out_file)
    print(f"argument passed: {args.DataDir}", file=out_file)
    print("number of required arguments: 1", file=out_file)
    print("argument should contain: the data directory of the project", file=out_file)
# Echo the start-up line on the console as well.
print(f"program {parser.prog} was executed at {dt_string}")

####################################################################################################################
# Get the raw data directory where the new animal toxins are stored; this is
# the directory that is checked for files without the hashtag.
# NOTE: the path is built with plain "/" concatenation on purpose, because the
# per-file path handling further down expects forward slashes.
dir_rawData = args.DataDir + "/raw/animal"
# Every entry of the raw data directory is a candidate.
list_Cand_Files = os.listdir(dir_rawData)
# Keep only regular files whose name really ends in ".fasta". A plain substring
# test would also accept names such as "x.fasta.bak", and a sub-directory named
# "something.fasta" could not be opened as a file later on.
list_correct_files = [
    name
    for name in list_Cand_Files
    if name.endswith(".fasta") and os.path.isfile(os.path.join(dir_rawData, name))
]

# Iterate over each selected file, open it and make sure it starts with a
# "#<label>" line; files that lack one are rewritten in place.
for i in list_correct_files:
    f = dir_rawData + "/" + i
    # The label is the file name up to its last ".fasta" (every name in
    # list_correct_files contains ".fasta"). It is taken from the bare file
    # name rather than searched for in the full path, so a data directory whose
    # own path contains "/animal/" cannot leak into the label.
    savename = i.rpartition(".fasta")[0]
    with open(f, "r+") as file:
        first_line = file.readline()
        # Only a "#" at the very start of the file counts as the tag; a "#"
        # elsewhere in the first line (e.g. inside a FASTA description) does not.
        if first_line.startswith("#"):
            print(f"{i} hast the right format")
            continue
        print(f"{i} was changed, a # with the file content description was added to the fasta file")
        # Rewrite the file from the start: new tag line, then the old content.
        content = file.read()
        file.seek(0, 0)
        file.write(f"#{savename}\n{first_line}{content}")
        # Cut off anything beyond the rewritten text. Text-mode newline
        # translation can make the new text shorter on disk than the old one
        # (e.g. CRLF input), which would otherwise leave stale trailing bytes.
        file.truncate()
    # No explicit file.close() needed: the with-statement closes the file.

out_file.close()
########################################################################################################################


