#!/usr/bin/env python3
# coding: utf-8

#######################################################################################################################
# This file subtracts the signal peptides from sequences in a fasta file
# Input: a csv with the signal peptide positions per protein ID and the sequence info
# Output: a new fasta file with the reduced sequences and one with their signal peptides intact

#######################################################################################################################
# Import libraries needed.
import requests
from requests import Session
import pandas as pd
import numpy as np
import re
import argparse
import random
from datetime import datetime
import csv
from Bio import SeqIO

#######################################################################################################################
# Command-line interface: the script takes exactly one positional argument ("sp"),
# the path of the csv file holding the signal peptide information.
parser = argparse.ArgumentParser(
    prog="signal_p_extraction_part2.py",
    description="substracte signal peptides from sequences in a fasta file",
)
parser.add_argument(
    "sp",
    help=" a csv file with signal peptide (sp) positions and sequence information",
    type=str,
)
# Parse sys.argv; argparse exits with a usage message when the argument is missing.
args = parser.parse_args()

####################################################################################################################
# Open the shared log file in append mode so earlier entries are kept.
out_file = open("../../Data/derived/log.log","a")
# Time stamp of this run, formatted as day/month/year hour:minute:second.
dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
# Header of the log entry: which program ran, when, and with which argument.
log_header = (
    "##########",
    f"program {parser.prog} was executed at {dt_string}",
    f"argument passed:  {args.sp}",
    "number of required arguments: 1",
    "argument should contain: a scv with protein ID, sequence and signal peptide information ",
)
# One print per entry, so every entry ends up on its own line of the log.
for log_line in log_header:
    print(log_line, file=out_file)

########################################################################################################################
## Open the input file
csv_file=args.sp
# Check that the argument looks like a csv file (its path has to contain "csv").
if "csv" not in csv_file:
    # The wrong format was provided: report it and stop here.
    # Without the early exit the script would carry on and crash further down with a
    # NameError, because df_sp only exists once the csv file has been read.
    print("error, the first provided argument has to be a csv file")
    raise SystemExit(1)

# Read the csv file with the signal peptides into a pandas dataframe;
# the first row is the header and the protein IDs (column "id") become the index.
df_sp= pd.read_csv(csv_file,index_col="id",header=0)


########################################################################################################################
## For every sequence that has a signal peptide (a non-zero "end" value), store the sequence with
## the signal peptide cut off in the "new_sequence" column; the "sequence" column stays unchanged.
for prot_id in df_sp.index:
    sp_end = df_sp.loc[prot_id, "end"]
    if sp_end == 0:
        # No signal peptide for this protein: nothing to remove.
        continue
    full_seq = df_sp.loc[prot_id, "sequence"]
    # Keep everything from position "end" onwards, i.e. drop the first "end" characters.
    df_sp.loc[prot_id, "new_sequence"] = full_seq[sp_end:]


########################################################################################################################
## Write two fasta files. Both fasta files only contain the sequences that have a signal peptide: one fasta with the
# signal peptide and the other fasta without the signal peptide.
# Extract the naming convention from the input file: the part of the path between "derived/" and
# "_signal_peptide.csv" is reused as the prefix of the output file names.
# The pattern is a raw string with a plain "/" (a backslash-slash escape is invalid in a normal
# string literal) and the dot is escaped so that it only matches a literal ".".
m= re.search(r"derived/(.*)_signal_peptide\.csv", args.sp)
if m is None:
    # Without this check a path that does not follow the convention would fail with an
    # unhelpful AttributeError on m.group.
    print("error, the input path has to look like <...>/derived/<name>_signal_peptide.csv")
    raise SystemExit(1)
input_name= m.group(1)

# Write a fasta file with the unchanged sequences of the proteins that do have a signal peptide.
# The context manager closes the file even when writing a row raises an error; closing an
# already closed file is a no-op, so a later og_seq_file.close() call remains valid.
with open(f"../../Data/derived/{input_name}_seqs_with_signalp.fasta","w") as og_seq_file:
    for i in df_sp.index:
        # Skip the sequences that have no signal peptide ("end" equal to 0).
        if df_sp.loc[i,"end"] == 0:
            continue
        # Use two separate print statements instead of \n to not get empty lines:
        # first the header line with protein ID and species, then the sequence itself.
        print(f"> {i} [{df_sp.loc[i,'species']}]",file=og_seq_file)
        print(df_sp.loc[i,"sequence"],file=og_seq_file)

# Write a fasta file with the sequences from which the signal peptide was removed.
# The context manager closes the file even when writing a row raises an error; closing an
# already closed file is a no-op, so a later new_seq_file.close() call remains valid.
with open(f"../../Data/derived/{input_name}_seqs_without_signalp.fasta","w") as new_seq_file:
    for i in df_sp.index:
        # Skip the sequences that never had a signal peptide ("end" equal to 0).
        if df_sp.loc[i, "end"] == 0:
            continue
        # Use two separate print statements instead of \n to not get empty lines.
        # "new_sequence" is only read for rows with a signal peptide, so an input without
        # any signal peptide (where that column was never created) still works.
        print(f"> {i} [{df_sp.loc[i,'species']}]",file=new_seq_file)
        print(df_sp.loc[i,"new_sequence"],file=new_seq_file)

##################################################################################################################
# Log which fasta files were created and where they were stored.
created_msg = f"created fasta files: {input_name}_seqs_without_signalp.fasta, and {input_name}_seqs_with_signalp.fasta "
print(created_msg, file=out_file)
print("files were stored under derived data", file=out_file)
# Close the log file and both fasta files.
for handle in (out_file, og_seq_file, new_seq_file):
    handle.close()


##################################################################################################################
# tests that need doing
# run an input with just a sequence with a signal peptide: ok
# run an input with just one sequence without a signal peptide: ok
# go over the output by hand and check generally: ok

# go over the output programmatically and check if the files have the same length: ok
# check that the length differences in the sequences are: ok
# check whether the length difference in the sequences is the number of signal peptides or if I am missing one
# report in the lab-book
# c