#!/usr/bin/env python3
# coding: utf-8


########################################################################################################################
# Author: Tanja Krüger
"""
This script combines all raw FASTA files containing fungi toxins from UniProt and NCBI. Proteins whose accession occurs more than once are reported with a warning message and stored only once.

Input (command-line arguments):
- folder_path: path to the folder containing the fungi toxin FASTA files
- output_fasta_path, output_csv_path: paths to the results

Output:
- output_fasta_path: the combined fungi proteins as a FASTA file
- output_csv_path: the combined fungi proteins as a CSV file
"""

########################################################################################################################
# All imports
import os
import sys
import pandas as pd
from Bio import SeqIO, SeqRecord, Seq
from collections import Counter
########################################################################################################################


def parse_fasta_description(description):
    """
    Parse a FASTA description line into accession, organism, and additional info.

    Handles both UniProt headers (organism given as ``OS=<name>``) and NCBI
    headers (organism given in the last pair of square brackets).

    Args:
        description: The FASTA header text (without the leading ``>``).

    Returns:
        A tuple ``(accession, organism, additional_info)`` of strings:
        - accession: the first whitespace-delimited token, or "Unknown" when
          the description is empty or only whitespace.
        - organism: the organism name, or "Unknown" when neither format matches.
        - additional_info: the text preceding the organism field (this still
          contains the accession token), or "Unknown" when neither format
          matches.
    """
    tokens = description.split()
    # An empty header must not crash the whole run with an IndexError.
    accession = tokens[0] if tokens else "Unknown"
    organism = "Unknown"
    additional_info = "Unknown"

    # UniProt is checked first: protein names frequently contain brackets
    # (e.g. "3-oxoacyl-[acyl-carrier-protein] synthase"), which would otherwise
    # be mistaken for an NCBI organism tag.
    os_pos = description.find("OS=")
    if os_pos != -1:
        tail = description[os_pos + 3:]
        # The organism ends at the next UniProt field; older headers have no
        # OX= field, so the other standard fields are accepted as terminators.
        field_positions = [tail.find(tag) for tag in ("OX=", "GN=", "PE=", "SV=")]
        organism_end = min((pos for pos in field_positions if pos != -1), default=len(tail))
        organism = tail[:organism_end].strip()
        additional_info = description[:os_pos].strip()
        return accession, organism, additional_info

    # NCBI: the organism is the last bracketed group; earlier brackets belong
    # to the protein name.
    close_pos = description.rfind("]")
    open_pos = description.rfind("[", 0, close_pos) if close_pos != -1 else -1
    if open_pos != -1:
        organism = description[open_pos + 1:close_pos]
        additional_info = description[:open_pos].strip()

    return accession, organism, additional_info

def process_fasta_files(folder_path, output_fasta_path, output_csv_path):
    """
    Combine all FASTA files in a folder into one FASTA file and one CSV table.

    Every file in ``folder_path`` whose name ends in ``.fasta`` or ``.fa`` is
    read. Records are de-duplicated by accession (the first token of the
    header): the first occurrence is kept, later ones are reported on stdout
    and skipped. Files are processed in sorted name order so that the output
    order, and which duplicate counts as "first", is the same on every run.

    Args:
        folder_path: Directory containing the input FASTA files.
        output_fasta_path: Destination path of the combined FASTA file.
        output_csv_path: Destination path of the combined CSV file.

    Returns:
        A pandas DataFrame indexed by "Accession" with the columns
        "Organism", "Additional Info" and "Amino Acid Sequence".
    """
    records = []
    dataframe_records = []
    seen_accessions = set()  # Accessions that have already been stored
    duplicate_count = 0  # Overall count of skipped duplicates

    # os.listdir() returns names in arbitrary order; sort for reproducibility.
    fasta_file_names = sorted(
        name for name in os.listdir(folder_path) if name.endswith((".fasta", ".fa"))
    )

    for file_name in fasta_file_names:
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, "r") as handle:
            for record in SeqIO.parse(handle, "fasta"):
                accession, organism, additional_info = parse_fasta_description(record.description)
                amino_acid_seq = str(record.seq)

                if accession in seen_accessions:
                    # Count the duplicate and print a warning instead of storing it
                    duplicate_count += 1
                    print(f"\n⚠️  Duplicate accession '{accession}' found:")
                    print(f"  Organism: {organism}")
                    print(f"  Additional Info: {additional_info}")
                    print(f"  First 20 Amino Acids: {amino_acid_seq[:20]}")
                    continue

                # Store the first occurrence of this accession
                seen_accessions.add(accession)

                # The whole header goes into the record id; the description is
                # left empty so it is not appended a second time on output.
                header = f"{additional_info} [{organism}]"
                seq_record = SeqRecord.SeqRecord(Seq.Seq(amino_acid_seq), id=header, description="")
                records.append(seq_record)

                # Matching row for the CSV table
                dataframe_records.append([accession, organism, additional_info, amino_acid_seq])

    # Create DataFrame from records
    df = pd.DataFrame(dataframe_records, columns=["Accession", "Organism", "Additional Info", "Amino Acid Sequence"])
    df.set_index("Accession", inplace=True)

    # Save the DataFrame as a CSV
    df.to_csv(output_csv_path)
    print(f"\nDataFrame created and saved to: {output_csv_path}")

    # Save combined records to a FASTA file
    with open(output_fasta_path, "w") as output_handle:
        SeqIO.write(records, output_handle, "fasta")
    print(f"Combined FASTA saved to: {output_fasta_path}")

    # Print overall summary of duplicates
    print("\n=== Overall Summary ===")
    print(f"Total number of duplicates found (but not stored): {duplicate_count}")

    return df

def main():
    """
    Command-line entry point.

    Expects exactly three arguments after the script name: the input folder,
    the output FASTA path and the output CSV path. With any other argument
    count a usage line is printed and the process exits with status 1.
    """
    cli_args = sys.argv[1:]

    if len(cli_args) != 3:
        print("Usage: python process_fasta.py <folder_path> <output_fasta_path> <output_csv_path>")
        sys.exit(1)

    folder_path, output_fasta_path, output_csv_path = cli_args
    process_fasta_files(folder_path, output_fasta_path, output_csv_path)

# Run the command-line entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
