
# THIS FILE AUTOMATES THE DATA WRANGLING AND DATA ANALYSIS


# Rule structure
# target: the script producing the target, dependency_1, ..., dependency_n
#    command executing the script that produces the target
# $< expands to the first dependency; because the first dependency is the script itself, the script does not have to be named a second time in the command
# $^ expands to all dependencies in exactly the order they are listed; the scripts take positional arguments, so this order is very important




#Targets

# `all` names a command, not a file. Declaring it phony guarantees that a stray
# file or directory called "all" can never make `make` report the pipeline as
# up to date without checking anything.
.PHONY: all

# Default goal: every derived data file and every figure of the pipeline.
all: Data/derived/animal_control_proteins.fasta\
Data/derived/animal_control_proteins_IDgenus.tsv\
Data/derived/animal_control_proteins_ID_Type.tsv\
Data/derived/animal_control_proteins_source.tsv\
Data/derived/animal_control_proteins_clean.fasta\
Data/derived/mode0_clu_rep_animal_control_SST100_rep_seq.fasta\
Data/derived/mode0_clu_rep_bacterial_toxins_SST100_rep_seq.fasta\
Data/derived/mode0_clu_rep_bacterial_control_SST100_rep_seq.fasta\
Data/derived/mode0_clu_rep_animal_toxins_SST100_rep_seq.fasta\
Data/derived/all_bacterial_proteins_SST100.fasta\
Data/derived/all_toxic_proteins_SST100.fasta\
Data/derived/all_control_proteins_SST100.fasta\
Data/derived/all_animal_proteins_SST100.fasta\
Data/derived/forLogoMaker/animal_control_animal_toxins_LogoMaker.csv\
Data/derived/forLogoMaker/animal_toxins_animal_control_LogoMaker.csv\
Data/derived/forLogoMaker/animal_toxins_bacterial_toxins_LogoMaker.csv\
Data/derived/forLogoMaker/bacterial_toxins_animal_toxins_LogoMaker.csv\
Data/derived/forLogoMaker/bacterial_toxins_bacterial_control_LogoMaker.csv\
Data/derived/forLogoMaker/bacterial_control_bacterial_toxins_LogoMaker.csv\
Data/derived/forLogoMaker/bacterial_control_animal_control_LogoMaker.csv\
Data/derived/forLogoMaker/animal_control_bacterial_control_LogoMaker.csv\
Figures/logomaker_animal_toxins_animal_control.png\
Figures/logomaker_bacterial_toxins_bacterial_control.png\
Figures/logomaker_animal_toxins_bacterial_toxins.png\
Figures/logomaker_animal_control_bacterial_control.png\
Figures/SST100_length_distribution_density_animal_control_bacterial_control.png\
Figures/SST100_length_distribution_density_animal_toxins_bacterial_toxins.png\
Figures/SST100_length_distribution_density_animal_toxins_animal_control.png\
Figures/SST100_length_distribution_density_bacterial_toxins_bacterial_control.png\
Data/derived/mode0_clu_rep_animal_toxins_SST75_rep_seq.fasta\
Data/derived/mode0_clu_rep_animal_toxins_SST50_rep_seq.fasta\
Data/derived/mode0_clu_rep_animal_toxins_SST25_rep_seq.fasta\
Data/derived/mode0_clu_rep_bacterial_toxins_SST75_rep_seq.fasta\
Data/derived/mode0_clu_rep_bacterial_toxins_SST50_rep_seq.fasta\
Data/derived/mode0_clu_rep_bacterial_toxins_SST25_rep_seq.fasta\
Data/derived/all_toxic_proteins.fasta\
Data/derived/mode0_clu_rep_all_toxic_proteins_SST100_rep_seq.fasta\
Data/derived/mode0_clu_rep_all_toxic_proteins_SST75_rep_seq.fasta\
Data/derived/mode0_clu_rep_all_toxic_proteins_SST50_rep_seq.fasta\
Data/derived/mode0_clu_rep_all_toxic_proteins_SST25_rep_seq.fasta\
Figures/all_toxic_proteins_mmseqs_reduction_level.png\
Figures/animal_toxins_mmseqs_reduction_level.png\
Figures/bacterial_toxins_mmseqs_reduction_level.png\
Data/derived/length_output.csv\
Figures/shared_sequence_diversity.png\
Figures/shared_lengths2.png\
Figures/shared_lengths3.png\
Figures/shared_pIs_1by2.png\
Figures/shared_lengths_SingnalP.png


# Delete the duplicated IDs from the animal controls
# Runs the de-duplication script with the raw animal control FASTA as its
# only (positional) argument; the script is expected to write the target.
Data/derived/animal_control_proteins.fasta: \
    Code/python/deletion_duplicates_fragments.py \
    Data/derived/animal_control_proteins_raw.fasta
	python $^
		

# Clean the combined animal control FASTA file and generate three tables.
# cleanOneFile.py receives only the de-duplicated FASTA as argument, and all
# four targets below were originally built by that identical command, so one
# run is taken to write all four files.
# NOTE(review): confirm against the script that it writes all four outputs.
#
# Only the *_clean.fasta rule runs the script unconditionally. The three table
# rules wait for it (order-only prerequisite after `|`) and re-run the script
# only when their own file is still missing afterwards. This avoids running
# the same script four times and avoids concurrent writers under `make -j`.
Data/derived/animal_control_proteins_clean.fasta: \
    Code/python/fasta_cleaning/cleanOneFile.py \
    Data/derived/animal_control_proteins.fasta
	python $^

# $^ does not include order-only prerequisites, so the fallback command below
# is exactly `python <script> <fasta>`, the same command as above.
Data/derived/animal_control_proteins_IDgenus.tsv \
Data/derived/animal_control_proteins_ID_Type.tsv \
Data/derived/animal_control_proteins_source.tsv: \
    Code/python/fasta_cleaning/cleanOneFile.py \
    Data/derived/animal_control_proteins.fasta \
    | Data/derived/animal_control_proteins_clean.fasta
	test -f $@ || python $^



# Do 100% sequence similarity reduction of animal toxins, animal controls, bacterial toxins, bacterial controls.


# Each rule clusters its single prerequisite ($<) with a minimum sequence
# identity of 1.00. The mmseqs output prefix is the target path with the
# "_rep_seq.fasta" suffix stripped; Data/derived/temporary_files is the
# mmseqs working directory.

# Animal controls
Data/derived/mode0_clu_rep_animal_control_SST100_rep_seq.fasta: Data/derived/animal_control_proteins.fasta
	mmseqs easy-cluster $< $(@:_rep_seq.fasta=) Data/derived/temporary_files --min-seq-id 1.00 --cov-mode 0

# Bacterial toxins
Data/derived/mode0_clu_rep_bacterial_toxins_SST100_rep_seq.fasta: Data/raw/bacterial/bacterial_toxins_combined.fasta
	mmseqs easy-cluster $< $(@:_rep_seq.fasta=) Data/derived/temporary_files --min-seq-id 1.00 --cov-mode 0

# Bacterial controls
Data/derived/mode0_clu_rep_bacterial_control_SST100_rep_seq.fasta: Data/raw/bacterial/bacterial_control_proteins.fasta
	mmseqs easy-cluster $< $(@:_rep_seq.fasta=) Data/derived/temporary_files --min-seq-id 1.00 --cov-mode 0

# Animal toxins
Data/derived/mode0_clu_rep_animal_toxins_SST100_rep_seq.fasta: Data/derived/animal_toxins_combined.fasta
	mmseqs easy-cluster $< $(@:_rep_seq.fasta=) Data/derived/temporary_files --min-seq-id 1.00 --cov-mode 0


# For the logomaker, the files need to be concatenated
# Combining both bacterial sets
# Bacterial toxins followed by bacterial controls.
Data/derived/all_bacterial_proteins_SST100.fasta: \
    Data/derived/mode0_clu_rep_bacterial_toxins_SST100_rep_seq.fasta \
    Data/derived/mode0_clu_rep_bacterial_control_SST100_rep_seq.fasta
	cat $^ > $@

# Combined both toxin sets
# Animal toxins followed by bacterial toxins.
Data/derived/all_toxic_proteins_SST100.fasta: \
    Data/derived/mode0_clu_rep_animal_toxins_SST100_rep_seq.fasta \
    Data/derived/mode0_clu_rep_bacterial_toxins_SST100_rep_seq.fasta
	cat $^ > $@

# Combine both control sets
# Animal controls followed by bacterial controls.
Data/derived/all_control_proteins_SST100.fasta: \
    Data/derived/mode0_clu_rep_animal_control_SST100_rep_seq.fasta \
    Data/derived/mode0_clu_rep_bacterial_control_SST100_rep_seq.fasta
	cat $^ > $@

# Combine both animal sets
# The file is written toxins first, controls second, i.e. in the reverse of
# the prerequisite order, hence the explicit word selection instead of $^.
Data/derived/all_animal_proteins_SST100.fasta: \
    Data/derived/mode0_clu_rep_animal_control_SST100_rep_seq.fasta \
    Data/derived/mode0_clu_rep_animal_toxins_SST100_rep_seq.fasta
	cat $(word 2,$^) $< > $@
	
	
#	Data/derived/mode0_clu_rep_animal_toxins_SST100_rep_seq.fasta
# Surprise Matrix for the logomaker
# Surprise of animal toxins vs non toxic animal proteins
# matrix_for_logomaker2.R is called with three positional arguments: the two
# sets to compare and their concatenation. The two CSVs of each pair were
# originally built by the identical command, so one run is taken to write
# both directions (A_B and B_A).
# NOTE(review): confirm against the R script that one run writes both files.
#
# In each pair, the first rule runs the script. The second rule waits for the
# first (order-only prerequisite after `|`) and re-runs the script only when
# its own file is still missing afterwards. This avoids running every
# comparison twice and avoids two processes writing the same files under
# `make -j`. $^ does not include order-only prerequisites, so the fallback
# command is identical to the command of the first rule.

# Animal controls vs animal toxins
Data/derived/forLogoMaker/animal_control_animal_toxins_LogoMaker.csv: \
    Code/R/matrix_for_logomaker2.R \
    Data/derived/mode0_clu_rep_animal_control_SST100_rep_seq.fasta \
    Data/derived/mode0_clu_rep_animal_toxins_SST100_rep_seq.fasta \
    Data/derived/all_animal_proteins_SST100.fasta
	Rscript $^

Data/derived/forLogoMaker/animal_toxins_animal_control_LogoMaker.csv: \
    Code/R/matrix_for_logomaker2.R \
    Data/derived/mode0_clu_rep_animal_control_SST100_rep_seq.fasta \
    Data/derived/mode0_clu_rep_animal_toxins_SST100_rep_seq.fasta \
    Data/derived/all_animal_proteins_SST100.fasta \
    | Data/derived/forLogoMaker/animal_control_animal_toxins_LogoMaker.csv
	test -f $@ || Rscript $^

# Surprise of animal toxins vs bacterial toxins
Data/derived/forLogoMaker/animal_toxins_bacterial_toxins_LogoMaker.csv: \
    Code/R/matrix_for_logomaker2.R \
    Data/derived/mode0_clu_rep_animal_toxins_SST100_rep_seq.fasta \
    Data/derived/mode0_clu_rep_bacterial_toxins_SST100_rep_seq.fasta \
    Data/derived/all_toxic_proteins_SST100.fasta
	Rscript $^

Data/derived/forLogoMaker/bacterial_toxins_animal_toxins_LogoMaker.csv: \
    Code/R/matrix_for_logomaker2.R \
    Data/derived/mode0_clu_rep_animal_toxins_SST100_rep_seq.fasta \
    Data/derived/mode0_clu_rep_bacterial_toxins_SST100_rep_seq.fasta \
    Data/derived/all_toxic_proteins_SST100.fasta \
    | Data/derived/forLogoMaker/animal_toxins_bacterial_toxins_LogoMaker.csv
	test -f $@ || Rscript $^

# Surprise of bacterial toxins vs non toxic bacterial proteins
Data/derived/forLogoMaker/bacterial_toxins_bacterial_control_LogoMaker.csv: \
    Code/R/matrix_for_logomaker2.R \
    Data/derived/mode0_clu_rep_bacterial_toxins_SST100_rep_seq.fasta \
    Data/derived/mode0_clu_rep_bacterial_control_SST100_rep_seq.fasta \
    Data/derived/all_bacterial_proteins_SST100.fasta
	Rscript $^

Data/derived/forLogoMaker/bacterial_control_bacterial_toxins_LogoMaker.csv: \
    Code/R/matrix_for_logomaker2.R \
    Data/derived/mode0_clu_rep_bacterial_toxins_SST100_rep_seq.fasta \
    Data/derived/mode0_clu_rep_bacterial_control_SST100_rep_seq.fasta \
    Data/derived/all_bacterial_proteins_SST100.fasta \
    | Data/derived/forLogoMaker/bacterial_toxins_bacterial_control_LogoMaker.csv
	test -f $@ || Rscript $^

# Surprise of non toxic bacterial vs non toxic animal proteins
Data/derived/forLogoMaker/animal_control_bacterial_control_LogoMaker.csv: \
    Code/R/matrix_for_logomaker2.R \
    Data/derived/mode0_clu_rep_animal_control_SST100_rep_seq.fasta \
    Data/derived/mode0_clu_rep_bacterial_control_SST100_rep_seq.fasta \
    Data/derived/all_control_proteins_SST100.fasta
	Rscript $^

Data/derived/forLogoMaker/bacterial_control_animal_control_LogoMaker.csv: \
    Code/R/matrix_for_logomaker2.R \
    Data/derived/mode0_clu_rep_animal_control_SST100_rep_seq.fasta \
    Data/derived/mode0_clu_rep_bacterial_control_SST100_rep_seq.fasta \
    Data/derived/all_control_proteins_SST100.fasta \
    | Data/derived/forLogoMaker/animal_control_bacterial_control_LogoMaker.csv
	test -f $@ || Rscript $^



# Visualize the stuff with Logomaker
# The four comparisons that get a logo figure. Each figure
# Figures/logomaker_<comparison>.png is built by visualization_logomaker.py
# from Data/derived/forLogoMaker/<comparison>_LogoMaker.csv.
logomaker_comparisons := \
    animal_toxins_animal_control \
    bacterial_toxins_bacterial_control \
    animal_toxins_bacterial_toxins \
    animal_control_bacterial_control

# Static pattern rule: the stem (%) is the comparison name, so the set of
# targets is fixed to exactly the four figures listed above.
$(logomaker_comparisons:%=Figures/logomaker_%.png): Figures/logomaker_%.png: Code/python/visualization_logomaker.py Data/derived/forLogoMaker/%_LogoMaker.csv
	python $^

# Visualizing the length, the pI, and the aromaticity
# Every rule passes two SST100 representative-sequence files to
# data_analysis_2.py as positional arguments, in the order listed.

# Animal controls vs bacterial controls
Figures/SST100_length_distribution_density_animal_control_bacterial_control.png: \
    Code/python/data_analysis_2.py \
    Data/derived/mode0_clu_rep_animal_control_SST100_rep_seq.fasta \
    Data/derived/mode0_clu_rep_bacterial_control_SST100_rep_seq.fasta
	python $^

# Animal toxins vs bacterial toxins
Figures/SST100_length_distribution_density_animal_toxins_bacterial_toxins.png: \
    Code/python/data_analysis_2.py \
    Data/derived/mode0_clu_rep_animal_toxins_SST100_rep_seq.fasta \
    Data/derived/mode0_clu_rep_bacterial_toxins_SST100_rep_seq.fasta
	python $^

# Animal toxins vs animal controls
Figures/SST100_length_distribution_density_animal_toxins_animal_control.png: \
    Code/python/data_analysis_2.py \
    Data/derived/mode0_clu_rep_animal_toxins_SST100_rep_seq.fasta \
    Data/derived/mode0_clu_rep_animal_control_SST100_rep_seq.fasta
	python $^

# Bacterial toxins vs bacterial controls
Figures/SST100_length_distribution_density_bacterial_toxins_bacterial_control.png: \
    Code/python/data_analysis_2.py \
    Data/derived/mode0_clu_rep_bacterial_toxins_SST100_rep_seq.fasta \
    Data/derived/mode0_clu_rep_bacterial_control_SST100_rep_seq.fasta
	python $^
		


# Running the MMseqs2 clustering on the animal toxins on three different levels
# Same input, three identity thresholds. $< is the combined animal toxin
# FASTA; the mmseqs output prefix is the target path without its
# "_rep_seq.fasta" suffix.

# 75 % minimum sequence identity
Data/derived/mode0_clu_rep_animal_toxins_SST75_rep_seq.fasta: Data/derived/animal_toxins_combined.fasta
	mmseqs easy-cluster $< $(@:_rep_seq.fasta=) Data/derived/temporary_files --min-seq-id 0.75 --cov-mode 0

# 50 % minimum sequence identity
Data/derived/mode0_clu_rep_animal_toxins_SST50_rep_seq.fasta: Data/derived/animal_toxins_combined.fasta
	mmseqs easy-cluster $< $(@:_rep_seq.fasta=) Data/derived/temporary_files --min-seq-id 0.50 --cov-mode 0

# 25 % minimum sequence identity
Data/derived/mode0_clu_rep_animal_toxins_SST25_rep_seq.fasta: Data/derived/animal_toxins_combined.fasta
	mmseqs easy-cluster $< $(@:_rep_seq.fasta=) Data/derived/temporary_files --min-seq-id 0.25 --cov-mode 0


# Running the MMseqs2 clustering on the bacterial toxins on three different levels
# Same input, three identity thresholds. $< is the combined bacterial toxin
# FASTA; the mmseqs output prefix is the target path without its
# "_rep_seq.fasta" suffix.

# 75 % minimum sequence identity
Data/derived/mode0_clu_rep_bacterial_toxins_SST75_rep_seq.fasta: Data/raw/bacterial/bacterial_toxins_combined.fasta
	mmseqs easy-cluster $< $(@:_rep_seq.fasta=) Data/derived/temporary_files --min-seq-id 0.75 --cov-mode 0

# 50 % minimum sequence identity
Data/derived/mode0_clu_rep_bacterial_toxins_SST50_rep_seq.fasta: Data/raw/bacterial/bacterial_toxins_combined.fasta
	mmseqs easy-cluster $< $(@:_rep_seq.fasta=) Data/derived/temporary_files --min-seq-id 0.50 --cov-mode 0

# 25 % minimum sequence identity
Data/derived/mode0_clu_rep_bacterial_toxins_SST25_rep_seq.fasta: Data/raw/bacterial/bacterial_toxins_combined.fasta
	mmseqs easy-cluster $< $(@:_rep_seq.fasta=) Data/derived/temporary_files --min-seq-id 0.25 --cov-mode 0


# Running the MMseqs2 clustering on all toxins on three different levels
# Concatenate the unclustered toxin sets: bacterial toxins first, then
# animal toxins (same order as the prerequisites).
Data/derived/all_toxic_proteins.fasta: \
    Data/raw/bacterial/bacterial_toxins_combined.fasta \
    Data/derived/animal_toxins_combined.fasta
	cat $^ > $@

# Cluster the concatenated toxins at four identity thresholds. $< is the
# concatenated FASTA; the mmseqs output prefix is the target path without
# its "_rep_seq.fasta" suffix.

# 100 % minimum sequence identity
Data/derived/mode0_clu_rep_all_toxic_proteins_SST100_rep_seq.fasta: Data/derived/all_toxic_proteins.fasta
	mmseqs easy-cluster $< $(@:_rep_seq.fasta=) Data/derived/temporary_files --min-seq-id 1.00 --cov-mode 0

# 75 % minimum sequence identity
Data/derived/mode0_clu_rep_all_toxic_proteins_SST75_rep_seq.fasta: Data/derived/all_toxic_proteins.fasta
	mmseqs easy-cluster $< $(@:_rep_seq.fasta=) Data/derived/temporary_files --min-seq-id 0.75 --cov-mode 0

# 50 % minimum sequence identity
Data/derived/mode0_clu_rep_all_toxic_proteins_SST50_rep_seq.fasta: Data/derived/all_toxic_proteins.fasta
	mmseqs easy-cluster $< $(@:_rep_seq.fasta=) Data/derived/temporary_files --min-seq-id 0.50 --cov-mode 0

# 25 % minimum sequence identity
Data/derived/mode0_clu_rep_all_toxic_proteins_SST25_rep_seq.fasta: Data/derived/all_toxic_proteins.fasta
	mmseqs easy-cluster $< $(@:_rep_seq.fasta=) Data/derived/temporary_files --min-seq-id 0.25 --cov-mode 0


# The figures below read <prefix>_cluster.tsv files. `mmseqs easy-cluster`
# writes these next to <prefix>_rep_seq.fasta, but only the *_rep_seq.fasta
# files are rule targets in this Makefile. Without the pattern rule below,
# building one of these figures from a clean tree stops with "No rule to make
# target ..._cluster.tsv", and under `make -j` a figure could be started
# before its clustering has run.
#
# The rule makes each cluster table depend on the matching *_rep_seq.fasta, so
# the clustering always runs first. Its recipe does not create anything; it
# only fails loudly if the table is absent after clustering.
Data/derived/mode0_clu_rep_%_cluster.tsv: Data/derived/mode0_clu_rep_%_rep_seq.fasta
	@test -f $@ || { echo "$@ is missing: delete $< and re-run make" >&2; exit 1; }

# Each figure is built by mmseqs2_cluster_analyis.py from the four cluster
# tables of one data set, passed positionally in the order SST100, SST75,
# SST50, SST25.

# All toxins
Figures/all_toxic_proteins_mmseqs_reduction_level.png: \
    Code/python/mmseqs2_cluster_analyis.py \
    Data/derived/mode0_clu_rep_all_toxic_proteins_SST100_cluster.tsv \
    Data/derived/mode0_clu_rep_all_toxic_proteins_SST75_cluster.tsv \
    Data/derived/mode0_clu_rep_all_toxic_proteins_SST50_cluster.tsv \
    Data/derived/mode0_clu_rep_all_toxic_proteins_SST25_cluster.tsv
	python $^

# Animal toxins
Figures/animal_toxins_mmseqs_reduction_level.png: \
    Code/python/mmseqs2_cluster_analyis.py \
    Data/derived/mode0_clu_rep_animal_toxins_SST100_cluster.tsv \
    Data/derived/mode0_clu_rep_animal_toxins_SST75_cluster.tsv \
    Data/derived/mode0_clu_rep_animal_toxins_SST50_cluster.tsv \
    Data/derived/mode0_clu_rep_animal_toxins_SST25_cluster.tsv
	python $^

# Bacterial toxins
Figures/bacterial_toxins_mmseqs_reduction_level.png: \
    Code/python/mmseqs2_cluster_analyis.py \
    Data/derived/mode0_clu_rep_bacterial_toxins_SST100_cluster.tsv \
    Data/derived/mode0_clu_rep_bacterial_toxins_SST75_cluster.tsv \
    Data/derived/mode0_clu_rep_bacterial_toxins_SST50_cluster.tsv \
    Data/derived/mode0_clu_rep_bacterial_toxins_SST25_cluster.tsv
	python $^
		
		
# Shared visualizations: aromaticity, logos with amino acid usage, pIs, and so on
# data_analysis_3 for the four-by-four plots: logos, pI, and aromaticity
# data_analysis_3.py receives, positionally and in this order, the four SST100
# representative-sequence files followed by four LogoMaker matrices.
Data/derived/length_output.csv: \
    Code/python/data_analysis_3.py \
    Data/derived/mode0_clu_rep_animal_toxins_SST100_rep_seq.fasta \
    Data/derived/mode0_clu_rep_animal_control_SST100_rep_seq.fasta \
    Data/derived/mode0_clu_rep_bacterial_toxins_SST100_rep_seq.fasta \
    Data/derived/mode0_clu_rep_bacterial_control_SST100_rep_seq.fasta \
    Data/derived/forLogoMaker/animal_toxins_animal_control_LogoMaker.csv \
    Data/derived/forLogoMaker/animal_toxins_bacterial_toxins_LogoMaker.csv \
    Data/derived/forLogoMaker/animal_control_bacterial_control_LogoMaker.csv \
    Data/derived/forLogoMaker/bacterial_toxins_bacterial_control_LogoMaker.csv
	python $^
		
# data_analysis_4 for the sequence similarity plots 
# The figure below reads <prefix>_cluster.tsv files. `mmseqs easy-cluster`
# writes these next to <prefix>_rep_seq.fasta, but only the *_rep_seq.fasta
# files are rule targets in this Makefile. Without the pattern rule below,
# building this figure from a clean tree stops with "No rule to make target
# ..._cluster.tsv", and under `make -j` it could be started before the
# clustering has run.
#
# The rule makes each cluster table depend on the matching *_rep_seq.fasta, so
# the clustering always runs first. Its recipe does not create anything; it
# only fails loudly if the table is absent after clustering.
Data/derived/mode0_clu_rep_%_cluster.tsv: Data/derived/mode0_clu_rep_%_rep_seq.fasta
	@test -f $@ || { echo "$@ is missing: delete $< and re-run make" >&2; exit 1; }

# data_analysis_4.py receives twelve cluster tables positionally: animal
# toxins, bacterial toxins, then all toxins, each in the order SST100, SST75,
# SST50, SST25.
Figures/shared_sequence_diversity.png: \
    Code/python/data_analysis_4.py \
    Data/derived/mode0_clu_rep_animal_toxins_SST100_cluster.tsv \
    Data/derived/mode0_clu_rep_animal_toxins_SST75_cluster.tsv \
    Data/derived/mode0_clu_rep_animal_toxins_SST50_cluster.tsv \
    Data/derived/mode0_clu_rep_animal_toxins_SST25_cluster.tsv \
    Data/derived/mode0_clu_rep_bacterial_toxins_SST100_cluster.tsv \
    Data/derived/mode0_clu_rep_bacterial_toxins_SST75_cluster.tsv \
    Data/derived/mode0_clu_rep_bacterial_toxins_SST50_cluster.tsv \
    Data/derived/mode0_clu_rep_bacterial_toxins_SST25_cluster.tsv \
    Data/derived/mode0_clu_rep_all_toxic_proteins_SST100_cluster.tsv \
    Data/derived/mode0_clu_rep_all_toxic_proteins_SST75_cluster.tsv \
    Data/derived/mode0_clu_rep_all_toxic_proteins_SST50_cluster.tsv \
    Data/derived/mode0_clu_rep_all_toxic_proteins_SST25_cluster.tsv
	python $^

# R code for the length visualization
# Both figures are drawn from the same length table, each by its own R script.
Figures/shared_lengths2.png: \
    Code/R/visualization_length.R \
    Data/derived/length_output.csv
	Rscript $^

Figures/shared_lengths3.png: \
    Code/R/visualization_length2.R \
    Data/derived/length_output.csv
	Rscript $^
		
# data_analysis_6 for additional plots that are even better readable 
# data_analysis_6.py receives, positionally and in this order, the four SST100
# representative-sequence files followed by four LogoMaker matrices.
Figures/shared_pIs_1by2.png: \
    Code/python/data_analysis_6.py \
    Data/derived/mode0_clu_rep_animal_toxins_SST100_rep_seq.fasta \
    Data/derived/mode0_clu_rep_animal_control_SST100_rep_seq.fasta \
    Data/derived/mode0_clu_rep_bacterial_toxins_SST100_rep_seq.fasta \
    Data/derived/mode0_clu_rep_bacterial_control_SST100_rep_seq.fasta \
    Data/derived/forLogoMaker/animal_toxins_animal_control_LogoMaker.csv \
    Data/derived/forLogoMaker/animal_toxins_bacterial_toxins_LogoMaker.csv \
    Data/derived/forLogoMaker/animal_control_bacterial_control_LogoMaker.csv \
    Data/derived/forLogoMaker/bacterial_toxins_bacterial_control_LogoMaker.csv
	python $^
		
		

# R code for the length visualization of SignalP
# Draws the SignalP length figure from its own length table.
# NOTE(review): no rule in the visible part of this Makefile creates
# Data/derived/length_output_SignalP.csv; presumably it is produced outside
# this pipeline and must already exist -- confirm.
Figures/shared_lengths_SingnalP.png: \
    Code/R/visualization_length_SignalP.R \
    Data/derived/length_output_SignalP.csv
	Rscript $^
