#!/usr/bin/env Rscript
# coding: utf-8

#This file creates a matrix with surprise values on its diagonal.
#The surprise is calculated the following way: 
# (mean(population1)-mean(population1 + population2))/standard deviation(population1 +population2)
#
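#Input: population1: path to the FASTA file of the first population
#       population2: path to the FASTA file of the second population
#       merged population 1 and 2: path to the FASTA file containing both populations
#       example populations are (toxins vs non-toxins) (animal toxins vs bacterial toxins)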
#Output: two 20 x 20 matrices with the sorted amino acids as the columns and rows.
#      On the diagonal of each matrix is the individual surprise value; the rest
#      of the matrix contains zeros. These matrices are the input for the logomaker
#      Python script.


#################################################################################
# The following libraries are imported: 
library(tidyverse)
library(data.table)
library(reshape2)
library(boot)
library(protr)
library(stringr)


################################################################################
# The inputs are defined as arguments from the command line
# A vector where all arguments are stored in. 
# Collect the positional command-line arguments (everything after the script
# name and Rscript's own options).
args <- commandArgs(trailingOnly = TRUE)
# R indexes from 1, so args[1] is the first positional argument. Missing
# arguments become NA here; the count is validated further down.
absoluteFileAddressPop1 <- args[1]
absoluteFileAddressPop2 <- args[2]
absoluteFileAddressPopMerged <- args[3]
print("processing")

# Example data
#absoluteFileAddressPop1 <- "/home/agjimenez/Documents/02_analysis_animal_toxins/Data/derived/fasta_clean_Animal_Toxins_Dataset"
#absoluteFileAddressPop2 <- "/home/agjimenez/Documents/02_analysis_animal_toxins/Data/derived/fasta_clean_mergedExotoxins_updated"
#absoluteFileAddressPopMerged <-"/home/agjimenez/Documents/02_analysis_animal_toxins/Data/derived/fasta_clean_animalToxins_mergedExotoxins"

# Append a record of this run to the existing log file. This is done before the
# argument check so that failed invocations are recorded as well. write() puts
# each element of a character vector on its own line.
stringToWrite <- c(
  "#### \nthe program: matrix_for_logomaker.R was carried out",
  # Default sep = " " keeps the three paths readable instead of gluing them
  # into a single string.
  paste("the arguments passed are:", absoluteFileAddressPop1,
        absoluteFileAddressPop2, absoluteFileAddressPopMerged),
  "the number of required argumetns: 3",
  "arguments should be: two diffrent fasta files with aa sequences, and the two fasta files merged"
)
write(stringToWrite, file = "Data/derived/log.log", append = TRUE)

# Check if the number of arguments is correct.
if (length(args) != 3) {
  stop("Three arguments have to be supllied: fasta files of two diffrent populations and a merged fasta file ", call.=FALSE)
}

# The next part is needed later for saving the result files.
# Population names: the text between "fasta_clean_" and the last underscore
# (the capture group is greedy). str_match() returns the full match in
# column 1 and the capture group in column 2.
namePop1 <- str_match(absoluteFileAddressPop1, "fasta_clean_(.*)_")[2]
namePop2 <- str_match(absoluteFileAddressPop2, "fasta_clean_(.*)_")[2]
# Output location: everything in the path that precedes "fasta_clean".
locationPop1 <- str_match(absoluteFileAddressPop1, "(.*)fasta_clean")[2]
locationPop2 <- str_match(absoluteFileAddressPop2, "(.*)fasta_clean")[2]

# A path that does not follow the naming scheme yields NA, which would
# otherwise end up verbatim in the output file names (e.g. "NANA_NA_...").
# Fail early, before the expensive bootstrap, with an explicit message.
if (anyNA(c(namePop1, namePop2, locationPop1, locationPop2))) {
  stop("Population file names must look like '<dir>/fasta_clean_<name>_<suffix>' ",
       "so that the output file names can be derived", call. = FALSE)
}


################################################################################      
# POPULATION 1: calculation of aa ratio 
# Read the sequences of population 1 (protr::readFASTA returns a named list
# with one sequence string per FASTA record).
pop1Fasta <- readFASTA(absoluteFileAddressPop1)
# Keep only the records that pass protr's protein check. vapply() guarantees a
# logical vector even for an empty input, where sapply() would return list().
pop1FastaChecked <- pop1Fasta[vapply(pop1Fasta, protcheck, logical(1))]
# Removal of the unchecked data
rm(pop1Fasta)
# Without at least one valid sequence nothing below can work; say so clearly.
if (length(pop1FastaChecked) == 0) {
  stop("No valid protein sequences found in: ", absoluteFileAddressPop1,
       call. = FALSE)
}
# Amino acid composition per protein ID: 20 values per sequence, giving a
# matrix with amino acids as rows and protein IDs as columns.
aaCompositionPop1 <- vapply(pop1FastaChecked, extractAAC, numeric(20))
# Transpose so that the amino acids become the columns and each row is one
# protein.
aaCompPop1TransMatrix <- t(aaCompositionPop1)
#print(aaCompPop1TransMatrix)
# Sort the columns alphabetically by amino acid. drop = FALSE keeps the result
# a matrix even when only a single sequence is present; otherwise the later
# column indexing would fail.
aaCompPop1TransMatrix <- aaCompPop1TransMatrix[
  , sort(colnames(aaCompPop1TransMatrix)), drop = FALSE
]
# Remove unsorted data.
rm(aaCompositionPop1)


################################################################################      
# POPULATION 2:  calculation of aa ratio 
# Read the sequences of population 2 (protr::readFASTA returns a named list
# with one sequence string per FASTA record).
pop2Fasta <- readFASTA(absoluteFileAddressPop2)
# Keep only the records that pass protr's protein check. vapply() guarantees a
# logical vector even for an empty input, where sapply() would return list().
pop2FastaChecked <- pop2Fasta[vapply(pop2Fasta, protcheck, logical(1))]
# Removal of the unchecked data
rm(pop2Fasta)
# Without at least one valid sequence nothing below can work; say so clearly.
if (length(pop2FastaChecked) == 0) {
  stop("No valid protein sequences found in: ", absoluteFileAddressPop2,
       call. = FALSE)
}
# Amino acid composition per protein ID: 20 values per sequence, giving a
# matrix with amino acids as rows and protein IDs as columns.
aaCompositionPop2 <- vapply(pop2FastaChecked, extractAAC, numeric(20))
# Transpose so that the amino acids become the columns and each row is one
# protein.
aaCompPop2TransMatrix <- t(aaCompositionPop2)
#print(aaCompPop2TransMatrix)
# Sort the columns alphabetically by amino acid. drop = FALSE keeps the result
# a matrix even when only a single sequence is present; otherwise the later
# column indexing would fail.
aaCompPop2TransMatrix <- aaCompPop2TransMatrix[
  , sort(colnames(aaCompPop2TransMatrix)), drop = FALSE
]
# Remove unsorted data.
rm(aaCompositionPop2)


################################################################################      
# MERGED POPULATIONS:  calculation of aa ratio 
# Read the sequences of the merged population (protr::readFASTA returns a
# named list with one sequence string per FASTA record).
popMergedFasta <- readFASTA(absoluteFileAddressPopMerged)
# Keep only the records that pass protr's protein check. vapply() guarantees a
# logical vector even for an empty input, where sapply() would return list().
popMergedFastaChecked <- popMergedFasta[
  vapply(popMergedFasta, protcheck, logical(1))
]
# Removal of the unchecked data
rm(popMergedFasta)
# Without at least one valid sequence nothing below can work; say so clearly.
if (length(popMergedFastaChecked) == 0) {
  stop("No valid protein sequences found in: ", absoluteFileAddressPopMerged,
       call. = FALSE)
}
# Amino acid composition per protein ID: 20 values per sequence, giving a
# matrix with amino acids as rows and protein IDs as columns.
aaCompositionPopMerged <- vapply(popMergedFastaChecked, extractAAC, numeric(20))
# Transpose so that the amino acids become the columns and each row is one
# protein.
aaCompPopMergedTransMatrix <- t(aaCompositionPopMerged)
#print(aaCompPopMergedTransMatrix)
# Sort the columns alphabetically by amino acid. drop = FALSE keeps the result
# a matrix even when only a single sequence is present; otherwise the later
# column indexing would fail.
aaCompPopMergedTransMatrix <- aaCompPopMergedTransMatrix[
  , sort(colnames(aaCompPopMergedTransMatrix)), drop = FALSE
]
# Remove unsorted data.
rm(aaCompositionPopMerged)


################################################################################
# Statistic passed to boot(): the mean of the observations selected by the
# resampling indices.
#   x: vector of observations
#   d: indices into x chosen for the current bootstrap replicate
# Returns the mean of x[d].
aAMean <- function(x, d) {
  resampled <- x[d]
  mean(resampled)
}


###############################################################################
# Calculate the mean and the standard deviation for the first population (pop1).
# Get the headers to have all the amino acids. 
# Amino acid labels (the sorted column names); reused further down for the
# logoMaker matrices.
aminoAcids <- colnames(aaCompPop1TransMatrix)
# Storage for every bootstrap replicate: one row per replicate, one column per
# amino acid.
meansAAPop1 <- matrix(nrow = 1000, ncol = 20)
#print(meansAAPop1)
colnames(meansAAPop1) <- colnames(aaCompPop1TransMatrix)
# Summary matrix for this population: 2 rows x 20 columns. Row "mean.boot"
# holds the mean of the observed data, row "sd.boot" the standard deviation of
# the bootstrap replicates; one column per amino acid.
pop1MeanSd <- matrix(nrow = 2, ncol = 20)
colnames(pop1MeanSd) <- colnames(aaCompPop1TransMatrix)
rownames(pop1MeanSd) <- c("mean.boot", "sd.boot")
for (i in seq_len(ncol(meansAAPop1))) {
  # Bootstrap the mean composition of amino acid i; the number of replicates
  # matches the number of rows reserved in meansAAPop1.
  bootN <- boot(aaCompPop1TransMatrix[, i], aAMean, R = nrow(meansAAPop1))
  # bootN$t is an R x 1 matrix of replicate means; store it as column i.
  meansAAPop1[, i] <- bootN$t[, 1]
  # t0 is the statistic evaluated on the original, non-resampled data.
  pop1MeanSd["mean.boot", i] <- bootN$t0
  # Standard deviation of the replicate means.
  pop1MeanSd["sd.boot", i] <- sd(bootN$t)
}
print("still processing")

###############################################################################
# Calculate the mean and the standard deviation for the second population (pop2).
# Build an empty matrix
# Storage for every bootstrap replicate: one row per replicate, one column per
# amino acid.
meansAAPop2 <- matrix(nrow = 1000, ncol = 20)
#print(meansAAPop2)
colnames(meansAAPop2) <- colnames(aaCompPop2TransMatrix)
# Summary matrix for this population: 2 rows x 20 columns. Row "mean.boot"
# holds the mean of the observed data, row "sd.boot" the standard deviation of
# the bootstrap replicates; one column per amino acid.
pop2MeanSd <- matrix(nrow = 2, ncol = 20)
colnames(pop2MeanSd) <- colnames(aaCompPop2TransMatrix)
rownames(pop2MeanSd) <- c("mean.boot", "sd.boot")
for (i in seq_len(ncol(meansAAPop2))) {
  # Bootstrap the mean composition of amino acid i; the number of replicates
  # matches the number of rows reserved in meansAAPop2.
  bootN <- boot(aaCompPop2TransMatrix[, i], aAMean, R = nrow(meansAAPop2))
  # bootN$t is an R x 1 matrix of replicate means; store it as column i.
  meansAAPop2[, i] <- bootN$t[, 1]
  # t0 is the statistic evaluated on the original, non-resampled data.
  pop2MeanSd["mean.boot", i] <- bootN$t0
  # Standard deviation of the replicate means.
  pop2MeanSd["sd.boot", i] <- sd(bootN$t)
}
print("almost there")

###############################################################################
# Calculate the mean and the standard deviation for the merged populations (pop1 +pop2).
# Build an empty matrix
# Storage for every bootstrap replicate: one row per replicate, one column per
# amino acid.
meansAAPopMerged <- matrix(nrow = 1000, ncol = 20)
#print(meansAAPopMerged)
colnames(meansAAPopMerged) <- colnames(aaCompPopMergedTransMatrix)
# Summary matrix for the merged population: 2 rows x 20 columns. Row
# "mean.boot" holds the mean of the observed data, row "sd.boot" the standard
# deviation of the bootstrap replicates; one column per amino acid.
popMergedMeanSd <- matrix(nrow = 2, ncol = 20)
colnames(popMergedMeanSd) <- colnames(aaCompPopMergedTransMatrix)
rownames(popMergedMeanSd) <- c("mean.boot", "sd.boot")
for (i in seq_len(ncol(meansAAPopMerged))) {
  # Bootstrap the mean composition of amino acid i; the number of replicates
  # matches the number of rows reserved in meansAAPopMerged.
  bootN <- boot(aaCompPopMergedTransMatrix[, i], aAMean,
                R = nrow(meansAAPopMerged))
  # bootN$t is an R x 1 matrix of replicate means; store it as column i.
  meansAAPopMerged[, i] <- bootN$t[, 1]
  # t0 is the statistic evaluated on the original, non-resampled data.
  popMergedMeanSd["mean.boot", i] <- bootN$t0
  # Standard deviation of the replicate means.
  popMergedMeanSd["sd.boot", i] <- sd(bootN$t)
}



################################################################################
# Create the final matrix  with the surprise on the diagonal from the point of view of population1
# First create the empty matrix.
# Start from an all-zero 20 x 20 matrix whose columns are labelled with the
# amino acids.
logoMakerPop1 <- matrix(0, nrow = 20, ncol = 20)
colnames(logoMakerPop1) <- aminoAcids
# Surprise of population 1 per amino acid, placed on the diagonal:
# (mean of pop1 - mean of merged) / sd.boot of merged.
# NOTE(review): the divisor is the SD of the bootstrap replicate means, not
# the SD of the raw merged compositions - confirm this is the intended scale.
diag(logoMakerPop1) <-
  (pop1MeanSd[1, ] - popMergedMeanSd[1, ]) / popMergedMeanSd[2, ]
# write_csv() expects a data frame, so convert the matrix first.
logoMakerPop1DF <- as.data.frame(logoMakerPop1)
# Output path: <location of pop1 file><pop1>_<pop2>_LogoMaker.csv
saveNamePop1 <- paste0(locationPop1, namePop1, "_", namePop2, "_LogoMaker.csv")
# Write the result to disk.
write_csv(logoMakerPop1DF, saveNamePop1)



################################################################################
# Create the final matrix  with the surprise on the diagonal from the point of view of population2
# First create the empty matrix.
# Start from an all-zero 20 x 20 matrix whose columns are labelled with the
# amino acids.
logoMakerPop2 <- matrix(0, nrow = 20, ncol = 20)
colnames(logoMakerPop2) <- aminoAcids
# Surprise of population 2 per amino acid, placed on the diagonal:
# (mean of pop2 - mean of merged) / sd.boot of merged.
# NOTE(review): the divisor is the SD of the bootstrap replicate means, not
# the SD of the raw merged compositions - confirm this is the intended scale.
diag(logoMakerPop2) <-
  (pop2MeanSd[1, ] - popMergedMeanSd[1, ]) / popMergedMeanSd[2, ]
# write_csv() expects a data frame, so convert the matrix first.
logoMakerPop2DF <- as.data.frame(logoMakerPop2)
# Output path: <location of pop2 file><pop2>_<pop1>_LogoMaker.csv
saveNamePop2 <- paste0(locationPop2, namePop2, "_", namePop1, "_LogoMaker.csv")
# Write the result to disk.
write_csv(logoMakerPop2DF, saveNamePop2)


################################################################################
#logs: 
# Record the generated files in the run log (paste() joins with a single
# space by default).
stringToWrite <- paste("files created:", saveNamePop1, saveNamePop2)
write(stringToWrite, file = "Data/derived/log.log", append = TRUE)
stringToWrite <- paste(saveNamePop1, saveNamePop2, "were stored under Figures")
write(stringToWrite, file = "Data/derived/log.log", append = TRUE)

# Report completion on the console: a status line, the two population names
# and the two output paths, each printed on its own line.
# TODO: find a way to close the running script.
invisible(lapply(
  c("done", namePop1, namePop2, saveNamePop1, saveNamePop2),
  print
))



