#!/usr/bin/env Rscript

# January 2022
# created by : Luisa F. Jimenez Soto
# adapted by : Tanja Krueger

# This R script takes the path of the Data directory and finds all files ending in *_listReps.csv,
# each containing the IDs of the representative sequences of the clusters found after redundancy
# reduction using the CD-HIT program. These ID lists are used to filter the *_ID_Type.tsv tables.

# Load the required libraries


#Vector where all arguments will be stored
args <- commandArgs(trailingOnly = TRUE)

# Validate the command line. The script expects the global address of the
# 'Data' directory as its first argument.
if (length(args) == 0) {
  stop("One argument has to be supplied: the global address of the directory 'Data' ", call. = FALSE)
}

# Additional arguments are tolerated (as before) but reported, since only the
# first one is ever used.
if (length(args) > 1) {
  warning("Only the first argument is used; the remaining arguments are ignored.", call. = FALSE)
}

# The path must contain 'Data' as a whole word. This check now runs no matter
# how many arguments were passed; previously it was skipped when more than one
# argument was given.
if (!grepl("\\<Data\\>", args[1])) {
  stop("One argument has to be supplied: the global address of the directory 'Data'. Do not add a '/' at the end ", call. = FALSE)
}

# Drop any trailing '/' so that paths built later with paste0(data_path, "/...")
# do not end up with a doubled separator.
data_path <- sub("/+$", "", args[1])

# For debugging, replace the assignment above with a fixed path, e.g.
# data_path <- "/home/agjimenez/Documents/02_analysis_animal_toxins/Data"

# Parent directory of the Data directory (not referenced again in the visible
# part of this script).
directory_data <- dirname(data_path)

# Directory that receives the filtered tables. The trailing '/' is kept because
# file names are appended to it with paste0().
derived_data_dir <- paste0(data_path, "/derived/")

# Directory where the report files of each comparison are stored
# (trailing '/' kept for the same reason).
stats_dir <- paste0(data_path, "/derived/CDHITintermediateResults/")

# Find all *_listReps.csv files (ID lists obtained from the CD-HIT clstr files)
# that need to be analyzed and store them in a vector called listFiles_IDs.
# `pattern` is a regular expression, not a glob: no leading '*', and the dot
# before the extension is escaped so it only matches a literal '.'.
listFiles_IDs <- list.files(
  paste0(data_path, "/derived/CDHITintermediateResults"),
  pattern = "_listReps\\.csv$",
  recursive = TRUE,
  full.names = TRUE
)

# Find all *_ID_Type.tsv files that need to be cross referenced with the
# cluster results and store them in a vector called listFiles_TableIDTypes.
listFiles_TableIDTypes <- list.files(
  paste0(data_path, "/derived"),
  pattern = "_ID_Type\\.tsv$",
  recursive = TRUE,
  full.names = TRUE
)

#Checking for packages and installing if necessary
# Install tidyverse only when it is not available, then attach it.
# requireNamespace() is used for the check so that nothing is attached (and no
# warning is raised) before we know whether an installation is needed.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  # Under Rscript there is no interactive mirror chooser: if no CRAN mirror is
  # configured, install.packages() would fail. Fall back to the CRAN cloud
  # mirror in that case and otherwise respect the configured repositories.
  pkg_repos <- getOption("repos")
  if (is.null(pkg_repos) || !("CRAN" %in% names(pkg_repos)) ||
      identical(unname(pkg_repos["CRAN"]), "@CRAN@")) {
    pkg_repos["CRAN"] <- "https://cloud.r-project.org"
  }
  install.packages("tidyverse", repos = pkg_repos)
}
# library() stops with an error if the package still cannot be loaded.
library(tidyverse)

# Load the tables in listFiles_TableIDTypes.
# Two nested loops are needed. The outer one opens the type files: files with the
# protein ID and type of toxin. The inner one goes through the files parsed from the clstr files.
# Had I been smarter, I would have used the same naming scheme
# for everything, but I did not, and I want to leave it a bit imperfect in case we
# get more interesting sequences in the future.

#firstLoop for type files
# Outer loop: one iteration per *_ID_Type.tsv table. Each table is filtered by
# every ID list in listFiles_IDs; for each pair a report file is written to
# stats_dir and, when there are matches, a filtered table to derived_data_dir.
for (i in listFiles_TableIDTypes) {
  # Base name (without extension) used to build the names of the out files
  idTypes_SansExt <- tools::file_path_sans_ext(basename(i))
  tableTofilter <- i

  #for debugging only
  #tableTofilter <- listFiles_TableIDTypes[4]
  #idTypes_SansExt <- tools::file_path_sans_ext(basename(tableTofilter))

  # Open the header-less table as a data frame; read.delim names the columns
  # V1, V2, V3, which are renamed to meaningful names here.
  tableType <- read.delim(tableTofilter, header = FALSE, sep = "\t")
  tableType <- dplyr::rename(tableType,
                             proteinID = V1,
                             species = V2,
                             type = V3)

  # Make sure the IDs are plain character strings (not factor / numeric) so
  # they can be compared with the IDs read by scan() below.
  tableType$proteinID <- as.character(tableType$proteinID)

  # Inner loop: go through the ID lists
  for (j in listFiles_IDs) {
    testFile_ID <- j
    print(paste0("The file ", tableTofilter, " is being tested against ", testFile_ID))

    #for debugging
    #testFile_ID <- listFiles_IDs[10]

    # Extract the base name and keep only the part before the first '_',
    # which identifies the CDHIT run the table is filtered by.
    clstr_SansExt <- tools::file_path_sans_ext(basename(testFile_ID))
    cdhitName <- sub("_.*$", "", clstr_SansExt)

    # Report file for this (type table, ID list) pair
    reportFile <- paste0(stats_dir, idTypes_SansExt, "_merged_", cdhitName, "_report.txt")
    reportTextFileIntro <- paste0("Comparing file ", tableTofilter, " with ", testFile_ID)

    # Read the list of IDs obtained from the clstr file into a character vector
    df_Out <- scan(testFile_ID, what = character(), quote = "")

    # Keep only the rows whose protein ID appears in the ID list
    filtered_TypeTable <- tableType[tableType$proteinID %in% df_Out, , drop = FALSE]

    # Number of matching rows. Using the row count (a single number) rather
    # than dim() keeps the message below a single string; with dim() the
    # message was produced twice, once with the row and once with the column
    # count.
    nCommon <- nrow(filtered_TypeTable)

    if (nCommon > 0) {
      outFile <- paste0(derived_data_dir, idTypes_SansExt, "_", cdhitName, ".tsv")
      resultText <- paste0("The file ", tableTofilter, " was filtered by ", testFile_ID, " and contains ", nCommon, " common sequences stored at ", outFile)
      write.table(filtered_TypeTable, outFile, sep = "\t", row.names = FALSE, quote = FALSE, col.names = TRUE)
    } else {
      resultText <- paste0("The file ", tableTofilter, " found no match in ", testFile_ID)
    }
    print(resultText)

    # Write the report in one go. writeLines() replaces any report left from a
    # previous run (so no explicit file.remove() is needed) and terminates
    # every line with a newline.
    writeLines(c(reportTextFileIntro, resultText), reportFile)
  }
}

