#!/usr/bin/env Rscript
# coding: utf-8

#Aim: This file is a first attempt at exploring the GLM analysis for the protein length
#Input: spreadsheet (currently read from an .ods file) with the protein length in the first column and the classes in the second column
#Output: nothing decided yet


##################################################################################Step0: Load all the packages
require(flexplot)
require(tidyverse)
require(cowplot)
library(readxl)
library(readODS)
library(ggplot2)
library(DHARMa) #for evaluaiton of models
library(MASS) #for negative binominal regression
library(plyr) #I use this for revaluing 

#Step1: Read the spreadsheet into a data frame
# The input defaults to the original hard-coded location, which is relative to
# the working directory. A different file can be supplied as the first
# command-line argument, e.g. `Rscript <this script> path/to/all.ods`.
cli_args <- commandArgs(trailingOnly = TRUE)
input_path <- if (length(cli_args) >= 1) {
  cli_args[[1]]
} else {
  "Documents/learning/GLMs/test1/all.ods"
}
# Fail early with a message that also shows the working directory, because the
# default path only resolves when the script is started from the right place.
if (!file.exists(input_path)) {
  stop(
    "Input file not found: ", input_path,
    " (working directory: ", getwd(), ")",
    call. = FALSE
  )
}
# Codes used in `spec`: 1=at, 2=ac, 3=bt, 4=bc (see the labels assigned below).
df1 <- read_ods(input_path, sheet = 1)

#Step2: Set the spec column to a factor and rename the levels from numbers to text
df1$spec <- as.factor(df1$spec)
# revalue() is namespaced so it keeps working regardless of package load order.
df1$spec <- plyr::revalue(
  df1$spec,
  c(
    "1" = "animal toxins",
    "2" = "animal control",
    "3" = "bacterial toxins",
    "4" = "bacterial control"
  )
)


#Step3: Print the head of df1
# Explicit print() so the rows are also shown when the script is source()d.
print(head(df1, 5))

#Step4: Visualize the data in violin plots and boxplots. The y-axis of the
#       violin plot is cut at 6000, otherwise the distributions are
#       non-distinguishable.
# coord_cartesian() only zooms the view. ylim() would instead drop every
# observation outside 0-6000 before the densities are estimated (and emit a
# "removed rows" warning), so the violins would no longer describe the full data.
viol <- ggplot(df1, aes(x = spec, y = length)) +
  geom_violin() +
  coord_cartesian(ylim = c(0, 6000))
boxp <- ggplot(df1, aes(x = spec, y = length)) +
  geom_boxplot()
# Print explicitly so the plots are also drawn when the script is source()d.
print(viol)
print(boxp)
# I can see that the median shifts upwards, however the mean stays relatively
# the same because of the many outliers.
# This plays a role in the later comparison of the four sets, as the glm makes
# use of the mean, not the median.


#Step5: Fit several GLMs with different distributions, plus one plain linear
#       model; all use the same formula (length explained by spec) on df1.
#Step5.1: Poisson distribution
glm_poisson <- glm(
  length ~ spec,
  data = df1,
  family = poisson()
)
#Step5.2: Gamma distribution with a log link. I read in a paper about the
#         mathematical modelling of protein length that the gamma function is
#         often a good representative.
glm_gamma1 <- glm(
  length ~ spec,
  data = df1,
  family = Gamma(link = "log")
)
#Step5.3: Gamma distribution again, but with a different link function
glm_gamma2 <- glm(
  length ~ spec,
  data = df1,
  family = Gamma(link = "inverse")
)
#Step5.4: Negative binomial
glm_negbin <- glm.nb(length ~ spec, data = df1)
#Step5.5: Linear model, just as a comparison to interpret the outcome of the
#         other models.
# The fitted object keeps the name `lm` because the later steps refer to it by
# that name. It shadows stats::lm as a variable only; calls of the form lm(...)
# still resolve to the function.
lm <- lm(length ~ spec, data = df1)

#Step6: Check out the summary of the results
# Explicit print() calls: top-level auto-printing only happens when the script
# is run line by line or through Rscript, not when it is source()d.
print(summary(glm_poisson))
print(summary(glm_gamma1))
print(summary(glm_gamma2))
print(summary(glm_negbin))
print(summary(lm))
# Note on the lm: the max and min (presumably of the residuals) are not where
# they should be, around 3.
# NOTE(review): the original note here said "residual standard error is the
# grand mean". The residual standard error reported by summary() for an lm is
# the estimated standard deviation of the residuals, not a mean - please
# double-check this interpretation.

#Step7: Print the coefficients
print(coef(glm_poisson))
print(coef(glm_gamma1))
print(coef(glm_gamma2))
print(coef(glm_negbin))
print(coef(lm))

#Step8: Compare the models pairwise to see which one fits best according to
#       AIC, BIC and Bayes factor. Every candidate is compared against
#       glm_gamma1 (gamma with log link).
# AIC is better when lower, BIC is better when lower.
# Bayes factor is better when higher.
# model.comparison() is presumably the flexplot helper; its result is shown via
# auto-printing, so nothing is stored.
# NOTE(review): AIC/BIC of a discrete likelihood (Poisson, negative binomial)
# and of a continuous one (Gamma, Gaussian) may not be directly comparable -
# confirm before relying on this ranking.
model.comparison(glm_poisson,glm_gamma1)
model.comparison(glm_gamma2, glm_gamma1)
model.comparison(glm_negbin, glm_gamma1)
model.comparison(lm,glm_gamma1)

# Based on this analysis: glm_gamma1 performs better than the Poisson or the
# linear model.
# The two link functions for gamma don't lead to large differences in goodness
# of fit.
# The negative binomial has similar AIC and BIC but the larger Bayes factor.

#Step9: Further quality check of the two best candidates using DHARMa
#       simulated residuals and their diagnostic plots.
# Gamma model with log link:
simulationOutput1 <- simulateResiduals(fittedModel = glm_gamma1)
plot(simulationOutput1)
# The test failed, but what to do with this?
# Negative binomial model:
simulationOutput2 <- simulateResiduals(fittedModel = glm_negbin)
plot(simulationOutput2)
# The tests fail here as well.
# Open questions:
# - What to do now that the test has failed? What is the next step?
# - Is it possible to treat toxins and bacteria as two factors?
# Todo: read more about the last pair of plots, what do they mean?
#       read up on how to distinguish the statistical effect of my findings.
