

# This code is to create a simple binomial GLM
# the response/dependent variable is failure to fulfil a loan (here it is continuous)
# the independent variables include risk level, years on book, year, Dow Jones index and so on

# Step1: import all the important libraries needed
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

#!/usr/bin/env python3
# coding: utf-8


########################################################################################################################
#Author: Tanja Krüger
#Aim: This file visualizes the amino acid composition, the average length, the aromaticity and the logos in joined graphics
#Input: four fasta files with the separate bacterial and animal toxins, and four matrices that hold the surprise of the amino
#       acids to occur in one of two sets
#Output: a series of files that show the length distribution, the pI, instability, aromaticity distribution and logos in
#       combined graphs


########################################################################################################################
# downloaded
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.transforms as mtransforms
import re, argparse, csv, collections,random
from datetime import datetime
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio import SeqIO
import logomaker
from matplotlib_venn import venn2
import statsmodels.api as sm
import pylab
import scipy.stats as stats
from patsy import dmatrices
import csv
# #################################################################################################
# Option depending on where the user wants to run the code from; default is running the code with make from the project folder.
# `cl` is the path prefix put in front of the log-file locations opened further down.
cl=""
# If one wants to execute this file from the Code/python folder uncomment the next line.
# NOTE(review): the override below is currently active, so the effective prefix is "../../"
# and not the "" default described above -- comment it out again when running from the project folder.
cl="../../"

########################################################################################################################
# Get the arguments from the command line.
# Command-line interface: eight required positional arguments, first the four
# FASTA files and then the four surprise matrices (order matters on the command line).
parser = argparse.ArgumentParser(prog="data_analysis_3.py",
                                 description="visualizing protein length, PI and aromaticity between the animal and bacterial sets")

# argparse applies %-formatting to help strings, so a literal percent sign has
# to be written as "%%". A bare "100% reduced" is read as a "% r" conversion
# and --help prints the repr of the parameter dict instead of the text.
for _arg_name, _arg_help in (
    ("at100", "fasta file animal toxins 100%% reduced"),
    ("ac100", "fasta file animal control 100%% reduced"),
    ("bt100", "fasta file bacterial toxins 100%% reduced"),
    ("bc100", "fasta file bacterial control 100%% reduced"),
    ("matrix_atac", "surprise matrix animal toxins v animal control"),
    ("matrix_atbt", "surprise matrix animal toxins v bacterial toxins"),
    ("matrix_acbc", "surprise matrix animal control v bacterial control"),
    ("matrix_btbc", "surprise matrix bacterial toxins v bacterail control"),
):
    parser.add_argument(_arg_name, type=str, help=_arg_help)

# Parse the command line into `args`; argparse prints a usage error and exits
# when the positional arguments defined above are not all supplied.
args = parser.parse_args()


########################################################################################################################
# Step1: Check if the datasets all have a same mmseqs2 redundancy reduction, if yes if they have the same level
import warnings

# Each input path may carry an "SST<digits>" tag naming its mmseqs2 reduction level.
# sst_level1 ends up as the shared level, or as "full unreduced" when a tag is
# missing or the levels disagree.
_sst_matches = [re.search(r"SST(\d+)", _fasta_path)
                for _fasta_path in (args.at100, args.ac100, args.bt100, args.bc100)]

if all(_match is not None for _match in _sst_matches):
    sst_level1, sst_level2, sst_level3, sst_level4 = (_match.group(1) for _match in _sst_matches)
    if not (sst_level1 == sst_level2 == sst_level3 == sst_level4):
        # Previously an assert inside a bare `except:` hid this case completely.
        # Keep the same fallback label, but tell the user about the mismatch.
        warnings.warn("all sets are redundancy reduce but they do not share the same level of reduction")
        sst_level1 = "full unreduced"
else:
    # At least one file name has no SST tag.
    sst_level1 = "full unreduced"

# Step2: Log
# Step2.1: Open the predictor logfile and the general logfile.
# Step2.2: Get the date and time
dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")

# Step2.1 / Step2.3: Open the general logfile and the exploratory logfile, write
# the run record to the general log, and close both handles again.
# Nothing in this step writes to explore_file; opening it in append mode only
# makes sure the file exists.
with open(f"{cl}Data/derived/log.log", "a") as out_file, \
        open(f"{cl}Exploratory/smd_{sst_level1}.log", "a") as explore_file:
    # The argument count is taken from the parsed namespace instead of a
    # hard-coded number, so it cannot drift from the parser definition.
    print(f"""########## \n
program {parser.prog} was executed at {dt_string} \n
argments passed: the 100 reduced animal toxins {args.at100} \n
                the 100 reduced bacterial toxins {args.bt100} \n
                the 100 reduced animal controls {args.ac100}\n
                the 100 reudced bacterail controls {args.bc100}\n
                the four surprise matrices {args.matrix_atac}, {args.matrix_atbt},{args.matrix_acbc},and {args.matrix_btbc},\n
number of required arguments:{len(vars(args))}""",file=out_file)

########################################################################################################################
# Step 3: Open the data
# Step 3.1: Open fasta files and modify it to a useable dataframe.
def _fasta_to_frame(fasta_path):
    """Read a FASTA file into a frame with one row per record id.

    Each row holds the record description followed by the sequence as a string.
    """
    with open(fasta_path) as handle:
        records = {
            record.id: [record.description, str(record.seq)]
            for record in SeqIO.parse(handle, "fasta")
        }
        return pd.DataFrame(records).T


at100 = _fasta_to_frame(args.at100)
ac100 = _fasta_to_frame(args.ac100)
bt100 = _fasta_to_frame(args.bt100)
bc100 = _fasta_to_frame(args.bc100)

# Step 3.2: rename the columns of the dataframes
for _frame in (at100, ac100, bt100, bc100):
    _frame.columns = ["info", "seq"]

# Step 3.3: Define the amino acids for the lables in the plot.
# One-letter codes of the twenty amino acids, in alphabetical order.
amino_acids = list("ACDEFGHIKLMNPQRSTVWY")
# Step 3.4: Load the four matrices with the logo information from their CSV files.
df_atac, df_atbt, df_acbc, df_btbc = (
    pd.read_csv(_matrix_path)
    for _matrix_path in (args.matrix_atac, args.matrix_atbt, args.matrix_acbc, args.matrix_btbc)
)
#read the length of each protein
# One [set label, sequence length] row per protein, kept separately per data set.
lens_1 = [["at", len(seq)] for seq in at100.seq]
lens_2 = [["ac", len(seq)] for seq in ac100.seq]
lens_3 = [["bt", len(seq)] for seq in bt100.seq]
lens_4 = [["bc", len(seq)] for seq in bc100.seq]

# Print the mean length of each set. np.mean cannot be applied to the rows
# directly: NumPy coerces the mixed [str, int] rows to a string array and the
# reduction raises, so only the length entries are averaged.
for _rows in (lens_1, lens_2, lens_3, lens_4):
    print(np.mean([length for _, length in _rows]))

# make a DataFrame from the length and the class (tidy data format)
df_1 = pd.DataFrame(lens_1, columns=["factor", "length"])
df_2 = pd.DataFrame(lens_2, columns=["factor", "length"])
df_3 = pd.DataFrame(lens_3, columns=["factor", "length"])
df_4 = pd.DataFrame(lens_4, columns=["factor", "length"])
# append the dataframes together
df_total = pd.concat([df_1, df_2, df_3, df_4])

# create the design matrices: y is the response (length), X the design matrix
# built from the set label
y, X = dmatrices("length ~ factor", data=df_total, return_type='dataframe')
print(y)
print(X)

# Describe the ordinary least squares model ...
my_mod = sm.OLS(y, X)
# ... fit it ...
res = my_mod.fit()
# ... and inspect the results.
print(res.summary())
#Create a f
# Fit a GLM with a Binomial family on the same response and design matrix.
# Note that `res` is rebound here and from now on refers to this GLM fit.
# NOTE(review): y comes from the "length ~ factor" formula, i.e. raw sequence
# lengths; a Binomial family presumably expects proportions or success/failure
# counts -- confirm that this family is intended for this response.
glm_binom = sm.GLM(y, X, family=sm.families.Binomial())
res = glm_binom.fit()
print("#########################################")
# summary is a method: without the call parentheses only the bound-method
# repr was printed instead of the results table.
print(res.summary())
# We look at the at the interquratile
# Compare the predicted response when the first design-matrix column sits at
# its 25th versus its 75th percentile, with all other columns held at their mean.
# NOTE(review): X was built from a formula, so its first column is presumably
# the intercept; in that case both percentiles are identical and diff is zero --
# confirm which predictor was meant here.
means = X.mean(axis=0)
first_column = X.iloc[:, 0]

means25 = means.copy()
means25.iloc[0] = stats.scoreatpercentile(first_column, 25)

means75 = means.copy()
# The unused chained name `lowinc_75per` was dropped from this assignment.
means75.iloc[0] = stats.scoreatpercentile(first_column, 75)

resp_25 = res.predict(means25)
resp_75 = res.predict(means75)
diff = resp_75 - resp_25
print(diff)
#
# Response column divided by the row sums of the design matrix; the two
# operands and the ratio are printed for inspection.
_response = y.iloc[:, 0]
_row_totals = X.sum(axis=1)
ynew = _response / _row_totals
print(_response)
print(_row_totals)
print(ynew)

# Fitted mean values of the current model.
yhat = res.mu
print(yhat)

from scipy import stats
from statsmodels import graphics

# Figure 1: Pearson residuals against the fitted values, with a zero reference line.
fig, ax = plt.subplots()
ax.scatter(yhat, res.resid_pearson)
ax.hlines(0, 0, 1)
ax.set(
    xlim=(0, 1),
    title='Residual Dependence Plot',
    ylabel='Pearson Residuals',
    xlabel='Fitted values',
)

# Figure 2: histogram of the z-scored deviance residuals.
fig, ax = plt.subplots()
resid = res.resid_deviance.copy()
resid_std = stats.zscore(resid)
ax.hist(resid_std, bins=25)
ax.set(title='Histogram of standardized deviance residuals')

# Figure 3: Q-Q plot of the (unstandardized) deviance residuals, then show all figures.
graphics.gofplots.qqplot(resid, line='r')
plt.show()



# Fit a Gamma GLM with a log link on the same response and design matrix.
glm_gamma = sm.GLM(y, X, family=sm.families.Gamma(sm.families.links.Log()))
glm_results = glm_gamma.fit()
print(glm_results.summary())
# Q-Q plot of the deviance residuals of the Gamma fit. The residuals were
# previously taken from `res` (the earlier model), so the plot did not show
# this model at all.
resid2 = glm_results.resid_deviance.copy()
graphics.gofplots.qqplot(resid2, line='r')
plt.show()
# Open a new csv file in write mode
#WE
# def writer(lens,data_name):
#     with open(f"length_output{data_name}.csv", "w") as f:
#         # Create a csv writer object
#         writer = csv.writer(f)
#         for i in lens:
#             writer.writerow([i])
# writer(lens_1,"at")
# writer(lens_2,"ac")
# writer(lens_3,"bt")
# writer(lens_4,"bc")


# df = sm.datasets.get_rdataset("Guerry", "HistData").data
#
# vars = ['Department', 'Lottery', 'Literacy', 'Wealth', 'Region']
# df = df[vars]
# df = df.dropna()
# df[-5:]

