# R version: 4.2.0 
library(tidyverse)
library(lme4)
library(lmerTest)
library(emmeans)

lm = read.table("VOT_landmarks.txt")

## get number of included data points per position
with(lm, table(var_age, position))

## compute VOT
lm$VOT <- lm$VOT.end - lm$clos.end

## compute duration of word, closure phase & stop
lm$word.dur <- lm$word.end - lm$word.start
lm$clos.dur <- lm$clos.end - lm$clos.start
lm$stop.dur <- lm$clos.dur + lm$VOT

## normalization of VOT
lm$normbasis <- lm$word.dur - lm$stop.dur
lm$VOT.rel <- lm$VOT/lm$normbasis

###
## Figure 2: VOT plot
###

# save this plot with ggsave() as PNG, height = 3.5, width = 7
vot.plot <- ggplot (data = lm) + 
  aes (y = VOT.rel, x = var_age, color = c.type) +
  scale_x_discrete(limits = c("Bavarian_a", "Bavarian_j", "Standard"),
                   labels=c("Bavarian_a" = "Bavarian, o.",
                            "Bavarian_j" = "Bavarian, y.",
                            "Standard" = "Standard")) +
  geom_boxplot () + 
  facet_wrap(~position) +
  scale_fill_grey() + 
  scale_colour_grey(start = 0.6, end = 0.2) + 
  labs(y = "Normalized VOT") +
  theme_linedraw() +
  theme(legend.position = "top",
        legend.title = element_blank(),
        legend.text = element_text(size = 13),
        axis.title.x = element_blank(),
        axis.title.y = element_text(size = 14),
        axis.text = element_text(size = 12),
        strip.text = element_text(color = "black", face = "bold", size = 14),
        strip.background = element_rect(fill = "white"))

###
## VOT model
###

## p-values in decimel
options(scipen = 999)

## check structure of factors for modelling
# SESSION
table(lm$c.type, lm$session)
# consonant type is distributed within session

table(lm$position, lm$session)
# position is distributed within session

# WORD
table(lm$var_age, lm$word)
# speaker grooup is distributed within word

## full model all stops
VOT <- lmer(
  VOT.rel ~ c.type * position * var_age + 
    (1 + c.type + position | session) + 
    (1 + var_age | word), 
  data = lm)
#step(VOT) returns full model as most parsimonious

## check basic model assumptions
# homoscedasticity
plot(VOT)

# normal distribution of residuals
qqnorm(resid(VOT))
qqline(resid(VOT))

# add column for residuals
lm$resid = resid(VOT)

# define sd = 2.5
sd = 2.5*sd(resid(VOT))

# add column for outliers
lm$outs = ifelse(abs(lm$resid)>sd, 1, 0)

# plot outliers
plot(lm$resid, col = lm$outs+1, pch = 16, ylim = c(-0.3, 0.3))

# remove 52 outliers
lm_clean = lm[!lm$outs, ]

# plot again
plot(lm_clean$resid, col = lm_clean$outs+1, pch = 16, ylim = c(-0.3, 0.3))

## model again
VOT_clean <- lmer(
  VOT.rel ~ c.type * position * var_age + 
    (1 + c.type + position | session) + 
    (1 + var_age | word), 
  data = lm_clean)

## recheck model assumptions
# homoscedasticity
plot(VOT_clean)

# normal distribution of residuals
qqnorm(resid(VOT_clean))
qqline(resid(VOT_clean))

## get final model outcomes
anova(VOT_clean)

## Table 2 (selected/rearranged values)
# difference in consonant type by position and/or speaker group
VOT_clean.emm <- emmeans(VOT_clean, ~c.type:var_age|position, lmer.df = "satterthwaite")
pairs(VOT_clean.emm)

pairs(emmeans(VOT_clean, ~c.type|position + var_age, lmer.df = "satterthwaite"))