# R version: 4.2.0
library(tidyverse)
library(lme4)
library(lmerTest)
library(emmeans)

td = read.table("CF0_trackdata.txt")

## get number of included data points per position
td %>%
  distinct(var_age, position, bundle) %>%
  group_by(var_age, position) %>%
  count()

###
## Figure 1: CF0 plot
###

# new facet label names for var_age
labs <- c("Bavarian, o.", "Bavarian, y.", "Standard")
names(labs) <- c("Bavarian_a", "Bavarian_j", "Standard")

# save this plot with ggsave() as PNG, height = 7, width = 10.5
f0.plot <- ggplot(td, 
                  aes(
                    x = times_norm,  
                    y = scaledF0,  
                    colour = c.type,
                    linetype = c.type
                  ) 
) + 
  geom_smooth(method = "loess") +   
  scale_colour_grey(start = 0.6, end = 0.1) + 
  labs(x = "Normalized time", y = "Scaled f0") + 
  facet_grid(position~var_age, labeller = labeller(var_age = labs)) + 
  theme_linedraw() +
  theme(legend.position = "top",
        legend.title = element_blank(),
        legend.text = element_text(size = 18),
        axis.title = element_text(size = 20),
        axis.text = element_text(size = 18),
        strip.text = element_text(color = "black", face = "bold", size = 20),
        strip.background = element_rect(fill = "white")) 

## get f0 means over first 25% of vowel
td.25 <- td %>%  
  filter(times_norm <= 0.25) %>%
  group_by(bundle, session, position, POA, c.type, repetition, freq, var_age, word) %>% 
  summarise(scaled.f0.mean = mean(scaledF0, na.rm = TRUE)) %>%
  ungroup()

###
## Initial position: CF0 model
###

## p-values in decimel
options(scipen = 999)

## check structure of factors for modelling
# SESSION
table(td.25$c.type, td.25$session)
# consonant type is distributed within session

# WORD
table(td.25$var_age, td.25$word)
# speaker group is distributed within word

## include only stops in initial position
td.25_ini = td.25 %>%
  subset(position == "initial")

## full model scaled f0
scaledf0.25_ini <- lmer(
  scaled.f0.mean ~ c.type * var_age + 
    (1 + c.type | session) + 
    (1 + var_age | word), 
  data = td.25_ini)

## check basic model assumptions
# homoscedasticity
plot(scaledf0.25_ini)

# normal distribution of residuals
qqnorm(resid(scaledf0.25_ini))
qqline(resid(scaledf0.25_ini))

# add column for residuals
td.25_ini$resid = resid(scaledf0.25_ini)

# define sd = 2.5
sd = 2.5*sd(resid(scaledf0.25_ini))

# add column for outliers
td.25_ini$outs = ifelse(abs(td.25_ini$resid)>sd, 1, 0)

# plot outliers
plot(td.25_ini$resid, col = td.25_ini$outs+1, pch = 16, ylim = c(-5, 7))

# remove 21 outliers
td.25_ini_clean = td.25_ini[!td.25_ini$outs, ]

# plot again
plot(td.25_ini_clean$resid, col = td.25_ini_clean$outs+1, pch = 16, ylim = c(-5, 7))

## model again
scaledf0.25_ini_clean <- lmer(
  scaled.f0.mean ~ c.type * var_age + 
    (1 + c.type | session) + 
    (1 + var_age | word), 
  data = td.25_ini_clean)

## use step to address convergence failure
step(scaledf0.25_ini_clean)
scaledf0.25_ini_clean_best <- lmer(
  scaled.f0.mean ~ c.type + var_age + (1 + c.type | session) + (1 | word) + c.type:var_age, 
  data = td.25_ini_clean)

## recheck model assumptions
# homoscedasticity
plot(scaledf0.25_ini_clean_best)

# normal distribution of residuals
qqnorm(resid(scaledf0.25_ini_clean_best))
qqline(resid(scaledf0.25_ini_clean_best))

## get final model outcomes
anova(scaledf0.25_ini_clean_best)

## Table 1
# difference in consonant type by speaker group
scaledf0_ini_clean_best.emm <- emmeans(scaledf0.25_ini_clean_best, ~c.type|var_age, lmer.df = "satterthwaite")
pairs(scaledf0_ini_clean_best.emm)

###
## Medial position: CF0 model
###

## include only stops in medial position
td.25_med = td.25 %>%
  subset(position == "medial")

## full model scaled f0
scaledf0.25_med <- lmer(
  scaled.f0.mean ~ c.type * var_age + 
    (1 + c.type | session) + 
    (1 + var_age | word), 
  data = td.25_med)

## use step to address convergence failure
anova(get_model(step(scaledf0.25_med)))