# R version: 4.2.0
library(tidyverse)
library(lme4)
library(lmerTest)
library(ez)
library(gridExtra)

load(paste0("CF0_VOT_correlation.Rdata"))

## get number of included data points per position
with(corr, table(var_age, c.type, position))

## remove 9 outliers
corr <- corr %>%
  filter(scaled.f0.mean>=-3.5) %>%
  filter(scaled.f0.mean<=4.2) %>%
  filter(VOT.rel<=0.55) %>%
  filter(!bundle == "WB0002_w_a0006MS_FoKoIn_lo_Teckel_2")

## group-level correlation for scaled f0 & normalized VOT (Pearson’s r)
corr %>% 
  group_by(var_age, position) %>% 
  summarise(corr = cor(VOT.rel, scaled.f0.mean)) %>%
  ungroup()

## model correlation differences at individual level
## initial fortis stops
ini = corr %>%
  subset(position == "initial")

# fit model
i.vot = lmer(scaled.f0.mean ~ VOT.rel + (1 + VOT.rel|session) + (1 + VOT.rel|word), data = ini)

# extract speaker-specific slopes from model
z.vot = coef(i.vot)[[1]]

# create columns for speaker group and session
labs.vot = str_trunc(rownames(z.vot), 4, "center", ellipsis = ".")
labs.vot[labs.vot=="SD.a"]="SD"
labs.vot[labs.vot=="SD.j"]="SD"
labs.vot[labs.vot=="SD"]=0
labs.vot[labs.vot=="WB.a"]=1
labs.vot[labs.vot=="WB.j"]=2
labs.vot = as.factor(labs.vot)
levels(labs.vot) = c("Standard","Bavarian_a","Bavarian_j")
vpn = factor(rownames(z.vot))

# write slopes, speaker group and session into df
Initial.df = data.frame(CF0 = z.vot[,2], session = vpn, var_age = labs.vot)

## medial fortis stops
med = corr %>%
  subset(position == "medial")

# fit model
m.vot = lmer(scaled.f0.mean ~ VOT.rel + (1 + VOT.rel|session) + (1 + VOT.rel|word), data = med)

# remove outliers based on model residuals to address convergence failure
# homoscedasticity
plot(m.vot)

# normal distribution of residuals
qqnorm(resid(m.vot))
qqline(resid(m.vot))

# add column for residuals
med$resid = resid(m.vot)

# define sd = 2.5
sd = 2.5*sd(resid(m.vot))

# add column for outliers
med$outs = ifelse(abs(med$resid)>sd, 1, 0)

# plot outliers
plot(med$resid, col = med$outs+1, pch = 16, ylim = c(-4, 4))

# remove 7 outliers
med_clean = med[!med$outs, ]

# plot again
plot(med_clean$resid, col = med_clean$outs+1, pch = 16, ylim = c(-4, 4))

# model again
m.vot_clean = lmer(scaled.f0.mean ~ VOT.rel + (1 + VOT.rel|session) + (1 + VOT.rel|word), data = med_clean)

# extract speaker-specific slopes from model
z.vot = coef(m.vot_clean)[[1]]

# create columns for speaker group and session
labs.vot = str_trunc(rownames(z.vot), 4, "center", ellipsis = ".")
labs.vot[labs.vot=="SD.a"]="SD"
labs.vot[labs.vot=="SD.j"]="SD"
labs.vot[labs.vot=="SD"]=0
labs.vot[labs.vot=="WB.a"]=1
labs.vot[labs.vot=="WB.j"]=2
labs.vot = as.factor(labs.vot)
levels(labs.vot) = c("Standard","Bavarian_a","Bavarian_j")
vpn = factor(rownames(z.vot))

# write slopes, speaker group and session into df
Medial.df = data.frame(CF0 = z.vot[,2], session = vpn, var_age = labs.vot)

### 
# Figure 3: Correlation plot
###

## group-level correlation
fig1 =  ggplot(corr) + 
  aes(y = scaled.f0.mean, x = VOT.rel) + 
  geom_point(aes(color = var_age, shape = var_age)) +
  geom_smooth(aes(linetype = var_age, color = var_age), method = "lm", se = F) + 
  scale_shape_manual(values = c(16, 2, 3),
                     name = "Speaker group", 
                     breaks = c("Bavarian_a", "Bavarian_j", "Standard"), 
                     labels = c("Bavarian, o.", "Bavarian, y.", "Standard")) +
  scale_color_manual(values=c("slategrey", "slategrey", "black"),
                     name = "Speaker group", 
                     breaks = c("Bavarian_a", "Bavarian_j", "Standard"), 
                     labels = c("Bavarian, o.", "Bavarian, y.", "Standard")) +
  scale_linetype_manual(values = c("solid", "dashed", "longdash"),
                        name = "Speaker group", 
                        breaks = c("Bavarian_a", "Bavarian_j", "Standard"), 
                        labels = c("Bavarian, o.", "Bavarian, y.", "Standard")) +
  facet_grid(~position) +
  labs(y = "Mean scaled f0", x = "Normalized VOT") +
  theme_linedraw() +
  theme(legend.position = "top",
        legend.title = element_blank(),
        legend.text = element_text(size = 15),
        axis.title = element_text(size = 17),
        axis.text = element_text(size = 14),
        strip.text = element_text(color = "black", face = "bold", size = 17),
        strip.background = element_rect(fill = "white"))

## speaker-specific slopes
# combine dataframes for plotting
Initial.df$position = rep("initial", nrow(Initial.df))
Medial.df$position = rep("medial", nrow(Medial.df))
Plot.df = rbind(Initial.df, Medial.df)

fig2 = ggplot(data = Plot.df) +
  geom_boxplot() +
  aes(x = var_age, y = CF0) +
  scale_x_discrete(limits = c("Bavarian_a", "Bavarian_j", "Standard"),
                   labels=c("Bavarian_a" = "Bavarian, o.",
                            "Bavarian_j" = "Bavarian, y.",
                            "Standard" = "Standard")) +
  geom_hline(aes(yintercept=0)) +
  facet_grid(~position) +
  labs(y = "Slope") + 
  theme_linedraw() +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_text(size = 17),
        axis.text = element_text(size = 14),
        strip.text = element_blank(),
        strip.background = element_blank())

## save combined correlation plot with ggsave() as PNG, height = 7, width = 8
fig = grid.arrange(fig1, fig2, nrow = 2, heights = c(1, 0.8))

### 
## Correlation statistics at individual level
###

## ANOVA with slope ("CF0") as DV, speaker group ("var_age") as IV and speaker ("session") as random factor
## Pairwise comparisons for initial fortis stops
# WB age groups
temp = Initial.df$var_age == "Standard"
ini_wb.age <- ezANOVA(Initial.df[!temp,], .(CF0), .(session), between = .(var_age))$ANOVA
ini_wb.age[,4:5] = round(ini_wb.age[,4:5],3)
ini_wb.age

# SG vs. WB younger
temp = Initial.df$var_age == "Bavarian_a"
ini_wb.y_sd <- ezANOVA(Initial.df[!temp,], .(CF0), .(session), between = .(var_age))$ANOVA
ini_wb.y_sd[,4:5] = round(ini_wb.y_sd[,4:5],3)
ini_wb.y_sd

# SG vs. WB older
temp = Initial.df$var_age == "Bavarian_j"
ini_wb.o_sd <- ezANOVA(Initial.df[!temp,], .(CF0), .(session), between = .(var_age))$ANOVA
ini_wb.o_sd[,4:5] = round(ini_wb.o_sd[,4:5],3)
ini_wb.o_sd

## Pairwise comparisons for medial fortis stops
# WB age groups
temp = Medial.df$var_age == "Standard"
med_wb.age <- ezANOVA(Medial.df[!temp,], .(CF0), .(session), between = .(var_age))$ANOVA
med_wb.age[,4:5] = round(med_wb.age[,4:5],3)
med_wb.age

# SG vs. WB younger
temp = Medial.df$var_age == "Bavarian_a"
med_wb.y_sd <- ezANOVA(Medial.df[!temp,], .(CF0), .(session), between = .(var_age))$ANOVA
med_wb.y_sd[,4:5] = round(med_wb.y_sd[,4:5],3)
med_wb.y_sd

# SG vs. WB older
temp = Medial.df$var_age == "Bavarian_j"
med_wb.o_sd <- ezANOVA(Medial.df[!temp,], .(CF0), .(session), between = .(var_age))$ANOVA
med_wb.o_sd[,4:5] = round(med_wb.o_sd[,4:5],3)
med_wb.o_sd