# R version: 4.2.0
library(tidyverse)
library(car)

# Input tables, read from the working directory:
#   data - track data; supplies t.norm, scaledF0 and variety for Figure 1
#   land - landmark table; supplies the time stamps and labels used below
# NOTE(review): `header` is left at read.table()'s default, so the column
# names used later depend on the files' first line being detected as a
# header -- confirm against the input files.
data <- read.table(file = "V1C2V2_trackdata_wb_sg.txt")
land <- read.table(file = "V1C2V2_landmarks_wb_sg.txt")

################
### PREANALYSIS
## time-normalized positions of the C2 onset (= V1 end) and the C2 offset
# The original time stamps belonging to times_norm == 0 and times_norm == 1
# serve as f0 start and f0 end; each landmark is expressed as a proportion
# of the span between them.
vowelend.norm <- with(
  land,
  (V1.end - times_orig_zero) / (times_orig_one - times_orig_zero)
)
nasalend.norm <- with(
  land,
  (C2.end - times_orig_zero) / (times_orig_one - times_orig_zero)
)

# collect the proportions together with bundle and variety labels
v.df.norm <- tibble(
  v.end.norm = vowelend.norm,
  bundle = land$bundle,
  variety = land$variety
)
n.df.norm <- tibble(
  n.end.norm = nasalend.norm,
  bundle = land$bundle,
  variety = land$variety
)

## Table 3
# mean proportional vowel end and nasal end for each variety
# (one row per variety, value stored in column `mean`)
vmean.end.norm <- v.df.norm %>%
  group_by(variety) %>%
  summarise(mean = mean(v.end.norm)) %>%
  ungroup()

nmean.end.norm <- n.df.norm %>%
  group_by(variety) %>%
  summarise(mean = mean(n.end.norm)) %>%
  ungroup()

## Figure 1
# Base plot: scaled f0 over normalized time.
e <- ggplot(data = data, mapping = aes(x = t.norm, y = scaledF0))

# save this plot with ggsave() as PNG, height = 4, width = 6
# Layers: one loess smooth per variety, a horizontal reference line at 0,
# and vertical lines at the per-variety mean vowel end and mean nasal end.
e +
  geom_smooth(
    mapping = aes(linetype = variety),
    method = "loess",
    span = 0.75,
    color = "black"
  ) +
  geom_hline(mapping = aes(yintercept = 0), linetype = 3, color = "azure4") +
  geom_vline(
    data = vmean.end.norm,
    mapping = aes(xintercept = mean, linetype = variety)
  ) +
  geom_vline(
    data = nmean.end.norm,
    mapping = aes(xintercept = mean, linetype = variety)
  ) +
  scale_linetype_manual(
    name = "Variety",
    values = c("dotted", "dashed", "solid"),
    breaks = c("Bavarian_older", "Bavarian_younger", "Standard"),
    labels = c("Dialect, older", "Dialect, younger", "Standard German")
  ) +
  labs(x = "Normalized time", y = "Scaled f0") +
  theme_linedraw() +
  # drop the fill behind the legend keys
  guides(linetype = guide_legend(override.aes = list(fill = NA))) +
  theme(
    # legend anchored by its lower-left corner at (0.01, 0.01)
    legend.justification = c(0, 0),
    legend.position = c(0.01, 0.01),
    legend.title = element_blank(),
    legend.text = element_text(size = 15),
    axis.text = element_text(size = 15),
    axis.title = element_text(size = 20)
  )

################
### CORRELATION
## add columns for normalization of V1C2V2 duration
# All three columns are derived inside one mutate() using bare column names,
# so the computation goes through dplyr's data mask instead of reaching back
# into the global `land` object.
land <- land %>%
  mutate(
    # raw duration of the V1C2V2 sequence
    V1C2V2dur = V2.end - V1.start,
    # normalization basis: targ.end - targ.start by default; for "Panik"
    # bundles V2.end - targ.start is used instead, as a basis of comparable
    # length
    normbas = ifelse(
      word == "Panik",
      V2.end - targ.start,
      targ.end - targ.start
    ),
    # V1C2V2 duration as a proportion of the normalization basis
    V1C2V2dur.norm = V1C2V2dur / normbas
  )

## mean normalized time point of the smooth maximum and mean normalized
## V1C2V2 duration per session x word x variety (averaged across repetitions)
land_mean <- ungroup(
  summarise(
    group_by(land, session, word, variety),
    m.normT.secmaxSmooth = mean(times_norm_secmaxSmooth),
    m.normD.V1C2V2 = mean(V1C2V2dur.norm)
  )
)

## remove 3 outliers (x close to 0 or 1)
# Keeps only rows whose mean normalized peak time is at least 0.05 and is
# not exactly 1; rows with a missing value are dropped as well.
land_mean_clean <- land_mean %>%
  dplyr::filter(
    m.normT.secmaxSmooth >= 0.05,
    m.normT.secmaxSmooth != 1
  )

## Figure 4
# Base plot: mean normalized V1C2V2 duration against the mean normalized
# time point of the smooth maximum; `word` supplies the text labels.
f <- ggplot(
  land_mean_clean,
  aes(m.normT.secmaxSmooth, m.normD.V1C2V2, label = word)
)

# save this plot with ggsave() as PNG, height = 6, width = 6
# Layers: points (shape by variety), word labels, and one linear fit per
# variety without confidence band.
f +
  geom_point(aes(shape = variety), size = 2) +
  scale_shape_manual(
    values = c(17, 2, 4),
    name = "Variety",
    breaks = c("Bavarian_older", "Bavarian_younger", "Standard"),
    labels = c("Dialect, older", "Dialect, younger", "Standard German")
  ) +
  # TRUE/FALSE spelled out: T and F are ordinary variables and can be
  # reassigned
  geom_text(size = 4, check_overlap = TRUE, color = "azure4") +
  # NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` for
  # lines; `size` is kept here because the installed version is not known.
  geom_smooth(
    aes(linetype = variety),
    method = "lm",
    se = FALSE,
    color = "black",
    size = 0.8
  ) +
  scale_linetype_manual(
    values = c("dotted", "dashed", "solid"),
    name = "Variety",
    breaks = c("Bavarian_older", "Bavarian_younger", "Standard"),
    labels = c("Dialect, older", "Dialect, younger", "Standard German")
  ) +
  labs(
    x = "Mean time points (norm.) of smooth maxima",
    y = expression(Mean~duration~(norm.)~of~V[1]~C[2]~V[2]-sequence)
  ) +
  xlim(0, 1) +
  theme_linedraw() +
  theme(
    # legend anchored by its lower-left corner at (0.01, 0.82)
    legend.justification = c(0, 0),
    legend.position = c(0.01, 0.82),
    legend.text = element_text(size = 14),
    legend.title = element_blank(),
    axis.text = element_text(size = 15),
    axis.title = element_text(size = 17)
  )

## one table per speaker group, used for the correlation tests
# Each table holds the rows of land_mean_clean for a single variety.
bav_old <- dplyr::filter(land_mean_clean, variety == "Bavarian_older")

bav_young <- dplyr::filter(land_mean_clean, variety == "Bavarian_younger")

sta <- dplyr::filter(land_mean_clean, variety == "Standard")

## check residual (error) variance
# Fit one linear model per speaker group: mean normalized V1C2V2 duration
# regressed on the mean normalized time point of the smooth maximum.
bav_old.m <- lm(
  formula = m.normD.V1C2V2 ~ m.normT.secmaxSmooth,
  data = bav_old
)
bav_young.m <- lm(
  formula = m.normD.V1C2V2 ~ m.normT.secmaxSmooth,
  data = bav_young
)
sta.m <- lm(
  formula = m.normD.V1C2V2 ~ m.normT.secmaxSmooth,
  data = sta
)

# Run ncvTest() on each fit; the outcome noted for each group is recorded
# next to the call.
ncvTest(bav_old.m)
# older Bavarians: p-value < 0.05 -> heteroscedasticity

ncvTest(bav_young.m)
# younger Bavarians: p-value > 0.05 -> homoscedasticity

ncvTest(sta.m)
# Standard German speakers: p-value > 0.05 -> homoscedasticity

## test significance of correlation
# A positive correlation is expected, so every test is one-sided
# (alternative = "greater").

# older Bavarians: rank-based (Spearman) statistic because of the
# heteroscedasticity noted above
cor.test(
  ~ m.normT.secmaxSmooth + m.normD.V1C2V2,
  data = bav_old,
  method = "spearman",
  alternative = "greater"
)

# younger Bavarians & SG speakers: Pearson's correlation coefficient r
cor.test(
  ~ m.normT.secmaxSmooth + m.normD.V1C2V2,
  data = bav_young,
  method = "pearson",
  alternative = "greater"
)

cor.test(
  ~ m.normT.secmaxSmooth + m.normD.V1C2V2,
  data = sta,
  method = "pearson",
  alternative = "greater"
)