# R version: 4.2.0
library(tidyverse)
library(ggpubr)
library(car)

################
### DATA
## load final data sets for sonorant and obstruent tokens
data_son = read.table("Son_V1C2_trackdata_sd_ws_zh_younger.txt")
land_son = read.table("Son_V1C2_landmarks_sd_ws_zh_younger.txt")

data_obs = read.table("Obs_V1_trackdata_sd_ws_zh_younger.txt")
land_obs = read.table("Obs_V1_landmarks_sd_ws_zh_younger.txt")

## combine sonorant and obstruent data
data = rbind(data_obs, data_son) %>%
  arrange(bundle)

## Table 1
# get number of included productions per category (i.e., data set and variety)
data %>%
  distinct(C2.type, variety, bundle) %>%
  group_by(C2.type, variety) %>% # for each level of Category
  count() %>%
  ungroup()

# repeat counting, this time additionally split by syllable structure
data %>%
  distinct(C2.type, variety, V1.quantity, bundle) %>%
  group_by(C2.type, variety, V1.quantity) %>% # for each level of Category
  count() %>%
  ungroup()

################
### PREANALYSIS
## calculate time-normalized position of V1C2-segment boundary for sonorant data
# use original time stamps of times_norm == 0 and == 1 as f0-start/-end
vowelend.norm = (land_son$V1.end - land_son$times_orig_zero)/(land_son$times_orig_one - land_son$times_orig_zero)

# write into table
v.df.norm = tibble(v.end.norm = vowelend.norm, bundle = land_son$bundle, variety = land_son$variety, V1.quantity = land_son$V1.quantity)

## Table 2
# get mean proportional vowel end per category (i.e., variety and syllable structure)
vmean.end.norm = v.df.norm %>% 
  group_by(variety, V1.quantity) %>% 
  summarise(mean = mean(v.end.norm)) %>%
  mutate(C2.type = "Sonorant") %>%
  ungroup()

## Figure 3
# create character vectors for labels
vowel.quant <- c(
  'long' = "open/long",
  'short' = "closed/short"
)

cons.type <- c(
  'Sonorant' = "Vowel+sonorant-sequences",
  'Obstruent' = "Vowels preceding obstruents"
)

# save this plot with ggsave() as PNG, height = 7, width = 10
e <- ggplot(data, aes(t.norm, scaledF0))
  
e + geom_smooth(aes(linetype = variety), method = "loess", span = 0.75, color = "black") +
  geom_hline(aes(yintercept = 0), linetype = 3, color = "azure4") +
  geom_vline(data = vmean.end.norm, aes(xintercept = mean, linetype = variety)) +
  facet_grid(V1.quantity ~ C2.type, labeller = labeller(V1.quantity = as_labeller(vowel.quant),
                                                        C2.type = as_labeller(cons.type))) +
  scale_linetype_manual(values = c("solid", "dotdash", "dashed"), 
                        name = "Variety", 
                        breaks = c("German_Standard", "Austrian_Standard", "Swiss_Standard"),
                        labels = c("Munich Standard German", "Vienna Standard German", "Zurich Standard German")) +
  labs(x = "Normalized time", y = "Scaled f0") + 
  theme_linedraw() + 
  guides(linetype = guide_legend(override.aes = list(fill = NA))) +
  theme(legend.justification = c(0, 0), legend.position = c(0.01, 0.01), legend.text = element_text(size = 17), legend.title = element_blank(),
        axis.text = element_text(size = 16),
        axis.title = element_text(size = 20),
        strip.text = element_text(color = "black", size = 20),
        strip.background = element_rect(fill = "white"))

################
### CORRELATION
## add columns for normalization of V1C2 duration (in case of sonorants) and V1 duration (in case of obstruents)
land_son$V1C2dur = land_son$C2.end - land_son$V1.start
land_son$targdur = land_son$targ.end - land_son$targ.start
land_son$V1C2dur.norm = land_son$V1C2dur/land_son$targdur

land_obs$V1dur = land_obs$V1.end - land_obs$V1.start
land_obs$targdur = land_obs$targ.end - land_obs$targ.start
land_obs$V1dur.norm = land_obs$V1dur/land_obs$targdur

## count & remove time points == 0 or == 1 for correlation plots and stats of both data sets
with(land_son, table(variety, times_norm_secmaxSmooth == 0))
with(land_son, table(variety, times_norm_secmaxSmooth == 1))
land_son_mid <- land_son %>%
  subset(times_norm_secmaxSmooth != 0 & times_norm_secmaxSmooth != 1)

with(land_obs, table(variety, times_norm_secmaxSmooth == 0))
with(land_obs, table(variety, times_norm_secmaxSmooth == 1))
land_obs_mid <- land_obs %>%
  subset(times_norm_secmaxSmooth != 0 & times_norm_secmaxSmooth != 1)

## further exclude 4 outliers from sonorant data and 8 outliers from obstruent data
land_son_mid_clean = land_son_mid %>%
  filter(!bundle=="WS_0008_Hoehle_normal_03") %>%
  filter(!bundle=="SD_0009_Dinner_normal_04") %>%
  filter(!bundle=="SH_ZH_1030_Hoehle_normal_05") %>%
  filter(!bundle=="SH_ZH_0026_Hoelle_normal_03")

land_obs_mid_clean = land_obs_mid %>%
  filter(!bundle=="SH_ZH_0025_wieder_normal_02") %>%
  filter(!bundle=="SH_ZH_0026_Kater_normal_03") %>%
  filter(!bundle=="SH_ZH_0026_wieder_normal_03") %>%
  filter(!bundle=="SH_ZH_1029_Bieter_normal_04") %>%
  filter(!bundle=="SH_ZH_0024_Pute_normal_04") %>%
  filter(!bundle=="SH_ZH_0023_bitter_normal_02") %>%
  filter(!bundle=="SH_ZH_0025_bitter_normal_05") %>%
  filter(!bundle=="SH_ZH_1030_Pudding_normal_04")

## Figure 10
# save this plot with ggsave() as PNG, height = 5, width = 10

# plot sonorant data first
e_son <- ggplot(land_son_mid_clean, aes(times_norm_secmaxSmooth, V1C2dur.norm))

corr_son <- e_son + geom_point(aes(color = variety, shape = V1.quantity)) +
  geom_smooth(aes(color = variety, linetype = V1.quantity), method = lm, se = F) +  
  theme_linedraw() +
  scale_color_manual(values = c("gray20", "gray60", "gray78"),
                     name = "Variety", 
                     breaks = c("German_Standard", "Austrian_Standard", "Swiss_Standard"),
                     labels = c("Munich Standard German", "Vienna Standard German", "Zurich Standard German")) +
  scale_shape_manual(values = c(16, 17),
                     name = "Syllable structure", 
                     breaks = c("long", "short"),
                     labels = c("open/long", "closed/short")) +
  scale_linetype_manual(values = c("solid", "dashed"),
                        name = "Syllable structure", 
                        breaks = c("long", "short"),
                        labels = c("open/long", "closed/short")) +
  guides(linetype = guide_legend(override.aes= list(color = "black"))) +
  labs(x = "Norm. time points of smooth maxima", y = "Norm. duration of V+S-sequence") +
  theme(axis.title = element_text(size = 14),
        legend.title = element_text(size = 11, face = "italic"),
        legend.text = element_text(size = 10))

# repeat plotting for obstruent data
e_obs <- ggplot(land_obs_mid_clean, aes(times_norm_secmaxSmooth, V1dur.norm))

corr_obs <- e_obs + geom_point(aes(color = variety, shape = V1.quantity)) +
  geom_smooth(aes(color = variety, linetype = V1.quantity), method = lm, se = F) +  
  theme_linedraw() + 
  scale_color_manual(values = c("gray20", "gray60", "gray78"),
                     name = "Variety", 
                     breaks = c("German_Standard", "Austrian_Standard", "Swiss_Standard"),
                     labels = c("Munich Standard German", "Vienna Standard German", "Zurich Standard German")) +
  scale_shape_manual(values = c(16, 17),
                     name = "Syllable structure", 
                     breaks = c("long", "short"),
                     labels = c("open/long", "closed/short")) +
  scale_linetype_manual(values = c("solid", "dashed"),
                        name = "Syllable structure", 
                        breaks = c("long", "short"),
                        labels = c("open/long", "closed/short")) +
  guides(linetype = guide_legend(override.aes= list(color = "black"))) +
  labs(x = "Norm. time points of smooth maxima", y = "Norm. duration of VbO") +
  theme(axis.title = element_text(size = 14),
        legend.title = element_text(size = 11, face = "italic"),
        legend.text = element_text(size = 10))

# combine both correlation plots into one figure
corr <- ggarrange(corr_son + rremove("xlab"), corr_obs + rremove("xlab"), 
                  ncol = 2, nrow = 1, labels = c("A", "B"),
                  common.legend = TRUE, legend="top")
annotate_figure(corr,
                bottom = text_grob("Norm. time points of smooth maxima", size = 14))

## Table 4
# get Pearson's r for both data sets (per category; i.e., split by variety and syllable structure)
land_son_mid_clean %>% 
  group_by(variety, V1.quantity) %>% 
  summarise(corr = cor(times_norm_secmaxSmooth, V1C2dur.norm)) %>%
  ungroup()

land_obs_mid_clean %>% 
  group_by(variety, V1.quantity) %>% 
  summarise(corr = cor(times_norm_secmaxSmooth, V1dur.norm)) %>%
  ungroup()

## subset both data sets into speaker groups and by underlying syllable structure for correlation tests
# sonorant data
son_vsg_long = land_son_mid_clean %>%
  subset(variety == "Austrian_Standard" & V1.quantity == "long")
son_vsg_short = land_son_mid_clean %>%
  subset(variety == "Austrian_Standard" & V1.quantity == "short")

son_msg_long = land_son_mid_clean %>%
  subset(variety == "German_Standard" & V1.quantity == "long")
son_msg_short = land_son_mid_clean %>%
  subset(variety == "German_Standard" & V1.quantity == "short")

son_zsg_long = land_son_mid_clean %>%
  subset(variety == "Swiss_Standard" & V1.quantity == "long")
son_zsg_short = land_son_mid_clean %>%
  subset(variety == "Swiss_Standard" & V1.quantity == "short")

# obstruent data
obs_vsg_long = land_obs_mid_clean %>%
  subset(variety == "Austrian_Standard" & V1.quantity == "long")
obs_vsg_short = land_obs_mid_clean %>%
  subset(variety == "Austrian_Standard" & V1.quantity == "short")

obs_msg_long = land_obs_mid_clean %>%
  subset(variety == "German_Standard" & V1.quantity == "long")
obs_msg_short = land_obs_mid_clean %>%
  subset(variety == "German_Standard" & V1.quantity == "short")

obs_zsg_long = land_obs_mid_clean %>%
  subset(variety == "Swiss_Standard" & V1.quantity == "long")
obs_zsg_short = land_obs_mid_clean %>%
  subset(variety == "Swiss_Standard" & V1.quantity == "short")

## check residual (error) variance
# sonorant data
son_vsg_long.m <- lm(V1C2dur.norm ~ times_norm_secmaxSmooth, data = son_vsg_long)
ncvTest(son_vsg_long.m)
#p-value > 0.05 —>  homoscedasticity
son_vsg_short.m <- lm(V1C2dur.norm ~ times_norm_secmaxSmooth, data = son_vsg_short)
ncvTest(son_vsg_short.m)
#p-value > 0.05 —>  homoscedasticity

son_msg_long.m <- lm(V1C2dur.norm ~ times_norm_secmaxSmooth, data = son_msg_long)
ncvTest(son_msg_long.m)
#p-value < 0.05 —>  heteroscedasticity
son_msg_short.m <- lm(V1C2dur.norm ~ times_norm_secmaxSmooth, data = son_msg_short)
ncvTest(son_msg_short.m)
#p-value > 0.05 —>  homoscedasticity

son_zsg_long.m <- lm(V1C2dur.norm ~ times_norm_secmaxSmooth, data = son_zsg_long)
ncvTest(son_zsg_long.m)
#p-value > 0.05 —>  homoscedasticity
son_zsg_short.m <- lm(V1C2dur.norm ~ times_norm_secmaxSmooth, data = son_zsg_short)
ncvTest(son_zsg_short.m)
#p-value < 0.05 —>  heteroscedasticity

# obstruent data
obs_vsg_long.m <- lm(V1dur.norm ~ times_norm_secmaxSmooth, data = obs_vsg_long)
ncvTest(obs_vsg_long.m)
#p-value > 0.05 —>  homoscedasticity
obs_vsg_short.m <- lm(V1dur.norm ~ times_norm_secmaxSmooth, data = obs_vsg_short)
ncvTest(obs_vsg_short.m)
#p-value > 0.05 —>  heteroscedasticity

obs_msg_long.m <- lm(V1dur.norm ~ times_norm_secmaxSmooth, data = obs_msg_long)
ncvTest(obs_msg_long.m)
#p-value < 0.05 —>  homoscedasticity
obs_msg_short.m <- lm(V1dur.norm ~ times_norm_secmaxSmooth, data = obs_msg_short)
ncvTest(obs_msg_short.m)
#p-value > 0.05 —>  homoscedasticity

obs_zsg_long.m <- lm(V1dur.norm ~ times_norm_secmaxSmooth, data = obs_zsg_long)
ncvTest(obs_zsg_long.m)
#p-value > 0.05 —>  homoscedasticity
obs_zsg_short.m <- lm(V1dur.norm ~ times_norm_secmaxSmooth, data = obs_zsg_short)
ncvTest(obs_zsg_short.m)
#p-value < 0.05 —>  homoscedasticity

## test significance of correlation
# use non-parametric correlation statistics for both data sets due to heteroscedasticity in some subsets (see above)

# sonorant data: negative correlation expected, therefore alternative = "less"
with(son_vsg_long, cor.test(times_norm_secmaxSmooth, V1C2dur.norm, method = "spearman", alternative = "less"))
with(son_vsg_short, cor.test(times_norm_secmaxSmooth, V1C2dur.norm, method = "spearman", alternative = "less"))

with(son_msg_long, cor.test(times_norm_secmaxSmooth, V1C2dur.norm, method = "spearman", alternative = "less"))
with(son_msg_short, cor.test(times_norm_secmaxSmooth, V1C2dur.norm, method = "spearman", alternative = "less"))

with(son_zsg_long, cor.test(times_norm_secmaxSmooth, V1C2dur.norm, method = "spearman", alternative = "less"))
with(son_zsg_short, cor.test(times_norm_secmaxSmooth, V1C2dur.norm, method = "spearman", alternative = "less"))

# obstruent data: positive correlation expected, therefore alternative = "greater"
with(obs_vsg_long, cor.test(times_norm_secmaxSmooth, V1dur.norm, method = "spearman", alternative = "greater"))
with(obs_vsg_short, cor.test(times_norm_secmaxSmooth, V1dur.norm, method = "spearman", alternative = "greater"))

with(obs_msg_long, cor.test(times_norm_secmaxSmooth, V1dur.norm, method = "spearman", alternative = "greater"))
with(obs_msg_short, cor.test(times_norm_secmaxSmooth, V1dur.norm, method = "spearman", alternative = "greater"))

with(obs_zsg_long, cor.test(times_norm_secmaxSmooth, V1dur.norm, method = "spearman", alternative = "greater"))
with(obs_zsg_short, cor.test(times_norm_secmaxSmooth, V1dur.norm, method = "spearman", alternative = "greater"))