--- title: "Data, Plots & Models" date: "June 25, 2022" author: "Markus Jochim & Felicitas Kleber" output: html_document: toc: true number_sections: true toc_float: collapsed: true df_print: paged code_folding: hide --- ```{r include=FALSE} library(tidyverse) library(emuR) library(lmerTest) library(emmeans) df_sd = read_csv("results-20190626/df_sd-stops-real_words-normal_and_fast.e92a42b.csv") df_wb = read_csv("results-20190626/df_wb-stops-real_words-normal_and_fast.0136fb4.csv") df = rbind(df_sd, df_wb) %>% mutate (release_type = case_when( following_sound %in% c("6", "@", "i", "I") ~ "oral", following_sound %in% c("m", "n", "N") ~ "nasal", following_sound %in% c("_h", "_t", "h") & second_next_sound %in% c("6", "@", "i", "I") ~ "oral", following_sound %in% c("_h", "_t", "h") & second_next_sound %in% c("m", "n", "N") ~ "nasal", following_sound %in% c("_h") & second_next_sound %in% c("l") ~ "lateral" )) %>% mutate (legal = case_when(variety == "SD" ~ "legal", variety == "WB" ~ recode(category, "V:C" = "legal", "VC:" = "legal", "VC" = "illegal", "V:C:" = "illegal"))) %>% mutate (speaker_group = paste0(age, "_", variety)) %>% mutate (speaker_group_label = factor(speaker_group, levels = c("Y_SD", "Y_WB", "O_WB"), labels = c("Standard German", "Dialect, younger", "Dialect, older"))) %>% mutate (standard_dialect_continuum = case_when( age == "O" & variety == "WB" ~ 1, age == "Y" & variety == "WB" ~ 2, age == "Y" & variety == "SD" ~ 3 )) %>% mutate (consonant_strength = recode(category, "V:C" = "Lenis", "VC" = "Lenis", "V:C:" = "Fortis", "VC:" = "Fortis")) %>% mutate (vowel_length = recode(category, "V:C" = "Long", "VC" = "Short", "V:C:" = "Long", "VC:" = "Short")) %>% mutate (rate = recode(rate, "normal" = "normal", "schnell" = "fast")) %>% mutate (rate = factor(rate, ordered = T, levels = c("normal", "fast"))) df_normal = df %>% filter(rate == "normal") df_fast = df %>% filter(rate == "fast") ``` # Underlying data set ## Speaker groups ```{r} df %>% count(variety, age, speaker) %>% count (variety, age) ``` ## Tokens per speech rate ```{r} df %>% count(rate) ``` ## Tokens per target word ```{r} df %>% count(word_group, target_word) %>% arrange(word_group, desc(n)) %>% select(-word_group) ``` ## Matrix of target words ```{r} df %>% count(word_group, category, target_word) %>% select (-n) %>% spread(category, target_word) ``` ## Types and tokens per quantity category ```{r} df %>% count(category, target_word) %>% count(category) df %>% count(category) ``` # Closure-norm (word-normalized stop closure duration) ## Plot (Figure 4.1) ```{r plot_normalized_closure_duration} df %>% ggplot() + aes(x = category, y = dur_c / dur_word) + facet_grid(cols = vars(speaker_group_label), rows = vars(rate)) + geom_boxplot(aes(fill = legal)) + ylab(bquote("closure"["norm"])) + xlab("Quantity category") + scale_x_discrete(labels = c("V:C:" = "V:C:\ne.g. Bieter", "V:C" = "V:C\ne.g. wieder", "VC" = "VC\ne.g. Widder", "VC:" = "VC:\ne.g. bitter" )) + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + scale_fill_manual(name = "Phonotactically", values = c("red", "green")) ``` ## Model: Normalized closure duration ```{r model_normalized_closure_duration} emm_options(pbkrtest.limit = nrow(df)) emm_options(lmerTest.limit = nrow(df)) general_closure.lmer = df %>% mutate (speaker_group = factor(speaker_group, ordered = T, levels = c("Y_SD", "Y_WB", "O_WB"))) %>% lmer(data = ., dur_c / dur_word ~ speaker_group * category * rate + (category + rate | speaker) + (speaker_group + rate | target_word) ) general_closure.anova = anova(general_closure.lmer) print(general_closure.anova) ``` ### Pairwise comparisons for Section 4.3.1 (Closure duration) ```{r pairwise_comparisons_4_3_1} emmeans(general_closure.lmer, pairwise ~ speaker_group | category, lmer.df = "satterthwaite") emmeans(general_closure.lmer, pairwise ~ rate | speaker_group, lmer.df = "satterthwaite") ``` ### Pairwise comparisons for Section 4.3.3 (Subsection: Lenition in fast speech) ```{r pairwise_comparisons_4_3_3} emmeans(general_closure.lmer, pairwise ~ rate | category | speaker_group, lmer.df = "satterthwaite") ``` # VOT-norm (word-normalized voice onset time) ## Plot (Figure 4.2) ```{r plot_normalized_vot} df %>% ggplot() + aes(x = category, y = dur_aspiration / dur_word) + facet_grid(cols = vars(speaker_group_label), rows = vars(rate)) + geom_boxplot(aes(fill = legal)) + ylab(bquote("VOT"["norm"])) + xlab("Quantity category") + scale_x_discrete(labels = c("V:C:" = "V:C:\ne.g. Bieter", "V:C" = "V:C\ne.g. wieder", "VC" = "VC\ne.g. Widder", "VC:" = "VC:\ne.g. bitter" )) + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + scale_fill_manual(name = "Phonotactically", values = c("red", "green")) ``` ## Model ```{r model_normalized_vot} emm_options(pbkrtest.limit = nrow(df)) emm_options(lmerTest.limit = nrow(df)) normalized_vot.lmer = df %>% mutate (speaker_group = factor(speaker_group, ordered = T, levels = c("Y_SD", "Y_WB", "O_WB"))) %>% lmer(data = ., dur_aspiration / dur_word ~ speaker_group * category * rate + (category + rate | speaker) + (speaker_group + rate | target_word) ) normalized_vot.anova = anova(normalized_vot.lmer) print(normalized_vot.anova) ``` ### Pairwise comparisons for Section 4.3.2 (Voice onset time) ```{r pairwise_comparisons_4_3_2} emmeans(normalized_vot.lmer, pairwise ~ category | speaker_group, lmer.df = "satterthwaite") ``` # Optimal category boundary ## Calculation ```{r calculation_optimal_category_boundary} possible_boundaries = seq(0, 280, by = 1) # (Pre-allocation) Generate a data frame of the correct size with an empty # column "classified_correctly". That column will be filled in the for loop # below. all_boundary_data_normal = tidyr::crossing(speaker = df$speaker, boundary = possible_boundaries) %>% mutate(classified_correctly = NA) for (current_row in rownames(all_boundary_data_normal)) { current_speaker = all_boundary_data_normal[current_row,]$speaker current_boundary = all_boundary_data_normal[current_row,]$boundary df %>% filter(rate == "normal") %>% filter(speaker == current_speaker) %>% mutate (classified_correctly = case_when((consonant_strength == "Fortis" & dur_c >= current_boundary) ~ "correct", (consonant_strength == "Fortis" & dur_c < current_boundary) ~ "incorrect", (consonant_strength == "Lenis" & dur_c >= current_boundary) ~ "incorrect", (consonant_strength == "Lenis" & dur_c < current_boundary) ~ "correct")) %>% count(classified_correctly) %>% spread(classified_correctly, n) %>% mutate(classified_correctly = correct / (correct + incorrect)) -> df_augmented all_boundary_data_normal[current_row, "classified_correctly"] = df_augmented$classified_correctly } best_boundary_data_normal = all_boundary_data_normal %>% group_by(speaker) %>% top_n(1, classified_correctly) %>% group_by(speaker) %>% summarise(min_boundary = min(boundary), max_boundary = max(boundary), boundary = mean(c(min(boundary), max(boundary))), classified_correctly = first(classified_correctly)) %>% mutate(rate = "normal") #### The same again for fast speech # (Pre-allocation) Generate a data frame of the correct size with an empty # column "classified_correctly". That column will be filled in the for loop # below. all_boundary_data_fast = tidyr::crossing(speaker = df$speaker, boundary = possible_boundaries) %>% mutate(classified_correctly = NA) for (current_row in rownames(all_boundary_data_fast)) { current_speaker = all_boundary_data_fast[current_row,]$speaker current_boundary = all_boundary_data_fast[current_row,]$boundary df %>% filter(rate == "fast") %>% filter(speaker == current_speaker) %>% mutate (classified_correctly = case_when((consonant_strength == "Fortis" & dur_c >= current_boundary) ~ "correct", (consonant_strength == "Fortis" & dur_c < current_boundary) ~ "incorrect", (consonant_strength == "Lenis" & dur_c >= current_boundary) ~ "incorrect", (consonant_strength == "Lenis" & dur_c < current_boundary) ~ "correct")) %>% count(classified_correctly) %>% spread(classified_correctly, n) %>% mutate(classified_correctly = correct / (correct + incorrect)) -> df_augmented all_boundary_data_fast[current_row, "classified_correctly"] = df_augmented$classified_correctly } best_boundary_data_fast = all_boundary_data_fast %>% group_by(speaker) %>% top_n(1, classified_correctly) %>% group_by(speaker) %>% summarise(min_boundary = min(boundary), max_boundary = max(boundary), boundary = mean(c(min(boundary), max(boundary))), classified_correctly = first(classified_correctly)) %>% mutate(rate = "fast") best_boundary_data_all = rbind (best_boundary_data_normal, best_boundary_data_fast) ``` ## Plot (Figure 4.3) ```{r} left_join(x = best_boundary_data_all, y = select(df, speaker, age, variety, speaker_group_label), by = "speaker") %>% distinct %>% ggplot() + facet_wrap(vars(speaker_group_label)) + aes (x = reorder(rate, desc(rate)), y = boundary, group = speaker) + geom_point() + geom_line() + ylab("Optimal category boundary [ms]") + xlab("Speech rate") ``` # Category expansion ## Calculation ```{r calculation_category_expansion} df_dispersion_difference_with_target_word = df %>% gather(key = "measure", value = "value", dur_c_norm_word) %>% group_by(measure, variety, age, speaker_group, speaker_group_label, category, legal, rate, speaker, target_word) %>% summarise(coefficient_of_variation = sd(value, na.rm = TRUE) / mean(value, na.rm = TRUE)) %>% ungroup() %>% spread(rate, coefficient_of_variation) %>% mutate(change_in_dispersion = fast - normal) df_dispersion_difference_without_target_word = df %>% gather(key = "measure", value = "value", dur_c_norm_word) %>% group_by(measure, variety, age, speaker_group, speaker_group_label, category, legal, rate, speaker) %>% summarise(coefficient_of_variation = sd(value, na.rm = TRUE) / mean(value, na.rm = TRUE)) %>% ungroup() %>% spread(rate, coefficient_of_variation) %>% mutate(change_in_dispersion = fast - normal) ``` ## Plot (Figure 4.4) ```{r plot_category_expansion} df_dispersion_difference_without_target_word %>% filter(measure == "dur_c_norm_word") %>% mutate(speaker_group = factor(speaker_group, levels = c("Y_SD", "Y_WB", "O_WB"), labels = c("Standard German", "Dialect, younger", "Dialect, older"))) %>% ggplot() + aes(x = category, y = change_in_dispersion) + facet_grid(cols = vars(speaker_group), scales = "free_y") + geom_point() + geom_boxplot(alpha = 0.8, outlier.shape = NA, coef = 0) + ylab("Category expansion") + xlab("") + scale_x_discrete(labels = c("V:C:" = "V:C:\ne.g. Bieter", "V:C" = "V:C\ne.g. wieder", "VC" = "VC\ne.g. Widder", "VC:" = "VC:\ne.g. bitter" )) + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + scale_fill_manual(name = "Phonotactically", values = c("red", "green")) + coord_cartesian(ylim = c(-0.15, 0.3)) + geom_hline (aes(yintercept = 0)) ``` ## Model ```{r model_category_expansion} emm_options(pbkrtest.limit = nrow(df_dispersion_difference_with_target_word)) emm_options(lmerTest.limit = nrow(df_dispersion_difference_with_target_word)) dispersion_difference.lmer = df_dispersion_difference_with_target_word %>% filter(measure == "dur_c_norm_word") %>% mutate (speaker_group = factor(speaker_group, ordered = T, levels = c("Y_SD", "Y_WB", "O_WB"))) %>% lmer(data = ., change_in_dispersion ~ speaker_group * category + (category | speaker) + (speaker_group | target_word) ) dispersion_difference.anova = anova(dispersion_difference.lmer) print(dispersion_difference.anova) ``` # Fortis–lenis overlap ## Calculation ```{r} # Calculate the baseline of lenis tokens for fortis target words, normal rate typical_lenis_upper_bound_normal = df_normal %>% filter(consonant_strength == "Lenis") %>% group_by(speaker, place_of_articulation) %>% summarise(typical_lenis_upper_bound = quantile(dur_c / dur_word, probs = 0.75)) # Calculate the baseline of lenis tokens for fortis target words, fast rate typical_lenis_upper_bound_fast = df_fast %>% filter(consonant_strength == "Lenis") %>% group_by(speaker, place_of_articulation) %>% summarise(typical_lenis_upper_bound = quantile(dur_c / dur_word, probs = 0.75)) # Calculate the baseline of fortis tokens for lenis target words, normal rate typical_fortis_lower_bound_normal = df_normal %>% filter(consonant_strength == "Fortis") %>% group_by(speaker, place_of_articulation) %>% summarise(typical_fortis_lower_bound = quantile(dur_c / dur_word, probs = 0.25)) # Calculate the baseline of fortis tokens for lenis target words, fast rate typical_fortis_lower_bound_fast = df_fast %>% filter(consonant_strength == "Fortis") %>% group_by(speaker, place_of_articulation) %>% summarise(typical_fortis_lower_bound = quantile(dur_c / dur_word, probs = 0.25)) ``` ### Fortis as target, lenis as baseline #### Per place of articulation ```{r} flo_per_place_of_articulation_normal_rate_fortis_as_target = # FLO is short for fortis–lenis overlap left_join(df_normal, typical_lenis_upper_bound_normal, by = c("speaker", "place_of_articulation")) %>% mutate(inside_typical_lenis_region = case_when(dur_c/dur_word <= typical_lenis_upper_bound ~ "inside", dur_c/dur_word > typical_lenis_upper_bound ~ "outside")) %>% group_by(variety, age, speaker_group_label, speaker, place_of_articulation, consonant_strength) %>% mutate (number_of_tokens_in_group = n()) %>% group_by(variety, age, speaker, speaker_group_label, place_of_articulation, consonant_strength, number_of_tokens_in_group, inside_typical_lenis_region) %>% summarise(absolute_frequency = n()) %>% mutate (relative_frequency = absolute_frequency / number_of_tokens_in_group) %>% select(-absolute_frequency) %>% spread(inside_typical_lenis_region, relative_frequency, fill = 0) %>% filter(consonant_strength == "Fortis") %>% ungroup %>% mutate (speech_rate = "normal") flo_per_place_of_articulation_fast_rate_fortis_as_target = # FLO is short for fortis–lenis overlap left_join(df_fast, typical_lenis_upper_bound_fast, by = c("speaker", "place_of_articulation")) %>% mutate(inside_typical_lenis_region = case_when(dur_c/dur_word <= typical_lenis_upper_bound ~ "inside", dur_c/dur_word > typical_lenis_upper_bound ~ "outside")) %>% group_by(variety, age, speaker_group_label, speaker, place_of_articulation, consonant_strength) %>% mutate (number_of_tokens_in_group = n()) %>% group_by(variety, age, speaker_group_label, speaker, place_of_articulation, consonant_strength, number_of_tokens_in_group, inside_typical_lenis_region) %>% summarise(absolute_frequency = n()) %>% mutate (relative_frequency = absolute_frequency / number_of_tokens_in_group) %>% select(-absolute_frequency) %>% spread(inside_typical_lenis_region, relative_frequency, fill = 0) %>% filter(consonant_strength == "Fortis") %>% ungroup %>% mutate (speech_rate = "fast") flo_per_place_of_articulation_all_rates_fortis_as_target = rbind(flo_per_place_of_articulation_normal_rate_fortis_as_target, flo_per_place_of_articulation_fast_rate_fortis_as_target) ``` #### Per word ```{r} flo_per_word_normal_rate_fortis_as_target = # FLO is short for fortis–lenis overlap left_join(df_normal, typical_lenis_upper_bound_normal, by = c("speaker", "place_of_articulation")) %>% mutate(inside_typical_lenis_region = case_when(dur_c/dur_word <= typical_lenis_upper_bound ~ "inside", dur_c/dur_word > typical_lenis_upper_bound ~ "outside")) %>% group_by(variety, age, speaker_group_label, speaker, place_of_articulation, consonant_strength, target_word) %>% mutate (number_of_tokens_in_group = n()) %>% group_by(variety, age, speaker_group_label, speaker, place_of_articulation, consonant_strength, target_word, number_of_tokens_in_group, inside_typical_lenis_region) %>% summarise(absolute_frequency = n()) %>% mutate (relative_frequency = absolute_frequency / number_of_tokens_in_group) %>% select(-absolute_frequency) %>% spread(inside_typical_lenis_region, relative_frequency, fill = 0) %>% filter(consonant_strength == "Fortis") %>% mutate (speech_rate = "normal") %>% ungroup flo_per_word_fast_rate_fortis_as_target = # FLO is short for fortis–lenis overlap left_join(df_fast, typical_lenis_upper_bound_fast, by = c("speaker", "place_of_articulation")) %>% mutate(inside_typical_lenis_region = case_when(dur_c/dur_word <= typical_lenis_upper_bound ~ "inside", dur_c/dur_word > typical_lenis_upper_bound ~ "outside")) %>% group_by(variety, age, speaker_group_label, speaker, place_of_articulation, consonant_strength, target_word) %>% mutate (number_of_tokens_in_group = n()) %>% group_by(variety, age, speaker_group_label, speaker, place_of_articulation, consonant_strength, target_word, number_of_tokens_in_group, inside_typical_lenis_region) %>% summarise(absolute_frequency = n()) %>% mutate (relative_frequency = absolute_frequency / number_of_tokens_in_group) %>% select(-absolute_frequency) %>% spread(inside_typical_lenis_region, relative_frequency, fill = 0) %>% filter(consonant_strength == "Fortis") %>% mutate (speech_rate = "fast") %>% ungroup flo_per_word_all_rates_fortis_as_target = rbind(flo_per_word_normal_rate_fortis_as_target, flo_per_word_fast_rate_fortis_as_target) ``` ## Plot per place of articulation (Figure 4.5) ```{r} ggplot(flo_per_place_of_articulation_normal_rate_fortis_as_target) + aes(y = inside, x = place_of_articulation) + facet_grid(cols = vars(speaker_group_label)) + coord_cartesian(ylim = c(-0.1,1)) + ylab("Fortis–lenis overlap") + xlab("Place of articulation") + geom_jitter(aes(color = place_of_articulation)) + scale_color_discrete(name = "Place of articulation") + scale_y_continuous(breaks = seq(from = 0, to = 1, by = 0.2)) ``` ## Plots per word (Figure 4.6) ```{r} for (current_poa in c("alveolar")) { flo_per_word_normal_rate_fortis_as_target %>% filter(place_of_articulation == current_poa) %>% ggplot() + aes(y = inside, x = target_word) + facet_grid(cols = vars(speaker_group_label)) + coord_cartesian(ylim = c(-0.1,1)) + ylab("Fortis–lenis overlap") + xlab("Target word") + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + geom_jitter(aes(color = target_word)) + scale_color_discrete(name = "Target word") + scale_y_continuous(breaks = seq(from = 0, to = 1, by = 0.2)) -> plot print(plot) } ```