---
title: "Data, Plots & Models"
date: "June 25, 2022"
author: "Markus Jochim & Felicitas Kleber"
output:
  html_document:
    toc: true
    number_sections: true
    toc_float:
      collapsed: true
    df_print: paged
    code_folding: hide
---


```{r include=FALSE}
library(tidyverse)
library(emuR)

library(lmerTest)
library(emmeans)


df_sd = read_csv("results-20190626/df_sd-stops-real_words-normal_and_fast.e92a42b.csv")
df_wb = read_csv("results-20190626/df_wb-stops-real_words-normal_and_fast.0136fb4.csv")

df = rbind(df_sd, df_wb) %>%
  mutate (release_type = case_when(
    following_sound %in% c("6", "@", "i", "I") ~ "oral",
    following_sound %in% c("m", "n", "N") ~ "nasal",
    following_sound %in% c("_h", "_t", "h") & second_next_sound %in% c("6", "@", "i", "I") ~ "oral",
    following_sound %in% c("_h", "_t", "h") & second_next_sound %in% c("m", "n", "N") ~ "nasal",
    following_sound %in% c("_h") & second_next_sound %in% c("l") ~ "lateral"
  )) %>%
  mutate (legal = case_when(variety == "SD" ~ "legal",
                            variety == "WB" ~ recode(category,
                                                     "V:C" = "legal",
                                                     "VC:" = "legal",
                                                     "VC" = "illegal",
                                                     "V:C:" = "illegal"))) %>%
  mutate (speaker_group = paste0(age, "_", variety)) %>%
  mutate (speaker_group_label = factor(speaker_group,
                                       levels = c("Y_SD", "Y_WB", "O_WB"),
                                       labels = c("Standard German", "Dialect, younger", "Dialect, older"))) %>% 
  mutate (standard_dialect_continuum = case_when(
    age == "O" & variety == "WB" ~ 1,
    age == "Y" & variety == "WB" ~ 2,
    age == "Y" & variety == "SD" ~ 3
  )) %>%
  mutate (consonant_strength = recode(category,
                                      "V:C"  = "Lenis",
                                      "VC"   = "Lenis",
                                      "V:C:" = "Fortis",
                                      "VC:"  = "Fortis")) %>%
  mutate (vowel_length = recode(category,
                                "V:C"  = "Long",
                                "VC"   = "Short",
                                "V:C:" = "Long",
                                "VC:"  = "Short")) %>% 
  mutate (rate = recode(rate,
                        "normal" = "normal",
                        "schnell" = "fast")) %>% 
  mutate (rate = factor(rate,
                        ordered = T,
                        levels = c("normal", "fast")))

df_normal = df %>% 
  filter(rate == "normal")

df_fast = df %>% 
  filter(rate == "fast")
```

# Underlying data set

## Speaker groups

```{r}
df %>% count(variety, age, speaker) %>% count (variety, age)
```

## Tokens per speech rate

```{r}
df %>% count(rate)
```

## Tokens per target word

```{r}
df %>% count(word_group, target_word) %>% arrange(word_group, desc(n)) %>% select(-word_group)
```

## Matrix of target words

```{r}
df %>%
  count(word_group, category, target_word) %>%
  select (-n) %>% 
  spread(category, target_word)
```

## Types and tokens per quantity category

```{r}
df %>% count(category, target_word) %>% count(category)
df %>% count(category)
```

# Closure-norm (word-normalized stop closure duration)

## Plot (Figure 4.1)

```{r plot_normalized_closure_duration}

df %>%
  ggplot() +
  aes(x = category, y = dur_c / dur_word) +
  facet_grid(cols = vars(speaker_group_label),
             rows = vars(rate)) +
  geom_boxplot(aes(fill = legal)) +
  ylab(bquote("closure"["norm"])) +
  xlab("Quantity category") +
  scale_x_discrete(labels = c("V:C:" = "V:C:\ne.g. Bieter",
                              "V:C"  = "V:C\ne.g. wieder",
                              "VC"   = "VC\ne.g. Widder",
                              "VC:"  = "VC:\ne.g. bitter"
  )) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  scale_fill_manual(name = "Phonotactically",
                    values = c("red", "green"))
```


## Model: Normalized closure duration

```{r model_normalized_closure_duration}

emm_options(pbkrtest.limit = nrow(df))
emm_options(lmerTest.limit = nrow(df))

general_closure.lmer = df %>%
  mutate (speaker_group = factor(speaker_group,
                                 ordered = T,
                                 levels = c("Y_SD", "Y_WB", "O_WB"))) %>%
  lmer(data = .,
       dur_c / dur_word
       ~ speaker_group * category * rate +
         
         (category + rate | speaker) +
         (speaker_group + rate | target_word)
  )

general_closure.anova = anova(general_closure.lmer)

print(general_closure.anova)

```

### Pairwise comparisons for Section 4.3.1 (Closure duration)

```{r pairwise_comparisons_4_3_1}
emmeans(general_closure.lmer,
        pairwise ~ speaker_group | category,
        lmer.df = "satterthwaite")

emmeans(general_closure.lmer,
        pairwise ~ rate | speaker_group,
        lmer.df = "satterthwaite")
```

### Pairwise comparisons for Section 4.3.3 (Subsection: Lenition in fast speech)

```{r pairwise_comparisons_4_3_3}
emmeans(general_closure.lmer,
        pairwise ~ rate | category | speaker_group,
        lmer.df = "satterthwaite")
```

# VOT-norm (word-normalized voice onset time)

## Plot (Figure 4.2)

```{r plot_normalized_vot}

df %>%
  ggplot() +
  aes(x = category, y = dur_aspiration / dur_word) +
  facet_grid(cols = vars(speaker_group_label),
             rows = vars(rate)) +
  geom_boxplot(aes(fill = legal)) +
  ylab(bquote("VOT"["norm"])) +
  xlab("Quantity category") +
  scale_x_discrete(labels = c("V:C:" = "V:C:\ne.g. Bieter",
                              "V:C"  = "V:C\ne.g. wieder",
                              "VC"   = "VC\ne.g. Widder",
                              "VC:"  = "VC:\ne.g. bitter"
  )) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  scale_fill_manual(name = "Phonotactically",
                    values = c("red", "green"))

```

## Model

```{r model_normalized_vot}

emm_options(pbkrtest.limit = nrow(df))
emm_options(lmerTest.limit = nrow(df))

normalized_vot.lmer = df %>%
  mutate (speaker_group = factor(speaker_group,
                                 ordered = T,
                                 levels = c("Y_SD", "Y_WB", "O_WB"))) %>%
  lmer(data = .,
       dur_aspiration / dur_word
       ~ speaker_group * category * rate +
         
         (category + rate | speaker) +
         (speaker_group + rate | target_word)
  )

normalized_vot.anova = anova(normalized_vot.lmer)

print(normalized_vot.anova)
```

### Pairwise comparisons for Section 4.3.2 (Voice onset time)

```{r pairwise_comparisons_4_3_2}
emmeans(normalized_vot.lmer,
        pairwise ~ category | speaker_group,
        lmer.df = "satterthwaite")
```

# Optimal category boundary

## Calculation

```{r calculation_optimal_category_boundary}
possible_boundaries = seq(0, 280, by = 1)

# (Pre-allocation) Generate a data frame of the correct size with an empty
# column "classified_correctly". That column will be filled in the for loop
# below.
all_boundary_data_normal = tidyr::crossing(speaker = df$speaker, boundary = possible_boundaries) %>% 
  mutate(classified_correctly = NA)


for (current_row in rownames(all_boundary_data_normal)) {
  current_speaker = all_boundary_data_normal[current_row,]$speaker
  current_boundary = all_boundary_data_normal[current_row,]$boundary
  
  
  df %>%
    filter(rate == "normal") %>%
    filter(speaker == current_speaker) %>%
    
    mutate (classified_correctly = case_when((consonant_strength == "Fortis" & dur_c >= current_boundary) ~ "correct",
                                             (consonant_strength == "Fortis" & dur_c <  current_boundary) ~ "incorrect",
                                             (consonant_strength == "Lenis"  & dur_c >= current_boundary) ~ "incorrect",
                                             (consonant_strength == "Lenis"  & dur_c <  current_boundary) ~ "correct")) %>%
    
    count(classified_correctly) %>%
    spread(classified_correctly, n) %>%
    mutate(classified_correctly = correct / (correct + incorrect)) -> df_augmented
  
  all_boundary_data_normal[current_row, "classified_correctly"] = df_augmented$classified_correctly
}

best_boundary_data_normal = all_boundary_data_normal %>% 
  group_by(speaker) %>% 
  top_n(1, classified_correctly) %>%
  group_by(speaker) %>%
  summarise(min_boundary = min(boundary),
            max_boundary = max(boundary),
            boundary = mean(c(min(boundary),
                              max(boundary))),
            classified_correctly = first(classified_correctly)) %>%
  mutate(rate = "normal")


#### The same again for fast speech

# (Pre-allocation) Generate a data frame of the correct size with an empty
# column "classified_correctly". That column will be filled in the for loop
# below.
all_boundary_data_fast = tidyr::crossing(speaker = df$speaker, boundary = possible_boundaries) %>% 
  mutate(classified_correctly = NA)


for (current_row in rownames(all_boundary_data_fast)) {
  current_speaker = all_boundary_data_fast[current_row,]$speaker
  current_boundary = all_boundary_data_fast[current_row,]$boundary
  
  
  df %>%
    filter(rate == "fast") %>%
    filter(speaker == current_speaker) %>%
    
    mutate (classified_correctly = case_when((consonant_strength == "Fortis" & dur_c >= current_boundary) ~ "correct",
                                             (consonant_strength == "Fortis" & dur_c <  current_boundary) ~ "incorrect",
                                             (consonant_strength == "Lenis"  & dur_c >= current_boundary) ~ "incorrect",
                                             (consonant_strength == "Lenis"  & dur_c <  current_boundary) ~ "correct")) %>%
    
    count(classified_correctly) %>%
    spread(classified_correctly, n) %>%
    mutate(classified_correctly = correct / (correct + incorrect)) -> df_augmented
  
  all_boundary_data_fast[current_row, "classified_correctly"] = df_augmented$classified_correctly
}

best_boundary_data_fast = all_boundary_data_fast %>% 
  group_by(speaker) %>% 
  top_n(1, classified_correctly) %>%
  group_by(speaker) %>%
  summarise(min_boundary = min(boundary),
            max_boundary = max(boundary),
            boundary = mean(c(min(boundary),
                              max(boundary))),
            classified_correctly = first(classified_correctly)) %>% 
  mutate(rate = "fast")


best_boundary_data_all = rbind (best_boundary_data_normal,
                                best_boundary_data_fast)


```

## Plot (Figure 4.3)

```{r}
left_join(x = best_boundary_data_all,
          y = select(df, speaker, age, variety, speaker_group_label),
          by = "speaker") %>% 
  distinct %>%
  
  ggplot() +
  facet_wrap(vars(speaker_group_label)) +
  aes (x = reorder(rate, desc(rate)), y = boundary, group = speaker) +
  geom_point() +
  geom_line() +
  ylab("Optimal category boundary [ms]") +
  xlab("Speech rate")

```

# Category expansion

## Calculation

```{r calculation_category_expansion}
df_dispersion_difference_with_target_word = df %>%
  gather(key = "measure", value = "value", dur_c_norm_word) %>%
  group_by(measure, variety, age, speaker_group, speaker_group_label, category, legal, rate, speaker, target_word) %>%
  summarise(coefficient_of_variation = sd(value, na.rm = TRUE) / mean(value, na.rm = TRUE)) %>%
  ungroup() %>%
  spread(rate, coefficient_of_variation) %>%
  mutate(change_in_dispersion = fast - normal)

df_dispersion_difference_without_target_word = df %>%
  gather(key = "measure", value = "value", dur_c_norm_word) %>%
  group_by(measure, variety, age, speaker_group, speaker_group_label, category, legal, rate, speaker) %>%
  summarise(coefficient_of_variation = sd(value, na.rm = TRUE) / mean(value, na.rm = TRUE)) %>%
  ungroup() %>%
  spread(rate, coefficient_of_variation) %>%
  mutate(change_in_dispersion = fast - normal)
```

## Plot (Figure 4.4)

```{r plot_category_expansion}
df_dispersion_difference_without_target_word %>%
  filter(measure == "dur_c_norm_word") %>%
  
  mutate(speaker_group = factor(speaker_group,
                                levels = c("Y_SD", "Y_WB", "O_WB"),
                                labels = c("Standard German", "Dialect, younger", "Dialect, older"))) %>%
  ggplot() +
  aes(x = category, y = change_in_dispersion) +
  facet_grid(cols = vars(speaker_group),
             scales = "free_y") +
  geom_point() +
  geom_boxplot(alpha = 0.8,
               outlier.shape = NA,
               coef = 0) +
  ylab("Category expansion") +
  xlab("") +
  scale_x_discrete(labels = c("V:C:" = "V:C:\ne.g. Bieter",
                              "V:C"  = "V:C\ne.g. wieder",
                              "VC"   = "VC\ne.g. Widder",
                              "VC:"  = "VC:\ne.g. bitter"
                              )) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  scale_fill_manual(name = "Phonotactically",
                    values = c("red", "green")) +
  coord_cartesian(ylim = c(-0.15, 0.3)) +
  geom_hline (aes(yintercept = 0))
```

## Model

```{r model_category_expansion}

emm_options(pbkrtest.limit = nrow(df_dispersion_difference_with_target_word))
emm_options(lmerTest.limit = nrow(df_dispersion_difference_with_target_word))

dispersion_difference.lmer = df_dispersion_difference_with_target_word %>%
  filter(measure == "dur_c_norm_word") %>%
  
  mutate (speaker_group = factor(speaker_group,
                                 ordered = T,
                                 levels = c("Y_SD", "Y_WB", "O_WB"))) %>%
  lmer(data = .,
       change_in_dispersion
       ~ speaker_group * category +
         
         (category | speaker) +
         (speaker_group | target_word)
  )

dispersion_difference.anova = anova(dispersion_difference.lmer)

print(dispersion_difference.anova)
```

# Fortis–lenis overlap

## Calculation


```{r}
# Calculate the baseline of lenis tokens for fortis target words, normal rate
typical_lenis_upper_bound_normal = df_normal %>%
  filter(consonant_strength == "Lenis") %>%
  group_by(speaker, place_of_articulation) %>%
  summarise(typical_lenis_upper_bound = quantile(dur_c / dur_word, probs = 0.75))

# Calculate the baseline of lenis tokens for fortis target words, fast rate
typical_lenis_upper_bound_fast = df_fast %>%
  filter(consonant_strength == "Lenis") %>%
  group_by(speaker, place_of_articulation) %>%
  summarise(typical_lenis_upper_bound = quantile(dur_c / dur_word, probs = 0.75))

# Calculate the baseline of fortis tokens for lenis target words, normal rate
typical_fortis_lower_bound_normal = df_normal %>%
  filter(consonant_strength == "Fortis") %>%
  group_by(speaker, place_of_articulation) %>%
  summarise(typical_fortis_lower_bound = quantile(dur_c / dur_word, probs = 0.25))

# Calculate the baseline of fortis tokens for lenis target words, fast rate
typical_fortis_lower_bound_fast = df_fast %>%
  filter(consonant_strength == "Fortis") %>%
  group_by(speaker, place_of_articulation) %>%
  summarise(typical_fortis_lower_bound = quantile(dur_c / dur_word, probs = 0.25))
```

### Fortis as target, lenis as baseline

#### Per place of articulation

```{r}
flo_per_place_of_articulation_normal_rate_fortis_as_target =
  # FLO is short for fortis–lenis overlap
  left_join(df_normal,
            typical_lenis_upper_bound_normal,
            by = c("speaker", "place_of_articulation")) %>%
  
  mutate(inside_typical_lenis_region = case_when(dur_c/dur_word <= typical_lenis_upper_bound ~ "inside",
                                                 dur_c/dur_word >  typical_lenis_upper_bound ~ "outside")) %>% 
  
  group_by(variety,
           age,
           speaker_group_label,
           speaker,
           place_of_articulation,
           consonant_strength) %>%
  mutate (number_of_tokens_in_group = n()) %>%
  
  group_by(variety,
           age,
           speaker,
           speaker_group_label,
           place_of_articulation,
           consonant_strength,
           number_of_tokens_in_group,
           inside_typical_lenis_region) %>%
  
  summarise(absolute_frequency = n()) %>%
  mutate (relative_frequency = absolute_frequency / number_of_tokens_in_group) %>%
  select(-absolute_frequency) %>%
  
  spread(inside_typical_lenis_region, relative_frequency, fill = 0) %>% 
  
  filter(consonant_strength == "Fortis") %>%
  ungroup %>% 
  mutate (speech_rate = "normal")


flo_per_place_of_articulation_fast_rate_fortis_as_target =
  # FLO is short for fortis–lenis overlap
  left_join(df_fast,
            typical_lenis_upper_bound_fast,
            by = c("speaker", "place_of_articulation")) %>%
  
  mutate(inside_typical_lenis_region = case_when(dur_c/dur_word <= typical_lenis_upper_bound ~ "inside",
                                                 dur_c/dur_word >  typical_lenis_upper_bound ~ "outside")) %>% 
  
  group_by(variety,
           age,
           speaker_group_label,
           speaker,
           place_of_articulation,
           consonant_strength) %>%
  mutate (number_of_tokens_in_group = n()) %>%
  
  group_by(variety,
           age,
           speaker_group_label,
           speaker,
           place_of_articulation,
           consonant_strength,
           number_of_tokens_in_group,
           inside_typical_lenis_region) %>%
  
  summarise(absolute_frequency = n()) %>%
  mutate (relative_frequency = absolute_frequency / number_of_tokens_in_group) %>%
  select(-absolute_frequency) %>%
  
  spread(inside_typical_lenis_region, relative_frequency, fill = 0) %>%
  
  filter(consonant_strength == "Fortis") %>%
  ungroup %>% 
  mutate (speech_rate = "fast")


flo_per_place_of_articulation_all_rates_fortis_as_target = rbind(flo_per_place_of_articulation_normal_rate_fortis_as_target,
                                                                 flo_per_place_of_articulation_fast_rate_fortis_as_target)
```


#### Per word

```{r}
flo_per_word_normal_rate_fortis_as_target =
  # FLO is short for fortis–lenis overlap
  left_join(df_normal,
            typical_lenis_upper_bound_normal,
            by = c("speaker", "place_of_articulation")) %>%
  
  mutate(inside_typical_lenis_region = case_when(dur_c/dur_word <= typical_lenis_upper_bound ~ "inside",
                                                 dur_c/dur_word >  typical_lenis_upper_bound ~ "outside")) %>% 
  
  group_by(variety,
           age,
           speaker_group_label,
           speaker,
           place_of_articulation,
           consonant_strength,
           target_word) %>%
  mutate (number_of_tokens_in_group = n()) %>%
  
  group_by(variety,
           age,
           speaker_group_label,
           speaker,
           place_of_articulation,
           consonant_strength,
           target_word,
           number_of_tokens_in_group,
           inside_typical_lenis_region) %>%
  
  summarise(absolute_frequency = n()) %>%
  mutate (relative_frequency = absolute_frequency / number_of_tokens_in_group) %>%
  select(-absolute_frequency) %>%
  
  spread(inside_typical_lenis_region, relative_frequency, fill = 0) %>% 
  
  filter(consonant_strength == "Fortis") %>%
  mutate (speech_rate = "normal") %>%
  ungroup


flo_per_word_fast_rate_fortis_as_target =
  # FLO is short for fortis–lenis overlap
  left_join(df_fast,
            typical_lenis_upper_bound_fast,
            by = c("speaker", "place_of_articulation")) %>%
  
  mutate(inside_typical_lenis_region = case_when(dur_c/dur_word <= typical_lenis_upper_bound ~ "inside",
                                                 dur_c/dur_word >  typical_lenis_upper_bound ~ "outside")) %>% 
  
  group_by(variety,
           age,
           speaker_group_label,
           speaker,
           place_of_articulation,
           consonant_strength,
           target_word) %>%
  mutate (number_of_tokens_in_group = n()) %>%
  
  group_by(variety,
           age,
           speaker_group_label,
           speaker,
           place_of_articulation,
           consonant_strength,
           target_word,
           number_of_tokens_in_group,
           inside_typical_lenis_region) %>%
  
  summarise(absolute_frequency = n()) %>%
  mutate (relative_frequency = absolute_frequency / number_of_tokens_in_group) %>%
  select(-absolute_frequency) %>%
  
  spread(inside_typical_lenis_region, relative_frequency, fill = 0) %>%
  
  filter(consonant_strength == "Fortis") %>%
  mutate (speech_rate = "fast") %>%
  ungroup

flo_per_word_all_rates_fortis_as_target = rbind(flo_per_word_normal_rate_fortis_as_target,
                                                flo_per_word_fast_rate_fortis_as_target)
```

## Plot per place of articulation (Figure 4.5)

```{r}
ggplot(flo_per_place_of_articulation_normal_rate_fortis_as_target) +
  aes(y = inside, x = place_of_articulation) +
  facet_grid(cols = vars(speaker_group_label)) +
  coord_cartesian(ylim = c(-0.1,1)) +
  ylab("Fortis–lenis overlap") +
  xlab("Place of articulation") +
  geom_jitter(aes(color = place_of_articulation)) +
  scale_color_discrete(name = "Place of articulation") +
  scale_y_continuous(breaks = seq(from = 0, to = 1, by = 0.2))
```


## Plots per word (Figure 4.6)

```{r}
for (current_poa in c("alveolar")) {
  flo_per_word_normal_rate_fortis_as_target %>% 
    filter(place_of_articulation == current_poa) %>% 
    
    ggplot() +
    aes(y = inside, x = target_word) +
    facet_grid(cols = vars(speaker_group_label)) +
    coord_cartesian(ylim = c(-0.1,1)) +
    ylab("Fortis–lenis overlap") +
    xlab("Target word") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
    geom_jitter(aes(color = target_word)) +
    scale_color_discrete(name = "Target word") +
    scale_y_continuous(breaks = seq(from = 0, to = 1, by = 0.2)) -> plot
  
  print(plot)
}
```