1 Underlying data set

1.1 Speaker groups

# Number of speakers in each variety-by-age cell: reduce the token table to
# one row per speaker first, then count those rows.
df %>%
  distinct(variety, age, speaker) %>%
  count(variety, age)

1.2 Tokens per speech rate

# Number of tokens recorded at each speech rate.
count(df, rate)

1.3 Tokens per target word

# Token counts per target word, listed word group by word group with the most
# frequent word first; the grouping column is only used for sorting and is
# dropped from the printed table.
count(df, word_group, target_word) %>%
  arrange(word_group, desc(n)) %>%
  select(-word_group)

1.4 Matrix of target words

# Matrix of target words: one row per word group, one column per quantity
# category, each cell holding the target word for that combination.
count(df, word_group, category, target_word) %>%
  # The counts are not needed, only the combinations that occur.
  select(-n) %>%
  spread(key = category, value = target_word)

1.5 Types and tokens per quantity category

# Types: number of distinct target words per quantity category.
df %>%
  distinct(category, target_word) %>%
  count(category)

# Tokens: number of observations per quantity category.
count(df, category)

2 Closure-norm (word-normalized stop closure duration)

2.1 Plot (Figure 4.1)

# Figure 4.1: box plots of word-normalized closure duration
# (dur_c / dur_word) per quantity category, with one panel per speaker group
# (columns) and speech rate (rows). Boxes are filled by the `legal` column.
ggplot(df, aes(x = category, y = dur_c / dur_word)) +
  geom_boxplot(aes(fill = legal)) +
  facet_grid(rows = vars(rate),
             cols = vars(speaker_group_label)) +
  # Add an example word below each category label on the x axis.
  scale_x_discrete(labels = c("V:C:" = "V:C:\ne.g. Bieter",
                              "V:C"  = "V:C\ne.g. wieder",
                              "VC"   = "VC\ne.g. Widder",
                              "VC:"  = "VC:\ne.g. bitter")) +
  scale_fill_manual(name = "Phonotactically",
                    values = c("red", "green")) +
  labs(x = "Quantity category",
       y = bquote("closure"["norm"])) +
  # Rotate the two-line x labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

2.2 Model: Normalized closure duration

# Raise the emmeans observation limits for the Kenward-Roger (pbkrtest) and
# Satterthwaite (lmerTest) degrees-of-freedom methods to the size of the data,
# so that the `lmer.df = "satterthwaite"` requests below are honoured.
emm_options(pbkrtest.limit = nrow(df),
            lmerTest.limit = nrow(df))

# Linear mixed model of word-normalized closure duration (dur_c / dur_word).
# Fixed effects: speaker group x quantity category x speech rate (full
# factorial). Random effects: by-speaker intercepts and slopes for category
# and rate; by-word intercepts and slopes for speaker group and rate.
# NOTE(review): speaker_group is an ordered factor, for which R's default
# contrasts are polynomial -- confirm this coding is intended.
# NOTE(review): the rendered output reports a singular fit for this model;
# the random-effects structure may be richer than the data support.
general_closure.lmer <- df %>%
  mutate(speaker_group = factor(speaker_group,
                                levels = c("Y_SD", "Y_WB", "O_WB"),
                                ordered = TRUE)) %>%
  lmer(dur_c / dur_word ~
         speaker_group * category * rate +
         (category + rate | speaker) +
         (speaker_group + rate | target_word),
       data = .)
## boundary (singular) fit: see help('isSingular')
# ANOVA table for the fixed effects of the closure model; stored so it can be
# reused, then printed.
general_closure.anova <- anova(general_closure.lmer)

print(general_closure.anova)
## Type III Analysis of Variance Table with Satterthwaite's method
##                               Sum Sq  Mean Sq NumDF  DenDF F value    Pr(>F)
## speaker_group               0.043382 0.021691     2   42.1 12.2464 6.433e-05
## category                    0.116567 0.038856     3   22.9 21.9373 6.272e-07
## rate                        0.008194 0.008194     1   36.8  4.6263  0.038116
## speaker_group:category      0.048138 0.008023     6   32.3  4.5297  0.001943
## speaker_group:rate          0.023508 0.011754     2  110.5  6.6361  0.001898
## category:rate               0.008341 0.002780     3   36.4  1.5697  0.213366
## speaker_group:category:rate 0.011324 0.001887     6 7117.4  1.0656  0.380713
##                                
## speaker_group               ***
## category                    ***
## rate                        *  
## speaker_group:category      ** 
## speaker_group:rate          ** 
## category:rate                  
## speaker_group:category:rate    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

2.2.1 Pairwise comparisons for Section 4.3.1 (Closure duration)

# Estimated marginal means of normalized closure duration per speaker group
# and all pairwise speaker-group contrasts, computed separately within each
# quantity category (averaged over rate), with Satterthwaite df.
emmeans(object = general_closure.lmer,
        specs = pairwise ~ speaker_group | category,
        lmer.df = "satterthwaite")
## NOTE: Results may be misleading due to involvement in interactions
## $emmeans
## category = V:C:
##  speaker_group emmean     SE   df lower.CL upper.CL
##  Y_SD           0.140 0.0180 24.4   0.1023    0.177
##  Y_WB           0.114 0.0188 24.1   0.0752    0.153
##  O_WB           0.146 0.0198 23.8   0.1052    0.187
## 
## category = V:C::
##  speaker_group emmean     SE   df lower.CL upper.CL
##  Y_SD           0.210 0.0209 31.1   0.1671    0.252
##  Y_WB           0.255 0.0217 30.3   0.2112    0.300
##  O_WB           0.265 0.0227 29.3   0.2185    0.311
## 
## category = VC:
##  speaker_group emmean     SE   df lower.CL upper.CL
##  Y_SD           0.148 0.0246 27.2   0.0972    0.198
##  Y_WB           0.156 0.0255 26.7   0.1034    0.208
##  O_WB           0.248 0.0268 26.1   0.1927    0.303
## 
## category = VC::
##  speaker_group emmean     SE   df lower.CL upper.CL
##  Y_SD           0.271 0.0189 35.2   0.2325    0.309
##  Y_WB           0.335 0.0196 34.2   0.2952    0.375
##  O_WB           0.342 0.0204 32.9   0.3004    0.383
## 
## Results are averaged over the levels of: rate 
## Degrees-of-freedom method: satterthwaite 
## Confidence level used: 0.95 
## 
## $contrasts
## category = V:C:
##  contrast    estimate     SE   df t.ratio p.value
##  Y_SD - Y_WB  0.02562 0.0164 29.8   1.561  0.2778
##  Y_SD - O_WB -0.00649 0.0181 28.0  -0.357  0.9322
##  Y_WB - O_WB -0.03211 0.0113 39.9  -2.836  0.0191
## 
## category = V:C::
##  contrast    estimate     SE   df t.ratio p.value
##  Y_SD - Y_WB -0.04563 0.0207 42.6  -2.200  0.0827
##  Y_SD - O_WB -0.05506 0.0224 39.7  -2.461  0.0471
##  Y_WB - O_WB -0.00942 0.0163 44.3  -0.578  0.8323
## 
## category = VC:
##  contrast    estimate     SE   df t.ratio p.value
##  Y_SD - Y_WB -0.00820 0.0233 36.3  -0.352  0.9342
##  Y_SD - O_WB -0.10019 0.0255 33.5  -3.933  0.0011
##  Y_WB - O_WB -0.09199 0.0172 45.1  -5.345  <.0001
## 
## category = VC::
##  contrast    estimate     SE   df t.ratio p.value
##  Y_SD - Y_WB -0.06393 0.0196 46.2  -3.265  0.0057
##  Y_SD - O_WB -0.07098 0.0209 44.4  -3.399  0.0040
##  Y_WB - O_WB -0.00705 0.0161 41.4  -0.437  0.9003
## 
## Results are averaged over the levels of: rate 
## Degrees-of-freedom method: satterthwaite 
## P value adjustment: tukey method for comparing a family of 3 estimates
# Normal-vs-fast contrast of normalized closure duration within each speaker
# group (averaged over quantity category), with Satterthwaite df.
emmeans(object = general_closure.lmer,
        specs = pairwise ~ rate | speaker_group,
        lmer.df = "satterthwaite")
## NOTE: Results may be misleading due to involvement in interactions
## $emmeans
## speaker_group = Y_SD:
##  rate   emmean     SE   df lower.CL upper.CL
##  normal  0.188 0.0108 36.3    0.166    0.210
##  fast    0.196 0.0117 36.3    0.172    0.220
## 
## speaker_group = Y_WB:
##  rate   emmean     SE   df lower.CL upper.CL
##  normal  0.216 0.0113 34.8    0.193    0.239
##  fast    0.214 0.0119 35.7    0.190    0.238
## 
## speaker_group = O_WB:
##  rate   emmean     SE   df lower.CL upper.CL
##  normal  0.249 0.0119 33.4    0.225    0.273
##  fast    0.251 0.0123 34.5    0.226    0.276
## 
## Results are averaged over the levels of: category 
## Degrees-of-freedom method: satterthwaite 
## Confidence level used: 0.95 
## 
## $contrasts
## speaker_group = Y_SD:
##  contrast      estimate      SE   df t.ratio p.value
##  normal - fast -0.00777 0.00202 91.2  -3.850  0.0002
## 
## speaker_group = Y_WB:
##  contrast      estimate      SE   df t.ratio p.value
##  normal - fast  0.00191 0.00203 92.5   0.945  0.3469
## 
## speaker_group = O_WB:
##  contrast      estimate      SE   df t.ratio p.value
##  normal - fast -0.00263 0.00203 92.5  -1.299  0.1971
## 
## Results are averaged over the levels of: category 
## Degrees-of-freedom method: satterthwaite

2.2.2 Pairwise comparisons for Section 4.3.3 (Subsection: Lenition in fast speech)

# Normal-vs-fast contrast of normalized closure duration within every
# combination of quantity category and speaker group, with Satterthwaite df.
emmeans(object = general_closure.lmer,
        specs = pairwise ~ rate | category | speaker_group,
        lmer.df = "satterthwaite")
## $emmeans
## category = V:C, speaker_group = Y_SD:
##  rate   emmean     SE   df lower.CL upper.CL
##  normal  0.135 0.0175 24.9   0.0988    0.171
##  fast    0.144 0.0187 24.3   0.1056    0.183
## 
## category = V:C:, speaker_group = Y_SD:
##  rate   emmean     SE   df lower.CL upper.CL
##  normal  0.207 0.0203 31.7   0.1657    0.248
##  fast    0.213 0.0217 30.9   0.1683    0.257
## 
## category = VC, speaker_group = Y_SD:
##  rate   emmean     SE   df lower.CL upper.CL
##  normal  0.145 0.0238 27.6   0.0957    0.193
##  fast    0.151 0.0256 27.3   0.0983    0.203
## 
## category = VC:, speaker_group = Y_SD:
##  rate   emmean     SE   df lower.CL upper.CL
##  normal  0.266 0.0184 35.8   0.2285    0.303
##  fast    0.276 0.0196 35.0   0.2362    0.316
## 
## category = V:C, speaker_group = Y_WB:
##  rate   emmean     SE   df lower.CL upper.CL
##  normal  0.112 0.0185 24.5   0.0742    0.151
##  fast    0.115 0.0192 24.2   0.0758    0.155
## 
## category = V:C:, speaker_group = Y_WB:
##  rate   emmean     SE   df lower.CL upper.CL
##  normal  0.256 0.0213 30.5   0.2120    0.299
##  fast    0.255 0.0222 30.4   0.2101    0.301
## 
## category = VC, speaker_group = Y_WB:
##  rate   emmean     SE   df lower.CL upper.CL
##  normal  0.159 0.0251 26.9   0.1076    0.211
##  fast    0.153 0.0262 27.0   0.0988    0.206
## 
## category = VC:, speaker_group = Y_WB:
##  rate   emmean     SE   df lower.CL upper.CL
##  normal  0.337 0.0192 34.4   0.2978    0.376
##  fast    0.333 0.0200 34.4   0.2923    0.374
## 
## category = V:C, speaker_group = O_WB:
##  rate   emmean     SE   df lower.CL upper.CL
##  normal  0.143 0.0196 24.1   0.1020    0.183
##  fast    0.149 0.0201 23.9   0.1081    0.191
## 
## category = V:C:, speaker_group = O_WB:
##  rate   emmean     SE   df lower.CL upper.CL
##  normal  0.261 0.0225 29.4   0.2150    0.307
##  fast    0.269 0.0230 29.6   0.2217    0.316
## 
## category = VC, speaker_group = O_WB:
##  rate   emmean     SE   df lower.CL upper.CL
##  normal  0.249 0.0266 26.1   0.1942    0.303
##  fast    0.247 0.0273 26.4   0.1908    0.303
## 
## category = VC:, speaker_group = O_WB:
##  rate   emmean     SE   df lower.CL upper.CL
##  normal  0.343 0.0202 32.9   0.3020    0.384
##  fast    0.341 0.0207 33.3   0.2986    0.383
## 
## Degrees-of-freedom method: satterthwaite 
## Confidence level used: 0.95 
## 
## $contrasts
## category = V:C, speaker_group = Y_SD:
##  contrast       estimate      SE  df t.ratio p.value
##  normal - fast -0.009278 0.00356 180  -2.604  0.0100
## 
## category = V:C:, speaker_group = Y_SD:
##  contrast       estimate      SE  df t.ratio p.value
##  normal - fast -0.005457 0.00383 183  -1.424  0.1560
## 
## category = VC, speaker_group = Y_SD:
##  contrast       estimate      SE  df t.ratio p.value
##  normal - fast -0.006156 0.00463 184  -1.329  0.1855
## 
## category = VC:, speaker_group = Y_SD:
##  contrast       estimate      SE  df t.ratio p.value
##  normal - fast -0.010189 0.00332 171  -3.071  0.0025
## 
## category = V:C, speaker_group = Y_WB:
##  contrast       estimate      SE  df t.ratio p.value
##  normal - fast -0.003083 0.00364 196  -0.848  0.3977
## 
## category = V:C:, speaker_group = Y_WB:
##  contrast       estimate      SE  df t.ratio p.value
##  normal - fast  0.000174 0.00383 182   0.045  0.9638
## 
## category = VC, speaker_group = Y_WB:
##  contrast       estimate      SE  df t.ratio p.value
##  normal - fast  0.006659 0.00463 184   1.438  0.1522
## 
## category = VC:, speaker_group = Y_WB:
##  contrast       estimate      SE  df t.ratio p.value
##  normal - fast  0.003909 0.00331 170   1.179  0.2398
## 
## category = V:C, speaker_group = O_WB:
##  contrast       estimate      SE  df t.ratio p.value
##  normal - fast -0.006968 0.00359 187  -1.939  0.0541
## 
## category = V:C:, speaker_group = O_WB:
##  contrast       estimate      SE  df t.ratio p.value
##  normal - fast -0.007850 0.00383 184  -2.047  0.0420
## 
## category = VC, speaker_group = O_WB:
##  contrast       estimate      SE  df t.ratio p.value
##  normal - fast  0.001968 0.00465 187   0.423  0.6727
## 
## category = VC:, speaker_group = O_WB:
##  contrast       estimate      SE  df t.ratio p.value
##  normal - fast  0.002322 0.00333 173   0.698  0.4862
## 
## Degrees-of-freedom method: satterthwaite

3 VOT-norm (word-normalized voice onset time)

3.1 Plot (Figure 4.2)

# Figure 4.2: box plots of word-normalized voice onset time
# (dur_aspiration / dur_word) per quantity category, with one panel per
# speaker group (columns) and speech rate (rows). Boxes are filled by the
# `legal` column.
ggplot(df, aes(x = category, y = dur_aspiration / dur_word)) +
  geom_boxplot(aes(fill = legal)) +
  facet_grid(rows = vars(rate),
             cols = vars(speaker_group_label)) +
  # Add an example word below each category label on the x axis.
  scale_x_discrete(labels = c("V:C:" = "V:C:\ne.g. Bieter",
                              "V:C"  = "V:C\ne.g. wieder",
                              "VC"   = "VC\ne.g. Widder",
                              "VC:"  = "VC:\ne.g. bitter")) +
  scale_fill_manual(name = "Phonotactically",
                    values = c("red", "green")) +
  labs(x = "Quantity category",
       y = bquote("VOT"["norm"])) +
  # Rotate the two-line x labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

3.2 Model

# Raise the emmeans observation limits for the Kenward-Roger (pbkrtest) and
# Satterthwaite (lmerTest) degrees-of-freedom methods to the size of the data,
# so that the `lmer.df = "satterthwaite"` request below is honoured.
emm_options(pbkrtest.limit = nrow(df),
            lmerTest.limit = nrow(df))

# Linear mixed model of word-normalized voice onset time
# (dur_aspiration / dur_word), with the same fixed- and random-effects
# structure as the closure model: speaker group x category x rate, by-speaker
# slopes for category and rate, by-word slopes for speaker group and rate.
# NOTE(review): speaker_group is an ordered factor, for which R's default
# contrasts are polynomial -- confirm this coding is intended.
# NOTE(review): the rendered output reports a convergence warning for this
# fit (max|grad| = 0.014 against tol = 0.002); consider checking the fit with
# another optimizer or a simpler random-effects structure.
normalized_vot.lmer <- df %>%
  mutate(speaker_group = factor(speaker_group,
                                levels = c("Y_SD", "Y_WB", "O_WB"),
                                ordered = TRUE)) %>%
  lmer(dur_aspiration / dur_word ~
         speaker_group * category * rate +
         (category + rate | speaker) +
         (speaker_group + rate | target_word),
       data = .)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.014128 (tol = 0.002, component 1)
# ANOVA table for the fixed effects of the VOT model; stored so it can be
# reused, then printed.
normalized_vot.anova <- anova(normalized_vot.lmer)

print(normalized_vot.anova)
## Type III Analysis of Variance Table with Satterthwaite's method
##                               Sum Sq   Mean Sq NumDF  DenDF F value    Pr(>F)
## speaker_group               0.001735 0.0008674     2   32.6  0.7842   0.46488
## category                    0.035573 0.0118577     3   26.5 10.7204 8.644e-05
## rate                        0.000610 0.0006097     1  106.8  0.5513   0.45943
## speaker_group:category      0.046282 0.0077137     6   34.6  6.9739 6.417e-05
## speaker_group:rate          0.004183 0.0020914     2  192.2  1.8908   0.15374
## category:rate               0.008637 0.0028789     3  197.2  2.6028   0.05318
## speaker_group:category:rate 0.004970 0.0008283     6 7117.0  0.7489   0.61026
##                                
## speaker_group                  
## category                    ***
## rate                           
## speaker_group:category      ***
## speaker_group:rate             
## category:rate               .  
## speaker_group:category:rate    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

3.2.1 Pairwise comparisons for Section 4.3.2 (Voice onset time)

# Estimated marginal means of normalized VOT per quantity category and all
# pairwise category contrasts, computed separately within each speaker group
# (averaged over rate), with Satterthwaite df.
emmeans(object = normalized_vot.lmer,
        specs = pairwise ~ category | speaker_group,
        lmer.df = "satterthwaite")
## NOTE: Results may be misleading due to involvement in interactions
## $emmeans
## speaker_group = Y_SD:
##  category emmean      SE   df lower.CL upper.CL
##  V:C      0.0319 0.00780 45.4   0.0162   0.0476
##  V:C:     0.0966 0.00946 44.9   0.0775   0.1156
##  VC       0.0424 0.00998 45.0   0.0224   0.0625
##  VC:      0.0925 0.00805 45.2   0.0763   0.1088
## 
## speaker_group = Y_WB:
##  category emmean      SE   df lower.CL upper.CL
##  V:C      0.0358 0.00755 45.5   0.0206   0.0510
##  V:C:     0.0637 0.00921 43.7   0.0451   0.0822
##  VC       0.0542 0.00961 45.2   0.0348   0.0735
##  VC:      0.0666 0.00783 44.1   0.0508   0.0824
## 
## speaker_group = O_WB:
##  category emmean      SE   df lower.CL upper.CL
##  V:C      0.0352 0.00956 40.1   0.0159   0.0546
##  V:C:     0.0585 0.01119 44.6   0.0359   0.0810
##  VC       0.0761 0.01237 38.5   0.0510   0.1011
##  VC:      0.0702 0.00957 44.1   0.0509   0.0895
## 
## Results are averaged over the levels of: rate 
## Degrees-of-freedom method: satterthwaite 
## Confidence level used: 0.95 
## 
## $contrasts
## speaker_group = Y_SD:
##  contrast   estimate      SE   df t.ratio p.value
##  V:C - V:C: -0.06469 0.00987 40.0  -6.554  <.0001
##  V:C - VC   -0.01058 0.01054 36.2  -1.003  0.7484
##  V:C - VC:  -0.06067 0.00932 40.9  -6.513  <.0001
##  V:C: - VC   0.05411 0.01107 37.6   4.889  0.0001
##  V:C: - VC:  0.00402 0.00795 24.7   0.506  0.9570
##  VC - VC:   -0.05009 0.01007 34.4  -4.972  0.0001
## 
## speaker_group = Y_WB:
##  contrast   estimate      SE   df t.ratio p.value
##  V:C - V:C: -0.02787 0.00942 41.6  -2.960  0.0251
##  V:C - VC   -0.01840 0.01000 37.9  -1.841  0.2707
##  V:C - VC:  -0.03081 0.00890 42.4  -3.461  0.0066
##  V:C: - VC   0.00947 0.01051 39.3   0.901  0.8045
##  V:C: - VC: -0.00294 0.00740 25.2  -0.397  0.9784
##  VC - VC:   -0.01240 0.00952 36.0  -1.303  0.5673
## 
## speaker_group = O_WB:
##  contrast   estimate      SE   df t.ratio p.value
##  V:C - V:C: -0.02325 0.01279 32.2  -1.818  0.2837
##  V:C - VC   -0.04085 0.01397 29.3  -2.924  0.0317
##  V:C - VC:  -0.03500 0.01200 33.0  -2.916  0.0306
##  V:C: - VC  -0.01760 0.01454 30.3  -1.210  0.6256
##  V:C: - VC: -0.01175 0.01120 22.8  -1.049  0.7231
##  VC - VC:    0.00585 0.01348 28.1   0.434  0.9721
## 
## Results are averaged over the levels of: rate 
## Degrees-of-freedom method: satterthwaite 
## P value adjustment: tukey method for comparing a family of 4 estimates

4 Optimal category boundary

4.1 Calculation

# Candidate fortis/lenis boundaries on closure duration (dur_c), in steps of 1.
possible_boundaries <- seq(0, 280, by = 1)

# Share of tokens that a boundary classifies correctly: a fortis token counts
# as correct when dur_c >= boundary, a lenis token when dur_c < boundary.
# Tokens with a missing duration or any other consonant strength are ignored.
# Computing the share as a mean keeps it well defined when every token is
# classified correctly (or every token incorrectly). Returns NA when there is
# nothing to classify.
proportion_classified_correctly <- function(consonant_strength, dur_c, boundary) {
  at_or_above <- dur_c >= boundary
  correct <- ifelse(consonant_strength == "Fortis",
                    at_or_above,
                    ifelse(consonant_strength == "Lenis", !at_or_above, NA))
  if (all(is.na(correct))) {
    return(NA_real_)
  }
  mean(correct, na.rm = TRUE)
}

# Classification accuracy for every speaker x boundary combination at one
# speech rate. Returns one row per combination with the columns speaker,
# boundary and classified_correctly.
classification_by_boundary <- function(data, speech_rate, boundaries) {
  tokens <- filter(data, rate == speech_rate)
  # Split once per speaker instead of re-filtering the data for every boundary.
  tokens_by_speaker <- split(tokens, as.character(tokens$speaker))

  grid <- tidyr::crossing(speaker = data$speaker, boundary = boundaries)
  speaker_keys <- as.character(grid$speaker)

  grid$classified_correctly <- vapply(
    seq_len(nrow(grid)),
    function(i) {
      speaker_tokens <- tokens_by_speaker[[speaker_keys[i]]]
      proportion_classified_correctly(speaker_tokens$consonant_strength,
                                      speaker_tokens$dur_c,
                                      grid$boundary[i])
    },
    numeric(1)
  )
  grid
}

# Per speaker, keep the boundaries with the highest accuracy (ties included)
# and report their minimum, their maximum and the midpoint of the two as the
# optimal boundary.
# NOTE(review): if the tied boundaries are not contiguous, the midpoint is not
# guaranteed to reach the best accuracy itself.
best_boundary_per_speaker <- function(all_boundary_data, speech_rate) {
  all_boundary_data %>%
    group_by(speaker) %>%
    slice_max(classified_correctly, n = 1, with_ties = TRUE) %>%
    summarise(min_boundary = min(boundary),
              max_boundary = max(boundary),
              boundary = mean(c(min(boundary),
                                max(boundary))),
              classified_correctly = first(classified_correctly)) %>%
    mutate(rate = speech_rate)
}

all_boundary_data_normal <- classification_by_boundary(df, "normal", possible_boundaries)
best_boundary_data_normal <- best_boundary_per_speaker(all_boundary_data_normal, "normal")

#### The same again for fast speech

all_boundary_data_fast <- classification_by_boundary(df, "fast", possible_boundaries)
best_boundary_data_fast <- best_boundary_per_speaker(all_boundary_data_fast, "fast")


best_boundary_data_all <- rbind(best_boundary_data_normal,
                                best_boundary_data_fast)

4.2 Plot (Figure 4.3)

# Figure 4.3: optimal category boundary per speaker at normal and fast rate,
# one panel per speaker group, with a line connecting each speaker's two
# values.
best_boundary_data_all %>%
  # Attach speaker metadata. The token table is reduced to its distinct
  # metadata rows before joining, so the join does not multiply every boundary
  # row by the number of tokens of that speaker.
  left_join(df %>%
              select(speaker, age, variety, speaker_group_label) %>%
              distinct(),
            by = "speaker") %>%
  
  ggplot() +
  facet_wrap(vars(speaker_group_label)) +
  # reorder(..., desc(rate)) puts "normal" before "fast" on the x axis.
  aes(x = reorder(rate, desc(rate)), y = boundary, group = speaker) +
  geom_point() +
  geom_line() +
  ylab("Optimal category boundary [ms]") +
  xlab("Speech rate")

5 Category expansion

5.1 Calculation

# Category expansion per speaker and target word: coefficient of variation
# (sd / mean) of dur_c_norm_word at each speech rate, and its change from
# normal to fast rate (fast - normal).
df_dispersion_difference_with_target_word <- df %>%
  # Long format with the measure name in `measure` and its value in `value`.
  gather(key = "measure", value = "value", dur_c_norm_word) %>%
  group_by(measure,
           variety,
           age,
           speaker_group,
           speaker_group_label,
           category,
           legal,
           rate,
           speaker,
           target_word) %>%
  summarise(
    coefficient_of_variation = sd(value, na.rm = TRUE) / mean(value, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  # One column per speech rate so the two rates can be subtracted.
  spread(key = rate, value = coefficient_of_variation) %>%
  mutate(change_in_dispersion = fast - normal)
## `summarise()` has grouped output by 'measure', 'variety', 'age',
## 'speaker_group', 'speaker_group_label', 'category', 'legal', 'rate', 'speaker'.
## You can override using the `.groups` argument.
# Category expansion per speaker, pooled over target words: coefficient of
# variation (sd / mean) of dur_c_norm_word at each speech rate, and its change
# from normal to fast rate (fast - normal).
df_dispersion_difference_without_target_word <- df %>%
  # Long format with the measure name in `measure` and its value in `value`.
  gather(key = "measure", value = "value", dur_c_norm_word) %>%
  group_by(measure,
           variety,
           age,
           speaker_group,
           speaker_group_label,
           category,
           legal,
           rate,
           speaker) %>%
  summarise(
    coefficient_of_variation = sd(value, na.rm = TRUE) / mean(value, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  # One column per speech rate so the two rates can be subtracted.
  spread(key = rate, value = coefficient_of_variation) %>%
  mutate(change_in_dispersion = fast - normal)
## `summarise()` has grouped output by 'measure', 'variety', 'age',
## 'speaker_group', 'speaker_group_label', 'category', 'legal', 'rate'. You can
## override using the `.groups` argument.

5.2 Plot (Figure 4.4)

# Figure 4.4: change in dispersion (fast - normal coefficient of variation)
# per quantity category, one panel per speaker group. Individual values are
# drawn as points underneath semi-transparent boxes.
df_dispersion_difference_without_target_word %>%
  filter(measure == "dur_c_norm_word") %>%
  
  # Replace the group codes by readable panel titles.
  mutate(speaker_group = factor(speaker_group,
                                levels = c("Y_SD", "Y_WB", "O_WB"),
                                labels = c("Standard German", "Dialect, younger", "Dialect, older"))) %>%
  ggplot() +
  aes(x = category, y = change_in_dispersion) +
  facet_grid(cols = vars(speaker_group)) +
  geom_point() +
  # coef = 0 collapses the whiskers and outlier.shape = NA hides the outlier
  # markers, leaving only the boxes on top of the raw points.
  geom_boxplot(alpha = 0.8,
               outlier.shape = NA,
               coef = 0) +
  ylab("Category expansion") +
  xlab("") +
  scale_x_discrete(labels = c("V:C:" = "V:C:\ne.g. Bieter",
                              "V:C"  = "V:C\ne.g. wieder",
                              "VC"   = "VC\ne.g. Widder",
                              "VC:"  = "VC:\ne.g. bitter"
                              )) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  # Zoom the y axis without dropping the values outside the visible range.
  coord_cartesian(ylim = c(-0.15, 0.3)) +
  # Reference line: no change in dispersion between the two rates.
  geom_hline(yintercept = 0)

5.3 Model

# Raise the emmeans observation limits for the Kenward-Roger (pbkrtest) and
# Satterthwaite (lmerTest) degrees-of-freedom methods to the size of the
# per-word dispersion table.
emm_options(pbkrtest.limit = nrow(df_dispersion_difference_with_target_word),
            lmerTest.limit = nrow(df_dispersion_difference_with_target_word))

# Linear mixed model of the change in dispersion (fast - normal coefficient of
# variation). Fixed effects: speaker group x quantity category. Random
# effects: by-speaker intercepts and category slopes; by-word intercepts and
# speaker-group slopes.
# NOTE(review): speaker_group is an ordered factor, for which R's default
# contrasts are polynomial -- confirm this coding is intended.
# NOTE(review): the rendered output reports a singular fit for this model;
# the random-effects structure may be richer than the data support.
dispersion_difference.lmer <- df_dispersion_difference_with_target_word %>%
  filter(measure == "dur_c_norm_word") %>%
  mutate(speaker_group = factor(speaker_group,
                                levels = c("Y_SD", "Y_WB", "O_WB"),
                                ordered = TRUE)) %>%
  lmer(change_in_dispersion ~
         speaker_group * category +
         (category | speaker) +
         (speaker_group | target_word),
       data = .)
## boundary (singular) fit: see help('isSingular')
# ANOVA table for the fixed effects of the dispersion model; stored so it can
# be reused, then printed.
dispersion_difference.anova <- anova(dispersion_difference.lmer)

print(dispersion_difference.anova)
## Type III Analysis of Variance Table with Satterthwaite's method
##                          Sum Sq   Mean Sq NumDF  DenDF F value Pr(>F)
## speaker_group          0.032333 0.0161667     2 21.801  1.8137 0.1868
## category               0.003040 0.0010133     3 23.487  0.1137 0.9512
## speaker_group:category 0.078655 0.0131092     6 25.183  1.4707 0.2283

6 Fortis–lenis overlap

6.1 Calculation

# Lenis baseline for fortis target words, normal rate: the 75th percentile of
# word-normalized closure duration (dur_c / dur_word) among lenis tokens, per
# speaker and place of articulation.
typical_lenis_upper_bound_normal <-
  filter(df_normal, consonant_strength == "Lenis") %>%
  group_by(speaker, place_of_articulation) %>%
  summarise(
    typical_lenis_upper_bound = quantile(dur_c / dur_word, probs = 0.75)
  )
## `summarise()` has grouped output by 'speaker'. You can override using the
## `.groups` argument.
# Lenis baseline for fortis target words, fast rate: the 75th percentile of
# word-normalized closure duration (dur_c / dur_word) among lenis tokens, per
# speaker and place of articulation.
typical_lenis_upper_bound_fast <-
  filter(df_fast, consonant_strength == "Lenis") %>%
  group_by(speaker, place_of_articulation) %>%
  summarise(
    typical_lenis_upper_bound = quantile(dur_c / dur_word, probs = 0.75)
  )
## `summarise()` has grouped output by 'speaker'. You can override using the
## `.groups` argument.
# Fortis baseline for lenis target words, normal rate: the 25th percentile of
# word-normalized closure duration (dur_c / dur_word) among fortis tokens, per
# speaker and place of articulation.
typical_fortis_lower_bound_normal <-
  filter(df_normal, consonant_strength == "Fortis") %>%
  group_by(speaker, place_of_articulation) %>%
  summarise(
    typical_fortis_lower_bound = quantile(dur_c / dur_word, probs = 0.25)
  )
## `summarise()` has grouped output by 'speaker'. You can override using the
## `.groups` argument.
# Fortis baseline for lenis target words, fast rate: the 25th percentile of
# word-normalized closure duration (dur_c / dur_word) among fortis tokens, per
# speaker and place of articulation.
typical_fortis_lower_bound_fast <-
  filter(df_fast, consonant_strength == "Fortis") %>%
  group_by(speaker, place_of_articulation) %>%
  summarise(
    typical_fortis_lower_bound = quantile(dur_c / dur_word, probs = 0.25)
  )
## `summarise()` has grouped output by 'speaker'. You can override using the
## `.groups` argument.

6.1.1 Fortis as target, lenis as baseline

6.1.1.1 Per place of articulation

# Fortis-lenis overlap (FLO) at normal rate, per speaker and place of
# articulation. The result keeps the fortis rows only; its `inside` column is
# the share of a speaker's fortis tokens whose normalized closure duration
# lies at or below that speaker's typical lenis upper bound.
flo_per_place_of_articulation_normal_rate_fortis_as_target <- df_normal %>%
  left_join(typical_lenis_upper_bound_normal,
            by = c("speaker", "place_of_articulation")) %>%
  # Label each token relative to the typical lenis region; tokens for which
  # the comparison is NA stay unlabelled.
  mutate(inside_typical_lenis_region = case_when(
    dur_c/dur_word <= typical_lenis_upper_bound ~ "inside",
    dur_c/dur_word >  typical_lenis_upper_bound ~ "outside"
  )) %>%
  # Denominator: all tokens of the speaker / place / consonant-strength cell.
  group_by(variety, age, speaker_group_label, speaker,
           place_of_articulation, consonant_strength) %>%
  mutate(number_of_tokens_in_group = n()) %>%
  # Numerator: tokens per region label within that cell.
  group_by(variety, age, speaker, speaker_group_label,
           place_of_articulation, consonant_strength,
           number_of_tokens_in_group,
           inside_typical_lenis_region) %>%
  summarise(absolute_frequency = n()) %>%
  mutate(relative_frequency = absolute_frequency / number_of_tokens_in_group) %>%
  select(-absolute_frequency) %>%
  # One column per region label; labels that never occur in a cell become 0.
  spread(key = inside_typical_lenis_region, value = relative_frequency, fill = 0) %>%
  filter(consonant_strength == "Fortis") %>%
  ungroup() %>%
  mutate(speech_rate = "normal")
## `summarise()` has grouped output by 'variety', 'age', 'speaker',
## 'speaker_group_label', 'place_of_articulation', 'consonant_strength',
## 'number_of_tokens_in_group'. You can override using the `.groups` argument.
# Fortis-lenis overlap (FLO) at fast rate, per speaker and place of
# articulation. The result keeps the fortis rows only; its `inside` column is
# the share of a speaker's fortis tokens whose normalized closure duration
# lies at or below that speaker's typical lenis upper bound.
flo_per_place_of_articulation_fast_rate_fortis_as_target <- df_fast %>%
  left_join(typical_lenis_upper_bound_fast,
            by = c("speaker", "place_of_articulation")) %>%
  # Label each token relative to the typical lenis region; tokens for which
  # the comparison is NA stay unlabelled.
  mutate(inside_typical_lenis_region = case_when(
    dur_c/dur_word <= typical_lenis_upper_bound ~ "inside",
    dur_c/dur_word >  typical_lenis_upper_bound ~ "outside"
  )) %>%
  # Denominator: all tokens of the speaker / place / consonant-strength cell.
  group_by(variety, age, speaker_group_label, speaker,
           place_of_articulation, consonant_strength) %>%
  mutate(number_of_tokens_in_group = n()) %>%
  # Numerator: tokens per region label within that cell (the cell size is
  # carried along as a grouping variable so it survives the summary).
  group_by(number_of_tokens_in_group,
           inside_typical_lenis_region,
           .add = TRUE) %>%
  summarise(absolute_frequency = n()) %>%
  mutate(relative_frequency = absolute_frequency / number_of_tokens_in_group) %>%
  select(-absolute_frequency) %>%
  # One column per region label; labels that never occur in a cell become 0.
  spread(key = inside_typical_lenis_region, value = relative_frequency, fill = 0) %>%
  filter(consonant_strength == "Fortis") %>%
  ungroup() %>%
  mutate(speech_rate = "fast")
## `summarise()` has grouped output by 'variety', 'age', 'speaker_group_label',
## 'speaker', 'place_of_articulation', 'consonant_strength',
## 'number_of_tokens_in_group'. You can override using the `.groups` argument.
# Stack the normal-rate and fast-rate overlap tables (per place of
# articulation); the `speech_rate` column tells the two apart.
flo_per_place_of_articulation_all_rates_fortis_as_target <- rbind(
  flo_per_place_of_articulation_normal_rate_fortis_as_target,
  flo_per_place_of_articulation_fast_rate_fortis_as_target
)

6.1.1.2 Per word

# Fortis-lenis overlap (FLO) at normal rate, per speaker and target word. The
# result keeps the fortis rows only; its `inside` column is the share of a
# speaker's tokens of that word whose normalized closure duration lies at or
# below the speaker's typical lenis upper bound for that place of
# articulation.
flo_per_word_normal_rate_fortis_as_target <- df_normal %>%
  left_join(typical_lenis_upper_bound_normal,
            by = c("speaker", "place_of_articulation")) %>%
  # Label each token relative to the typical lenis region; tokens for which
  # the comparison is NA stay unlabelled.
  mutate(inside_typical_lenis_region = case_when(
    dur_c/dur_word <= typical_lenis_upper_bound ~ "inside",
    dur_c/dur_word >  typical_lenis_upper_bound ~ "outside"
  )) %>%
  # Denominator: all tokens of the speaker / target-word cell.
  group_by(variety, age, speaker_group_label, speaker,
           place_of_articulation, consonant_strength,
           target_word) %>%
  mutate(number_of_tokens_in_group = n()) %>%
  # Numerator: tokens per region label within that cell (the cell size is
  # carried along as a grouping variable so it survives the summary).
  group_by(number_of_tokens_in_group,
           inside_typical_lenis_region,
           .add = TRUE) %>%
  summarise(absolute_frequency = n()) %>%
  mutate(relative_frequency = absolute_frequency / number_of_tokens_in_group) %>%
  select(-absolute_frequency) %>%
  # One column per region label; labels that never occur in a cell become 0.
  spread(key = inside_typical_lenis_region, value = relative_frequency, fill = 0) %>%
  filter(consonant_strength == "Fortis") %>%
  mutate(speech_rate = "normal") %>%
  ungroup()
## `summarise()` has grouped output by 'variety', 'age', 'speaker_group_label',
## 'speaker', 'place_of_articulation', 'consonant_strength', 'target_word',
## 'number_of_tokens_in_group'. You can override using the `.groups` argument.
# Fortis-lenis overlap (FLO) at fast rate, per speaker and target word. The
# result keeps the fortis rows only; its `inside` column is the share of a
# speaker's tokens of that word whose normalized closure duration lies at or
# below the speaker's typical lenis upper bound for that place of
# articulation.
flo_per_word_fast_rate_fortis_as_target <- df_fast %>%
  left_join(typical_lenis_upper_bound_fast,
            by = c("speaker", "place_of_articulation")) %>%
  # Label each token relative to the typical lenis region; tokens for which
  # the comparison is NA stay unlabelled.
  mutate(inside_typical_lenis_region = case_when(
    dur_c/dur_word <= typical_lenis_upper_bound ~ "inside",
    dur_c/dur_word >  typical_lenis_upper_bound ~ "outside"
  )) %>%
  # Denominator: all tokens of the speaker / target-word cell.
  group_by(variety, age, speaker_group_label, speaker,
           place_of_articulation, consonant_strength,
           target_word) %>%
  mutate(number_of_tokens_in_group = n()) %>%
  # Numerator: tokens per region label within that cell (the cell size is
  # carried along as a grouping variable so it survives the summary).
  group_by(number_of_tokens_in_group,
           inside_typical_lenis_region,
           .add = TRUE) %>%
  summarise(absolute_frequency = n()) %>%
  mutate(relative_frequency = absolute_frequency / number_of_tokens_in_group) %>%
  select(-absolute_frequency) %>%
  # One column per region label; labels that never occur in a cell become 0.
  spread(key = inside_typical_lenis_region, value = relative_frequency, fill = 0) %>%
  filter(consonant_strength == "Fortis") %>%
  mutate(speech_rate = "fast") %>%
  ungroup()
## `summarise()` has grouped output by 'variety', 'age', 'speaker_group_label',
## 'speaker', 'place_of_articulation', 'consonant_strength', 'target_word',
## 'number_of_tokens_in_group'. You can override using the `.groups` argument.
# Stack the normal-rate and fast-rate per-word overlap tables; the
# `speech_rate` column tells the two apart.
flo_per_word_all_rates_fortis_as_target <- rbind(
  flo_per_word_normal_rate_fortis_as_target,
  flo_per_word_fast_rate_fortis_as_target
)

6.2 Plot per place of articulation (Figure 4.5)

# Figure 4.5: fortis-lenis overlap at normal rate per place of articulation,
# one jittered point per row of the overlap table, one panel per speaker
# group.
ggplot(flo_per_place_of_articulation_normal_rate_fortis_as_target,
       aes(x = place_of_articulation, y = inside)) +
  geom_jitter(aes(color = place_of_articulation)) +
  facet_grid(cols = vars(speaker_group_label)) +
  scale_y_continuous(breaks = seq(from = 0, to = 1, by = 0.2)) +
  scale_color_discrete(name = "Place of articulation") +
  # Leave room below 0 so jittered points at zero overlap are not clipped.
  coord_cartesian(ylim = c(-0.1,1)) +
  labs(x = "Place of articulation",
       y = "Fortis–lenis overlap")

6.3 Plots per word (Figure 4.6)

# Figure 4.6: fortis-lenis overlap at normal rate per target word, one plot
# per place of articulation listed in the loop (currently only "alveolar"),
# one panel per speaker group.
for (current_poa in c("alveolar")) {
  # Stored under a name that does not mask graphics::plot.
  flo_per_word_plot <- flo_per_word_normal_rate_fortis_as_target %>% 
    filter(place_of_articulation == current_poa) %>% 
    
    ggplot() +
    aes(y = inside, x = target_word) +
    facet_grid(cols = vars(speaker_group_label)) +
    # Leave room below 0 so jittered points at zero overlap are not clipped.
    coord_cartesian(ylim = c(-0.1,1)) +
    ylab("Fortis–lenis overlap") +
    xlab("Target word") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
    geom_jitter(aes(color = target_word)) +
    scale_color_discrete(name = "Target word") +
    scale_y_continuous(breaks = seq(from = 0, to = 1, by = 0.2))
  
  # ggplot objects are not auto-printed inside a loop.
  print(flo_per_word_plot)
}