1 Underlying data set

1.1 Speaker groups

# Speakers per variety-by-age cell: the first count() collapses the tokens to
# one row per speaker, the second counts those speaker rows within each cell.
df %>%
  count(variety, age, speaker) %>%
  count(variety, age)

1.2 Tokens per speech rate

# Number of tokens recorded at each speech rate.
count(df, rate)

1.3 Tokens per target word

# Token counts per target word, ordered by word group and, within a group,
# from the most to the least frequent word. The word group only serves as a
# sort key and is dropped from the printed table.
df %>%
  count(word_group, target_word) %>%
  arrange(word_group, desc(n)) %>%
  select(-word_group)

1.4 Matrix of target words

# Matrix of target words: one row per word group, one column per quantity
# category. count() is used only to obtain the distinct combinations, so the
# count column is discarded before spreading.
df %>%
  count(word_group, category, target_word) %>%
  select(-n) %>%
  spread(key = category, value = target_word)

1.5 Types and tokens per quantity category

# Types: number of distinct target words in each quantity category.
df %>%
  count(category, target_word) %>%
  count(category)

# Tokens: number of observations in each quantity category.
count(df, category)

2 Closure-norm (word-normalized stop closure duration)

2.1 Plot (Figure 4.1)

# Figure 4.1: box plots of word-normalized closure duration
# (dur_c / dur_word) per quantity category. Speaker groups are laid out in
# columns, speech rates in rows, and the boxes are filled according to `legal`.
df %>%
  ggplot(mapping = aes(x = category, y = dur_c / dur_word)) +
  geom_boxplot(mapping = aes(fill = legal)) +
  facet_grid(rows = vars(rate),
             cols = vars(speaker_group_label)) +
  # Each category tick gets an example word on a second line.
  scale_x_discrete(labels = c(
    "V:C:" = "V:C:\ne.g. Bieter",
    "V:C"  = "V:C\ne.g. wieder",
    "VC"   = "VC\ne.g. Widder",
    "VC:"  = "VC:\ne.g. bitter"
  )) +
  scale_fill_manual(name = "Phonotactically",
                    values = c("red", "green")) +
  xlab("Quantity category") +
  ylab(bquote("closure"["norm"])) +
  # Rotate the two-line tick labels so that they do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

2.2 Model: Normalized closure duration

# Raise the emmeans limits for pbkrtest- and lmerTest-based degrees of freedom
# to the number of rows in df.
emm_options(pbkrtest.limit = nrow(df),
            lmerTest.limit = nrow(df))

# Linear mixed model of word-normalized closure duration (dur_c / dur_word).
# Fixed effects: speaker group, quantity category, speech rate and all their
# interactions. Random effects: by-speaker intercepts with slopes for category
# and rate; by-target-word intercepts with slopes for speaker group and rate.
# speaker_group is turned into an ordered factor (Y_SD < Y_WB < O_WB) first.
# NOTE(review): the rendered output following this chunk reports a boundary
# (singular) fit.
general_closure.lmer <- df %>%
  mutate(speaker_group = factor(speaker_group,
                                levels = c("Y_SD", "Y_WB", "O_WB"),
                                ordered = TRUE)) %>%
  lmer(formula = dur_c / dur_word ~
         speaker_group * category * rate +
         (category + rate | speaker) +
         (speaker_group + rate | target_word),
       data = .)

## boundary (singular) fit: see help('isSingular')

# ANOVA table for the fixed effects of the closure-duration model; kept in a
# variable and printed.
general_closure.anova <- anova(general_closure.lmer)
print(general_closure.anova)

## Type III Analysis of Variance Table with Satterthwaite's method
##                               Sum Sq  Mean Sq NumDF  DenDF F value    Pr(>F)
## speaker_group               0.043382 0.021691     2   42.1 12.2464 6.433e-05
## category                    0.116567 0.038856     3   22.9 21.9373 6.272e-07
## rate                        0.008194 0.008194     1   36.8  4.6263  0.038116
## speaker_group:category      0.048138 0.008023     6   32.3  4.5297  0.001943
## speaker_group:rate          0.023508 0.011754     2  110.5  6.6361  0.001898
## category:rate               0.008341 0.002780     3   36.4  1.5697  0.213366
## speaker_group:category:rate 0.011324 0.001887     6 7117.4  1.0656  0.380713
##                                
## speaker_group               ***
## category                    ***
## rate                        *  
## speaker_group:category      ** 
## speaker_group:rate          ** 
## category:rate                  
## speaker_group:category:rate    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

2.2.1 Pairwise comparisons for Section 4.3.1 (Closure duration)

# Estimated marginal means and pairwise speaker-group contrasts within each
# quantity category, using Satterthwaite degrees of freedom.
emmeans(
  object = general_closure.lmer,
  specs = pairwise ~ speaker_group | category,
  lmer.df = "satterthwaite"
)

## NOTE: Results may be misleading due to involvement in interactions

## $emmeans
## category = V:C:
##  speaker_group emmean     SE   df lower.CL upper.CL
##  Y_SD           0.140 0.0180 24.4   0.1023    0.177
##  Y_WB           0.114 0.0188 24.1   0.0752    0.153
##  O_WB           0.146 0.0198 23.8   0.1052    0.187
## 
## category = V:C::
##  speaker_group emmean     SE   df lower.CL upper.CL
##  Y_SD           0.210 0.0209 31.1   0.1671    0.252
##  Y_WB           0.255 0.0217 30.3   0.2112    0.300
##  O_WB           0.265 0.0227 29.3   0.2185    0.311
## 
## category = VC:
##  speaker_group emmean     SE   df lower.CL upper.CL
##  Y_SD           0.148 0.0246 27.2   0.0972    0.198
##  Y_WB           0.156 0.0255 26.7   0.1034    0.208
##  O_WB           0.248 0.0268 26.1   0.1927    0.303
## 
## category = VC::
##  speaker_group emmean     SE   df lower.CL upper.CL
##  Y_SD           0.271 0.0189 35.2   0.2325    0.309
##  Y_WB           0.335 0.0196 34.2   0.2952    0.375
##  O_WB           0.342 0.0204 32.9   0.3004    0.383
## 
## Results are averaged over the levels of: rate 
## Degrees-of-freedom method: satterthwaite 
## Confidence level used: 0.95 
## 
## $contrasts
## category = V:C:
##  contrast    estimate     SE   df t.ratio p.value
##  Y_SD - Y_WB  0.02562 0.0164 29.8   1.561  0.2778
##  Y_SD - O_WB -0.00649 0.0181 28.0  -0.357  0.9322
##  Y_WB - O_WB -0.03211 0.0113 39.9  -2.836  0.0191
## 
## category = V:C::
##  contrast    estimate     SE   df t.ratio p.value
##  Y_SD - Y_WB -0.04563 0.0207 42.6  -2.200  0.0827
##  Y_SD - O_WB -0.05506 0.0224 39.7  -2.461  0.0471
##  Y_WB - O_WB -0.00942 0.0163 44.3  -0.578  0.8323
## 
## category = VC:
##  contrast    estimate     SE   df t.ratio p.value
##  Y_SD - Y_WB -0.00820 0.0233 36.3  -0.352  0.9342
##  Y_SD - O_WB -0.10019 0.0255 33.5  -3.933  0.0011
##  Y_WB - O_WB -0.09199 0.0172 45.1  -5.345  <.0001
## 
## category = VC::
##  contrast    estimate     SE   df t.ratio p.value
##  Y_SD - Y_WB -0.06393 0.0196 46.2  -3.265  0.0057
##  Y_SD - O_WB -0.07098 0.0209 44.4  -3.399  0.0040
##  Y_WB - O_WB -0.00705 0.0161 41.4  -0.437  0.9003
## 
## Results are averaged over the levels of: rate 
## Degrees-of-freedom method: satterthwaite 
## P value adjustment: tukey method for comparing a family of 3 estimates

# Estimated marginal means and the normal-vs-fast contrast within each speaker
# group, using Satterthwaite degrees of freedom.
emmeans(
  object = general_closure.lmer,
  specs = pairwise ~ rate | speaker_group,
  lmer.df = "satterthwaite"
)

## NOTE: Results may be misleading due to involvement in interactions

## $emmeans
## speaker_group = Y_SD:
##  rate   emmean     SE   df lower.CL upper.CL
##  normal  0.188 0.0108 36.3    0.166    0.210
##  fast    0.196 0.0117 36.3    0.172    0.220
## 
## speaker_group = Y_WB:
##  rate   emmean     SE   df lower.CL upper.CL
##  normal  0.216 0.0113 34.8    0.193    0.239
##  fast    0.214 0.0119 35.7    0.190    0.238
## 
## speaker_group = O_WB:
##  rate   emmean     SE   df lower.CL upper.CL
##  normal  0.249 0.0119 33.4    0.225    0.273
##  fast    0.251 0.0123 34.5    0.226    0.276
## 
## Results are averaged over the levels of: category 
## Degrees-of-freedom method: satterthwaite 
## Confidence level used: 0.95 
## 
## $contrasts
## speaker_group = Y_SD:
##  contrast      estimate      SE   df t.ratio p.value
##  normal - fast -0.00777 0.00202 91.2  -3.850  0.0002
## 
## speaker_group = Y_WB:
##  contrast      estimate      SE   df t.ratio p.value
##  normal - fast  0.00191 0.00203 92.5   0.945  0.3469
## 
## speaker_group = O_WB:
##  contrast      estimate      SE   df t.ratio p.value
##  normal - fast -0.00263 0.00203 92.5  -1.299  0.1971
## 
## Results are averaged over the levels of: category 
## Degrees-of-freedom method: satterthwaite

2.2.2 Pairwise comparisons for Section 4.3.3 (Subsection: Lenition in fast speech)

# Normal-vs-fast contrast within every combination of quantity category and
# speaker group, using Satterthwaite degrees of freedom.
emmeans(
  object = general_closure.lmer,
  specs = pairwise ~ rate | category | speaker_group,
  lmer.df = "satterthwaite"
)

## $emmeans
## category = V:C, speaker_group = Y_SD:
##  rate   emmean     SE   df lower.CL upper.CL
##  normal  0.135 0.0175 24.9   0.0988    0.171
##  fast    0.144 0.0187 24.3   0.1056    0.183
## 
## category = V:C:, speaker_group = Y_SD:
##  rate   emmean     SE   df lower.CL upper.CL
##  normal  0.207 0.0203 31.7   0.1657    0.248
##  fast    0.213 0.0217 30.9   0.1683    0.257
## 
## category = VC, speaker_group = Y_SD:
##  rate   emmean     SE   df lower.CL upper.CL
##  normal  0.145 0.0238 27.6   0.0957    0.193
##  fast    0.151 0.0256 27.3   0.0983    0.203
## 
## category = VC:, speaker_group = Y_SD:
##  rate   emmean     SE   df lower.CL upper.CL
##  normal  0.266 0.0184 35.8   0.2285    0.303
##  fast    0.276 0.0196 35.0   0.2362    0.316
## 
## category = V:C, speaker_group = Y_WB:
##  rate   emmean     SE   df lower.CL upper.CL
##  normal  0.112 0.0185 24.5   0.0742    0.151
##  fast    0.115 0.0192 24.2   0.0758    0.155
## 
## category = V:C:, speaker_group = Y_WB:
##  rate   emmean     SE   df lower.CL upper.CL
##  normal  0.256 0.0213 30.5   0.2120    0.299
##  fast    0.255 0.0222 30.4   0.2101    0.301
## 
## category = VC, speaker_group = Y_WB:
##  rate   emmean     SE   df lower.CL upper.CL
##  normal  0.159 0.0251 26.9   0.1076    0.211
##  fast    0.153 0.0262 27.0   0.0988    0.206
## 
## category = VC:, speaker_group = Y_WB:
##  rate   emmean     SE   df lower.CL upper.CL
##  normal  0.337 0.0192 34.4   0.2978    0.376
##  fast    0.333 0.0200 34.4   0.2923    0.374
## 
## category = V:C, speaker_group = O_WB:
##  rate   emmean     SE   df lower.CL upper.CL
##  normal  0.143 0.0196 24.1   0.1020    0.183
##  fast    0.149 0.0201 23.9   0.1081    0.191
## 
## category = V:C:, speaker_group = O_WB:
##  rate   emmean     SE   df lower.CL upper.CL
##  normal  0.261 0.0225 29.4   0.2150    0.307
##  fast    0.269 0.0230 29.6   0.2217    0.316
## 
## category = VC, speaker_group = O_WB:
##  rate   emmean     SE   df lower.CL upper.CL
##  normal  0.249 0.0266 26.1   0.1942    0.303
##  fast    0.247 0.0273 26.4   0.1908    0.303
## 
## category = VC:, speaker_group = O_WB:
##  rate   emmean     SE   df lower.CL upper.CL
##  normal  0.343 0.0202 32.9   0.3020    0.384
##  fast    0.341 0.0207 33.3   0.2986    0.383
## 
## Degrees-of-freedom method: satterthwaite 
## Confidence level used: 0.95 
## 
## $contrasts
## category = V:C, speaker_group = Y_SD:
##  contrast       estimate      SE  df t.ratio p.value
##  normal - fast -0.009278 0.00356 180  -2.604  0.0100
## 
## category = V:C:, speaker_group = Y_SD:
##  contrast       estimate      SE  df t.ratio p.value
##  normal - fast -0.005457 0.00383 183  -1.424  0.1560
## 
## category = VC, speaker_group = Y_SD:
##  contrast       estimate      SE  df t.ratio p.value
##  normal - fast -0.006156 0.00463 184  -1.329  0.1855
## 
## category = VC:, speaker_group = Y_SD:
##  contrast       estimate      SE  df t.ratio p.value
##  normal - fast -0.010189 0.00332 171  -3.071  0.0025
## 
## category = V:C, speaker_group = Y_WB:
##  contrast       estimate      SE  df t.ratio p.value
##  normal - fast -0.003083 0.00364 196  -0.848  0.3977
## 
## category = V:C:, speaker_group = Y_WB:
##  contrast       estimate      SE  df t.ratio p.value
##  normal - fast  0.000174 0.00383 182   0.045  0.9638
## 
## category = VC, speaker_group = Y_WB:
##  contrast       estimate      SE  df t.ratio p.value
##  normal - fast  0.006659 0.00463 184   1.438  0.1522
## 
## category = VC:, speaker_group = Y_WB:
##  contrast       estimate      SE  df t.ratio p.value
##  normal - fast  0.003909 0.00331 170   1.179  0.2398
## 
## category = V:C, speaker_group = O_WB:
##  contrast       estimate      SE  df t.ratio p.value
##  normal - fast -0.006968 0.00359 187  -1.939  0.0541
## 
## category = V:C:, speaker_group = O_WB:
##  contrast       estimate      SE  df t.ratio p.value
##  normal - fast -0.007850 0.00383 184  -2.047  0.0420
## 
## category = VC, speaker_group = O_WB:
##  contrast       estimate      SE  df t.ratio p.value
##  normal - fast  0.001968 0.00465 187   0.423  0.6727
## 
## category = VC:, speaker_group = O_WB:
##  contrast       estimate      SE  df t.ratio p.value
##  normal - fast  0.002322 0.00333 173   0.698  0.4862
## 
## Degrees-of-freedom method: satterthwaite

3 VOT-norm (word-normalized voice onset time)

3.1 Plot (Figure 4.2)

# Figure 4.2: box plots of word-normalized voice onset time
# (dur_aspiration / dur_word) per quantity category. Speaker groups are laid
# out in columns, speech rates in rows, and the boxes are filled according to
# `legal`.
df %>%
  ggplot(mapping = aes(x = category, y = dur_aspiration / dur_word)) +
  geom_boxplot(mapping = aes(fill = legal)) +
  facet_grid(rows = vars(rate),
             cols = vars(speaker_group_label)) +
  # Each category tick gets an example word on a second line.
  scale_x_discrete(labels = c(
    "V:C:" = "V:C:\ne.g. Bieter",
    "V:C"  = "V:C\ne.g. wieder",
    "VC"   = "VC\ne.g. Widder",
    "VC:"  = "VC:\ne.g. bitter"
  )) +
  scale_fill_manual(name = "Phonotactically",
                    values = c("red", "green")) +
  xlab("Quantity category") +
  ylab(bquote("VOT"["norm"])) +
  # Rotate the two-line tick labels so that they do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

3.2 Model

# Raise the emmeans limits for pbkrtest- and lmerTest-based degrees of freedom
# to the number of rows in df.
emm_options(pbkrtest.limit = nrow(df),
            lmerTest.limit = nrow(df))

# Linear mixed model of word-normalized voice onset time
# (dur_aspiration / dur_word) with the same fixed- and random-effects
# structure as the closure-duration model: speaker group x category x rate,
# by-speaker slopes for category and rate, by-target-word slopes for speaker
# group and rate. speaker_group is an ordered factor (Y_SD < Y_WB < O_WB).
# NOTE(review): the rendered output following this chunk shows a
# "Model failed to converge" warning for this fit.
normalized_vot.lmer <- df %>%
  mutate(speaker_group = factor(speaker_group,
                                levels = c("Y_SD", "Y_WB", "O_WB"),
                                ordered = TRUE)) %>%
  lmer(formula = dur_aspiration / dur_word ~
         speaker_group * category * rate +
         (category + rate | speaker) +
         (speaker_group + rate | target_word),
       data = .)

## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.014128 (tol = 0.002, component 1)

# ANOVA table for the fixed effects of the VOT model; kept in a variable and
# printed.
normalized_vot.anova <- anova(normalized_vot.lmer)
print(normalized_vot.anova)

## Type III Analysis of Variance Table with Satterthwaite's method
##                               Sum Sq   Mean Sq NumDF  DenDF F value    Pr(>F)
## speaker_group               0.001735 0.0008674     2   32.6  0.7842   0.46488
## category                    0.035573 0.0118577     3   26.5 10.7204 8.644e-05
## rate                        0.000610 0.0006097     1  106.8  0.5513   0.45943
## speaker_group:category      0.046282 0.0077137     6   34.6  6.9739 6.417e-05
## speaker_group:rate          0.004183 0.0020914     2  192.2  1.8908   0.15374
## category:rate               0.008637 0.0028789     3  197.2  2.6028   0.05318
## speaker_group:category:rate 0.004970 0.0008283     6 7117.0  0.7489   0.61026
##                                
## speaker_group                  
## category                    ***
## rate                           
## speaker_group:category      ***
## speaker_group:rate             
## category:rate               .  
## speaker_group:category:rate    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

3.2.1 Pairwise comparisons for Section 4.3.2 (Voice onset time)

# Estimated marginal means and pairwise quantity-category contrasts within
# each speaker group, using Satterthwaite degrees of freedom.
emmeans(
  object = normalized_vot.lmer,
  specs = pairwise ~ category | speaker_group,
  lmer.df = "satterthwaite"
)

## NOTE: Results may be misleading due to involvement in interactions

## $emmeans
## speaker_group = Y_SD:
##  category emmean      SE   df lower.CL upper.CL
##  V:C      0.0319 0.00780 45.4   0.0162   0.0476
##  V:C:     0.0966 0.00946 44.9   0.0775   0.1156
##  VC       0.0424 0.00998 45.0   0.0224   0.0625
##  VC:      0.0925 0.00805 45.2   0.0763   0.1088
## 
## speaker_group = Y_WB:
##  category emmean      SE   df lower.CL upper.CL
##  V:C      0.0358 0.00755 45.5   0.0206   0.0510
##  V:C:     0.0637 0.00921 43.7   0.0451   0.0822
##  VC       0.0542 0.00961 45.2   0.0348   0.0735
##  VC:      0.0666 0.00783 44.1   0.0508   0.0824
## 
## speaker_group = O_WB:
##  category emmean      SE   df lower.CL upper.CL
##  V:C      0.0352 0.00956 40.1   0.0159   0.0546
##  V:C:     0.0585 0.01119 44.6   0.0359   0.0810
##  VC       0.0761 0.01237 38.5   0.0510   0.1011
##  VC:      0.0702 0.00957 44.1   0.0509   0.0895
## 
## Results are averaged over the levels of: rate 
## Degrees-of-freedom method: satterthwaite 
## Confidence level used: 0.95 
## 
## $contrasts
## speaker_group = Y_SD:
##  contrast   estimate      SE   df t.ratio p.value
##  V:C - V:C: -0.06469 0.00987 40.0  -6.554  <.0001
##  V:C - VC   -0.01058 0.01054 36.2  -1.003  0.7484
##  V:C - VC:  -0.06067 0.00932 40.9  -6.513  <.0001
##  V:C: - VC   0.05411 0.01107 37.6   4.889  0.0001
##  V:C: - VC:  0.00402 0.00795 24.7   0.506  0.9570
##  VC - VC:   -0.05009 0.01007 34.4  -4.972  0.0001
## 
## speaker_group = Y_WB:
##  contrast   estimate      SE   df t.ratio p.value
##  V:C - V:C: -0.02787 0.00942 41.6  -2.960  0.0251
##  V:C - VC   -0.01840 0.01000 37.9  -1.841  0.2707
##  V:C - VC:  -0.03081 0.00890 42.4  -3.461  0.0066
##  V:C: - VC   0.00947 0.01051 39.3   0.901  0.8045
##  V:C: - VC: -0.00294 0.00740 25.2  -0.397  0.9784
##  VC - VC:   -0.01240 0.00952 36.0  -1.303  0.5673
## 
## speaker_group = O_WB:
##  contrast   estimate      SE   df t.ratio p.value
##  V:C - V:C: -0.02325 0.01279 32.2  -1.818  0.2837
##  V:C - VC   -0.04085 0.01397 29.3  -2.924  0.0317
##  V:C - VC:  -0.03500 0.01200 33.0  -2.916  0.0306
##  V:C: - VC  -0.01760 0.01454 30.3  -1.210  0.6256
##  V:C: - VC: -0.01175 0.01120 22.8  -1.049  0.7231
##  VC - VC:    0.00585 0.01348 28.1   0.434  0.9721
## 
## Results are averaged over the levels of: rate 
## Degrees-of-freedom method: satterthwaite 
## P value adjustment: tukey method for comparing a family of 4 estimates

4 Optimal category boundary

4.1 Calculation

# Candidate category boundaries for the closure duration dur_c: 0 to 280 in
# steps of 1.
possible_boundaries <- seq(0, 280, by = 1)

# Share of tokens that one duration threshold classifies correctly.
#
# A "Fortis" token counts as correct when its closure duration is at or above
# `boundary`; a "Lenis" token counts as correct when it is below. Tokens with
# a missing duration or any other consonant_strength value are left out of the
# share. Returns NaN when there is no token to score.
#
# Using mean() instead of correct / (correct + incorrect) on spread-out counts
# keeps the result defined (0 or 1) when every token falls on the same side.
classification_accuracy <- function(dur_c, consonant_strength, boundary) {
  is_correct <- ifelse(
    consonant_strength == "Fortis",
    dur_c >= boundary,
    ifelse(consonant_strength == "Lenis", dur_c < boundary, NA)
  )
  mean(is_correct, na.rm = TRUE)
}

# Classification accuracy of every candidate boundary for every speaker,
# restricted to the tokens produced at `speech_rate`.
#
# Returns a tibble with the columns speaker, boundary and
# classified_correctly (one row per speaker-boundary combination).
compute_boundary_accuracy <- function(data, speech_rate, boundaries) {
  # Subset once per rate and once per speaker instead of re-filtering the
  # whole data set for every row of the grid.
  rate_tokens <- data[which(data$rate == speech_rate), ]
  tokens_by_speaker <- split(rate_tokens, as.character(rate_tokens$speaker))

  accuracy_grid <- tidyr::crossing(speaker = data$speaker,
                                   boundary = boundaries)

  accuracy_grid$classified_correctly <- vapply(
    seq_len(nrow(accuracy_grid)),
    function(i) {
      speaker_key <- as.character(accuracy_grid$speaker[[i]])
      speaker_tokens <- tokens_by_speaker[[speaker_key]]
      classification_accuracy(speaker_tokens$dur_c,
                              speaker_tokens$consonant_strength,
                              accuracy_grid$boundary[[i]])
    },
    numeric(1)
  )

  accuracy_grid
}

# Best boundary per speaker: among the boundaries that share the highest
# accuracy, report the lowest, the highest and their midpoint, plus the
# accuracy reached. The speech rate is attached as the column `rate`.
summarise_best_boundaries <- function(all_boundary_data, speech_rate) {
  all_boundary_data %>%
    group_by(speaker) %>%
    top_n(1, classified_correctly) %>%
    summarise(min_boundary = min(boundary),
              max_boundary = max(boundary),
              boundary = mean(c(min_boundary, max_boundary)),
              classified_correctly = first(classified_correctly)) %>%
    mutate(rate = speech_rate)
}

# Normal speech rate
all_boundary_data_normal <- compute_boundary_accuracy(df, "normal",
                                                      possible_boundaries)
best_boundary_data_normal <- summarise_best_boundaries(all_boundary_data_normal,
                                                       "normal")

# The same again for fast speech
all_boundary_data_fast <- compute_boundary_accuracy(df, "fast",
                                                    possible_boundaries)
best_boundary_data_fast <- summarise_best_boundaries(all_boundary_data_fast,
                                                     "fast")

best_boundary_data_all <- rbind(best_boundary_data_normal,
                                best_boundary_data_fast)

4.2 Plot (Figure 4.3)

# Figure 4.3: optimal category boundary per speaker at both speech rates, one
# panel per speaker group, with a line connecting each speaker's two points.
best_boundary_data_all %>%
  # df holds one row per token, so the speaker metadata is de-duplicated
  # before joining; otherwise every boundary row would first be multiplied by
  # the speaker's token count and only collapsed again afterwards.
  left_join(distinct(select(df, speaker, age, variety, speaker_group_label)),
            by = "speaker") %>%
  distinct() %>%

  ggplot() +
  facet_wrap(vars(speaker_group_label)) +
  # Reorder the rate axis by descending rate value.
  aes(x = reorder(rate, desc(rate)), y = boundary, group = speaker) +
  geom_point() +
  geom_line() +
  ylab("Optimal category boundary [ms]") +
  xlab("Speech rate")

5 Category expansion

5.1 Calculation

# Change in dispersion per speaker and target word.
# For every speaker, target word and speech rate, the coefficient of variation
# (sd / mean) of dur_c_norm_word is computed; the two rates are then placed
# side by side and the change is taken as fast minus normal.
df_dispersion_difference_with_target_word <- df %>%
  # Long format with the measure name in its own column.
  gather(key = "measure", value = "value", dur_c_norm_word) %>%
  group_by(measure, variety, age, speaker_group, speaker_group_label,
           category, legal, rate, speaker, target_word) %>%
  summarise(coefficient_of_variation =
              sd(value, na.rm = TRUE) / mean(value, na.rm = TRUE)) %>%
  ungroup() %>%
  spread(key = rate, value = coefficient_of_variation) %>%
  mutate(change_in_dispersion = fast - normal)

## `summarise()` has grouped output by 'measure', 'variety', 'age',
## 'speaker_group', 'speaker_group_label', 'category', 'legal', 'rate', 'speaker'.
## You can override using the `.groups` argument.

# Change in dispersion per speaker, pooled over target words.
# For every speaker, quantity category and speech rate, the coefficient of
# variation (sd / mean) of dur_c_norm_word is computed; the two rates are then
# placed side by side and the change is taken as fast minus normal.
df_dispersion_difference_without_target_word <- df %>%
  # Long format with the measure name in its own column.
  gather(key = "measure", value = "value", dur_c_norm_word) %>%
  group_by(measure, variety, age, speaker_group, speaker_group_label,
           category, legal, rate, speaker) %>%
  summarise(coefficient_of_variation =
              sd(value, na.rm = TRUE) / mean(value, na.rm = TRUE)) %>%
  ungroup() %>%
  spread(key = rate, value = coefficient_of_variation) %>%
  mutate(change_in_dispersion = fast - normal)

## `summarise()` has grouped output by 'measure', 'variety', 'age',
## 'speaker_group', 'speaker_group_label', 'category', 'legal', 'rate'. You can
## override using the `.groups` argument.

5.2 Plot (Figure 4.4)

# Figure 4.4: change in dispersion (fast minus normal) per quantity category,
# one panel per speaker group. Individual values are drawn as points beneath
# semi-transparent boxes without whiskers or outlier markers; a horizontal
# line marks zero change.
df_dispersion_difference_without_target_word %>%
  filter(measure == "dur_c_norm_word") %>%
  # Replace the group codes by readable panel titles.
  mutate(speaker_group = factor(
    speaker_group,
    levels = c("Y_SD", "Y_WB", "O_WB"),
    labels = c("Standard German", "Dialect, younger", "Dialect, older")
  )) %>%
  ggplot(mapping = aes(x = category, y = change_in_dispersion)) +
  geom_point() +
  geom_boxplot(alpha = 0.8,
               outlier.shape = NA,
               coef = 0) +
  geom_hline(aes(yintercept = 0)) +
  facet_grid(cols = vars(speaker_group),
             scales = "free_y") +
  scale_x_discrete(labels = c(
    "V:C:" = "V:C:\ne.g. Bieter",
    "V:C"  = "V:C\ne.g. wieder",
    "VC"   = "VC\ne.g. Widder",
    "VC:"  = "VC:\ne.g. bitter"
  )) +
  # NOTE(review): no layer of this plot maps a fill aesthetic, so this scale
  # appears to have nothing to act on.
  scale_fill_manual(name = "Phonotactically",
                    values = c("red", "green")) +
  # Zoom the y axis without discarding values outside the window.
  coord_cartesian(ylim = c(-0.15, 0.3)) +
  xlab("") +
  ylab("Category expansion") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

5.3 Model

# Raise the emmeans limits for pbkrtest- and lmerTest-based degrees of freedom
# to the number of rows in the per-word dispersion table.
emm_options(
  pbkrtest.limit = nrow(df_dispersion_difference_with_target_word),
  lmerTest.limit = nrow(df_dispersion_difference_with_target_word)
)

# Linear mixed model of the change in dispersion. Fixed effects: speaker
# group, quantity category and their interaction. Random effects: by-speaker
# intercepts with slopes for category; by-target-word intercepts with slopes
# for speaker group. speaker_group is an ordered factor (Y_SD < Y_WB < O_WB).
# NOTE(review): the rendered output following this chunk reports a boundary
# (singular) fit.
dispersion_difference.lmer <- df_dispersion_difference_with_target_word %>%
  filter(measure == "dur_c_norm_word") %>%
  mutate(speaker_group = factor(speaker_group,
                                levels = c("Y_SD", "Y_WB", "O_WB"),
                                ordered = TRUE)) %>%
  lmer(formula = change_in_dispersion ~
         speaker_group * category +
         (category | speaker) +
         (speaker_group | target_word),
       data = .)

## boundary (singular) fit: see help('isSingular')

# ANOVA table for the fixed effects of the dispersion model; kept in a
# variable and printed.
dispersion_difference.anova <- anova(dispersion_difference.lmer)
print(dispersion_difference.anova)

## Type III Analysis of Variance Table with Satterthwaite's method
##                          Sum Sq   Mean Sq NumDF  DenDF F value Pr(>F)
## speaker_group          0.032333 0.0161667     2 21.801  1.8137 0.1868
## category               0.003040 0.0010133     3 23.487  0.1137 0.9512
## speaker_group:category 0.078655 0.0131092     6 25.183  1.4707 0.2283

6 Fortis–lenis overlap

6.1 Calculation

# Lenis baseline for fortis target words, normal rate: per speaker and place
# of articulation, the 0.75 quantile of word-normalized closure duration
# (dur_c / dur_word) among the Lenis tokens.
typical_lenis_upper_bound_normal <-
  filter(df_normal, consonant_strength == "Lenis") %>%
  group_by(speaker, place_of_articulation) %>%
  summarise(
    typical_lenis_upper_bound = quantile(dur_c / dur_word, probs = 0.75)
  )

## `summarise()` has grouped output by 'speaker'. You can override using the
## `.groups` argument.

# Lenis baseline for fortis target words, fast rate: per speaker and place of
# articulation, the 0.75 quantile of word-normalized closure duration
# (dur_c / dur_word) among the Lenis tokens.
typical_lenis_upper_bound_fast <-
  filter(df_fast, consonant_strength == "Lenis") %>%
  group_by(speaker, place_of_articulation) %>%
  summarise(
    typical_lenis_upper_bound = quantile(dur_c / dur_word, probs = 0.75)
  )

## `summarise()` has grouped output by 'speaker'. You can override using the
## `.groups` argument.

# Fortis baseline for lenis target words, normal rate: per speaker and place
# of articulation, the 0.25 quantile of word-normalized closure duration
# (dur_c / dur_word) among the Fortis tokens.
typical_fortis_lower_bound_normal <-
  filter(df_normal, consonant_strength == "Fortis") %>%
  group_by(speaker, place_of_articulation) %>%
  summarise(
    typical_fortis_lower_bound = quantile(dur_c / dur_word, probs = 0.25)
  )

## `summarise()` has grouped output by 'speaker'. You can override using the
## `.groups` argument.

# Fortis baseline for lenis target words, fast rate: per speaker and place of
# articulation, the 0.25 quantile of word-normalized closure duration
# (dur_c / dur_word) among the Fortis tokens.
typical_fortis_lower_bound_fast <-
  filter(df_fast, consonant_strength == "Fortis") %>%
  group_by(speaker, place_of_articulation) %>%
  summarise(
    typical_fortis_lower_bound = quantile(dur_c / dur_word, probs = 0.25)
  )

## `summarise()` has grouped output by 'speaker'. You can override using the
## `.groups` argument.

6.1.1 Fortis as target, lenis as baseline

6.1.1.1 Per place of articulation

# FLO is short for fortis–lenis overlap.
# Normal rate, per speaker and place of articulation: the share of Fortis
# tokens whose word-normalized closure duration lies at or below the speaker's
# Lenis upper bound ("inside") versus above it ("outside").
flo_per_place_of_articulation_normal_rate_fortis_as_target <- df_normal %>%
  left_join(typical_lenis_upper_bound_normal,
            by = c("speaker", "place_of_articulation")) %>%

  # Label every token relative to the Lenis upper bound; the label is NA when
  # the comparison is NA.
  mutate(inside_typical_lenis_region = if_else(
    dur_c / dur_word <= typical_lenis_upper_bound,
    "inside",
    "outside"
  )) %>%

  # Denominator: all tokens of the speaker / place / strength cell.
  group_by(variety, age, speaker_group_label, speaker,
           place_of_articulation, consonant_strength) %>%
  mutate(number_of_tokens_in_group = n()) %>%

  # Numerator: tokens per label within that cell.
  # NOTE(review): speaker precedes speaker_group_label here, unlike in the
  # group_by() above, which determines the column order of the result.
  group_by(variety, age, speaker, speaker_group_label,
           place_of_articulation, consonant_strength,
           number_of_tokens_in_group, inside_typical_lenis_region) %>%
  summarise(absolute_frequency = n()) %>%
  mutate(relative_frequency = absolute_frequency / number_of_tokens_in_group) %>%
  select(-absolute_frequency) %>%

  # One column per label; a label that does not occur in a cell gets share 0.
  spread(key = inside_typical_lenis_region,
         value = relative_frequency,
         fill = 0) %>%

  filter(consonant_strength == "Fortis") %>%
  ungroup() %>%
  mutate(speech_rate = "normal")

## `summarise()` has grouped output by 'variety', 'age', 'speaker',
## 'speaker_group_label', 'place_of_articulation', 'consonant_strength',
## 'number_of_tokens_in_group'. You can override using the `.groups` argument.

# FLO is short for fortis–lenis overlap.
# Fast rate, per speaker and place of articulation: the share of Fortis tokens
# whose word-normalized closure duration lies at or below the speaker's Lenis
# upper bound ("inside") versus above it ("outside").
flo_per_place_of_articulation_fast_rate_fortis_as_target <- df_fast %>%
  left_join(typical_lenis_upper_bound_fast,
            by = c("speaker", "place_of_articulation")) %>%

  # Label every token relative to the Lenis upper bound; the label is NA when
  # the comparison is NA.
  mutate(inside_typical_lenis_region = if_else(
    dur_c / dur_word <= typical_lenis_upper_bound,
    "inside",
    "outside"
  )) %>%

  # Denominator: all tokens of the speaker / place / strength cell.
  group_by(variety, age, speaker_group_label, speaker,
           place_of_articulation, consonant_strength) %>%
  mutate(number_of_tokens_in_group = n()) %>%

  # Numerator: tokens per label within that cell.
  group_by(number_of_tokens_in_group, inside_typical_lenis_region,
           .add = TRUE) %>%
  summarise(absolute_frequency = n()) %>%
  mutate(relative_frequency = absolute_frequency / number_of_tokens_in_group) %>%
  select(-absolute_frequency) %>%

  # One column per label; a label that does not occur in a cell gets share 0.
  spread(key = inside_typical_lenis_region,
         value = relative_frequency,
         fill = 0) %>%

  filter(consonant_strength == "Fortis") %>%
  ungroup() %>%
  mutate(speech_rate = "fast")

## `summarise()` has grouped output by 'variety', 'age', 'speaker_group_label',
## 'speaker', 'place_of_articulation', 'consonant_strength',
## 'number_of_tokens_in_group'. You can override using the `.groups` argument.

# Stack the normal-rate and fast-rate overlap tables (per place of
# articulation) into one table.
flo_per_place_of_articulation_all_rates_fortis_as_target <- rbind(
  flo_per_place_of_articulation_normal_rate_fortis_as_target,
  flo_per_place_of_articulation_fast_rate_fortis_as_target
)

6.1.1.2 Per word

# FLO is short for fortis–lenis overlap.
# Normal rate, per speaker and target word: the share of Fortis tokens whose
# word-normalized closure duration lies at or below the speaker's Lenis upper
# bound ("inside") versus above it ("outside").
flo_per_word_normal_rate_fortis_as_target <- df_normal %>%
  left_join(typical_lenis_upper_bound_normal,
            by = c("speaker", "place_of_articulation")) %>%

  # Label every token relative to the Lenis upper bound; the label is NA when
  # the comparison is NA.
  mutate(inside_typical_lenis_region = if_else(
    dur_c / dur_word <= typical_lenis_upper_bound,
    "inside",
    "outside"
  )) %>%

  # Denominator: all tokens of the speaker / place / strength / word cell.
  group_by(variety, age, speaker_group_label, speaker,
           place_of_articulation, consonant_strength, target_word) %>%
  mutate(number_of_tokens_in_group = n()) %>%

  # Numerator: tokens per label within that cell.
  group_by(number_of_tokens_in_group, inside_typical_lenis_region,
           .add = TRUE) %>%
  summarise(absolute_frequency = n()) %>%
  mutate(relative_frequency = absolute_frequency / number_of_tokens_in_group) %>%
  select(-absolute_frequency) %>%

  # One column per label; a label that does not occur in a cell gets share 0.
  spread(key = inside_typical_lenis_region,
         value = relative_frequency,
         fill = 0) %>%

  filter(consonant_strength == "Fortis") %>%
  mutate(speech_rate = "normal") %>%
  ungroup()

## `summarise()` has grouped output by 'variety', 'age', 'speaker_group_label',
## 'speaker', 'place_of_articulation', 'consonant_strength', 'target_word',
## 'number_of_tokens_in_group'. You can override using the `.groups` argument.

# FLO is short for fortis–lenis overlap.
# Fast rate, per speaker and target word: the share of Fortis tokens whose
# word-normalized closure duration lies at or below the speaker's Lenis upper
# bound ("inside") versus above it ("outside").
flo_per_word_fast_rate_fortis_as_target <- df_fast %>%
  left_join(typical_lenis_upper_bound_fast,
            by = c("speaker", "place_of_articulation")) %>%

  # Label every token relative to the Lenis upper bound; the label is NA when
  # the comparison is NA.
  mutate(inside_typical_lenis_region = if_else(
    dur_c / dur_word <= typical_lenis_upper_bound,
    "inside",
    "outside"
  )) %>%

  # Denominator: all tokens of the speaker / place / strength / word cell.
  group_by(variety, age, speaker_group_label, speaker,
           place_of_articulation, consonant_strength, target_word) %>%
  mutate(number_of_tokens_in_group = n()) %>%

  # Numerator: tokens per label within that cell.
  group_by(number_of_tokens_in_group, inside_typical_lenis_region,
           .add = TRUE) %>%
  summarise(absolute_frequency = n()) %>%
  mutate(relative_frequency = absolute_frequency / number_of_tokens_in_group) %>%
  select(-absolute_frequency) %>%

  # One column per label; a label that does not occur in a cell gets share 0.
  spread(key = inside_typical_lenis_region,
         value = relative_frequency,
         fill = 0) %>%

  filter(consonant_strength == "Fortis") %>%
  mutate(speech_rate = "fast") %>%
  ungroup()

## `summarise()` has grouped output by 'variety', 'age', 'speaker_group_label',
## 'speaker', 'place_of_articulation', 'consonant_strength', 'target_word',
## 'number_of_tokens_in_group'. You can override using the `.groups` argument.

# Stack the normal-rate and fast-rate overlap tables (per target word) into
# one table.
flo_per_word_all_rates_fortis_as_target <- rbind(
  flo_per_word_normal_rate_fortis_as_target,
  flo_per_word_fast_rate_fortis_as_target
)

6.2 Plot per place of articulation (Figure 4.5)

# Figure 4.5: fortis–lenis overlap at normal rate (share of Fortis tokens
# inside the Lenis region) per place of articulation, one panel per speaker
# group. Points are jittered and coloured by place of articulation.
flo_per_place_of_articulation_normal_rate_fortis_as_target %>%
  ggplot(mapping = aes(x = place_of_articulation, y = inside)) +
  geom_jitter(mapping = aes(color = place_of_articulation)) +
  facet_grid(cols = vars(speaker_group_label)) +
  scale_color_discrete(name = "Place of articulation") +
  scale_y_continuous(breaks = seq(from = 0, to = 1, by = 0.2)) +
  # The visible range starts slightly below 0.
  coord_cartesian(ylim = c(-0.1, 1)) +
  xlab("Place of articulation") +
  ylab("Fortis–lenis overlap")

6.3 Plots per word (Figure 4.6)

# Figure 4.6: fortis–lenis overlap at normal rate per target word, one plot
# per place of articulation listed in the loop (currently only "alveolar"),
# with one panel per speaker group. Points are jittered and coloured by
# target word.
for (current_poa in c("alveolar")) {
  current_plot <- flo_per_word_normal_rate_fortis_as_target %>%
    filter(place_of_articulation == current_poa) %>%
    ggplot(mapping = aes(x = target_word, y = inside)) +
    geom_jitter(mapping = aes(color = target_word)) +
    facet_grid(cols = vars(speaker_group_label)) +
    scale_color_discrete(name = "Target word") +
    scale_y_continuous(breaks = seq(from = 0, to = 1, by = 0.2)) +
    # The visible range starts slightly below 0.
    coord_cartesian(ylim = c(-0.1, 1)) +
    xlab("Target word") +
    ylab("Fortis–lenis overlap") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

  # Inside a for loop a ggplot object is only drawn when printed explicitly.
  print(current_plot)
}

Data, Plots & Models

Markus Jochim & Felicitas Kleber

June 25, 2022

1 Underlying data set

1.1 Speaker groups

1.2 Tokens per speech rate

1.3 Tokens per target word

1.4 Matrix of target words

1.5 Types and tokens per quantity category

2 Closure-norm (word-normalized stop closure duration)

2.1 Plot (Figure 4.1)

2.2 Model: Normalized closure duration

2.2.1 Pairwise comparisons for Section 4.3.1 (Closure duration)

2.2.2 Pairwise comparisons for Section 4.3.3 (Subsection: Lenition in fast speech)

3 VOT-norm (word-normalized voice onset time)

3.1 Plot (Figure 4.2)

3.2 Model

3.2.1 Pairwise comparisons for Section 4.3.2 (Voice onset time)

4 Optimal category boundary

4.1 Calculation

4.2 Plot (Figure 4.3)

5 Category expansion

5.1 Calculation

5.2 Plot (Figure 4.4)

5.3 Model

6 Fortis–lenis overlap

6.1 Calculation

6.1.1 Fortis as target, lenis as baseline

6.1.1.1 Per place of articulation

6.1.1.2 Per word

6.2 Plot per place of articulation (Figure 4.5)

6.3 Plots per word (Figure 4.6)