#################################################################################################################################
# Associated publication:
#
#  P. Greca and J. Harrington (2026):
# “Is metaphony (and sound change) sensitive to grammatical category? The case of verbs in the dialects of the Lausberg area”, 
#  Phonology (https://www.cambridge.org/core/journals/phonology).
#
# Script to generate all dataframes used for the analyses in "Greca&Harrington26_analyses.Rmd/html"
# Authors: Pia Greca & Jonathan Harrington
#################################################################################################################################

library(data.table)
library(emuR)
library(tidyverse)
library(dtt)
library(magrittr)
library(gridExtra)

# WARNING! when running this script, please execute ALL tests, because at some point data extraction gives problems for some segments

setwd("your_working_directory")

pfad = "filepath_of_folder_containing_emuDB"

db = load_emuDB(file.path(pfad, "db_emuDB")) #99 speakers

# if you want to have a look:

#serve(db, useViewer = F)

#########################
##### segment query #####
#########################

e.s = query(db, "[MAU=e|je & End(Word, MAU)==0]", resultType = "emuRsegs") %>% sort()
e.w.s = requery_hier(db, e.s, "Word", resultType = "emuRsegs") %>% sort()

# remove cervelli, cervello, tenete, prete
temp = label(e.w.s) %in% c("cervelli", "cervello", "tenete", "prete", "preti")

e.s = e.s[!temp,]%>% sort()
e.w.s = e.w.s[!temp,]%>% sort()

# get /e/ in these words
e.cer = query(db, "[[#MAU = e ^ Word = cervelli|cervello] -> MAU = ddZ]", resultType = "emuRsegs") %>% sort()
e.cer.w = requery_hier(db, e.cer, "Word", resultType = "emuRsegs") %>% sort()

e.ten = query(db, "[[#MAU = e ^ Word = tenete] -> MAU = t]", resultType = "emuRsegs") %>% sort()
e.ten.w = requery_hier(db, e.ten, "Word", resultType = "emuRsegs") %>% sort()

e.preti = query(db, "[[#MAU = e ^ Word = prete|preti] -> MAU = v]", resultType = "emuRsegs") %>% sort()
e.preti.w = requery_hier(db, e.preti, "Word", resultType = "emuRsegs") %>% sort()

e.s = rbind(e.s, e.cer, e.ten, e.preti)%>% sort()
e.w.s = rbind(e.w.s, e.cer.w, e.ten.w, e.preti.w)%>% sort()

# check same number of rows
nrow(e.s) == nrow(e.w.s)
# check start time of e.s is always bigger than start time of word
# should all be F
any(start(e.s) < start(e.w.s))

# if this gives error, use the test at line 67 to get what the problem is

E.s = query(db, "[MAU=E & End(Word, MAU)==0]",resultType = "emuRsegs") %>% sort()
E.w.s = requery_hier(db, E.s, "Word", resultType = "emuRsegs") %>% sort()

nrow(E.s) == nrow(E.w.s)
any(start(E.s) < start(E.w.s))

o.s = query(db, "[MAU=o|O|jo & End(Word, MAU)==0]",resultType = "emuRsegs") %>% sort()
o.w.s = requery_hier(db, o.s, "Word", resultType = "emuRsegs") %>% sort()

#if this (or anything here!) gives error:
#m = match(o.s$bundle, o.w.s$bundle)
#segs = o.s[is.na(m),]
#segs
#serve(db, seglist = segs)

nrow(o.s) == nrow(o.w.s)
any(start(o.s) < start(o.w.s))

a.s = query(db, "[MAU=a & End(Word, MAU)==0]", resultType = "emuRsegs") %>% sort()
a.w.s = requery_hier(db, a.s, "Word", resultType = "emuRsegs") %>% sort()
# remove capelli  capello cappelli cappello capretta capretti capretto pensate anello anelli
vec = c("capelli", "capello", "cappelli", "cappello", "capretta", "capretta", "capretto", "pensate", "anelli", "anello", "capretti", "trovi", "trovate", "pensi")
temp = label(a.w.s) %in% vec
a.s = a.s[!temp,]%>% sort()
a.w.s = a.w.s[!temp,]%>% sort()

nrow(a.s) == nrow(a.w.s)
any(start(a.s) < start(a.w.s))

#U, no problems
u.s = query(db, "[MAU=u|ju & End(Word, MAU)==0]", resultType = "emuRsegs") %>% sort()
u.w.s = requery_hier(db, u.s, "Word", resultType = "emuRsegs") %>% sort()
# remove "coltelli coltello nera neri nero pecora  pecore
temp = label(u.w.s) %in% c("coltelli", "coltello", "nera", "neri", "nero", "pecora", "pecore")

u.s = u.s[!temp,]%>% sort()
u.w.s = u.w.s[!temp,]%>% sort()

nrow(u.s) == nrow(u.w.s)
any(start(u.s) < start(u.w.s))


# /i/

i.s = query(db, "[MAU=i & End(Word, MAU)==0]", resultType = "emuRsegs") %>% sort()
i.w.s = requery_hier(db, i.s, "Word", resultType = "emuRsegs") %>% sort()
# IF IT GIVES PROBLEMS HERE this may mean that some elements are not bound in the hierarchy.
# find out which ones:
# a=levels(factor(i.s$bundle))
# b=levels(factor(i.w.s$bundle))
# setdiff(a, b)

# this method is the same as the one with "m" at line 67 above

# CHECK what words have 2 /i/s in KAN (final vowel excluded, this was already sorted out in the first query)

# i.kan = requery_hier(db, i.s, "KAN", resultType = "emuRsegs") %>% sort()
# 
# levels(factor(i.kan$labels))


# remove these
temp = label(i.w.s) %in% c("anelli","apri", "aprite", "bevete", "bevi", "cervelli", "cervello", "correte", "corri", 
                           "donna", "donne", "dormi", "dormite", "esci", "ginocchio", "ginocchia", "morite", "muori",
                           "nipote", "nipoti", "pettine", "pettini", "prete", "preti", "tieni", "topi", "topo", "uomini",
                           "uomo", "uscite", "volete", "dita", "dito", "vedete", "vedi", "volete", "cenere" )
i.s = i.s[!temp,]
i.w.s = i.w.s[!temp,]

i.bevete = query(db, "[[#MAU = i ^ Word = bevete|bevi] -> MAU = v]", resultType = "emuRsegs") %>% sort()
i.bevete.w = requery_hier(db, i.bevete, "Word",resultType = "emuRsegs") %>% sort()

#nrow(i.bevete) == nrow(i.bevete.w)
#any(start(i.bevete) < start(i.bevete.w))

i.dit = query(db, "[[#MAU = i ^ Word = dita|dito] -> MAU = j]", resultType = "emuRsegs") %>% sort()
i.dit.w = requery_hier(db, i.dit, "Word", resultType = "emuRsegs") %>% sort()

# nrow(i.dit) == nrow(i.dit.w)
# any(start(i.dit) < start(i.dit.w))

i.donne = query(db, "[[#MAU = i ^ Word = donna|donne] -> MAU = mm]", resultType = "emuRsegs") %>% sort()
i.donne.w = requery_hier(db, i.donne, "Word", resultType = "emuRsegs") %>% sort()

# nrow(i.donne) == nrow(i.donne.w)
# any(start(i.donne) < start(i.donne.w))

i.ved = query(db, "[[#MAU = i ^ Word = vedi|vedete] -> MAU = d]", resultType = "emuRsegs") %>% sort()
i.ved.w = requery_hier(db, i.ved, "Word", resultType = "emuRsegs") %>% sort()

#ecc

i.cen = query(db, "[[#MAU = i ^ Word = cenere] -> MAU = nn]", resultType = "emuRsegs") %>% sort()
i.cen.w = requery_hier(db, i.cen, "Word", resultType = "emuRsegs") %>% sort()

 
i.inilli = query(db, "[[#MAU = i ^ Word = anelli] -> MAU = ll|l]", resultType = "emuRsegs") %>% sort()
i.inilli.w = requery_hier(db, i.inilli, "Word", resultType = "emuRsegs") %>% sort()


i.cerv = query(db, "[[#MAU = i ^ Word = cervelli|cervello] -> MAU = ll|l|ddZ]", resultType = "emuRsegs") %>% sort()
i.cerv.w = requery_hier(db, i.cerv, "Word", resultType = "emuRsegs") %>% sort()


i.pett = query(db, "[[#MAU = i ^ Word = pettine|pettini] -> MAU = tt]", resultType = "emuRsegs") %>% sort()
i.pett.w = requery_hier(db, i.pett, "Word", resultType = "emuRsegs") %>% sort()


i.prete = query(db, "[[#MAU = i ^ Word = prete|preti] -> MAU = v]",resultType = "emuRsegs") %>% sort()
i.prete.w = requery_hier(db, i.prete, "Word", resultType = "emuRsegs") %>% sort()

i.s = rbind(i.s, i.bevete, i.cen, i.cerv, i.dit, i.donne, i.inilli, i.pett, i.prete, i.ved) %>% sort()
i.w.s = rbind(i.w.s, i.bevete.w, i.cen.w, i.cerv.w, i.dit.w, i.donne.w, i.inilli.w, i.pett.w, i.prete.w, i.ved.w) %>% sort()

nrow(i.s) == nrow(i.w.s)
any(start(i.s) < start(i.w.s))

#to check where further problems may be located:

#table(start(i.s) < start(i.w.s))

#which(start(i.s) < start(i.w.s))

###################################################

# get target vowels and their corresponding words
target.s = rbind(e.s, E.s, o.s, a.s, i.s, u.s) %>% sort()
target.w = rbind(e.w.s, E.w.s, o.w.s, a.w.s, i.w.s, u.w.s) %>% sort()

# check same number of rows and that start of target is less than start of word
# should be T, F
nrow(target.s) == nrow(target.w)
any(start(target.s) < start(target.w))

# verify that there are no duplicated utterances
length(unique(target.s$bundle))
# should be zero
tempdup = duplicated(target.s$bundle)
sum(tempdup)
#if it is not zero, check for duplicates: target.s$bundle[tempdup]

#####################

# get all word-final segments
final.s = query(db, "[MAU!= xxxx & End(Word, MAU) == 1]", resultType = "emuRsegs") %>% sort()

# get the corresponding words
final.w = requery_hier(db, final.s, "Word", resultType = "emuRsegs") %>% sort()


# find which final.w are in target.w
temp = final.w$bundle %in% target.w$bundle

mat = NULL
n = 1:nrow(final.w)
for(j in 1:nrow(target.w)){
  temp = final.w$bundle == target.w$bundle[j]
  mat = c(mat, n[temp])
}

# these should be the same
all(final.w$bundle[mat] == target.w$bundle)
# so these are the trigger vowels
trigger.s = final.s[mat,]

# verify that the bundles of the target and trigger vowels are the same
all(target.s$bundle == trigger.s$bundle)
# verify that the start time of the trigger vowels is greater than the start time of the target vowels
all(start(trigger.s) > start(target.s))

# here we list the word, target vowel, trigger vowel
t.labs = data.frame(Word = target.w$labels, Target = target.s$labels, Trigger =trigger.s$labels)
t.labs

# these are the unique ones
u = with(t.labs, unique(interaction(Word, Target, Trigger)))

# here are the labels of trigger vowels
table(label(trigger.s))
# there are some that are not /a, e, i, u/ These are the ones
temp = label(trigger.s) %in% c("a", "e", "i", "u")

sum(!temp) 
# check and fix eventual errors in the annotation:
#trigfalse=trigger.s %>% filter(labels %in% c("u<p:>"))
#serve(db, seglist=trigfalse, useViewer=F)

###################################
##### sort and create labels ######
###################################

target.w2  = sort(target.w)
p = match(target.w2$bundle, target.w$bundle)
target.s2 = target.s[p,]
trigger.s2 = trigger.s[p,]
target.phon2 = requery_hier(db, target.w2, "MAU", resultType = "emuRsegs") %>% sort()

all(target.w2$bundle==target.phon2$bundle)
all(start(target.w2) == start(target.phon2))
all(target.s2$bundle==target.phon2$bundle)
all(target.s2$bundle==target.s2$trigger)
# now rename
target.s = target.s2
target.w = target.w2
target.phon = target.phon2
trigger.s = trigger.s2

labs.df = data.frame(V = label(target.s), Word= label(target.w), P = label(target.phon), Trig = label(trigger.s))

# check that Word and Trig are matched
u = sort(unique(with(labs.df, as.character(interaction(Word, Trig)))))
u

##############
# add in information about the stem
##############

stem.info = read.table("stem.txt", header=T)  # check file before running if it looks good
                                   
# match stem to labs.df
m = match(as.character(labs.df$Word), as.character(stem.info$Word))
stems = as.character(stem.info$STEM[m])
labs.df = data.frame(labs.df, Stem = factor(stems))
# check
o = with(labs.df, as.character(interaction(Word, Stem)))
# seems OK
sort(table(o))

# add speaker info and utterance info
labs.df = data.frame(labs.df, speaker = target.s$session, bundle = target.s$bundle)

##############
# get formants of the target
##############

target.fm  = get_trackdata(db, target.s, "FORMANTS", resultType="tibble")

# linearly time-normalise to 11 data points.
target.fmt = normalize_length(target.fm, N=11)

# ALTERNATIVE WITH TIDYVERSE, but it deletes many variables:

# N = 11
# target.fmt = target.fm %>%
#   group_by(sl_rowIdx, labels) %>%
#   summarise(F2n = approx(times_rel, T2, n = N)$y,
#             times = seq(0, 1, length.out = N)) %>%
#  ungroup()

###############

# convert to data-frame
target.fmt %>% setDT

# choose the columns you want to filter
Fcols <- c("T1", "T2")

# run the filter, separately for each track (by = sl_rowIdx)
# I chose order = 5, you can always play around with it

target.fmt[, (Fcols) := lapply(.SD, runmed, k = 5), .SDcols = Fcols, by = sl_rowIdx]

# if you really have to, undo the data.tabling, so v.fmt.sent is back to data.frame
# convert back to data-frame
target.fmt %>% setDF

# round the times_norm to the nearest decimal place
target.fmt$times_norm = round(target.fmt$times_norm, 1)

# add in Word labels
target.fmt$Word= factor(rep(as.character(labs.df$W), table(target.fmt$sl_rowIdx)))
# add in Stem labels
target.fmt$Stem = factor(rep(as.character(labs.df$Stem), table(target.fmt$sl_rowIdx)))
# add in the labels of the trigger vowels
target.fmt$Trig = factor(rep(label(trigger.s), table(target.fmt$sl_rowIdx)))

# check for words that are not in the word list (rows of labs.df)
a=labs.df %>% filter(!Word%in% stem.info$Word)

# IMPORTANT: with these lines you may check further annotation errors, if needed:
#levels(factor(a$bundle))
#levels(factor(a$Word)) 

# create local copies of what we need and detach
D <- target.fmt
D %>% setDT
# bind info about phonological triggers and targets
phonology <- fread("words and phontrig.csv", header = TRUE) # CHECK IF FILE LOOKS GOOD before running this line
D %<>% .[phonology, on = c(Word= "Word"), nomatch = 0]
D %>% setnames('labels', 'Target')
D %>% setnames(paste0('T', 1:5), paste0('F', 1:5))
D %>% setnames('session', 'speaker')

# Lobanov normalisation
Fcols <- c('F1', 'F2')
FNcols <- c('F1n', 'F2n')
D[, (FNcols) := lapply(.SD, function(x) as.vector(scale(x))), .SDcols = Fcols, by = speaker]
# Bark scale
FBcols <- c('F1b', 'F2b')
D[, (FBcols) := lapply(.SD, bark), .SDcols = Fcols]

# the final D is a dataframe with all F values related also to phonological targets
# similar to target.fmt

#exclude outliers (second person plural of some verbs)
D %<>% .[!Word %in% c("tenete", "morite", "pensate", "trovate", "volete")]

# and if we only want stems /e, o/:
D %<>% .[Stem_vowel %in% c("e", "o")]

# all columns there?
names(D)

#let's exclude some of them:
D = D %>% select(-c(start, end, utts, db_uuid, start_item_id, end_item_id, level, start_item_seq_idx, end_item_seq_idx, type,
                    sample_start, sample_end, sample_rate,times_orig, F3, F4, F5))

###################################################################################

# what columns do we have now?
names(D)

# [1] "sl_rowIdx"    "Target"       "speaker"      "bundle"       "times_rel"    "times_norm"   "F1"           "F2"          
# [9] "Word"         "Stem"         "Trig"         "Stem_vowel"   "Suffix_vowel" "F1n"          "F2n"          "F1b"         
# [17] "F2b"

# list of speakers

levels(factor(D$speaker))

# ADD FURTHER INFO: Village and Region

# add village

mm = D %>% filter(str_detect(speaker, "MM"))
ca = D %>% filter(str_detect(speaker, "CA"))
cc = D %>% filter(str_detect(speaker, "CC"))
cv = D %>% filter(str_detect(speaker, "CV"))
la = D %>% filter(str_detect(speaker, "LA"))
mg = D %>% filter(str_detect(speaker, "MG"))
om = D %>% filter(str_detect(speaker, "OM"))
ri = D %>% filter(str_detect(speaker, "RI"))
sl = D %>% filter(str_detect(speaker, "SL"))
tr = D %>% filter(str_detect(speaker, "TR"))
vi = D %>% filter(str_detect(speaker, "VI"))
li = D %>% filter(str_detect(speaker, "LI"))
sc = D %>% filter(str_detect(speaker, "SC"))
sd = D %>% filter(str_detect(speaker, "SD"))
sh = D %>% filter(str_detect(speaker, "SH"))
vb = D %>% filter(str_detect(speaker, "VB"))

mm$Village <- "Mormanno"
ca$Village <-  "Canna"
cc$Village <- "Cerchiara" 
cv$Village <- "Castrovillari" 
la$Village <- "Lauria" 
mg$Village <- "Montegiordano" 
om$Village <- "Orsomarso" 
ri$Village <- "Rocca Imperiale" 
sl$Village <- "San Lorenzo Bellizzi"
tr$Village <- "Trebisacce"
vi$Village <- "Villapiana" 
li$Village <- "Laino" 
sc$Village <- "Scalea" 
sd$Village <- "S. Domenica Talao" 
sh$Village <- "Schiavonea" 
vb$Village <- "Verbicaro"

#you want to know how many speakers per village?

levels(factor(mm$speaker)) %>% length()


D1=rbind(mm, ca, cc, cv, la, mg, om, ri, sl,tr,vi, li, sc, sd, sh, vb)

#check that D1 has same number of rows as D

nrow(D1) == nrow(D)

D = D1

rm(D1, mm, ca, cc, cv, la, mg, om, ri, sl,tr,vi, li, sc, sd, sh, vb)

#distribution tokens/village

table(D$Village) %>% sort(decreasing=T) %>% as.data.frame() #numbers are number of tokens per village

# 1              Mormanno 56848    # 28 speakers
# 2             Cerchiara 22220    # 11 speakers
# 3                Scalea 20306    # 10 speakers
# 4            Trebisacce 16203    #  9 speakers
# 5             Verbicaro 15653    #  8 speakers
# 6             Orsomarso 10879    #  5 speakers
# 7            Villapiana 10241    #  6 speakers
# 8  San Lorenzo Bellizzi  7953    #  5 speakers
# 9         Castrovillari  6875    #  3 speakers
# 10        Montegiordano  4290    #  2 speakers
# 11      Rocca Imperiale  4059    #  2 speakers
# 12                Laino  3531    #  6 speakers
# 13               Lauria  2376    #  1 speaker
# 14    S. Domenica Talao  2233    #  1 speaker
# 15                Canna  2134    #  1 speaker
# 16           Schiavonea  2090    #  1 speaker



# add region: NB "Region" here is based on metaphonic patterns also detected in Greca et al. 2024 (LabPhon) rather than geographically
# (read explanations in the comments below)

# Mormanno (MM)

MM=D %>%dplyr::filter(speaker %in% c("MM02F", "MM03F", "MM03M", "MM04F", "MM04M", "MM05F", "MM05M", "MM06F", "MM06M",
                                     "MM07F", "MM07M", "MM08F", "MM08M", "MM09F", "MM09M", "MM10F", "MM11F", "MM11M",
                                    "MM12F", "MM12M", "MM13F", "MM13M", "MM14F", "MM14M", "MM15F", "MM15M", "MM16F",
                                     "MM16M"))

# Mittelzone (MZ)

# NB: compared to Greca et al. 2024, MZ now includes also Orsomarso (a monophthongal isle in the Zwischenzone), 
# Lauria (which is Mittelzone but the western, Basilicata side),
# and Castrovillari (Suedzone, monopht. metaphony)


MZ=D %>%dplyr::filter(speaker %in% c("CA01F", 
                                     "CC01F", "CC01M", "CC02F", "CC02M", "CC03F", "CC03M", "CC04F", "CC05F", "CC06F", "CC07F", "CC08F", 
                                     "CV01F", "CV02F", "CV03F", 
                                     "LA01F",
                                     "MG01M", "MG02M",
                                     "OM01F", "OM01M", "OM02F", "OM02M", "OM03F",
                                     "RI01M", "RI02M",
                                     "SL01F", "SL01M", "SL02F", "SL02M", "SL03F",
                                     "TR01F", "TR01M", "TR02F", "TR02M", "TR03F", "TR03M", "TR04F", "TR05F", "TR06F",
                                     "VI01F", "VI01M", "VI02F", "VI02M", "VI03F", "VI03M"))

# Zwischenzone (ZZ)

# NB: compared to Greca et al. 2024, this also includes Schiavonea, which belongs to the East coast, Suedzone

ZZ=D %>%dplyr::filter(speaker %in% c("LI01M", "LI02M", "LI03M", "LI04M", "LI05M", "LI06M",  
                                     "SC01F", "SC01M", "SC02F", "SC02M", "SC03F","SC03M", "SC04M", "SC05M", "SC06M", "SC07M", 
                                     "SD01F", 
                                     "SH01M",
                                     "VB01F", "VB01M", "VB02F", "VB02M", "VB03F", "VB04F", "VB05F" ,"VB06F"))


MM$Region<-"MM"
MZ$Region<-"MZ"
ZZ$Region<-"ZZ"

D1=rbind(MM, MZ, ZZ)


# some data:
table(D$Region) %>% sort(decreasing=T) %>% as.data.frame() #numbers are number of tokens per region

levels(factor(MM$speaker)) %>% length()
levels(factor(ZZ$speaker)) %>% length()
levels(factor(MZ$speaker)) %>% length()

# 1   MZ 87230  # 45 speakers
# 2   MM 56848  # 28 speakers
# 3   ZZ 43813  # 26 speakers

#check that D1 has same number of rows as D

nrow(D1) == nrow(D)

#if so:

rm(MM, ZZ, MZ)

D=D1

# Add info about sex:

spk=levels(factor(D$speaker)) %>% as.data.frame()

f = spk %>% dplyr::filter(str_detect(., "F")) #53

levels(factor(f$.)) # to see the list of females

D = D  %>%
  mutate(Sex = ifelse(speaker %in% 
                        c( "CA01F", "CC01F", "CC02F", "CC03F", "CC04F", "CC05F", "CC06F", "CC07F", "CC08F", "CV01F", "CV02F", "CV03F", "LA01F", "MM02F", "MM03F", "MM04F", "MM05F",
                           "MM06F", "MM07F", "MM08F", "MM09F", "MM10F", "MM11F", "MM12F", "MM13F", "MM14F", "MM15F" ,"MM16F", "OM01F" ,"OM02F", "OM03F", "SC01F", "SC02F", "SC03F",
                           "SD01F", "SL01F", "SL02F", "SL03F", "TR01F", "TR02F", "TR03F", "TR04F", "TR05F", "TR06F" ,"VB01F" ,"VB02F", "VB03F", "VB04F", "VB05F", "VB06F", "VI01F",
                           "VI02F", "VI03F"), "F", "M"))

# Add info about age:

setwd("file-path_Secondary_dataset")

age_info = read.table("age-education-info.txt", header=T)
D%<>%dplyr::right_join(age_info, by="speaker")

# save results:

write.table(D, "D.txt")


###################################
#######    DCT analysis  ##########
###################################

# D=read.table("D.txt")

# 1. Plots of aggregated formants

# We do this to have a look at how stem vowel formants generally look like 
# depending on coarticulation with different suffix vowels

#F1

v.mean.F1 = D %>%
  # 2 vowels
  filter(Stem_vowel %in% c("e", "o") ) %>%
  # for each label, speaker and normalised time point
  group_by(speaker, Village, times_norm, Suffix_vowel, Stem_vowel) %>%
  # calculate mean F2
  summarise(F1n = mean(F1n)) %>%
  ungroup()


# If you want to have a look at the formants, you can plot them with the code lines below:

a=v.mean.F1 %>% filter (Stem_vowel == "e") %>%
  ggplot +
  # plot F1 as a function of normalised time
  # colour-coded by Suffix_vowel type
  aes(y = F1n, x = times_norm, col=Suffix_vowel, group=Suffix_vowel) +
  facet_wrap(.~Village, nrow=2)+
  geom_smooth()+
  theme_light() +
  ggtitle("Stem-/e/")+
  theme(strip.text.x = element_text(color = "black"), strip.text.y = element_text(color = "black"), 
        text = element_text(size = 12), legend.position = "none")

b=v.mean.F1 %>% filter (Stem_vowel == "o") %>%
  ggplot +
  aes(y = F1n, x = times_norm, col=Suffix_vowel, group=Suffix_vowel) +
  facet_wrap(.~Village, nrow=2)+
  geom_smooth()+
  theme_light() +
  ggtitle("Stem-/o/")+
  theme(strip.text.x = element_text(color = "black"), strip.text.y = element_text(color = "black"), 
        text = element_text(size = 12), legend.position = "top") 

grid.arrange(b, a, nrow=2)

# Now let's move on to the analysis with DCTs

# 2. DCT analysis on F1

# What to expect, after observing the plots above:

# - k0 is proportional the signal’s mean: we expect higher values for /a/-suffixed forms and lower ones for the other suffixes
# - k1 is proportional the signal’s linear slope: this may mark the difference between MM/MZ and ZZ
# - k2 is proportional the signal’s curvature: should be somehow similar between stems, although /a/- suffixed
#                                            forms are somehow more "curved"
# - k3 is proportional the signal’s skew: F1 in /e/ is more skewed towards the onset, for /o/ it is more at the midpoint/offset


# Let us now extract the DCTs 

v.dct = D %>%
  # for each segment i.e. sl_rowIdx
  # vowels and speaker are included in order
  # to be able to identify them in the resulting data-frame
  group_by(sl_rowIdx, bundle, speaker, Village, Region, Stem_vowel, Suffix_vowel, Word, Stem, Sex, age, education) %>%
  # calculate DCT coeffs. k0...k3 separately for F1
  # the 2nd argument is 3 to calculate k0...k3
  summarise(F1dct = emuR::dct(F1n, 3)) %>%
  # add a column named k showing which DCT-coefficient
  mutate(k = 0:3) %>%
  ungroup()

# Test to check that everything is fine: 

D %>%
  filter(sl_rowIdx == 2) %>%
  pull(F1n) %>% emuR::dct(., 3)

# should be the same as:

v.dct %>%
  filter(sl_rowIdx == 2) %>%
  pull(F1dct)


# 3. some scatter plots of DCT coefficients

#The task is now to plot all data points from the four vowel categories in the plane of k0 × k1.

v.dctwide = 
  v.dct %>%
  pivot_wider(names_from = k, 
              values_from = F1dct)

# add GRAM! the column for Grammatical category: verbs (V) vs nouns and adjectives (NAdj))

v.dctwide = v.dctwide  %>%
  mutate(Gram = ifelse(Word %in% 
                         c("anelli", "anello", "bella", 
                           "bello", "cappelli", "cappello", 
                           "capretta", "capretti",
                           "capretto", "cervelli", "cervello",
                           "coltelli", "coltello", "dente", 
                           "denti" ,   "donna"  , 
                           "donne", "ferri",    
                           "ferro" ,   "letti",    "letto" ,  
                           "mela",     "mele" ,    "mese",   
                           "mesi",     "pecora" ,  "pecore", 
                           "pesca",    "pesche" ,   "pettine", 
                           "pettini",  "pezza",    "pezzo",   
                           "piede",    "piedi",    "pietra",  
                           "pietre",   "prete",    "preti",    
                           "sedia",    "sedie",    "stella" , 
                           "stelle", "venti",    "vento",  
                           "vecchia",  "vecchio",  "verme", "vermi",
                           "buona",  "buone",  "buoni", 
                           "buono",  "corna",  "corno", 
                           "cotta",  "cotto",  "cuore",  "cuori" ,   
                           "foglia", "foglie","fuochi", "fuoco",  
                           "grossa", "grosso", "lunga",  "lungo",  "lunghi",
                           "morta",  "morti",  "morto",    "nipote",
                           "nipoti", "nuova",  "nuovo", 
                           "occhi",  "occhio", "ossa",   "osso",   
                           "ponte",  "ponti",  "porci",  "porco",  
                           "rosa",   "rose",   "ruota",  "ruote" ,
                           "sole", "sposa","sposo", 
                           "topi",   "topo", "uomini", "uomo",  
                           "uova",   "uovo",    "zoppa",  "zoppo",
                           "chiodo", "chiodi", "fiore", "fiori", "gioco", "giochi"),
                       
                       "NAdj", "V"))

# "v.dctwide" is the final dataframe we analyse in the RMD file containing the analyses in the paper

# save results, you'll need the dataframe for the analyses in the Rmd:

write.table(v.dctwide, "D_dct.txt")
####################################################

###################################
########## SUFFIX ANALYSIS ########  we need this for section 4 of the paper ("Relationship between stem and suffix vowel height")
###################################

#re-load emuDB

db = load_emuDB(file.path(pfad, "db_emuDB"))


################################## QUERY VOWELS we are interested in
# 1. find all vowel suffixes

suff.a = query(db, "[MAU=a & End(Word, MAU)==1]",resultType = "emuRsegs") %>% sort()
suff.e = query(db, "[MAU=e & End(Word, MAU)==1]",resultType = "emuRsegs") %>% sort()
suff.i = query(db, "[MAU=i & End(Word, MAU)==1]",resultType = "emuRsegs") %>% sort()
suff.u = query(db, "[MAU=u & End(Word, MAU)==1]",resultType = "emuRsegs") %>% sort()

aaa=requery_hier(db, suff.a, "Word", resultType = "emuRsegs") %>% sort()
suff.a$Word = aaa$labels

bbb=requery_hier(db, suff.e, "Word", resultType = "emuRsegs") %>% sort()
suff.e$Word = bbb$labels

ccc=requery_hier(db, suff.i, "Word", resultType = "emuRsegs") %>% sort()
suff.i$Word = ccc$labels

ddd=requery_hier(db, suff.u, "Word", resultType = "emuRsegs") %>% sort()
suff.u$Word = ddd$labels

# 2. find all stressed vowels /a, i, e, o, u/, mid stem must be NON-metaphonic

aeiou = query(db, "[MAU=a|i|e|E|o|O|u & End(Word, MAU)==0 ^ Word = braccia|cane|casa|case|cenere|
                                                                   bella|buona|buone|croce|croci|corta|
                                                                   corto|corti|dita|dito|
                                                                   ferri|ferro|forno|forni|fredda|freddi|freddo|latte|
                                                                   legna|legno|noce|noci|peli|pelo|pesce|pesci|pietra|
                                                                   pietre|sedia|sedie|ponte|rosa|rose|ruota|ruote|santa|
                                                                   stella|stelle|unghia|unghie|vacca|vacche|
                                                                   
                      corna|cotta|cuore|dente|dorme|
                    esce|grossa|lunga|mese|morta|
                 nuova|ossa|pensa|pezza|piede|tiene|trova|uova|vecchia|zoppa|vuole|corro|corre|bevo|beve]",resultType = "emuRsegs") %>% sort()
aaa=requery_hier(db, aeiou, "Word", resultType = "emuRsegs") %>% sort()
aeiou$Word = aaa$labels # adds column "Word"

a=dplyr::filter(aeiou, labels == "a")

e=dplyr::filter(aeiou, labels %in% c("e", "E"))

i=dplyr::filter(aeiou, labels == "i")

o=dplyr::filter(aeiou, labels  %in%  c("o", "O"))

u=dplyr::filter(aeiou, labels == "u")

# 3a. new column, mark all suffix vowels with a "minus"

suff.a$Vowel<-"-a"
suff.e$Vowel<-"-e"
suff.i$Vowel<-"-i"
suff.u$Vowel<-"-u"

suff=rbind(suff.a, suff.e, suff.i, suff.u)
suff$Vowel_Suffix<-"Suffix vowel"

# 3b. new column in "aeiou", mark all stressed vowels with a "plus"
a$Vowel<-"a+"
e$Vowel<-"e+"
i$Vowel<-"i+"
o$Vowel<-"o+"
u$Vowel<-"u+"

aeiou=rbind(a, e, i, o, u)
aeiou$Vowel_Suffix<-"Stem vowel"

#bind two dfs if needed

D = rbind(suff, aeiou)

rm(a, aaa, bbb,ccc,ddd,e,i,o,u,suff.a,suff.e,suff.i,suff.u) 
#get track data


D = get_trackdata(db, D, "FORMANTS", resultType="tibble") 

#in case of problems (bundle is an example):
#serve(db, bundlePattern = "MM06M0007A1_114", useViewer=F)

# linearly time-normalise to 11 data points
D = normalize_length(D, N=11)

D = mutate(D, Duration = D$end - D$start)
#filter out too short vowels
#D = dplyr::filter(D, Duration > 25)

# speaker-normalise

#D must be a data.frame!!!!

D %>% setDF()


mat = matrix(0, ncol = 2, nrow=nrow(D))
# column names
nm = names(D)
# for each speaker
for(j in unique(D$session)){
  # find that speaker's observations
  temp.spk = D$session == j
  # find that speaker's /i, a, u/ observations in PhonTarget
  temp.iau =  D$session == j & D$Vowel %in% c("i+", "a+", "u+")
  # a counter for the next for-loop
  k = 1
  # separately for F1 and F2
  for(i in c("T1", "T2")){
    # calculate the speaker's mean in /i, a, u/
    form.mean = mean(D[temp.iau,names(D) ==  i]%>%unlist)
    # and the speaker's standard deviation
    form.sd = sd(D[temp.iau,names(D) ==  i]%>%unlist)
    # subtract mean from all speaker's vowels and divide by sd
    # and store the results in column k of mat
    mat[temp.spk, k] = ((D[temp.spk,names(D) ==  i] - form.mean)/form.sd) %>% unlist
    # advance counter (k will only reach 2)
    k = k+1
  }
}

# should be F
any(mat==0)

# bind in the normalised formants
D = data.frame(D, F1n = mat[,1], F2n = mat[,2])

suffixes=filter(D, Vowel_Suffix == "Suffix vowel")

#NB we do not need words with other stem vowels:


suffixes = suffixes %>% dplyr::filter(Word %in% 
                                        c(  "anelli",   "anello",   "bella",    "bello",    "buona",    "buone",    "buoni",   
                                            "buono",    "cappelli", "cappello", "capretta", "capretti", "capretto", "cervelli",
                                            "cervello", "coltelli", "coltello", "corna" ,   "corno" ,   "cotta",    "cotto",   
                                            "cuore",    "cuori",    "dente",    "denti",    "donna",    "donne",    "dorme" ,  
                                            "dormi",    "dormo" ,   "esce",     "esci" ,    "esco",     "ferri" ,   "ferro" ,  
                                            "foglia",   "foglie",   "fuochi",   "fuoco",    "grossa",   "grosso",   "letti" ,  
                                            "letto",    "lunga" ,   "lungo",    "mela",     "mele",     "mese" ,    "mesi",    
                                            "morta",    "morti",    "morto",    "muoio",    "muori",    "nipote",   "nipoti",  
                                            "nuova",    "nuovo" ,   "occhi",    "occhio",   "ossa" ,    "osso" ,    "pecora",  
                                            "pecore",   "pensa",    "pensi",    "penso",    "pesca",    "pesche",   "pettine", 
                                            "pettini",  "pezza",    "pezzo",    "piede",    "piedi",    "pietra",   "pietre" , 
                                            "ponte" ,   "ponti",    "porci",    "porco",    "prete",    "preti",    "rosa" ,   
                                            "rose" ,    "ruota",    "ruote",    "sedia",    "sedie",    "sole",     "sposa",   
                                            "sposo",    "stella",   "stelle",   "tengo",    "tiene",    "tieni",    "topi",    
                                            "topo",     "trova",    "trovi",    "trovo",    "uomini",   "uomo",     "uova" ,   
                                            "uovo",     "vecchia",  "vecchio",  "venti",    "vento" ,   "verme",    "vermi",   
                                            "voglio",   "vuoi",     "zoppa",    "zoppo"    ))


suff.fm.mean = suffixes %>% group_by(session, Word, Vowel, sl_rowIdx, bundle, Duration) %>%
  summarise(F1 = mean(T1), F2 = mean(T2), F1n = mean(F1n), F2n = mean (F2n))

#change name of Duration to SuffDuration

names(suff.fm.mean)[names(suff.fm.mean) == "Duration"] <- "SuffDuration"

#check

names(suff.fm.mean)

#save results
write.table(suff.fm.mean, "suff_mean.txt")
############################################################################################