####    Supplementary materials: Script for extraction of primary data for:           ####
####                                                                                  ####
####  "The relationship between the coarticulatory source and effect in sound change: ####
####  evidence from Italo-Romance metaphony in the Lausberg area."                    ####
####                                                                                  ####
####  by Pia Greca, Michele Gubian, and Jonathan Harrington                           ####
##########################################################################################


#1. Preliminaries

# We load all packages necessary and the acoustic database

library(data.table)
library(tidyverse)
library(magrittr)
library(fda)
library(gridExtra)
library(emuR)

# extra scripts needed for FPCA in the folder "scripts-for-FPCA" 
fpcaRootDir <- "your_directory/scripts-for-FPCA"
#fpcaRootDir <- "C:/Users/49178/Documents/FPCA35_all/scripts/"
source(file.path(fpcaRootDir, "defint.fd.R"))
source(file.path(fpcaRootDir, "fdPar.R"))
source(file.path(fpcaRootDir, "getPCscores.R"))
source(file.path(fpcaRootDir, "landmarkreg.nocurve.R"))

setwd("the_directory_in_which_the_db_is")

db = load_emuDB("db_emuDB")

#if you want to have a look at it:

#serve(db, useViewer = F)

# Otherwise, move on to creating segment lists to extract data from the stem vowels
######################################################################################

#2. Formant data from stem vowels

# /e/
e.s = query(db, "[MAU=e|je & End(Word, MAU)==0]", resultType = "emuRsegs")%>%sort()
e.w.s = requery_hier(db, e.s, "Word", resultType = "emuRsegs")%>%sort()

# remove cervelli, cervello, tenete, prete
temp = label(e.w.s) %in% c("cervelli", "cervello", "tenete", "prete", "preti")

e.s = e.s[!temp,]
e.w.s = e.w.s[!temp,]

# get /e/ in these words
e.cer = query(db, "[[#MAU = e ^ Word = cervelli|cervello] -> MAU = ddZ]", resultType = "emuRsegs")%>%sort()
e.cer.w = requery_hier(db, e.cer, "Word", resultType = "emuRsegs")%>%sort()

e.ten = query(db, "[[#MAU = e ^ Word = tenete] -> MAU = t]", resultType = "emuRsegs")%>%sort()
e.ten.w = requery_hier(db, e.ten, "Word", resultType = "emuRsegs")%>%sort()

e.preti = query(db, "[[#MAU = e ^ Word = prete|preti] -> MAU = v]", resultType = "emuRsegs")%>%sort()
e.preti.w = requery_hier(db, e.preti, "Word", resultType = "emuRsegs")%>%sort()

e.s = rbind(e.s, e.cer, e.ten, e.preti)
e.w.s = rbind(e.w.s, e.cer.w, e.ten.w, e.preti.w)


# /E/ 
E.s = query(db, "[MAU=E & End(Word, MAU)==0]",resultType = "emuRsegs")%>%sort()
E.w.s = requery_hier(db, E.s, "Word", resultType = "emuRsegs")%>%sort()


# /o/
o.s = query(db, "[MAU=o|O|jo & End(Word, MAU)==0]",resultType = "emuRsegs")%>%sort()
o.w.s = requery_hier(db, o.s, "Word", resultType = "emuRsegs")%>%sort()

# /a/
a.s = query(db, "[MAU=a & End(Word, MAU)==0]", resultType = "emuRsegs")%>%sort()
a.w.s = requery_hier(db, a.s, "Word", resultType = "emuRsegs")%>%sort()
# remove capelli  capello cappelli cappello capretta capretti capretto pensate anello anelli
vec = c("capelli", "capello", "cappelli", "cappello", "capretta", "capretta", "capretto", "pensate", "anelli", "anello", "capretti", "trovi", "trovate", "pensi")
temp = label(a.w.s) %in% vec
a.s = a.s[!temp,]
a.w.s = a.w.s[!temp,]


# /i/
i.s = query(db, "[MAU=i & End(Word, MAU)==0]", resultType = "emuRsegs")%>%sort()
i.w.s = requery_hier(db, i.s, "Word", resultType = "emuRsegs")%>%sort()
# remove these
temp = label(i.w.s) %in% c("anelli","apri", "aprite", "bevete", "bevi", "cervelli", "cervello", "correte", "corri", "donna", "donne", "dormi", "dormite", "esci", "ginocchio", "ginocchia", "morite", "muori", "nipote", "nipoti", "pettine", "pettini", "prete", "preti", "tieni", "topi", "topo", "uomini", "uomo", "uscite", "volete", "dita", "dito", "vedete", "vedi", "volete", "cenere")
i.s = i.s[!temp,]
i.w.s = i.w.s[!temp,]

# add the following information:

# first /i/ in bevete v->(i)->v->i->s->i"
# dita d->(i)->j->i->t->a"              
#"dito d->(i)->j->i->t->u"
#"donne f->(i)->mm->i->n->e"   
#vedete v->(i)->d->i->s->i"  
#vedi v->(i)->d->i->s->i" 
#"cenere"   "tS->i->nn->i->r->a"  

i.bevete = query(db, "[[#MAU = i ^ Word = bevete|bevi] -> MAU = v]", resultType = "emuRsegs")%>%sort()
i.bevete.w = requery_hier(db, i.bevete, "Word",resultType = "emuRsegs")%>%sort()

i.dit = query(db, "[[#MAU = i ^ Word = dita|dito] -> MAU = j]", resultType = "emuRsegs")%>%sort()
i.dit.w = requery_hier(db, i.dit, "Word", resultType = "emuRsegs")%>%sort()

i.donne = query(db, "[[#MAU = i ^ Word = donne] -> MAU = mm]", resultType = "emuRsegs")%>%sort()
i.donne.w = requery_hier(db, i.donne, "Word", resultType = "emuRsegs")%>%sort()

i.ved = query(db, "[[#MAU = i ^ Word = vedi|vedete] -> MAU = d]", resultType = "emuRsegs")%>%sort()
i.ved.w = requery_hier(db, i.ved, "Word", resultType = "emuRsegs")%>%sort()

i.cen = query(db, "[[#MAU = i ^ Word = cenere] -> MAU = nn]", resultType = "emuRsegs")%>%sort()
i.cen.w = requery_hier(db, i.cen, "Word", resultType = "emuRsegs")%>%sort()

i.inilli = query(db, "[[#MAU = i ^ Word = anelli] -> MAU = ll|l]", resultType = "emuRsegs")%>%sort()
i.inilli.w = requery_hier(db, i.inilli, "Word", resultType = "emuRsegs")%>%sort()

i.cerv = query(db, "[[#MAU = i ^ Word = cervelli|cervello] -> MAU = ll|l]", resultType = "emuRsegs")%>%sort()
i.cerv.w = requery_hier(db, i.cerv, "Word", resultType = "emuRsegs")%>%sort()

i.pett = query(db, "[[#MAU = i ^ Word = pettine|pettini] -> MAU = tt]", resultType = "emuRsegs")%>%sort()
i.pett.w = requery_hier(db, i.pett, "Word", resultType = "emuRsegs")%>%sort()

i.prete = query(db, "[[#MAU = i ^ Word = prete|preti] -> MAU = v]",resultType = "emuRsegs")%>%sort()
i.prete.w = requery_hier(db, i.prete, "Word", resultType = "emuRsegs")%>%sort()


i.s = rbind(i.s, i.bevete, i.dit, i.donne, i.ved, i.inilli, i.cerv, i.pett, i.prete, i.cen)
i.w.s = rbind(i.w.s, i.bevete.w, i.dit.w, i.donne.w, i.ved.w, i.inilli.w, i.cerv.w, i.cen.w, i.pett.w, i.prete.w)


# /u/
u.s = query(db, "[MAU=u|ju & End(Word, MAU)==0]", resultType = "emuRsegs")%>%sort()
u.w.s = requery_hier(db, u.s, "Word", resultType = "emuRsegs")%>%sort()
# remove "coltelli coltello nera neri nero pecora  pecore
temp = label(u.w.s) %in% c("coltelli", "coltello", "nera", "neri", "nero", "pecora", "pecore")
u.s = u.s[!temp,]
u.w.s = u.w.s[!temp,]

# get target vowels and their corresponding words
target.s = rbind(e.s, E.s, o.s, a.s, i.s, u.s)
target.w = rbind(e.w.s, E.w.s, o.w.s, a.w.s, i.w.s, u.w.s)

# check same number of rows and that start of target is less than start of word
# should be T, F
nrow(target.s) == nrow(target.w)
any(start(target.s) < start(target.w))
# if there are problems:
# which(start(target.s) < start(target.w))

# verify that there are no duplicated utterances
length(unique(target.s$bundle))
# should be zero
tempdup = duplicated(target.s$bundle)
sum(tempdup)

#if it is not zero, check for duplicates: target.s$bundle[tempdup]

# get all word-final segments
final.s = query(db, "[MAU!= xxxx & End(Word, MAU) == 1]", resultType = "emuRsegs")%>%sort()

# get the corresponding words
final.w = requery_hier(db, final.s, "Word", resultType = "emuRsegs")%>%sort()


# find which final.w are in target.w
temp = final.w$bundle %in% target.w$bundle

mat = NULL
n = 1:nrow(final.w)
for(j in 1:nrow(target.w)){
  temp = final.w$bundle == target.w$bundle[j]
  mat = c(mat, n[temp])
}

# these should be the same
all(final.w$bundle[mat] == target.w$bundle)
# so these are the trigger vowels
trigger.s = final.s[mat,]

# verify that the bundles of the target and trigger vowels are the same
all(target.s$bundle == trigger.s$bundle)
# verify that the start time of the trigger vowels is greater than the start time of the target vowels
all(start(trigger.s) > start(target.s))

# here we list the word, target vowel, trigger vowel
t.labs = data.frame(Word = target.w$labels, Target = target.s$labels, Trigger =trigger.s$labels)
t.labs

# these are the unique ones
u = with(t.labs, unique(interaction(Word, Target, Trigger)))

# here are the labels of trigger vowels
table(label(trigger.s))
# there are some that are not /a, e, i, u/ These are the ones
temp = label(trigger.s) %in% c("a", "e", "i", "u")
sum(!temp)
# here they are
#serve(db, seglist=trigger.s[!temp,])

##############
# SORT 
##############
target.w2  = sort(target.w)
p = match(target.w2$bundle, target.w$bundle)
target.s2 = target.s[p,]
trigger.s2 = trigger.s[p,]
target.phon2 = requery_hier(db, target.w2, "MAU", resultType = "emuRsegs")%>%sort()
all(target.w2$bundle==target.phon2$bundle)
all(start(target.w2) == start(target.phon2))
all(target.s2$bundle==target.phon2$bundle)
all(target.s2$bundle==target.s2$trigger)
# now rename
target.s = target.s2
target.w = target.w2
target.phon = target.phon2
trigger.s = trigger.s2

##############
# CREATE LABELS 
##############
labs.df = data.frame(V = label(target.s), Word= label(target.w), P = label(target.phon), Trig = label(trigger.s))

# check that Word and Trig are matched
u = sort(unique(with(labs.df, as.character(interaction(Word, Trig)))))
u

##############
# add in information about the stem
##############

setwd("the_directory_in_which_stem.txt_and_words and phontrig.csv_are")

stem.info = read.table("stem.txt", header=T)
# match stem to labs.df
m = match(as.character(labs.df$Word), as.character(stem.info$Word))
stems = as.character(stem.info$STEM[m])
labs.df = data.frame(labs.df, Stem = factor(stems))
# check
o = with(labs.df, as.character(interaction(Word, Stem)))
# seems OK
sort(table(o))

# add speaker info and utterance info
labs.df = data.frame(labs.df, Vpn = target.s$session, U = target.s$bundle)

##############
# get formants of the target
##############

target.fm  = get_trackdata(db, target.s, "FORMANTS", resultType="tibble")

# linearly time-normalise to 11 data points.
target.fmt = normalize_length(target.fm, N=11)
# smoothing:
# convert to data-frame
target.fmt %>% setDT
# choose the columns you want to filter
Fcols <- c("T1", "T2")

# run the filter, separately for each track (by = sl_rowIdx)
# I chose order = 5, you can always play around with it

target.fmt[, (Fcols) := lapply(.SD, runmed, k = 5), .SDcols = Fcols, by = sl_rowIdx]

# if you really have to, undo the data.tabling, so v.fmt.sent is back to data.frame
# convert back to data-frame
target.fmt %>% setDF

# round the times_norm to the nearest decimal place
target.fmt$times_norm = round(target.fmt$times_norm, 1)

# add in Word labels
target.fmt$Word= factor(rep(as.character(labs.df$Word), table(target.fmt$sl_rowIdx)))
# add in Stem labels
target.fmt$Stem = factor(rep(as.character(labs.df$Stem), table(target.fmt$sl_rowIdx)))
# add in the labels of the trigger vowels
target.fmt$Trig = factor(rep(label(trigger.s), table(target.fmt$sl_rowIdx)))

# create local copies of what we need and detach
D <- target.fmt
D %>% setDT

# bind info about phonological triggers and targets
phonology <- fread("words and phontrig.csv", header = TRUE)
#phonology[, V1 := NULL]
D %<>% .[phonology, on = c(Word= "Word"), nomatch = 0]
D %>% setnames('session', 'speaker')
D %>% setnames('labels', 'Target')
D %>% setnames(paste0('T', 1:5), paste0('F', 1:5))


# the final D is a dataframe with all F values related also to phonological targets
# similar to target.fmt

##Add a column with villages (this might be simplified)

D$Village = rep("Canna", nrow(D))
temp = D$speaker %in% c("CC01F", "CC01M", "CC02F", "CC02M", "CC03F", "CC03M", "CC04F", "CC05F","CC06F", "CC07F", "CC08F")
D$Village[temp] = "Cerchiara"

temp = D$speaker %in% c("LI01M","LI02M","LI03M","LI04M", "LI05M", "LI06M")
D$Village[temp] = "Laino"

temp = D$speaker %in% c("MG01M", "MG02M")
D$Village[temp] = "Montegiordano"

temp = D$speaker %in% c("MM02F","MM03F","MM03M","MM04F","MM04M","MM05F", "MM06F", "MM07F", "MM05M", "MM07M", "MM09M")
D$Village[temp] = "Mormanno"

temp = D$speaker %in% c("SD01F")
D$Village[temp] = "Santa_Domenica_Talao"

temp = D$speaker %in% c("SC01F", "SC01M", "SC02F")
D$Village[temp] = "Scalea"

# add region

Village_groups <- rbindlist(list(
  data.table(Region = "East", Village = c("Canna", "Cerchiara", "Montegiordano")),
  data.table(Region = "MM", Village = "Mormanno"),
  data.table(Region = "West", Village = c("Laino", "Scalea", "Santa_Domenica_Talao"))
))

# adds info of our Village_dt to the D_dt
# <- "join", unisce i due dt riferendosi ai valori di "village"

D <- Village_groups[D, on = "Village", nomatch = 0]

# add gender

D$Sex = rep("M", nrow(D))
temp = D$speaker %in% c("CC01F", "CC02F", "CC07F", "CC04F", "CC03F","CC06F", "MM02F", "MM03F","MM04F","MM05F", "SD01F", "CC05F", "CC08F", "CA01F", "SC01F", "SC02F", "MM07F", "MM06F")
D$Sex[temp] = "F"

#############################
### INFO about SUFFIX
### Create column where you distinguish phonetically realised from non phon. real. trigger

#D
notrig = dplyr::filter(D, Trig == "dd"| Trig == "d"| Trig == "ddZ"|Trig == "g" |Trig=="J"|Trig=="k"| Trig == "kk"| Trig =="l"|Trig =="ll" |Trig == "L"|Trig=="m"|Trig =="n"| Trig == "nn" | Trig=="p"|Trig =="pp"| Trig =="r"|Trig == "s"|Trig == "S"| Trig == "ss"| Trig=="SS"|Trig == "t"| Trig == "tt"| Trig =="tts"|Trig=="tS"| Trig =="v")

trig = dplyr::filter(D, Trig == "a"|Trig =="u"|Trig == "e"|Trig=="i"|Trig == "kki"|Trig == "io")


########## add a column


notrig$Suffix<-"Deleted"
trig$Suffix<-"Realised"

#### join dataframes 
D=rbind(notrig, trig)

###################### add information about metaphony

Met = dplyr::filter(D, Suffix_vowel == "u" | Suffix_vowel == "i")
NoMet = dplyr::filter(D, Suffix_vowel== "a"|Suffix_vowel == "e")

### add a column
Met$MET<-"metaphony"
NoMet$MET<-"No metaphony"

D=rbind(Met, NoMet)

# write.table("D.txt")

#####################################################################################

#3. Dataframe creation for FPCA analysis

class(D)
D %>% setDF()
# we will  speaker normalise with respect to /a, i, u/ 
# 2 columned matrix for storing results
mat = matrix(0, ncol = 2, nrow=nrow(D))
# column names
nm = names(D)
# for each speaker
for(j in unique(D$speaker)){
  # find that speaker's observations
  temp.spk = D$speaker == j
  # find that speaker's /i, a, u/ observations in Stem_vowel
  temp.iau =  D$speaker == j & D$Stem_vowel %in% c("i", "a", "u")
  # a counter for the next for-loop
  k = 1
  # separately for F1 and F2
  for(i in c("F1", "F2")){
    # calculate the speaker's mean in /i, a, u/
    form.mean = mean(D[temp.iau,names(D) ==  i])
    # and the speaker's standard deviation
    form.sd = sd(D[temp.iau,names(D) ==  i])
    # subtract mean from all speaker's vowels and divide by sd
    # and store the results in column k of mat
    mat[temp.spk, k] = (D[temp.spk,names(D) ==  i] - form.mean)/form.sd
    # advance counter (k will only reach 2)
    k = k+1
  }
}

# should be F
any(mat==0)

# check e.g. speaker MM05F, F2
temp = D$speaker == "MM05F"
temp2 = D$speaker == "MM05F" & D$Stem_vowel %in% c("i", "u", "a")
# mean F2 in /i, a, u/
mF2 = mean(D$F2[temp2]); sF2 = sd(D$F2[temp2])
# should be same: they are:
sum(mat[temp,2] - (D$F2[temp] - mF2)/sF2)
all(mat[temp,2] == (D$F2[temp] - mF2)/sF2)

# bind in the normalised formants
D = data.frame(D, F1n = mat[,1], F2n = mat[,2])

D %>% setDT

D[, .(Trigs = .SD[, Suffix_vowel] %>% unique %>% sort %>% paste(collapse = '_')), by = Stem]

# choose the columns you want to filter
Fcolsin <- c("F1n", "F2n")
Fcolsout <- c("F1ns", "F2ns")

# run the filter, separately for each track (by = sl_rowIdx)
# I chose order = 5, you can always play around with it
D %>% setDT()
D[, (Fcolsout) := lapply(.SD, runmed, k = 5), .SDcols = Fcolsin, by = sl_rowIdx]

# if you really have to, undo the data.tabling, so v.fmt.sent is back to data.frame

D %>% setDF

# remove i, u, a targets
temp = D$Stem_vowel %in% c("i", "u", "a")
D = D[!temp,]

###########################################

D %>% setDT

#exclude outliers
D %<>% .[!Stem %in% c("fior", "gioc", "chiod") & !Word %in% c("tenete", "morite", "pensate", "trovate", "volete")]
# make MET a logical, i.e. TRUE / FALSE variable, only these 2 levels
D[, MET := MET == "metaphony"] 
D[, setdiff(colnames(D),
            # keep only these columns
            c("sl_rowIdx", "Target", "speaker", "times_rel", "times_norm", "Word", "Stem", "Trig", "Stem_vowel", "Suffix_vowel",
              "Village", "Sex", "bundle", "Suffix", "MET", "F1ns", "F2ns")
) := NULL] 

wrongs = c("CA01F0003A1_123", "CA01F0003A1_124", "CA01F0003A1_22",  "CA01F0003A2_22", 
           "CC01F0062A1_22",  "CC01F0062A2_22",  "CC01M0066A1_123", "CC02F0063A1_22", 
           "CC02F0063A2_22",  "CC01M0066A1_124", "CC02M0071A1_22",  "CC02M0071A2_22", 
           "CC03F0064A1_22",  "CC03F0064A2_22",  "CC05F0067A2_22",  "CC02M0071A1_123",
           "CC06F0068A1_22",  "CC06F0068A2_22",  "CC07F0069A1_22",  "CC07F0069A2_22", 
           "CC08F0070A1_22",  "CC08F0070A2_22",  "LI02M0030A1_22",  "LI05M0033A1_022",
           "MG01M0002A1_22",  "MG01M0002A2_22",  "MG02M0003A1_22",  "MG02M0003A2_22", 
           "MM02F0007A1_24",  "MM02F0007A2_24",  "MM03F0009A1_24",  "MM03F0009A2_24", 
           "MM03M0008A1_24",  "MM03M0008A2_24",  "MM04F0010A1_24",  "MM04M0012A1_24", 
           "MM04M0012A2_24",  "MM05F0011A1_24",  "MM05M0013A1_24",  "MM05M0013A2_24", 
           "MM06F0006A1_022", "MM06F0006A2_022", "MM07F0010A1_022", "MM07F0010A2_022",
           "MM07M0013A1_022", "MM07M0013A2_022", "MG01M0002A1_123", "MG01M0002A1_124",
           "SC01M_A2_022",    "SC01M0039A1_022", "MG01M0002A2_123", "MG01M0002A2_124",
           "SD01F0005A1_24",  "SD01F0005A2_24")
D = D %>% dplyr::filter(!bundle %in% wrongs)
###############################################################
# NB WARNING! Select one Stem_vowel at a time!!!!

`Target Vowel` <- 'o'#or
#`Target Vowel` <- 'e'

D %<>% .[Stem_vowel == `Target Vowel`] 

nCurves <- D$sl_rowIdx %>% uniqueN 

# Smoothing
norm_range <- c(0,1)
n_knots = 6 
lambda = 1e-8 
Lfdobj <- 2 
norder <- 2 + Lfdobj 
nbasis <- n_knots + norder - 2 
basis <- create.bspline.basis(norm_range, nbasis, norder)
fdPar <- fdPar(basis, Lfdobj, lambda)

D %>% setDT
# compute splines coefs, arrange them in 3D array of dim:
# nbasis X nCurves X 2 (formants) 
coefs <- D %>%
  data.table::melt(measure.vars = c("F1ns", "F2ns"), variable.name = "Formant", value.name = "Freq") %>%
  .[, .(coefs = c(smooth.basis(times_norm, Freq, fdPar)$fd$coefs),
        splineIdx = 1:nbasis),
    by = .(sl_rowIdx, Formant)] %>%
  acast(splineIdx ~ sl_rowIdx ~ Formant, value.var = "coefs")

D.fd <- fd(coef=coefs, basisobj=basis) #fd = functional data

# acast orders data by the grouping columns, i.e. sl_rowIdx and Formant.
# Hence D has to be reordered accordingly.
D %>% setorder(sl_rowIdx)

lambda_pca <- lambda
pcafdPar <- fdPar(basis, 2, lambda_pca) 
D.pcafd <- pca.fd(D.fd, nharm = 3, pcafdPar) #nharm=number of PCs

# Dres ("results") comtains all factors and corresponding PC scores, one row for each formant pair
D %>%setDT
Dres <- D %>%
  .[, .SD[.N, .(StemDuration = times_rel)], 
    .SDcols = "times_rel", 
    by = setdiff(colnames(D), c("times_rel", "times_norm", "F1ns", "F2ns"))] %>% 
  .[, paste0('s', 1:3) := D.pcafd %>% getPCscores %>% as.data.frame]


Dres$Region <- factor(Dres$Region, levels = c("MM", "West", "East"))

######################## save results!

o.df = Dres

D.pcafd.o=D.pcafd

# OR (after re-running everything but just for stem-/e/):

e.df = Dres

D.pcafd.e=D.pcafd

met.df = rbind(e.df, o.df)


# EXTRA (not analysed in the paper): add more info about consonants preceding suffix vowels:
cons = read.table("wordscons.txt", header=T)
met.df%<>%dplyr::right_join(cons, by="Word")

############################################################################################

#4. Information about Suffix deletion and centralisation


# find all vowel suffixes

suff.a = query(db, "[MAU=a & End(Word, MAU)==1]",resultType = "emuRsegs") %>% sort()
suff.e = query(db, "[MAU=e & End(Word, MAU)==1]",resultType = "emuRsegs") %>% sort()
suff.i = query(db, "[MAU=i & End(Word, MAU)==1]",resultType = "emuRsegs") %>% sort()
suff.u = query(db, "[MAU=u & End(Word, MAU)==1]",resultType = "emuRsegs") %>% sort()

aaa=requery_hier(db, suff.a, "Word", resultType = "emuRsegs") %>% sort()
suff.a$Word = aaa$labels

bbb=requery_hier(db, suff.e, "Word", resultType = "emuRsegs") %>% sort()
suff.e$Word = bbb$labels

ccc=requery_hier(db, suff.i, "Word", resultType = "emuRsegs") %>% sort()
suff.i$Word = ccc$labels

ddd=requery_hier(db, suff.u, "Word", resultType = "emuRsegs") %>% sort()
suff.u$Word = ddd$labels

# find all stressed vowels /a, i, e, o, u/, mid stem must be NON-metaphonic

aeiou = query(db, "[MAU=a|i|e|E|o|O|u & End(Word, MAU)==0 ^ Word = braccia|cane|casa|case|cenere|
                                                                   bella|buona|buone|croce|croci|corta|
                                                                   corto|corti|dita|dito|
                                                                   ferri|ferro|forno|forni|fredda|freddi|freddo|latte|
                                                                   legna|legno|noce|noci|peli|pelo|pesce|pesci|pietra|
                                                                   pietre|sedia|sedie|ponte|rosa|rose|ruota|ruote|santa|
                                                                   stella|stelle|unghia|unghie|vacca|vacche|
                                                                   
                      corna|cotta|cuore|dente|dorme|
                    esce|grossa|lunga|mese|morta|
                 nuova|ossa|pensa|pezza|piede|tiene|trova|uova|vecchia|zoppa|vuole|corro|corre|bevo|beve]",resultType = "emuRsegs") %>% sort()
aaa=requery_hier(db, aeiou, "Word", resultType = "emuRsegs") %>% sort()
aeiou$Word = aaa$labels

a=dplyr::filter(aeiou, labels == "a")

e=dplyr::filter(aeiou, labels %in% c("e", "E"))

i=dplyr::filter(aeiou, labels == "i")

o=dplyr::filter(aeiou, labels  %in%  c("o", "O"))

u=dplyr::filter(aeiou, labels == "u")

# new column, mark all suffix vowels with a "minus"

suff.a$Vowel<-"-a"
suff.e$Vowel<-"-e"
suff.i$Vowel<-"-i"
suff.u$Vowel<-"-u"

suff=rbind(suff.a, suff.e, suff.i, suff.u)
suff$Vowel_Suffix<-"Suffix vowel"

# new column in "aeiou", mark all stressed vowels with a "plus"
a$Vowel<-"a+"
e$Vowel<-"e+"
i$Vowel<-"i+"
o$Vowel<-"o+"
u$Vowel<-"u+"

aeiou=rbind(a, e, i, o, u)
aeiou$Vowel_Suffix<-"Stem vowel"

#bind two dfs if needed

D = rbind(suff, aeiou)

rm(a, aaa, bbb,ccc,ddd,e,i,o,u,suff.a,suff.e,suff.i,suff.u) 
#get track data

D = get_trackdata(db, D, "FORMANTS", resultType="tibble")

#in case of problems (bundle below is an example):
#serve(db, bundlePattern = "MM06M0007A1_114", useViewer=F)

D = mutate(D, Duration = D$end - D$start)
#filter out too short vowels
#D = dplyr::filter(D, Duration > 25)

# speaker-normalise, NO time normalisation

#D must be a data.frame!!!!

D %>% setDF()


mat = matrix(0, ncol = 2, nrow=nrow(D))
# column names
nm = names(D)
# for each speaker
for(j in unique(D$session)){
  # find that speaker's observations
  temp.spk = D$session == j
  # find that speaker's /i, a, u/ observations in Stem_vowel
  temp.iau =  D$session == j & D$Vowel %in% c("i+", "a+", "u+")
  # a counter for the next for-loop
  k = 1
  # separately for F1 and F2
  for(i in c("T1", "T2")){
    # calculate the speaker's mean in /i, a, u/
    form.mean = mean(D[temp.iau,names(D) ==  i]%>%unlist)
    # and the speaker's standard deviation
    form.sd = sd(D[temp.iau,names(D) ==  i]%>%unlist)
    # subtract mean from all speaker's vowels and divide by sd
    # and store the results in column k of mat
    mat[temp.spk, k] = ((D[temp.spk,names(D) ==  i] - form.mean)/form.sd) %>% unlist
    # advance counter (k will only reach 2)
    k = k+1
  }
}

# should be F
any(mat==0)

# bind in the normalised formants
D = data.frame(D, F1n = mat[,1], F2n = mat[,2])


D %>% setDT


#  smooth the formants 
# choose the columns you want to filter
Fcolsin <- c("F1n", "F2n")
Fcolsout <- c("F1ns", "F2ns")

# run the filter, separately for each track (by = sl_rowIdx)
# I chose order = 5, you can always play around with it
D[, (Fcolsout) := lapply(.SD, runmed, k = 5), .SDcols = Fcolsin, by = sl_rowIdx]

# if you really have to, undo the data.tabling, so v.fmt.sent is back to data.frame

D %>% setDF

#write.table(D, "allvowels.txt")

suffixes=filter(D, Vowel_Suffix == "Suffix vowel")
#write.table(suffixes, "suffixes.txt")

stems=filter(D, Vowel_Suffix == "Stem vowel")
#write.table(stems, "stressedvowels.txt")

suff.fm.mean = suffixes %>% group_by(session, Word, Vowel, sl_rowIdx, bundle, Duration, Region) %>%
  summarise(F1 = mean(T1), F2 = mean(T2), F1ns = mean(F1ns), F2ns = mean (F2ns))

#change name of Duration to SuffDuration

names(suff.fm.mean)[names(suff.fm.mean) == "Duration"] <- "SuffDuration"

#check

names(suff.fm.mean)

# now calculate ci ("centralisation index")

suffmean.df = suff.fm.mean

# function for computing Euclidean distances between two vectors
euc = function(a, b)
{
  sqrt(sum((a - b)^2))
}

# work out the mean distance in an F1 x F2 space of a suffix vowel to the centroids of all vowel categories 
# this is for storing the results
mat = matrix(0, nrow(suffmean.df), 4)

# for each speaker
for(j in unique(suffmean.df$session)){
  # identify the speaker's observations
  speakertemp = suffmean.df$session == j
  # calculate centroids for all 4 vowels for that speaker
  centroids =  with(suffmean.df[speakertemp,], apply(cbind(F1ns, F2ns), 2, tapply, 
                                                     as.character(Vowel), mean))
  #sort a, e, i, u
  centroids = centroids[ match(c("-a", "-e", "-i", "-u"), rownames(centroids)),]
  
  # calculate Euclidean distances of all observations to all centroids
  # this is for storing results
  cenmat = NULL
  # for each of the above 4 centroids...
  for(i in 1:nrow(centroids)){
    # calculate the Euclidean distances from each vowel (for a given speaker) to each vowel centroid
    res = with(suffmean.df[speakertemp,], apply(cbind(F1ns, F2ns), 1, euc, centroids[i,]))
    # store
    cenmat = cbind(cenmat, res )
  }
  # store the results
  mat[speakertemp,] = cenmat
  
}

# convert -a, -e, -i, -u of suffmean.df into 1, 2, 3, 4 (since these are the column in mat that we want to access
# convert characters to integers
num = as.numeric(factor(suffmean.df$Vowel))
# they match
table(num, as.character(suffmean.df$Vowel))

# this is for storing the results
ci = NULL
# for each vowel in suffmean.df, calculate the mean Euclidean distance to the its centroid and the mean Euclidean distances to the other centroids, and then take the log (see above)
ci = NULL
for(j in 1:length(num)){
  ci = c(ci, log(mat[j,num[j]] / mean(mat[j,-num[j]])))
}
suffmean.df = data.frame(suffmean.df, ci)

#now you have ci, you have to add suffix info to met.df

ci.df=Dres %>% dplyr::filter(Suffix == "Realised")
met_deleted=Dres %>% dplyr::filter(Suffix == "Deleted")
met_deleted$ci <- NA

levels(factor(ci.df$Word))

Suff = suffmean.df %>% dplyr::filter(Word %in% 
                                       c("anelli",   "anello",   "bella",    "bello",    "buona",    "buone",    "buoni",    "buono",    "capelli", 
                                         "capello",  "cappelli", "cappello", "capretta", "capretti", "capretto", "cervelli", "cervello", "coltelli",
                                         "coltello", "corna",    "corno",    "cotta",    "cotto",    "cuore",    "cuori",    "dente",    "denti",   
                                         "donna",    "donne",    "dorme",    "dormi",    "dormo",    "esce",     "esci",     "esco",     "ferri",   
                                         "ferro",    "foglia",   "foglie",   "fuochi",   "fuoco",    "grossa",   "grosso",   "letti",    "letto",   
                                         "lunga",    "lungo" ,   "mela",     "mele" ,    "mese",     "mesi",     "morta",    "morti",    "morto",   
                                         "muoio",    "muori",    "nipote",   "nipoti",   "nuova",   "nuovo",   "occhi",    "occhio",   "ossa",    
                                         "osso",     "pecora",   "pecore",   "pensa" ,   "pensi",    "penso",    "pesca",    "pesche",   "pettine", 
                                         "pettini",  "pezza",    "pezzo",    "piede",    "piedi" ,   "pietra",   "pietre",   "ponte",    "ponti",   
                                         "porci",    "porco",    "prete",   "preti" ,   "rosa",     "rose" ,    "ruota",    "ruote",    "sedia" ,  
                                         "sedie",    "sole",     "sposa" ,   "sposo",    "stella",   "stelle",   "tengo",    "tiene",    "tieni",   
                                         "topi",     "topo",     "trova" ,   "trovi" ,   "trovo",    "uomini",   "uomo",     "uova",     "uovo" ,   
                                         "vecchia",  "vecchio",  "venti",    "vento",    "verme",    "vermi",    "voglio",   "vuoi",     "zoppa",   
                                         "zoppo"  ))

# check that Index number is the same between the two dfs
# Suff and Dres should have the same number of observations, check out:

var1 <- ci.df$bundle
var2 <- Suff$bundle
matched <- intersect(var1, var2)
all <-  union(var1, var2)
non.matched <- all[!all %in% matched]
non.matched

ci.df = ci.df %>% filter(!bundle %in% non.matched)
Suff = Suff %>% filter(!bundle %in% non.matched)

# add ci to Dres
ci.df$ci <- Suff$ci[match(ci.df$bundle, Suff$bundle)]
met.df=rbind(met_deleted, ci.df)

# match duration info to dataframe already in use (in order to avoid re-running everything)

met.df$SuffDuration <- suff.fm.mean$SuffDuration[match(met.df$bundle, suff.fm.mean$bundle)]

# for the analysis, it is useful that we have "deletion" vs "realisation" as a logical variable (TRUE vs fALSE)

suffix_del = met.df$ci
temp = is.na(suffix_del)
suffix_del[temp] = F
suffix_del[!temp] = T
suffix_del = as.logical(suffix_del)
#table(suffix_del)

# only those which are not NA
temp = is.na(met.df$ci)
ci.df = met.df[!temp,]

#re-level:

ci.df$Stem = factor(ci.df$Stem)
ci.df$speaker = factor(ci.df$speaker)

# formant data of suffix
form.df = suffmean.df %>% dplyr::select(bundle, F1ns, F2ns)

# add in the formants to ci.df
ci.df =  left_join(ci.df, form.df, group="bundle")
ci.df=ci.df %>% rename(F1n = F1ns, F2n = F2ns) #rename, they are mean values! (not used in the analysis)

# rm(suffix_del, temp)


#5. create DF of mid vs high vowels in MZ (i.e. the East)

# D is the dataframe with all data of the 35 speakers

head(D)

i.df = D %>% filter(Stem_vowel == "i")
u.df = D %>% filter(Stem_vowel == "u")

levels(factor(i.df$Word)) # deleting "vedete" and "bevete", it's 21 word types
levels(factor(i.df$Stem)) # 10 stem types

levels(factor(u.df$Word)) # 28 word types (2 pers pl forms excluded)
levels(factor(u.df$Stem)) #16 stem types

######


# create dataframes, only MZ and mid and high vowels

#e-i

i_MZ.df= D %>% filter(Stem_vowel == "i" & speaker %in% c("CA01F", "CC01F", "CC01M", "CC02F", "CC02M", "CC03F",
                                                         "CC03M", "CC04F", "CC05F", "CC06F",
                                                         "CC07F", "CC08F", "MG01M", "MG02M"))
e_Met_MZ.df= D %>% filter(Stem_vowel == "e" & MET =="metaphony" & speaker %in% c("CA01F", "CC01F", "CC01M", "CC02F", "CC02M", "CC03F",
                                                                                 "CC03M", "CC04F", "CC05F", "CC06F",
                                                                                 "CC07F", "CC08F", "MG01M", "MG02M"))
e_NoMet_MZ.df= D %>% filter(Stem_vowel == "e" &  MET =="No metaphony" & speaker %in% c("CA01F", "CC01F", "CC01M", "CC02F", "CC02M", "CC03F",
                                                                                       "CC03M", "CC04F", "CC05F", "CC06F",
                                                                                       "CC07F", "CC08F", "MG01M", "MG02M"))
#o-u
u_MZ.df= D %>% filter(Stem_vowel == "u" & speaker %in% c("CA01F", "CC01F", "CC01M", "CC02F", "CC02M", "CC03F",
                                                         "CC03M", "CC04F", "CC05F", "CC06F",
                                                         "CC07F", "CC08F", "MG01M", "MG02M"))
o_Met_MZ.df= D %>% filter(Stem_vowel == "o" &  MET =="metaphony" & speaker %in% c("CA01F", "CC01F", "CC01M", "CC02F", "CC02M", "CC03F",
                                                                                  "CC03M", "CC04F", "CC05F", "CC06F",
                                                                                  "CC07F", "CC08F", "MG01M", "MG02M"))
o_NoMet_MZ.df= D %>% filter(Stem_vowel == "o" &  MET =="No metaphony" & speaker %in% c("CA01F", "CC01F", "CC01M", "CC02F", "CC02M", "CC03F",
                                                                                       "CC03M", "CC04F", "CC05F", "CC06F",
                                                                                       "CC07F", "CC08F", "MG01M", "MG02M"))
# if necessary delete old stem vowel column

# e.g.

# u_MZ.df$Stem_vowel <- NULL

i_MZ.df$Stem_vowel <- "/i/"
e_Met_MZ.df$Stem_vowel <- "Raised /e/"
e_NoMet_MZ.df$Stem_vowel <- "Non-raised /e/"

u_MZ.df$Stem_vowel <- "/u/"
o_Met_MZ.df$Stem_vowel <- "Raised /o/"
o_NoMet_MZ.df$Stem_vowel <- "Non-raised /o/"


D_MZhigh =  rbind(i_MZ.df, e_Met_MZ.df, e_NoMet_MZ.df, u_MZ.df, o_Met_MZ.df, o_NoMet_MZ.df)

rm(i_MZ.df, e_Met_MZ.df, e_NoMet_MZ.df, u_MZ.df, o_Met_MZ.df, o_NoMet_MZ.df)

D_MZhigh= D_MZhigh %>% filter(times_norm == 0.5)

# D_MZhigh is our dataframe!

#now delete columns you do not need

D_MZhigh = D_MZhigh %>% dplyr::select(-c(start_item_id,level,end_item_id, start_item_seq_idx, end_item_seq_idx, 
                                         type, sample_start, sample_end, sample_rate, times_orig, times_rel, F3, F4, F5, Stem, Trig, MET, 
                                         start,      end ,utts,       db_uuid, Suffix  ))


D_MZhigh$Stem_vowel = factor(D_MZhigh$Stem_vowel, levels = c("/i/", "Raised /e/", "Non-raised /e/", "/u/", "Raised /o/", "Non-raised /o/"))

# add "whichvowel" column for better plotting

e=D_MZhigh %>% filter (Stem_vowel %in% c( "/i/", "Raised /e/" , "Non-raised /e/"))
e$whichvowel<-"/e/~/i/"

o=D_MZhigh %>% filter (Stem_vowel %in% c( "/u/", "Raised /o/" ,"Non-raised /o/"))
o$whichvowel<-"/o/~/u/"

D_MZhigh=rbind(e, o)


##### 6. fix levels of the dataframes that we need for the analyses

met.df=met.df %>% select(-c(sl_rowIdx, Target, Sex))

met.df$Region = factor(met.df$Region, levels = c("MM", "West", "East"))
met.df$Stem_vowel = factor(met.df$Stem_vowel)
met.df$Stem = factor(met.df$Stem)
met.df$speaker = factor(met.df$speaker)

# Re-level so that suffix-e is the base (so that all effects of suffix in the post-hoc test are shown relative the suffix-e)

met.df$Suffix_vowel = relevel(met.df$Suffix_vowel, "e")

#re-level levels of "Suffix"

met.df$Suffix=factor(met.df$Suffix)
ci.df$Suffix=factor(ci.df$Suffix)
e.df$Suffix=factor(e.df$Suffix)
o.df$Suffix=factor(o.df$Suffix)

# re-write stems in SAMPA

levels(met.df$Stem) <- list( anell ="anell",   bell ="bell",    bon="buon",    kapell="capell", kappell ="cappell", krapett="caprett", tServell="cervell", kurtell="coltell", korn="corn",    kott="cott",    kor="cuor",   
                             dent ="dent",  femmin  ="donn",   dorm ="dorm",   ess ="esc",    ferr ="ferr",   
                             foLL="fogli" ,  fok="fuoc",    gross="gross",   lett="lett",    long="lung",    mel="mel",     mes="mes",     mor="mor",     mort="mort",    nipot="nipot",   nov="nuov",    okk="occh",    
                             oss="oss",    pekur ="pecor",  pens ="pens",   pesk ="pesch",  
                             pettin ="pettin", petts ="pezz",   ped ="pied",  petr  ="pietr",  pont ="pont",  pork  ="porc",   previt ="pret",  ros  ="ros",   rot  ="ruot",  seddZ  ="sedi", 
                             sol ="sol",    spos ="spos",  stell  ="stell", ten  ="ten" ,  soritS  ="top",  trov   ="trov" ,  
                             ommin ="uom",    ov ="uov",   vekkj  ="vecchi",  vent="vent",    verm="verm",   vol ="vol" ,  tsopp  ="zopp" ) 


levels(ci.df$Stem) <- list( anell ="anell",   bell ="bell",    bon="buon",    kapell="capell", kappell ="cappell", krapett="caprett", tServell="cervell", kurtell="coltell", korn="corn",    kott="cott",    kor="cuor",   
                            dent ="dent",  femmin  ="donn",   dorm ="dorm",   ess ="esc",    ferr ="ferr",   
                            foLL="fogli" ,  fok="fuoc",    gross="gross",   lett="lett",    long="lung",    mel="mel",     mes="mes",     mor="mor",     mort="mort",    nipot="nipot",   nov="nuov",    okk="occh",    
                            oss="oss",    pekur ="pecor",  pens ="pens",   pesk ="pesch",  
                            pettin ="pettin", petts ="pezz",   ped ="pied",  petr  ="pietr",  pont ="pont",  pork  ="porc",   previt ="pret",  ros  ="ros",   rot  ="ruot",  seddZ  ="sedi", 
                            sol ="sol",    spos ="spos",  stell  ="stell", ten  ="ten" ,  soritS  ="top",  trov   ="trov" ,  
                            ommin ="uom",    ov ="uov",   vekkj  ="vecchi",  vent="vent",    verm="verm",   vol ="vol" ,  tsopp  ="zopp" ) 

levels(e.df$Stem) <- list( anell ="anell",   bell ="bell",    bon="buon",    kapell="capell", kappell ="cappell", krapett="caprett", tServell="cervell", kurtell="coltell", korn="corn",    kott="cott",    kor="cuor",   
                           dent ="dent",  femmin  ="donn",   dorm ="dorm",   ess ="esc",    ferr ="ferr",   
                           foLL="fogli" ,  fok="fuoc",    gross="gross",   lett="lett",    long="lung",    mel="mel",     mes="mes",     mor="mor",     mort="mort",    nipot="nipot",   nov="nuov",    okk="occh",    
                           oss="oss",    pekur ="pecor",  pens ="pens",   pesk ="pesch",  
                           pettin ="pettin", petts ="pezz",   ped ="pied",  petr  ="pietr",  pont ="pont",  pork  ="porc",   previt ="pret",  ros  ="ros",   rot  ="ruot",  seddZ  ="sedi", 
                           sol ="sol",    spos ="spos",  stell  ="stell", ten  ="ten" ,  soritS  ="top",  trov   ="trov" ,  
                           ommin ="uom",    ov ="uov",   vekkj  ="vecchi",  vent="vent",    verm="verm",   vol ="vol" ,  tsopp  ="zopp" ) 

levels(o.df$Stem) <- list( anell ="anell",   bell ="bell",    bon="buon",    kapell="capell", kappell ="cappell", krapett="caprett", tServell="cervell", kurtell="coltell", korn="corn",    kott="cott",    kor="cuor",   
                           dent ="dent",  femmin  ="donn",   dorm ="dorm",   ess ="esc",    ferr ="ferr",   
                           foLL="fogli" ,  fok="fuoc",    gross="gross",   lett="lett",    long="lung",    mel="mel",     mes="mes",     mor="mor",     mort="mort",    nipot="nipot",   nov="nuov",    okk="occh",    
                           oss="oss",    pekur ="pecor",  pens ="pens",   pesk ="pesch",  
                           pettin ="pettin", petts ="pezz",   ped ="pied",  petr  ="pietr",  pont ="pont",  pork  ="porc",   previt ="pret",  ros  ="ros",   rot  ="ruot",  seddZ  ="sedi", 
                           sol ="sol",    spos ="spos",  stell  ="stell", ten  ="ten" ,  soritS  ="top",  trov   ="trov" ,  
                           ommin ="uom",    ov ="uov",   vekkj  ="vecchi",  vent="vent",    verm="verm",   vol ="vol" ,  tsopp  ="zopp" ) 
