Spaces:
Sleeping
Sleeping
catiR
committed on
Commit
·
1e483fc
1
Parent(s):
009ee74
data + demo
Browse files- .gitignore +2 -0
- Data/133_Annotated_Vowel_Lengths.pdf +0 -0
- Data/Length_in_spoken_icelandic.json +0 -0
- Data/Length_in_spoken_icelandic.tsv +0 -0
- README.md +20 -1
- app.py +204 -0
- requirements.txt +2 -0
- vowel_length.py +207 -0
.gitignore
CHANGED
|
@@ -169,3 +169,5 @@ cython_debug/
|
|
| 169 |
|
| 170 |
# PyPI configuration file
|
| 171 |
.pypirc
|
|
|
|
|
|
|
|
|
| 169 |
|
| 170 |
# PyPI configuration file
|
| 171 |
.pypirc
|
| 172 |
+
|
| 173 |
+
**/.DS_Store
|
Data/133_Annotated_Vowel_Lengths.pdf
ADDED
|
Binary file (166 kB). View file
|
|
|
Data/Length_in_spoken_icelandic.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Data/Length_in_spoken_icelandic.tsv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
README.md
CHANGED
|
@@ -1 +1,20 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Length contrasts in spoken Icelandic
|
| 3 |
+
emoji: 📊
|
| 4 |
+
colorFrom: gray
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 5.15.0
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
## Assessed and Annotated Vowel Lengths in Spoken Icelandic Sentences for L1 and L2 Speakers: A Resource for Pronunciation Training
|
| 13 |
+
|
| 14 |
+
#### NoDaLiDa/Baltic-HLT 2025, Tallinn, Estonia
|
| 15 |
+
Authors: Caitlin Laura Richter, Kolbrún Friðriksdóttir, Kormákur Logi
|
| 16 |
+
Bergsson, Erik Anders Maher, Ragnheiður María Benediktsdóttir, Jon
|
| 17 |
+
Gudnason
|
| 18 |
+
|
| 19 |
+
### Get [the paper](https://github.com/catiR/length-contrast-data-isl/blob/main/Data/133_Annotated_Vowel_Lengths.pdf) and annotations from the Data directory,
|
| 20 |
+
### or [see the demo](https://huggingface.co/spaces/clr/length-contrast-data-isl)
|
app.py
ADDED
|
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import vowel_length as vln
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
# Location of the annotation file, relative to the working directory.
annotation_json = 'Data/Length_in_spoken_icelandic.json'

# vln.setup yields the (group, word-entries) menu pairs and the token table.
menus, vdata = vln.setup(annotation_json)

# Group names in menu order, and a group -> word-entries lookup.
grouplist = [group for group, _ in menus]
worddict = dict(menus)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def get_group_words(group):
    """Return a word dropdown whose choices match the selected group.

    The placeholder group '[NONE]' gets the single choice '[NONE]'; any
    other group gets '[ALL]' followed by the display labels stored for that
    group in the module-level ``worddict``. The first choice is preselected.
    """
    if group != '[NONE]':
        choices = ['[ALL]']
        choices.extend(label for label, _ in worddict[group])
    else:
        choices = ['[NONE]']
    return gr.Dropdown(choices = choices, value = choices[0], interactive=True)
|
| 24 |
+
|
| 25 |
+
def check_word_langs(word,cur_lang):
    """Return the speaker-group radio appropriate for the chosen word.

    Word labels carrying a ' [L1]' or ' [L2]' tag force that speaker group
    and lock the radio; untagged labels keep ``cur_lang`` and leave the
    radio editable.
    """
    if ' [L' not in word:
        return gr.Radio(value=cur_lang,interactive=True)

    # Any tagged label that is not ' [L1]' is treated as L2.
    forced = 'L1' if ' [L1]' in word else 'L2'
    return gr.Radio(value=forced,interactive=False)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def subset_words_spks(g,w,l,s,wsets,db):
    """Select the tokens for one menu selection and build its plot label.

    Parameters:
        g: group name, used as a key into ``wsets`` and as the label when
            the whole group is selected.
        w: word menu value; '[ALL]' selects every word of the group,
            otherwise the text before the first space is the word.
        l: speaker group, 'L1', 'L2' or 'All'.
        s: data source menu value; 'MFA' (any case) or a value starting
            with 'ann' (any case).
        wsets: mapping of group name to a list of (label, word) pairs.
        db: DataFrame with at least 'speaker_lang' and 'word' columns.

    Returns:
        (subset, src, label): the matching rows as a new DataFrame with a
        fresh 0..n-1 index, the source key ('mfa' or 'gold'), and the
        legend label.

    Raises:
        ValueError: if ``s`` names neither supported source.
    """
    if w == '[ALL]':
        swords = [v for n,v in wsets[g]]
        labl = g
    else:
        labl = w.split(' ')[0]
        swords = [labl]

    if l == 'All':
        slang = ['L1', 'L2']
        labl += '\n L1+L2, '
    else:
        slang = [l]
        labl += f'\n {l}, '

    labl += f'{s}'

    # Validate with an exception rather than assert, so an unknown source
    # cannot silently fall through to 'gold' when asserts are disabled.
    src_key = s.lower()
    if src_key == 'mfa':
        src = 'mfa'
    elif src_key.startswith('ann'):
        src = 'gold'
    else:
        raise ValueError(f'unknown data source: {s!r}')

    # Boolean indexing already returns a new frame, so db is not modified.
    # reset_index returns a new object; its result has to be kept.
    keep = db['speaker_lang'].isin(slang) & db['word'].isin(swords)
    db1 = db.loc[keep].reset_index(drop=True)

    return db1, src, labl
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def plott(g1,w1,l1,s1,g2,w2,l2,s2):
    """Build the figure for the first selection and, optionally, a second.

    Arguments are the group, word, speaker-group and source menu values for
    each of the two selection columns. The second selection is left out
    when its group or word is the placeholder '[NONE]'.
    """
    dat1, src1, lab1 = subset_words_spks(g1, w1, l1, s1, worddict, vdata)

    if g2 == '[NONE]' or w2 == '[NONE]':
        # No second selection: pass None for all of its plotting inputs.
        dat2 = l2 = src2 = lab2 = None
    else:
        dat2, src2, lab2 = subset_words_spks(g2, w2, l2, s2, worddict, vdata)

    return vln.vgraph(dat1, l1, src1, lab1, dat2, l2, src2, lab2)
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
# Gradio user interface: a plotting tab with two selection columns, and an
# About tab of static text.
# NOTE(review): the nesting below is reconstructed from a flattened listing;
# confirm the placement of the button/plot relative to the Row.
bl = gr.Blocks(theme=gr.themes.Glass())

with bl:

    with gr.Tabs():

        with gr.TabItem("Vowel quantity"):

            with gr.Row():
                # First selection; defaults to group 'AL:'.
                with gr.Column():
                    gr.Markdown(
                        """
                        #### Select data (1)
                        """
                    )
                    gmenu1 = gr.Dropdown(choices=grouplist,label="Group", value='AL:')
                    wmenu1 = gr.Dropdown(label="Word", choices=['[ALL]'] + [n for n,v in worddict['AL:']])
                    lmenu1 = gr.Radio(["L1", "L2","All"],label="Speaker group",value="L1")
                    smenu1 = gr.Dropdown(["Annotated", "MFA"],label="Source",value="Annotated")

                    # Changing the group refills the word menu; choosing a
                    # word may force and lock the speaker group.
                    gmenu1.change(get_group_words,inputs=[gmenu1],outputs = [wmenu1])
                    wmenu1.input(check_word_langs,inputs=[wmenu1,lmenu1],outputs = [lmenu1])


                # Optional second selection; defaults to '[NONE]'.
                with gr.Column():
                    gr.Markdown(
                        """
                        #### Select data (2)
                        """
                    )
                    gmenu2 = gr.Dropdown(choices=['[NONE]'] + grouplist,label="Group", value='[NONE]')
                    wmenu2 = gr.Dropdown(label="Word", choices=['[NONE]'])
                    lmenu2 = gr.Radio(choices=["L1", "L2","All"],label="Speaker group",value="L1")
                    smenu2 = gr.Dropdown(["Annotated", "MFA"],label="Source",value="Annotated")

                    gmenu2.change(get_group_words,inputs=[gmenu2],outputs = [wmenu2])
                    wmenu2.input(check_word_langs,inputs=[wmenu2,lmenu2],outputs = [lmenu2])


            # The plot is redrawn only when the button is clicked.
            btn = gr.Button(value="Update Plot")
            plo = gr.Plot()
            btn.click(plott, [gmenu1,wmenu1,lmenu1,smenu1,gmenu2,wmenu2,lmenu2,smenu2], plo)




            gr.Markdown(
                """
                # Long and short Icelandic vowels
                Check the About tab for more info about the project.
                """
            )


        with gr.TabItem("About"):
            gr.Markdown(
                """
                ## Assessed and Annotated Vowel Lengths in Spoken Icelandic Sentences\
                for L1 and L2 Speakers: A Resource for Pronunciation Training
                """
            )

            gr.Markdown(
                """
                ## Demo: Viewing the data
                Use the menus to choose words, speaker group, and data source.
                Words are split into related groups and either the whole group or a single word can be selected.
                Available speaker groups are native Icelandic speakers (L1), second-language speakers (L2), or all.
                Data source options are gold (human) annotations or automated Montreal Forced Aligner (MFA).

                The general expectation is that, all else being equal, syllables with long stressed vowels
                followed by short consonants have a higher vowel:(vowel+consonant) duration ratio,
                while syllables with short stressed vowels followed by long consonants have a lower ratio.

                Many other factors also affect relative durations in any particular recorded token,
                and these factors have considerable - not necessarily balanced - variation throughout this dataset.
                This demo is provided to begin exploring the data and suggest hypotheses for follow-up.
                See Pind 1999, 'Speech segment durations and quantity in Icelandic'
                (J. Acoustical Society of America, 106(2)) for a review of the acoustics of Icelandic vowel duration.
                """
            )



            gr.Markdown(
                """
                ## Accessing the data

                Annotations can be downloaded as
                [json](https://github.com/catiR/length-contrast-data-isl/blob/main/Data/Length_in_spoken_icelandic.json)
                or [tsv](https://github.com/catiR/length-contrast-data-isl/blob/main/Data/Length_in_spoken_icelandic.tsv) files.
                See [the paper](https://github.com/catiR/length-contrast-data-isl/blob/main/Data/133_Annotated_Vowel_Lengths.pdf)
                for complete information.

                Audio is available from [Clarin](https://repository.clarin.is/repository/xmlui/) (Samrómur).
                The 'collection' field plus recording filename in the annotations metadata
                specify the original audio file, including which Samrómur collection it is found in.
                """
            )


            gr.Markdown(
                """
                ### About

                This annotated data and its demo application accompany the paper
                *Assessed and Annotated Vowel Lengths in Spoken Icelandic Sentences\
                for L1 and L2 Speakers: A Resource for Pronunciation Training*, \
                Caitlin Laura Richter, Kolbrún Friðriksdóttir, Kormákur Logi Bergsson, \
                Erik Anders Maher, Ragnheiður María Benediktsdóttir, Jon Gudnason - NoDaLiDa/Baltic-HLT 2025, Tallinn, Estonia.


                ### Contact caitlinr@ru.is about bugs, feedback, or collaboration!

                """
            )




# Start the server only when run as a script.
if __name__ == "__main__":
    bl.launch()
|
| 203 |
+
|
| 204 |
+
|
requirements.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
scipy
|
| 2 |
+
matplotlib
|
vowel_length.py
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os, json
|
| 2 |
+
import numpy as np
|
| 3 |
+
from collections import defaultdict
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import matplotlib
|
| 6 |
+
matplotlib.use('Agg')
|
| 7 |
+
import matplotlib.pyplot as plt
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
# make subsets of words for convenience
def make_sets(db,shorts,longs):
    """Group the words found in ``db`` into named menu sets.

    Parameters:
        db: iterable of token records, each with 'word' and 'speaker_lang'.
        shorts: collection of words filed under 'OTHER - SHORT' when no
            substring pattern matches.
        longs: collection of words filed under 'OTHER - LONG' likewise.

    Returns:
        A list of (group name, sorted list of (label, word)) pairs. The
        label is the word itself when both L1 and L2 tokens exist, else the
        word tagged ' [L1]' or ' [L2]'. Words matching no group are
        reported with print() and left out.
    """
    # Checked in order; the first substring found in a word picks its group.
    patterns = (
        ('agg', 'AG:'),
        ('all', 'AL:'),
        ('egg', 'EG:'),
        ('eki', 'E:G'),
        ('aki', 'A:G'),
        ('ala', 'A:L'),
    )

    pairs = {(tok['word'], tok['speaker_lang']) for tok in db}
    l1_words = {word for word, lang in pairs if lang == 'L1'}
    l2_words = {word for word, lang in pairs if lang == 'L2'}

    def _menu_entry(word):
        # (display label, word), tagged when only one speaker group has it.
        in_l1 = word in l1_words
        in_l2 = word in l2_words
        if in_l1 and in_l2:
            return (word, word)
        if in_l1:
            return (f'{word} [L1]', word)
        if in_l2:
            return (f'{word} [L2]', word)
        return ('', '')

    def _group_for(word):
        # Group name for a word, or None when nothing applies.
        for fragment, name in patterns:
            if fragment in word:
                return name
        if word in shorts:
            return 'OTHER - SHORT'
        if word in longs:
            return 'OTHER - LONG'
        return None

    def _group_order(name):
        # Names containing a space sort by first letter, longer name first;
        # the rest sort by their letters, then by their final character.
        if ' ' in name:
            return (name[0], 1/len(name))
        return (name.replace(':',''), name[-1])

    grouped = defaultdict(list)
    for word in {word for word, _ in pairs}:
        name = _group_for(word)
        if name is None:
            print(f'something should not have happened: {word}')
        else:
            grouped[name].append(_menu_entry(word))

    return [(name, sorted(grouped[name]))
            for name in sorted(grouped, key = _group_order)]
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
# compile data for a token record
def get_tk_data(tk,shorts,longs):
    """Flatten one annotated token into a record of counts and durations.

    Parameters:
        tk: token dict with 'word', 'speaker_lang', and 'gold_annotation' /
            'mfa_annotation' dicts. Each annotation holds 'prevowel',
            'vowel' and 'postvowel' lists of {'start', 'end'} intervals,
            plus 'target_word_start' and 'target_word_end'.
        shorts: collection of words coded as short-vowel (vlen 0).
        longs: collection of words coded as long-vowel (vlen 1).

    Returns:
        dict with the word, speaker group, gold phone counts before and
        after the vowel, 'vlen', and for each source the pre-vowel, vowel,
        post-vowel and whole-word durations. A part with no intervals gets
        NaN as its duration.

    Raises:
        ValueError: if the word is in neither ``longs`` nor ``shorts``.
    """

    # merge intervals
    # from list of phones
    # to word part
    def _merge_intervals(plist):
        if not plist:
            return np.nan
        # Span from the first phone's start to the last phone's end.
        return plist[-1]['end'] - plist[0]['start']

    word = tk['word']
    if word in longs:
        vlen = 1
    elif word in shorts:
        vlen = 0
    else:
        # Raise rather than assert: with asserts disabled an unlisted word
        # would otherwise be silently coded as short.
        raise ValueError(f'word is in neither the long nor the short set: {word!r}')

    tkdat = {
        'word': word,
        'speaker_lang': tk['speaker_lang'],
        'n_pre_phone': len(tk['gold_annotation']['prevowel']),
        'n_post_phone': len(tk['gold_annotation']['postvowel']),
        'vlen': vlen,
    }

    for s in ('gold', 'mfa'):
        annot = tk[f'{s}_annotation']
        tkdat[f'{s}_pre_dur'] = _merge_intervals(annot['prevowel'])
        tkdat[f'{s}_v_dur'] = _merge_intervals(annot['vowel'])
        tkdat[f'{s}_post_dur'] = _merge_intervals(annot['postvowel'])
        tkdat[f'{s}_word_dur'] = annot['target_word_end'] - annot['target_word_start']

    return tkdat
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def prep_dat(d):
    """Return a copy of ``d`` with ratio columns added and onsets filled.

    For each source ('gold', 'mfa') this adds '<source>_ratio', the vowel
    duration divided by vowel plus post-vowel duration, and replaces
    missing '<source>_pre_dur' values with 0. The result is passed through
    ``convert_dtypes``; the input frame is left untouched.
    """
    out = d.copy()
    for source in ('gold', 'mfa'):
        vowel = out[f'{source}_v_dur']
        out[f'{source}_ratio'] = vowel / (vowel + out[f'{source}_post_dur'])

        # set absent onsets dur zero
        onset_col = f'{source}_pre_dur'
        out[onset_col] = out[onset_col].fillna(0)
    return out.convert_dtypes()
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def setup(annot_json):
    """Load the annotation file and prepare menu sets and the token table.

    Parameters:
        annot_json: path to the JSON annotation file, a list of token
            records.

    Returns:
        (sets, dat): the grouped word menus from ``make_sets`` and the
        DataFrame built from ``get_tk_data`` records and passed through
        ``prep_dat``.
    """

    # Words coded as long-vowel.
    longs = {'aki', 'ala', 'baki', 'bera', 'betri', 'blaki', 'breki',
             'brosir', 'dala', 'dreki', 'dvala', 'fala', 'fara', 'færa',
             'færi', 'gala', 'hausinn', 'jónas', 'katrín', 'kisa', 'koma',
             'leki', 'leyfa', 'maki', 'muna', 'nema', 'raki', 'sama',
             'speki', 'svala', 'sækja', 'sömu', 'taki', 'tala', 'tvisvar',
             'vala', 'veki', 'vinur', 'ása', 'þaki'}

    # Words coded as short-vowel.
    shorts = {'aggi', 'baggi', 'balla', 'beggi', 'eggi', 'farðu', 'fossinn',
              'færði', 'galla', 'hausnum', 'herra', 'jónsson', 'kaggi', 'kalla',
              'lalla', 'leggi', 'leyfðu', 'maggi', 'malla', 'mamma', 'missa',
              'mömmu', 'nærri', 'palla', 'raggi', 'skeggi', 'snemma', 'sunna',
              'tommi', 'veggi', 'vinnur', 'ásta'}

    # Explicit UTF-8: the records contain non-ASCII Icelandic words, and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(annot_json, 'r', encoding='utf-8') as handle:
        db = json.load(handle)

    sets = make_sets(db,shorts,longs)

    records = [get_tk_data(tk,shorts,longs) for tk in db]
    dat = prep_dat(pd.DataFrame.from_records(records))

    return sets,dat
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def vgraph(dat1,l1,src1,lab1,dat2,l2,src2,lab2):
    """Scatter-plot vowel against following-consonant durations.

    Parameters:
        dat1, dat2: token tables with 'vlen' and, for the chosen source,
            '<src>_v_dur', '<src>_post_dur' and '<src>_ratio' columns.
            Durations are multiplied by 1000 and the axes are labelled in
            ms (presumably the inputs are in seconds; confirm against the
            annotation data).
        l1, l2: speaker group of each series: 'L1', 'L2' or 'All'.
        src1, src2: 'gold' or anything else (treated as MFA); this selects
            the columns read and the marker shape.
        lab1, lab2: legend labels; the mean ratio is appended to each.
            A falsy ``lab2`` means there is no second series, and the other
            second-series arguments are then ignored.

    Returns:
        The matplotlib Figure.
    """
    # Build the Figure directly instead of through pyplot. pyplot keeps a
    # global reference to every figure it creates until it is closed, which
    # leaks memory when a server calls this once per request.
    from matplotlib.figure import Figure

    # color by length + speaker group
    ccs = { "lAll" : (0.0, 0.749, 1.0),
            "lL1" : (0.122, 0.467, 0.706),
            "lL2" : (0.282, 0.82, 0.8),
            "sAll" : (0.89, 0.467, 0.761),
            "sL1" : (0.863, 0.078, 0.235),
            "sL2" : (0.859, 0.439, 0.576),
            "xAll" : (0.988, 0.69, 0.004),
            "xL1" : (0.984, 0.49, 0.027),
            "xL2" : (0.969, 0.835, 0.376)}

    def _gprep(df,l,s):
        # Durations scaled by 1000, the mean ratio, and the series color.
        vdurs = np.array(df[f'{s}_v_dur'])*1000
        cdurs = np.array(df[f'{s}_post_dur'])*1000
        rto = np.mean(df[f'{s}_ratio'])

        # 's' = all short, 'l' = all long, 'x' = mixed lengths.
        n_long = sum(df['vlen'])
        if n_long == 0:
            vl = 's'
        elif n_long == df.shape[0]:
            vl = 'l'
        else:
            vl = 'x'

        return vdurs, cdurs, rto, ccs[f'{vl}{l}']

    fig = Figure(figsize=(9,7))
    ax = fig.subplots()
    ax.set_xlim(0.0,350)
    ax.set_ylim(0.0,350)

    # (data, speaker group, source, label, gold marker, MFA marker, fill alpha)
    series = [(dat1, l1, src1, lab1, '^', '<', .7)]
    if lab2:
        series.append((dat2, l2, src2, lab2, 'v', '>', .05))

    for df, lang, src, lab, gold_mk, mfa_mk, alpha in series:
        vd, cd, ra, cl = _gprep(df, lang, src)
        ax.scatter(vd, cd,
                   marker = gold_mk if src == 'gold' else mfa_mk,
                   label = f'{lab}\n Ratio: {ra:.3f}',
                   c = [cl + (alpha,)], edgecolors = [cl])

    ax.set_title("Stressed vowel & following consonant(s) duration" )
    ax.set_xlabel("Vowel duration (ms)")
    ax.set_ylabel("Consonant duration (ms)")
    fig.legend(loc=7)

    # Diagonal where vowel and consonant durations are equal.
    ax.axline((0,0),slope=1,color="darkgray")

    fig.tight_layout()
    # Leave room on the right for the figure-level legend.
    fig.subplots_adjust(right=0.75)

    return fig
|
| 207 |
+
|