Spaces:
Build error
Build error
plot per language
Browse files
app.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
from ctypes.wintypes import LANGID
|
|
|
|
| 2 |
from email.policy import default
|
| 3 |
import pycountry
|
| 4 |
import os
|
|
@@ -179,7 +180,16 @@ def get_metadata_json(path):
|
|
| 179 |
except Exception:
|
| 180 |
return []
|
| 181 |
|
| 182 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
repo.git_pull()
|
| 184 |
REPOSITORY_DATA_DIR = os.path.join(REPOSITORY_DIR,'data')
|
| 185 |
repo_recordings = [os.path.join(REPOSITORY_DATA_DIR,f.name) for f in os.scandir(REPOSITORY_DATA_DIR)] if os.path.isdir(REPOSITORY_DATA_DIR) else []
|
|
@@ -188,29 +198,8 @@ def show_records():
|
|
| 188 |
audio_repo = [a.replace('data/data/','https://huggingface.co/datasets/chrisjay/crowd-speech-africa/resolve/main/data/') for a in audio_repo]
|
| 189 |
metadata_all = [get_metadata_json(os.path.join(f,'metadata.jsonl')) for f in repo_recordings]
|
| 190 |
metadata_all = [m for m in metadata_all if m!=[]]
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
langs=[m['language_name'] for m in metadata_all]
|
| 194 |
-
lang_dict = Counter(langs)
|
| 195 |
-
lang_dict.update({'All others':0})
|
| 196 |
-
all_langs = list(lang_dict.keys())
|
| 197 |
-
langs_count = [lang_dict[k] for k in all_langs]
|
| 198 |
-
y_pos = np.arange(len(all_langs))
|
| 199 |
-
plt.barh(all_langs, langs_count)
|
| 200 |
-
plt.ylabel("Language")
|
| 201 |
-
plt.xlabel('Number of audio samples')
|
| 202 |
-
plt.title('Distribution of audio samples over languages')
|
| 203 |
-
|
| 204 |
-
#audios = [a for a in audios_all]
|
| 205 |
-
#texts = [m['text'] for m in metadata_all]
|
| 206 |
-
#numbers = [m['number'] for m in metadata_all]
|
| 207 |
|
| 208 |
-
html = f"""<div class="infoPoint">
|
| 209 |
-
<h1> Hooray! We have collected {len(metadata_all)} samples!</h1>
|
| 210 |
-
"""
|
| 211 |
-
|
| 212 |
-
return html,plt
|
| 213 |
-
|
| 214 |
|
| 215 |
|
| 216 |
def display_records():
|
|
@@ -315,9 +304,62 @@ with block:
|
|
| 315 |
</div>
|
| 316 |
""")
|
| 317 |
plot = gr.Plot(type="matplotlib")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 318 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 319 |
#listen = gr.Button("Listen")
|
| 320 |
listen_tab.select(show_records,inputs=[],outputs=[display_html,plot])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 321 |
gr.Markdown(ARTICLE)
|
| 322 |
|
| 323 |
block.launch()
|
|
|
|
| 1 |
from ctypes.wintypes import LANGID
|
| 2 |
+
from curses import meta
|
| 3 |
from email.policy import default
|
| 4 |
import pycountry
|
| 5 |
import os
|
|
|
|
| 180 |
except Exception:
|
| 181 |
return []
|
| 182 |
|
| 183 |
+
|
| 184 |
+
def plot_bar(value, name, x_name, y_name, title):
    """Draw a horizontal bar chart on its own figure and return that figure.

    Args:
        value: Bar lengths, one per entry in ``name``.
        name: Category labels, one bar per label.
        x_name: Text for the x-axis label.
        y_name: Text for the y-axis label.
        title: Chart title.

    Returns:
        The matplotlib figure that holds the chart.
    """
    # Every call gets a fresh figure. Drawing through the implicit pyplot
    # state put successive charts on one shared canvas, and returning the
    # `plt` module meant two charts returned together were the same object.
    fig, ax = plt.subplots()
    ax.barh(name, value)
    ax.set_ylabel(y_name)
    ax.set_xlabel(x_name)
    ax.set_title(title)
    return fig
|
| 191 |
+
|
| 192 |
+
def get_metadata_of_dataset():
|
| 193 |
repo.git_pull()
|
| 194 |
REPOSITORY_DATA_DIR = os.path.join(REPOSITORY_DIR,'data')
|
| 195 |
repo_recordings = [os.path.join(REPOSITORY_DATA_DIR,f.name) for f in os.scandir(REPOSITORY_DATA_DIR)] if os.path.isdir(REPOSITORY_DATA_DIR) else []
|
|
|
|
| 198 |
audio_repo = [a.replace('data/data/','https://huggingface.co/datasets/chrisjay/crowd-speech-africa/resolve/main/data/') for a in audio_repo]
|
| 199 |
metadata_all = [get_metadata_json(os.path.join(f,'metadata.jsonl')) for f in repo_recordings]
|
| 200 |
metadata_all = [m for m in metadata_all if m!=[]]
|
| 201 |
+
return metadata_all
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
|
| 204 |
|
| 205 |
def display_records():
|
|
|
|
| 304 |
</div>
|
| 305 |
""")
|
| 306 |
plot = gr.Plot(type="matplotlib")
|
| 307 |
+
metadata_all = get_metadata_of_dataset()
|
| 308 |
+
|
| 309 |
+
def show_records():
    """Build the summary banner and the samples-per-language bar chart.

    Reads ``metadata_all`` from the enclosing scope.

    Returns:
        A ``(html, plot)`` pair: the HTML snippet announcing how many samples
        were collected, and whatever ``plot_bar`` returns for the per-language
        counts.
    """
    # Only the language names are needed here. The previous version also
    # built an unused list of genders, which could raise KeyError on records
    # that have no 'gender' field.
    lang_dict = Counter(m['language_name'] for m in metadata_all)
    # Adds an 'All others' entry with a count of zero so it shows on the chart.
    lang_dict.update({'All others': 0})
    all_langs = list(lang_dict)
    langs_count = [lang_dict[k] for k in all_langs]
    plt_ = plot_bar(
        langs_count,
        all_langs,
        'Number of audio samples',
        "Language",
        'Distribution of audio samples over languages',
    )
    html = f"""<div class="infoPoint">
<h1> Hooray! We have collected {len(metadata_all)} samples!</h1>
"""

    return html, plt_
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
|
| 326 |
+
languages = list(Counter([m['language_name'] for m in metadata_all]).keys())
|
| 327 |
+
for language in languages:
|
| 328 |
+
with gr.Row() as row_lang:
|
| 329 |
+
metadata_for_language = [m for m in metadata_all if m['language_name']==language]
|
| 330 |
+
gender_for_language = [m['gender'] for m in metadata_for_language]
|
| 331 |
+
digits_for_language = [m['number'] for m in metadata_for_language]
|
| 332 |
+
gender_for_language = [g if g!="" else 'Not given' for g in gender_for_language]
|
| 333 |
+
|
| 334 |
+
digits_dict = Counter(digits_for_language)
|
| 335 |
+
gender_dict = Counter(gender_for_language)
|
| 336 |
+
|
| 337 |
+
digits_name_for_language = list(digits_dict.keys())
|
| 338 |
+
digits_count_for_language = [digits_dict[k] for k in digits_name_for_language]
|
| 339 |
+
|
| 340 |
+
gender_name_for_language = list(gender_dict.keys())
|
| 341 |
+
gender_count_for_language = [gender_dict[k] for k in gender_name_for_language]
|
| 342 |
|
| 343 |
+
plot_digits = gr.Plot(type="matplotlib")
|
| 344 |
+
plot_gender = gr.Plot(type="matplotlib")
|
| 345 |
+
|
| 346 |
+
def plot_metadata_for_language(
    language=language,
    digit_names=digits_name_for_language,
    digit_counts=digits_count_for_language,
    gender_names=gender_name_for_language,
    gender_counts=gender_count_for_language,
):
    """Plot the digit and gender distributions for one language.

    The parameters exist only to capture the enclosing loop's values at
    definition time; the function is meant to be called with no arguments.
    Without this binding, every callback created by the loop would read the
    loop variables when it runs and plot the last language's data.

    Returns:
        A ``(digits_plot, gender_plot)`` pair, each produced by ``plot_bar``.
    """
    plt_digits = plot_bar(
        digit_counts,
        digit_names,
        'Number of audio samples',
        "Digit",
        f"Distribution of audio samples over digits for {language.upper()} ",
    )
    # The title previously said "over digits" on the gender chart as well.
    plt_gender = plot_bar(
        gender_counts,
        gender_names,
        'Number of audio samples',
        "Gender",
        f"Distribution of audio samples over genders for {language.upper()}",
    )
    return plt_digits, plt_gender
|
| 350 |
+
|
| 351 |
+
|
| 352 |
+
row_lang.select(plot_metadata_for_language,inputs=[],outputs=[plot_digits,plot_gender])
|
| 353 |
+
|
| 354 |
+
|
| 355 |
#listen = gr.Button("Listen")
|
| 356 |
listen_tab.select(show_records,inputs=[],outputs=[display_html,plot])
|
| 357 |
+
|
| 358 |
+
|
| 359 |
+
# Have a list of the languages. lang
|
| 360 |
+
# We want digits per language and gender per language
|
| 361 |
+
# for l in range(len(lang),step =4)
|
| 362 |
+
# with Row().... d
|
| 363 |
gr.Markdown(ARTICLE)
|
| 364 |
|
| 365 |
block.launch()
|
data
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
Subproject commit
|
|
|
|
| 1 |
+
Subproject commit af4ec56533825ccc0877c32d8ad73301181e8e98
|