Spaces · Runtime error
Commit b7a34b6 · Parent: 5f50e60
Removed ConST
app.py CHANGED
@@ -93,9 +93,6 @@ os.system("pip install git+https://github.com/openai/whisper.git")
 #os.system("mkdir -p data checkpoint")


-huggingface_model_dir = snapshot_download(repo_id="ReneeYe/ConST_en2x_models")
-print(huggingface_model_dir)
-


 def restrict_src_options(model_type):
@@ -225,141 +222,13 @@ def predictWithmRASP2(input_audio, src_language, tgt_language):
     translation = (' '.join(translation.split(' ')[1:])).strip()

     mt_time = time.time() - mt_start_time
-    print(f"Took {mt_time} to do Machine Translation")
-    #print(model_name)

-    #with open("output", 'r') as r:
-    #    translation = "Undefined"
-    #    translation = (' '.join(r.readline().split(' ')[1:])).strip()
-    #    print(translation)

     # Returns the text
-    print("returning transcript: " + transcript + " and the translation: " + translation)
     return transcript, translation



-# Helper methods for ConST (as written in https://huggingface.co/spaces/ReneeYe/ConST-speech2text-translator/blob/main/app.py)
-
-
-def convert_audio_to_16k_wav(audio_input):
-    sound = AudioSegment.from_file(audio_input)
-    sample_rate = sound.frame_rate
-    num_channels = sound.channels
-    num_frames = int(sound.frame_count())
-    filename = audio_input.split("/")[-1]
-    print("original file is at:", audio_input)
-    if (num_channels > 1) or (sample_rate != 16000):  # convert to mono-channel 16k wav
-        if num_channels > 1:
-            sound = sound.set_channels(1)
-        if sample_rate != 16000:
-            sound = sound.set_frame_rate(16000)
-            num_frames = int(sound.frame_count())
-        filename = filename.replace(".wav", "") + "_16k.wav"
-        sound.export(f"data/{filename}", format="wav")
-    else:
-        shutil.copy(audio_input, f'data/{filename}')
-    return filename, num_frames
-
-
-def prepare_tsv(file_name, n_frame, language, task="ST"):
-    tgt_lang = language_id_lookup[language]
-    with open("data/test_case.tsv", "w") as f:
-        f.write("id\taudio\tn_frames\ttgt_text\tspeaker\tsrc_lang\ttgt_lang\tsrc_text\n")
-        f.write(f"sample\t{file_name}\t{n_frame}\tThis is in {tgt_lang}.\tspk.1\ten\t{tgt_lang}\tThis is English.\n")
-
-
-def get_vocab_and_yaml(language):
-    tgt_lang = language_id_lookup[language]
-    # get: spm_ende.model and spm_ende.txt, and save to data/xxx
-    # if exist, no need to download
-    shutil.copy(os.path.join(huggingface_model_dir, f"vocabulary/spm_en{tgt_lang}.model"), "./data")
-    shutil.copy(os.path.join(huggingface_model_dir, f"vocabulary/spm_en{tgt_lang}.txt"), "./data")
-
-    # write yaml file
-    abs_path = os.popen("pwd").read().strip()
-    yaml_dict = LANG_GEN_SETUPS[tgt_lang]
-    yaml_dict["input_channels"] = 1
-    yaml_dict["use_audio_input"] = True
-    yaml_dict["prepend_tgt_lang_tag"] = True
-    yaml_dict["prepend_src_lang_tag"] = True
-    yaml_dict["audio_root"] = os.path.join(abs_path, "data")
-    yaml_dict["vocab_filename"] = f"spm_en{tgt_lang}.txt"
-    yaml_dict["bpe_tokenizer"] = {"bpe": "sentencepiece",
-                                  "sentencepiece_model": os.path.join(abs_path, f"data/spm_en{tgt_lang}.model")}
-    with open("data/config.yaml", "w") as f:
-        yaml.dump(yaml_dict, f)
-
-
-def get_model(language):
-    # download models to checkpoint/xxx
-    return os.path.join(huggingface_model_dir, f"models/const_en{language_id_lookup[language]}.pt")
-
-
-def generate(model_path):
-    os.system(f"python3 fairseq/fairseq_cli/generate.py data/ --gen-subset test_case --task speech_to_text --prefix-size 1 \
-              --max-source-positions 4000000 \
-              --config-yaml config.yaml --path {model_path} | tee temp.txt")
-    print("No problem with 1st line")
-    output = os.popen("grep ^D temp.txt | sort -n -k 2 -t '-' | cut -f 3")
-    return output.read().strip()
-
-
-def post_processing(raw_sentence):
-    output_sentence = raw_sentence
-    if ":" in raw_sentence:
-        splited_sent = raw_sentence.split(":")
-        if len(splited_sent) == 2:
-            prefix = splited_sent[0].strip()
-            if len(prefix) <= 3:
-                output_sentence = splited_sent[1].strip()
-            elif ("(" in prefix) and (")" in prefix):
-                bgm = re.findall(r"\(.*?\)", prefix)[0]
-                if len(prefix.replace(bgm, "").strip()) <= 3:
-                    output_sentence = splited_sent[1].strip()
-            elif len(splited_sent[1].strip()) > 8:
-                output_sentence = splited_sent[1].strip()
-
-    elif ("(" in raw_sentence) and (")" in raw_sentence):
-        bgm_list = re.findall(r"\(.*?\)", raw_sentence)
-        for bgm in bgm_list:
-            if len(raw_sentence.replace(bgm, "").strip()) > 5:
-                output_sentence = output_sentence.replace(bgm, "").strip()
-        if len(output_sentence) <= 5:
-            output_sentence = raw_sentence
-    return output_sentence
-
-
-def remove_temp_files(audio_file):
-    os.remove("temp.txt")
-    os.remove("data/test_case.tsv")
-    os.remove(f"data/{audio_file}")
-
-
-
-def error_output(language):
-    return f"Fail to translate the audio into {language}, you may use the examples I provide."
-
-# Predicting the translation with ConST
-def predictWithConST(audio_file, language):
-    try:
-        converted_audio_file, n_frame = convert_audio_to_16k_wav(audio_file)
-        prepare_tsv(converted_audio_file, n_frame, language)
-        get_vocab_and_yaml(language)
-        model_path = get_model(language)
-        print("This is the model path: " + model_path)
-        generate_model_path = generate(model_path)
-        print("No problem generating model path")
-        generated_output = post_processing(generate_model_path)
-        print("No problem generating output")
-        remove_temp_files(converted_audio_file)
-        print("No problem removing_temp")
-        return generated_output
-    except:
-        traceback.print_exc()
-        return error_output(language)
-
-
 title = "Demo for Speech Translation (Whisper+mRASP2 and ConST)"

 description = """
@@ -381,7 +250,7 @@ with demo:
     gr.Markdown("###" + description)
     with gr.Row():
         with gr.Column():
-            model_type = gr.Dropdown(['Whisper+mRASP2'
+            model_type = gr.Dropdown(['Whisper+mRASP2'], type = "value", value = 'Whisper+mRASP2', label = "Select the model you want to use.")
             audio_file = gr.Audio(label="Upload Speech", source="upload", type="filepath")
             src_language = gr.Dropdown(['Arabic',
                                         'Chinese',
@@ -417,4 +286,4 @@ with demo:
     submit_button.click(fn = predict, inputs=[audio_file, src_language, tgt_language_mRASP, tgt_language_ConST, model_type, mic_audio], outputs=[transcript, translate, translated_speech])
     switch_lang_button.click(switchLang, [src_language, tgt_language_mRASP], [src_language, tgt_language_mRASP])

-demo.launch(
+demo.launch()
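
Note on the removed generate() helper above: fairseq's generate.py logs one detokenized hypothesis per sample as a "D-<id>" line, which is what the shell pipeline `grep ^D temp.txt | sort -n -k 2 -t '-' | cut -f 3` extracts in sample order. A minimal pure-Python sketch of the same extraction (read_hypotheses is a hypothetical name; it assumes fairseq-generate's usual tab-separated "D-<id>\t<score>\t<text>" log format):

    # Hypothetical stand-in for the removed grep/sort/cut pipeline (assumes
    # fairseq-generate log lines of the form "D-<id>\t<score>\t<detokenized text>").
    def read_hypotheses(log_path="temp.txt"):
        hyps = []
        with open(log_path) as f:
            for line in f:
                if line.startswith("D-"):  # detokenized hypothesis lines
                    prefix, _score, text = line.rstrip("\n").split("\t", 2)
                    hyps.append((int(prefix.split("-", 1)[1]), text))
        # numeric sort by sample id mirrors `sort -n -k 2 -t '-'`
        return [text for _, text in sorted(hyps)]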