Spaces · Runtime error
Commit b7a34b6 · Parent: 5f50e60
Removed ConST
app.py CHANGED
@@ -93,9 +93,6 @@ os.system("pip install git+https://github.com/openai/whisper.git")
 #os.system("mkdir -p data checkpoint")


-huggingface_model_dir = snapshot_download(repo_id="ReneeYe/ConST_en2x_models")
-print(huggingface_model_dir)
-


 def restrict_src_options(model_type):
@@ -225,141 +222,13 @@ def predictWithmRASP2(input_audio, src_language, tgt_language):
     translation = (' '.join(translation.split(' ')[1:])).strip()

     mt_time = time.time() - mt_start_time
-    print(f"Took {mt_time} to do Machine Translation")
-    #print(model_name)

-    #with open("output", 'r') as r:
-    #    translation = "Undefined"
-    #    translation = (' '.join(r.readline().split(' ')[1:])).strip()
-    #    print(translation)

     # Returns the text
-    print("returning transcript: " + transcript + " and the translation: " + translation)
     return transcript, translation



-# Helper methods for ConST (as written in https://huggingface.co/spaces/ReneeYe/ConST-speech2text-translator/blob/main/app.py)
-
-
-def convert_audio_to_16k_wav(audio_input):
-    sound = AudioSegment.from_file(audio_input)
-    sample_rate = sound.frame_rate
-    num_channels = sound.channels
-    num_frames = int(sound.frame_count())
-    filename = audio_input.split("/")[-1]
-    print("original file is at:", audio_input)
-    if (num_channels > 1) or (sample_rate != 16000):  # convert to mono-channel 16k wav
-        if num_channels > 1:
-            sound = sound.set_channels(1)
-        if sample_rate != 16000:
-            sound = sound.set_frame_rate(16000)
-            num_frames = int(sound.frame_count())
-        filename = filename.replace(".wav", "") + "_16k.wav"
-        sound.export(f"data/{filename}", format="wav")
-    else:
-        shutil.copy(audio_input, f'data/{filename}')
-    return filename, num_frames
-
-
-def prepare_tsv(file_name, n_frame, language, task="ST"):
-    tgt_lang = language_id_lookup[language]
-    with open("data/test_case.tsv", "w") as f:
-        f.write("id\taudio\tn_frames\ttgt_text\tspeaker\tsrc_lang\ttgt_lang\tsrc_text\n")
-        f.write(f"sample\t{file_name}\t{n_frame}\tThis is in {tgt_lang}.\tspk.1\ten\t{tgt_lang}\tThis is English.\n")
-
-
-def get_vocab_and_yaml(language):
-    tgt_lang = language_id_lookup[language]
-    # get: spm_ende.model and spm_ende.txt, and save to data/xxx
-    # if exist, no need to download
-    shutil.copy(os.path.join(huggingface_model_dir, f"vocabulary/spm_en{tgt_lang}.model"), "./data")
-    shutil.copy(os.path.join(huggingface_model_dir, f"vocabulary/spm_en{tgt_lang}.txt"), "./data")
-
-    # write yaml file
-    abs_path = os.popen("pwd").read().strip()
-    yaml_dict = LANG_GEN_SETUPS[tgt_lang]
-    yaml_dict["input_channels"] = 1
-    yaml_dict["use_audio_input"] = True
-    yaml_dict["prepend_tgt_lang_tag"] = True
-    yaml_dict["prepend_src_lang_tag"] = True
-    yaml_dict["audio_root"] = os.path.join(abs_path, "data")
-    yaml_dict["vocab_filename"] = f"spm_en{tgt_lang}.txt"
-    yaml_dict["bpe_tokenizer"] = {"bpe": "sentencepiece",
-                                  "sentencepiece_model": os.path.join(abs_path, f"data/spm_en{tgt_lang}.model")}
-    with open("data/config.yaml", "w") as f:
-        yaml.dump(yaml_dict, f)
-
-
-def get_model(language):
-    # download models to checkpoint/xxx
-    return os.path.join(huggingface_model_dir, f"models/const_en{language_id_lookup[language]}.pt")
-
-
-def generate(model_path):
-    os.system(f"python3 fairseq/fairseq_cli/generate.py data/ --gen-subset test_case --task speech_to_text --prefix-size 1 \
-              --max-source-positions 4000000 \
-              --config-yaml config.yaml --path {model_path} | tee temp.txt")
-    print("No problem with 1st line")
-    output = os.popen("grep ^D temp.txt | sort -n -k 2 -t '-' | cut -f 3")
-    return output.read().strip()
-
-
-def post_processing(raw_sentence):
-    output_sentence = raw_sentence
-    if ":" in raw_sentence:
-        splited_sent = raw_sentence.split(":")
-        if len(splited_sent) == 2:
-            prefix = splited_sent[0].strip()
-            if len(prefix) <= 3:
-                output_sentence = splited_sent[1].strip()
-            elif ("(" in prefix) and (")" in prefix):
-                bgm = re.findall(r"\(.*?\)", prefix)[0]
-                if len(prefix.replace(bgm, "").strip()) <= 3:
-                    output_sentence = splited_sent[1].strip()
-            elif len(splited_sent[1].strip()) > 8:
-                output_sentence = splited_sent[1].strip()
-
-    elif ("(" in raw_sentence) and (")" in raw_sentence):
-        bgm_list = re.findall(r"\(.*?\)", raw_sentence)
-        for bgm in bgm_list:
-            if len(raw_sentence.replace(bgm, "").strip()) > 5:
-                output_sentence = output_sentence.replace(bgm, "").strip()
-        if len(output_sentence) <= 5:
-            output_sentence = raw_sentence
-    return output_sentence
-
-
-def remove_temp_files(audio_file):
-    os.remove("temp.txt")
-    os.remove("data/test_case.tsv")
-    os.remove(f"data/{audio_file}")
-
-
-
-def error_output(language):
-    return f"Fail to translate the audio into {language}, you may use the examples I provide."
-
-# Predicting the translation with ConST
-def predictWithConST(audio_file, language):
-    try:
-        converted_audio_file, n_frame = convert_audio_to_16k_wav(audio_file)
-        prepare_tsv(converted_audio_file, n_frame, language)
-        get_vocab_and_yaml(language)
-        model_path = get_model(language)
-        print("This is the model path: " + model_path)
-        generate_model_path = generate(model_path)
-        print("No problem generating model path")
-        generated_output = post_processing(generate_model_path)
-        print("No problem generating output")
-        remove_temp_files(converted_audio_file)
-        print("No problem removing_temp")
-        return generated_output
-    except:
-        traceback.print_exc()
-        return error_output(language)
-
-
 title = "Demo for Speech Translation (Whisper+mRASP2 and ConST)"

 description = """
@@ -381,7 +250,7 @@ with demo:
     gr.Markdown("###" + description)
     with gr.Row():
         with gr.Column():
-            model_type = gr.Dropdown(['Whisper+mRASP2'
+            model_type = gr.Dropdown(['Whisper+mRASP2'], type = "value", value = 'Whisper+mRASP2', label = "Select the model you want to use.")
             audio_file = gr.Audio(label="Upload Speech", source="upload", type="filepath")
             src_language = gr.Dropdown(['Arabic',
                                         'Chinese',
@@ -417,4 +286,4 @@ with demo:
     submit_button.click(fn = predict, inputs=[audio_file, src_language, tgt_language_mRASP, tgt_language_ConST, model_type, mic_audio], outputs=[transcript, translate, translated_speech])
     switch_lang_button.click(switchLang, [src_language, tgt_language_mRASP], [src_language, tgt_language_mRASP])

-demo.launch(
+demo.launch()
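
Note on the removed generate() helper above: fairseq's generate.py logs one detokenized hypothesis per sample as a "D-<id>" line, which is what the shell pipeline `grep ^D temp.txt | sort -n -k 2 -t '-' | cut -f 3` extracts in sample order. A minimal pure-Python sketch of the same extraction (read_hypotheses is a hypothetical name; it assumes fairseq-generate's usual tab-separated "D-<id>\t<score>\t<text>" log format):

    # Hypothetical stand-in for the removed grep/sort/cut pipeline (assumes
    # fairseq-generate log lines of the form "D-<id>\t<score>\t<detokenized text>").
    def read_hypotheses(log_path="temp.txt"):
        hyps = []
        with open(log_path) as f:
            for line in f:
                if line.startswith("D-"):  # detokenized hypothesis lines
                    prefix, _score, text = line.rstrip("\n").split("\t", 2)
                    hyps.append((int(prefix.split("-", 1)[1]), text))
        # numeric sort by sample id mirrors `sort -n -k 2 -t '-'`
        return [text for _, text in sorted(hyps)]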