Spaces:

Yilin0601
/

Multimodal_Language_Learning_Aid

Running

App Files Community

Yilin0601 committed on Mar 26

Commit

e2fc711

verified ·

1 Parent(s): 1ee4794

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -35

app.py CHANGED Viewed

@@ -5,17 +5,17 @@ import librosa
 from transformers import pipeline, VitsModel, AutoTokenizer
 import scipy  # if needed for processing
-# -----------------------------------------------
 # 1. ASR Pipeline (English)
-# -----------------------------------------------
 asr = pipeline(
     "automatic-speech-recognition",
     model="facebook/wav2vec2-base-960h"
 )
-# -----------------------------------------------
 # 2. Translation Models (3 languages)
-# -----------------------------------------------
 translation_models = {
     "Spanish": "Helsinki-NLP/opus-mt-en-es",
     "Chinese": "Helsinki-NLP/opus-mt-en-zh",
@@ -28,34 +28,32 @@ translation_tasks = {
     "Japanese": "translation_en_to_ja"
 }
-# -----------------------------------------------
-# 3. TTS Model Configurations (All VITS)
-# -----------------------------------------------
-# Make sure these model IDs exist on Hugging Face.
 tts_config = {
     "Spanish": {
-        "model_id": "facebook/mms-tts-spa",
-        "architecture": "vits"
-    },
-    "Chinese": {
-        "model_id": "facebook/mms-tts-che",
         "architecture": "vits"
     },
     "Japanese": {
-        "model_id": "facebook/mms-tts-jpn",
         "architecture": "vits"
     }
 }
-# -----------------------------------------------
 # 4. Caches
-# -----------------------------------------------
 translator_cache = {}
 tts_model_cache = {}  # store (model, tokenizer, architecture)
-# -----------------------------------------------
 # 5. Translator Helper
-# -----------------------------------------------
 def get_translator(lang):
     if lang in translator_cache:
         return translator_cache[lang]
@@ -65,25 +63,27 @@ def get_translator(lang):
     translator_cache[lang] = translator
     return translator
-# -----------------------------------------------
 # 6. TTS Loading Helper
-# -----------------------------------------------
 def get_tts_model(lang):
     """
     Loads (model, tokenizer, architecture) from Hugging Face once, then caches.
     """
     if lang in tts_model_cache:
         return tts_model_cache[lang]
     config = tts_config.get(lang)
     if config is None:
         raise ValueError(f"No TTS config found for language: {lang}")
     model_id = config["model_id"]
     arch = config["architecture"]
     try:
-        # Since arch == "vits" for all three languages, we load VitsModel + AutoTokenizer
         model = VitsModel.from_pretrained(model_id)
         tokenizer = AutoTokenizer.from_pretrained(model_id)
     except Exception as e:
@@ -92,9 +92,9 @@ def get_tts_model(lang):
     tts_model_cache[lang] = (model, tokenizer, arch)
     return tts_model_cache[lang]
-# -----------------------------------------------
 # 7. TTS Inference Helper
-# -----------------------------------------------
 def run_tts_inference(lang, text):
     """
     Generates waveform using the loaded TTS model and tokenizer.
@@ -119,14 +119,14 @@ def run_tts_inference(lang, text):
     sample_rate = 16000
     return (sample_rate, waveform)
-# -----------------------------------------------
 # 8. Prediction Function
-# -----------------------------------------------
 def predict(audio, text, target_language):
     """
     1. Obtain English text (from text input or ASR).
     2. Translate English -> target_language.
-    3. Run VITS-based TTS for that language.
     """
     # Step 1: English text
     if text.strip():
@@ -142,7 +142,7 @@ def predict(audio, text, target_language):
         if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:
             audio_data = np.mean(audio_data, axis=1)
-        # Resample to 16k
         if sample_rate != 16000:
             audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
@@ -160,17 +160,20 @@ def predict(audio, text, target_language):
     except Exception as e:
         return english_text, f"Translation error: {e}", None
-    # Step 3: TTS
     try:
         sample_rate, waveform = run_tts_inference(target_language, translated_text)
     except Exception as e:
         return english_text, translated_text, f"TTS error: {e}"
     return english_text, translated_text, (sample_rate, waveform)
-# -----------------------------------------------
 # 9. Gradio Interface
-# -----------------------------------------------
 iface = gr.Interface(
     fn=predict,
     inputs=[
@@ -181,19 +184,20 @@ iface = gr.Interface(
     outputs=[
         gr.Textbox(label="English Transcription"),
         gr.Textbox(label="Translation (Target Language)"),
-        gr.Audio(label="Synthesized Speech in Target Language")
     ],
     title="Multimodal Language Learning Aid (MMS TTS / VITS)",
     description=(
         "This app:\n"
         "1. Transcribes English speech (via ASR) or accepts English text.\n"
         "2. Translates to Spanish, Chinese, or Japanese (Helsinki-NLP).\n"
-        "3. Synthesizes speech with VITS-based MMS TTS models.\n\n"
-        "Note: Ensure the MMS model IDs exist on Hugging Face. If not, you'll see an error.\n"
-        "Record/upload English audio or enter text, then select a target language."
     ),
     allow_flagging="never"
 )
 if __name__ == "__main__":
-    iface.launch()

 from transformers import pipeline, VitsModel, AutoTokenizer
 import scipy  # if needed for processing
+# ------------------------------------------------------
 # 1. ASR Pipeline (English)
+# ------------------------------------------------------
 asr = pipeline(
     "automatic-speech-recognition",
     model="facebook/wav2vec2-base-960h"
 )
+# ------------------------------------------------------
 # 2. Translation Models (3 languages)
+# ------------------------------------------------------
 translation_models = {
     "Spanish": "Helsinki-NLP/opus-mt-en-es",
     "Chinese": "Helsinki-NLP/opus-mt-en-zh",
     "Japanese": "translation_en_to_ja"
 }
+# ------------------------------------------------------
+# 3. TTS Model Configurations
+#    NOTE: MMS does not provide a Mandarin TTS model,
+#    so we skip TTS for Chinese.
+# ------------------------------------------------------
 tts_config = {
     "Spanish": {
+        "model_id": "facebook/mms-tts-spa",  # MMS Spanish
         "architecture": "vits"
     },
+    "Chinese": None,  # No MMS TTS for Chinese
     "Japanese": {
+        "model_id": "facebook/mms-tts-jpn",  # MMS Japanese
         "architecture": "vits"
     }
 }
+# ------------------------------------------------------
 # 4. Caches
+# ------------------------------------------------------
 translator_cache = {}
 tts_model_cache = {}  # store (model, tokenizer, architecture)
+# ------------------------------------------------------
 # 5. Translator Helper
+# ------------------------------------------------------
 def get_translator(lang):
     if lang in translator_cache:
         return translator_cache[lang]
     translator_cache[lang] = translator
     return translator
+# ------------------------------------------------------
 # 6. TTS Loading Helper
+# ------------------------------------------------------
 def get_tts_model(lang):
     """
     Loads (model, tokenizer, architecture) from Hugging Face once, then caches.
+    If no config is found (e.g. for Chinese), raises ValueError.
     """
     if lang in tts_model_cache:
         return tts_model_cache[lang]
     config = tts_config.get(lang)
     if config is None:
+        # No TTS model for this language
         raise ValueError(f"No TTS config found for language: {lang}")
     model_id = config["model_id"]
     arch = config["architecture"]
     try:
+        # Since arch == "vits" for these examples, load VitsModel + AutoTokenizer
         model = VitsModel.from_pretrained(model_id)
         tokenizer = AutoTokenizer.from_pretrained(model_id)
     except Exception as e:
     tts_model_cache[lang] = (model, tokenizer, arch)
     return tts_model_cache[lang]
+# ------------------------------------------------------
 # 7. TTS Inference Helper
+# ------------------------------------------------------
 def run_tts_inference(lang, text):
     """
     Generates waveform using the loaded TTS model and tokenizer.
     sample_rate = 16000
     return (sample_rate, waveform)
+# ------------------------------------------------------
 # 8. Prediction Function
+# ------------------------------------------------------
 def predict(audio, text, target_language):
     """
     1. Obtain English text (from text input or ASR).
     2. Translate English -> target_language.
+    3. Run VITS-based TTS for that language (if available).
     """
     # Step 1: English text
     if text.strip():
         if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:
             audio_data = np.mean(audio_data, axis=1)
+        # Resample to 16k if needed
         if sample_rate != 16000:
             audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
     except Exception as e:
         return english_text, f"Translation error: {e}", None
+    # Step 3: TTS (skip if no config for language)
     try:
+        if tts_config[target_language] is None:
+            # tts_config maps this language to None (e.g. Chinese): return the text results with no audio
+            return english_text, translated_text, None
         sample_rate, waveform = run_tts_inference(target_language, translated_text)
     except Exception as e:
         return english_text, translated_text, f"TTS error: {e}"
     return english_text, translated_text, (sample_rate, waveform)
+# ------------------------------------------------------
 # 9. Gradio Interface
+# ------------------------------------------------------
 iface = gr.Interface(
     fn=predict,
     inputs=[
     outputs=[
         gr.Textbox(label="English Transcription"),
         gr.Textbox(label="Translation (Target Language)"),
+        gr.Audio(label="Synthesized Speech (if available)")
     ],
     title="Multimodal Language Learning Aid (MMS TTS / VITS)",
     description=(
         "This app:\n"
         "1. Transcribes English speech (via ASR) or accepts English text.\n"
         "2. Translates to Spanish, Chinese, or Japanese (Helsinki-NLP).\n"
+        "3. Synthesizes speech with VITS-based MMS TTS models for Spanish/Japanese.\n\n"
+        "Note: MMS does NOT currently provide a Mandarin TTS model, so TTS is skipped for Chinese."
     ),
     allow_flagging="never"
 )
 if __name__ == "__main__":
+    # If running locally, use this call instead of the explicit host/port launch below:
+    # iface.launch()
+    iface.launch(server_name="0.0.0.0", server_port=7860)