Spaces:

Yilin0601
/

Multimodal_Language_Learning_Aid

Running

App Files Files Community

Yilin0601 committed on Mar 26

Commit

1ee4794

verified ·

1 Parent(s): 7064b79

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -37

app.py CHANGED Viewed

@@ -29,24 +29,21 @@ translation_tasks = {
 }
 # -----------------------------------------------
-# 3. TTS Model Configurations
-#    We'll load them manually (not with pipeline("text-to-speech"))
 # -----------------------------------------------
-# - Spanish (MMS TTS, uses VITS architecture)
-# - Chinese (MMS TTS, uses VITS architecture)
-# - Japanese (SpeechT5 or a VITS-based model—here we pick a SpeechT5 example)
 tts_config = {
     "Spanish": {
         "model_id": "facebook/mms-tts-spa",
-        "architecture": "vits"  # We'll use VitsModel
     },
     "Chinese": {
         "model_id": "facebook/mms-tts-che",
         "architecture": "vits"
     },
     "Japanese": {
-        "model_id": "esnya/japanese_speecht5_tts",
-        "architecture": "speecht5"  # We'll treat this differently
     }
 }
@@ -69,7 +66,7 @@ def get_translator(lang):
     return translator
 # -----------------------------------------------
-# 6. TTS Helper
 # -----------------------------------------------
 def get_tts_model(lang):
     """
@@ -86,25 +83,18 @@ def get_tts_model(lang):
     arch = config["architecture"]
     try:
-        if arch == "vits":
-            # Load a VitsModel + tokenizer
-            model = VitsModel.from_pretrained(model_id)
-            tokenizer = AutoTokenizer.from_pretrained(model_id)
-        elif arch == "speecht5":
-            # For a SpeechT5 model, we might do something else
-            # e.g., pipeline("text-to-speech", model=...) if it works
-            # or custom loading if it's also a VITS-based approach
-            # We'll attempt a similar pattern:
-            model = VitsModel.from_pretrained(model_id)
-            tokenizer = AutoTokenizer.from_pretrained(model_id)
-        else:
-            raise ValueError(f"Unknown TTS architecture: {arch}")
     except Exception as e:
         raise RuntimeError(f"Failed to load TTS model {model_id}: {e}")
     tts_model_cache[lang] = (model, tokenizer, arch)
     return tts_model_cache[lang]
 def run_tts_inference(lang, text):
     """
     Generates waveform using the loaded TTS model and tokenizer.
@@ -120,25 +110,23 @@ def run_tts_inference(lang, text):
     if hasattr(output, "waveform"):
         waveform_tensor = output.waveform
     else:
-        # Some models might return a different attribute
-        raise RuntimeError("The TTS model output doesn't have 'waveform' attribute.")
-    # Convert to numpy array
     waveform = waveform_tensor.squeeze().cpu().numpy()
-    # Typically, MMS TTS uses 16 kHz
     sample_rate = 16000
     return (sample_rate, waveform)
 # -----------------------------------------------
-# 7. Prediction Function
 # -----------------------------------------------
 def predict(audio, text, target_language):
     """
-    1. If text is provided, use it directly as English text.
-       Else, if audio is provided, run ASR.
     2. Translate English -> target_language.
-    3. Run TTS with the correct approach for that language.
     """
     # Step 1: English text
     if text.strip():
@@ -150,7 +138,7 @@ def predict(audio, text, target_language):
         if audio_data.dtype not in [np.float32, np.float64]:
             audio_data = audio_data.astype(np.float32)
-        # Mono
         if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:
             audio_data = np.mean(audio_data, axis=1)
@@ -181,7 +169,7 @@ def predict(audio, text, target_language):
     return english_text, translated_text, (sample_rate, waveform)
 # -----------------------------------------------
-# 8. Gradio Interface
 # -----------------------------------------------
 iface = gr.Interface(
     fn=predict,
@@ -195,14 +183,14 @@ iface = gr.Interface(
         gr.Textbox(label="Translation (Target Language)"),
         gr.Audio(label="Synthesized Speech in Target Language")
     ],
-    title="Multimodal Language Learning Aid (VITS-based TTS)",
     description=(
         "This app:\n"
         "1. Transcribes English speech (via ASR) or accepts English text.\n"
-        "2. Translates to Spanish, Chinese, or Japanese.\n"
-        "3. Synthesizes speech with VITS-based or SpeechT5-based models.\n\n"
-        "Note: Some models are experimental and may produce errors or poor quality.\n"
-        "Either upload/record English audio or enter text, then select a target language."
     ),
     allow_flagging="never"
 )

 }
 # -----------------------------------------------
+# 3. TTS Model Configurations (All VITS)
 # -----------------------------------------------
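+# Make sure each model ID below actually exists on Hugging Face before deploying.
+# NOTE(review): MMS checkpoints are named by ISO 639-3 code, and "che" is the
+# code for Chechen, not Chinese -- verify the "Chinese" and "Japanese" entries.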
 tts_config = {
     "Spanish": {
         "model_id": "facebook/mms-tts-spa",
+        "architecture": "vits"
     },
     "Chinese": {
         "model_id": "facebook/mms-tts-che",
         "architecture": "vits"
     },
     "Japanese": {
+        "model_id": "facebook/mms-tts-jpn",
+        "architecture": "vits"
     }
 }
     return translator
 # -----------------------------------------------
+# 6. TTS Loading Helper
 # -----------------------------------------------
 def get_tts_model(lang):
     """
     arch = config["architecture"]
     try:
+        # Since arch == "vits" for all three languages, we load VitsModel + AutoTokenizer
+        model = VitsModel.from_pretrained(model_id)
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
     except Exception as e:
         raise RuntimeError(f"Failed to load TTS model {model_id}: {e}")
     tts_model_cache[lang] = (model, tokenizer, arch)
     return tts_model_cache[lang]
+# -----------------------------------------------
+# 7. TTS Inference Helper
+# -----------------------------------------------
 def run_tts_inference(lang, text):
     """
     Generates waveform using the loaded TTS model and tokenizer.
     if hasattr(output, "waveform"):
         waveform_tensor = output.waveform
     else:
+        raise RuntimeError("TTS model output does not contain 'waveform'.")
+    # Convert to numpy
     waveform = waveform_tensor.squeeze().cpu().numpy()
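+    # MMS TTS checkpoints typically use a 16 kHz sampling rate; the value is
+    # hard-coded here -- TODO confirm it matches the loaded model's config.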
     sample_rate = 16000
     return (sample_rate, waveform)
 # -----------------------------------------------
+# 8. Prediction Function
 # -----------------------------------------------
 def predict(audio, text, target_language):
     """
+    1. Obtain English text (from text input or ASR).
     2. Translate English -> target_language.
+    3. Run VITS-based TTS for that language.
     """
     # Step 1: English text
     if text.strip():
         if audio_data.dtype not in [np.float32, np.float64]:
             audio_data = audio_data.astype(np.float32)
+        # Convert stereo to mono if needed
         if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:
             audio_data = np.mean(audio_data, axis=1)
     return english_text, translated_text, (sample_rate, waveform)
 # -----------------------------------------------
+# 9. Gradio Interface
 # -----------------------------------------------
 iface = gr.Interface(
     fn=predict,
         gr.Textbox(label="Translation (Target Language)"),
         gr.Audio(label="Synthesized Speech in Target Language")
     ],
+    title="Multimodal Language Learning Aid (MMS TTS / VITS)",
     description=(
         "This app:\n"
         "1. Transcribes English speech (via ASR) or accepts English text.\n"
+        "2. Translates to Spanish, Chinese, or Japanese (Helsinki-NLP).\n"
+        "3. Synthesizes speech with VITS-based MMS TTS models.\n\n"
+        "Note: Ensure the MMS model IDs exist on Hugging Face. If not, you'll see an error.\n"
+        "Record/upload English audio or enter text, then select a target language."
     ),
     allow_flagging="never"
 )