Spaces:

Yilin0601
/

Multimodal_Language_Learning_Aid

Running

App Files Files Community

Yilin0601 committed on Mar 26

Commit

16d930f

verified ·

1 Parent(s): cdf5e7f

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -33

app.py CHANGED Viewed

@@ -6,11 +6,11 @@ from transformers import pipeline, VitsModel, AutoTokenizer
 import scipy  # if needed for processing
 # ------------------------------------------------------
-# 1. ASR Pipeline (English)
 # ------------------------------------------------------
 asr = pipeline(
     "automatic-speech-recognition",
-    model="facebook/wav2vec2-base-960h"
 )
 # ------------------------------------------------------
@@ -30,17 +30,20 @@ translation_tasks = {
 # ------------------------------------------------------
 # 3. TTS Model Configurations
-#    NOTE: MMS does not provide a Mandarin TTS model,
-#    so we skip TTS for Chinese.
 # ------------------------------------------------------
 tts_config = {
     "Spanish": {
         "model_id": "facebook/mms-tts-spa",  # MMS Spanish
         "architecture": "vits"
     },
-    "Chinese": None,  # No MMS TTS for Chinese
     "Japanese": {
-        "model_id": "facebook/mms-tts-jpn",  # MMS Japanese
         "architecture": "vits"
     }
 }
@@ -69,21 +72,19 @@ def get_translator(lang):
 def get_tts_model(lang):
     """
     Loads (model, tokenizer, architecture) from Hugging Face once, then caches.
-    If no config is found (e.g. for Chinese), raises ValueError.
     """
     if lang in tts_model_cache:
         return tts_model_cache[lang]
     config = tts_config.get(lang)
     if config is None:
-        # No TTS model for this language
         raise ValueError(f"No TTS config found for language: {lang}")
     model_id = config["model_id"]
     arch = config["architecture"]
     try:
-        # Since arch == "vits" for these examples, load VitsModel + AutoTokenizer
         model = VitsModel.from_pretrained(model_id)
         tokenizer = AutoTokenizer.from_pretrained(model_id)
     except Exception as e:
@@ -106,17 +107,14 @@ def run_tts_inference(lang, text):
     with torch.no_grad():
         output = model(**inputs)
-    # VitsModel output is typically `.waveform`
     if hasattr(output, "waveform"):
         waveform_tensor = output.waveform
     else:
         raise RuntimeError("TTS model output does not contain 'waveform'.")
-    # Convert to numpy
     waveform = waveform_tensor.squeeze().cpu().numpy()
-    # MMS TTS typically uses 16 kHz
-    sample_rate = 16000
     return (sample_rate, waveform)
 # ------------------------------------------------------
@@ -124,25 +122,25 @@ def run_tts_inference(lang, text):
 # ------------------------------------------------------
 def predict(audio, text, target_language):
     """
-    1. Obtain English text (from text input or ASR).
-    2. Translate English -> target_language.
-    3. Run VITS-based TTS for that language (if available).
     """
-    # Step 1: English text
     if text.strip():
         english_text = text.strip()
     elif audio is not None:
         sample_rate, audio_data = audio
-        # Convert to float32
         if audio_data.dtype not in [np.float32, np.float64]:
             audio_data = audio_data.astype(np.float32)
-        # Convert stereo to mono if needed
         if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:
             audio_data = np.mean(audio_data, axis=1)
-        # Resample to 16k if needed
         if sample_rate != 16000:
             audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
@@ -160,11 +158,8 @@ def predict(audio, text, target_language):
     except Exception as e:
         return english_text, f"Translation error: {e}", None
-    # Step 3: TTS (skip if no config for language)
     try:
-        if tts_config[target_language] is None:
-            # No TTS model for Chinese or not supported
-            return english_text, translated_text, None
         sample_rate, waveform = run_tts_inference(target_language, translated_text)
     except Exception as e:
         return english_text, translated_text, f"TTS error: {e}"
@@ -184,20 +179,17 @@ iface = gr.Interface(
     outputs=[
         gr.Textbox(label="English Transcription"),
         gr.Textbox(label="Translation (Target Language)"),
-        gr.Audio(label="Synthesized Speech (if available)")
     ],
-    title="Multimodal Language Learning Aid (MMS TTS / VITS)",
     description=(
         "This app:\n"
-        "1. Transcribes English speech (via ASR) or accepts English text.\n"
-        "2. Translates to Spanish, Chinese, or Japanese (Helsinki-NLP).\n"
-        "3. Synthesizes speech with VITS-based MMS TTS models for Spanish/Japanese.\n\n"
-        "Note: MMS does NOT currently provide a Mandarin TTS model, so TTS is skipped for Chinese."
     ),
     allow_flagging="never"
 )
 if __name__ == "__main__":
-    # If running locally, uncomment:
-    # iface.launch()
     iface.launch(server_name="0.0.0.0", server_port=7860)

 import scipy  # if needed for processing
 # ------------------------------------------------------
+# 1. ASR Pipeline (English) using Whisper-small
 # ------------------------------------------------------
 asr = pipeline(
     "automatic-speech-recognition",
+    model="openai/whisper-small"
 )
 # ------------------------------------------------------
 # ------------------------------------------------------
 # 3. TTS Model Configurations
+# For Spanish, we keep the MMS TTS.
+# For Chinese and Japanese, use myshell-ai/MeloTTS-Chinese and myshell-ai/MeloTTS-Japanese, respectively.
 # ------------------------------------------------------
 tts_config = {
     "Spanish": {
         "model_id": "facebook/mms-tts-spa",  # MMS Spanish
         "architecture": "vits"
     },
+    "Chinese": {
+        "model_id": "myshell-ai/MeloTTS-Chinese",
+        "architecture": "vits"
+    },
     "Japanese": {
+        "model_id": "myshell-ai/MeloTTS-Japanese",
         "architecture": "vits"
     }
 }
 def get_tts_model(lang):
     """
     Loads (model, tokenizer, architecture) from Hugging Face once, then caches.
     """
     if lang in tts_model_cache:
         return tts_model_cache[lang]
     config = tts_config.get(lang)
     if config is None:
         raise ValueError(f"No TTS config found for language: {lang}")
     model_id = config["model_id"]
     arch = config["architecture"]
     try:
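+        # NOTE(review): every configured model_id is loaded via VitsModel + AutoTokenizer;
+        # confirm the MeloTTS checkpoints are actually compatible with this loader.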
         model = VitsModel.from_pretrained(model_id)
         tokenizer = AutoTokenizer.from_pretrained(model_id)
     except Exception as e:
     with torch.no_grad():
         output = model(**inputs)
+    # VitsModel output is typically provided via .waveform attribute
     if hasattr(output, "waveform"):
         waveform_tensor = output.waveform
     else:
         raise RuntimeError("TTS model output does not contain 'waveform'.")
     waveform = waveform_tensor.squeeze().cpu().numpy()
+    sample_rate = 16000  # NOTE(review): hard-coded for every language; confirm it matches each model's configured sampling rate
     return (sample_rate, waveform)
 # ------------------------------------------------------
 # ------------------------------------------------------
 def predict(audio, text, target_language):
     """
+    1. Obtain English text (via ASR using Whisper-small or text input).
+    2. Translate English text to the target language.
+    3. Synthesize speech with the target language TTS model.
     """
+    # Step 1: Get English text
     if text.strip():
         english_text = text.strip()
     elif audio is not None:
         sample_rate, audio_data = audio
+        # Ensure float32 data type
         if audio_data.dtype not in [np.float32, np.float64]:
             audio_data = audio_data.astype(np.float32)
+        # Convert stereo to mono if necessary
         if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:
             audio_data = np.mean(audio_data, axis=1)
+        # Resample to 16kHz if necessary
         if sample_rate != 16000:
             audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
     except Exception as e:
         return english_text, f"Translation error: {e}", None
+    # Step 3: TTS
     try:
         sample_rate, waveform = run_tts_inference(target_language, translated_text)
     except Exception as e:
         return english_text, translated_text, f"TTS error: {e}"
     outputs=[
         gr.Textbox(label="English Transcription"),
         gr.Textbox(label="Translation (Target Language)"),
+        gr.Audio(label="Synthesized Speech")
     ],
+    title="Multimodal Language Learning Aid (ASR / TTS)",
     description=(
         "This app:\n"
+        "1. Transcribes English speech or English text.\n"
+        "2. Translates to Spanish, Chinese, or Japanese (using Helsinki-NLP models).\n"
+        "3. Provides synthetic speech with TTS models:\n"
     ),
     allow_flagging="never"
 )
 if __name__ == "__main__":
     iface.launch(server_name="0.0.0.0", server_port=7860)