Spaces:

Yilin0601
/

Multimodal_Language_Learning_Aid

Running

App Files Files Community

Yilin0601 committed on Mar 25

Commit

d744aff

verified ·

1 Parent(s): c098e72

Update app.py

Browse files

Files changed (1) hide show

app.py +134 -53

app.py CHANGED Viewed

@@ -2,19 +2,20 @@ import gradio as gr
 import torch
 import numpy as np
 import librosa
-from transformers import pipeline
-# --------------------------------------------------
-# ASR Pipeline (for English transcription)
-# --------------------------------------------------
 asr = pipeline(
     "automatic-speech-recognition",
     model="facebook/wav2vec2-base-960h"
 )
-# --------------------------------------------------
-# Mapping for Target Languages (Spanish, Chinese, Japanese)
-# --------------------------------------------------
 translation_models = {
     "Spanish": "Helsinki-NLP/opus-mt-en-es",
     "Chinese": "Helsinki-NLP/opus-mt-en-zh",
@@ -27,62 +28,143 @@ translation_tasks = {
     "Japanese": "translation_en_to_ja"
 }
-tts_models = {
-    "Spanish": "facebook/mms-tts-spa",
-    "Chinese": "facebook/mms-tts-che",
-    "Japanese": "esnya/japanese_speecht5_tts"
 }
-# --------------------------------------------------
-# Caches for translator and TTS pipelines
-# --------------------------------------------------
 translator_cache = {}
-tts_cache = {}
-def get_translator(target_language):
-    if target_language in translator_cache:
-        return translator_cache[target_language]
-    model_name = translation_models[target_language]
-    task_name = translation_tasks[target_language]
     translator = pipeline(task_name, model=model_name)
-    translator_cache[target_language] = translator
     return translator
-def get_tts(target_language):
-    if target_language in tts_cache:
-        return tts_cache[target_language]
-    model_name = tts_models.get(target_language)
-    if model_name is None:
-        raise ValueError(f"No TTS model available for {target_language}.")
     try:
-        tts_pipeline = pipeline("text-to-speech", model=model_name)
     except Exception as e:
-        raise ValueError(f"Failed to load TTS model for {target_language} with model '{model_name}'.\nError: {e}")
-    tts_cache[target_language] = tts_pipeline
-    return tts_pipeline
-# --------------------------------------------------
-# Prediction Function
-# --------------------------------------------------
 def predict(audio, text, target_language):
-    # Step 1: Obtain English text from text input if provided, otherwise use ASR.
     if text.strip():
         english_text = text.strip()
     elif audio is not None:
         sample_rate, audio_data = audio
         if audio_data.dtype not in [np.float32, np.float64]:
             audio_data = audio_data.astype(np.float32)
         if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:
             audio_data = np.mean(audio_data, axis=1)
         if sample_rate != 16000:
             audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
-        input_audio = {"array": audio_data, "sampling_rate": 16000}
-        asr_result = asr(input_audio)
         english_text = asr_result["text"]
     else:
         return "No input provided.", "", None
-    # Step 2: Translate the English text to the target language.
     translator = get_translator(target_language)
     try:
         translation_result = translator(english_text)
@@ -90,38 +172,37 @@ def predict(audio, text, target_language):
     except Exception as e:
         return english_text, f"Translation error: {e}", None
-    # Step 3: Synthesize speech using the TTS pipeline.
     try:
-        tts_pipeline = get_tts(target_language)
-        tts_result = tts_pipeline(translated_text)
-        synthesized_audio = (tts_result["sample_rate"], tts_result["wav"])
     except Exception as e:
         return english_text, translated_text, f"TTS error: {e}"
-    return english_text, translated_text, synthesized_audio
-# --------------------------------------------------
-# Gradio Interface Setup
-# --------------------------------------------------
 iface = gr.Interface(
     fn=predict,
     inputs=[
         gr.Audio(type="numpy", label="Record/Upload English Audio (optional)"),
         gr.Textbox(lines=4, placeholder="Or enter English text here", label="English Text Input (optional)"),
-        gr.Dropdown(choices=list(translation_models.keys()), value="Spanish", label="Target Language")
     ],
     outputs=[
         gr.Textbox(label="English Transcription"),
         gr.Textbox(label="Translation (Target Language)"),
         gr.Audio(label="Synthesized Speech in Target Language")
     ],
-    title="Multimodal Language Learning Aid",
     description=(
-        "This app provides three outputs:\n"
-        "1. English transcription (from ASR or text input),\n"
-        "2. Translation to Spanish, Chinese, or Japanese (using Helsinki-NLP models), and\n"
-        "3. Synthetic speech in the target language (using Facebook MMS TTS or equivalent).\n\n"
-        "Either record/upload an English audio sample or enter English text directly."
     ),
     allow_flagging="never"
 )

 import torch
 import numpy as np
 import librosa
+from transformers import pipeline, VitsModel, AutoTokenizer
+import scipy  # if needed for processing
+# -----------------------------------------------
+# 1. ASR Pipeline (English)
+# -----------------------------------------------
 asr = pipeline(
     "automatic-speech-recognition",
     model="facebook/wav2vec2-base-960h"
 )
+# -----------------------------------------------
+# 2. Translation Models (3 languages)
+# -----------------------------------------------
 translation_models = {
     "Spanish": "Helsinki-NLP/opus-mt-en-es",
     "Chinese": "Helsinki-NLP/opus-mt-en-zh",
     "Japanese": "translation_en_to_ja"
 }
+# -----------------------------------------------
+# 3. TTS Model Configurations
+#    We'll load them manually (not with pipeline("text-to-speech"))
+# -----------------------------------------------
+# - Spanish (MMS TTS, uses VITS architecture)
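+# - Chinese (MMS TTS, uses VITS architecture). NOTE(review): the "che" suffix in the model id below is the ISO 639-3 code for Chechen, not a Chinese language — verify the model id.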
+# - Japanese (SpeechT5 or a VITS-based model—here we pick a SpeechT5 example)
+tts_config = {
+    "Spanish": {
+        "model_id": "facebook/mms-tts-spa",
+        "architecture": "vits"  # We'll use VitsModel
+    },
+    "Chinese": {
+        "model_id": "facebook/mms-tts-che",
+        "architecture": "vits"
+    },
+    "Japanese": {
+        "model_id": "esnya/japanese_speecht5_tts",
+        "architecture": "speecht5"  # We'll treat this differently
+    }
 }
+# -----------------------------------------------
+# 4. Caches
+# -----------------------------------------------
 translator_cache = {}
+tts_model_cache = {}  # store (model, tokenizer, architecture)
+# -----------------------------------------------
+# 5. Translator Helper
+# -----------------------------------------------
+def get_translator(lang):
+    """
+    Returns the translation pipeline for `lang`, creating and caching it on first use.
+    """
+    try:
+        # Fast path: a pipeline built by an earlier call is reused as-is.
+        return translator_cache[lang]
+    except KeyError:
+        pass
+    model_name = translation_models[lang]
+    task_name = translation_tasks[lang]
+    translator_cache[lang] = pipeline(task_name, model=model_name)
+    return translator_cache[lang]
+# -----------------------------------------------
+# 6. TTS Helper
+# -----------------------------------------------
+def get_tts_model(lang):
+    """
+    Loads (model, tokenizer, architecture) from Hugging Face once, then caches.
+
+    Args:
+        lang: Key into `tts_config`, e.g. "Spanish".
+
+    Returns:
+        The cached `(model, tokenizer, arch)` tuple for `lang`.
+
+    Raises:
+        ValueError: If `lang` has no entry in `tts_config`, or its
+            "architecture" value is not a known one.
+        RuntimeError: If loading the model or tokenizer fails.
+    """
+    if lang in tts_model_cache:
+        return tts_model_cache[lang]
+    config = tts_config.get(lang)
+    if config is None:
+        raise ValueError(f"No TTS config found for language: {lang}")
+    model_id = config["model_id"]
+    arch = config["architecture"]
+    # Validate before the try block so a configuration mistake is reported as
+    # such instead of being re-wrapped as a "Failed to load" RuntimeError.
+    if arch not in ("vits", "speecht5"):
+        raise ValueError(f"Unknown TTS architecture: {arch}")
+    # NOTE(review): "speecht5" entries go through the same VitsModel loading
+    # path as "vits" ones; a SpeechT5 checkpoint is unlikely to load correctly
+    # this way -- confirm, or add a dedicated SpeechT5 loader.
     try:
+        model = VitsModel.from_pretrained(model_id)
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
     except Exception as e:
+        # Chain explicitly so the root cause stays attached to the traceback.
+        raise RuntimeError(f"Failed to load TTS model {model_id}: {e}") from e
+    tts_model_cache[lang] = (model, tokenizer, arch)
+    return tts_model_cache[lang]
+def run_tts_inference(lang, text):
+    """
+    Generates waveform using the loaded TTS model and tokenizer.
+
+    Args:
+        lang: Language key passed to `get_tts_model`.
+        text: Text to synthesize.
+
+    Returns:
+        (sample_rate, np_array). The sample rate is read from the model
+        config when it declares one, otherwise 16000 is used.
+
+    Raises:
+        RuntimeError: If the model output carries no `waveform`.
+    """
+    model, tokenizer, _arch = get_tts_model(lang)
+    inputs = tokenizer(text, return_tensors="pt")
+    with torch.no_grad():
+        output = model(**inputs)
+    # VitsModel output is typically `.waveform`; treat a missing or None
+    # value the same way so the caller gets one clear error.
+    waveform_tensor = getattr(output, "waveform", None)
+    if waveform_tensor is None:
+        raise RuntimeError("The TTS model output doesn't have 'waveform' attribute.")
+    # Convert to numpy array
+    waveform = waveform_tensor.squeeze().cpu().numpy()
+    # Prefer the rate the checkpoint itself declares so playback speed stays
+    # correct for models that are not 16 kHz; fall back to 16 kHz otherwise.
+    model_config = getattr(model, "config", None)
+    sample_rate = getattr(model_config, "sampling_rate", None) or 16000
+    return (sample_rate, waveform)
+# -----------------------------------------------
+# 7. Prediction Function
+# -----------------------------------------------
 def predict(audio, text, target_language):
+    """
+    1. If text is provided, use it directly as English text.
+       Else, if audio is provided, run ASR.
+    2. Translate English -> target_language.
+    3. Run TTS with the correct approach for that language.
+    """
+    # Step 1: English text
     if text.strip():
         english_text = text.strip()
     elif audio is not None:
         sample_rate, audio_data = audio
+        # Convert to float32
         if audio_data.dtype not in [np.float32, np.float64]:
             audio_data = audio_data.astype(np.float32)
+        # Mono
         if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:
             audio_data = np.mean(audio_data, axis=1)
+        # Resample to 16k
         if sample_rate != 16000:
             audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
+        asr_input = {"array": audio_data, "sampling_rate": 16000}
+        asr_result = asr(asr_input)
         english_text = asr_result["text"]
     else:
         return "No input provided.", "", None
+    # Step 2: Translation
     translator = get_translator(target_language)
     try:
         translation_result = translator(english_text)
     except Exception as e:
         return english_text, f"Translation error: {e}", None
+    # Step 3: TTS
     try:
+        sample_rate, waveform = run_tts_inference(target_language, translated_text)
     except Exception as e:
         return english_text, translated_text, f"TTS error: {e}"
+    return english_text, translated_text, (sample_rate, waveform)
+# -----------------------------------------------
+# 8. Gradio Interface
+# -----------------------------------------------
 iface = gr.Interface(
     fn=predict,
     inputs=[
         gr.Audio(type="numpy", label="Record/Upload English Audio (optional)"),
         gr.Textbox(lines=4, placeholder="Or enter English text here", label="English Text Input (optional)"),
+        gr.Dropdown(choices=["Spanish", "Chinese", "Japanese"], value="Spanish", label="Target Language")
     ],
     outputs=[
         gr.Textbox(label="English Transcription"),
         gr.Textbox(label="Translation (Target Language)"),
         gr.Audio(label="Synthesized Speech in Target Language")
     ],
+    title="Multimodal Language Learning Aid (VITS-based TTS)",
     description=(
+        "This app:\n"
+        "1. Transcribes English speech (via ASR) or accepts English text.\n"
+        "2. Translates to Spanish, Chinese, or Japanese.\n"
+        "3. Synthesizes speech with VITS-based or SpeechT5-based models.\n\n"
+        "Note: Some models are experimental and may produce errors or poor quality.\n"
+        "Either upload/record English audio or enter text, then select a target language."
     ),
     allow_flagging="never"
 )