Spaces:

Yilin0601
/

Multimodal_Language_Learning_Aid

Running

App Files Files Community

Yilin0601 committed on Mar 31

Commit

fa64981

verified ·

1 Parent(s): c953920

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -38

app.py CHANGED Viewed

@@ -9,11 +9,11 @@ import os
 from transformers import pipeline, VitsModel, AutoTokenizer
 from datasets import load_dataset
-# For MeloTTS (Chinese and Japanese)
 try:
-    from melo.api import TTS as MeloTTS
 except ImportError:
-    raise ImportError("Please install the MeloTTS package (e.g., pip install myshell-ai/MeloTTS-Chinese)")
 # ------------------------------------------------------
 # 1. ASR Pipeline (English) using Wav2Vec2
@@ -51,7 +51,7 @@ translation_tasks = {
 # ------------------------------------------------------
 # 3. TTS Configuration
 #    - MMS TTS (VITS) for: Spanish, Vietnamese, Indonesian, Turkish, Portuguese, Korean
-#    - MeloTTS for: Chinese and Japanese
 # ------------------------------------------------------
 tts_config = {
     "Spanish": {"model_id": "facebook/mms-tts-spa", "architecture": "vits", "type": "mms"},
@@ -60,8 +60,14 @@ tts_config = {
     "Turkish": {"model_id": "facebook/mms-tts-tur", "architecture": "vits", "type": "mms"},
     "Portuguese": {"model_id": "facebook/mms-tts-por", "architecture": "vits", "type": "mms"},
     "Korean": {"model_id": "facebook/mms-tts-kor", "architecture": "vits", "type": "mms"},
-    "Chinese": {"type": "melo"},
-    "Japanese": {"type": "melo"}
 }
 # ------------------------------------------------------
@@ -69,7 +75,7 @@ tts_config = {
 # ------------------------------------------------------
 translator_cache = {}
 mms_tts_cache = {}     # For MMS (VITS-based) TTS models
-melo_tts_cache = {}    # For MeloTTS models (Chinese/Japanese)
 # ------------------------------------------------------
 # 5. Translator Helper
@@ -110,31 +116,31 @@ def run_mms_tts(text, lang):
     return sample_rate, waveform
 # ------------------------------------------------------
-# 7. MeloTTS Helper for Chinese and Japanese
 # ------------------------------------------------------
-def run_melo_tts(text, lang):
-    """
-    Uses the myshell-ai MeloTTS model.
-    For Chinese, use language parameter 'ZH'; for Japanese, use 'JP'.
-    """
-    device = 'cuda' if torch.cuda.is_available() else 'cpu'
-    lang_param = 'ZH' if lang == "Chinese" else 'JP'
-    if lang not in melo_tts_cache:
-        try:
-            model = MeloTTS(language=lang_param, device=device)
-            melo_tts_cache[lang] = model
-        except Exception as e:
-            raise RuntimeError(f"Failed to load MeloTTS model for {lang}: {e}")
-    else:
-        model = melo_tts_cache[lang]
-    speaker_ids = model.hps.data.spk2id
-    # Assume the speaker key is the same as lang_param
-    speaker_key = lang_param
-    speed = 1.0
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
         tmp_name = tmp.name
     try:
-        model.tts_to_file(text, speaker_ids[speaker_key], tmp_name, speed=speed)
         data, sr = sf.read(tmp_name)
     finally:
         if os.path.exists(tmp_name):
@@ -147,8 +153,8 @@ def run_melo_tts(text, lang):
 def predict(audio, text, target_language):
     """
     1. Obtain English text (via ASR if audio provided, else text).
-    2. Translate the English text to target_language.
-    3. Generate TTS audio using either MMS TTS (VITS) or MeloTTS.
     """
     # Step 1: Get English text.
     if text.strip():
@@ -180,8 +186,8 @@ def predict(audio, text, target_language):
         tts_type = tts_config[target_language]["type"]
         if tts_type == "mms":
             sr, waveform = run_mms_tts(translated_text, target_language)
-        elif tts_type == "melo":
-            sr, waveform = run_melo_tts(translated_text, target_language)
         else:
             raise RuntimeError("Unknown TTS type for target language.")
     except Exception as e:
@@ -212,14 +218,12 @@ iface = gr.Interface(
     description=(
         "This app performs the following steps:\n"
         "1. Transcribes English speech using Wav2Vec2 (or accepts text input).\n"
-        "2. Translates the English text to the target language using Helsinki-NLP MarianMT models.\n"
-        "3. Synthesizes speech:\n"
-        "   - For Spanish, Vietnamese, Indonesian, Turkish, Portuguese, and Korean: uses Facebook MMS TTS (VITS-based).\n"
-        "   - For Chinese and Japanese: uses myshell-ai MeloTTS models.\n"
-        "\nSelect your target language from the dropdown."
     ),
     allow_flagging="never"
 )
 if __name__ == "__main__":
-    iface.launch(server_name="0.0.0.0", server_port=7860)

 from transformers import pipeline, VitsModel, AutoTokenizer
 from datasets import load_dataset
+# For Coqui TTS (XTTS-v2)
 try:
+    from TTS.api import TTS as CoquiTTS
 except ImportError:
+    raise ImportError("Please install Coqui TTS via pip install TTS.")
 # ------------------------------------------------------
 # 1. ASR Pipeline (English) using Wav2Vec2
 # ------------------------------------------------------
 # 3. TTS Configuration
 #    - MMS TTS (VITS) for: Spanish, Vietnamese, Indonesian, Turkish, Portuguese, Korean
+#    - Coqui XTTS-v2 for: Chinese and Japanese
 # ------------------------------------------------------
 tts_config = {
     "Spanish": {"model_id": "facebook/mms-tts-spa", "architecture": "vits", "type": "mms"},
     "Turkish": {"model_id": "facebook/mms-tts-tur", "architecture": "vits", "type": "mms"},
     "Portuguese": {"model_id": "facebook/mms-tts-por", "architecture": "vits", "type": "mms"},
     "Korean": {"model_id": "facebook/mms-tts-kor", "architecture": "vits", "type": "mms"},
+    "Chinese": {"type": "coqui"},
+    "Japanese": {"type": "coqui"}
+}
+# For Coqui, we map our languages to language codes expected by the model.
+coqui_lang_map = {
+    "Chinese": "zh",
+    "Japanese": "ja"
 }
 # ------------------------------------------------------
 # ------------------------------------------------------
 translator_cache = {}
 mms_tts_cache = {}     # For MMS (VITS-based) TTS models
+coqui_tts_cache = None  # Single instance for Coqui XTTS-v2
 # ------------------------------------------------------
 # 5. Translator Helper
     return sample_rate, waveform
 # ------------------------------------------------------
+# 7. Coqui TTS Helper for Chinese and Japanese
 # ------------------------------------------------------
+def load_coqui_tts():  # Return the shared Coqui XTTS-v2 instance, creating it on first call.
+    global coqui_tts_cache  # module-level cache; rebound below on first load
+    if coqui_tts_cache is not None:
+        return coqui_tts_cache  # already loaded: reuse the cached instance
+    try:
+        # gpu=False is hard-coded here; set gpu=True if a GPU is available.
+        coqui_tts_cache = CoquiTTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
+    except Exception as e:
+        raise RuntimeError(f"Failed to load Coqui XTTS-v2 TTS: {e}")  # any load failure is re-raised as RuntimeError
+    return coqui_tts_cache
+def run_coqui_tts(text, lang):
+    coqui_tts = load_coqui_tts()
+    lang_code = coqui_lang_map[lang]  # "zh" for Chinese or "ja" for Japanese
+    # Write the output to a temporary file and then read it back.
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
         tmp_name = tmp.name
     try:
+        coqui_tts.tts_to_file(
+            text=text,
+            file_path=tmp_name,
+            language=lang_code  # using default voice; for cloning, add speaker_wav parameter
+        )
         data, sr = sf.read(tmp_name)
     finally:
         if os.path.exists(tmp_name):
 def predict(audio, text, target_language):
     """
     1. Obtain English text (via ASR if audio provided, else text).
+    2. Translate English text to target_language.
+    3. Generate TTS audio using either MMS TTS (VITS) or Coqui XTTS-v2.
     """
     # Step 1: Get English text.
     if text.strip():
         tts_type = tts_config[target_language]["type"]
         if tts_type == "mms":
             sr, waveform = run_mms_tts(translated_text, target_language)
+        elif tts_type == "coqui":
+            sr, waveform = run_coqui_tts(translated_text, target_language)
         else:
             raise RuntimeError("Unknown TTS type for target language.")
     except Exception as e:
     description=(
         "This app performs the following steps:\n"
         "1. Transcribes English speech using Wav2Vec2 (or accepts text input).\n"
+        "2. Translates the English text to the target language using Helsinki-NLP models.\n"
+        "3. Provides Synthetic speech:\n"
+        "For Spanish, Vietnamese, Indonesian, Turkish, Portuguese, and Korean."
     ),
     allow_flagging="never"
 )
 if __name__ == "__main__":
+    iface.launch(server_name="0.0.0.0", server_port=7860)