Spaces:
Running
on
Zero
Running
on
Zero
Akis Giannoukos
committed on
Commit
·
aec1268
1
Parent(s):
30f47d7
Implement Coqui TTS integration with model and speaker selection in demo interface; update requirements to include coqui-tts package.
Browse files- app.py +54 -5
- requirements.txt +1 -1
app.py
CHANGED
|
@@ -220,12 +220,33 @@ def detect_explicit_suicidality(text: Optional[str]) -> bool:
|
|
| 220 |
return False
|
| 221 |
|
| 222 |
|
| 223 |
-
def synthesize_tts(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
if not text:
|
| 225 |
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
try:
|
| 227 |
-
# Save MP3 to tmp and return filepath
|
| 228 |
-
ts = int(time.time() * 1000)
|
| 229 |
out_path = f"/tmp/tts_{ts}.mp3"
|
| 230 |
tts = gTTS(text=text, lang="en")
|
| 231 |
tts.save(out_path)
|
|
@@ -234,6 +255,21 @@ def synthesize_tts(text: Optional[str]) -> Optional[str]:
|
|
| 234 |
return None
|
| 235 |
|
| 236 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
def severity_from_total(total_score: int) -> str:
|
| 238 |
if total_score <= 4:
|
| 239 |
return "Minimal Depression"
|
|
@@ -660,6 +696,10 @@ def create_demo():
|
|
| 660 |
severity_label = gr.Label(label="Severity")
|
| 661 |
threshold = gr.Slider(0.5, 1.0, value=CONFIDENCE_THRESHOLD_DEFAULT, step=0.05, label="Confidence Threshold (stop when min ≥ τ)")
|
| 662 |
tts_enable = gr.Checkbox(label="Speak clinician responses (TTS)", value=USE_TTS_DEFAULT)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 663 |
tts_audio = gr.Audio(label="Clinician voice", interactive=False, autoplay=False, visible=False)
|
| 664 |
model_id_tb = gr.Textbox(value=current_model_id, label="Chat Model ID", info="e.g., google/gemma-2-2b-it or google/medgemma-4b-it")
|
| 665 |
with gr.Row():
|
|
@@ -681,9 +721,18 @@ def create_demo():
|
|
| 681 |
intro_play_btn.click(fn=_play_intro_tts, inputs=[tts_enable], outputs=[tts_audio_main])
|
| 682 |
|
| 683 |
# Wire interactions
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 684 |
audio_main.stop_recording(
|
| 685 |
-
fn=
|
| 686 |
-
inputs=[audio_main, text_main, chatbot, threshold, tts_enable, finished_state, turns_state, scores_state, meta_state],
|
| 687 |
outputs=[chatbot, score_json, severity_label, finished_state, turns_state, audio_main, text_main, tts_audio, tts_audio_main],
|
| 688 |
queue=True,
|
| 689 |
api_name="message",
|
|
|
|
| 220 |
return False
|
| 221 |
|
| 222 |
|
| 223 |
+
def synthesize_tts(
|
| 224 |
+
text: Optional[str],
|
| 225 |
+
provider: str = "Coqui",
|
| 226 |
+
coqui_model_name: Optional[str] = None,
|
| 227 |
+
coqui_speaker: Optional[str] = None,
|
| 228 |
+
) -> Optional[str]:
|
| 229 |
if not text:
|
| 230 |
return None
|
| 231 |
+
ts = int(time.time() * 1000)
|
| 232 |
+
provider_norm = (provider or "Coqui").strip().lower()
|
| 233 |
+
# Try Coqui first if requested
|
| 234 |
+
if provider_norm == "coqui":
|
| 235 |
+
try:
|
| 236 |
+
# coqui-tts uses the same import path TTS.api
|
| 237 |
+
from TTS.api import TTS as CoquiTTS # type: ignore
|
| 238 |
+
model_name = (coqui_model_name or os.getenv("COQUI_MODEL", "tts_models/en/vctk/vits")).strip()
|
| 239 |
+
engine = CoquiTTS(model_name=model_name, progress_bar=False)
|
| 240 |
+
out_path_wav = f"/tmp/tts_{ts}.wav"
|
| 241 |
+
kwargs = {}
|
| 242 |
+
if coqui_speaker:
|
| 243 |
+
kwargs["speaker"] = coqui_speaker
|
| 244 |
+
engine.tts_to_file(text=text, file_path=out_path_wav, **kwargs)
|
| 245 |
+
return out_path_wav
|
| 246 |
+
except Exception:
|
| 247 |
+
pass
|
| 248 |
+
# Fallback to gTTS
|
| 249 |
try:
|
|
|
|
|
|
|
| 250 |
out_path = f"/tmp/tts_{ts}.mp3"
|
| 251 |
tts = gTTS(text=text, lang="en")
|
| 252 |
tts.save(out_path)
|
|
|
|
| 255 |
return None
|
| 256 |
|
| 257 |
|
| 258 |
+
def list_coqui_speakers(model_name: str) -> List[str]:
    """Return the speaker IDs available for a Coqui TTS model.

    Loads the model and probes the attributes Coqui commonly exposes for
    multi-speaker models. Any failure — including coqui-tts not being
    installed — silently falls back to a fixed set of VCTK speaker IDs so
    the UI dropdown always has choices.
    """
    fallback_ids = ["p225", "p227", "p231", "p233", "p236"]
    try:
        # coqui-tts keeps the historical import path TTS.api
        from TTS.api import TTS as CoquiTTS  # type: ignore

        engine = CoquiTTS(model_name=model_name, progress_bar=False)
        speakers = getattr(engine, "speakers", None)
        if isinstance(speakers, list):
            return [str(speaker) for speaker in speakers]
        manager = getattr(engine, "speaker_manager", None)
        names = getattr(manager, "speaker_names", None)
        if names is not None:
            return list(names)
    except Exception:
        # Best effort only: fall through to the VCTK defaults below.
        pass
    return fallback_ids
|
| 271 |
+
|
| 272 |
+
|
| 273 |
def severity_from_total(total_score: int) -> str:
|
| 274 |
if total_score <= 4:
|
| 275 |
return "Minimal Depression"
|
|
|
|
| 696 |
severity_label = gr.Label(label="Severity")
|
| 697 |
threshold = gr.Slider(0.5, 1.0, value=CONFIDENCE_THRESHOLD_DEFAULT, step=0.05, label="Confidence Threshold (stop when min ≥ τ)")
|
| 698 |
tts_enable = gr.Checkbox(label="Speak clinician responses (TTS)", value=USE_TTS_DEFAULT)
|
| 699 |
+
with gr.Row():
|
| 700 |
+
tts_provider_dd = gr.Dropdown(choices=["Coqui", "gTTS"], value="Coqui", label="TTS Provider")
|
| 701 |
+
coqui_model_tb = gr.Textbox(value=os.getenv("COQUI_MODEL", "tts_models/en/vctk/vits"), label="Coqui Model")
|
| 702 |
+
coqui_speaker_dd = gr.Dropdown(choices=list_coqui_speakers(os.getenv("COQUI_MODEL", "tts_models/en/vctk/vits")), value="p225", label="Coqui Speaker")
|
| 703 |
tts_audio = gr.Audio(label="Clinician voice", interactive=False, autoplay=False, visible=False)
|
| 704 |
model_id_tb = gr.Textbox(value=current_model_id, label="Chat Model ID", info="e.g., google/gemma-2-2b-it or google/medgemma-4b-it")
|
| 705 |
with gr.Row():
|
|
|
|
| 721 |
intro_play_btn.click(fn=_play_intro_tts, inputs=[tts_enable], outputs=[tts_audio_main])
|
| 722 |
|
| 723 |
# Wire interactions
|
| 724 |
+
def _process_with_tts(audio, text, chat, th, tts_on, finished, turns, scores, meta, provider, coqui_model, coqui_speaker):
    """Run one conversation turn, then voice the clinician reply with the chosen TTS provider.

    Wraps process_turn so the provider/model/speaker dropdown values reach
    synthesize_tts; the audio/text inputs are cleared (None) and the synthesized
    path is routed to both audio output components.
    """
    (chat_history, display_json, severity, finished_o, turns_o,
     _audio_in, _text_in, _tts_path, _tts_main_path) = process_turn(
        audio, text, chat, th, tts_on, finished, turns, scores, meta
    )
    new_path = None
    if tts_on and chat_history:
        last_reply = chat_history[-1][1]
        if last_reply:
            new_path = synthesize_tts(
                last_reply,
                provider=provider,
                coqui_model_name=coqui_model,
                coqui_speaker=coqui_speaker,
            )
    return chat_history, display_json, severity, finished_o, turns_o, None, None, new_path, new_path
|
| 732 |
+
|
| 733 |
audio_main.stop_recording(
|
| 734 |
+
fn=_process_with_tts,
|
| 735 |
+
inputs=[audio_main, text_main, chatbot, threshold, tts_enable, finished_state, turns_state, scores_state, meta_state, tts_provider_dd, coqui_model_tb, coqui_speaker_dd],
|
| 736 |
outputs=[chatbot, score_json, severity_label, finished_state, turns_state, audio_main, text_main, tts_audio, tts_audio_main],
|
| 737 |
queue=True,
|
| 738 |
api_name="message",
|
requirements.txt
CHANGED
|
@@ -10,4 +10,4 @@ scipy>=1.11.4
|
|
| 10 |
protobuf>=4.25.3
|
| 11 |
gTTS>=2.5.3
|
| 12 |
spaces>=0.27.1
|
| 13 |
-
|
|
|
|
| 10 |
protobuf>=4.25.3
|
| 11 |
gTTS>=2.5.3
|
| 12 |
spaces>=0.27.1
|
| 13 |
+
coqui-tts>=0.27.2
|