asr-inference

Running on Zero

App Files Community

AbirMessaoudi committed about 10 hours ago

Commit

a808f78

verified ·

1 Parent(s): 1619dcb

bug_fix env (#47)

Browse files

- clean code, requirements compatibility check (524ec453999a3139fc3d9e33dec15a8ac0af6e4f)

Files changed (6) hide show

app.py +97 -36
pyannote/config.yaml +0 -18
pyannote/pytorch_model.bin +0 -3
pyannote/segmentation-3.0.bin +0 -3
pyannote/wespeaker-voxceleb-resnet34-LM.bin +0 -3
requirements.txt +13 -9

app.py CHANGED Viewed

@@ -1,22 +1,38 @@
 import os
 import gradio as gr
 import spaces
 from whisper_cs_fase_1 import generate_fase_1
 from whisper_cs_fase_2 import generate_fase_2
 from AinaTheme import theme
-@spaces.GPU()
-def transcribe_fase_1(inputs: str, model_version: str, civil_channel: str):
-    if inputs is None:
-        raise gr.Error("Cap fitxer d'àudio introduit! Si us plau pengeu un fitxer o enregistreu un àudio abans d'enviar la vostra sol·licitud")
-    return generate_fase_1(audio_path=inputs, model_version=model_version, civil_channel=civil_channel)
-@spaces.GPU()
-def transcribe_fase_2_display(inputs: str, model_version: str, civil_channel: str):
     if inputs is None:
-        raise gr.Error("Cap fitxer d'àudio introduit! Si us plau pengeu un fitxer o enregistreu un àudio abans d'enviar la vostra sol·licitud")
-    return generate_fase_2(audio_path=inputs, model_version=model_version, civil_channel=civil_channel)
 def clear_fase_1(model_version, civil_channel):
@@ -27,16 +43,17 @@ def clear_fase_2(model_version, civil_channel):
 with gr.Blocks(theme=theme) as demo:
     gr.Markdown("## 🗣️ Transcripció automàtica d'àudio — Mode amb dues fases")
     with gr.Tabs():
         with gr.Tab("Fase 1"):
-            description_string = (
                 "### 🎧 Transcripció de trucades multilingüe de bona qualitat per a transcripció fiable\n"
                 "- **v2_fast**: Inclou separació de canals i inferència ràpida.\n"
                 "- **v1.0**: Inclou inferència moderada sense separació de canals."
             )
-            gr.Markdown(description_string)
             with gr.Row():
                 with gr.Column(scale=1):
@@ -46,28 +63,45 @@ with gr.Blocks(theme=theme) as demo:
                         value="v2_fast",
                         elem_id="fase1-model-version",
                     )
                     civil_channel_1 = gr.Dropdown(
                         label="Canal del Civil (persona que truca)",
                         choices=["Left", "Right"],
                         value="Left",
                     )
-                    input_1 = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio")
                 with gr.Column(scale=1):
                     output_1 = gr.Textbox(label="Output", lines=8)
-            with gr.Row(variant="panel"):
-                clear_btn = gr.Button("Clear")
-                submit_btn = gr.Button("Submit", variant="primary")
-            submit_btn.click(fn=transcribe_fase_1, inputs=[input_1, model_version_1, civil_channel_1], outputs=[output_1])
-            clear_btn.click(fn=clear_fase_1, inputs=[model_version_1, civil_channel_1], outputs=[input_1, model_version_1, civil_channel_1], queue=False)
         with gr.Tab("Fase 2"):
-            description_string = (
                 "### 🧠 Transcripció de trucades multilingüe de bona qualitat per a anàlisi d'informe\n"
-                "- **v2_fast_and_detection_v1**: Inclou inferència ràpida, separació de parlants i explotació de nova informació per processos analítics i informes avançats."
             )
-            gr.Markdown(description_string)
             with gr.Row():
                 with gr.Column(scale=1):
@@ -77,33 +111,60 @@ with gr.Blocks(theme=theme) as demo:
                         value="v2_fast_and_detection_v1",
                         elem_id="fase2-model-version",
                     )
                     civil_channel_2 = gr.Dropdown(
                         label="Canal del Civil (persona que truca)",
                         choices=["Left", "Right"],
                         value="Left",
                     )
-                    input_2 = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio")
                 with gr.Column(scale=1):
                     output_text = gr.Textbox(label="Transcripció ASR", lines=8)
                     output_sex = gr.Textbox(label="Gènere", lines=1)
                     output_age = gr.Textbox(label="Edat", lines=1)
                     output_silence = gr.Textbox(label="Detecció de silenci", lines=2)
                     output_shout = gr.Textbox(label="Detecció de crits", lines=2)
-                    output_meteo = gr.Textbox(label="Detecció d'esdeveniment meteorològic", lines=2)
-            with gr.Row(variant="panel"):
-                clear_btn2 = gr.Button("Clear")
-                submit_btn2 = gr.Button("Submit", variant="primary")
-            submit_btn2.click(
-                fn=transcribe_fase_2_display,
-                inputs=[input_2, model_version_2, civil_channel_2],
-                outputs=[output_text, output_sex, output_age, output_silence, output_shout, output_meteo]
-            )
-            clear_btn2.click(fn=clear_fase_2, inputs=[model_version_2, civil_channel_2], outputs=[input_2, model_version_2, civil_channel_2, output_text, output_sex, output_age, output_silence, output_shout, output_meteo], queue=False)
 if __name__ == "__main__":
     demo.launch()

 import os
 import gradio as gr
 import spaces
 from whisper_cs_fase_1 import generate_fase_1
 from whisper_cs_fase_2 import generate_fase_2
 from AinaTheme import theme
+def generate_fase(audio_path, model_version, civil_channel, fase):
+    if fase == 1:
+        text = generate_fase_1(
+            audio_path,
+            model_version=model_version,
+            civil_channel=civil_channel
+        )
+        return text, None, None, None, None, None
+    elif fase == 2:
+        text, sex, age, silence_event, shout_event, meteo_event = generate_fase_2(
+            audio_path,
+            model_version=model_version,
+            civil_channel=civil_channel
+        )
+        return text, sex, age, silence_event, shout_event, meteo_event
+    else:
+        raise ValueError("Invalid fase. Must be 1 or 2.")
+@spaces.GPU
+def transcribe(inputs: str, model_version: str, civil_channel: str, fase: int):
     if inputs is None:
+        raise gr.Error(
+            "Cap fitxer d'àudio introduit! Si us plau pengeu un fitxer o enregistreu un àudio abans d'enviar la vostra sol·licitud"
+        )
+    return generate_fase(inputs, model_version, civil_channel, fase)
 def clear_fase_1(model_version, civil_channel):
 with gr.Blocks(theme=theme) as demo:
     gr.Markdown("## 🗣️ Transcripció automàtica d'àudio — Mode amb dues fases")
     with gr.Tabs():
         with gr.Tab("Fase 1"):
+            gr.Markdown(
                 "### 🎧 Transcripció de trucades multilingüe de bona qualitat per a transcripció fiable\n"
                 "- **v2_fast**: Inclou separació de canals i inferència ràpida.\n"
                 "- **v1.0**: Inclou inferència moderada sense separació de canals."
             )
             with gr.Row():
                 with gr.Column(scale=1):
                         value="v2_fast",
                         elem_id="fase1-model-version",
                     )
                     civil_channel_1 = gr.Dropdown(
                         label="Canal del Civil (persona que truca)",
                         choices=["Left", "Right"],
                         value="Left",
                     )
+                    input_1 = gr.Audio(
+                        sources=["upload", "microphone"],
+                        type="filepath",
+                        label="Audio",
+                    )
                 with gr.Column(scale=1):
                     output_1 = gr.Textbox(label="Output", lines=8)
+                with gr.Row(variant="panel"):
+                    clear_btn = gr.Button("Clear")
+                    submit_btn = gr.Button("Submit", variant="primary")
+                    submit_btn.click(
+                        fn=transcribe,
+                        inputs=[input_1, model_version_1, civil_channel_1, gr.Number(value=1)],
+                        outputs=[output_1],
+                    )
+                    clear_btn.click(
+                        fn=clear_fase_1,
+                        inputs=[model_version_1, civil_channel_1],
+                        outputs=[input_1, model_version_1, civil_channel_1],
+                        queue=False,
+                    )
         with gr.Tab("Fase 2"):
+            gr.Markdown(
                 "### 🧠 Transcripció de trucades multilingüe de bona qualitat per a anàlisi d'informe\n"
+                "- **v2_fast_and_detection_v1**: Inclou inferència ràpida, separació de parlants i explotació d'informació detectada."
             )
             with gr.Row():
                 with gr.Column(scale=1):
                         value="v2_fast_and_detection_v1",
                         elem_id="fase2-model-version",
                     )
                     civil_channel_2 = gr.Dropdown(
                         label="Canal del Civil (persona que truca)",
                         choices=["Left", "Right"],
                         value="Left",
                     )
+                    input_2 = gr.Audio(
+                        sources=["upload", "microphone"],
+                        type="filepath",
+                        label="Audio",
+                    )
                 with gr.Column(scale=1):
                     output_text = gr.Textbox(label="Transcripció ASR", lines=8)
                     output_sex = gr.Textbox(label="Gènere", lines=1)
                     output_age = gr.Textbox(label="Edat", lines=1)
                     output_silence = gr.Textbox(label="Detecció de silenci", lines=2)
                     output_shout = gr.Textbox(label="Detecció de crits", lines=2)
+                    output_meteo = gr.Textbox(label="Detecció meteo", lines=2)
+                with gr.Row(variant="panel"):
+                    clear_btn2 = gr.Button("Clear")
+                    submit_btn2 = gr.Button("Submit", variant="primary")
+                    submit_btn2.click(
+                        fn=transcribe,
+                        inputs=[input_2, model_version_2, civil_channel_2, gr.Number(value=2)],
+                        outputs=[
+                            output_text,
+                            output_sex,
+                            output_age,
+                            output_silence,
+                            output_shout,
+                            output_meteo,
+                        ],
+                    )
+                    clear_btn2.click(
+                        fn=clear_fase_2,
+                        inputs=[model_version_2, civil_channel_2],
+                        outputs=[
+                            input_2,
+                            model_version_2,
+                            civil_channel_2,
+                            output_text,
+                            output_sex,
+                            output_age,
+                            output_silence,
+                            output_shout,
+                            output_meteo,
+                        ],
+                        queue=False,
+                    )
 if __name__ == "__main__":
     demo.launch()

pyannote/config.yaml DELETED Viewed

@@ -1,18 +0,0 @@
-version: 3.1.0
-pipeline:
-  name: pyannote.audio.pipelines.SpeakerDiarization
-  params:
-    clustering: AgglomerativeClustering
-    embedding: ./pyannote/wespeaker-voxceleb-resnet34-LM.bin
-    embedding_batch_size: 32
-    embedding_exclude_overlap: false
-    segmentation: ./pyannote/segmentation-3.0.bin
-    segmentation_batch_size: 32
-params:
-  clustering:
-    method: centroid
-    min_cluster_size: 12
-    threshold: 0.7045654963945799
-  segmentation:
-    min_duration_off: 0.09791355693027545

pyannote/pytorch_model.bin DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:0b5b3216d60a2d32fc086b47ea8c67589aaeb26b7e07fcbe620d6d0b83e209ea
-size 17719103

pyannote/segmentation-3.0.bin DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:da85c29829d4002daedd676e012936488234d9255e65e86dfab9bec6b1729298
-size 5905440

pyannote/wespeaker-voxceleb-resnet34-LM.bin DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:366edf44f4c80889a3eb7a9d7bdf02c4aede3127f7dd15e274dcdb826b143c56
-size 26645418

requirements.txt CHANGED Viewed

@@ -1,16 +1,20 @@
-torch
-torchaudio
-transformers==4.40.2 #gated models
-ctranslate2==4.6.0
 faster_whisper==1.2.0
 hf_transfer==0.1.9
-pyannote.audio==3.3.2
-yt-dlp==2025.7.21
-gradio==5.41.1
 librosa==0.10.1
 ffmpeg-python==0.2.0
 aina-gradio-theme==2.3
 spaces==0.39.0
-peft==0.11.1
-whisper_timestamped==1.15.8
 typing==3.7.4.3

+torch==2.6.0+cu124 #speechbrain fix
+torchaudio==2.6.0+cu124 #speechbrain fix
+--extra-index-url https://download.pytorch.org/whl/cu124 #speechbrain fix
+nvidia-cudnn-cu12>=9.1.0, <9.2 #ctranslate2 fix
+ctranslate2>=4.3
 faster_whisper==1.2.0
+whisper_timestamped==1.15.8
+transformers==4.40.2
 hf_transfer==0.1.9
+huggingface_hub
 librosa==0.10.1
+soundfile
 ffmpeg-python==0.2.0
+speechbrain
+pydub
+gradio==5.41.1
 aina-gradio-theme==2.3
 spaces==0.39.0
 typing==3.7.4.3
+yt-dlp==2025.7.21