Spaces:

MikeTangoEcho
/

asrnersbx

Paused

App Files Files Community

MikeTangoEcho committed on Nov 10, 2024

Commit

48b9b5d

0 Parent(s):

initial commit

Browse files

Files changed (4) hide show

README.md +9 -0
app.py +88 -0
packages.txt +1 -0
requirements.txt +2 -0

README.md ADDED Viewed

	@@ -0,0 +1,9 @@

+---
+title: COC
+emoji: 🎤
+sdk: gradio
+sdk_version: 5.5.0
+app_file: app.py
+pinned: false
+disable_embedding: true
+---

app.py ADDED Viewed

	@@ -0,0 +1,88 @@

+from pathlib import Path
+import gradio as gr
+import numpy as np
+import torch
+from transformers import pipeline
+# Pipelines
+device = 0 if torch.cuda.is_available() else "cpu"
+## Automatic Speech Recognition
+## https://huggingface.co/docs/transformers/task_summary#automatic-speech-recognition
+## Require ffmpeg to be installed
+asr_model = "openai/whisper-tiny"
+asr = pipeline(
+    "automatic-speech-recognition",
+    model=asr_model,
+#    torch_dtype=torch.float16,
+    device=device
+)
+## Token Classification / Name Entity Recognition
+## https://huggingface.co/docs/transformers/task_summary#token-classification
+tc_model = "dslim/distilbert-NER"
+tc = pipeline(
+    "token-classification", # ner
+    model=ner_model,
+    device=device
+)
+# ---
+# Transformers
+# https://www.gradio.app/main/docs/gradio/audio#behavior
+# As output component: expects audio data in any of these formats:
+# - a str or pathlib.Path filepath
+# - or URL to an audio file,
+# - or a bytes object (recommended for streaming),
+# - or a tuple of (sample rate in Hz, audio data as numpy array)
+def transcribe(audio: str | Path | bytes | tuple[int, np.ndarray] | None):
+    if audio is None:
+        return "..."
+    # TODO Manage str/Path
+    text = ""
+    # https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.AutomaticSpeechRecognitionPipeline.__call__
+    # Whisper input format for tuple differ from output provided by gradio audio component
+    if asr_model.startswith("openai/whisper"):
+        inputs = {"sampling_rate": audio[0], "raw": audio[1]} if type(audio) is tuple and else audio
+        transcript = asr(inputs)
+        text = transcript['text']
+    entities = tc(text)
+    # TODO Add Text Classification for sentiment analysis
+    return {"text": text, "entities": entities}
+# ---
+# Gradio
+## Interfaces
+# https://www.gradio.app/main/docs/gradio/audio
+input_audio = gr.Audio(
+    sources=["upload", "microphone"],
+    show_share_button=False
+)
+## App
+gradio_app = gr.Interface(
+    transcribe,
+    inputs=[
+        input_audio
+    ],
+    outputs=[
+        gr.HighlightedText()
+    ],
+    theme="huggingface"
+    title="ASRNERSBX"
+    description=(
+        "Transcribe, Tokenize, Classify"
+    )
+    allow_flagging="never"
+)
+## Start!
+gradio_app.launch()

packages.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ ffmpeg

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ transformers
2	+ torch