import os
import gradio as gr
import numpy as np
import torch
import transformers
from pathlib import Path
from transformers import pipeline
from transformers.utils import logging
# Log
#logging.set_verbosity_debug()
logger = logging.get_logger("transformers")
# Pipelines
## Automatic Speech Recognition
## https://huggingface.co/docs/transformers/task_summary#automatic-speech-recognition
## Requires ffmpeg to be installed
asr_device = "cuda:0" if torch.cuda.is_available() else "cpu"
asr_torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
asr_model = "openai/whisper-tiny"
asr = pipeline(
    "automatic-speech-recognition",
    model=asr_model,
    torch_dtype=asr_torch_dtype,
    device=asr_device
)
## Token Classification / Named Entity Recognition
## https://huggingface.co/docs/transformers/task_summary#token-classification
tc_device = 0 if torch.cuda.is_available() else "cpu"
tc_model = os.environ.get("TOKEN_CLASSIFICATION_MODEL", "dslim/distilbert-NER")
tc = pipeline(
    "token-classification",  # ner
    model=tc_model,
    device=tc_device
)
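## Illustrative sketch of the entity format this pipeline returns
## (scores and offsets below are made up, not actual model output):
## tc("My name is Wolfgang and I live in Berlin")
## -> [{"entity": "B-PER", "score": 0.99, "index": 4, "word": "Wolfgang", "start": 11, "end": 19}, ...]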
# ---
# Transformers
# https://www.gradio.app/main/docs/gradio/audio#behavior
# As output component: expects audio data in any of these formats:
# - a str or pathlib.Path filepath
# - or URL to an audio file,
# - or a bytes object (recommended for streaming),
# - or a tuple of (sample rate in Hz, audio data as numpy array)
def transcribe(audio: str | Path | bytes | tuple[int, np.ndarray] | None):
| logger.debug(">Transcribe") | |
| if audio is None: | |
| return "..." | |
| # TODO Manage str/Path | |
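    # Minimal sketch for the TODO above, assuming the pipeline's documented
    # support for filename/URL inputs: forward str/Path values directly and
    # let the pipeline decode the file (via ffmpeg).
    if isinstance(audio, (str, Path)):
        return asr(str(audio))["text"]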
| text = "" | |
| # https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.AutomaticSpeechRecognitionPipeline.__call__ | |
| # Whisper input format for tuple differ from output provided by gradio audio component | |
| if asr_model.startswith("openai/whisper") and type(audio) is tuple: | |
        sampling_rate, raw = audio
        # Convert to mono if stereo
        if raw.ndim > 1:
            raw = raw.mean(axis=1)
        # Convert according to asr_torch_dtype
        # (compare the dtype directly; type() would always be torch.dtype)
        raw = raw.astype(np.float16 if asr_torch_dtype is torch.float16 else np.float32)
        # Normalize to [-1, 1], guarding against division by zero on silent input
        peak = np.max(np.abs(raw))
        if peak > 0:
            raw /= peak
        inputs = {"sampling_rate": sampling_rate, "raw": raw}
        logger.debug(inputs)
        transcript = asr(inputs)
        text = transcript['text']
        logger.debug(text)
    return text
def tokenize(text: str):
    logger.debug(">Tokenize")
    entities = tc(text)
    logger.debug(entities)
    # TODO Add Text Classification for sentiment analysis
    return {"text": text, "entities": entities}
def classify(text: str):
    logger.debug(">Classify")
    return None
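## Possible sketch for the sentiment TODO above, assuming a standard
## text-classification pipeline (model choice is illustrative):
#sa = pipeline("text-classification", model="distilbert/distilbert-base-uncased-finetuned-sst-2-english")
#def classify(text: str):
#    logger.debug(">Classify")
#    return sa(text)  # e.g. [{'label': 'POSITIVE', 'score': 0.99}]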
def transcribe_tokenize(*args):
    return tokenize(transcribe(*args))
# ---
# Gradio
## Interfaces
# https://www.gradio.app/main/docs/gradio/audio
input_audio = gr.Audio(
    sources=["upload", "microphone"],
    show_share_button=False
)
## App
asrner_app = gr.Interface(
    transcribe_tokenize,
    inputs=[
        input_audio
    ],
    outputs=[
        gr.HighlightedText()
    ],
    title="ASR>NER",
    description=(
        "Transcribe, Tokenize, Classify"
    ),
    flagging_mode="never"
)
ner_app = gr.Interface(
    tokenize,
    inputs=[
        gr.Textbox()
    ],
    outputs=[
        gr.HighlightedText()
    ],
    title="NER",
    description=(
        "Tokenize, Classify"
    ),
    flagging_mode="never"
)
gradio_app = gr.TabbedInterface(
    interface_list=[
        asrner_app,
        ner_app
    ],
    tab_names=[
        asrner_app.title,
        ner_app.title
    ],
    title="ASRNERSBX"
)
## Start!
gradio_app.launch()