# Copyright (c) 2022 Horizon Robotics. (authors: Binbin Zhang)
#               2022 Chengdong Liang (liangchengdong@mail.nwpu.edu.cn)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gradio as gr
import torch

from wenet.cli.model import load_model


def process_cat_embs(cat_embs):
    """Parse a comma-separated weight string into a float tensor."""
    device = "cpu"
    cat_embs = torch.tensor(
        [float(c) for c in cat_embs.split(',')]).to(device)
    return cat_embs
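
# Illustrative usage (assumed values, not part of the app flow):
#   process_cat_embs("0.7,0.3")  ->  tensor([0.7000, 0.3000])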


def download_rev_models():
    """Fetch the Reverb ASR checkpoint and unit vocabulary from the Hub."""
    from huggingface_hub import hf_hub_download

    REPO_ID = "Revai/reverb-asr"
    files = ['reverb_asr_v1.jit.zip', 'tk.units.txt']
    downloaded_files = [
        hf_hub_download(repo_id=REPO_ID, filename=f) for f in files
    ]
    model = load_model(downloaded_files[0], downloaded_files[1])
    return model
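
# hf_hub_download caches files locally (by default under
# ~/.cache/huggingface/hub), so restarts reuse the downloaded model
# instead of fetching it again.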


model = download_rev_models()


def recognition(audio, style=0):
    """Transcribe audio, blending non-verbatim and verbatim styles."""
    if audio is None:
        return "Input error: please provide an audio recording."
    # NOTE: the model expects 16 kHz audio.
    cat_embs = ','.join([str(s) for s in (style, 1 - style)])
    cat_embs = process_cat_embs(cat_embs)
    ans = model.transcribe(audio, cat_embs=cat_embs)
    if ans is None:
        return "Error: no text output. Please try again."
    txt = ans['text']
    # Strip the sentencepiece word-boundary marker.
    txt = txt.replace('▁', ' ')
    return txt
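
# The function can also be called directly, e.g. with one of the bundled
# example clips (16 kHz WAV):
#   print(recognition('examples/POD1000000012_S0000335.wav', style=1))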


# input components
inputs = [
    gr.Audio(sources=["microphone"], type="filepath", label='Input audio'),
    gr.Slider(0, 1, value=0,
              label="Verbatimicity - from non-verbatim (0) to verbatim (1)",
              info="Choose a transcription style between non-verbatim and verbatim"),
]
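
# Per the slider label, 0 selects the non-verbatim style and 1 the
# verbatim style; intermediate values blend the two category embeddings.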

# each example supplies an audio path plus a default slider value
examples = [
    ['examples/POD1000000012_S0000335.wav', 0],
    ['examples/POD1000000013_S0000062.wav', 0],
    ['examples/POD1000000032_S0000020.wav', 0],
    ['examples/POD1000000032_S0000038.wav', 0],
    ['examples/POD1000000032_S0000050.wav', 0],
    ['examples/POD1000000032_S0000058.wav', 0],
]

output = gr.Textbox(label="Output Text")

title = "Reverb ASR Transcription Styles Demo"

# description
description = (
    "Reverb ASR supports verbatim and non-verbatim transcription. "
    "Try recording audio with disfluencies (e.g. 'uh', 'um') and test "
    "both transcription styles, or choose an example audio below."
)

article = (
    "<p style='text-align: center'>"
    "<a href='https://rev.com' target='_blank'>Learn more about Rev</a>"
    "</p>")

interface = gr.Interface(
    fn=recognition,
    inputs=inputs,
    outputs=output,
    title=title,
    description=description,
    article=article,
    examples=examples,
)

interface.queue().launch()