Spaces:
Build error
Build error
| import gradio as gr | |
| import librosa | |
| import soundfile as sf | |
| import torch | |
| import warnings | |
| import os | |
| from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2CTCTokenizer, Wav2Vec2Model | |
| from engine import SpeechToTextEngine | |
| import wave | |
| from nemo_asr import transcribe | |
| warnings.filterwarnings("ignore") | |
| from speechbrain.pretrained import EncoderDecoderASR | |
# Identifier of the pretrained SpeechBrain ASR model and the local directory
# passed as `savedir` when loading it.
_ASR_SOURCE = "speechbrain/asr-wav2vec2-commonvoice-rw"
_ASR_SAVEDIR = "pretrained_models/asr-wav2vec2-commonvoice-rw"

# Loaded once at import time and reused by every transcription request.
# Example usage:
#   asr_model.transcribe_file("speechbrain/asr-wav2vec2-commonvoice-rw/example.mp3")
asr_model = EncoderDecoderASR.from_hparams(
    source=_ASR_SOURCE,
    savedir=_ASR_SAVEDIR,
)
def asr_transcript(audio):
    """Transcribe an audio clip with both the SpeechBrain and Coqui STT models.

    Args:
        audio: Value supplied by the Gradio audio input. ``None`` means no
            audio was provided; otherwise ``audio.name`` is read as the path
            of the audio file on disk.

    Returns:
        A 2-tuple ``(speechbrain_text, coqui_text)``, one element per output
        textbox of the interface. When the input cannot be used, both
        elements carry the same explanatory message so the shape of the
        result always matches the two declared outputs.
    """
    if audio is None:
        prompt = "Please provide audio by uploading a file or by recording audio using microphone by pressing Record (And allow usage of microphone)"
        return prompt, prompt

    if not audio:
        # Falsy but not None: nothing usable was supplied. Return a pair so
        # both output textboxes receive a value.
        return "File not valid", "File not valid"

    # Read the raw bytes first so an empty file is rejected before the
    # (expensive) SpeechBrain transcription is run.
    with open(audio.name, "rb") as audio_file:
        audio_bytes = audio_file.read()
    if not audio_bytes:
        return "Audio not provided", "Audio not provided"

    # SpeechBrain transcribes straight from the file path.
    text_asr = asr_model.transcribe_file(audio.name)

    # The Coqui engine is fed the raw file bytes.
    # NOTE(review): a new engine is built on every request; consider reusing
    # one instance if SpeechToTextEngine is safe to share — confirm first.
    stt_engine = SpeechToTextEngine()
    text_coqui = stt_engine.run(audio_bytes)

    return text_asr.lower(), text_coqui
# Single audio input; `type="file"` is what lets the callback read `audio.name`.
_audio_input = gr.inputs.Audio(label="Upload Audio File", type="file", optional=False)

# One textbox per model, in the same order as the callback's return tuple.
_speechbrain_output = gr.outputs.Textbox(label="Recognized speech from speechbrain model")
_coqui_output = gr.outputs.Textbox(label="Recognized speech from coqui STT model")

# Text passed to the interface as its `article`.
_ARTICLE = """
This demo showcases two pretrained STT models the first model from speechbrain(wave2vec+CTC models)(1,2gb) is 30 times larger compared to the coqui STT (deepspeech model)(45mb).
"""

# Sample clips could be supplied through the `examples` argument, e.g.
# [["sample_1.wav"], ["sample_2.wav"]]; it is currently not set.
gradio_ui = gr.Interface(
    fn=asr_transcript,
    inputs=[_audio_input],
    outputs=[_speechbrain_output, _coqui_output],
    title="Kinyarwanda Speech Recognition",
    description="Record an audio clip from browser using microphone, and let AI do the hard work of transcribing.",
    article=_ARTICLE,
)

# Start the web app with request queuing enabled.
gradio_ui.launch(enable_queue=True)