push the code

Files changed:
- README.md +1 -1
- app.py +58 -0
- app_single.py +41 -0
- app_upload_model_input.py +44 -0
- engine.py +131 -0
- nemo_asr.py +17 -0
- packages.txt +2 -0
- requirements.txt +12 -0
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
 title: Kinyarwanda Asr
-emoji:
+emoji: 🚀
 colorFrom: yellow
 colorTo: indigo
 sdk: gradio
    	
app.py ADDED
@@ -0,0 +1,58 @@
+import gradio as gr
+import librosa
+import soundfile as sf
+import torch
+import warnings
+import os
+from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2CTCTokenizer, Wav2Vec2Model
+
+from engine import SpeechToTextEngine
+import wave
+
+from nemo_asr import transcribe
+
+warnings.filterwarnings("ignore")
+
+from speechbrain.pretrained import EncoderDecoderASR
+
+asr_model = EncoderDecoderASR.from_hparams(source="speechbrain/asr-wav2vec2-commonvoice-rw", savedir="pretrained_models/asr-wav2vec2-commonvoice-rw")
+# asr_model.transcribe_file("speechbrain/asr-wav2vec2-commonvoice-rw/example.mp3")
+
+# define speech-to-text function
+def asr_transcript(audio):
+    if audio is None:
+        msg = "Please provide audio by uploading a file or by recording from the microphone (press Record and allow microphone access)"
+        return msg, msg, msg
+    data = {}
+    if audio:
+        text_asr = asr_model.transcribe_file(audio.name)
+        text_nemo_transducer = transcribe(audio.name, "stt_rw_conformer_transducer_large")
+        with open(audio.name, 'rb') as f:
+            audio_proper = f.read()
+        stt_engine = SpeechToTextEngine()
+        all_hot_words = []
+        if data:
+            all_hot_words = stt_engine.add_hot_words(data)
+        if not audio_proper:
+            raise ValueError('Audio not provided')
+        # Running the transcription
+        text_coqui = stt_engine.run(audio_proper)
+        return text_asr.lower(), text_coqui, text_nemo_transducer
+    else:
+        return "File not valid", "File not valid", "File not valid"
+
+gradio_ui = gr.Interface(
+    fn=asr_transcript,
+    title="Kinyarwanda Speech Recognition",
+    description="Record an audio clip in the browser using the microphone, and let AI do the hard work of transcribing.",
+    article="""
+    This demo showcases three pretrained STT models. The speechbrain model (wav2vec2 + CTC, ~1.2 GB) is about 30 times larger than the Coqui STT model (DeepSpeech, ~45 MB).
+    """,
+    inputs=[gr.inputs.Audio(source="microphone", type="file", optional=False, label="Record from microphone")],
+    outputs=[gr.outputs.Textbox(label="Recognized speech from speechbrain model"),
+             gr.outputs.Textbox(label="Recognized speech from Coqui STT model"),
+             gr.outputs.Textbox(label="Recognized speech from NVIDIA Conformer transducer large model")],
+    # examples = [["sample_1.wav"], ["sample_2.wav"]]
+)
+
+gradio_ui.launch(enable_queue=True)
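Note: gr.inputs/gr.outputs and source="microphone"/type="file" are the legacy pre-3.0 Gradio API and have since been removed. A minimal sketch of the same three-output interface against a recent Gradio release (4.x); asr_transcript is assumed here to be adapted to take a file path string rather than a tempfile object:

    import gradio as gr

    demo = gr.Interface(
        fn=asr_transcript,  # assumed to accept a file path string
        inputs=gr.Audio(sources=["microphone"], type="filepath", label="Record from microphone"),
        outputs=[
            gr.Textbox(label="speechbrain"),
            gr.Textbox(label="Coqui STT"),
            gr.Textbox(label="NVIDIA Conformer transducer"),
        ],
        title="Kinyarwanda Speech Recognition",
    )
    demo.queue().launch()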
    	
app_single.py ADDED
@@ -0,0 +1,41 @@
+import gradio as gr
+import librosa
+import soundfile as sf
+import torch
+import warnings
+import os
+from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2CTCTokenizer, Wav2Vec2Model
+
+
+warnings.filterwarnings("ignore")
+
+from speechbrain.pretrained import EncoderDecoderASR
+
+asr_model = EncoderDecoderASR.from_hparams(source="speechbrain/asr-wav2vec2-commonvoice-rw", savedir="pretrained_models/asr-wav2vec2-commonvoice-rw")
+# asr_model.transcribe_file("speechbrain/asr-wav2vec2-commonvoice-rw/example.mp3")
+
+
+# define speech-to-text function
+def asr_transcript(audio):
+    if audio is None:
+        return "Please provide audio by uploading a file or by recording from the microphone (press Record and allow microphone access)"
+
+    if audio:
+        text = asr_model.transcribe_file(audio.name)
+        return text
+    else:
+        return "File not valid"
+
+gradio_ui = gr.Interface(
+    fn=asr_transcript,
+    title="Kinyarwanda Speech Recognition",
+    description="Upload an audio clip or record from the browser using the microphone, and let AI do the hard work of transcribing.",
+    article="""
+    This demo showcases the pretrained speechbrain (wav2vec2 + CTC) model.
+    """,
+    inputs=[gr.inputs.Audio(source="microphone", type="file", optional=False, label="Record from microphone")],
+    outputs=[gr.outputs.Textbox(label="Recognized speech")],
+    examples=[["sample_1.wav"], ["sample_2.wav"]]
+)
+
+gradio_ui.launch(enable_queue=True)
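For a quick check of the speechbrain model outside Gradio, the pretrained interface can be driven directly. A sketch, assuming a local sample_1.wav; the ~1.2 GB model files are fetched into pretrained_models/ on first use:

    from speechbrain.pretrained import EncoderDecoderASR

    asr_model = EncoderDecoderASR.from_hparams(
        source="speechbrain/asr-wav2vec2-commonvoice-rw",
        savedir="pretrained_models/asr-wav2vec2-commonvoice-rw",
    )
    print(asr_model.transcribe_file("sample_1.wav"))  # prints the transcript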
    	
app_upload_model_input.py ADDED
@@ -0,0 +1,44 @@
+import gradio as gr
+import librosa
+import soundfile as sf
+import torch
+import warnings
+import os
+from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2CTCTokenizer, Wav2Vec2Model
+
+
+warnings.filterwarnings("ignore")
+
+from speechbrain.pretrained import EncoderDecoderASR
+
+asr_model = EncoderDecoderASR.from_hparams(source="speechbrain/asr-wav2vec2-commonvoice-rw", savedir="pretrained_models/asr-wav2vec2-commonvoice-rw")
+# asr_model.transcribe_file("speechbrain/asr-wav2vec2-commonvoice-rw/example.mp3")
+
+
+# define speech-to-text function
+def asr_transcript(audio, audio_microphone, model_params):
+    # Prefer the microphone recording when both inputs are given
+    audio = audio_microphone if audio_microphone else audio
+
+    if audio is None and audio_microphone is None:
+        return "Please provide audio by uploading a file or by recording from the microphone (press Record and allow microphone access)"
+
+    if audio:
+        text = asr_model.transcribe_file(audio.name)
+        return text
+    else:
+        return "File not valid"
+
+gradio_ui = gr.Interface(
+    fn=asr_transcript,
+    title="Kinyarwanda Speech Recognition",
+    description="Upload an audio clip or record from the browser using the microphone, and let AI do the hard work of transcribing.",
+    article="""
+    This demo showcases the pretrained DeepSpeech model.
+    """,
+    inputs=[gr.inputs.Audio(label="Upload Audio File", type="file", optional=True), gr.inputs.Audio(source="microphone", type="file", optional=True, label="Record from microphone"), gr.inputs.Dropdown(choices=["deepspeech", "coqui (soon)"], type="value", default="deepspeech", label="Select speech recognition model", optional=False)],
+    outputs=[gr.outputs.Textbox(label="Recognized speech")],
+    examples=[["sample_1.wav", "sample_1.wav", "deepspeech"], ["sample_2.wav", "sample_2.wav", "deepspeech"]]
+)
+
+gradio_ui.launch(enable_queue=True)
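The model_params dropdown above is accepted but never read. A sketch of how the selection could be dispatched once more than one backend is wired up; the backend names and table are illustrative, not part of this commit:

    # Hypothetical dispatch table; only the speechbrain path exists in this commit.
    backends = {
        "deepspeech": lambda path: asr_model.transcribe_file(path),
    }

    def asr_transcript(audio, audio_microphone, model_params):
        audio = audio_microphone if audio_microphone else audio
        if audio is None:
            return "Please provide audio"
        return backends.get(model_params, backends["deepspeech"])(audio.name)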
    	
engine.py ADDED
@@ -0,0 +1,131 @@
+import queue
+import wave
+from io import BytesIO
+from pathlib import Path
+import wget
+import ffmpeg
+import numpy as np
+import webrtcvad
+from stt import Metadata
+from stt import Model, version
+
+
+def normalize_audio_input(audio):
+    """Convert arbitrary input audio to 16 kHz mono 16-bit WAV via ffmpeg."""
+    output, err = ffmpeg.input('pipe:0').output('pipe:1', f='WAV', acodec='pcm_s16le', ac=1, ar='16k', loglevel='error',
+                                                hide_banner=None).run(input=audio, capture_stdout=True,
+                                                                      capture_stderr=True)
+    if err:
+        raise Exception(err)
+    return output
+
+
+class Frame(object):
+    """Represents a "frame" of audio data."""
+
+    def __init__(self, frame_bytes, timestamp, duration):
+        self.bytes = frame_bytes
+        self.timestamp = timestamp
+        self.duration = duration
+
+
+class SpeechToTextEngine:
+    """ Class to perform speech-to-text transcription and related functionality """
+
+    FORMAT = 8
+    SAMPLE_RATE = 16000
+    CHANNELS = 1
+    BLOCKS_PER_SECOND = 50
+
+    def __init__(self, scorer='kinyarwanda.scorer') -> None:
+        """ Initialize the Coqui STT (DeepSpeech) model """
+        # Download the model files only when they are not already present
+        if not Path('kinyarwanda.scorer').exists():
+            wget.download("https://huggingface.co/mbazaNLP/kinyarwanda-coqui-stt-model/resolve/main/kinyarwanda.scorer")
+        if not Path('kinyarwanda.tflite').exists():
+            wget.download("https://huggingface.co/mbazaNLP/kinyarwanda-coqui-stt-model/resolve/main/kinyarwanda.tflite")
+
+        self.model = Model('kinyarwanda.tflite')
+        self.model.enableExternalScorer(
+            scorer_path=Path(__file__).parents[0].joinpath(scorer).absolute().as_posix())
+        self.vad = webrtcvad.Vad(mode=3)
+        self.sample_rate = self.SAMPLE_RATE
+        self.buffer_queue = queue.Queue()
+
+    def run(self, audio) -> str:
+        """ Receives the audio, normalizes it, and sends it to the model for transcription. Returns the
+        transcription as a string. """
+
+        normalized_audio = normalize_audio_input(audio)
+        audio_streams = BytesIO(normalized_audio)
+        with wave.open(audio_streams) as wav:
+            audio_streams = np.frombuffer(wav.readframes(wav.getnframes()), np.int16)
+        results = self.model.stt(audio_buffer=audio_streams)
+        return results
+
+    def run_with_metadata(self, audio) -> Metadata:
+        normalized_audio = normalize_audio_input(audio)
+        audio_streams = BytesIO(normalized_audio)
+        with wave.open(audio_streams) as wav:
+            audio_streams = np.frombuffer(wav.readframes(wav.getnframes()), np.int16)
+        results = self.model.sttWithMetadata(audio_buffer=audio_streams)
+        return results
+
+    def add_hot_words(self, data) -> list:
+        """ Receives data in the form of hot-words and boosts, adds them to the language model and returns
+        the list of added hot-words """
+
+        all_hot_words = []
+        try:
+            print('----------------------------------------------------')
+            for hot_word in data:
+                # Change all the characters of the hot-word to lower case
+                word = hot_word.lower()
+
+                # Get the numeric value of the boost
+                boost = float(data.get(hot_word))
+
+                # Add the hot-word and its boost to the language model
+                self.model.addHotWord(word, boost)
+
+                # Log the activity
+                print(f"`{word}` hot-word with boost `{boost}` was added.")
+                all_hot_words.append(word)
+            return all_hot_words
+        except RuntimeError:
+            return []
+
+    def erase_hot_word(self, hot_words) -> None:
+        try:
+            for hot_word in hot_words:
+                self.model.eraseHotWord(hot_word)
+                print(f"`{hot_word}` hot-word is erased.")
+            print('----------------------------------------------------')
+        except RuntimeError:
+            return
+
+    def clear_hot_words(self) -> str:
+        try:
+            self.model.clearHotWords()
+            return "All hot-words were erased."
+        except RuntimeError:
+            return "No more hot-words are left."
+
+    def deep_stream(self):
+        return self.model.createStream()
+
+    def frame_generator(self, audio, sample_rate=16000, frame_duration_ms=30):
+        """
+        Takes the desired frame duration in milliseconds, the PCM data, and
+        the sample rate. Yields Frames of the requested duration.
+        """
+
+        # audio = np.frombuffer(audio, np.int16)
+        n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)  # bytes per frame (16-bit samples)
+        offset = 0
+        timestamp = 0.0
+        duration = (float(n) / sample_rate) / 2.0
+        while offset + n < len(audio):
+            yield Frame(audio[offset:offset + n], timestamp, duration)
+            timestamp += duration
+            offset += n
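SpeechToTextEngine.__init__ creates a webrtcvad.Vad, but nothing in this file consults it. A sketch of how frame_generator and the VAD could be combined to keep only voiced audio before decoding; pcm is an assumption here (raw 16 kHz mono 16-bit PCM bytes, e.g. the frames read out of the normalized WAV):

    import numpy as np

    engine = SpeechToTextEngine()
    voiced = bytearray()
    # 10/20/30 ms are the only frame sizes webrtcvad accepts
    for frame in engine.frame_generator(pcm, sample_rate=16000, frame_duration_ms=30):
        if engine.vad.is_speech(frame.bytes, 16000):
            voiced.extend(frame.bytes)
    # Feed the voiced samples straight to the Coqui model (run() would
    # re-normalize via ffmpeg, which cannot probe headerless PCM)
    text = engine.model.stt(np.frombuffer(bytes(voiced), np.int16))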
    	
nemo_asr.py ADDED
@@ -0,0 +1,17 @@
+import nemo
+import nemo.collections.asr as nemo_asr
+from pydub import AudioSegment
+
+
+def transcribe(file_path, model_name="stt_rw_conformer_transducer_large"):
+    """Transcribe the audio file at `file_path` with an NVIDIA NeMo model."""
+    asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(model_name=model_name)
+    # The browser may hand us mp3/webm; NeMo expects 16 kHz mono 16-bit WAV
+    if not file_path.endswith(".wav"):
+        sound = AudioSegment.from_file(file_path)
+        file_path = file_path + ".wav"
+        sound.set_frame_rate(16000).set_channels(1).set_sample_width(2).export(file_path, format="wav")
+    files = [file_path]
+    for fname, transcription in zip(files, asr_model.transcribe(paths2audio_files=files)):
+        print(f"Audio in {fname} was recognized as: {transcription}")
+        return {"text": transcription[0], "filename": fname}
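As called from app.py, transcribe receives a plain file path. A usage sketch; note the conformer-transducer checkpoint is downloaded on the first call and, as written, reloaded on every request, so the first transcription is slow:

    result = transcribe("sample_1.wav")  # defaults to stt_rw_conformer_transducer_large
    print(result["text"], "<-", result["filename"])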
    	
packages.txt ADDED
@@ -0,0 +1,2 @@
+libsndfile1
+ffmpeg
    	
requirements.txt ADDED
@@ -0,0 +1,12 @@
+librosa==0.9.1
+soundfile==0.10.3.post1
+torch==1.11.0
+transformers==4.18.0
+speechbrain
+stt
+webrtcvad
+numpy
+ffmpeg-python
+pydub
+wget
+nemo_toolkit[asr] @ git+https://github.com/NVIDIA/NeMo.git