"""Gradio demo that transcribes audio with the ``tacab/tacab_asr_somali`` model.

Loading this module downloads/loads the Wav2Vec2 CTC model and its processor
and moves the model to CUDA when available.  Running it as a script also
starts the Gradio web interface.
"""

from typing import Optional

import gradio as gr
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

MODEL_ID = "tacab/tacab_asr_somali"

# Sample rate every input is converted to before it reaches the processor.
# NOTE(review): presumably the rate the checkpoint was trained at -- confirm
# against the model card before changing it.
TARGET_SAMPLE_RATE = 16000

model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


def transcribe(audio_path: Optional[str]) -> str:
    """Transcribe the audio file at *audio_path* and return lower-cased text.

    The audio is down-mixed to a single channel, resampled to
    ``TARGET_SAMPLE_RATE`` when needed, and decoded greedily (arg-max over the
    model's logits at every frame, then ``processor.batch_decode``).

    Args:
        audio_path: Path to an audio file readable by ``torchaudio.load``.
            ``None`` or an empty string (what the UI sends when nothing was
            recorded or uploaded) is accepted.

    Returns:
        The lower-cased transcription, or ``""`` when there is no audio to
        transcribe (missing path or a file with zero samples).
    """
    # Gradio calls the handler with None when the user submits without audio.
    if not audio_path:
        return ""

    waveform, sample_rate = torchaudio.load(audio_path)

    # Nothing to decode; feeding an empty tensor to the model would fail.
    if waveform.numel() == 0:
        return ""

    # Down-mix before resampling so only one channel has to be resampled.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    if sample_rate != TARGET_SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate,
            new_freq=TARGET_SAMPLE_RATE,
        )
        waveform = resampler(waveform)

    # squeeze(0) drops only the channel axis; a bare squeeze() would also
    # collapse a one-sample clip into a 0-d array.
    inputs = processor(
        waveform.squeeze(0).numpy(),
        sampling_rate=TARGET_SAMPLE_RATE,
        return_tensors="pt",
    )
    input_values = inputs.input_values.to(device)

    with torch.no_grad():
        logits = model(input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    return transcription.lower()


iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath", label="🎙️ Somali Audio"),
    outputs=gr.Text(label="📄 Transcription"),
    title="Tacab Somali ASR",
    description="Speak Somali and get transcription back!",
)

if __name__ == "__main__":
    # 0.0.0.0 binds to every network interface so the app is reachable from
    # outside the host/container.
    iface.launch(server_name="0.0.0.0")