```python
import os

from pyannote.audio import Pipeline
from pydub import AudioSegment
from transformers import WhisperForConditionalGeneration, WhisperProcessor
import torchaudio
import torch

device = 0 if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float32

HF_TOKEN = os.getenv("HF_TOKEN")
MODEL_NAME = "projecte-aina/whisper-large-v3-ca-es-synth-cs"

model = WhisperForConditionalGeneration.from_pretrained(
    MODEL_NAME, torch_dtype=torch_dtype, token=HF_TOKEN
).to(device)
processor = WhisperProcessor.from_pretrained(MODEL_NAME)


def generate(audio_path):
    # Load the audio and resample it to the 16 kHz rate expected by Whisper.
    input_audio, sample_rate = torchaudio.load(audio_path)
    input_audio = torchaudio.transforms.Resample(sample_rate, 16000)(input_audio)
    input_speech = input_audio[0]

    # Convert the waveform into log-mel input features.
    input_features = processor(input_speech,
                               sampling_rate=16_000,
                               return_attention_mask=True,
                               return_tensors="pt",
                               torch_dtype=torch_dtype).input_features.to(device)

    # Generate token ids (with timestamps) and decode them into text.
    pred_ids = model.generate(input_features,
                              return_timestamps=True,
                              max_new_tokens=128)
    output = processor.batch_decode(pred_ids, skip_special_tokens=True)
    line = output[0]
    return line
```
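
A minimal usage sketch of the function above; the file name `sample.wav` is only a placeholder for any audio file that torchaudio can read, and `HF_TOKEN` only needs to be set in the environment if the checkpoint requires authentication:

```python
# Hypothetical example call: replace "sample.wav" with the path to your own
# audio file. The function returns the transcription as a single string.
transcription = generate("sample.wav")
print(transcription)
```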