#from transformers import WhisperProcessor, WhisperForConditionalGeneration
#from datasets import Audio, load_dataset
#
## load model and processor
#processor = WhisperProcessor.from_pretrained("openai/whisper-base")
#model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
#forced_decoder_ids = processor.get_decoder_prompt_ids(language="french", task="transcribe")
#
## load streaming dataset and read first audio sample
## (ds was undefined here; the dataset below is an assumption — any streaming
## audio dataset works, resampled to the 16 kHz input Whisper expects)
#ds = load_dataset("facebook/multilingual_librispeech", "french", split="test", streaming=True)
#ds = ds.cast_column("audio", Audio(sampling_rate=16_000))
#input_speech = next(iter(ds))["audio"]
#input_features = processor(input_speech["array"], sampling_rate=input_speech["sampling_rate"], return_tensors="pt").input_features
#
## generate token ids
#predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
## decode token ids to text, keeping special tokens such as <|startoftranscript|>
#transcription = processor.batch_decode(predicted_ids)
#
## decode again with the special tokens stripped, leaving plain text
#transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
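#
## A minimal sketch of the same step using the newer generate() API, which
## accepts language/task directly (recent transformers releases deprecate
## forced_decoder_ids in favor of this). Version-dependent, so treat it as an
## assumption about the installed library rather than a drop-in replacement:
#predicted_ids = model.generate(input_features, language="french", task="transcribe")
#transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)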