Spaces:
Running
Running
| import io | |
| import os | |
| import wave | |
| import numpy as np | |
| from openai import OpenAI | |
| from numpy.typing import NDArray | |
| __all__ = ["speech_to_text"] | |
| class OpenAISTT: | |
| def __init__( | |
| self, | |
| api_key: str = os.getenv("STT_OPENAI_API_KEY", ""), | |
| api_base: str = os.getenv("STT_OPENAI_API_BASE_URL", "https://api.sambanova.ai/v1"), | |
| model: str = "Whisper-Large-v3", | |
| ): | |
| self.openai_client = OpenAI( | |
| base_url=api_base, | |
| api_key=api_key, | |
| ) | |
| self.model = model | |
| def _numpy_to_wav_bytes(self, sample_rate: int, audio: NDArray[np.int16]) -> bytes: | |
| """ | |
| Convert numpy int16 audio array to a WAV bytes buffer. | |
| """ | |
| with io.BytesIO() as buf: | |
| with wave.open(buf, "wb") as wf: | |
| wf.setnchannels(1) # mono audio | |
| wf.setsampwidth(2) # 2 bytes for int16 | |
| wf.setframerate(sample_rate) | |
| wf.writeframes(audio.tobytes()) | |
| return buf.getvalue() | |
| def transcribe_from_tuple( | |
| self, audio_input: tuple[int, NDArray[np.int16]], language: str | None = None | |
| ) -> str: | |
| """ | |
| Transcribe audio from a tuple (sample_rate, audio_array). | |
| :param audio_input: Tuple of (sample_rate, np.int16 numpy array). | |
| :param language: Optional language code. | |
| :return: Transcription string. | |
| """ | |
| sample_rate, audio_array = audio_input | |
| wav_bytes = self._numpy_to_wav_bytes(sample_rate, audio_array) | |
| # Prepare the file-like object to pass to openai.Audio.transcribe | |
| audio_file = io.BytesIO(wav_bytes) | |
| audio_file.name = "audio.wav" # some APIs require the filename attribute | |
| try: | |
| if language: | |
| response = self.openai_client.audio.transcriptions.create( | |
| model=self.model, | |
| file=audio_file, | |
| language=language, | |
| ) | |
| else: | |
| response = self.openai_client.audio.transcriptions.create( | |
| model=self.model, | |
| file=audio_file, | |
| ) | |
| return response.text | |
| except Exception as e: | |
| raise RuntimeError(f"Transcription failed: {str(e)}") | |
# Single client instance created at import time and shared by the module.
model = OpenAISTT()

# Public entry point: must expose the same interface as the other STT
# backends (e.g. zero_gpu_stt.py).
speech_to_text = model.transcribe_from_tuple