Spaces:
Running
Running
File size: 2,422 Bytes
f8b769b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
import io
import os
import wave
import numpy as np
from openai import OpenAI
from numpy.typing import NDArray
# Public API of this module: a star-import exposes only the module-level
# `speech_to_text` callable, not the class or the `model` instance.
__all__ = ["speech_to_text"]
class OpenAISTT:
def __init__(
self,
api_key: str = os.getenv("STT_OPENAI_API_KEY", ""),
api_base: str = os.getenv("STT_OPENAI_API_BASE_URL", "https://api.sambanova.ai/v1"),
model: str = "Whisper-Large-v3",
):
self.openai_client = OpenAI(
base_url=api_base,
api_key=api_key,
)
self.model = model
def _numpy_to_wav_bytes(self, sample_rate: int, audio: NDArray[np.int16]) -> bytes:
"""
Convert numpy int16 audio array to a WAV bytes buffer.
"""
with io.BytesIO() as buf:
with wave.open(buf, "wb") as wf:
wf.setnchannels(1) # mono audio
wf.setsampwidth(2) # 2 bytes for int16
wf.setframerate(sample_rate)
wf.writeframes(audio.tobytes())
return buf.getvalue()
def transcribe_from_tuple(
self, audio_input: tuple[int, NDArray[np.int16]], language: str | None = None
) -> str:
"""
Transcribe audio from a tuple (sample_rate, audio_array).
:param audio_input: Tuple of (sample_rate, np.int16 numpy array).
:param language: Optional language code.
:return: Transcription string.
"""
sample_rate, audio_array = audio_input
wav_bytes = self._numpy_to_wav_bytes(sample_rate, audio_array)
# Prepare the file-like object to pass to openai.Audio.transcribe
audio_file = io.BytesIO(wav_bytes)
audio_file.name = "audio.wav" # some APIs require the filename attribute
try:
if language:
response = self.openai_client.audio.transcriptions.create(
model=self.model,
file=audio_file,
language=language,
)
else:
response = self.openai_client.audio.transcriptions.create(
model=self.model,
file=audio_file,
)
return response.text
except Exception as e:
raise RuntimeError(f"Transcription failed: {str(e)}")
# Module-level instance, created at import time with the constructor defaults.
model = OpenAISTT()
# Expose the bound method under the name `speech_to_text`: this module needs
# the same interface as the other STT implementations (like zero_gpu_stt.py).
speech_to_text = model.transcribe_from_tuple
|