vibe-shopping/mcp_host/stt/openai_stt.py
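"""OpenAI-compatible speech-to-text backend for mcp_host/stt."""
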
import io
import os
import wave

import numpy as np
from numpy.typing import NDArray
from openai import OpenAI

__all__ = ["speech_to_text"]
class OpenAISTT:
    """Speech-to-text via an OpenAI-compatible transcription endpoint."""

    def __init__(
        self,
        api_key: str = os.getenv("STT_OPENAI_API_KEY", ""),
        api_base: str = os.getenv("STT_OPENAI_API_BASE_URL", "https://api.sambanova.ai/v1"),
        model: str = "Whisper-Large-v3",
    ):
        # The env-var defaults above are evaluated once, when this module is imported.
        self.openai_client = OpenAI(
            base_url=api_base,
            api_key=api_key,
        )
        self.model = model
    def _numpy_to_wav_bytes(self, sample_rate: int, audio: NDArray[np.int16]) -> bytes:
        """
        Convert a numpy int16 audio array to a WAV bytes buffer.
        """
        with io.BytesIO() as buf:
            with wave.open(buf, "wb") as wf:
                wf.setnchannels(1)  # mono audio
                wf.setsampwidth(2)  # 2 bytes per sample for int16
                wf.setframerate(sample_rate)
                wf.writeframes(audio.tobytes())
            return buf.getvalue()
    def transcribe_from_tuple(
        self, audio_input: tuple[int, NDArray[np.int16]], language: str | None = None
    ) -> str:
        """
        Transcribe audio from a tuple (sample_rate, audio_array).

        :param audio_input: Tuple of (sample_rate, np.int16 numpy array).
        :param language: Optional ISO-639-1 language code (e.g. "en").
        :return: The transcribed text.
        """
        sample_rate, audio_array = audio_input
        wav_bytes = self._numpy_to_wav_bytes(sample_rate, audio_array)

        # Wrap the WAV bytes in a file-like object for the transcriptions API.
        audio_file = io.BytesIO(wav_bytes)
        audio_file.name = "audio.wav"  # the API uses the filename to infer the format

        try:
            # Pass `language` only when set, so the server default applies otherwise.
            kwargs = {"model": self.model, "file": audio_file}
            if language:
                kwargs["language"] = language
            response = self.openai_client.audio.transcriptions.create(**kwargs)
            return response.text
        except Exception as e:
            raise RuntimeError(f"Transcription failed: {e}") from e

model = OpenAISTT()

# Must expose the same interface as the other STT backends (e.g. zero_gpu_stt.py).
speech_to_text = model.transcribe_from_tuple
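

if __name__ == "__main__":
    # Usage sketch, not a real test: it assumes STT_OPENAI_API_KEY is set and
    # that the configured endpoint accepts WAV uploads. A pure 440 Hz tone
    # stands in for speech here, so the returned text is unlikely to be
    # meaningful; swap in a real recording to check transcription quality.
    rate = 16_000
    t = np.linspace(0, 1, rate, endpoint=False)
    tone = (0.5 * np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)
    print(speech_to_text((rate, tone), language="en"))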