vibe-shopping/mcp_host/stt/openai_stt.py
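"""OpenAI-compatible speech-to-text backend for mcp_host/stt."""
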
import io
import os
import wave

import numpy as np
from numpy.typing import NDArray
from openai import OpenAI

__all__ = ["speech_to_text"]
class OpenAISTT:
    """Speech-to-text via an OpenAI-compatible transcription endpoint."""

    def __init__(
        self,
        api_key: str = os.getenv("STT_OPENAI_API_KEY", ""),
        api_base: str = os.getenv("STT_OPENAI_API_BASE_URL", "https://api.sambanova.ai/v1"),
        model: str = "Whisper-Large-v3",
    ):
        # The env-var defaults above are evaluated once, when this module is imported.
        self.openai_client = OpenAI(
            base_url=api_base,
            api_key=api_key,
        )
        self.model = model
    def _numpy_to_wav_bytes(self, sample_rate: int, audio: NDArray[np.int16]) -> bytes:
        """
        Convert a numpy int16 audio array to a WAV bytes buffer.
        """
        with io.BytesIO() as buf:
            with wave.open(buf, "wb") as wf:
                wf.setnchannels(1)  # mono audio
                wf.setsampwidth(2)  # 2 bytes per sample for int16
                wf.setframerate(sample_rate)
                wf.writeframes(audio.tobytes())
            return buf.getvalue()
    def transcribe_from_tuple(
        self, audio_input: tuple[int, NDArray[np.int16]], language: str | None = None
    ) -> str:
        """
        Transcribe audio from a tuple (sample_rate, audio_array).

        :param audio_input: Tuple of (sample_rate, np.int16 numpy array).
        :param language: Optional ISO-639-1 language code (e.g. "en").
        :return: The transcribed text.
        """
        sample_rate, audio_array = audio_input
        wav_bytes = self._numpy_to_wav_bytes(sample_rate, audio_array)

        # Wrap the WAV bytes in a file-like object for the transcriptions API.
        audio_file = io.BytesIO(wav_bytes)
        audio_file.name = "audio.wav"  # the API uses the filename to infer the format

        try:
            # Pass `language` only when set, so the server default applies otherwise.
            kwargs = {"model": self.model, "file": audio_file}
            if language:
                kwargs["language"] = language
            response = self.openai_client.audio.transcriptions.create(**kwargs)
            return response.text
        except Exception as e:
            raise RuntimeError(f"Transcription failed: {e}") from e

model = OpenAISTT()

# Must expose the same interface as the other STT backends (e.g. zero_gpu_stt.py).
speech_to_text = model.transcribe_from_tuple
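

if __name__ == "__main__":
    # Usage sketch, not a real test: it assumes STT_OPENAI_API_KEY is set and
    # that the configured endpoint accepts WAV uploads. A pure 440 Hz tone
    # stands in for speech here, so the returned text is unlikely to be
    # meaningful; swap in a real recording to check transcription quality.
    rate = 16_000
    t = np.linspace(0, 1, rate, endpoint=False)
    tone = (0.5 * np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)
    print(speech_to_text((rate, tone), language="en"))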