Spaces:
Running
Running
| # SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | |
| # SPDX-License-Identifier: BSD 2-Clause License | |
| """ACE Websocket Serializer Implementation. | |
| This module defines the `ACEWebSocketSerializer` class, which is responsible for | |
| serializing and deserializing frames for WebSocket communication in a speech-based | |
| user interface. The serializer supports various frame types related to audio, text-to-speech (TTS), | |
| and automatic speech recognition (ASR). | |
| The serializer handles the following frame types: | |
| - AudioRawFrame: Raw audio data | |
| - BotUpdatedSpeakingTranscriptFrame: Updates during bot speech | |
| - BotStoppedSpeakingFrame: End of bot speech | |
| - UserUpdatedSpeakingTranscriptFrame: Updates during user speech | |
| - UserStoppedSpeakingTranscriptFrame: End of user speech | |
| - InputAudioRawFrame: Raw input audio data | |
| The serialization format is either binary (for audio data) or JSON (for transcript updates). | |
| """ | |
| import io | |
| import json | |
| import wave | |
| from pipecat.frames.frames import ( | |
| AudioRawFrame, | |
| BotStoppedSpeakingFrame, | |
| Frame, | |
| InputAudioRawFrame, | |
| ) | |
| from pipecat.serializers.base_serializer import ( | |
| FrameSerializer, | |
| FrameSerializerType, | |
| ) | |
| from nvidia_pipecat.frames.transcripts import ( | |
| BotUpdatedSpeakingTranscriptFrame, | |
| UserStoppedSpeakingTranscriptFrame, | |
| UserUpdatedSpeakingTranscriptFrame, | |
| ) | |
class ACEWebSocketSerializer(FrameSerializer):
    """Serializes frames for WebSocket communication in speech interface.

    This class provides methods to serialize and deserialize frames for communication
    between the server and a speech-based UI. It supports both binary audio data
    and JSON-formatted transcript updates.

    Note that the two directions are not symmetric: ``serialize`` emits the
    ``audio`` payload of an ``AudioRawFrame`` as-is (no container header), while
    ``deserialize`` expects incoming binary messages to be complete WAV files.

    Input Frames:
        AudioRawFrame: Raw audio data
        BotUpdatedSpeakingTranscriptFrame: TTS update
        BotStoppedSpeakingFrame: TTS end
        UserUpdatedSpeakingTranscriptFrame: ASR update
        UserStoppedSpeakingTranscriptFrame: ASR end
        InputAudioRawFrame: Raw input audio
    """

    # NOTE(review): `type` is defined as a plain method here, although earlier
    # documentation of this class listed it under "Attributes". If the
    # FrameSerializer base class declares `type` as a property, this override
    # likely needs `@property` as well -- confirm against the pipecat version in use.
    def type(self) -> FrameSerializerType:
        """Return the type of FrameSerializer.

        Returns:
            FrameSerializerType: Always returns BINARY type for this serializer.
        """
        return FrameSerializerType.BINARY

    async def serialize(self, frame: Frame) -> str | bytes | None:
        """Serializes a frame to JSON string or bytes.

        Args:
            frame (Frame): The frame to serialize. Can be one of:
                - AudioRawFrame: Returns raw audio bytes
                - BotUpdatedSpeakingTranscriptFrame: Returns JSON with TTS update
                - BotStoppedSpeakingFrame: Returns JSON with TTS end
                - UserUpdatedSpeakingTranscriptFrame: Returns JSON with ASR update
                - UserStoppedSpeakingTranscriptFrame: Returns JSON with ASR end

        Returns:
            str | bytes | None: Serialized data:
                - bytes for audio frames
                - JSON string for transcript updates
                - None for unsupported frames
        """
        # Audio is forwarded untouched; no JSON envelope and no WAV header.
        if isinstance(frame, AudioRawFrame):
            return frame.audio

        # The checks below are deliberately independent `if`s: should a frame
        # ever match more than one of these types, the last matching branch
        # determines the message that is sent.
        message = None
        if isinstance(frame, BotUpdatedSpeakingTranscriptFrame):
            message = {"type": "tts_update", "tts": frame.transcript}
        if isinstance(frame, BotStoppedSpeakingFrame):
            message = {"type": "tts_end"}
        if isinstance(frame, UserUpdatedSpeakingTranscriptFrame):
            message = {"type": "asr_update", "asr": frame.transcript}
        if isinstance(frame, UserStoppedSpeakingTranscriptFrame):
            message = {"type": "asr_end", "asr": frame.transcript}

        if message is None:
            return None
        return json.dumps(message)

    async def deserialize(self, data: str | bytes) -> Frame | None:
        """Deserialize bytes into a Frame object.

        Args:
            data (str | bytes): The data to deserialize. Expected to be
                WAV-formatted audio data.

        Returns:
            Frame | None: The deserialized frame as an InputAudioRawFrame for audio data,
                or None for unsupported data. ``None`` is returned for non-bytes
                input and for binary payloads that are empty, truncated or not
                valid WAV data.
        """
        if not isinstance(data, bytes):
            return None

        try:
            with io.BytesIO(data) as buffer, wave.open(buffer, "rb") as wav_file:
                pcm = wav_file.readframes(wav_file.getnframes())
                sample_rate = wav_file.getframerate()
                num_channels = wav_file.getnchannels()
        except (wave.Error, EOFError) as exc:
            # The payload comes from the remote peer. A malformed message must
            # not propagate an exception into the caller's receive path, so it
            # is dropped here and reported instead.
            import logging

            logging.getLogger(__name__).warning(
                "Dropping binary message that is not valid WAV audio: %s", exc
            )
            return None

        # NOTE(review): the sample width of the WAV file is not checked here;
        # confirm that clients always send the width InputAudioRawFrame expects.
        return InputAudioRawFrame(pcm, sample_rate, num_channels)