Spaces:

fastrtc
/

gemini-audio-video

Running on CPU Upgrade

App Files Files Community

freddyaboulton HF Staff committed on Apr 22

Commit

2ec08a6

verified ·

1 Parent(s): 3c84e3a

Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

README.md +1 -1
app.py +24 -17
requirements.txt +0 -1

README.md CHANGED Viewed

@@ -9,7 +9,7 @@ app_file: app.py
 pinned: false
 license: mit
 short_description: Gemini understands audio and video!
-tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|GEMINI_API_KEY]
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 pinned: false
 license: mit
 short_description: Gemini understands audio and video!
+tags: [webrtc, websocket, gradio, secret|HF_TOKEN, secret|GEMINI_API_KEY]
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -5,16 +5,18 @@ import time
 from io import BytesIO
 import gradio as gr
-from gradio.utils import get_space
 import numpy as np
-from google import genai
 from dotenv import load_dotenv
 from fastrtc import (
     AsyncAudioVideoStreamHandler,
     Stream,
-    get_twilio_turn_credentials,
     WebRTC,
 )
 from PIL import Image
 load_dotenv()
@@ -44,12 +46,10 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
         super().__init__(
             "mono",
             output_sample_rate=24000,
-            output_frame_size=480,
             input_sample_rate=16000,
         )
         self.audio_queue = asyncio.Queue()
         self.video_queue = asyncio.Queue()
-        self.quit = asyncio.Event()
         self.session = None
         self.last_frame_time = 0
         self.quit = asyncio.Event()
@@ -69,10 +69,14 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
             print("set session")
             while not self.quit.is_set():
                 turn = self.session.receive()
-                async for response in turn:
-                    if data := response.data:
-                        audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
                         self.audio_queue.put_nowait(audio)
     async def video_receive(self, frame: np.ndarray):
         if self.session:
@@ -87,7 +91,11 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
         self.video_queue.put_nowait(frame)
     async def video_emit(self):
-        return await self.video_queue.get()
     async def receive(self, frame: tuple[int, np.ndarray]) -> None:
         _, array = frame
@@ -97,13 +105,14 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
             await self.session.send(input=audio_message)
     async def emit(self):
-        array = await self.audio_queue.get()
-        return (self.output_sample_rate, array)
     async def shutdown(self) -> None:
         if self.session:
             self.quit.set()
-            await self.session._websocket.close()
             self.quit.clear()
@@ -111,9 +120,7 @@ stream = Stream(
     handler=GeminiHandler(),
     modality="audio-video",
     mode="send-receive",
-    rtc_configuration=get_twilio_turn_credentials()
-    if get_space()
-    else None,
     time_limit=90 if get_space() else None,
     additional_inputs=[
         gr.Image(label="Image", type="numpy", sources=["upload", "clipboard"])
@@ -153,8 +160,8 @@ with gr.Blocks(css=css) as demo:
                 modality="audio-video",
                 mode="send-receive",
                 elem_id="video-source",
-                rtc_configuration=get_twilio_turn_credentials()
-                if get_space()
                 else None,
                 icon="https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
                 pulse_color="rgb(255, 255, 255)",

 from io import BytesIO
 import gradio as gr
 import numpy as np
+import websockets
 from dotenv import load_dotenv
 from fastrtc import (
     AsyncAudioVideoStreamHandler,
     Stream,
     WebRTC,
+    get_turn_credentials_async,
+    wait_for_item,
 )
+from google import genai
+from gradio.utils import get_space
 from PIL import Image
 load_dotenv()
         super().__init__(
             "mono",
             output_sample_rate=24000,
             input_sample_rate=16000,
         )
         self.audio_queue = asyncio.Queue()
         self.video_queue = asyncio.Queue()
         self.session = None
         self.last_frame_time = 0
         self.quit = asyncio.Event()
             print("set session")
             while not self.quit.is_set():
                 turn = self.session.receive()
+                try:
+                    async for response in turn:
+                        if data := response.data:
+                            audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
                         self.audio_queue.put_nowait(audio)
+                except websockets.exceptions.ConnectionClosedOK:
+                    print("connection closed")
+                    break
     async def video_receive(self, frame: np.ndarray):
         if self.session:
         self.video_queue.put_nowait(frame)
     async def video_emit(self):
+        frame = await wait_for_item(self.video_queue)
+        if frame is not None:
+            return frame
+        else:
+            return np.zeros((100, 100, 3), dtype=np.uint8)
     async def receive(self, frame: tuple[int, np.ndarray]) -> None:
         _, array = frame
             await self.session.send(input=audio_message)
     async def emit(self):
+        array = await wait_for_item(self.audio_queue)
+        if array is not None:
+            return (self.output_sample_rate, array)
     async def shutdown(self) -> None:
         if self.session:
             self.quit.set()
+            await self.session.close()
             self.quit.clear()
     handler=GeminiHandler(),
     modality="audio-video",
     mode="send-receive",
+    rtc_configuration=get_turn_credentials_async if get_space() == "spaces" else None,
     time_limit=90 if get_space() else None,
     additional_inputs=[
         gr.Image(label="Image", type="numpy", sources=["upload", "clipboard"])
                 modality="audio-video",
                 mode="send-receive",
                 elem_id="video-source",
+                rtc_configuration=get_turn_credentials_async
+                if get_space() == "spaces"
                 else None,
                 icon="https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
                 pulse_color="rgb(255, 255, 255)",

requirements.txt CHANGED Viewed

@@ -2,4 +2,3 @@ fastrtc
 python-dotenv
 google-genai
 twilio
-pydantic==2.10.0

 python-dotenv
 google-genai
 twilio