sitatech committed
Commit 646ceb9
Parent: 03ba0a4

Update local tool definitions to match the MCP format & improve the system prompt

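The main change is that the single locally defined "display" tool is replaced by three tool definitions whose schemas follow what MCP servers advertise (a name, a description, and a JSON input schema), and which are routed by a "Display." name prefix. A minimal sketch of that mapping, assuming the `mcp` Python SDK's `Tool` type; the helper name and the prefix argument are illustrative, not code from this repo:

from mcp.types import Tool  # MCP tool descriptor: name, description, inputSchema
from openai.types.chat import ChatCompletionToolParam


def mcp_tool_to_openai(tool: Tool, prefix: str = "") -> ChatCompletionToolParam:
    """Map an MCP tool descriptor to an OpenAI-style tool definition.

    The inputSchema is already a JSON schema, so it slots straight into
    `function.parameters`; a prefix such as "Display." keeps local tools
    distinguishable from tools served by remote MCP servers.
    """
    return {
        "type": "function",
        "function": {
            "name": f"{prefix}{tool.name}",
            "description": tool.description or "",
            "parameters": tool.inputSchema,
        },
    }

Prefixing the local tool names is what lets the agent loop in mcp_host/agent.py handle "Display.*" calls in the UI instead of forwarding them to a remote MCP client.
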
.gitattributes ADDED
@@ -0,0 +1 @@
+*.webp filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -1,4 +1,5 @@
 __pycache__
 .DS_Store
 .env
-.gradio
+.gradio
+*.webp
app.py CHANGED
@@ -78,32 +78,36 @@ async def handle_audio_stream(
     image_with_mask: dict | None = None,
     gradio_client: Client | None = None,
 ):
-    image, mask = handle_image_upload(image_with_mask)
-
-    def update_ui(products, image, clear_ui):
-        nonlocal displayed_products, displayed_image
-        if clear_ui:
-            displayed_products = None
-            displayed_image = None
-        else:
-            displayed_products = products
-            displayed_image = image
-
-    async for ai_speech in vibe_shopping_agent.chat(
-        user_speech=audio,
-        chat_history=chat_history,
-        voice=voice,
-        update_ui=update_ui,
-        input_image=image,
-        input_mask=mask,
-        gradio_client=gradio_client,
-    ):
-        # Yield the audio chunk to the WebRTC stream
-        yield ai_speech
-
-    yield AdditionalOutputs(
-        chat_history, displayed_products, displayed_image, None
-    )  # None for resetting the input_image state
+    try:
+        image, mask = handle_image_upload(image_with_mask)
+
+        def update_ui(products, image, clear_ui):
+            nonlocal displayed_products, displayed_image
+            if clear_ui:
+                displayed_products = None
+                displayed_image = None
+            else:
+                displayed_products = products
+                displayed_image = image
+
+        async for ai_speech in vibe_shopping_agent.chat(
+            user_speech=audio,
+            chat_history=chat_history,
+            voice=voice,
+            update_ui=update_ui,
+            input_image=image,
+            input_mask=mask,
+            gradio_client=gradio_client,
+        ):
+            # Yield the audio chunk to the WebRTC stream
+            yield ai_speech
+
+        yield AdditionalOutputs(
+            chat_history, displayed_products, displayed_image, None
+        )  # None for resetting the input_image state
+    except Exception as e:
+        print(f"Error in handle_audio_stream: {e}")
+        raise gr.Error(f"An error occurred: {e}")


 async def set_client_for_session(request: gr.Request):
@@ -115,6 +119,7 @@ async def set_client_for_session(request: gr.Request):
         raise gr.Error(
             f"Inference server is not available. Status code: {health_check_response.status}"
         )
+
     if not vibe_shopping_agent.clients_connected:
         await vibe_shopping_agent.connect_clients()

@@ -128,17 +133,17 @@ async def set_client_for_session(request: gr.Request):

     x_ip_token = request.headers["x-ip-token"]

-    return Client("sitatech/Kokoro-TTS", headers={"X-IP-Token": x_ip_token}), Modal(visible=False)
+    return Client("sitatech/Kokoro-TTS", headers={"X-IP-Token": x_ip_token}), Modal(
+        visible=False
+    )


-with gr.Blocks(theme=gr.themes.Ocean()) as vibe_shopping_app:
+with gr.Blocks(
+    theme=gr.themes.Ocean(),
+    css="#main-container { max-width: 1200px; margin: 0 auto; }",
+) as vibe_shopping_app:
     gradio_client = gr.State()

-    with Modal(visible=True) as modal:
-        ColdBootUI()
-
-    vibe_shopping_app.load(set_client_for_session, None, [gradio_client, modal])
-
     debuging_options = {
         "Echo user speech": "debug_echo_user_speech",
         "USE HF ZeroGPU STT": "debug_use_hf_zero_gpu_stt",
@@ -147,7 +152,7 @@ with gr.Blocks(theme=gr.themes.Ocean()) as vibe_shopping_app:
     chat_history = gr.State(value=[])
     displayed_products = gr.State(value=[])
     displayed_image = gr.State(value=None)
-    with gr.Column():
+    with gr.Column(elem_id="main-container"):
         voice = gr.Dropdown(
             label="Language & Voice",
             choices=list(VOICES.items()) + list(debuging_options.items()),
@@ -164,14 +169,14 @@ with gr.Blocks(theme=gr.themes.Ocean()) as vibe_shopping_app:
             mode="send-receive",
             modality="audio",
             button_labels={"start": "Start Vibe Shopping"},
-            rtc_configuration=get_cloudflare_turn_credentials_async
-            if not IS_LOCAL
-            else None,
-            server_rtc_configuration=get_cloudflare_turn_credentials(ttl=360_000)
-            if not IS_LOCAL
-            else None,
+            rtc_configuration=(
+                get_cloudflare_turn_credentials_async if not IS_LOCAL else None
+            ),
+            server_rtc_configuration=(
+                get_cloudflare_turn_credentials(ttl=360_000) if not IS_LOCAL else None
+            ),
             scale=0,
-            time_limit=500,
+            time_limit=3600,
         )
         with gr.Accordion(open=False, label="Input Image"):
             gr.Markdown(
@@ -203,4 +208,8 @@ with gr.Blocks(theme=gr.themes.Ocean()) as vibe_shopping_app:
         show_progress="hidden",
     )

+    with Modal(visible=True) as modal:
+        ColdBootUI()
+
+    vibe_shopping_app.load(set_client_for_session, None, [gradio_client, modal])
 vibe_shopping_app.launch()
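
In app.py the cold-boot modal and the load handler move to the end of the Blocks, and the handler hides the modal by returning `Modal(visible=False)` as its second output. A stripped-down sketch of that pattern, assuming `Modal` comes from the `gradio_modal` package, with a plain Markdown placeholder standing in for the app's `ColdBootUI()`:

import gradio as gr
from gradio_modal import Modal  # assumed source of the Modal component


async def set_client_for_session(request: gr.Request):
    # ... warm up backends / create per-session clients here ...
    client = object()  # placeholder for the real per-session Gradio client
    # Returning Modal(visible=False) closes the cold-boot overlay once setup is done.
    return client, Modal(visible=False)


with gr.Blocks() as demo:
    gradio_client = gr.State()

    # Overlay shown while the Space is still booting.
    with Modal(visible=True) as modal:
        gr.Markdown("Warming up the inference server, one moment...")

    # On page load, run the setup coroutine and update both outputs.
    demo.load(set_client_for_session, None, [gradio_client, modal])

demo.launch()
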
mcp_host/agent.py CHANGED
@@ -48,12 +48,12 @@ Then, you can say what you think about the displayed item(s), tell how they fit
 Always ask the user for confirmation before taking any action that requires payment or purchase.
 If a function requires an input that you don't have based on your knowledge and the conversation history, you should ask the user for it. For example, if the user asks to try a product, but you don't have the target image, you should ask the user to provide it.

-When calling a function, ALWAYS start with a short notification message to the user before calling it.
-Here is an example you most follow: "One moment, I will search for products matching your request \n<tool_call>\n<call-function-to-search-products>\n</tool_call>".
-Then when you get the response from the function, you say "Here are some products I found for you \n<tool_call>\n<call-function-to-display-products>\n</tool_call>".
+When calling a function, ALWAYS let the user know what you are doing while they are waiting.
+Something like: One moment, I will search for products matching your request \n<tool_call>\n<call-function-to-search-products>\n</tool_call>. \
+Then when you get the response from the function, you can say Here are some products I found for you \n<tool_call>\n<call-function-to-display-products>\n</tool_call>.

 The maximum number of products you can search at once is 10, don't exceed this limit.
-Make sure to only output raw text. NEVER output formatted text, markdown or emoji.
+Make sure to only output raw text. NEVER output markdown or emoji.
 """

     def __init__(
@@ -84,7 +84,7 @@ Make sure to only output raw text. NEVER output formatted text, markdown or emoj
             self.fewsats_client,
             self.virtual_try_client,
         ]
-        self.display_tool = _build_display_tool_definition()
+        self.display_tool = _build_display_tool_definitions()
         self.image_uploader = image_uploader
         self.clients_connected = False

@@ -98,10 +98,10 @@ Make sure to only output raw text. NEVER output formatted text, markdown or emoj
         await self.virtual_try_client.connect_to_server("python", ["./mcp_server.py"])

         self.tools = (
-            await self.agora_client.tools
+            self.display_tool
+            + await self.agora_client.tools
             + await self.fewsats_client.tools
             + await self.virtual_try_client.tools
-            + [self.display_tool]
         )
         self.clients_connected = True

@@ -224,6 +224,7 @@ Make sure to only output raw text. NEVER output formatted text, markdown or emoj
             messages=chat_history,
             stream=True,
             tools=self.tools,
+            temperature=0.7,
         )
         pending_tool_calls: dict[int, ChoiceDeltaToolCall] = {}

@@ -282,48 +283,52 @@ Make sure to only output raw text. NEVER output formatted text, markdown or emoj
                     }
                 )

-                mcp_client = self._get_mcp_client_for_tool(tool_name)
-                if mcp_client is None:
-                    print(f"Tool {tool_name} not found in any MCP client.")
-                    tool_responses.append(
-                        {
+                try:
+                    print(f"Calling tool {tool_name} with args: {tool_args}")
+                    if tool_name.startswith("Display."):
+                        args = json.loads(tool_args) if tool_args else {}
+                        update_ui(
+                            args.get("products"),
+                            args.get("image_url"),
+                            tool_name == "Display.clear_display",
+                        )
+                        tool_response: ChatCompletionToolMessageParam = {
                             "role": "tool",
                             "tool_call_id": call_id,
-                            "content": f"Unable to find tool '{tool_name}'.",
+                            "content": (
+                                "Content displayed successfully."
+                                if tool_name != "clear_display"
+                                else "Display cleared."
+                            ),
                         }
-                    )
-                else:
-                    try:
-                        print(f"Calling tool {tool_name} with args: {tool_args}")
-                        if tool_name == "display":
-                            args = json.loads(tool_args) if tool_args else {}
-                            update_ui(
-                                args.get("products"),
-                                args.get("image_url"),
-                                args.get("clear_ui"),
+                    else:
+                        mcp_client = self._get_mcp_client_for_tool(tool_name)
+                        if mcp_client is None:
+                            print(f"Tool {tool_name} not found in any MCP client.")
+                            tool_responses.append(
+                                {
+                                    "role": "tool",
+                                    "tool_call_id": call_id,
+                                    "content": f"Unable to find tool '{tool_name}'.",
+                                }
                             )
-                            tool_response: ChatCompletionToolMessageParam = {
-                                "role": "tool",
-                                "tool_call_id": call_id,
-                                "content": "Content displayed successfully.",
-                            }
                         else:
                             tool_response = await mcp_client.call_tool(
                                 call_id=call_id,
                                 tool_name=tool_name,
                                 tool_args=json.loads(tool_args) if tool_args else None,
                             )
-                        print("Tool responded")
-                        tool_responses.append(tool_response)
-                    except Exception as e:
-                        print(f"Error calling tool {tool_name}: {e}")
-                        tool_responses.append(
-                            {
-                                "role": "tool",
-                                "tool_call_id": call_id,
-                                "content": f"Error calling tool '{tool_name}', Error: {str(e)[:500]}",
-                            }
-                        )
+                    print("Tool responded")
+                    tool_responses.append(tool_response)
+                except Exception as e:
+                    print(f"Error calling tool {tool_name}: {e}")
+                    tool_responses.append(
+                        {
+                            "role": "tool",
+                            "tool_call_id": call_id,
+                            "content": f"Error calling tool '{tool_name}', Error: {str(e)[:500]}",
+                        }
+                    )

     def _build_input_image_content(
         self, input_image: Image.Image, image_label: str
@@ -348,43 +353,76 @@ Make sure to only output raw text. NEVER output formatted text, markdown or emoj
     )


-def _build_display_tool_definition() -> ChatCompletionToolParam:
-    return {
-        "type": "function",
-        "function": {
-            "name": "display",
-            "description": """This tool Shows/Displays content to the user.
-You can use this tool whenever you want to show responses you get from other tools or when the user requests to see something that you have access to, like a list of products, specific product(s) from the conversation history, an image, or cart items.
-
-You can only pass one argument at a time, either products or image_url, or clear_ui.
+def _build_display_tool_definitions() -> list[ChatCompletionToolParam]:
+    return [
+        {
+            "type": "function",
+            "function": {
+                "name": "Display.display_products",
+                "description": """
+Display a list of products. Use this to show search results, cart items, or products from conversation history.
+
+Args:
+    products: A list of products to display. Each product should have a name, image URL, and formatted price.
+    example:
+        products: [
+            {
+                "name": "Stylish Green Shirt",
+                "image_url": "https://example.com/images/green-shirt.jpg",
+                "price": "$29.99"
+            },
+            {
+                "name": "Comfortable Jeans",
+                "image_url": "https://example.com/images/jeans.jpg",
+                "price": "$49.99"
+            }
+        ]
 """,
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "products": {
-                        "type": "array",
-                        "items": {
-                            "type": "object",
-                            "properties": {
-                                "name": {"type": "string"},
-                                "image_url": {"type": "string"},
-                                "price": {"type": "string"},
+                "parameters": {
+                    "properties": {
+                        "products": {
+                            "items": {
+                                "additionalProperties": {"type": "string"},
+                                "type": "object",
                             },
-                            "required": ["name", "image_url", "price"],
-                        },
-                        "description": "A list of products to display from search results, cart items, or conversation history.",
-                    },
-                    "image_url": {
-                        "type": "string",
-                        "description": "An optional URL of an image to display.",
+                            "title": "Products",
+                            "type": "array",
+                        }
                     },
-                    "clear_ui": {
-                        "type": "boolean",
-                        "description": (
-                            "If true, clear the UI instead of displaying anything."
-                        ),
+                    "required": ["products"],
+                    "title": "display_productsArguments",
+                    "type": "object",
+                },
+            },
+        },
+        {
+            "type": "function",
+            "function": {
+                "name": "Display.display_image",
+                "description": "Display a single standalone image. Use this for virtual try-on results, a specific product image requested by the user, or any other relevant single image.\n\nArgs:\n image_url: The URL of the image to display.",
+                "parameters": {
+                    "properties": {
+                        "image_url": {
+                            "title": "Image URL",
+                            "type": "string",
+                        },
                     },
+                    "required": ["image_url"],
+                    "title": "display_imageArguments",
+                    "type": "object",
+                },
+            },
+        },
+        {
+            "type": "function",
+            "function": {
+                "name": "Display.clear_display",
+                "description": "Clear any content currently displayed in the user interface. Removes everything from the visual display area.\n\nArgs: None",
+                "parameters": {
+                    "properties": {},
+                    "title": "clear_displayArguments",
+                    "type": "object",
                 },
             },
         },
-    }
+    ]
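
In the agent loop, `pending_tool_calls: dict[int, ChoiceDeltaToolCall]` collects the tool-call fragments that arrive over the streamed completion before any tool is invoked; only then does the "Display." prefix decide whether a call is handled locally via `update_ui` or forwarded to an MCP client. A rough sketch of that accumulation step written against the OpenAI streaming delta types; the helper function is illustrative, not the repo's exact code:

from openai.types.chat import ChatCompletionChunk
from openai.types.chat.chat_completion_chunk import ChoiceDeltaToolCall


def merge_tool_call_deltas(
    chunk: ChatCompletionChunk,
    pending_tool_calls: dict[int, ChoiceDeltaToolCall],
) -> None:
    """Accumulate streamed tool-call fragments keyed by their index.

    Each chunk may carry only a slice of the JSON arguments, so fragments
    are concatenated until the stream signals the call is complete.
    """
    delta = chunk.choices[0].delta
    for tool_call in delta.tool_calls or []:
        if tool_call.index not in pending_tool_calls:
            pending_tool_calls[tool_call.index] = tool_call
        else:
            pending = pending_tool_calls[tool_call.index]
            if tool_call.function and tool_call.function.arguments:
                pending.function.arguments = (
                    pending.function.arguments or ""
                ) + tool_call.function.arguments

Once the stream finishes, each accumulated call carries the full JSON `arguments` string, which is what the loop above parses with `json.loads` before dispatching.
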
mcp_host/tts/gradio_api_tts.py CHANGED
@@ -37,7 +37,7 @@ async def stream_text_to_speech(
     standard_lang_code = KOKORO_TO_STD_LANG[kokoro_lang]

     for text in generate_sentences(text_stream, language=standard_lang_code):
-        print(f"Streaming audio for text: {text[:10]}...")
+        print(f"Streaming audio for text: {text}")
         audio = client.submit(
             text=text, voice=voice, speed=1, use_gpu=True, api_name="/stream"
         )
mcp_host/tts/hf_zero_gpu_tts.py CHANGED
@@ -58,6 +58,7 @@ async def stream_text_to_speech(
     standard_lang_code = KOKORO_TO_STD_LANG[kokoro_lang]

     for text in generate_sentences(text_stream, language=standard_lang_code):
+        print(f"Streaming audio for text: {text}")
         for audio in text_to_speech(text, pipe_key=kokoro_lang, voice=voice):
             yield 24000, audio

mcp_host/ui.py CHANGED
@@ -13,7 +13,7 @@ def UI(products_state: gr.State, image_state: gr.State):
             justify-content: center;
             height: 600px;
             width: 100%;
-            background: linear-gradient(rgba(0,0,0,0.3), rgba(0,0,0,0.3)), url('{get_hf_space_file_url_prefix()}static/welcome-to-vibe-shopping.webp');
+            background: linear-gradient(rgba(0,0,0,0.3), rgba(0,0,0,0.3)), url('{get_hf_space_file_url_prefix()}static/welcome-to-vibe-shopping-upscaled.webp');
             background-size: cover;
             background-position: center;
             background-repeat: no-repeat;
static/welcome-to-vibe-shopping.webp CHANGED

Git LFS Details

  • SHA256: 457f8f04916ead700e1d8f6168d043be09a049908a3d7c9f5225aba480c25442
  • Pointer size: 130 Bytes
  • Size of remote file: 51.5 kB