sitatech committed
Commit 646ceb9
Parent: 03ba0a4

Update local tool definitions to match the MCP format & improve the system prompt

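The main change is that the single locally defined "display" tool is replaced by three tool definitions whose schemas follow what MCP servers advertise (a name, a description, and a JSON input schema), and which are routed by a "Display." name prefix. A minimal sketch of that mapping, assuming the `mcp` Python SDK's `Tool` type; the helper name and the prefix argument are illustrative, not code from this repo:

from mcp.types import Tool  # MCP tool descriptor: name, description, inputSchema
from openai.types.chat import ChatCompletionToolParam


def mcp_tool_to_openai(tool: Tool, prefix: str = "") -> ChatCompletionToolParam:
    """Map an MCP tool descriptor to an OpenAI-style tool definition.

    The inputSchema is already a JSON schema, so it slots straight into
    `function.parameters`; a prefix such as "Display." keeps local tools
    distinguishable from tools served by remote MCP servers.
    """
    return {
        "type": "function",
        "function": {
            "name": f"{prefix}{tool.name}",
            "description": tool.description or "",
            "parameters": tool.inputSchema,
        },
    }

Prefixing the local tool names is what lets the agent loop in mcp_host/agent.py handle "Display.*" calls in the UI instead of forwarding them to a remote MCP client.
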
.gitattributes ADDED
@@ -0,0 +1 @@
+*.webp filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -1,4 +1,5 @@
 __pycache__
 .DS_Store
 .env
-.gradio
+.gradio
+*.webp
app.py CHANGED
@@ -78,32 +78,36 @@ async def handle_audio_stream(
     image_with_mask: dict | None = None,
     gradio_client: Client | None = None,
 ):
-    image, mask = handle_image_upload(image_with_mask)
-
-    def update_ui(products, image, clear_ui):
-        nonlocal displayed_products, displayed_image
-        if clear_ui:
-            displayed_products = None
-            displayed_image = None
-        else:
-            displayed_products = products
-            displayed_image = image
-
-    async for ai_speech in vibe_shopping_agent.chat(
-        user_speech=audio,
-        chat_history=chat_history,
-        voice=voice,
-        update_ui=update_ui,
-        input_image=image,
-        input_mask=mask,
-        gradio_client=gradio_client,
-    ):
-        # Yield the audio chunk to the WebRTC stream
-        yield ai_speech
-
-    yield AdditionalOutputs(
-        chat_history, displayed_products, displayed_image, None
-    )  # None for resetting the input_image state
+    try:
+        image, mask = handle_image_upload(image_with_mask)
+
+        def update_ui(products, image, clear_ui):
+            nonlocal displayed_products, displayed_image
+            if clear_ui:
+                displayed_products = None
+                displayed_image = None
+            else:
+                displayed_products = products
+                displayed_image = image
+
+        async for ai_speech in vibe_shopping_agent.chat(
+            user_speech=audio,
+            chat_history=chat_history,
+            voice=voice,
+            update_ui=update_ui,
+            input_image=image,
+            input_mask=mask,
+            gradio_client=gradio_client,
+        ):
+            # Yield the audio chunk to the WebRTC stream
+            yield ai_speech
+
+        yield AdditionalOutputs(
+            chat_history, displayed_products, displayed_image, None
+        )  # None for resetting the input_image state
+    except Exception as e:
+        print(f"Error in handle_audio_stream: {e}")
+        raise gr.Error(f"An error occurred: {e}")


 async def set_client_for_session(request: gr.Request):
@@ -115,6 +119,7 @@ async def set_client_for_session(request: gr.Request):
         raise gr.Error(
             f"Inference server is not available. Status code: {health_check_response.status}"
         )
+
     if not vibe_shopping_agent.clients_connected:
         await vibe_shopping_agent.connect_clients()

@@ -128,17 +133,17 @@ async def set_client_for_session(request: gr.Request):

     x_ip_token = request.headers["x-ip-token"]

-    return Client("sitatech/Kokoro-TTS", headers={"X-IP-Token": x_ip_token}), Modal(visible=False)
+    return Client("sitatech/Kokoro-TTS", headers={"X-IP-Token": x_ip_token}), Modal(
+        visible=False
+    )


-with gr.Blocks(theme=gr.themes.Ocean()) as vibe_shopping_app:
+with gr.Blocks(
+    theme=gr.themes.Ocean(),
+    css="#main-container { max-width: 1200px; margin: 0 auto; }",
+) as vibe_shopping_app:
     gradio_client = gr.State()

-    with Modal(visible=True) as modal:
-        ColdBootUI()
-
-    vibe_shopping_app.load(set_client_for_session, None, [gradio_client, modal])
-
     debuging_options = {
         "Echo user speech": "debug_echo_user_speech",
         "USE HF ZeroGPU STT": "debug_use_hf_zero_gpu_stt",
@@ -147,7 +152,7 @@ with gr.Blocks(theme=gr.themes.Ocean()) as vibe_shopping_app:
     chat_history = gr.State(value=[])
     displayed_products = gr.State(value=[])
     displayed_image = gr.State(value=None)
-    with gr.Column():
+    with gr.Column(elem_id="main-container"):
         voice = gr.Dropdown(
             label="Language & Voice",
             choices=list(VOICES.items()) + list(debuging_options.items()),
@@ -164,14 +169,14 @@ with gr.Blocks(theme=gr.themes.Ocean()) as vibe_shopping_app:
             mode="send-receive",
             modality="audio",
             button_labels={"start": "Start Vibe Shopping"},
-            rtc_configuration=get_cloudflare_turn_credentials_async
-            if not IS_LOCAL
-            else None,
-            server_rtc_configuration=get_cloudflare_turn_credentials(ttl=360_000)
-            if not IS_LOCAL
-            else None,
+            rtc_configuration=(
+                get_cloudflare_turn_credentials_async if not IS_LOCAL else None
+            ),
+            server_rtc_configuration=(
+                get_cloudflare_turn_credentials(ttl=360_000) if not IS_LOCAL else None
+            ),
             scale=0,
-            time_limit=500,
+            time_limit=3600,
         )
         with gr.Accordion(open=False, label="Input Image"):
             gr.Markdown(
@@ -203,4 +208,8 @@ with gr.Blocks(theme=gr.themes.Ocean()) as vibe_shopping_app:
         show_progress="hidden",
     )

+    with Modal(visible=True) as modal:
+        ColdBootUI()
+
+    vibe_shopping_app.load(set_client_for_session, None, [gradio_client, modal])
 vibe_shopping_app.launch()
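
In app.py the cold-boot modal and the load handler move to the end of the Blocks, and the handler hides the modal by returning `Modal(visible=False)` as its second output. A stripped-down sketch of that pattern, assuming `Modal` comes from the `gradio_modal` package, with a plain Markdown placeholder standing in for the app's `ColdBootUI()`:

import gradio as gr
from gradio_modal import Modal  # assumed source of the Modal component


async def set_client_for_session(request: gr.Request):
    # ... warm up backends / create per-session clients here ...
    client = object()  # placeholder for the real per-session Gradio client
    # Returning Modal(visible=False) closes the cold-boot overlay once setup is done.
    return client, Modal(visible=False)


with gr.Blocks() as demo:
    gradio_client = gr.State()

    # Overlay shown while the Space is still booting.
    with Modal(visible=True) as modal:
        gr.Markdown("Warming up the inference server, one moment...")

    # On page load, run the setup coroutine and update both outputs.
    demo.load(set_client_for_session, None, [gradio_client, modal])

demo.launch()
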
mcp_host/agent.py CHANGED
@@ -48,12 +48,12 @@ Then, you can say what you think about the displayed item(s), tell how they fit
 Always ask the user for confirmation before taking any action that requires payment or purchase.
 If a function requires an input that you don't have based on your knowledge and the conversation history, you should ask the user for it. For example, if the user asks to try a product, but you don't have the target image, you should ask the user to provide it.

-When calling a function, ALWAYS start with a short notification message to the user before calling it.
-Here is an example you most follow: "One moment, I will search for products matching your request \n<tool_call>\n<call-function-to-search-products>\n</tool_call>".
-Then when you get the response from the function, you say "Here are some products I found for you \n<tool_call>\n<call-function-to-display-products>\n</tool_call>".
+When calling a function, ALWAYS let the user know what you are doing while they are waiting.
+Something like: One moment, I will search for products matching your request \n<tool_call>\n<call-function-to-search-products>\n</tool_call>. \
+Then when you get the response from the function, you can say Here are some products I found for you \n<tool_call>\n<call-function-to-display-products>\n</tool_call>.

 The maximum number of products you can search at once is 10, don't exceed this limit.
-Make sure to only output raw text. NEVER output formatted text, markdown or emoji.
+Make sure to only output raw text. NEVER output markdown or emoji.
 """

     def __init__(
@@ -84,7 +84,7 @@ Make sure to only output raw text. NEVER output formatted text, markdown or emoj
             self.fewsats_client,
             self.virtual_try_client,
         ]
-        self.display_tool = _build_display_tool_definition()
+        self.display_tool = _build_display_tool_definitions()
         self.image_uploader = image_uploader
         self.clients_connected = False

@@ -98,10 +98,10 @@ Make sure to only output raw text. NEVER output formatted text, markdown or emoj
         await self.virtual_try_client.connect_to_server("python", ["./mcp_server.py"])

         self.tools = (
-            await self.agora_client.tools
+            self.display_tool
+            + await self.agora_client.tools
             + await self.fewsats_client.tools
             + await self.virtual_try_client.tools
-            + [self.display_tool]
         )
         self.clients_connected = True

@@ -224,6 +224,7 @@ Make sure to only output raw text. NEVER output formatted text, markdown or emoj
             messages=chat_history,
             stream=True,
             tools=self.tools,
+            temperature=0.7,
         )
         pending_tool_calls: dict[int, ChoiceDeltaToolCall] = {}

@@ -282,48 +283,52 @@ Make sure to only output raw text. NEVER output formatted text, markdown or emoj
                     }
                 )

-                mcp_client = self._get_mcp_client_for_tool(tool_name)
-                if mcp_client is None:
-                    print(f"Tool {tool_name} not found in any MCP client.")
-                    tool_responses.append(
-                        {
+                try:
+                    print(f"Calling tool {tool_name} with args: {tool_args}")
+                    if tool_name.startswith("Display."):
+                        args = json.loads(tool_args) if tool_args else {}
+                        update_ui(
+                            args.get("products"),
+                            args.get("image_url"),
+                            tool_name == "Display.clear_display",
+                        )
+                        tool_response: ChatCompletionToolMessageParam = {
                             "role": "tool",
                             "tool_call_id": call_id,
-                            "content": f"Unable to find tool '{tool_name}'.",
+                            "content": (
+                                "Content displayed successfully."
+                                if tool_name != "clear_display"
+                                else "Display cleared."
+                            ),
                         }
-                    )
-                else:
-                    try:
-                        print(f"Calling tool {tool_name} with args: {tool_args}")
-                        if tool_name == "display":
-                            args = json.loads(tool_args) if tool_args else {}
-                            update_ui(
-                                args.get("products"),
-                                args.get("image_url"),
-                                args.get("clear_ui"),
+                    else:
+                        mcp_client = self._get_mcp_client_for_tool(tool_name)
+                        if mcp_client is None:
+                            print(f"Tool {tool_name} not found in any MCP client.")
+                            tool_responses.append(
+                                {
+                                    "role": "tool",
+                                    "tool_call_id": call_id,
+                                    "content": f"Unable to find tool '{tool_name}'.",
+                                }
                             )
-                            tool_response: ChatCompletionToolMessageParam = {
-                                "role": "tool",
-                                "tool_call_id": call_id,
-                                "content": "Content displayed successfully.",
-                            }
                         else:
                             tool_response = await mcp_client.call_tool(
                                 call_id=call_id,
                                 tool_name=tool_name,
                                 tool_args=json.loads(tool_args) if tool_args else None,
                             )
-                        print("Tool responded")
-                        tool_responses.append(tool_response)
-                    except Exception as e:
-                        print(f"Error calling tool {tool_name}: {e}")
-                        tool_responses.append(
-                            {
-                                "role": "tool",
-                                "tool_call_id": call_id,
-                                "content": f"Error calling tool '{tool_name}', Error: {str(e)[:500]}",
-                            }
-                        )
+                    print("Tool responded")
+                    tool_responses.append(tool_response)
+                except Exception as e:
+                    print(f"Error calling tool {tool_name}: {e}")
+                    tool_responses.append(
+                        {
+                            "role": "tool",
+                            "tool_call_id": call_id,
+                            "content": f"Error calling tool '{tool_name}', Error: {str(e)[:500]}",
+                        }
+                    )

     def _build_input_image_content(
         self, input_image: Image.Image, image_label: str
@@ -348,43 +353,76 @@ Make sure to only output raw text. NEVER output formatted text, markdown or emoj
     )


-def _build_display_tool_definition() -> ChatCompletionToolParam:
-    return {
-        "type": "function",
-        "function": {
-            "name": "display",
-            "description": """This tool Shows/Displays content to the user.
-You can use this tool whenever you want to show responses you get from other tools or when the user requests to see something that you have access to, like a list of products, specific product(s) from the conversation history, an image, or cart items.
-
-You can only pass one argument at a time, either products or image_url, or clear_ui.
+def _build_display_tool_definitions() -> list[ChatCompletionToolParam]:
+    return [
+        {
+            "type": "function",
+            "function": {
+                "name": "Display.display_products",
+                "description": """
+Display a list of products. Use this to show search results, cart items, or products from conversation history.
+
+Args:
+    products: A list of products to display. Each product should have a name, image URL, and formatted price.
+    example:
+        products: [
+            {
+                "name": "Stylish Green Shirt",
+                "image_url": "https://example.com/images/green-shirt.jpg",
+                "price": "$29.99"
+            },
+            {
+                "name": "Comfortable Jeans",
+                "image_url": "https://example.com/images/jeans.jpg",
+                "price": "$49.99"
+            }
+        ]
 """,
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "products": {
-                        "type": "array",
-                        "items": {
-                            "type": "object",
-                            "properties": {
-                                "name": {"type": "string"},
-                                "image_url": {"type": "string"},
-                                "price": {"type": "string"},
+                "parameters": {
+                    "properties": {
+                        "products": {
+                            "items": {
+                                "additionalProperties": {"type": "string"},
+                                "type": "object",
                             },
-                            "required": ["name", "image_url", "price"],
-                        },
-                        "description": "A list of products to display from search results, cart items, or conversation history.",
-                    },
-                    "image_url": {
-                        "type": "string",
-                        "description": "An optional URL of an image to display.",
+                            "title": "Products",
+                            "type": "array",
+                        }
                     },
-                    "clear_ui": {
-                        "type": "boolean",
-                        "description": (
-                            "If true, clear the UI instead of displaying anything."
-                        ),
+                    "required": ["products"],
+                    "title": "display_productsArguments",
+                    "type": "object",
+                },
+            },
+        },
+        {
+            "type": "function",
+            "function": {
+                "name": "Display.display_image",
+                "description": "Display a single standalone image. Use this for virtual try-on results, a specific product image requested by the user, or any other relevant single image.\n\nArgs:\n image_url: The URL of the image to display.",
+                "parameters": {
+                    "properties": {
+                        "image_url": {
+                            "title": "Image URL",
+                            "type": "string",
+                        },
                     },
+                    "required": ["image_url"],
+                    "title": "display_imageArguments",
+                    "type": "object",
+                },
+            },
+        },
+        {
+            "type": "function",
+            "function": {
+                "name": "Display.clear_display",
+                "description": "Clear any content currently displayed in the user interface. Removes everything from the visual display area.\n\nArgs: None",
+                "parameters": {
+                    "properties": {},
+                    "title": "clear_displayArguments",
+                    "type": "object",
                 },
             },
         },
-    }
+    ]
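
In the agent loop, `pending_tool_calls: dict[int, ChoiceDeltaToolCall]` collects the tool-call fragments that arrive over the streamed completion before any tool is invoked; only then does the "Display." prefix decide whether a call is handled locally via `update_ui` or forwarded to an MCP client. A rough sketch of that accumulation step written against the OpenAI streaming delta types; the helper function is illustrative, not the repo's exact code:

from openai.types.chat import ChatCompletionChunk
from openai.types.chat.chat_completion_chunk import ChoiceDeltaToolCall


def merge_tool_call_deltas(
    chunk: ChatCompletionChunk,
    pending_tool_calls: dict[int, ChoiceDeltaToolCall],
) -> None:
    """Accumulate streamed tool-call fragments keyed by their index.

    Each chunk may carry only a slice of the JSON arguments, so fragments
    are concatenated until the stream signals the call is complete.
    """
    delta = chunk.choices[0].delta
    for tool_call in delta.tool_calls or []:
        if tool_call.index not in pending_tool_calls:
            pending_tool_calls[tool_call.index] = tool_call
        else:
            pending = pending_tool_calls[tool_call.index]
            if tool_call.function and tool_call.function.arguments:
                pending.function.arguments = (
                    pending.function.arguments or ""
                ) + tool_call.function.arguments

Once the stream finishes, each accumulated call carries the full JSON `arguments` string, which is what the loop above parses with `json.loads` before dispatching.
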
mcp_host/tts/gradio_api_tts.py CHANGED
@@ -37,7 +37,7 @@ async def stream_text_to_speech(
     standard_lang_code = KOKORO_TO_STD_LANG[kokoro_lang]

     for text in generate_sentences(text_stream, language=standard_lang_code):
-        print(f"Streaming audio for text: {text[:10]}...")
+        print(f"Streaming audio for text: {text}")
         audio = client.submit(
             text=text, voice=voice, speed=1, use_gpu=True, api_name="/stream"
         )
mcp_host/tts/hf_zero_gpu_tts.py CHANGED
@@ -58,6 +58,7 @@ async def stream_text_to_speech(
     standard_lang_code = KOKORO_TO_STD_LANG[kokoro_lang]

     for text in generate_sentences(text_stream, language=standard_lang_code):
+        print(f"Streaming audio for text: {text}")
         for audio in text_to_speech(text, pipe_key=kokoro_lang, voice=voice):
             yield 24000, audio

mcp_host/ui.py CHANGED
@@ -13,7 +13,7 @@ def UI(products_state: gr.State, image_state: gr.State):
             justify-content: center;
             height: 600px;
             width: 100%;
-            background: linear-gradient(rgba(0,0,0,0.3), rgba(0,0,0,0.3)), url('{get_hf_space_file_url_prefix()}static/welcome-to-vibe-shopping.webp');
+            background: linear-gradient(rgba(0,0,0,0.3), rgba(0,0,0,0.3)), url('{get_hf_space_file_url_prefix()}static/welcome-to-vibe-shopping-upscaled.webp');
             background-size: cover;
             background-position: center;
             background-repeat: no-repeat;
static/welcome-to-vibe-shopping.webp CHANGED

Git LFS Details

  • SHA256: 457f8f04916ead700e1d8f6168d043be09a049908a3d7c9f5225aba480c25442
  • Pointer size: 130 Bytes
  • Size of remote file: 51.5 kB