Juju519 committed on
Commit 759baac · unverified · 1 Parent(s): d5497ad

Refactor comments and improve latency handling


Updated comments for clarity and compliance, changed the latency observation logic so each request is recorded exactly once, and adjusted OAuth handling in the Gradio demo.
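
Note: the latency change consolidates three separate RESP_LATENCY.observe() calls (one per backend branch plus one in the finally block) into a single end-to-end observation. A minimal sketch of the resulting pattern, assuming prometheus_client; RESP_LATENCY, REQS_TOTAL, start_time, and status appear in the diff below, while the metric names, label values, and generator body here are illustrative only:

import time
from prometheus_client import Counter, Histogram

RESP_LATENCY = Histogram("resp_latency_seconds", "End-to-end response latency")
REQS_TOTAL = Counter("requests_total", "Chat requests", ["product", "status"])

def respond_sketch(message: str):
    """Stream a reply; record latency exactly once, even on error paths."""
    start_time = time.time()
    status = "ok"
    try:
        # ... call the local pipeline or remote API here, yielding text ...
        yield f"echo: {message}"
    except Exception:
        status = "error"
        raise
    finally:
        # A single observation per request; previously each successful API
        # call also observed, so one request could contribute two samples.
        RESP_LATENCY.observe(time.time() - start_time)
        REQS_TOTAL.labels("api", status).inc()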

Files changed (1)
  1. app.py +14 -7
app.py CHANGED

@@ -1,4 +1,9 @@
-# app.py
+# app.py (CS3 compliant)
+# - Exposes Python Prometheus metrics on :8000
+# - Gradio UI on :7860 (binds 0.0.0.0 for Docker)
+# - Distinguishes local vs API product via PRODUCT_KIND env
+# - Avoids double-counting RESP_LATENCY (observed once in finally)
+
 import os, json, random, time
 from typing import Optional
 
@@ -55,6 +60,7 @@ except Exception:
 fancy_css = "#title { text-align:center; }"
 
 # -------- Helpers --------
+
 def _build_local_prompt(msgs: list[dict[str, str]]) -> str:
     """Simple chat-ish prompt for local text-generation."""
     parts = []
@@ -69,6 +75,7 @@ def _build_local_prompt(msgs: list[dict[str, str]]) -> str:
     parts.append("Assistant:")
     return "\n".join(parts)
 
+
 def _build_chat_messages(system_message: str, history: list[dict[str, str]], user_text: str):
     """Build OpenAI-style chat messages."""
     msgs = [{"role": "system", "content": system_message}]
@@ -76,6 +83,7 @@ def _build_chat_messages(system_message: str, history: list[dict[str, str]], use
     msgs.append({"role": "user", "content": user_text})
     return msgs
 
+
 def _build_hf_chat_prompt(msgs: list[dict[str, str]]) -> str:
     """Simple chat-style prompt for HF /v1/completions."""
     parts = []
@@ -94,6 +102,7 @@ def _build_hf_chat_prompt(msgs: list[dict[str, str]]) -> str:
 pipe = None
 tokenizer = None
 
+
 def respond(
     message,
     history: list[dict[str, str]],
@@ -174,7 +183,6 @@ def respond(
             "top_p": float(top_p),
         }
         try:
-            t0 = time.time()
             r = requests.post(url, headers=headers, json=payload, timeout=120)
             if r.status_code == 401:
                 status = "error"
@@ -185,7 +193,6 @@
             else:
                 data = r.json()
                 text = data["choices"][0]["message"]["content"]
-                RESP_LATENCY.observe(time.time() - t0)
                 token_estimate += max(0, len(text)) // 4
                 yield text
         except requests.Timeout:
@@ -214,7 +221,6 @@
             "top_p": float(top_p),
         }
         try:
-            t0 = time.time()
             r = requests.post(url, headers=headers, json=payload, timeout=120)
             if r.status_code == 401:
                 status = "error"
@@ -230,7 +236,6 @@
             else:
                 data = r.json()
                 text = data["choices"][0].get("text") or ""
-                RESP_LATENCY.observe(time.time() - t0)
                 token_estimate += max(0, len(text)) // 4
                 yield text
         except requests.Timeout:
@@ -244,16 +249,18 @@
         status = "error"
         raise
     finally:
+        # One end-to-end latency observation per request
+        RESP_LATENCY.observe(time.time() - start_time)
         REQS_TOTAL.labels(PRODUCT_KIND, status).inc()
         ACTIVE_SESSIONS.set(0 if not history else len(history))
-        RESP_LATENCY.observe(time.time() - start_time)
         TOKENS_OUT.labels(PRODUCT_KIND).inc(token_estimate)
 
+
 def create_demo(enable_oauth: bool = False):
     with gr.Blocks(css=fancy_css) as demo:
         with gr.Row():
             gr.Markdown("<h1 id='title'>🐐 Chat with Gompei</h1>")
-        token_input = gr.State(value=None)  # disable OAuth to avoid confusion
+        token_input = gr.State(value=None)  # OAuth disabled; keep state slot
 
         gr.ChatInterface(
             fn=respond,
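
The new header comments also pin down the serving layout: metrics on :8000, UI on :7860. A sketch of the wiring they imply, assuming the usual prometheus_client pattern; create_demo comes from the diff above, while the __main__ block and the PRODUCT_KIND default are assumptions about the rest of app.py:

import os
from prometheus_client import start_http_server

PRODUCT_KIND = os.getenv("PRODUCT_KIND", "api")  # assumed default; "local" or "api"

if __name__ == "__main__":
    start_http_server(8000)                 # Prometheus scrapes :8000/metrics
    demo = create_demo(enable_oauth=False)  # OAuth off, per this commit
    demo.launch(server_name="0.0.0.0",      # bind all interfaces for Docker
                server_port=7860)           # Gradio UI on :7860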
 
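
On the OAuth side, token_input = gr.State(value=None) keeps a token slot wired into the chat function while login stays off, so respond's signature need not change if OAuth is re-enabled. A sketch of that shape, assuming the state is passed via additional_inputs (the exact wiring is outside this hunk, and the respond body here is illustrative):

import gradio as gr

def respond(message, history, hf_token=None):
    # hf_token stays None while OAuth is disabled; keeping the parameter
    # makes swapping gr.State for a real login token a one-line change.
    yield message

with gr.Blocks() as demo:
    token_input = gr.State(value=None)  # placeholder instead of an OAuth token
    gr.ChatInterface(fn=respond, additional_inputs=[token_input], type="messages")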