Juju519
committed on
Refactor comments and improve latency handling
Updated comments for clarity and compliance, modified latency observation logic, and adjusted OAuth handling in the Gradio demo.
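
For reference, a minimal sketch of the single-observation latency pattern the diff below converges on. The metric names match the diff; the prometheus_client constructors, label names, port wiring, and the stub handler are assumptions for illustration, not copied from the repo:

import time
from prometheus_client import Counter, Histogram, start_http_server

# Metric names follow the diff; documentation strings and labels are guesses.
REQS_TOTAL = Counter("reqs_total", "Requests served", ["product", "status"])
TOKENS_OUT = Counter("tokens_out", "Estimated output tokens", ["product"])
RESP_LATENCY = Histogram("resp_latency_seconds", "End-to-end request latency")

PRODUCT_KIND = "api"  # set from the PRODUCT_KIND env var in the real app

def handle_request() -> str:
    status = "ok"
    start_time = time.time()
    token_estimate = 0
    try:
        text = "stub model reply"             # stand-in for the backend call
        token_estimate += max(0, len(text)) // 4
        return text
    except Exception:
        status = "error"
        raise
    finally:
        # Exactly one latency sample per request; the old per-call t0
        # observations inside the try blocks double-counted latency.
        RESP_LATENCY.observe(time.time() - start_time)
        REQS_TOTAL.labels(PRODUCT_KIND, status).inc()
        TOKENS_OUT.labels(PRODUCT_KIND).inc(token_estimate)

if __name__ == "__main__":
    start_http_server(8000)  # metrics on :8000, per the new header comment
    print(handle_request())
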
app.py
CHANGED
@@ -1,4 +1,9 @@
-# app.py
+# app.py (CS3 compliant)
+# - Exposes Python Prometheus metrics on :8000
+# - Gradio UI on :7860 (binds 0.0.0.0 for Docker)
+# - Distinguishes local vs API product via PRODUCT_KIND env
+# - Avoids double-counting RESP_LATENCY (observed once in finally)
+
 import os, json, random, time
 from typing import Optional
 
@@ -55,6 +60,7 @@ except Exception:
 fancy_css = "#title { text-align:center; }"
 
 # -------- Helpers --------
+
 def _build_local_prompt(msgs: list[dict[str, str]]) -> str:
     """Simple chat-ish prompt for local text-generation."""
     parts = []
@@ -69,6 +75,7 @@ def _build_local_prompt(msgs: list[dict[str, str]]) -> str:
     parts.append("Assistant:")
     return "\n".join(parts)
 
+
 def _build_chat_messages(system_message: str, history: list[dict[str, str]], user_text: str):
     """Build OpenAI-style chat messages."""
     msgs = [{"role": "system", "content": system_message}]
@@ -76,6 +83,7 @@ def _build_chat_messages(system_message: str, history: list[dict[str, str]], use
     msgs.append({"role": "user", "content": user_text})
     return msgs
 
+
 def _build_hf_chat_prompt(msgs: list[dict[str, str]]) -> str:
     """Simple chat-style prompt for HF /v1/completions."""
     parts = []
@@ -94,6 +102,7 @@ def _build_hf_chat_prompt(msgs: list[dict[str, str]]) -> str:
 pipe = None
 tokenizer = None
 
+
 def respond(
     message,
     history: list[dict[str, str]],
@@ -174,7 +183,6 @@ def respond(
         "top_p": float(top_p),
     }
     try:
-        t0 = time.time()
         r = requests.post(url, headers=headers, json=payload, timeout=120)
         if r.status_code == 401:
             status = "error"
@@ -185,7 +193,6 @@ def respond(
         else:
             data = r.json()
             text = data["choices"][0]["message"]["content"]
-            RESP_LATENCY.observe(time.time() - t0)
             token_estimate += max(0, len(text)) // 4
             yield text
     except requests.Timeout:
@@ -214,7 +221,6 @@ def respond(
         "top_p": float(top_p),
     }
     try:
-        t0 = time.time()
         r = requests.post(url, headers=headers, json=payload, timeout=120)
         if r.status_code == 401:
             status = "error"
@@ -230,7 +236,6 @@ def respond(
         else:
             data = r.json()
             text = data["choices"][0].get("text") or ""
-            RESP_LATENCY.observe(time.time() - t0)
             token_estimate += max(0, len(text)) // 4
             yield text
     except requests.Timeout:
@@ -244,16 +249,18 @@ def respond(
             status = "error"
             raise
     finally:
+        # One end-to-end latency observation per request
+        RESP_LATENCY.observe(time.time() - start_time)
         REQS_TOTAL.labels(PRODUCT_KIND, status).inc()
         ACTIVE_SESSIONS.set(0 if not history else len(history))
-        RESP_LATENCY.observe(time.time() - start_time)
         TOKENS_OUT.labels(PRODUCT_KIND).inc(token_estimate)
 
+
 def create_demo(enable_oauth: bool = False):
     with gr.Blocks(css=fancy_css) as demo:
         with gr.Row():
             gr.Markdown("<h1 id='title'>🐐 Chat with Gompei</h1>")
-            token_input = gr.State(value=None) #
+            token_input = gr.State(value=None)  # OAuth disabled; keep state slot
 
         gr.ChatInterface(
             fn=respond,