Spaces (Sleeping)

Juju519 committed: Refactor app.py for clarity and local model usage

Updated comments for clarity and simplified the local model implementation. Adjusted local model usage and default settings in the Gradio UI.
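For reference, the simplified local-model path this commit introduces (see the diff below) reduces to one lazily created transformers pipeline. The standalone sketch that follows condenses those added lines; app.py's _build_local_prompt helper is not shown in this diff, so a plain role-prefixed prompt stands in for it, and the sampling defaults (128 new tokens, temperature 0.7, top-p 0.95) are taken from the Gradio examples.

import os

from transformers import pipeline

LOCAL_MODEL = os.getenv("LOCAL_MODEL", "microsoft/Phi-3-mini-4k-instruct").strip()

pipe = None  # lazily created module-level pipeline, as in app.py


def generate_local(messages, max_tokens=128, temperature=0.7, top_p=0.95):
    """Run one generation with the plain text-generation pipeline."""
    global pipe
    if pipe is None:
        # Single simple pipeline call; no explicit tokenizer/device_map plumbing.
        pipe = pipeline("text-generation", model=LOCAL_MODEL)

    # Stand-in for app.py's _build_local_prompt(): flatten the chat messages.
    prompt = "\n".join(f"{m['role'].capitalize()}: {m['content']}" for m in messages)
    prompt += "\nAssistant:"

    outputs = pipe(
        prompt,
        max_new_tokens=int(max_tokens),
        do_sample=True,
        temperature=float(temperature),
        top_p=float(top_p),
    )
    full_text = outputs[0]["generated_text"]
    # The pipeline echoes the prompt, so keep only the new completion.
    return full_text[len(prompt):].strip()

Dropping the explicit AutoTokenizer, device_map, and pad/eos token handling leaves model placement and padding to the pipeline defaults, which is the simplification the commit message describes.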
app.py CHANGED
@@ -1,9 +1,8 @@
-# app.py (CS3 compliant,
+# app.py (CS3 compliant, simplified local model to avoid DynamicCache issues)
 # - Prometheus metrics on :8000
 # - Gradio UI on :7860 (0.0.0.0 for Docker)
 # - Local vs API selection via checkbox + env
 # - Falls back to local model if no API creds
-# - Avoids double-counting RESP_LATENCY
 
 import os
 import json
@@ -22,7 +21,7 @@ print("[CS3] STARTUP")
 
 PRODUCT_KIND = os.getenv("PRODUCT_KIND", "unknown") # "local" | "api" | "unknown"
 
-# Local model
+# Local model → use Phi-3-mini like your last working version
 LOCAL_MODEL = os.getenv("LOCAL_MODEL", "microsoft/Phi-3-mini-4k-instruct").strip()
 
 # OpenAI-compatible provider (OpenRouter / Together / OpenAI)
@@ -129,8 +128,7 @@ def _build_hf_chat_prompt(msgs: list[dict[str, str]]) -> str:
 
 # ========== State for local model ==========
 
-pipe = None
-tokenizer = None
+pipe = None # simple pipeline object, like in your old code
 
 
 # ========== Core chat handler ==========
@@ -152,9 +150,9 @@ def respond(
     - Else:
       - If OPENAI_API_KEY set → OpenAI-compatible API path.
       - Elif HF_TOKEN set → Hugging Face Router /v1/completions.
-      - Else → fall back to local model (no
+      - Else → fall back to local model (no 🐐 error).
     """
-    global pipe
+    global pipe
 
     start_time = time.time()
     token_estimate = 0
@@ -173,27 +171,13 @@ def respond(
 
     # ---------- LOCAL MODEL PATH ----------
     if effective_use_local:
-        print("[CS3] MODE=local")
+        print(f"[CS3] MODE=local (model={LOCAL_MODEL})")
         try:
-            from transformers import pipeline
-
-
-
-
-            pass
-
-            if pipe is None or tokenizer is None:
-                tokenizer = AutoTokenizer.from_pretrained(
-                    LOCAL_MODEL,
-                    trust_remote_code=True,
-                )
-                pipe = pipeline(
-                    "text-generation",
-                    model=LOCAL_MODEL,
-                    tokenizer=tokenizer,
-                    device_map="auto",
-                    trust_remote_code=True,
-                )
+            from transformers import pipeline
+
+            if pipe is None:
+                # Simple pipeline, just like your last working version
+                pipe = pipeline("text-generation", model=LOCAL_MODEL)
 
             local_msgs = (
                 [{"role": "system", "content": system_message}]
@@ -201,17 +185,17 @@ def respond(
                 + [{"role": "user", "content": user_with_fact}]
             )
             prompt = _build_local_prompt(local_msgs)
+
             outputs = pipe(
                 prompt,
                 max_new_tokens=int(max_tokens),
                 do_sample=True,
                 temperature=float(temperature),
                 top_p=float(top_p),
-                pad_token_id=getattr(tokenizer, "eos_token_id", None),
-                eos_token_id=getattr(tokenizer, "eos_token_id", None),
             )
-
-
+
+            full_text = outputs[0]["generated_text"]
+            assistant = full_text[len(prompt):].strip()
             if "Assistant:" in assistant:
                 assistant = assistant.split("Assistant:", 1)[-1].strip()
 
@@ -267,7 +251,6 @@ def respond(
             return
 
     # ---------- API PATH (HF ROUTER) ----------
-    # We know OPENAI_API_KEY is empty here. Use HF_TOKEN if available.
    if HF_TOKEN:
        print("[CS3] MODE=api (HF Router)")
        url = f"{HF_BASE_URL.rstrip('/')}/v1/completions"
@@ -332,8 +315,7 @@ def create_demo(enable_oauth: bool = False):
     with gr.Blocks(css=fancy_css) as demo:
         with gr.Row():
             gr.Markdown("<h1 id='title'>🐐 Chat with Gompei</h1>")
-
-        token_input = gr.State(value=None)
+        token_input = gr.State(value=None) # dummy to match fn signature
 
         gr.ChatInterface(
             fn=respond,
@@ -366,8 +348,8 @@ def create_demo(enable_oauth: bool = False):
                 step=0.05,
                 label="Top-p (nucleus sampling)",
             ),
-            gr.Checkbox(label="Use Local Model", value=
-            token_input,
+            gr.Checkbox(label="Use Local Model", value=True), # default ON now
+            token_input,
         ],
        type="messages",
        examples=[
@@ -380,7 +362,7 @@ def create_demo(enable_oauth: bool = False):
                 128,
                 0.7,
                 0.95,
-
+                True,
                 None,
             ],
             [
@@ -392,7 +374,7 @@ def create_demo(enable_oauth: bool = False):
                 128,
                 0.7,
                 0.95,
-
+                True,
                 None,
             ],
         ],
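Read together with the respond() docstring in the diff, the backend precedence is: the "Use Local Model" checkbox (now defaulting to True) forces the local pipeline; otherwise OPENAI_API_KEY selects the OpenAI-compatible path; otherwise HF_TOKEN selects the Hugging Face Router completions endpoint; otherwise the app falls back to the local model. Below is a minimal sketch of that ordering; pick_backend is an illustrative helper and is not defined in app.py.

import os

# Mirrors the env vars read in app.py.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "").strip()
HF_TOKEN = os.getenv("HF_TOKEN", "").strip()


def pick_backend(use_local_checkbox: bool) -> str:
    """Return which path respond() would take for a given checkbox state."""
    if use_local_checkbox:
        return "local"  # checkbox wins outright
    if OPENAI_API_KEY:
        return "openai-compatible"  # OpenRouter / Together / OpenAI
    if HF_TOKEN:
        return "hf-router"  # app.py builds f"{HF_BASE_URL.rstrip('/')}/v1/completions"
    return "local"  # no API creds: fall back to the local model


if __name__ == "__main__":
    print(pick_backend(use_local_checkbox=False))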