Juju519
committed on
Refactor local model handling and Gradio UI settings
Updated local model configuration and improved fallback handling. Adjusted Gradio UI settings and ensured tokenizer initialization.
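For orientation, the fallback order this commit hardens is the one spelled out in respond()'s docstring below: an explicit "Use Local Model" checkbox wins, then API credentials, then the local model as a last resort. A minimal sketch of that selection logic, using the env var names from the diff (the choose_mode helper itself is illustrative, not part of the file):

import os

def choose_mode(use_local: bool) -> str:
    # Checkbox (or env) forces the local path outright.
    if use_local:
        return "local"
    # Prefer an OpenAI-compatible provider when a key is present.
    if os.getenv("OPENAI_API_KEY", "").strip():
        return "api-openai"
    # Otherwise try the Hugging Face Router.
    if os.getenv("HF_TOKEN", "").strip():
        return "api-hf-router"
    # No credentials at all: fall back to the local model instead of erroring.
    return "local"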
app.py
CHANGED
@@ -1,8 +1,9 @@
-# app.py (CS3 compliant,
+# app.py (CS3 compliant, with safe fallbacks)
 # - Prometheus metrics on :8000
 # - Gradio UI on :7860 (0.0.0.0 for Docker)
 # - Local vs API selection via checkbox + env
 # - Falls back to local model if no API creds
+# - Avoids double-counting RESP_LATENCY
 
 import os
 import json
@@ -21,8 +22,8 @@ print("[CS3] STARTUP")
 
 PRODUCT_KIND = os.getenv("PRODUCT_KIND", "unknown")  # "local" | "api" | "unknown"
 
-# Local model
-LOCAL_MODEL = os.getenv("LOCAL_MODEL", "
+# Local model
+LOCAL_MODEL = os.getenv("LOCAL_MODEL", "sshleifer/tiny-gpt2").strip()
 
 # OpenAI-compatible provider (OpenRouter / Together / OpenAI)
 OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL", "").strip()
@@ -128,7 +129,8 @@ def _build_hf_chat_prompt(msgs: list[dict[str, str]]) -> str:
 
 # ========== State for local model ==========
 
-pipe = None
+pipe = None
+tokenizer = None
 
 
 # ========== Core chat handler ==========
@@ -150,9 +152,9 @@ def respond(
     - Else:
         - If OPENAI_API_KEY set → OpenAI-compatible API path.
         - Elif HF_TOKEN set → Hugging Face Router /v1/completions.
-        - Else → fall back to local model (no error).
+        - Else → fall back to local model (no more error).
     """
-    global pipe
+    global pipe, tokenizer
 
     start_time = time.time()
     token_estimate = 0
@@ -171,13 +173,27 @@ def respond(
 
     # ---------- LOCAL MODEL PATH ----------
     if effective_use_local:
-        print(
+        print("[CS3] MODE=local")
         try:
-            from transformers import pipeline
+            from transformers import pipeline, AutoTokenizer
+            try:
+                import torch
+                torch.set_num_threads(2)
+            except Exception:
+                pass
+
+            if pipe is None or tokenizer is None:
+                tokenizer = AutoTokenizer.from_pretrained(
+                    LOCAL_MODEL,
+                    trust_remote_code=True,
+                )
+                pipe = pipeline(
+                    "text-generation",
+                    model=LOCAL_MODEL,
+                    tokenizer=tokenizer,
+                    device_map="auto",
+                    trust_remote_code=True,
+                )
 
             local_msgs = (
                 [{"role": "system", "content": system_message}]
@@ -185,17 +201,17 @@ def respond(
                 + [{"role": "user", "content": user_with_fact}]
             )
             prompt = _build_local_prompt(local_msgs)
-
             outputs = pipe(
                 prompt,
                 max_new_tokens=int(max_tokens),
                 do_sample=True,
                 temperature=float(temperature),
                 top_p=float(top_p),
+                pad_token_id=getattr(tokenizer, "eos_token_id", None),
+                eos_token_id=getattr(tokenizer, "eos_token_id", None),
             )
-            full_text = outputs[0]["generated_text"]
-            assistant = full_text[len(prompt):].strip()
+            full = outputs[0]["generated_text"]
+            assistant = full[len(prompt):].strip()
             if "Assistant:" in assistant:
                 assistant = assistant.split("Assistant:", 1)[-1].strip()
 
@@ -251,6 +267,7 @@ def respond(
             return
 
     # ---------- API PATH (HF ROUTER) ----------
+    # We know OPENAI_API_KEY is empty here. Use HF_TOKEN if available.
     if HF_TOKEN:
         print("[CS3] MODE=api (HF Router)")
         url = f"{HF_BASE_URL.rstrip('/')}/v1/completions"
@@ -315,7 +332,8 @@ def create_demo(enable_oauth: bool = False):
     with gr.Blocks(css=fancy_css) as demo:
         with gr.Row():
             gr.Markdown("<h1 id='title'>🐐 Chat with Gompei</h1>")
-
+        # CS3: OAuth disabled; keep a dummy state to match fn signature
+        token_input = gr.State(value=None)
 
         gr.ChatInterface(
             fn=respond,
@@ -348,8 +366,8 @@ def create_demo(enable_oauth: bool = False):
                     step=0.05,
                     label="Top-p (nucleus sampling)",
                 ),
-                gr.Checkbox(label="Use Local Model", value=
-                token_input,
+                gr.Checkbox(label="Use Local Model", value=False),
+                token_input,  # placeholder for _unused_login
             ],
            type="messages",
            examples=[
@@ -362,7 +380,7 @@ def create_demo(enable_oauth: bool = False):
                    128,
                    0.7,
                    0.95,
-
+                   False,
                    None,
                ],
                [
@@ -374,7 +392,7 @@ def create_demo(enable_oauth: bool = False):
                    128,
                    0.7,
                    0.95,
-
+                   False,
                    None,
                ],
            ],
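Taken out of the diff context, the local-path pattern this commit adds is: initialize the tokenizer and pipeline once, reuse them across calls, and slice the prompt off the generated text. A runnable sketch under the same assumptions as the commit (sshleifer/tiny-gpt2 default, transformers plus accelerate installed); generate_local is an illustrative wrapper, not the app's API:

import os
from transformers import AutoTokenizer, pipeline

LOCAL_MODEL = os.getenv("LOCAL_MODEL", "sshleifer/tiny-gpt2").strip()
pipe = None
tokenizer = None

def generate_local(prompt: str, max_new_tokens: int = 64) -> str:
    global pipe, tokenizer
    # Lazy init: load once on first use, then reuse on every call.
    if pipe is None or tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(LOCAL_MODEL, trust_remote_code=True)
        pipe = pipeline(
            "text-generation",
            model=LOCAL_MODEL,
            tokenizer=tokenizer,
            device_map="auto",
            trust_remote_code=True,
        )
    outputs = pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        # Tiny GPT-2 has no pad token; reuse EOS to silence warnings.
        pad_token_id=getattr(tokenizer, "eos_token_id", None),
    )
    full = outputs[0]["generated_text"]  # includes the prompt verbatim
    return full[len(prompt):].strip()

print(generate_local("User: hi\nAssistant:"))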
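On the UI side, the commit keeps respond()'s signature stable by feeding a dummy gr.State through additional_inputs next to the new checkbox. A self-contained sketch of that wiring, assuming a recent Gradio (4.44+) and an illustrative echo handler in place of the real respond:

import gradio as gr

def respond(message, history, use_local, _unused_login):
    # Stand-in handler; the real app routes to local or API backends here.
    return f"(use_local={use_local}) {message}"

with gr.Blocks() as demo:
    # Dummy state occupies the slot an OAuth token would otherwise fill.
    token_input = gr.State(value=None)
    gr.ChatInterface(
        fn=respond,
        additional_inputs=[
            gr.Checkbox(label="Use Local Model", value=False),
            token_input,  # fills the _unused_login parameter
        ],
        type="messages",
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)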