# app.py (CS3 compliant, with safe fallbacks) # - Prometheus metrics on :8000 # - Gradio UI on :7860 (0.0.0.0 for Docker) # - Local vs API selection via checkbox + env # - Falls back to local model if no API creds # - Avoids double-counting RESP_LATENCY import os import json import random import time from typing import Optional import gradio as gr import requests from prometheus_client import start_http_server, Counter, Histogram, Gauge, Info print("[CS3] STARTUP") # ========== Config ========== PRODUCT_KIND = os.getenv("PRODUCT_KIND", "unknown") # "local" | "api" | "unknown" # Local model LOCAL_MODEL = os.getenv("LOCAL_MODEL", "sshleifer/tiny-gpt2").strip() # OpenAI-compatible provider (OpenRouter / Together / OpenAI) OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL", "").strip() OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "").strip() # Hugging Face Router fallback HF_BASE_URL = os.getenv("HF_BASE_URL", "https://router.huggingface.co").strip() HF_MODEL_ID = os.getenv("HF_MODEL_ID", "google/gemma-2-2b-it").strip() HF_TOKEN = os.getenv("HF_TOKEN", "").strip() print(f"[CS3] PRODUCT_KIND={PRODUCT_KIND}") print(f"[CS3] LOCAL_MODEL={LOCAL_MODEL}") print(f"[CS3] OPENAI_BASE_URL={'' if OPENAI_BASE_URL else ''}") print(f"[CS3] HF_BASE_URL={HF_BASE_URL}") print(f"[CS3] HF_MODEL_ID={HF_MODEL_ID}") print(f"[CS3] HF_TOKEN={'' if HF_TOKEN else ''}") # ========== Metrics ========== REQS_TOTAL = Counter( "gompei_requests_total", "Total chat requests processed", ["product", "status"], ) RESP_LATENCY = Histogram( "gompei_response_latency_seconds", "End-to-end response latency (seconds)", buckets=(0.05, 0.1, 0.25, 0.5, 1, 2, 5, 10, 20), ) ACTIVE_SESSIONS = Gauge( "gompei_active_sessions", "Active chat sessions (len(history) proxy)", ) TOKENS_OUT = Counter( "gompei_tokens_emitted_total", "Approx tokens emitted (chars/4 heuristic)", ["product"], ) BUILD_INFO = Info( "gompei_build_info", "Build/provider/model info for this instance", ) # ========== Facts + CSS ========== FACTS_PATH = 
"facts.json"  # completes the FACTS_PATH assignment begun on the previous line

# Single safe fact used whenever facts.json is missing, unreadable, or empty.
DEFAULT_FACTS = [{"text": "WPI was founded in 1865 by John Boynton and Ichabod Washburn."}]

# Load curated WPI facts at import time; any failure falls back to the default.
# Entries are assumed to be dicts with a "text" key — TODO confirm facts.json schema.
try:
    with open(FACTS_PATH, "r") as f:
        WPI_FACTS = json.load(f)
    if not isinstance(WPI_FACTS, list) or not WPI_FACTS:
        WPI_FACTS = DEFAULT_FACTS
except Exception as e:
    print(f"[CS3] Could not load facts.json: {e}")
    WPI_FACTS = DEFAULT_FACTS

# Centers the element with id "title" in the Gradio UI.
fancy_css = "#title { text-align: center; }"


# ========== Prompt helpers ==========
def _build_local_prompt(msgs: list[dict[str, str]]) -> str:
    """Simple chat-ish prompt for local text-generation.

    Renders each message as "Role: content" and appends a trailing
    "Assistant:" so the model continues as the assistant. Unlike
    _build_hf_chat_prompt, any role other than system/user is rendered
    as Assistant (no messages are dropped).
    """
    parts = []
    for m in msgs:
        r = m["role"]
        if r == "system":
            parts.append(f"System: {m['content']}")
        elif r == "user":
            parts.append(f"User: {m['content']}")
        else:
            parts.append(f"Assistant: {m['content']}")
    parts.append("Assistant:")
    return "\n".join(parts)


def _build_chat_messages(
    system_message: str,
    history: list[dict[str, str]],
    user_text: str,
) -> list[dict[str, str]]:
    """Assemble an OpenAI-style message list: system, prior history, new user turn."""
    msgs = [{"role": "system", "content": system_message}]
    msgs.extend(history or [])
    msgs.append({"role": "user", "content": user_text})
    return msgs


def _build_hf_chat_prompt(msgs: list[dict[str, str]]) -> str:
    """Flat prompt for HF /v1/completions.

    Same "Role: content" rendering as _build_local_prompt, but messages with
    an unrecognized role are silently skipped rather than treated as Assistant.
    """
    parts = []
    for m in msgs:
        r = m["role"]
        if r == "system":
            parts.append(f"System: {m['content']}")
        elif r == "user":
            parts.append(f"User: {m['content']}")
        elif r == "assistant":
            parts.append(f"Assistant: {m['content']}")
    parts.append("Assistant:")
    return "\n".join(parts)


# ========== State for local model ==========
# Lazily-initialized module-level cache for the transformers pipeline, so the
# local model is loaded at most once per process (see the local path in respond).
pipe = None
tokenizer = None


# ========== Core chat handler ==========
def respond(
    message: str,
    history: list[dict[str, str]],
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
    use_local_model: bool,
    _unused_login: Optional[object] = None,  # kept for signature compatibility
):
    """
    Main chat handler (generator — Gradio consumes the yielded strings):
    - If use_local_model is True → always local HF transformers pipeline.
    - Else:
        - If OPENAI_API_KEY set → OpenAI-compatible API path.
        - Elif HF_TOKEN set → Hugging Face Router /v1/completions.
        - Else → fall back to local model (no more 🔐 error).

    Every exit path funnels through the finally block, which records the
    four Prometheus metrics exactly once per request.
    """
    global pipe, tokenizer
    start_time = time.time()
    token_estimate = 0  # chars // 4 heuristic, accumulated per yielded reply
    status = "ok"       # flipped to "error" on any failure; used as metric label
    try:
        # Pick a random WPI fact and append it to the user's message.
        fact = random.choice(WPI_FACTS)["text"]
        user_with_fact = f"{message}\n\nFun fact: {fact}"

        # Effective decision: if no API creds at all, force local.
        effective_use_local = use_local_model
        if not effective_use_local and not OPENAI_API_KEY and not HF_TOKEN:
            print("[CS3] No OPENAI_API_KEY or HF_TOKEN found; falling back to local model.")
            effective_use_local = True

        # ---------- LOCAL MODEL PATH ----------
        if effective_use_local:
            print("[CS3] MODE=local")
            try:
                # Imported lazily so API-only deployments don't need transformers.
                from transformers import pipeline, AutoTokenizer
                try:
                    import torch
                    # Cap CPU threads (small container); best-effort only.
                    torch.set_num_threads(2)
                except Exception:
                    pass
                # Build pipeline + tokenizer once, then reuse across requests.
                if pipe is None or tokenizer is None:
                    tokenizer = AutoTokenizer.from_pretrained(
                        LOCAL_MODEL,
                        trust_remote_code=True,
                    )
                    pipe = pipeline(
                        "text-generation",
                        model=LOCAL_MODEL,
                        tokenizer=tokenizer,
                        device_map="auto",
                        trust_remote_code=True,
                    )
                local_msgs = (
                    [{"role": "system", "content": system_message}]
                    + (history or [])
                    + [{"role": "user", "content": user_with_fact}]
                )
                prompt = _build_local_prompt(local_msgs)
                outputs = pipe(
                    prompt,
                    max_new_tokens=int(max_tokens),
                    do_sample=True,
                    temperature=float(temperature),
                    top_p=float(top_p),
                    pad_token_id=getattr(tokenizer, "eos_token_id", None),
                    eos_token_id=getattr(tokenizer, "eos_token_id", None),
                )
                # Pipeline returns prompt + continuation; strip the prompt prefix,
                # then drop any echoed "Assistant:" marker from the continuation.
                full = outputs[0]["generated_text"]
                assistant = full[len(prompt):].strip()
                if "Assistant:" in assistant:
                    assistant = assistant.split("Assistant:", 1)[-1].strip()
                token_estimate += max(0, len(assistant)) // 4
                yield assistant
                return
            except Exception as e:
                status = "error"
                print(f"[CS3] Local model failed: {e}")
                yield f"⚠️ Local model error: {e}"
                return

        # ---------- API PATH (OPENAI-COMPATIBLE) ----------
        if OPENAI_API_KEY:
            print("[CS3] MODE=api (OpenAI-compatible)")
            base = (OPENAI_BASE_URL or "https://openrouter.ai/api").rstrip("/")
            url = f"{base}/v1/chat/completions"
            headers = {
                "Authorization": f"Bearer {OPENAI_API_KEY}",
                # optional headers for OpenRouter, safe to send elsewhere too
                "HTTP-Referer": os.getenv("OR_REFERER", "http://localhost"),
                "X-Title": os.getenv("OR_TITLE", "Gompei CS3"),
            }
            msgs = _build_chat_messages(system_message, history or [], user_with_fact)
            payload = {
                "model": HF_MODEL_ID,  # reuse env var to choose model name
                "messages": msgs,
                "max_tokens": int(max_tokens),
                "temperature": float(temperature),
                "top_p": float(top_p),
            }
            try:
                r = requests.post(url, headers=headers, json=payload, timeout=120)
                if r.status_code == 401:
                    status = "error"
                    yield "⚠️ Auth failed (401). Check OPENAI_API_KEY and model access."
                elif r.status_code >= 400:
                    status = "error"
                    yield f"⚠️ API error {r.status_code}: {r.text[:300]}"
                else:
                    data = r.json()
                    text = data["choices"][0]["message"]["content"]
                    token_estimate += max(0, len(text)) // 4
                    yield text
            except requests.Timeout:
                status = "error"
                yield "⚠️ API timeout. Try again or lower max tokens."
            except Exception as e:
                status = "error"
                yield f"⚠️ API request failed: {e}"
            return

        # ---------- API PATH (HF ROUTER) ----------
        # We know OPENAI_API_KEY is empty here. Use HF_TOKEN if available.
        if HF_TOKEN:
            print("[CS3] MODE=api (HF Router)")
            url = f"{HF_BASE_URL.rstrip('/')}/v1/completions"
            headers = {"Authorization": f"Bearer {HF_TOKEN}"}
            msgs = _build_chat_messages(system_message, history or [], user_with_fact)
            prompt = _build_hf_chat_prompt(msgs)
            payload = {
                "model": HF_MODEL_ID,
                "prompt": prompt,
                "max_tokens": int(max_tokens),
                "temperature": float(temperature),
                "top_p": float(top_p),
            }
            try:
                r = requests.post(url, headers=headers, json=payload, timeout=120)
                if r.status_code == 401:
                    status = "error"
                    yield "⚠️ Hugging Face auth failed (401). Ensure HF_TOKEN is valid."
                elif r.status_code == 404:
                    status = "error"
                    yield (
                        "⚠️ Model not found at HF Router (404). "
                        "Try a widely available model like "
                        "`google/gemma-2-2b-it` or adjust HF_MODEL_ID."
                    )
                elif r.status_code >= 400:
                    status = "error"
                    yield f"⚠️ HF Router error {r.status_code}: {r.text[:300]}"
                else:
                    data = r.json()
                    # /v1/completions returns "text" (not a chat "message").
                    text = data["choices"][0].get("text") or ""
                    token_estimate += max(0, len(text)) // 4
                    yield text
            except requests.Timeout:
                status = "error"
                yield "⚠️ HF Router timeout. Try again or lower max tokens."
            except Exception as e:
                status = "error"
                yield f"⚠️ HF request failed: {e}"
            return

        # If we somehow get here, fall back to local with a message.
        # (Normally unreachable: the no-creds case was forced local above.)
        status = "error"
        yield "⚠️ No API credentials found; please enable 'Use Local Model' or set OPENAI_API_KEY/HF_TOKEN."
    except Exception:
        status = "error"
        raise
    finally:
        # One end-to-end latency observation per request
        RESP_LATENCY.observe(time.time() - start_time)
        REQS_TOTAL.labels(PRODUCT_KIND, status).inc()
        ACTIVE_SESSIONS.set(0 if not history else len(history))
        TOKENS_OUT.labels(PRODUCT_KIND).inc(token_estimate)


# ========== Gradio UI ==========
def create_demo(enable_oauth: bool = False):
    # NOTE(review): enable_oauth is accepted but unused in this CS3 build.
    with gr.Blocks(css=fancy_css) as demo:
        with gr.Row():
            # NOTE(review): the heading string below was mangled in the source
            # (its markup appears stripped); the literal continues across the
            # following lines — confirm the intended #title markup.
            gr.Markdown("

🐐 Chat with Gompei

")
        # CS3: OAuth disabled; keep a dummy state to match fn signature
        token_input = gr.State(value=None)

        # ChatInterface wires the UI widgets to respond(); additional_inputs
        # must match respond's extra parameters in order:
        # (system_message, max_tokens, temperature, top_p, use_local_model, _unused_login)
        gr.ChatInterface(
            fn=respond,
            additional_inputs=[
                gr.Textbox(
                    value=(
                        "You are Gompei the Goat, WPI's mascot. "
                        "Answer questions with fun goat-like personality and real WPI facts."
                    ),
                    label="System message",
                ),
                gr.Slider(
                    minimum=1,
                    maximum=1024,
                    value=256,
                    step=1,
                    label="Max new tokens",
                ),
                gr.Slider(
                    minimum=0.1,
                    maximum=2.0,
                    value=0.7,
                    step=0.1,
                    label="Temperature",
                ),
                gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.95,
                    step=0.05,
                    label="Top-p (nucleus sampling)",
                ),
                gr.Checkbox(label="Use Local Model", value=False),
                token_input,  # placeholder for _unused_login
            ],
            type="messages",
            # Each example row supplies the user message plus all additional inputs.
            examples=[
                [
                    "Where is WPI located?",
                    (
                        "You are Gompei the Goat, WPI's mascot. "
                        "Answer questions with fun goat-like personality and real WPI facts."
                    ),
                    128,
                    0.7,
                    0.95,
                    False,
                    None,
                ],
                [
                    "Who founded WPI?",
                    (
                        "You are Gompei the Goat, WPI's mascot. "
                        "Answer questions with fun goat-like personality and real WPI facts."
                    ),
                    128,
                    0.7,
                    0.95,
                    False,
                    None,
                ],
            ],
        )
    return demo


# Auto-create UI unless tests/CI skip it
if os.getenv("SKIP_UI_ON_IMPORT") != "1":
    demo = create_demo(enable_oauth=False)

if __name__ == "__main__":
    # Start Prometheus metrics server on :8000 before launching UI
    start_http_server(8000)
    # Publish static provider/model info once, after metrics are being served.
    BUILD_INFO.info({
        "version": "cs3",
        "provider": (
            "openai-compatible" if OPENAI_API_KEY
            else ("hf-router" if HF_TOKEN else "local-only")
        ),
        "local_model": LOCAL_MODEL,
        "hf_model_id": HF_MODEL_ID,
        "product": PRODUCT_KIND,
    })
    # If SKIP_UI_ON_IMPORT suppressed the import-time demo, build it now.
    if "demo" not in globals():
        demo = create_demo(enable_oauth=False)
    demo.queue().launch(
        server_name=os.getenv("GRADIO_SERVER_NAME", "0.0.0.0"),
        server_port=int(os.getenv("GRADIO_SERVER_PORT", "7860")),
        show_api=False,
    )