"""Gradio chat UI for a fine-tuned credit-analyst model.

Set DISABLE_INFERENCE=1 to run fully offline: the app then skips the
huggingface_hub import and answers with a stubbed echo instead.
"""

import os

try:
    import gradio as gr

    USE_GRADIO = True
except ImportError:
    USE_GRADIO = False
    print("Gradio not installed - UI disabled.")

DISABLE_INFERENCE = os.getenv("DISABLE_INFERENCE", "0") == "1"

if not DISABLE_INFERENCE:
    try:
        from huggingface_hub import InferenceClient
    except ImportError as err:
        raise RuntimeError(
            "huggingface_hub missing -- install or set DISABLE_INFERENCE=1"
        ) from err

MODEL_ID = "Dushyant4342/ft-llama3-8b-credit-analyst"
CONTEXT_WINDOW = 4096  # total tokens the model's context can hold
RESERVED_TOKENS = 512  # head-room kept free for the prompt itself
MAX_HISTORY = 10  # most recent history entries forwarded to the model
DEFAULT_SYSTEM = (
    "You are an expert credit analyst. Summarise key positive and negative "
    "changes in a customer's credit profile."
)

_client: "InferenceClient | None" = None


def _get_client():
    """Lazy-init the InferenceClient; only called when inference is enabled."""
    global _client
    if _client is None:
        # InferenceClient takes the target model via `model`; it has no
        # `repo_id` parameter.
        _client = InferenceClient(model=MODEL_ID)
    return _client
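
# respond() returns a stub before ever touching _get_client() when
# DISABLE_INFERENCE=1, so the guarded InferenceClient import above has always
# succeeded by the time a client is created.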


def respond(user_msg, history, system_msg, max_tokens, temperature, top_p):
    """Gradio streaming callback -- transparently stubs when inference is off."""
    sys_prompt = system_msg.strip() or DEFAULT_SYSTEM
    msgs = [{"role": "system", "content": sys_prompt}]
    # With type="messages", history arrives as {"role": ..., "content": ...}
    # dicts (not (user, assistant) tuples), so forward each entry as-is.
    for msg in history[-MAX_HISTORY:]:
        content = (msg.get("content") or "").strip()
        if content:
            msgs.append({"role": msg["role"], "content": content})
    if user_msg.strip():
        msgs.append({"role": "user", "content": user_msg.strip()})

    # Cap generation so the completion fits inside the context window.
    budget = min(max_tokens, CONTEXT_WINDOW - RESERVED_TOKENS)
    if budget <= 0:
        yield "[Error] token budget exhausted."
        return

    if DISABLE_INFERENCE:
        yield f"(stub) echo: {user_msg}"
        return

    client = _get_client()
    response = ""
    try:
        for chunk in client.chat_completion(
            model=MODEL_ID,
            messages=msgs,
            max_tokens=budget,
            temperature=temperature,
            top_p=top_p,
            stream=True,
        ):
            # Stream chunks expose the delta as an attribute, not a dict.
            delta = chunk.choices[0].delta.content or ""
            if delta:
                response += delta
                # ChatInterface replaces the pending message with each yielded
                # value, so yield the accumulated text, not the bare delta.
                yield response
    # Surface failures in-chat instead of raising, so the UI session survives.
    except Exception as err:
        yield f"[Error] inference failed: {err}"


if USE_GRADIO:
    demo = gr.ChatInterface(
        fn=respond,
        title="Credit Analyst Bot (stub-ready)",
        description=(
            "Set <code>DISABLE_INFERENCE=1</code> to work offline. "
            "Otherwise the app will call the hosted model."
        ),
        additional_inputs=[
            gr.Textbox(value=DEFAULT_SYSTEM, label="System message"),
            gr.Slider(1, CONTEXT_WINDOW, value=512, step=1, label="Max new tokens"),
            gr.Slider(0.1, 4.0, value=0.7, step=0.1, label="Temperature"),
            gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p"),
        ],
        type="messages",
    )

    if __name__ == "__main__":
        demo.launch(show_error=True)
else:
    if __name__ == "__main__":
        print("Gradio UI disabled.")
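
# Local usage (a sketch; the filename `app.py` is an assumption, not taken
# from this file):
#   DISABLE_INFERENCE=1 python app.py   # offline stub mode, no remote calls
#   python app.py                       # streams from the hosted model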