Commit 6fe9315 (verified) by Jensin · 1 Parent(s): 04f4a68

Update app.py

Files changed (1):
  1. app.py +141 -77

app.py CHANGED
@@ -1,73 +1,66 @@
- import gradio as gr
  from huggingface_hub import InferenceClient
- from typing import List, Tuple, Iterator
- from huggingface_hub.utils import HfHubHTTPError

- # Custom HuggingFace Inference endpoint
- MODEL_ID = Dushyant4342/ft-llama3-8b-credit-analyst
- # Context window constraints (Llama-3-8B supports ~4096 tokens)
- CONTEXT_WINDOW = 4096
- RESERVED_TOKENS = 512  # reserve space for response
- MAX_HISTORY_ENTRIES = 10  # cap history length to prevent context overflow

- # Default system prompt
  DEFAULT_SYSTEM = (
      "You are an expert credit analyst. Your role is to analyze a customer's "
      "credit data and generate a concise summary of the most important "
      "positive and negative changes."
  )

- def respond(
-     user_message: str,
-     history: List[Tuple[str, str]],
-     system_message: str,
-     max_tokens: int,
-     temperature: float,
-     top_p: float,
- ) -> Iterator[str]:
-     """
-     Builds a chat history payload and streams back the assistant response.
-     Caps history length, instantiates a fresh InferenceClient per request, and
-     ensures max_tokens is clipped to context window limits.
-     Yields streamed token deltas or an error message if the call fails.
-     """
-     # Initialize a new client for this request (avoids lock contention)
-     client = InferenceClient(base_url=ENDPOINT)
-
-     # Strip system_message once
-     sys_content = system_message.strip()
-
-     # Trim history to the most recent entries
-     trimmed = history[-MAX_HISTORY_ENTRIES:]
-
-     # Build messages list, starting with system prompt
-     messages = [{
-         "role": "system",
-         "content": sys_content if sys_content else DEFAULT_SYSTEM
-     }]
-
-     # Append trimmed, non-empty history entries
-     for usr, bot in trimmed:
-         usr_text = usr.strip()
-         bot_text = bot.strip()
-         if usr_text:
-             messages.append({"role": "user", "content": usr_text})
-         if bot_text:
-             messages.append({"role": "assistant", "content": bot_text})
-
-     # Append current user message
-     um_text = user_message.strip()
-     if um_text:
-         messages.append({"role": "user", "content": um_text})
-
-     # Clip max_tokens to fit within context
-     allowed = max(0, CONTEXT_WINDOW - RESERVED_TOKENS)
      max_tok = min(max_tokens, allowed)

-     # Stream generation without shared locks
      try:
          for chunk in client.chat_completion(
              messages=messages,
              max_tokens=max_tok,
              temperature=temperature,
@@ -77,30 +70,101 @@ def respond(
              delta = chunk.choices[0].delta.get("content", "")
              if delta:
                  yield delta
-     except HfHubHTTPError as e:
          yield f"[Error] Inference request failed: {e}"


- # Gradio Chat UI setup
-
- demo = gr.ChatInterface(
-     fn=respond,
-     title="Credit Analyst Bot",
-     description="Ask about customer credit profile changes.",
-     additional_inputs=[
-         gr.Textbox(value=DEFAULT_SYSTEM, label="System message"),
-         gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.9,
-             step=0.05,
-             label="Top-p (nucleus sampling)",
-         ),
-     ],
-     type='messages'
  )

  if __name__ == "__main__":
-     demo.launch()

+ # File: app.py
+ import os
+
+ # Attempt Gradio import; disable UI if ssl is unavailable
+ try:
+     import gradio as gr
+     import ssl  # noqa: F401
+     USE_GRADIO = True
+ except ModuleNotFoundError as e:
+     if 'ssl' in str(e):
+         USE_GRADIO = False
+         print("Warning: ssl module unavailable; Gradio UI disabled.")
+     else:
+         raise
+
  from huggingface_hub import InferenceClient
+ import requests  # for HTTP error handling

+ MODEL_ID = "Dushyant4342/ft-llama3-8b-credit-analyst"
+ ENDPOINT = f"https://api-inference.huggingface.co/models/{MODEL_ID}"
+ CONTEXT_WINDOW = 4096        # model context size
+ RESERVED_TOKENS = 512        # space reserved for generation
+ MAX_HISTORY_ENTRIES = 10     # context truncation length

  DEFAULT_SYSTEM = (
      "You are an expert credit analyst. Your role is to analyze a customer's "
      "credit data and generate a concise summary of the most important "
      "positive and negative changes."
  )

+ _client = None
+
+ def get_client():
+     """Singleton InferenceClient to reduce instantiation overhead."""
+     global _client
+     if _client is None:
+         _client = InferenceClient(base_url=ENDPOINT)
+     return _client
+
+
+ def respond(user_message, history, system_message, max_tokens, temperature, top_p):
+     client = get_client()

+     # Build system + history + user messages
+     sys_content = system_message.strip() or DEFAULT_SYSTEM
+     messages = [{"role": "system", "content": sys_content}]
+     for usr, bot in history[-MAX_HISTORY_ENTRIES:]:
+         if usr.strip(): messages.append({"role": "user", "content": usr.strip()})
+         if bot.strip(): messages.append({"role": "assistant", "content": bot.strip()})
+     if user_message.strip():
+         messages.append({"role": "user", "content": user_message.strip()})
+
+     # Token budget guard
+     allowed = CONTEXT_WINDOW - RESERVED_TOKENS
      max_tok = min(max_tokens, allowed)
+     if max_tok <= 0:
+         yield "[Error] Token budget exhausted."
+         return

+     # Stream response, catch network errors
      try:
          for chunk in client.chat_completion(
+             model=MODEL_ID,
              messages=messages,
              max_tokens=max_tok,
              temperature=temperature,

              delta = chunk.choices[0].delta.get("content", "")
              if delta:
                  yield delta
+     except requests.exceptions.RequestException as e:
          yield f"[Error] Inference request failed: {e}"

+ if USE_GRADIO:
+     demo = gr.ChatInterface(
+         fn=respond,
+         title="Credit Analyst Bot",
+         description="Ask about customer credit profile changes.",
+         additional_inputs=[
+             gr.Textbox(value=DEFAULT_SYSTEM, label="System message"),
+             gr.Slider(1, CONTEXT_WINDOW, value=512, step=1, label="Max new tokens"),
+             gr.Slider(0.1, 4.0, value=0.7, step=0.1, label="Temperature"),
+             gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p"),
+         ],
+         type="messages"
+     )
+
+     if __name__ == "__main__":
+         demo.launch()
+ else:
+     if __name__ == "__main__":
+         print("Gradio UI disabled. Use local_inference.py for direct calls.")
+
+
+ # File: local_inference.py
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+ MODEL_ID = "Dushyant4342/ft-llama3-8b-credit-analyst"

+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+ model = AutoModelForCausalLM.from_pretrained(
+     MODEL_ID,
+     torch_dtype=torch.bfloat16,
+     device_map="auto"
  )
+ model.eval()
+
+ def summarize_credit(customer_data: str, user_command: str,
+                      max_new_tokens=128, temperature=0.6, top_p=0.9):
+     """Return a concise credit summary given structured data and a user command."""
+     system_prompt = DEFAULT_SYSTEM
+     messages = [
+         {"role": "system", "content": system_prompt},
+         {"role": "user", "content": f"{user_command}\n\n--- DATA ---\n{customer_data}"}
+     ]
+     prompt = tokenizer.apply_chat_template(
+         messages, tokenize=False, add_generation_prompt=True
+     )
+     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+
+     with torch.no_grad():
+         outputs = model.generate(
+             **inputs,
+             max_new_tokens=max_new_tokens,
+             do_sample=True,
+             temperature=temperature,
+             top_p=top_p,
+             eos_token_id=tokenizer.eos_token_id,
+         )
+     # Decode only the generated part
+     gen = outputs[0][inputs["input_ids"].shape[-1]:]
+     return tokenizer.decode(gen, skip_special_tokens=True)
+
+
+ # File: tests/test_credit_analyst.py
+ import unittest
+ from local_inference import summarize_credit
+
+ class TestCreditAnalystSummarization(unittest.TestCase):
+     def test_basic_output_type(self):
+         data = (
+             "--- Credit Profile Report ---\n"
+             "Risk Score: 600 (was 650)"
+         )
+         cmd = "Summarize changes in one sentence."
+         output = summarize_credit(data, cmd, max_new_tokens=32, temperature=0.0, top_p=1.0)
+         self.assertIsInstance(output, str)
+         self.assertTrue(len(output) > 0)
+
+     def test_empty_data(self):
+         data = ""
+         cmd = "Summarize changes."
+         output = summarize_credit(data, cmd, max_new_tokens=16, temperature=0.0, top_p=1.0)
+         self.assertIsInstance(output, str)
+
+     def test_token_budget_exhaustion(self):
+         # Simulate a scenario where max_tokens <= RESERVED_TOKENS
+         # This uses the respond() logic; here we simply ensure summarize_credit doesn't error
+         data = "--- Credit Profile Report ---"
+         cmd = "Summarize."
+         # Pass a very low max_new_tokens to test generate with zero budget
+         output = summarize_credit(data, cmd, max_new_tokens=0, temperature=0.0, top_p=1.0)
+         self.assertIsInstance(output, str)

  if __name__ == "__main__":
+     unittest.main()
+
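
For reference, a minimal sketch of driving the new streaming respond() handler outside the Gradio UI. It assumes the "# File:" sections above are split into the separate modules the markers suggest (so that "from app import respond" does not also pull in the local-inference and test code) and that the Inference endpoint is reachable; the question text and sampling values are illustrative only.

# Illustrative usage sketch (assumption: app.py contains only the app section above).
from app import respond

history = []  # no prior turns
question = "Risk Score fell from 650 to 600. Summarize the key changes."

# respond() is a generator: it yields token deltas as they stream from the endpoint,
# so the full reply is assembled piece by piece.
reply = ""
for delta in respond(
    user_message=question,
    history=history,
    system_message="",   # empty string falls back to DEFAULT_SYSTEM
    max_tokens=256,
    temperature=0.7,
    top_p=0.9,
):
    reply += delta
    print(delta, end="", flush=True)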