Spaces:

shawno
/

Bella

Sleeping

App Files Community

shawno committed on Jun 22

Commit

149342d

verified ·

1 Parent(s): c1c816a

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -93

app.py CHANGED Viewed

@@ -1,8 +1,5 @@
 import os
 from llama_cpp import Llama
-# from sentence_transformers import SentenceTransformer # Keep commented for now due to RAM/complexity
-# import chromadb
-# from chromadb.utils import embedding_functions
 from fastapi import FastAPI, Query
 import gradio as gr
@@ -15,76 +12,48 @@ SYSTEM_MESSAGE = """You are Bella, an expert AI assistant dedicated to supportin
 # === Load LLM ===
 llm = None # Initialize llm to None
 try:
-    # MiniCPM-V models are generally used with `create_chat_completion`
-    # Llama.from_pretrained automatically handles downloading the GGUF from HF Hub
     print("Loading MiniCPM-V-2_6-gguf model...")
     llm = Llama.from_pretrained(
         repo_id="openbmb/MiniCPM-V-2_6-gguf",
         filename="ggml-model-Q4_K_M.gguf",
         n_ctx=4096,
         n_threads=os.cpu_count(),
-        n_batch=512,
-        verbose=False, # Set to True for more debug output
-        # `chat_format` can sometimes be inferred from model, but explicitly setting for safety:
-        # MiniCPM-V-2_6-gguf uses a specific chat template.
-        # Check the model card or a GGUF viewer for its precise chat template.
-        # This one is a common pattern for MiniCPM:
-        chat_format="chatml" # Or "llama-2" if that's what it uses, but chatml is more common
-                             # For MiniCPM specifically, it's <|im_start|>role\ncontent<|im_end|>
-                             # which is a variant of ChatML. Llama.cpp handles it if metadata exists.
     )
     print("MiniCPM-V-2_6-gguf model loaded successfully.")
 except Exception as e:
     print(f"Error loading MiniCPM-V-2_6-gguf model: {e}")
-    # Consider raising an error or exiting if the model is critical
-    # sys.exit(1) # You might want to import sys for this
-# === RAG Setup (Commented out for free Space compatibility and initial focus) ===
-"""
-embedder = SentenceTransformer("all-MiniLM-L6-v2")
-client = chromadb.PersistentClient(path="chroma_db")
-col = client.get_or_create_collection(
-    "docs",
-    embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(
-        model_name="all-MiniLM-L6-v2"
-    )
-)
-seed_texts = [
-    "MiniCPM‑V‑2_6‑gguf runs well on CPU via llama.cpp.",
-    "This model supports RAG with Chromadb and FastAPI + Gradio UI."
-]
-for t in seed_texts:
-    col.add(documents=[t], ids=[str(hash(t))])
-"""
-# === Query Function (Modified to use chat_completion) ===
 def llm_query(messages_history: list, max_tokens: int) -> str:
     if llm is None:
-        return "Error: LLM model not loaded. Cannot generate response."
-    # context = "" # If RAG were active, you'd insert context here
-    # prompt = f"Context:\n{context}\n\nUser: {q}\nAssistant:" # Not needed with chat_completion
     try:
-        # Use create_chat_completion for streaming responses
         response_generator = llm.create_chat_completion(
-            messages=messages_history, # Pass the entire prepared history
             stream=True,
             max_tokens=max_tokens,
             temperature=0.7,
             top_p=0.9,
-            # Add stop tokens if known for MiniCPM-V-2_6, e.g., for ChatML:
-            # The model's chat_format (if set correctly during Llama init) will often handle these.
-            # MiniCPM-V-2_6 uses <|im_end|>
-            stop=["<|im_end|>"]
         )
         full_response = ""
         for chunk in response_generator:
-            # 'delta' contains the new token
             token = chunk["choices"][0]["delta"].get("content", "")
             full_response += token
-            yield full_response # Yield partial response for streaming
     except Exception as e:
         print(f"Error during LLM inference: {e}")
@@ -92,29 +61,23 @@ def llm_query(messages_history: list, max_tokens: int) -> str:
 # === FastAPI App ===
-# Keep FastAPI part if you intend to expose an API endpoint.
-# Note: For Gradio-only Spaces, you don't strictly need FastAPI, but it's fine to keep.
 app = FastAPI()
 @app.get("/ask")
 def ask_api(q: str = Query(...), tokens: int = Query(TOKEN_LIMIT)):
-    # This FastAPI endpoint will now use the chat history format internally,
-    # but for a single query it's just the system message and user message.
     messages_for_api = [
         {"role": "system", "content": SYSTEM_MESSAGE},
         {"role": "user", "content": q}
     ]
-    # For a non-streaming API, you'd run it to completion and return the final text
-    # Note: llm_query is a generator now, so you'd need to consume it for an API.
-    # For simplicity, if this API is purely for the Gradio frontend, it might not be necessary.
-    # If it is for external use and non-streaming, you'd adapt llm_query or call llm.create_chat_completion directly here.
     try:
         response = llm.create_chat_completion(
             messages=messages_for_api,
             max_tokens=tokens,
             temperature=0.7,
             top_p=0.9,
-            stop=["<|im_end|>"]
         )
         return {"answer": response["choices"][0]["message"]["content"]}
     except Exception as e:
@@ -127,37 +90,20 @@ def ask_post_api(body: dict):
 # === Gradio UI ===
 def chat_fn(message, history, max_tokens):
-    # Gradio `history` for gr.Chatbot(type="messages") is already in OpenAI format:
-    # list of dictionaries like [{"role": "user", "content": "hello"}, {"role": "assistant", "content": "hi"}]
-    # 1. Add user message to history immediately for display
-    # This creates a new history list with the user's message, for immediate display
     new_history = history + [{"role": "user", "content": message}]
-    yield new_history, gr.update(value="") # Clear textbox and update chatbot with user message
-    # 2. Prepare full message list for LLM, including the system message
     messages_for_llm = [{"role": "system", "content": SYSTEM_MESSAGE}] + new_history
-    # 3. Call LLM for response (streaming)
     full_bot_response = ""
     for chunk in llm_query(messages_for_llm, max_tokens):
-        full_bot_response = chunk # `llm_query` now yields the full_response string
-        # Update the last assistant message in the history with the streaming content
-        # Note: new_history[-1] is the user's message. We need to add a new assistant message.
-        # This implies modifying the `new_history` list in place for streaming to work on the UI.
-        if len(new_history) > 0 and new_history[-1]["role"] == "user":
-            if len(new_history) == len(history) + 1: # First chunk after user message
-                new_history.append({"role": "assistant", "content": full_bot_response})
-            else: # Subsequent chunks for the same assistant message
-                new_history[-1]["content"] = full_bot_response
-        else: # Fallback if history state is unexpected (shouldn't happen with Chatbot type="messages")
             new_history.append({"role": "assistant", "content": full_bot_response})
-        yield new_history, gr.update(value="") # Keep textbox cleared, update chatbot
-    # After generation is complete, ensure the final history state is sent
-    # (though the last yield in the loop should cover this)
-    # yield new_history, gr.update(value="") # This might be redundant but harmless
 with gr.Blocks() as demo:
@@ -169,44 +115,35 @@ with gr.Blocks() as demo:
         """
     )
-    # Use type="messages" for OpenAI-like chat history format
     chatbot = gr.Chatbot(
         height=500,
         label="Bella's Responses",
-        type="messages", # Important for the history format
         autoscroll=True,
         resizable=True,
         show_copy_button=True
     )
-    # Simplified input section
     msg = gr.Textbox(placeholder="Ask Bella a question...", show_label=False, submit_btn="Ask")
     token_slider = gr.Slider(64, 1024, value=256, step=16, label="Max tokens")
-    # Clear button
     clear_btn = gr.ClearButton([msg, chatbot])
-    # Gradio submit event for streaming.
-    # The `outputs` here are: chatbot (for history updates) and msg (to clear it).
     msg.submit(
         fn=chat_fn,
         inputs=[msg, chatbot, token_slider],
-        outputs=[chatbot, msg], # Order: chatbot first for history, msg second to clear input
-        queue=True # Set to True for streaming to work correctly in Gradio
     )
-# When using FastAPI, Gradio is launched via FastAPI's startup event.
 @app.on_event("startup")
-async def startup_event(): # Use async def for FastAPI startup events
     print("Starting Gradio app within FastAPI startup event...")
-    # This will launch Gradio within the Uvicorn server started by FastAPI
-    # The `share=True` is not needed in Hugging Face Spaces; it's handled automatically.
     demo.queue().launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)))
     print("Gradio app launch initiated.")
 if __name__ == "__main__":
     import uvicorn
-    # This block is for local testing. On Hugging Face Spaces, `app` is run by Gunicorn/Uvicorn.
     print("Running FastAPI app locally (if not in Hugging Face Space)...")
     uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 7860)))

 import os
 from llama_cpp import Llama
 from fastapi import FastAPI, Query
 import gradio as gr
 # === Load LLM ===
 llm = None # Initialize llm to None
 try:
     print("Loading MiniCPM-V-2_6-gguf model...")
     llm = Llama.from_pretrained(
         repo_id="openbmb/MiniCPM-V-2_6-gguf",
         filename="ggml-model-Q4_K_M.gguf",
         n_ctx=4096,
         n_threads=os.cpu_count(),
+        n_batch=512, # Increased batch size for prompt processing
+        n_gpu_layers=0, # Ensure this is 0 for CPU-only inference on free tier
+        verbose=False,
     )
     print("MiniCPM-V-2_6-gguf model loaded successfully.")
 except Exception as e:
     print(f"Error loading MiniCPM-V-2_6-gguf model: {e}")
+    # Consider more robust error handling for production
+    # e.g., setting a flag and displaying an error message in the UI
+# === Query Function (Modified for better repetition control) ===
 def llm_query(messages_history: list, max_tokens: int) -> str:
     if llm is None:
+        yield "Error: LLM model not loaded. Cannot generate response."
+        return
     try:
+        common_stop_tokens = ["<|im_end|>", "</s>", "<|end_of_text|>"]
         response_generator = llm.create_chat_completion(
+            messages=messages_history,
             stream=True,
             max_tokens=max_tokens,
             temperature=0.7,
             top_p=0.9,
+            repeat_penalty=1.1,
+            #repeat_last_n=256, # <--- NEW/MODIFIED: Increase the window for repetition penalty
+            stop=common_stop_tokens
         )
         full_response = ""
         for chunk in response_generator:
             token = chunk["choices"][0]["delta"].get("content", "")
             full_response += token
+            yield full_response
     except Exception as e:
         print(f"Error during LLM inference: {e}")
 # === FastAPI App ===
 app = FastAPI()
 @app.get("/ask")
 def ask_api(q: str = Query(...), tokens: int = Query(TOKEN_LIMIT)):
     messages_for_api = [
         {"role": "system", "content": SYSTEM_MESSAGE},
         {"role": "user", "content": q}
     ]
     try:
         response = llm.create_chat_completion(
             messages=messages_for_api,
             max_tokens=tokens,
             temperature=0.7,
             top_p=0.9,
+            repeat_penalty=1.1,
+            repeat_last_n=256, # <--- NEW/MODIFIED: Apply here as well
+            stop=["<|im_end|>", "</s>", "<|end_of_text|>"]
         )
         return {"answer": response["choices"][0]["message"]["content"]}
     except Exception as e:
 # === Gradio UI ===
 def chat_fn(message, history, max_tokens):
     new_history = history + [{"role": "user", "content": message}]
+    yield new_history, gr.update(value="")
     messages_for_llm = [{"role": "system", "content": SYSTEM_MESSAGE}] + new_history
     full_bot_response = ""
     for chunk in llm_query(messages_for_llm, max_tokens):
+        full_bot_response = chunk
+        if len(new_history) > 0 and new_history[-1]["role"] == "assistant":
+            new_history[-1]["content"] = full_bot_response
+        else:
             new_history.append({"role": "assistant", "content": full_bot_response})
+        yield new_history, gr.update(value="")
 with gr.Blocks() as demo:
         """
     )
     chatbot = gr.Chatbot(
         height=500,
         label="Bella's Responses",
+        type="messages",
         autoscroll=True,
         resizable=True,
         show_copy_button=True
     )
     msg = gr.Textbox(placeholder="Ask Bella a question...", show_label=False, submit_btn="Ask")
     token_slider = gr.Slider(64, 1024, value=256, step=16, label="Max tokens")
     clear_btn = gr.ClearButton([msg, chatbot])
     msg.submit(
         fn=chat_fn,
         inputs=[msg, chatbot, token_slider],
+        outputs=[chatbot, msg],
+        queue=True
     )
 @app.on_event("startup")
+async def startup_event():
     print("Starting Gradio app within FastAPI startup event...")
     demo.queue().launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)))
     print("Gradio app launch initiated.")
 if __name__ == "__main__":
     import uvicorn
     print("Running FastAPI app locally (if not in Hugging Face Space)...")
     uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 7860)))