Update app.py
app.py CHANGED
@@ -1,98 +1,211 @@
 import os
 from llama_cpp import Llama
-from sentence_transformers import SentenceTransformer
-import chromadb
-from chromadb.utils import embedding_functions
+# from sentence_transformers import SentenceTransformer # Keep commented for now due to RAM/complexity
+# import chromadb
+# from chromadb.utils import embedding_functions
 from fastapi import FastAPI, Query
 import gradio as gr
-from functools import lru_cache

-# ===
-TOKEN_LIMIT = 256
-
+# === Globals ===
+TOKEN_LIMIT = 256 # Default, overridden by slider
+
+# --- System Message for Bella ---
+SYSTEM_MESSAGE = """You are Bella, an expert AI assistant dedicated to supporting users across diverse domains such as coding, academic assignments (homework, computer science projects), and professional document creation. Your responses should always be accurate, comprehensive, and tailored to the user's needs, whether they are beginners or advanced learners. Prioritize clear explanations, practical advice, and step-by-step guidance to ensure user success. Do not engage in conversational filler; focus strictly on providing direct and valuable assistance."""

 # === Load LLM ===
-
-
-
-
-
+llm = None # Initialize llm to None
+try:
+    # MiniCPM-V models are generally used with `create_chat_completion`
+    # Llama.from_pretrained automatically handles downloading the GGUF from HF Hub
+    print("Loading MiniCPM-V-2_6-gguf model...")
+    llm = Llama.from_pretrained(
+        repo_id="openbmb/MiniCPM-V-2_6-gguf",
+        filename="ggml-model-Q4_K_M.gguf",
+        n_ctx=4096,
+        n_threads=os.cpu_count(),
+        verbose=False, # Set to True for more debug output
+        # `chat_format` can sometimes be inferred from the model, but explicitly setting for safety:
+        # MiniCPM-V-2_6-gguf uses a specific chat template.
+        # Check the model card or a GGUF viewer for its precise chat template.
+        # This one is a common pattern for MiniCPM:
+        chat_format="chatml" # Or "llama-2" if that's what it uses, but chatml is more common
+        # For MiniCPM specifically, it's <|im_start|>role\ncontent<|im_end|>
+        # which is a variant of ChatML. Llama.cpp handles it if metadata exists.
+    )
+    print("MiniCPM-V-2_6-gguf model loaded successfully.")
+except Exception as e:
+    print(f"Error loading MiniCPM-V-2_6-gguf model: {e}")
+    # Consider raising an error or exiting if the model is critical
+    # sys.exit(1) # You might want to import sys for this
+
+# === RAG Setup (Commented out for free Space compatibility and initial focus) ===
+"""
+embedder = SentenceTransformer("all-MiniLM-L6-v2")
+client = chromadb.PersistentClient(path="chroma_db")
+col = client.get_or_create_collection(
+    "docs",
+    embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(
+        model_name="all-MiniLM-L6-v2"
+    )
 )
+seed_texts = [
+    "MiniCPM‑V‑2_6‑gguf runs well on CPU via llama.cpp.",
+    "This model supports RAG with Chromadb and FastAPI + Gradio UI."
+]
+for t in seed_texts:
+    col.add(documents=[t], ids=[str(hash(t))])
+"""
+
+# === Query Function (Modified to use chat_completion) ===
+def llm_query(messages_history: list, max_tokens: int) -> str:
+    if llm is None:
+        return "Error: LLM model not loaded. Cannot generate response."
+
+    # context = "" # If RAG were active, you'd insert context here
+    # prompt = f"Context:\n{context}\n\nUser: {q}\nAssistant:" # Not needed with chat_completion

-
-
-
-
-
-
-
-
+    try:
+        # Use create_chat_completion for streaming responses
+        response_generator = llm.create_chat_completion(
+            messages=messages_history, # Pass the entire prepared history
+            stream=True,
+            max_tokens=max_tokens,
+            temperature=0.7,
+            top_p=0.9,
+            # Add stop tokens if known for MiniCPM-V-2_6, e.g., for ChatML:
+            # The model's chat_format (if set correctly during Llama init) will often handle these.
+            # MiniCPM-V-2_6 uses <|im_end|>
+            stop=["<|im_end|>"]
         )
-)

-
-
-
-
-
-
-
-except:
-    pass # Avoid duplicates on restart
-
-@lru_cache(maxsize=128)
-def embed_query(q: str):
-    return embedder.encode(q)
-
-# === RAG or Vanilla Query ===
-def rag_query(q: str, max_tokens: int) -> str:
-    try:
-        context = ""
-        if USE_RAG:
-            results = col.query(
-                query_embeddings=[embed_query(q)],
-                n_results=3
-            )
-            context = "\n".join(results["documents"][0])
-
-        prompt = f"Context:\n{context}\n\nUser: {q}\nAssistant:"
-        out = model.create_completion(prompt=prompt, max_tokens=max_tokens, temperature=0.7)
-        return out["choices"][0]["text"]
+        full_response = ""
+        for chunk in response_generator:
+            # 'delta' contains the new token
+            token = chunk["choices"][0]["delta"].get("content", "")
+            full_response += token
+            yield full_response # Yield partial response for streaming
+
     except Exception as e:
-
+        print(f"Error during LLM inference: {e}")
+        yield f"An error occurred during generation: {e}"
+

-# === FastAPI ===
+# === FastAPI App ===
+# Keep FastAPI part if you intend to expose an API endpoint.
+# Note: For Gradio-only Spaces, you don't strictly need FastAPI, but it's fine to keep.
 app = FastAPI()

 @app.get("/ask")
-def
-
-
+def ask_api(q: str = Query(...), tokens: int = Query(TOKEN_LIMIT)):
+    # This FastAPI endpoint will now use the chat history format internally,
+    # but for a single query it's just the system message and user message.
+    messages_for_api = [
+        {"role": "system", "content": SYSTEM_MESSAGE},
+        {"role": "user", "content": q}
+    ]
+    # For a non-streaming API, you'd run it to completion and return the final text
+    # Note: llm_query is a generator now, so you'd need to consume it for an API.
+    # For simplicity, if this API is purely for the Gradio frontend, it might not be necessary.
+    # If it is for external use and non-streaming, you'd adapt llm_query or call llm.create_chat_completion directly here.
+    try:
+        response = llm.create_chat_completion(
+            messages=messages_for_api,
+            max_tokens=tokens,
+            temperature=0.7,
+            top_p=0.9,
+            stop=["<|im_end|>"]
+        )
+        return {"answer": response["choices"][0]["message"]["content"]}
+    except Exception as e:
+        return {"error": str(e)}, 500

 @app.post("/ask")
-def
-return
+def ask_post_api(body: dict):
+    return ask_api(q=body.get("q", ""), tokens=body.get("tokens", TOKEN_LIMIT))
+

 # === Gradio UI ===
 def chat_fn(message, history, max_tokens):
-    history =
-
-
-
+    # Gradio `history` for gr.Chatbot(type="messages") is already in OpenAI format:
+    # list of dictionaries like [{"role": "user", "content": "hello"}, {"role": "assistant", "content": "hi"}]
+
+    # 1. Add user message to history immediately for display
+    # This creates a new history list with the user's message, for immediate display
+    new_history = history + [{"role": "user", "content": message}]
+    yield new_history, gr.update(value="") # Clear textbox and update chatbot with user message
+
+    # 2. Prepare full message list for LLM, including the system message
+    messages_for_llm = [{"role": "system", "content": SYSTEM_MESSAGE}] + new_history
+
+    # 3. Call LLM for response (streaming)
+    full_bot_response = ""
+    for chunk in llm_query(messages_for_llm, max_tokens):
+        full_bot_response = chunk # `llm_query` now yields the full_response string
+        # Update the last assistant message in the history with the streaming content
+        # Note: new_history[-1] is the user's message. We need to add a new assistant message.
+        # This implies modifying the `new_history` list in place for streaming to work on the UI.
+        if len(new_history) > 0 and new_history[-1]["role"] == "user":
+            if len(new_history) == len(history) + 1: # First chunk after user message
+                new_history.append({"role": "assistant", "content": full_bot_response})
+            else: # Subsequent chunks for the same assistant message
+                new_history[-1]["content"] = full_bot_response
+        else: # Fallback if history state is unexpected (shouldn't happen with Chatbot type="messages")
+            new_history.append({"role": "assistant", "content": full_bot_response})
+
+        yield new_history, gr.update(value="") # Keep textbox cleared, update chatbot
+
+    # After generation is complete, ensure the final history state is sent
+    # (though the last yield in the loop should cover this)
+    # yield new_history, gr.update(value="") # This might be redundant but harmless
+

 with gr.Blocks() as demo:
-    gr.Markdown(
+    gr.Markdown(
+        """
+        # 🧠 Bella: MiniCPM-V-2_6-gguf AI Assistant
+        Welcome! I'm Bella, designed to assist you with coding, homework, computer science projects,
+        and document writing. I provide accurate, comprehensive, and tailored guidance.
+        """
+    )

-
-
-
+    # Use type="messages" for OpenAI-like chat history format
+    chatbot = gr.Chatbot(
+        height=500,
+        label="Bella's Responses",
+        type="messages", # Important for the history format
+        autoscroll=True,
+        resizable=True,
+        show_copy_button=True
+    )
+
+    # Simplified input section
+    msg = gr.Textbox(placeholder="Ask Bella a question...", show_label=False, submit_btn="Ask")
+
+    token_slider = gr.Slider(64, 1024, value=256, step=16, label="Max tokens")

-
+    # Clear button
+    clear_btn = gr.ClearButton([msg, chatbot])
+
+    # Gradio submit event for streaming.
+    # The `outputs` here are: chatbot (for history updates) and msg (to clear it).
+    msg.submit(
+        fn=chat_fn,
+        inputs=[msg, chatbot, token_slider],
+        outputs=[chatbot, msg], # Order: chatbot first for history, msg second to clear input
+        queue=True # Set to True for streaming to work correctly in Gradio
+    )

+# When using FastAPI, Gradio is launched via FastAPI's startup event.
 @app.on_event("startup")
-def
+async def startup_event(): # Use async def for FastAPI startup events
+    print("Starting Gradio app within FastAPI startup event...")
+    # This will launch Gradio within the Uvicorn server started by FastAPI
+    # The `share=True` is not needed in Hugging Face Spaces; it's handled automatically.
     demo.queue().launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)))
+    print("Gradio app launch initiated.")

 if __name__ == "__main__":
     import uvicorn
-
+    # This block is for local testing. On Hugging Face Spaces, `app` is run by Gunicorn/Uvicorn.
+    print("Running FastAPI app locally (if not in Hugging Face Space)...")
+    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 7860)))
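For quick manual testing of the `/ask` endpoints introduced in this commit, a small client script along the following lines should work. This is a minimal sketch, assuming the FastAPI app is reachable at `http://localhost:7860` (the default port used above); the `BASE_URL` and the example questions are placeholders, not part of the commit.

```python
# Minimal client sketch for the /ask endpoints defined in app.py above.
# Assumes the app is reachable at http://localhost:7860; adjust BASE_URL for a deployed Space.
import requests

BASE_URL = "http://localhost:7860"

# GET /ask with query parameters (handled by ask_api)
r = requests.get(f"{BASE_URL}/ask", params={"q": "Explain binary search in Python.", "tokens": 256})
print(r.json())  # Expected shape on success: {"answer": "..."}

# POST /ask with a JSON body (handled by ask_post_api)
r = requests.post(f"{BASE_URL}/ask", json={"q": "Summarize what a GGUF file is.", "tokens": 128})
print(r.json())
```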