shawno committed
Commit 9212838 · verified · 1 Parent(s): 66f4f20

Update app.py

Files changed (1)
  1. app.py +180 -67
app.py CHANGED
@@ -1,98 +1,211 @@
import os
from llama_cpp import Llama
- from sentence_transformers import SentenceTransformer
- import chromadb
- from chromadb.utils import embedding_functions
+ # from sentence_transformers import SentenceTransformer # Keep commented for now due to RAM/complexity
+ # import chromadb
+ # from chromadb.utils import embedding_functions
from fastapi import FastAPI, Query
import gradio as gr
- from functools import lru_cache

- # === Config ===
- TOKEN_LIMIT = 256
- USE_RAG = True # Toggle RAG mode
+ # === Globals ===
+ TOKEN_LIMIT = 256 # Default, overridden by slider
+
+ # --- System Message for Bella ---
+ SYSTEM_MESSAGE = """You are Bella, an expert AI assistant dedicated to supporting users across diverse domains such as coding, academic assignments (homework, computer science projects), and professional document creation. Your responses should always be accurate, comprehensive, and tailored to the user's needs, whether they are beginners or advanced learners. Prioritize clear explanations, practical advice, and step-by-step guidance to ensure user success. Do not engage in conversational filler; focus strictly on providing direct and valuable assistance."""

# === Load LLM ===
- model = Llama.from_pretrained(
-     repo_id="openbmb/MiniCPM-V-2_6-gguf",
-     filename="ggml-model-Q4_K_M.gguf",
-     n_ctx=4096,
-     n_threads=os.cpu_count(),
+ llm = None # Initialize llm to None
+ try:
+     # MiniCPM-V models are generally used with `create_chat_completion`
+     # Llama.from_pretrained automatically handles downloading the GGUF from HF Hub
+     print("Loading MiniCPM-V-2_6-gguf model...")
+     llm = Llama.from_pretrained(
+         repo_id="openbmb/MiniCPM-V-2_6-gguf",
+         filename="ggml-model-Q4_K_M.gguf",
+         n_ctx=4096,
+         n_threads=os.cpu_count(),
+         verbose=False, # Set to True for more debug output
+         # `chat_format` can sometimes be inferred from model, but explicitly setting for safety:
+         # MiniCPM-V-2_6-gguf uses a specific chat template.
+         # Check the model card or a GGUF viewer for its precise chat template.
+         # This one is a common pattern for MiniCPM:
+         chat_format="chatml" # Or "llama-2" if that's what it uses, but chatml is more common
+         # For MiniCPM specifically, it's <|im_start|>role\ncontent<|im_end|>
+         # which is a variant of ChatML. Llama.cpp handles it if metadata exists.
+     )
+     print("MiniCPM-V-2_6-gguf model loaded successfully.")
+ except Exception as e:
+     print(f"Error loading MiniCPM-V-2_6-gguf model: {e}")
+     # Consider raising an error or exiting if the model is critical
+     # sys.exit(1) # You might want to import sys for this
+
+ # === RAG Setup (Commented out for free Space compatibility and initial focus) ===
+ """
+ embedder = SentenceTransformer("all-MiniLM-L6-v2")
+ client = chromadb.PersistentClient(path="chroma_db")
+ col = client.get_or_create_collection(
+     "docs",
+     embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(
+         model_name="all-MiniLM-L6-v2"
+     )
)
+ seed_texts = [
+     "MiniCPM‑V‑2_6‑gguf runs well on CPU via llama.cpp.",
+     "This model supports RAG with Chromadb and FastAPI + Gradio UI."
+ ]
+ for t in seed_texts:
+     col.add(documents=[t], ids=[str(hash(t))])
+ """
+
+ # === Query Function (Modified to use chat_completion) ===
+ def llm_query(messages_history: list, max_tokens: int) -> str:
+     if llm is None:
+         return "Error: LLM model not loaded. Cannot generate response."
+
+     # context = "" # If RAG were active, you'd insert context here
+     # prompt = f"Context:\n{context}\n\nUser: {q}\nAssistant:" # Not needed with chat_completion

- # === Optional: RAG Setup ===
- if USE_RAG:
-     embedder = SentenceTransformer("all-MiniLM-L6-v2")
-     client = chromadb.PersistentClient(path="chroma_db")
-     col = client.get_or_create_collection(
-         "docs",
-         embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(
-             model_name="all-MiniLM-L6-v2"
+     try:
+         # Use create_chat_completion for streaming responses
+         response_generator = llm.create_chat_completion(
+             messages=messages_history, # Pass the entire prepared history
+             stream=True,
+             max_tokens=max_tokens,
+             temperature=0.7,
+             top_p=0.9,
+             # Add stop tokens if known for MiniCPM-V-2_6, e.g., for ChatML:
+             # The model's chat_format (if set correctly during Llama init) will often handle these.
+             # MiniCPM-V-2_6 uses <|im_end|>
+             stop=["<|im_end|>"]
        )
-     )

-     seed_texts = [
-         "MiniCPM‑V‑2_6‑gguf runs well on CPU via llama.cpp.",
-         "This model supports RAG with Chromadb and FastAPI + Gradio UI."
-     ]
-     for t in seed_texts:
-         try:
-             col.add(documents=[t], ids=[str(hash(t))])
-         except:
-             pass # Avoid duplicates on restart
-
- @lru_cache(maxsize=128)
- def embed_query(q: str):
-     return embedder.encode(q)
-
- # === RAG or Vanilla Query ===
- def rag_query(q: str, max_tokens: int) -> str:
-     try:
-         context = ""
-         if USE_RAG:
-             results = col.query(
-                 query_embeddings=[embed_query(q)],
-                 n_results=3
-             )
-             context = "\n".join(results["documents"][0])
-
-         prompt = f"Context:\n{context}\n\nUser: {q}\nAssistant:"
-         out = model.create_completion(prompt=prompt, max_tokens=max_tokens, temperature=0.7)
-         return out["choices"][0]["text"]
+         full_response = ""
+         for chunk in response_generator:
+             # 'delta' contains the new token
+             token = chunk["choices"][0]["delta"].get("content", "")
+             full_response += token
+             yield full_response # Yield partial response for streaming
+
    except Exception as e:
-         return f"[Error] {e}"
+         print(f"Error during LLM inference: {e}")
+         yield f"An error occurred during generation: {e}"
+

- # === FastAPI ===
+ # === FastAPI App ===
+ # Keep FastAPI part if you intend to expose an API endpoint.
+ # Note: For Gradio-only Spaces, you don't strictly need FastAPI, but it's fine to keep.
app = FastAPI()

@app.get("/ask")
- def ask(q: str = Query(...), tokens: int = Query(TOKEN_LIMIT)):
-     tokens = min(max(32, tokens), 1024)
-     return {"answer": rag_query(q, tokens)}
+ def ask_api(q: str = Query(...), tokens: int = Query(TOKEN_LIMIT)):
+     # This FastAPI endpoint will now use the chat history format internally,
+     # but for a single query it's just the system message and user message.
+     messages_for_api = [
+         {"role": "system", "content": SYSTEM_MESSAGE},
+         {"role": "user", "content": q}
+     ]
+     # For a non-streaming API, you'd run it to completion and return the final text
+     # Note: llm_query is a generator now, so you'd need to consume it for an API.
+     # For simplicity, if this API is purely for the Gradio frontend, it might not be necessary.
+     # If it is for external use and non-streaming, you'd adapt llm_query or call llm.create_chat_completion directly here.
+     try:
+         response = llm.create_chat_completion(
+             messages=messages_for_api,
+             max_tokens=tokens,
+             temperature=0.7,
+             top_p=0.9,
+             stop=["<|im_end|>"]
+         )
+         return {"answer": response["choices"][0]["message"]["content"]}
+     except Exception as e:
+         return {"error": str(e)}, 500

@app.post("/ask")
- def ask_post(body: dict):
-     return ask(q=body.get("q", ""), tokens=body.get("tokens", TOKEN_LIMIT))
+ def ask_post_api(body: dict):
+     return ask_api(q=body.get("q", ""), tokens=body.get("tokens", TOKEN_LIMIT))
+

# === Gradio UI ===
def chat_fn(message, history, max_tokens):
-     history = history or []
-     reply = rag_query(message, max_tokens)
-     new_history = history + [(message, reply)]
-     yield new_history, new_history, ""
+     # Gradio `history` for gr.Chatbot(type="messages") is already in OpenAI format:
+     # list of dictionaries like [{"role": "user", "content": "hello"}, {"role": "assistant", "content": "hi"}]
+
+     # 1. Add user message to history immediately for display
+     # This creates a new history list with the user's message, for immediate display
+     new_history = history + [{"role": "user", "content": message}]
+     yield new_history, gr.update(value="") # Clear textbox and update chatbot with user message
+
+     # 2. Prepare full message list for LLM, including the system message
+     messages_for_llm = [{"role": "system", "content": SYSTEM_MESSAGE}] + new_history
+
+     # 3. Call LLM for response (streaming)
+     full_bot_response = ""
+     for chunk in llm_query(messages_for_llm, max_tokens):
+         full_bot_response = chunk # `llm_query` now yields the full_response string
+         # Update the last assistant message in the history with the streaming content
+         # Note: new_history[-1] is the user's message. We need to add a new assistant message.
+         # This implies modifying the `new_history` list in place for streaming to work on the UI.
+         if len(new_history) > 0 and new_history[-1]["role"] == "user":
+             if len(new_history) == len(history) + 1: # First chunk after user message
+                 new_history.append({"role": "assistant", "content": full_bot_response})
+             else: # Subsequent chunks for the same assistant message
+                 new_history[-1]["content"] = full_bot_response
+         else: # Fallback if history state is unexpected (shouldn't happen with Chatbot type="messages")
+             new_history.append({"role": "assistant", "content": full_bot_response})
+
+         yield new_history, gr.update(value="") # Keep textbox cleared, update chatbot
+
+     # After generation is complete, ensure the final history state is sent
+     # (though the last yield in the loop should cover this)
+     # yield new_history, gr.update(value="") # This might be redundant but harmless
+

with gr.Blocks() as demo:
-     gr.Markdown("### 🧠 MiniCPM‑V‑2_6‑gguf RAG Chat")
+     gr.Markdown(
+         """
+         # 🧠 Bella: MiniCPM-V-2_6-gguf AI Assistant
+         Welcome! I'm Bella, designed to assist you with coding, homework, computer science projects,
+         and document writing. I provide accurate, comprehensive, and tailored guidance.
+         """
+     )

-     chatbot = gr.Chatbot(label="Bella Lite", show_copy_button=True)
-     txt = gr.Textbox(placeholder="Ask me anything...", show_label=False)
-     token_slider = gr.Slider(64, 1024, value=TOKEN_LIMIT, step=16, label="Max Tokens")
+     # Use type="messages" for OpenAI-like chat history format
+     chatbot = gr.Chatbot(
+         height=500,
+         label="Bella's Responses",
+         type="messages", # Important for the history format
+         autoscroll=True,
+         resizable=True,
+         show_copy_button=True
+     )
+
+     # Simplified input section
+     msg = gr.Textbox(placeholder="Ask Bella a question...", show_label=False, submit_btn="Ask")
+
+     token_slider = gr.Slider(64, 1024, value=256, step=16, label="Max tokens")

-     txt.submit(chat_fn, [txt, chatbot, token_slider], [chatbot, chatbot, txt])
+     # Clear button
+     clear_btn = gr.ClearButton([msg, chatbot])
+
+     # Gradio submit event for streaming.
+     # The `outputs` here are: chatbot (for history updates) and msg (to clear it).
+     msg.submit(
+         fn=chat_fn,
+         inputs=[msg, chatbot, token_slider],
+         outputs=[chatbot, msg], # Order: chatbot first for history, msg second to clear input
+         queue=True # Set to True for streaming to work correctly in Gradio
+     )

+ # When using FastAPI, Gradio is launched via FastAPI's startup event.
@app.on_event("startup")
- def startup():
+ async def startup_event(): # Use async def for FastAPI startup events
+     print("Starting Gradio app within FastAPI startup event...")
+     # This will launch Gradio within the Uvicorn server started by FastAPI
+     # The `share=True` is not needed in Hugging Face Spaces; it's handled automatically.
    demo.queue().launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)))
+     print("Gradio app launch initiated.")

if __name__ == "__main__":
    import uvicorn
-     uvicorn.run(app, host="0.0.0.0", port=7860)
+     # This block is for local testing. On Hugging Face Spaces, `app` is run by Gunicorn/Uvicorn.
+     print("Running FastAPI app locally (if not in Hugging Face Space)...")
+     uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 7860)))
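
After this change, the FastAPI GET /ask route returns a single non-streaming answer. Below is a minimal client sketch, assuming the app is reachable on port 7860 (the default in the diff) and that the `requests` package is available; the base URL, question text, and timeout are illustrative, not part of the commit.

# Hypothetical client for the GET /ask endpoint defined above.
# Assumes the app is served at http://localhost:7860; adjust for a deployed Space URL.
import requests

resp = requests.get(
    "http://localhost:7860/ask",
    params={"q": "Explain binary search in two sentences.", "tokens": 256},
    timeout=120,  # assumption: CPU inference can be slow, so use a generous timeout
)
resp.raise_for_status()
print(resp.json().get("answer"))

The POST /ask variant accepts the same fields in a JSON body ({"q": ..., "tokens": ...}) and delegates to the same handler.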
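
Inside the app, llm_query streams by yielding the accumulated text so far rather than individual deltas, so a caller outside the Gradio UI only needs to drain the generator and keep the last value. A minimal sketch under that assumption, reusing SYSTEM_MESSAGE and llm_query from app.py; the example question is illustrative.

# Minimal sketch: consume the streaming generator from llm_query outside the Gradio UI.
# llm_query yields the accumulated text so far, so the last yielded value is the full answer.
messages = [
    {"role": "system", "content": SYSTEM_MESSAGE},
    {"role": "user", "content": "Write a Python one-liner that reverses a string."},
]

final_answer = ""
for partial in llm_query(messages, max_tokens=256):
    final_answer = partial  # keep overwriting; the last chunk is the complete response

print(final_answer)

This mirrors what chat_fn does for the Gradio chatbot, where each yielded string replaces the content of the current assistant message.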