shawno committed
Commit 584ecec · verified · 1 Parent(s): f16b617

Update app.py

Files changed (1):
  1. app.py  +32 -21
app.py CHANGED
@@ -6,13 +6,17 @@ from chromadb.utils import embedding_functions
 from fastapi import FastAPI, Query
 import gradio as gr
 
-# 1. Load model via llama-cpp-python
+# === Globals ===
+TOKEN_LIMIT = 256 # Default, overridden by slider
+
+# === Load LLM ===
 model = Llama.from_pretrained(
     repo_id="openbmb/MiniCPM-V-2_6-gguf",
-    filename="ggml-model-Q4_K_M.gguf", # Choose a specific GGUF file
+    filename="ggml-model-Q4_K_M.gguf",
     n_ctx=4096,
 )
-# 2. Setup RAG
+
+# === RAG Setup ===
 embedder = SentenceTransformer("all-MiniLM-L6-v2")
 client = chromadb.PersistentClient(path="chroma_db")
 col = client.get_or_create_collection(
@@ -21,7 +25,6 @@ col = client.get_or_create_collection(
         model_name="all-MiniLM-L6-v2"
     )
 )
-# Seed with example context
 seed_texts = [
     "MiniCPM‑V‑2_6‑gguf runs well on CPU via llama.cpp.",
     "This model supports RAG with Chromadb and FastAPI + Gradio UI."
@@ -29,44 +32,52 @@ seed_texts = [
 for t in seed_texts:
     col.add(documents=[t], ids=[str(hash(t))])
 
-def rag_query(q: str) -> str:
+# === Query Function ===
+def rag_query(q: str, max_tokens: int) -> str:
     results = col.query(
         query_embeddings=[embedder.encode(q)],
         n_results=3
     )
     context = "\n".join(results["documents"][0])
     prompt = f"Context:\n{context}\n\nUser: {q}\nAssistant:"
-    out = model.create_completion(prompt=prompt, max_tokens=256, temperature=0.7)
+    out = model.create_completion(prompt=prompt, max_tokens=max_tokens, temperature=0.7)
     return out["choices"][0]["text"]
 
-# 3. FastAPI app
+# === FastAPI App ===
 app = FastAPI()
 
 @app.get("/ask")
-def ask(q: str = Query(...)):
-    return {"answer": rag_query(q)}
+def ask(q: str = Query(...), tokens: int = Query(TOKEN_LIMIT)):
+    return {"answer": rag_query(q, tokens)}
 
 @app.post("/ask")
 def ask_post(body: dict):
-    return ask(q=body.get("q",""))
+    return ask(q=body.get("q", ""), tokens=body.get("tokens", TOKEN_LIMIT))
 
-# 4. Gradio UI
-def chat_fn(message, history):
-    reply = rag_query(message)
+# === Gradio UI ===
+def chat_fn(message, history, max_tokens):
+    reply = rag_query(message, max_tokens)
     history = history or []
-    history.append(("User", message))
-    history.append(("Assistant", reply))
-    return history, history
+    history.append((f"🧑 You", message))
+    history.append((f"🤖 Bot", reply))
+    return history, history, ""
 
 with gr.Blocks() as demo:
-    chatbot = gr.Chatbot()
-    txt = gr.Textbox(placeholder="Ask me...", show_label=False)
-    txt.submit(chat_fn, [txt, chatbot], [chatbot, chatbot])
-    gr.Markdown("### 🧠 MiniCPM‑V‑2_6‑gguf RAG Chat (GET & POST API support)")
+    gr.Markdown("### 🧠 MiniCPM‑V‑2_6‑gguf RAG Chat")
+
+    chatbot = gr.Chatbot(label="Chat", bubble_full_width=False)
+    with gr.Row():
+        txt = gr.Textbox(placeholder="Ask me...", show_label=False, scale=8)
+        send_btn = gr.Button("Send", scale=1)
+
+    token_slider = gr.Slider(64, 1024, value=256, step=16, label="Max tokens")
+
+    txt.submit(chat_fn, [txt, chatbot, token_slider], [chatbot, chatbot, txt])
+    send_btn.click(chat_fn, [txt, chatbot, token_slider], [chatbot, chatbot, txt])
 
 @app.on_event("startup")
 def startup():
-    demo.queue().launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT",7860)))
+    demo.queue().launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)))
 
 if __name__ == "__main__":
     import uvicorn
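
The net API change in this commit is an optional `tokens` parameter on both /ask routes (query string for GET, JSON body for POST), falling back to TOKEN_LIMIT (256), plus a "Max tokens" slider wired into the Gradio chat. A minimal client sketch against the updated endpoints, assuming the FastAPI app is served locally at http://localhost:8000 (the real host and port depend on the uvicorn call in the truncated tail of app.py):

import requests

BASE = "http://localhost:8000"  # assumed host/port, not part of this commit

# GET /ask: q is required, tokens is optional and defaults to TOKEN_LIMIT
resp = requests.get(f"{BASE}/ask", params={"q": "What runs well on CPU?", "tokens": 128})
print(resp.json()["answer"])

# POST /ask: the same fields go in a JSON body
resp = requests.post(f"{BASE}/ask", json={"q": "What runs well on CPU?", "tokens": 512})
print(resp.json()["answer"])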