Spaces:

shawno
/

Bella

Sleeping

App Files Community

shawno committed on Jun 20

Commit

66f4f20

verified ·

1 Parent(s): 205abbc

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -53

app.py CHANGED Viewed

@@ -5,9 +5,11 @@ import chromadb
 from chromadb.utils import embedding_functions
 from fastapi import FastAPI, Query
 import gradio as gr
-# === Globals ===
-TOKEN_LIMIT = 256  # Default, overridden by slider
 # === Load LLM ===
 model = Llama.from_pretrained(
@@ -17,39 +19,54 @@ model = Llama.from_pretrained(
     n_threads=os.cpu_count(),
 )
-# === RAG Setup ===
-"""embedder = SentenceTransformer("all-MiniLM-L6-v2")
-client = chromadb.PersistentClient(path="chroma_db")
-col = client.get_or_create_collection(
-    "docs",
-    embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(
-        model_name="all-MiniLM-L6-v2"
     )
-)
-seed_texts = [
-    "MiniCPM‑V‑2_6‑gguf runs well on CPU via llama.cpp.",
-    "This model supports RAG with Chromadb and FastAPI + Gradio UI."
-]
-for t in seed_texts:
-    col.add(documents=[t], ids=[str(hash(t))])"""
-# === Query Function ===
 def rag_query(q: str, max_tokens: int) -> str:
-    """results = col.query(
-        query_embeddings=[embedder.encode(q)],
-        n_results=3
-    )"""
-    #context = "\n".join(results["documents"][0])
-    context=""
-    prompt = f"Context:\n{context}\n\nUser: {q}\nAssistant:"
-    out = model.create_completion(prompt=prompt, max_tokens=max_tokens, temperature=0.7)
-    return out["choices"][0]["text"]
-# === FastAPI App ===
 app = FastAPI()
 @app.get("/ask")
 def ask(q: str = Query(...), tokens: int = Query(TOKEN_LIMIT)):
     return {"answer": rag_query(q, tokens)}
 @app.post("/ask")
@@ -59,36 +76,18 @@ def ask_post(body: dict):
 # === Gradio UI ===
 def chat_fn(message, history, max_tokens):
     history = history or []
-    """history.append(gr.ChatMessage(role="user",
-                    content=message))"""
-    new_history = history + [gr.ChatMessage(role="user", content=message)]
-    yield new_history, new_history, ""  # Show user's message immediately
-    #reply = rag_query(message, max_tokens)
-    new_history.append(gr.ChatMessage(role="assistant", content="reply"))
     yield new_history, new_history, ""
-    """history.append(gr.ChatMessage(role="assistant",
-                    content=reply))"""
-    #history.append((f"🧑 You", message))
-    #history.append((f"🤖 Bot", reply))
-    #return history, history, ""
 with gr.Blocks() as demo:
     gr.Markdown("### 🧠 MiniCPM‑V‑2_6‑gguf RAG Chat")
-    chatbot = gr.Chatbot(type="messages", label="Bella Lite", autoscroll=True, resizable=True, show_copy_button=True)
-    """with gr.Row():
-        txt = gr.Textbox(placeholder="Ask me...", show_label=False, scale=8)
-        send_btn = gr.Button("Send", scale=1)"""
-    txt = gr.Textbox(placeholder="Ask me...", show_label=False, submit_btn="Ask")
-    token_slider = gr.Slider(64, 1024, value=256, step=16, label="Max tokens")
     txt.submit(chat_fn, [txt, chatbot, token_slider], [chatbot, chatbot, txt])
-    #send_btn.click(chat_fn, [txt, chatbot, token_slider], [chatbot, chatbot, txt])
 @app.on_event("startup")
 def startup():
@@ -96,4 +95,4 @@ def startup():
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=7860)

 from chromadb.utils import embedding_functions
 from fastapi import FastAPI, Query
 import gradio as gr
+from functools import lru_cache
+# === Config ===
+TOKEN_LIMIT = 256
+USE_RAG = True  # Toggle RAG mode
 # === Load LLM ===
 model = Llama.from_pretrained(
     n_threads=os.cpu_count(),
 )
+# === Optional: RAG Setup ===
+if USE_RAG:
+    embedder = SentenceTransformer("all-MiniLM-L6-v2")
+    client = chromadb.PersistentClient(path="chroma_db")
+    col = client.get_or_create_collection(
+        "docs",
+        embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(
+            model_name="all-MiniLM-L6-v2"
+        )
     )
+    seed_texts = [
+        "MiniCPM‑V‑2_6‑gguf runs well on CPU via llama.cpp.",
+        "This model supports RAG with Chromadb and FastAPI + Gradio UI."
+    ]
+    for t in seed_texts:
+        try:
+            col.add(documents=[t], ids=[str(hash(t))])
+        except:
+            pass  # Avoid duplicates on restart
+    @lru_cache(maxsize=128)
+    def embed_query(q: str):
+        return embedder.encode(q)
+# === RAG or Vanilla Query ===
 def rag_query(q: str, max_tokens: int) -> str:
+    try:
+        context = ""
+        if USE_RAG:
+            results = col.query(
+                query_embeddings=[embed_query(q)],
+                n_results=3
+            )
+            context = "\n".join(results["documents"][0])
+        prompt = f"Context:\n{context}\n\nUser: {q}\nAssistant:"
+        out = model.create_completion(prompt=prompt, max_tokens=max_tokens, temperature=0.7)
+        return out["choices"][0]["text"]
+    except Exception as e:
+        return f"[Error] {e}"
+# === FastAPI ===
 app = FastAPI()
 @app.get("/ask")
 def ask(q: str = Query(...), tokens: int = Query(TOKEN_LIMIT)):
+    tokens = min(max(32, tokens), 1024)
     return {"answer": rag_query(q, tokens)}
 @app.post("/ask")
 # === Gradio UI ===
 def chat_fn(message, history, max_tokens):
     history = history or []
+    reply = rag_query(message, max_tokens)
+    new_history = history + [(message, reply)]
     yield new_history, new_history, ""
 with gr.Blocks() as demo:
     gr.Markdown("### 🧠 MiniCPM‑V‑2_6‑gguf RAG Chat")
+    chatbot = gr.Chatbot(label="Bella Lite", show_copy_button=True)
+    txt = gr.Textbox(placeholder="Ask me anything...", show_label=False)
+    token_slider = gr.Slider(64, 1024, value=TOKEN_LIMIT, step=16, label="Max Tokens")
     txt.submit(chat_fn, [txt, chatbot, token_slider], [chatbot, chatbot, txt])
 @app.on_event("startup")
 def startup():
 if __name__ == "__main__":
     import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=7860)