import os
from llama_cpp import Llama
from fastapi import FastAPI, HTTPException, Query
import gradio as gr
# === Globals ===
TOKEN_LIMIT = 256  # Default max tokens for the /ask API; the Gradio slider sets its own value per request
# --- System Message for Bella ---
SYSTEM_MESSAGE = """You are Bella, an expert AI assistant dedicated to supporting users across diverse domains such as coding, academic assignments (homework, computer science projects), and professional document creation. Your responses should always be accurate, comprehensive, and tailored to the user's needs, whether they are beginners or advanced learners. Prioritize clear explanations, practical advice, and step-by-step guidance to ensure user success. Do not engage in conversational filler; focus strictly on providing direct and valuable assistance."""
# === Load LLM ===
llm = None  # Initialize llm to None
try:
    print("Loading MiniCPM-V-2_6-gguf model...")
    llm = Llama.from_pretrained(
        repo_id="openbmb/MiniCPM-V-2_6-gguf",
        filename="ggml-model-Q4_K_M.gguf",
        n_ctx=4096,
        n_threads=os.cpu_count(),
        n_batch=512,  # Increased batch size for prompt processing
        n_gpu_layers=0,  # Ensure this is 0 for CPU-only inference on the free tier
        verbose=False,
    )
    print("MiniCPM-V-2_6-gguf model loaded successfully.")
except Exception as e:
    print(f"Error loading MiniCPM-V-2_6-gguf model: {e}")
    # Consider more robust error handling for production,
    # e.g. setting a flag and displaying an error message in the UI.
# === Query Function ===
def llm_query(messages_history: list, max_tokens: int):
    """Stream a chat completion, yielding the accumulated response text after each token."""
    if llm is None:
        yield "Error: LLM model not loaded. Cannot generate response."
        return
    try:
        common_stop_tokens = ["<|im_end|>", "</s>", "<|end_of_text|>"]
        response_generator = llm.create_chat_completion(
            messages=messages_history,
            stream=True,
            max_tokens=max_tokens,
            temperature=0.7,
            top_p=0.9,
            repeat_penalty=1.1,
            # repeat_last_n is not an accepted create_chat_completion argument and is
            # left disabled; the repetition-penalty window is a Llama() constructor
            # setting (last_n_tokens_size).
            stop=common_stop_tokens,
        )
        full_response = ""
        for chunk in response_generator:
            token = chunk["choices"][0]["delta"].get("content", "")
            full_response += token
            yield full_response
    except Exception as e:
        print(f"Error during LLM inference: {e}")
        yield f"An error occurred during generation: {e}"
# === FastAPI App ===
app = FastAPI()

@app.get("/ask")
def ask_api(q: str = Query(...), tokens: int = Query(TOKEN_LIMIT)):
    if llm is None:
        raise HTTPException(status_code=503, detail="LLM model not loaded.")
    messages_for_api = [
        {"role": "system", "content": SYSTEM_MESSAGE},
        {"role": "user", "content": q},
    ]
    try:
        response = llm.create_chat_completion(
            messages=messages_for_api,
            max_tokens=tokens,
            temperature=0.7,
            top_p=0.9,
            repeat_penalty=1.1,
            stop=["<|im_end|>", "</s>", "<|end_of_text|>"],
        )
        return {"answer": response["choices"][0]["message"]["content"]}
    except Exception as e:
        # Returning a (dict, int) tuple does not set the status code in FastAPI;
        # raise an HTTPException instead.
        raise HTTPException(status_code=500, detail=str(e))

@app.post("/ask")
def ask_post_api(body: dict):
    return ask_api(q=body.get("q", ""), tokens=body.get("tokens", TOKEN_LIMIT))
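
# Example requests for local testing (assuming the server runs on the default port 7860):
#   curl "http://localhost:7860/ask?q=Explain+binary+search&tokens=128"
#   curl -X POST "http://localhost:7860/ask" \
#        -H "Content-Type: application/json" \
#        -d '{"q": "Explain binary search", "tokens": 128}'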
# === Gradio UI ===
def chat_fn(message, history, max_tokens):
    new_history = history + [{"role": "user", "content": message}]
    yield new_history, gr.update(value="")
    messages_for_llm = [{"role": "system", "content": SYSTEM_MESSAGE}] + new_history
    full_bot_response = ""
    for chunk in llm_query(messages_for_llm, max_tokens):
        full_bot_response = chunk
        if len(new_history) > 0 and new_history[-1]["role"] == "assistant":
            new_history[-1]["content"] = full_bot_response
        else:
            new_history.append({"role": "assistant", "content": full_bot_response})
        yield new_history, gr.update(value="")
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # 🧠 Bella: MiniCPM-V-2_6-gguf AI Assistant
        Welcome! I'm Bella, designed to assist you with coding, homework, computer science projects,
        and document writing. I provide accurate, comprehensive, and tailored guidance.
        """
    )
    chatbot = gr.Chatbot(
        height=500,
        label="Bella's Responses",
        type="messages",
        autoscroll=True,
        resizable=True,
        show_copy_button=True,
    )
    msg = gr.Textbox(placeholder="Ask Bella a question...", show_label=False, submit_btn="Ask")
    token_slider = gr.Slider(64, 1024, value=256, step=16, label="Max tokens")
    clear_btn = gr.ClearButton([msg, chatbot])

    msg.submit(
        fn=chat_fn,
        inputs=[msg, chatbot, token_slider],
        outputs=[chatbot, msg],
        queue=True,
    )
@app.on_event("startup")
async def startup_event():
print("Starting Gradio app within FastAPI startup event...")
demo.queue().launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)))
print("Gradio app launch initiated.")
if __name__ == "__main__":
import uvicorn
print("Running FastAPI app locally (if not in Hugging Face Space)...")
uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 7860))) |
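
# To run locally (assuming this file is saved as app.py):
#   python app.py
# or, equivalently:
#   uvicorn app:app --host 0.0.0.0 --port 7860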