import os
from typing import Iterator

from llama_cpp import Llama
from fastapi import FastAPI, HTTPException, Query
import gradio as gr

# === Globals ===
TOKEN_LIMIT = 256  # Default max new tokens; overridden by the UI slider or the API `tokens` parameter

# --- System Message for Bella ---
SYSTEM_MESSAGE = """You are Bella, an expert AI assistant dedicated to supporting users across diverse domains such as coding, academic assignments (homework, computer science projects), and professional document creation. Your responses should always be accurate, comprehensive, and tailored to the user's needs, whether they are beginners or advanced learners. Prioritize clear explanations, practical advice, and step-by-step guidance to ensure user success. Do not engage in conversational filler; focus strictly on providing direct and valuable assistance."""

# === Load LLM ===
llm = None # Initialize llm to None
try:
    print("Loading MiniCPM-V-2_6-gguf model...")
    llm = Llama.from_pretrained(
        repo_id="openbmb/MiniCPM-V-2_6-gguf",
        filename="ggml-model-Q4_K_M.gguf",
        n_ctx=4096,
        n_threads=os.cpu_count(),
        n_batch=512,  # larger batch size speeds up prompt processing
        n_gpu_layers=0,  # keep at 0 for CPU-only inference on the free tier
        last_n_tokens_size=256,  # repetition-penalty lookback window; llama-cpp-python
                                 # takes this at load time, not per chat-completion call
        verbose=False,
    )
    print("MiniCPM-V-2_6-gguf model loaded successfully.")
except Exception as e:
    print(f"Error loading MiniCPM-V-2_6-gguf model: {e}")
    # Consider more robust error handling for production
    # e.g., setting a flag and displaying an error message in the UI
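
# Optional sanity check: a minimal sketch, guarded by a hypothetical BELLA_SELF_TEST
# environment variable (not part of the original app), that runs one non-streaming
# completion to confirm the quantized model responds before the API and UI start.
if llm is not None and os.getenv("BELLA_SELF_TEST"):
    _probe = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": SYSTEM_MESSAGE},
            {"role": "user", "content": "Reply with the single word: ready"},
        ],
        max_tokens=8,
    )
    print("Self-test reply:", _probe["choices"][0]["message"]["content"])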


# === Query Function (streaming chat completion with repetition control) ===
def llm_query(messages_history: list, max_tokens: int) -> Iterator[str]:
    if llm is None:
        yield "Error: LLM model not loaded. Cannot generate response."
        return

    try:
        common_stop_tokens = ["<|im_end|>", "</s>", "<|end_of_text|>"]

        response_generator = llm.create_chat_completion(
            messages=messages_history,
            stream=True,
            max_tokens=max_tokens,
            temperature=0.7,
            top_p=0.9,
            repeat_penalty=1.1,
            # Note: create_chat_completion does not accept a repeat_last_n argument; the
            # repetition-penalty window is configured via last_n_tokens_size at load time.
            stop=common_stop_tokens
        )

        full_response = ""
        for chunk in response_generator:
            token = chunk["choices"][0]["delta"].get("content", "")
            full_response += token
            yield full_response

    except Exception as e:
        print(f"Error during LLM inference: {e}")
        yield f"An error occurred during generation: {e}"


# === FastAPI App ===
app = FastAPI()

@app.get("/ask")
def ask_api(q: str = Query(...), tokens: int = Query(TOKEN_LIMIT)):
    if llm is None:
        raise HTTPException(status_code=503, detail="LLM model not loaded.")
    messages_for_api = [
        {"role": "system", "content": SYSTEM_MESSAGE},
        {"role": "user", "content": q}
    ]
    try:
        response = llm.create_chat_completion(
            messages=messages_for_api,
            max_tokens=tokens,
            temperature=0.7,
            top_p=0.9,
            repeat_penalty=1.1,
            # The repetition-penalty window is set via last_n_tokens_size when the model
            # is loaded; create_chat_completion does not take a repeat_last_n argument.
            stop=["<|im_end|>", "</s>", "<|end_of_text|>"]
        )
        return {"answer": response["choices"][0]["message"]["content"]}
    except Exception as e:
        # Returning a (dict, 500) tuple is just serialized as JSON with status 200;
        # raise HTTPException so the client actually receives an HTTP 500.
        raise HTTPException(status_code=500, detail=str(e))

@app.post("/ask")
def ask_post_api(body: dict):
    return ask_api(q=body.get("q", ""), tokens=body.get("tokens", TOKEN_LIMIT))
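
# Example calls against the endpoints above (assuming a local run on port 7860; adjust
# the host and port for a deployed Space):
#   curl "http://localhost:7860/ask?q=What+is+binary+search&tokens=128"
#   curl -X POST "http://localhost:7860/ask" \
#        -H "Content-Type: application/json" \
#        -d '{"q": "What is binary search?", "tokens": 128}'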


# === Gradio UI ===
def chat_fn(message, history, max_tokens):
    new_history = history + [{"role": "user", "content": message}]
    yield new_history, gr.update(value="")

    messages_for_llm = [{"role": "system", "content": SYSTEM_MESSAGE}] + new_history

    full_bot_response = ""
    for chunk in llm_query(messages_for_llm, max_tokens):
        full_bot_response = chunk
        if len(new_history) > 0 and new_history[-1]["role"] == "assistant":
            new_history[-1]["content"] = full_bot_response
        else:
            new_history.append({"role": "assistant", "content": full_bot_response})

        yield new_history, gr.update(value="")


with gr.Blocks() as demo:
    gr.Markdown(
        """
        # 🧠 Bella: MiniCPM-V-2_6-gguf AI Assistant
        Welcome! I'm Bella, designed to assist you with coding, homework, computer science projects,
        and document writing. I provide accurate, comprehensive, and tailored guidance.
        """
    )

    chatbot = gr.Chatbot(
        height=500,
        label="Bella's Responses",
        type="messages",
        autoscroll=True,
        resizable=True,
        show_copy_button=True
    )

    msg = gr.Textbox(placeholder="Ask Bella a question...", show_label=False, submit_btn="Ask")

    token_slider = gr.Slider(64, 1024, value=256, step=16, label="Max tokens")

    clear_btn = gr.ClearButton([msg, chatbot])

    msg.submit(
        fn=chat_fn,
        inputs=[msg, chatbot, token_slider],
        outputs=[chatbot, msg],
        queue=True
    )

# Mount the Gradio UI on the FastAPI app so uvicorn serves both on one port; calling
# demo.launch() inside a startup event would block and try to bind the same port that
# uvicorn is already using.
demo.queue()
app = gr.mount_gradio_app(app, demo, path="/")

if __name__ == "__main__":
    import uvicorn
    print("Running FastAPI app locally (if not in Hugging Face Space)...")
    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 7860)))
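
# Local run (assumed defaults): `python app.py` starts uvicorn on port 7860 (or $PORT),
# serving the Gradio UI at http://localhost:7860/ and the JSON API at /ask on the same port.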