shawno committed
Commit 149342d · verified · 1 Parent(s): c1c816a

Update app.py

Files changed (1):
  1. app.py +30 -93

app.py CHANGED
@@ -1,8 +1,5 @@
  import os
  from llama_cpp import Llama
- # from sentence_transformers import SentenceTransformer # Keep commented for now due to RAM/complexity
- # import chromadb
- # from chromadb.utils import embedding_functions
  from fastapi import FastAPI, Query
  import gradio as gr

@@ -15,76 +12,48 @@ SYSTEM_MESSAGE = """You are Bella, an expert AI assistant dedicated to supportin
  # === Load LLM ===
  llm = None  # Initialize llm to None
  try:
-     # MiniCPM-V models are generally used with `create_chat_completion`
-     # Llama.from_pretrained automatically handles downloading the GGUF from HF Hub
      print("Loading MiniCPM-V-2_6-gguf model...")
      llm = Llama.from_pretrained(
          repo_id="openbmb/MiniCPM-V-2_6-gguf",
          filename="ggml-model-Q4_K_M.gguf",
          n_ctx=4096,
          n_threads=os.cpu_count(),
-         n_batch=512,
-         verbose=False, # Set to True for more debug output
-         # `chat_format` can sometimes be inferred from the model, but explicitly setting for safety:
-         # MiniCPM-V-2_6-gguf uses a specific chat template.
-         # Check the model card or a GGUF viewer for its precise chat template.
-         # This one is a common pattern for MiniCPM:
-         chat_format="chatml" # Or "llama-2" if that's what it uses, but chatml is more common
-         # For MiniCPM specifically, it's <|im_start|>role\ncontent<|im_end|>,
-         # which is a variant of ChatML. Llama.cpp handles it if metadata exists.
+         n_batch=512, # Increased batch size for prompt processing
+         n_gpu_layers=0, # Ensure this is 0 for CPU-only inference on free tier
+         verbose=False,
      )
      print("MiniCPM-V-2_6-gguf model loaded successfully.")
  except Exception as e:
      print(f"Error loading MiniCPM-V-2_6-gguf model: {e}")
-     # Consider raising an error or exiting if the model is critical
-     # sys.exit(1) # You might want to import sys for this
-
- # === RAG Setup (Commented out for free Space compatibility and initial focus) ===
- """
- embedder = SentenceTransformer("all-MiniLM-L6-v2")
- client = chromadb.PersistentClient(path="chroma_db")
- col = client.get_or_create_collection(
-     "docs",
-     embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(
-         model_name="all-MiniLM-L6-v2"
-     )
- )
- seed_texts = [
-     "MiniCPM‑V‑2_6‑gguf runs well on CPU via llama.cpp.",
-     "This model supports RAG with Chromadb and FastAPI + Gradio UI."
- ]
- for t in seed_texts:
-     col.add(documents=[t], ids=[str(hash(t))])
- """
-
- # === Query Function (Modified to use chat_completion) ===
+     # Consider more robust error handling for production
+     # e.g., setting a flag and displaying an error message in the UI
+
+
+ # === Query Function (Modified for better repetition control) ===
  def llm_query(messages_history: list, max_tokens: int) -> str:
      if llm is None:
-         return "Error: LLM model not loaded. Cannot generate response."
-
-     # context = "" # If RAG were active, you'd insert context here
-     # prompt = f"Context:\n{context}\n\nUser: {q}\nAssistant:" # Not needed with chat_completion
+         yield "Error: LLM model not loaded. Cannot generate response."
+         return

      try:
-         # Use create_chat_completion for streaming responses
+         common_stop_tokens = ["<|im_end|>", "</s>", "<|end_of_text|>"]
+
          response_generator = llm.create_chat_completion(
-             messages=messages_history, # Pass the entire prepared history
+             messages=messages_history,
              stream=True,
              max_tokens=max_tokens,
              temperature=0.7,
              top_p=0.9,
-             # Add stop tokens if known for MiniCPM-V-2_6, e.g., for ChatML:
-             # The model's chat_format (if set correctly during Llama init) will often handle these.
-             # MiniCPM-V-2_6 uses <|im_end|>
-             stop=["<|im_end|>"]
+             repeat_penalty=1.1,
+             #repeat_last_n=256, # <--- NEW/MODIFIED: Increase the window for repetition penalty
+             stop=common_stop_tokens
          )

          full_response = ""
          for chunk in response_generator:
-             # 'delta' contains the new token
              token = chunk["choices"][0]["delta"].get("content", "")
              full_response += token
-             yield full_response # Yield partial response for streaming
+             yield full_response

      except Exception as e:
          print(f"Error during LLM inference: {e}")
@@ -92,29 +61,23 @@ def llm_query(messages_history: list, max_tokens: int) -> str:


  # === FastAPI App ===
- # Keep the FastAPI part if you intend to expose an API endpoint.
- # Note: For Gradio-only Spaces, you don't strictly need FastAPI, but it's fine to keep.
  app = FastAPI()

  @app.get("/ask")
  def ask_api(q: str = Query(...), tokens: int = Query(TOKEN_LIMIT)):
-     # This FastAPI endpoint will now use the chat history format internally,
-     # but for a single query it's just the system message and user message.
      messages_for_api = [
          {"role": "system", "content": SYSTEM_MESSAGE},
          {"role": "user", "content": q}
      ]
-     # For a non-streaming API, you'd run it to completion and return the final text.
-     # Note: llm_query is a generator now, so you'd need to consume it for an API.
-     # For simplicity, if this API is purely for the Gradio frontend, it might not be necessary.
-     # If it is for external use and non-streaming, you'd adapt llm_query or call llm.create_chat_completion directly here.
      try:
          response = llm.create_chat_completion(
              messages=messages_for_api,
              max_tokens=tokens,
              temperature=0.7,
              top_p=0.9,
-             stop=["<|im_end|>"]
+             repeat_penalty=1.1,
+             repeat_last_n=256, # <--- NEW/MODIFIED: Apply here as well
+             stop=["<|im_end|>", "</s>", "<|end_of_text|>"]
          )
          return {"answer": response["choices"][0]["message"]["content"]}
      except Exception as e:
@@ -127,37 +90,20 @@ def ask_post_api(body: dict):

  # === Gradio UI ===
  def chat_fn(message, history, max_tokens):
-     # Gradio `history` for gr.Chatbot(type="messages") is already in OpenAI format:
-     # a list of dictionaries like [{"role": "user", "content": "hello"}, {"role": "assistant", "content": "hi"}]
-
-     # 1. Add the user message to history immediately for display.
-     # This creates a new history list with the user's message, for immediate display.
      new_history = history + [{"role": "user", "content": message}]
-     yield new_history, gr.update(value="") # Clear textbox and update chatbot with user message
+     yield new_history, gr.update(value="")

-     # 2. Prepare the full message list for the LLM, including the system message.
      messages_for_llm = [{"role": "system", "content": SYSTEM_MESSAGE}] + new_history

-     # 3. Call the LLM for a response (streaming).
      full_bot_response = ""
      for chunk in llm_query(messages_for_llm, max_tokens):
-         full_bot_response = chunk # `llm_query` now yields the full_response string
-         # Update the last assistant message in the history with the streaming content.
-         # Note: new_history[-1] is the user's message. We need to add a new assistant message.
-         # This implies modifying the `new_history` list in place for streaming to work on the UI.
-         if len(new_history) > 0 and new_history[-1]["role"] == "user":
-             if len(new_history) == len(history) + 1: # First chunk after user message
-                 new_history.append({"role": "assistant", "content": full_bot_response})
-             else: # Subsequent chunks for the same assistant message
-                 new_history[-1]["content"] = full_bot_response
-         else: # Fallback if history state is unexpected (shouldn't happen with Chatbot type="messages")
+         full_bot_response = chunk
+         if len(new_history) > 0 and new_history[-1]["role"] == "assistant":
+             new_history[-1]["content"] = full_bot_response
+         else:
              new_history.append({"role": "assistant", "content": full_bot_response})

-         yield new_history, gr.update(value="") # Keep textbox cleared, update chatbot
-
-     # After generation is complete, ensure the final history state is sent
-     # (though the last yield in the loop should cover this)
-     # yield new_history, gr.update(value="") # This might be redundant but harmless
+         yield new_history, gr.update(value="")


  with gr.Blocks() as demo:
@@ -169,44 +115,35 @@ with gr.Blocks() as demo:
          """
      )

-     # Use type="messages" for OpenAI-like chat history format
      chatbot = gr.Chatbot(
          height=500,
          label="Bella's Responses",
-         type="messages", # Important for the history format
+         type="messages",
          autoscroll=True,
          resizable=True,
          show_copy_button=True
      )

-     # Simplified input section
      msg = gr.Textbox(placeholder="Ask Bella a question...", show_label=False, submit_btn="Ask")

      token_slider = gr.Slider(64, 1024, value=256, step=16, label="Max tokens")

-     # Clear button
      clear_btn = gr.ClearButton([msg, chatbot])

-     # Gradio submit event for streaming.
-     # The `outputs` here are: chatbot (for history updates) and msg (to clear it).
      msg.submit(
          fn=chat_fn,
          inputs=[msg, chatbot, token_slider],
-         outputs=[chatbot, msg], # Order: chatbot first for history, msg second to clear input
-         queue=True # Set to True for streaming to work correctly in Gradio
+         outputs=[chatbot, msg],
+         queue=True
      )

- # When using FastAPI, Gradio is launched via FastAPI's startup event.
  @app.on_event("startup")
- async def startup_event(): # Use async def for FastAPI startup events
+ async def startup_event():
      print("Starting Gradio app within FastAPI startup event...")
-     # This will launch Gradio within the Uvicorn server started by FastAPI.
-     # `share=True` is not needed in Hugging Face Spaces; it's handled automatically.
      demo.queue().launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)))
      print("Gradio app launch initiated.")

  if __name__ == "__main__":
      import uvicorn
-     # This block is for local testing. On Hugging Face Spaces, `app` is run by Gunicorn/Uvicorn.
      print("Running FastAPI app locally (if not in Hugging Face Space)...")
      uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 7860)))
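
As a quick sanity check after this change, the sketch below is one way to exercise the GET /ask endpoint that app.py exposes. The host, the default port 7860, and the use of the requests package are assumptions for illustration only; they are not part of this commit.

# Hypothetical smoke test for the /ask endpoint defined in app.py.
# Assumes the Space (or a local uvicorn run) is reachable on localhost:7860
# and that `requests` is installed.
import requests

resp = requests.get(
    "http://localhost:7860/ask",
    params={"q": "Hello Bella, introduce yourself.", "tokens": 128},
    timeout=300,  # CPU-only GGUF inference can take a while
)
resp.raise_for_status()
print(resp.json()["answer"])  # ask_api returns {"answer": <model text>}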