code-slicer committed
Commit c5676e2 · verified · 1 Parent(s): 2caf106

Update app.py

Files changed (1)
app.py +28 -25
app.py CHANGED
@@ -230,8 +230,9 @@ def _call_ollama_chat(messages, model=OLLAMA_MODEL, temperature=0.8, top_p=0.9,
         "options": {"temperature": temperature, "top_p": top_p, "top_k": top_k, "repeat_penalty": repeat_penalty},
         "stream": False,
     }
+    payload["keep_alive"] = "30m"
     try:
-        r = requests.post(url, json=payload, timeout=OLLAMA_TIMEOUT)
+        r = requests.post(url, json=payload, timeout=(10, OLLAMA_TIMEOUT))
         r.raise_for_status()
         return (r.json().get("message") or {}).get("content", "") or ""
     except requests.Timeout:
@@ -252,39 +253,40 @@ def _call_ollama_chat(messages, model=OLLAMA_MODEL, temperature=0.8, top_p=0.9,
 def call_ollama_stream(messages, *, model: str = OLLAMA_MODEL,
                        temperature: float = 0.8, top_p: float = 0.9,
                        top_k: int = 40, repeat_penalty: float = 1.1,
-                       num_predict: int = 200, num_ctx: int = 2048,
-                       system_prompt: str | None = None):
-    """
-    Streaming generator for Ollama /api/chat.
-    In Streamlit it can be consumed directly with st.write_stream(...).
-    """
-    url = f"{OLLAMA_HOST}/api/chat"
-
-    _msgs = []
-    if system_prompt:
-        _msgs.append({"role": "system", "content": system_prompt})
-    _msgs.extend(messages)
+                       num_predict: int = 160,  # ✅ safe range on CPU
+                       num_ctx: int = 2048,
+                       system_prompt: str | None = None,
+                       read_timeout_sec: int | None = None):  # ✅ new
+    import os
+    if read_timeout_sec is None:
+        read_timeout_sec = int(os.getenv("OLLAMA_TIMEOUT", "300"))  # ✅ 5 min
 
     payload = {
         "model": model,
-        "messages": _msgs,
+        "messages": _msgs,  # <- unchanged
         "options": {
             "temperature": temperature,
             "top_p": top_p,
             "top_k": top_k,
             "repeat_penalty": repeat_penalty,
-            "num_predict": num_predict,  # 128~256 recommended for CPU + 9B
-            "num_ctx": num_ctx  # 2048~4096
+            "num_predict": num_predict,
+            "num_ctx": num_ctx,
+            "num_thread": os.cpu_count() or 8,  # ✅ CPU parallelism
         },
-        "stream": True,  # ✅ essential
+        "stream": True,
+        "keep_alive": "30m",  # ✅ keep the model loaded
     }
 
-    with requests.post(url, json=payload, stream=True, timeout=OLLAMA_TIMEOUT) as resp:
+    with requests.post(url, json=payload, stream=True,
+                       timeout=(10, read_timeout_sec)) as resp:  # ✅ (connect, read)
         resp.raise_for_status()
         for line in resp.iter_lines(decode_unicode=True):
             if not line:
                 continue
-            data = json.loads(line)
+            try:
+                data = json.loads(line)
+            except Exception:
+                continue
             if data.get("done"):
                 break
             chunk = (data.get("message") or {}).get("content", "")
@@ -346,13 +348,14 @@ def render_llm_followup(chat_container, inline=False):
         msgs = st.session_state["llm_msgs"]
         full_text = st.write_stream(
             call_ollama_stream(
-                msgs,
-                model=OLLAMA_MODEL,
-                system_prompt=KOREAN_SYSTEM_PROMPT,
-                num_predict=200,  # adjust to 128~256 if needed
-                num_ctx=2048
-            )
+                msgs,
+                model=OLLAMA_MODEL,
+                system_prompt=KOREAN_SYSTEM_PROMPT,
+                num_predict=160,  # ✅ 160~200 recommended
+                num_ctx=2048,
+                read_timeout_sec=300  # ✅ new
             )
+        )
         st.session_state["llm_msgs"].append({"role": "assistant", "content": full_text})
     except requests.Timeout:
         st.error(f"⏱️ Ollama timeout ({OLLAMA_TIMEOUT}s). host={OLLAMA_HOST}, model={OLLAMA_MODEL}")
 
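Both call sites also switch from a single timeout value to a 2-tuple. In `requests`, a tuple timeout is documented as (connect timeout, read timeout): the first bounds the TCP handshake, the second bounds each wait for response data, so a slow generation no longer shares one budget with connection setup. A minimal sketch of the distinction; the `/api/tags` probe target is just illustrative:

import requests

try:
    # (connect, read): fail within 10 s if Ollama is unreachable, but
    # tolerate up to 300 s between bytes once it is responding.
    r = requests.get("http://localhost:11434/api/tags", timeout=(10, 300))
    r.raise_for_status()
except requests.exceptions.ConnectTimeout:
    print("no connection within 10 s")
except requests.exceptions.ReadTimeout:
    print("connected, but the server stalled past 300 s")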
 
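The `try/except` added around `json.loads` reflects how `/api/chat` streams: one JSON object per line (NDJSON), where a blank or truncated line should be skipped rather than abort the whole generator. A self-contained sketch of that loop, run over canned lines:

import json

def iter_chunks(lines):
    # One JSON object per line (NDJSON); skip blanks and fragments
    # instead of letting one bad line kill the stream.
    for line in lines:
        if not line:
            continue
        try:
            data = json.loads(line)
        except json.JSONDecodeError:
            continue  # partial line; the next one may parse
        if data.get("done"):
            break
        chunk = (data.get("message") or {}).get("content", "")
        if chunk:
            yield chunk

sample = [
    '{"message": {"content": "Hel"}, "done": false}',
    "",                                 # keep-alive blank line
    '{"message": {"content": "lo"}',    # truncated fragment, skipped
    '{"message": {"content": "lo"}, "done": false}',
    '{"done": true}',
]
print("".join(iter_chunks(sample)))     # -> "Hello"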
 