Update app.py
app.py
CHANGED
@@ -230,8 +230,9 @@ def _call_ollama_chat(messages, model=OLLAMA_MODEL, temperature=0.8, top_p=0.9,
         "options": {"temperature": temperature, "top_p": top_p, "top_k": top_k, "repeat_penalty": repeat_penalty},
         "stream": False,
     }
+    payload["keep_alive"] = "30m"
     try:
-        r = requests.post(url, json=payload, timeout=OLLAMA_TIMEOUT)
+        r = requests.post(url, json=payload, timeout=(10, OLLAMA_TIMEOUT))
         r.raise_for_status()
         return (r.json().get("message") or {}).get("content", "") or ""
     except requests.Timeout:
@@ -252,39 +253,40 @@ def _call_ollama_chat(messages, model=OLLAMA_MODEL, temperature=0.8, top_p=0.9,
 def call_ollama_stream(messages, *, model: str = OLLAMA_MODEL,
                        temperature: float = 0.8, top_p: float = 0.9,
                        top_k: int = 40, repeat_penalty: float = 1.1,
-                       num_predict: int =
-
-
-
-
-
-
-
-    _msgs = []
-    if system_prompt:
-        _msgs.append({"role": "system", "content": system_prompt})
-    _msgs.extend(messages)
+                       num_predict: int = 160,                 # ✅ safe range on CPU
+                       num_ctx: int = 2048,
+                       system_prompt: str | None = None,
+                       read_timeout_sec: int | None = None):   # ✅ new
+    import os
+    if read_timeout_sec is None:
+        read_timeout_sec = int(os.getenv("OLLAMA_TIMEOUT", "300"))  # ✅ 5 minutes
 
     payload = {
         "model": model,
-        "messages": _msgs,
+        "messages": _msgs,   # <- unchanged
         "options": {
             "temperature": temperature,
             "top_p": top_p,
             "top_k": top_k,
             "repeat_penalty": repeat_penalty,
-            "num_predict": num_predict,
-            "num_ctx": num_ctx
+            "num_predict": num_predict,
+            "num_ctx": num_ctx,
+            "num_thread": os.cpu_count() or 8,   # ✅ CPU parallelism
         },
-        "stream": True,
+        "stream": True,
+        "keep_alive": "30m",                     # ✅ keep the model loaded
     }
 
-    with requests.post(url, json=payload, stream=True,
+    with requests.post(url, json=payload, stream=True,
+                       timeout=(10, read_timeout_sec)) as resp:  # ✅ (connect, read)
         resp.raise_for_status()
         for line in resp.iter_lines(decode_unicode=True):
            if not line:
                continue
-
+            try:
+                data = json.loads(line)
+            except Exception:
+                continue
            if data.get("done"):
                break
            chunk = (data.get("message") or {}).get("content", "")
@@ -346,13 +348,14 @@ def render_llm_followup(chat_container, inline=False):
         msgs = st.session_state["llm_msgs"]
         full_text = st.write_stream(
             call_ollama_stream(
-
-
-
-
-
-
+                msgs,
+                model=OLLAMA_MODEL,
+                system_prompt=KOREAN_SYSTEM_PROMPT,
+                num_predict=160,        # ✅ 160~200 recommended
+                num_ctx=2048,
+                read_timeout_sec=300    # ✅ added
             )
+        )
         st.session_state["llm_msgs"].append({"role": "assistant", "content": full_text})
     except requests.Timeout:
         st.error(f"⏱️ Ollama timeout ({OLLAMA_TIMEOUT}s). host={OLLAMA_HOST}, model={OLLAMA_MODEL}")
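Taken together, the first hunk makes the non-streaming path more robust: `keep_alive="30m"` asks Ollama to keep the model in memory between requests, and the `(connect, read)` timeout tuple makes an unreachable host fail after 10 seconds instead of hanging for the full `OLLAMA_TIMEOUT`. Below is a minimal standalone sketch of that call; the host, model, and timeout values are placeholders, not the repo's actual configuration.

```python
# Sketch of the non-streaming /api/chat call after this change.
# OLLAMA_HOST, OLLAMA_MODEL and OLLAMA_TIMEOUT are placeholders.
import requests

OLLAMA_HOST = "http://localhost:11434"   # placeholder
OLLAMA_MODEL = "llama3"                  # placeholder
OLLAMA_TIMEOUT = 120                     # read timeout in seconds

def call_ollama_chat(messages):
    payload = {
        "model": OLLAMA_MODEL,
        "messages": messages,
        "options": {"temperature": 0.8, "top_p": 0.9, "top_k": 40, "repeat_penalty": 1.1},
        "stream": False,
        "keep_alive": "30m",                        # keep the model loaded between calls
    }
    r = requests.post(f"{OLLAMA_HOST}/api/chat", json=payload,
                      timeout=(10, OLLAMA_TIMEOUT))  # (connect, read) timeouts
    r.raise_for_status()
    return (r.json().get("message") or {}).get("content", "") or ""

print(call_ollama_chat([{"role": "user", "content": "Hello"}]))
```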
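The streaming hunk reworks `call_ollama_stream` into a generator: new `num_predict`, `num_ctx`, `system_prompt`, and `read_timeout_sec` parameters, `num_thread` pinned to the CPU count, `keep_alive` on the request, a `(connect, read)` timeout, and a tolerant parse of Ollama's newline-delimited JSON stream. Here is a self-contained sketch of that generator; the host and model are placeholders, and the message list is rebuilt from `system_prompt` plus the caller's messages, which the diff still refers to as `_msgs` from the previous version of the function.

```python
# Sketch of the streaming generator: Ollama sends one JSON object per line;
# yield each message.content chunk until the server reports done.
import json
import os
import requests

def call_ollama_stream(messages, *, model="llama3", temperature=0.8, top_p=0.9,
                       top_k=40, repeat_penalty=1.1, num_predict=160, num_ctx=2048,
                       system_prompt=None, read_timeout_sec=None):
    if read_timeout_sec is None:
        read_timeout_sec = int(os.getenv("OLLAMA_TIMEOUT", "300"))   # default 5 minutes
    msgs = [{"role": "system", "content": system_prompt}] if system_prompt else []
    msgs.extend(messages)
    payload = {
        "model": model,
        "messages": msgs,
        "options": {
            "temperature": temperature,
            "top_p": top_p,
            "top_k": top_k,
            "repeat_penalty": repeat_penalty,
            "num_predict": num_predict,          # cap generated tokens (CPU-friendly)
            "num_ctx": num_ctx,                  # context window
            "num_thread": os.cpu_count() or 8,   # use all CPU cores
        },
        "stream": True,
        "keep_alive": "30m",                     # keep the model loaded
    }
    url = "http://localhost:11434/api/chat"      # placeholder host
    with requests.post(url, json=payload, stream=True,
                       timeout=(10, read_timeout_sec)) as resp:   # (connect, read)
        resp.raise_for_status()
        for line in resp.iter_lines(decode_unicode=True):
            if not line:
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                continue                         # skip partial or malformed lines
            if data.get("done"):
                break
            chunk = (data.get("message") or {}).get("content", "")
            if chunk:
                yield chunk
```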
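Finally, the caller in `render_llm_followup` passes the new arguments and hands the generator to `st.write_stream`, which renders chunks as they arrive and returns the concatenated text. A sketch of that flow, reusing the `call_ollama_stream` generator above; the model name and the `KOREAN_SYSTEM_PROMPT` value are placeholders.

```python
# Sketch of the Streamlit caller: stream the reply, then store it in session state.
import requests
import streamlit as st

OLLAMA_MODEL = "llama3"                          # placeholder
KOREAN_SYSTEM_PROMPT = "Respond in Korean."      # placeholder for the repo's prompt

if "llm_msgs" not in st.session_state:
    st.session_state["llm_msgs"] = []

if user_text := st.chat_input("Ask a follow-up question"):
    st.session_state["llm_msgs"].append({"role": "user", "content": user_text})
    try:
        full_text = st.write_stream(
            call_ollama_stream(                  # generator sketched above
                st.session_state["llm_msgs"],
                model=OLLAMA_MODEL,
                system_prompt=KOREAN_SYSTEM_PROMPT,
                num_predict=160,                 # 160 to 200 keeps CPU latency tolerable
                num_ctx=2048,
                read_timeout_sec=300,
            )
        )
        st.session_state["llm_msgs"].append({"role": "assistant", "content": full_text})
    except requests.Timeout:
        st.error("Ollama timed out; check the host and model settings.")
```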