Update app.py
app.py
CHANGED
@@ -230,8 +230,9 @@ def _call_ollama_chat(messages, model=OLLAMA_MODEL, temperature=0.8, top_p=0.9,
         "options": {"temperature": temperature, "top_p": top_p, "top_k": top_k, "repeat_penalty": repeat_penalty},
         "stream": False,
     }
+    payload["keep_alive"] = "30m"
     try:
-        r = requests.post(url, json=payload, timeout=OLLAMA_TIMEOUT)
+        r = requests.post(url, json=payload, timeout=(10, OLLAMA_TIMEOUT))
         r.raise_for_status()
         return (r.json().get("message") or {}).get("content", "") or ""
     except requests.Timeout:
@@ -252,39 +253,40 @@ def _call_ollama_chat(messages, model=OLLAMA_MODEL, temperature=0.8, top_p=0.9,
 def call_ollama_stream(messages, *, model: str = OLLAMA_MODEL,
                        temperature: float = 0.8, top_p: float = 0.9,
                        top_k: int = 40, repeat_penalty: float = 1.1,
-                       num_predict: int =
-
-
-
-
-
-
-
-    _msgs = []
-    if system_prompt:
-        _msgs.append({"role": "system", "content": system_prompt})
-    _msgs.extend(messages)
+                       num_predict: int = 160,                 # ✅ safe range on CPU
+                       num_ctx: int = 2048,
+                       system_prompt: str | None = None,
+                       read_timeout_sec: int | None = None):   # ✅ new
+    import os
+    if read_timeout_sec is None:
+        read_timeout_sec = int(os.getenv("OLLAMA_TIMEOUT", "300"))  # ✅ 5 minutes
 
     payload = {
         "model": model,
-        "messages": _msgs,
+        "messages": _msgs,   # <- unchanged
         "options": {
             "temperature": temperature,
             "top_p": top_p,
             "top_k": top_k,
             "repeat_penalty": repeat_penalty,
-            "num_predict": num_predict,
-            "num_ctx": num_ctx
+            "num_predict": num_predict,
+            "num_ctx": num_ctx,
+            "num_thread": os.cpu_count() or 8,   # ✅ CPU parallelism
         },
-        "stream": True,
+        "stream": True,
+        "keep_alive": "30m",                     # ✅ keep the model loaded
     }
 
-    with requests.post(url, json=payload, stream=True,
+    with requests.post(url, json=payload, stream=True,
+                       timeout=(10, read_timeout_sec)) as resp:  # ✅ (connect, read)
         resp.raise_for_status()
         for line in resp.iter_lines(decode_unicode=True):
            if not line:
                continue
-
+            try:
+                data = json.loads(line)
+            except Exception:
+                continue
            if data.get("done"):
                break
            chunk = (data.get("message") or {}).get("content", "")
@@ -346,13 +348,14 @@ def render_llm_followup(chat_container, inline=False):
         msgs = st.session_state["llm_msgs"]
         full_text = st.write_stream(
             call_ollama_stream(
-
-
-
-
-
-
+                msgs,
+                model=OLLAMA_MODEL,
+                system_prompt=KOREAN_SYSTEM_PROMPT,
+                num_predict=160,        # ✅ 160~200 recommended
+                num_ctx=2048,
+                read_timeout_sec=300    # ✅ added
             )
+        )
         st.session_state["llm_msgs"].append({"role": "assistant", "content": full_text})
     except requests.Timeout:
         st.error(f"⏱️ Ollama timeout ({OLLAMA_TIMEOUT}s). host={OLLAMA_HOST}, model={OLLAMA_MODEL}")
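Taken together, the first hunk makes the non-streaming path more robust: `keep_alive="30m"` asks Ollama to keep the model in memory between requests, and the `(connect, read)` timeout tuple makes an unreachable host fail after 10 seconds instead of hanging for the full `OLLAMA_TIMEOUT`. Below is a minimal standalone sketch of that call; the host, model, and timeout values are placeholders, not the repo's actual configuration.

```python
# Sketch of the non-streaming /api/chat call after this change.
# OLLAMA_HOST, OLLAMA_MODEL and OLLAMA_TIMEOUT are placeholders.
import requests

OLLAMA_HOST = "http://localhost:11434"   # placeholder
OLLAMA_MODEL = "llama3"                  # placeholder
OLLAMA_TIMEOUT = 120                     # read timeout in seconds

def call_ollama_chat(messages):
    payload = {
        "model": OLLAMA_MODEL,
        "messages": messages,
        "options": {"temperature": 0.8, "top_p": 0.9, "top_k": 40, "repeat_penalty": 1.1},
        "stream": False,
        "keep_alive": "30m",                        # keep the model loaded between calls
    }
    r = requests.post(f"{OLLAMA_HOST}/api/chat", json=payload,
                      timeout=(10, OLLAMA_TIMEOUT))  # (connect, read) timeouts
    r.raise_for_status()
    return (r.json().get("message") or {}).get("content", "") or ""

print(call_ollama_chat([{"role": "user", "content": "Hello"}]))
```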
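The streaming hunk reworks `call_ollama_stream` into a generator: new `num_predict`, `num_ctx`, `system_prompt`, and `read_timeout_sec` parameters, `num_thread` pinned to the CPU count, `keep_alive` on the request, a `(connect, read)` timeout, and a tolerant parse of Ollama's newline-delimited JSON stream. Here is a self-contained sketch of that generator; the host and model are placeholders, and the message list is rebuilt from `system_prompt` plus the caller's messages, which the diff still refers to as `_msgs` from the previous version of the function.

```python
# Sketch of the streaming generator: Ollama sends one JSON object per line;
# yield each message.content chunk until the server reports done.
import json
import os
import requests

def call_ollama_stream(messages, *, model="llama3", temperature=0.8, top_p=0.9,
                       top_k=40, repeat_penalty=1.1, num_predict=160, num_ctx=2048,
                       system_prompt=None, read_timeout_sec=None):
    if read_timeout_sec is None:
        read_timeout_sec = int(os.getenv("OLLAMA_TIMEOUT", "300"))   # default 5 minutes
    msgs = [{"role": "system", "content": system_prompt}] if system_prompt else []
    msgs.extend(messages)
    payload = {
        "model": model,
        "messages": msgs,
        "options": {
            "temperature": temperature,
            "top_p": top_p,
            "top_k": top_k,
            "repeat_penalty": repeat_penalty,
            "num_predict": num_predict,          # cap generated tokens (CPU-friendly)
            "num_ctx": num_ctx,                  # context window
            "num_thread": os.cpu_count() or 8,   # use all CPU cores
        },
        "stream": True,
        "keep_alive": "30m",                     # keep the model loaded
    }
    url = "http://localhost:11434/api/chat"      # placeholder host
    with requests.post(url, json=payload, stream=True,
                       timeout=(10, read_timeout_sec)) as resp:   # (connect, read)
        resp.raise_for_status()
        for line in resp.iter_lines(decode_unicode=True):
            if not line:
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                continue                         # skip partial or malformed lines
            if data.get("done"):
                break
            chunk = (data.get("message") or {}).get("content", "")
            if chunk:
                yield chunk
```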
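Finally, the caller in `render_llm_followup` passes the new arguments and hands the generator to `st.write_stream`, which renders chunks as they arrive and returns the concatenated text. A sketch of that flow, reusing the `call_ollama_stream` generator above; the model name and the `KOREAN_SYSTEM_PROMPT` value are placeholders.

```python
# Sketch of the Streamlit caller: stream the reply, then store it in session state.
import requests
import streamlit as st

OLLAMA_MODEL = "llama3"                          # placeholder
KOREAN_SYSTEM_PROMPT = "Respond in Korean."      # placeholder for the repo's prompt

if "llm_msgs" not in st.session_state:
    st.session_state["llm_msgs"] = []

if user_text := st.chat_input("Ask a follow-up question"):
    st.session_state["llm_msgs"].append({"role": "user", "content": user_text})
    try:
        full_text = st.write_stream(
            call_ollama_stream(                  # generator sketched above
                st.session_state["llm_msgs"],
                model=OLLAMA_MODEL,
                system_prompt=KOREAN_SYSTEM_PROMPT,
                num_predict=160,                 # 160 to 200 keeps CPU latency tolerable
                num_ctx=2048,
                read_timeout_sec=300,
            )
        )
        st.session_state["llm_msgs"].append({"role": "assistant", "content": full_text})
    except requests.Timeout:
        st.error("Ollama timed out; check the host and model settings.")
```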