Princeaka committed on
Commit
2368b12
·
verified ·
1 Parent(s): fa9ff2a

Update app.py

Files changed (1)
  1. app.py +413 -316
app.py CHANGED
@@ -1,28 +1,16 @@
  #!/usr/bin/env python3
  """
- JusticeAI Backend — app.py
-
- Features & behavior:
- - Two databases:
-   - DATABASE_URL (engine_user) stores personal user_memory ONLY and is never used to update global knowledge.
-   - KNOWLEDGEDATABASE_URL (engine_knowledge) stores global knowledge rows used for replies.
- - /chat accepts {"message": "..."} (or {"text": "..."}) and:
-   - infers topic (Ollama first if available, then embeddings, then keyword matching)
-   - retrieves ONLY from knowledge rows in that topic (strict topic isolation)
-   - composes a reply from topic-scoped knowledge (no automatic injection of user chats into knowledge)
-   - returns the reply in the user's detected language (translation via language.py if present or Helsinki fallback if transformers available)
-   - persists the user message and the reply into engine_user.user_memory and prunes to the last 10 messages per user
-   - blocks storing toxic messages using the moderator pipeline (if available)
- - All endpoints included: /chat, /response, /add, /add-bulk, /leaderboard, /reembed, /model-status,
-   /health, /metrics_stream, /metrics_recent, /verify-admin, /cleardatabase, / (frontend).
- - Ollama integration: uses HTTP (if ollama serve) or CLI (ollama run) to infer topic semantically if possible.
- - Optional models: SentenceTransformer for embeddings and transformers (Helsinki) for translation; code runs without them using fallbacks.
-
- Deployment notes:
- - Set DATABASE_URL and KNOWLEDGEDATABASE_URL environment variables.
- - Optionally install dependencies for better features:
-     pip install sentence-transformers transformers torch langdetect emoji hf-cli
- - To enable Ollama model auto-pull at startup set OLLAMA_AUTO_PULL=1 and ensure ollama CLI exists.
  """
 
  from sqlalchemy.pool import NullPool
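A minimal client sketch for the /chat contract the removed docstring describes (host and port are hypothetical; the server defaults to PORT=7860 at the bottom of this file):

    # sketch: post a message to /chat and read the topic-scoped reply
    import requests

    resp = requests.post(
        "http://localhost:7860/chat",
        json={"message": "What does the knowledge base say about contracts?"},
        timeout=30,
    )
    data = resp.json()
    print(data["reply"], data["topic"], data["confidence"])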
@@ -36,31 +24,45 @@ import subprocess
  import shutil
  import logging
  import random
  from datetime import datetime, timezone
  from collections import deque
  from typing import Optional, Dict, Any, List
 
- from fastapi import FastAPI, Request, Body, Query, Header
- from fastapi.responses import JSONResponse, StreamingResponse, HTMLResponse
  from sqlalchemy import create_engine, text as sql_text
 
  # external helpers
  import requests
 
- # Optional ML libs
  try:
      from sentence_transformers import SentenceTransformer
  except Exception:
      SentenceTransformer = None
 
  try:
-     from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, pipeline as hf_pipeline
  except Exception:
      AutoTokenizer = None
      AutoModelForSeq2SeqLM = None
-     AutoModelForCausalLM = None
      hf_pipeline = None
 
  # Optional local modules
  try:
      import language as language_module  # type: ignore
@@ -83,7 +85,7 @@ try:
  except Exception:
      detect_lang = None
 
- # Moderator pipeline (text-classification) - optional
  moderator = None
  try:
      if hf_pipeline is not None:
@@ -91,7 +93,7 @@ try:
  except Exception:
      moderator = None
 
- # Config
  ADMIN_KEY = os.environ.get("ADMIN_KEY")
  DATABASE_URL = os.environ.get("DATABASE_URL", "sqlite:///justice_user.db")
  KNOWLEDGEDATABASE_URL = os.environ.get("KNOWLEDGEDATABASE_URL", DATABASE_URL)
@@ -104,15 +106,23 @@ OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "llama3")
  OLLAMA_HTTP_URL = os.environ.get("OLLAMA_HTTP_URL", "http://localhost:11434")
  OLLAMA_AUTO_PULL = os.environ.get("OLLAMA_AUTO_PULL", "0") in ("1", "true", "yes")
 
  # Logging
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger("justicebrain")
 
- # Early heartbeat & start time
  last_heartbeat = {"time": datetime.utcnow().replace(tzinfo=timezone.utc).isoformat(), "ok": True}
  app_start_time = time.time()
 
- # Engines (user memory and knowledge separate)
  engine_user = create_engine(
      DATABASE_URL,
      poolclass=NullPool,
@@ -128,7 +138,7 @@ app = FastAPI(title="Justice Brain — Backend")
 
  # --- Database schema setup ---
  def ensure_tables():
-     # knowledge table in knowledge DB
      dialect_k = engine_knowledge.dialect.name
      with engine_knowledge.begin() as conn:
          if dialect_k == "sqlite":
@@ -165,7 +175,7 @@ def ensure_tables():
                  updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
              );
          """))
-     # user memory table in user DB
      dialect_u = engine_user.dialect.name
      with engine_user.begin() as conn:
          if dialect_u == "sqlite":
@@ -209,7 +219,6 @@ def ensure_tables():
 
  ensure_tables()
 
- # add columns if missing (best-effort; uses engine_user but applied generically)
  def ensure_column_exists(table: str, column: str, col_def_sql: str):
      dialect = engine_user.dialect.name
      try:
@@ -292,7 +301,7 @@ def emoji_sentiment_score(emojis: List[str]) -> float:
              score += 0.1
      return max(-1.0, min(1.0, score / max(1, len(emojis))))
 
- # Language detection & translation (language.py preferred)
  _translation_model_cache: Dict[str, Any] = {}
 
  def detect_language_safe(text: str) -> str:
@@ -352,7 +361,6 @@ def translate_text(text: str, src: str, tgt: str) -> str:
              return out
      except Exception:
          pass
-     # Helsinki fallback
      src_code = (src or "und").split("-")[0].lower()
      tgt_code = (tgt or "und").split("-")[0].lower()
      if not re.fullmatch(r"[a-z]{2,3}", src_code) or not re.fullmatch(r"[a-z]{2,3}", tgt_code):
@@ -391,7 +399,7 @@ def translate_from_english(text: str, tgt_lang: str) -> str:
          return text
      return translate_text(text, "en", tgt)
 
- # Embedding utilities (optional)
  embed_model = None
  def try_load_embed():
      global embed_model
@@ -414,71 +422,13 @@ def embed_to_bytes(text: str) -> Optional[bytes]:
      except Exception:
          return None
 
- # Boilerplate detection + reply synthesis
- def is_boilerplate_candidate(s: str) -> bool:
-     s_low = (s or "").strip().lower()
-     generic = ["i don't know", "not sure", "maybe", "perhaps", "justiceai is a unified intelligence dashboard"]
-     if len(s_low) < 8:
-         return True
-     return any(g in s_low for g in generic)
-
- def generate_creative_reply(candidates: List[str]) -> str:
-     all_sent = []
-     seen = set()
-     for c in candidates:
-         for s in re.split(r'(?<=[.?!])\s+', c):
-             st = s.strip()
-             if not st or st in seen or is_boilerplate_candidate(st):
-                 continue
-             seen.add(st)
-             all_sent.append(st)
-     if not all_sent:
-         return "I don't have enough context yet — can you give more details?"
-     return "\n".join(all_sent[:5])
-
- # Duplicate detection within topic
- def knowledge_text_exists_in_topic(text: str, topic: str, threshold: float = 0.92) -> bool:
-     t = (text or "").strip()
-     if not t:
-         return False
-     try:
-         with engine_knowledge.begin() as conn:
-             rows = conn.execute(sql_text("SELECT id, text FROM knowledge WHERE topic = :topic LIMIT 200"), {"topic": topic}).fetchall()
-         for r in rows:
-             existing = (r[1] or "").strip()
-             if existing.lower() == t.lower():
-                 return True
-         if embed_model is not None and rows:
-             texts = [r[1] or "" for r in rows]
-             embs = embed_model.encode(texts, convert_to_tensor=True)
-             q_emb = embed_model.encode([t], convert_to_tensor=True)[0]
-             import torch
-             sims = torch.nn.functional.cosine_similarity(q_emb.unsqueeze(0), embs)
-             if float(torch.max(sims).item()) >= threshold:
-                 return True
-     except Exception:
-         pass
-     return False
-
- # Topic inference fallback (embeddings/keywords)
- def infer_topic_from_message(msg: str, known_topics: List[str]) -> str:
-     msg_low = (msg or "").lower()
-     for topic in known_topics or []:
-         if topic and topic.lower() in msg_low:
-             return topic
-     if embed_model is not None and known_topics:
-         try:
-             import torch
-             topic_embs = embed_model.encode(known_topics, convert_to_tensor=True)
-             q_emb = embed_model.encode([msg], convert_to_tensor=True)[0]
-             sims = torch.nn.functional.cosine_similarity(q_emb.unsqueeze(0), topic_embs)
-             best_idx = int(torch.argmax(sims).item())
-             return known_topics[best_idx]
-         except Exception:
-             pass
-     return "general"
 
- # Ollama helpers
  def ollama_cli_available() -> bool:
      return shutil.which("ollama") is not None
 
@@ -489,19 +439,18 @@ def ollama_http_available() -> bool:
      except Exception:
          return False
 
- def call_ollama_http(prompt: str, model: str = OLLAMA_MODEL, timeout_s: int = 10) -> Optional[str]:
      try:
          url = f"{OLLAMA_HTTP_URL}/api/generate"
          payload = {"model": model, "prompt": prompt, "max_tokens": 256}
          headers = {"Content-Type": "application/json"}
-         r = requests.post(url, json=payload, headers=headers, timeout=timeout_s)
          if r.status_code == 200:
              try:
                  obj = r.json()
-                 if isinstance(obj, dict):
-                     for key in ("output", "text", "result", "generations"):
-                         if key in obj:
-                             return obj[key] if isinstance(obj[key], str) else json.dumps(obj[key])
                  return r.text
              except Exception:
                  return r.text
@@ -512,11 +461,11 @@ def call_ollama_http(prompt: str, model: str = OLLAMA_MODEL, timeout_s: int = 10
          logger.debug(f"ollama HTTP call failed: {e}")
      return None
 
- def call_ollama_cli(prompt: str, model: str = OLLAMA_MODEL, timeout_s: int = 15) -> Optional[str]:
      if not ollama_cli_available():
          return None
      try:
-         proc = subprocess.run(["ollama", "run", model, "--prompt", prompt], capture_output=True, text=True, timeout=timeout_s)
          if proc.returncode == 0:
              return proc.stdout.strip() or proc.stderr.strip()
          else:
@@ -526,7 +475,7 @@ def call_ollama_cli(prompt: str, model: str = OLLAMA_MODEL, timeout_s: int = 15)
          logger.debug(f"ollama CLI call exception: {e}")
      return None
 
- def infer_topic_with_ollama(msg: str, topics: List[str], model: str = OLLAMA_MODEL, timeout_s: int = 8) -> Optional[str]:
      if not msg or not topics:
          return None
      topics_escaped = [t.replace('"','\\"') for t in topics]
@@ -580,12 +529,188 @@ def infer_topic_with_ollama(msg: str, topics: List[str], model: str = OLLAMA_MOD
          pass
      return None
 
- # Metrics & cache state
  recent_request_times = deque()
  recent_learning_timestamps = deque()
  response_time_ema: Optional[float] = None
  EMA_ALPHA = 0.2
- knowledge_version = 0
 
  def record_request(duration_s: float):
      global response_time_ema
@@ -604,14 +729,20 @@ def record_learn_event():
      while recent_learning_timestamps and recent_learning_timestamps[0] < ts - 3600:
          recent_learning_timestamps.popleft()
 
- # Startup tasks
  @app.on_event("startup")
  async def startup_event():
-     logger.info("[JusticeAI] startup: attempting to load optional components")
-     try:
-         try_load_embed()
-     except Exception as e:
-         logger.warning(f"[startup] embed load issue: {e}")
      if OLLAMA_AUTO_PULL and ollama_cli_available():
          try:
              subprocess.run(["ollama", "pull", OLLAMA_MODEL], timeout=300)
@@ -620,8 +751,7 @@ async def startup_event():
              logger.debug(f"[startup] ollama pull failed: {e}")
      logger.info("[JusticeAI] startup complete")
 
- # --- Endpoints ---
-
  @app.post("/add")
  async def add_knowledge(data: dict = Body(...)):
      if not isinstance(data, dict):
@@ -642,7 +772,10 @@ async def add_knowledge(data: dict = Body(...)):
          return JSONResponse(status_code=400, content={"error": "translation failed"})
      emb_bytes = None
      if embed_model is not None:
-         emb_bytes = embed_to_bytes(text_data)
      try:
          with engine_knowledge.begin() as conn:
              if emb_bytes:
@@ -655,13 +788,8 @@ async def add_knowledge(data: dict = Body(...)):
                      "INSERT INTO knowledge (text, reply, language, category, topic, confidence, meta) "
                      "VALUES (:t, :r, :lang, 'manual', :topic, :conf, :meta)"
                  ), {"t": text_data, "r": reply, "lang": detected, "topic": topic, "conf": 0.9, "meta": json.dumps({"manual": True})})
-         global knowledge_version
-         knowledge_version += 1
          record_learn_event()
-         res = {"status": "✅ Knowledge added", "text": text_data, "topic": topic, "language": detected}
-         if not emb_bytes:
-             res["note"] = "stored without embedding"
-         return res
      except Exception as e:
          logger.exception("add failed")
          return JSONResponse(status_code=500, content={"error": "failed to store knowledge", "details": str(e)})
@@ -684,32 +812,132 @@ async def add_bulk(data: List[dict] = Body(...)):
          detected = detect_language_safe(text_data) or "und"
          if detected not in ("en", "eng", "und"):
              errors.append({"index": i, "error": "non-english; skip"}); continue
-         emb_bytes = embed_to_bytes(text_data) if embed_model is not None else None
          with engine_knowledge.begin() as conn:
              if emb_bytes:
                  conn.execute(sql_text(
-                     "INSERT INTO knowledge (text, reply, language, embedding, category, topic) "
-                     "VALUES (:t, :r, :lang, :e, 'manual', :topic)"
                  ), {"t": text_data, "r": reply, "lang": "en", "e": emb_bytes, "topic": topic})
              else:
                  conn.execute(sql_text(
-                     "INSERT INTO knowledge (text, reply, language, category, topic) "
-                     "VALUES (:t, :r, :lang, 'manual', :topic)"
                  ), {"t": text_data, "r": reply, "lang": "en", "topic": topic})
          added += 1
      except Exception as e:
          logger.exception("add-bulk item error")
          errors.append({"index": i, "error": str(e)})
  if added:
-     global knowledge_version
-     knowledge_version += 1
      record_learn_event()
  return {"added": added, "errors": errors}
 
  @app.post("/chat")
710
  async def chat(request: Request, data: dict = Body(...)):
711
  t0 = time.time()
712
- # Accept both "message" and "text"
713
  if isinstance(data, dict):
714
  raw_msg = str(data.get("message", "") or data.get("text", "") or "").strip()
715
  else:
@@ -727,7 +955,7 @@ async def chat(request: Request, data: dict = Body(...)):
727
  detected_lang = detect_language_safe(raw_msg)
728
  reply_lang = detected_lang if detected_lang and detected_lang != "und" else "en"
729
 
730
- # Translate incoming to English for retrieval/synthesis if needed
731
  en_msg = raw_msg
732
  if detected_lang not in ("en", "eng", "", "und"):
733
  try:
@@ -735,7 +963,7 @@ async def chat(request: Request, data: dict = Body(...)):
735
  except Exception:
736
  en_msg = raw_msg
737
 
738
- # Infer topic (Ollama -> embed -> keyword)
739
  topic = "general"
740
  try:
741
  if not topic_hint:
@@ -758,7 +986,7 @@ async def chat(request: Request, data: dict = Body(...)):
758
  except Exception:
759
  topic = topic_hint or "general"
760
 
761
- # Moderation on incoming message
762
  flags = {}
763
  try:
764
  if moderator is not None:
@@ -771,43 +999,45 @@ async def chat(request: Request, data: dict = Body(...)):
771
  except Exception:
772
  pass
773
 
774
- # IMPORTANT: Do NOT auto-add incoming message to global knowledge.
775
- # We'll store into engine_user.user_memory only (personal).
776
-
777
- # Load knowledge entries for this topic only
778
  try:
779
  with engine_knowledge.begin() as conn:
780
- rows = conn.execute(sql_text(
781
- "SELECT id, text, reply, language, embedding FROM knowledge WHERE topic = :topic ORDER BY created_at DESC"
782
- ), {"topic": topic}).fetchall()
783
  except Exception as e:
784
  record_request(time.time() - t0)
785
  return JSONResponse(status_code=500, content={"error": "failed to read knowledge", "details": str(e)})
786
 
787
  knowledge_rows = [{"id": r[0], "text": r[1] or "", "reply": r[2] or "", "lang": r[3] or "und", "embedding": r[4]} for r in rows]
788
 
789
- # Retrieval (embedding-first then substring) restricted to topic
790
  matches: List[str] = []
791
  confidence = 0.0
792
  try:
793
  if embed_model is not None and knowledge_rows:
794
  texts = [kr["text"] for kr in knowledge_rows]
795
- embs = embed_model.encode(texts, convert_to_tensor=True)
796
- q_emb = embed_model.encode([en_msg], convert_to_tensor=True)[0]
797
- import torch
798
- scores = torch.nn.functional.cosine_similarity(q_emb.unsqueeze(0), embs)
799
- cand = []
800
- for i in range(scores.shape[0]):
801
- s = float(scores[i])
802
- kr = knowledge_rows[i]
803
- candidate_text = (kr["reply"] or kr["text"]).strip()
804
- if is_boilerplate_candidate(candidate_text):
805
- continue
806
- if s >= 0.30:
807
- cand.append({"text": candidate_text, "lang": kr["lang"], "score": s})
808
- cand = sorted(cand, key=lambda x: -x["score"])
809
- matches = [c["text"] for c in cand]
810
- confidence = cand[0]["score"] if cand else 0.0
 
 
 
 
 
 
 
811
  else:
812
  cand = []
813
  for kr in knowledge_rows:
@@ -822,7 +1052,7 @@ async def chat(request: Request, data: dict = Body(...)):
822
  logger.warning(f"[retrieval] error: {e}")
823
  matches = []
824
 
825
- # Compose reply from topic-only knowledge
826
  if matches and confidence >= 0.6:
827
  reply_en = matches[0]
828
  elif matches:
@@ -835,7 +1065,6 @@ async def chat(request: Request, data: dict = Body(...)):
835
  except Exception:
836
  pass
837
  reply_final = base
838
- # Persist user memory (even when no confident match), skipping toxic
839
  try:
840
  if not flags.get('toxic', False):
841
  with engine_user.begin() as conn:
@@ -844,20 +1073,18 @@ async def chat(request: Request, data: dict = Body(...)):
844
  "VALUES (:uid, :uname, :ip, :text, :reply, :lang, :mood, :conf, :topic, :source)"
845
  ), {"uid": user_id, "uname": username, "ip": user_ip, "text": raw_msg, "reply": reply_final, "lang": detected_lang,
846
  "mood": detect_mood(raw_msg + " " + reply_final), "conf": float(confidence), "topic": topic, "source": "chat"})
847
- # prune to last 10 per user
848
  conn.execute(sql_text(
849
- "DELETE FROM user_memory WHERE id NOT IN ("
850
- "SELECT id FROM user_memory WHERE user_id = :uid ORDER BY created_at DESC LIMIT 10) AND user_id = :uid"
851
  ), {"uid": user_id})
852
  except Exception as e:
853
  logger.debug(f"user_memory store error: {e}")
854
  record_request(time.time() - t0)
855
- return {"reply": reply_final, "topic": topic, "language": reply_lang, "emoji": "", "confidence": round(confidence, 2), "flags": flags}
856
 
857
- # Postprocess reply (dedupe)
858
  reply_en = dedupe_sentences(reply_en)
859
 
860
- # Ensure translation into user's language (robust)
861
  reply_final = reply_en
862
  lang_code = (reply_lang or "und").split("-")[0].lower()
863
  if lang_code not in ("en", "eng", "und", ""):
@@ -868,7 +1095,7 @@ async def chat(request: Request, data: dict = Body(...)):
868
  logger.warning(f"[translation] failed to translate reply_en -> {lang_code}: {exc}")
869
  reply_final = reply_en
870
 
871
- # Mood & emoji (non-intrusive)
872
  emoji = ""
873
  try:
874
  mood = detect_mood(raw_msg + " " + reply_final)
@@ -883,7 +1110,7 @@ async def chat(request: Request, data: dict = Body(...)):
883
  except Exception:
884
  emoji = ""
885
 
886
- # Persist user memory into DATABASE_URL only (engine_user) and prune to last 10
887
  try:
888
  if not flags.get('toxic', False):
889
  with engine_user.begin() as conn:
@@ -892,10 +1119,8 @@ async def chat(request: Request, data: dict = Body(...)):
892
  "VALUES (:uid, :uname, :ip, :text, :reply, :lang, :mood, :conf, :topic, :source)"
893
  ), {"uid": user_id, "uname": username, "ip": user_ip, "text": raw_msg, "reply": reply_final, "lang": detected_lang,
894
  "mood": detect_mood(raw_msg + " " + reply_final), "conf": float(confidence), "topic": topic, "source": "chat"})
895
- # prune to last 10 per user
896
  conn.execute(sql_text(
897
- "DELETE FROM user_memory WHERE id NOT IN ("
898
- "SELECT id FROM user_memory WHERE user_id = :uid ORDER BY created_at DESC LIMIT 10) AND user_id = :uid"
899
  ), {"uid": user_id})
900
  except Exception as e:
901
  logger.debug(f"user_memory persist error: {e}")
@@ -906,126 +1131,12 @@ async def chat(request: Request, data: dict = Body(...)):
906
  if include_steps:
907
  reply_final = f"{reply_final}\n\n[Debug: topic={topic} confidence={round(confidence,2)}]"
908
 
909
- return {"reply": reply_final, "topic": topic, "language": reply_lang, "emoji": emoji, "confidence": round(confidence, 2), "flags": flags}
910
 
911
  @app.post("/response")
912
  async def response_wrapper(request: Request, data: dict = Body(...)):
913
  return await chat(request, data)
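The prune statement in /chat keeps only each user's 10 newest user_memory rows. A standalone sqlite3 sketch of the same DELETE ... NOT IN (SELECT ... LIMIT 10) pattern (toy schema, not the app's full table):

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE user_memory (id INTEGER PRIMARY KEY, user_id TEXT, created_at INTEGER)")
    conn.executemany(
        "INSERT INTO user_memory (user_id, created_at) VALUES (?, ?)",
        [("u1", i) for i in range(15)],
    )
    # keep only the 10 newest rows for user u1
    conn.execute(
        "DELETE FROM user_memory WHERE id NOT IN ("
        "SELECT id FROM user_memory WHERE user_id = :uid ORDER BY created_at DESC LIMIT 10) AND user_id = :uid",
        {"uid": "u1"},
    )
    print(conn.execute("SELECT COUNT(*) FROM user_memory WHERE user_id = 'u1'").fetchone()[0])  # 10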
 
- @app.get("/leaderboard")
- async def leaderboard(topic: str = Query("general")):
-     t = str(topic or "general").strip() or "general"
-     try:
-         with engine_knowledge.begin() as conn:
-             rows = conn.execute(sql_text("""
-                 SELECT id, text, reply, language, category, confidence, created_at
-                 FROM knowledge
-                 WHERE topic = :topic
-                 ORDER BY confidence DESC, created_at DESC
-                 LIMIT 20
-             """), {"topic": t}).fetchall()
-         out = []
-         for r in rows:
-             text_en = r[1] or ""
-             lang = r[3] or "und"
-             display_text = text_en
-             if lang and lang not in ("en", "eng", "", "und"):
-                 try:
-                     display_text = translate_to_english(text_en, lang)
-                 except Exception:
-                     display_text = text_en
-             created_at = r[6]
-             out.append({
-                 "id": r[0],
-                 "text": display_text,
-                 "reply": r[2],
-                 "language": lang,
-                 "category": r[4],
-                 "confidence": round(r[5] or 0.0, 2),
-                 "created_at": created_at.isoformat() if hasattr(created_at, "isoformat") else str(created_at)
-             })
-         return {"topic": t, "top_20": out}
-     except Exception as e:
-         logger.exception("leaderboard failed")
-         return JSONResponse(status_code=500, content={"error": "failed to fetch leaderboard", "details": str(e)})
-
- @app.post("/reembed")
- async def reembed_all(data: dict = Body(...), x_admin_key: str = Header(None, alias="X-Admin-Key")):
-     if ADMIN_KEY is None:
-         return JSONResponse(status_code=403, content={"error": "Server not configured for admin operations."})
-     if x_admin_key != ADMIN_KEY:
-         return JSONResponse(status_code=403, content={"error": "Invalid admin key."})
-     if embed_model is None:
-         return JSONResponse(status_code=503, content={"error": "Embedding model not ready."})
-     confirm = str(data.get("confirm", "") or "").strip()
-     if confirm != "REEMBED":
-         return JSONResponse(status_code=400, content={"error": "confirm token required."})
-     batch_size = int(data.get("batch_size", 100))
-     try:
-         with engine_knowledge.begin() as conn:
-             rows = conn.execute(sql_text("SELECT id, text FROM knowledge ORDER BY id")).fetchall()
-         ids_texts = [(r[0], r[1]) for r in rows]
-         total = len(ids_texts)
-         updated = 0
-         for i in range(0, total, batch_size):
-             batch = ids_texts[i:i+batch_size]
-             texts = [t for _, t in batch]
-             embs = embed_model.encode(texts, convert_to_tensor=True)
-             for j, (kid, _) in enumerate(batch):
-                 emb_bytes = embs[j].cpu().numpy().tobytes()
-                 with engine_knowledge.begin() as conn:
-                     conn.execute(sql_text("UPDATE knowledge SET embedding = :e, updated_at = CURRENT_TIMESTAMP WHERE id = :id"), {"e": emb_bytes, "id": kid})
-                 updated += 1
-         return {"status": "✅ Re-embed complete", "total_rows": total, "updated": updated}
-     except Exception as e:
-         logger.exception("reembed failed")
-         return JSONResponse(status_code=500, content={"error": "reembed failed", "details": str(e)})
-
- @app.get("/model-status")
- async def model_status():
-     return {
-         "embed_loaded": embed_model is not None,
-         "ollama_cli": ollama_cli_available(),
-         "ollama_http": ollama_http_available(),
-         "moderator": moderator is not None,
-         "language_module": LANGUAGE_MODULE_AVAILABLE
-     }
-
- @app.get("/health")
- async def health():
-     try:
-         with engine_knowledge.connect() as c:
-             k = c.execute(sql_text("SELECT COUNT(*) FROM knowledge")).scalar() or 0
-     except Exception:
-         k = -1
-     try:
-         with engine_user.connect() as c:
-             u = c.execute(sql_text("SELECT COUNT(*) FROM user_memory")).scalar() or 0
-     except Exception:
-         u = -1
-     return {"ok": True, "knowledge_count": int(k), "user_memory_count": int(u), "uptime_s": round(time.time() - app_start_time, 2), "heartbeat": last_heartbeat}
-
- async def metrics_producer():
-     while True:
-         try:
-             import psutil
-             cpu = psutil.cpu_percent(interval=None)
-             mem = psutil.virtual_memory()
-             mem_percent = mem.percent
-         except Exception:
-             cpu = 0.0; mem_percent = 0.0
-         payload = {"time": datetime.utcnow().isoformat(), "cpu_percent": cpu, "memory_percent": mem_percent}
-         yield f"data: {json.dumps(payload)}\n\n"
-         await asyncio.sleep(1.0)
-
- @app.get("/metrics_stream")
- async def metrics_stream():
-     return StreamingResponse(metrics_producer(), media_type="text/event-stream", headers={"Cache-Control": "no-cache"})
-
- @app.get("/metrics_recent")
- async def metrics_recent(limit: int = Query(100, ge=1, le=600)):
-     return {"count": 0, "metrics": []}
-
  @app.post("/verify-admin")
1030
  async def verify_admin(x_admin_key: str = Header(None, alias="X-Admin-Key")):
1031
  if ADMIN_KEY is None:
@@ -1085,33 +1196,19 @@ async def frontend_dashboard():
1085
  html = html.replace("%%STARTUP_TIME%%", str(startup_time_local))
1086
  return HTMLResponse(html)
1087
 
1088
- # small helpers referenced above
1089
- def detect_mood(text: str) -> str:
1090
- lower = (text or "").lower()
1091
- positive = ["great", "thanks", "awesome", "happy", "love", "excellent", "cool", "yes", "good"]
1092
- negative = ["sad", "bad", "problem", "angry", "hate", "fail", "no", "error", "issue"]
1093
- if any(w in lower for w in positive):
1094
- return "positive"
1095
- if any(w in lower for w in negative):
1096
- return "negative"
1097
- return "neutral"
1098
-
1099
- def should_append_emoji(user_text: str, reply_text: str, mood: str, flags: Dict) -> str:
1100
- if flags.get("toxic"):
1101
- return ""
1102
- if EMOJIS_AVAILABLE:
1103
  try:
1104
- cat = get_category_for_mood(mood)
1105
- return get_emoji(cat, 0.6)
1106
  except Exception:
1107
- return ""
1108
- return ""
1109
-
1110
- if __name__ == "__main__":
1111
- try:
1112
- try_load_embed()
1113
- except Exception:
1114
- pass
1115
  app_start_time = time.time()
1116
  import uvicorn
1117
  port = int(os.environ.get("PORT", 7860))
 
  #!/usr/bin/env python3
  """
+ JusticeAI Backend — merged app.py
+
+ This file:
+ - Consolidates the JusticeAI backend (knowledge DB, user DB, /chat and other endpoints)
+ - Integrates Ollama topic inference (HTTP/CLI optional)
+ - Integrates optional embeddings (SentenceTransformer) and optional Helsinki translation models
+ - Adds a TTS /speak endpoint (voice cloning) using TTS.api (Coqui TTS) with optimizations for speed
+ - Keeps strict separation: user chat stored only in DATABASE_URL.user_memory and never used to mutate the global knowledge DB
+ - Prunes user_memory to the last 10 messages per user
+ - Attempts to minimize TTS latency by preloading, using GPU if available, using inference_mode / autocast,
+   and caching identical speaker samples by file hash
  """
 
  from sqlalchemy.pool import NullPool
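A client sketch for the /speak endpoint this merged file introduces (the multipart form fields text, voice_wav, and language come from the endpoint signature later in this diff; host/port and filenames are hypothetical):

    import requests

    with open("speaker_sample.wav", "rb") as f:
        resp = requests.post(
            "http://localhost:7860/speak",
            data={"text": "Hello from JusticeAI.", "language": "en"},
            files={"voice_wav": ("speaker_sample.wav", f, "audio/wav")},
            timeout=120,
        )
    with open("reply.wav", "wb") as out:
        out.write(resp.content)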
 
  import shutil
  import logging
  import random
+ import tempfile
+ import uuid
+ import asyncio
  from datetime import datetime, timezone
  from collections import deque
  from typing import Optional, Dict, Any, List
 
+ from fastapi import FastAPI, Request, Body, Query, Header, BackgroundTasks, File, UploadFile, Form, HTTPException, status
+ from fastapi.responses import JSONResponse, StreamingResponse, HTMLResponse, FileResponse
  from sqlalchemy import create_engine, text as sql_text
 
  # external helpers
  import requests
 
+ # ML libs (optional)
+ try:
+     import torch
+ except Exception:
+     torch = None
+
  try:
      from sentence_transformers import SentenceTransformer
  except Exception:
      SentenceTransformer = None
 
  try:
+     from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline as hf_pipeline
  except Exception:
      AutoTokenizer = None
      AutoModelForSeq2SeqLM = None
      hf_pipeline = None
 
+ # Optional TTS library (Coqui TTS)
+ try:
+     from TTS.api import TTS
+     TTS_AVAILABLE = True
+ except Exception:
+     TTS_AVAILABLE = False
+
  # Optional local modules
  try:
      import language as language_module  # type: ignore
 
  except Exception:
      detect_lang = None
 
+ # Moderator pipeline (optional)
  moderator = None
  try:
      if hf_pipeline is not None:
 
  except Exception:
      moderator = None
 
+ # Config (env)
  ADMIN_KEY = os.environ.get("ADMIN_KEY")
  DATABASE_URL = os.environ.get("DATABASE_URL", "sqlite:///justice_user.db")
  KNOWLEDGEDATABASE_URL = os.environ.get("KNOWLEDGEDATABASE_URL", DATABASE_URL)
 
  OLLAMA_HTTP_URL = os.environ.get("OLLAMA_HTTP_URL", "http://localhost:11434")
  OLLAMA_AUTO_PULL = os.environ.get("OLLAMA_AUTO_PULL", "0") in ("1", "true", "yes")
 
+ # TTS configuration and speed options
+ TTS_MODEL_NAME = os.environ.get("TTS_MODEL_NAME", "tts_models/multilingual/multi-dataset/xtts_v2")
+ TTS_DEVICE = os.environ.get("TTS_DEVICE", "cuda" if (torch is not None and torch.cuda.is_available()) else "cpu")
+ TTS_USE_HALF = os.environ.get("TTS_USE_HALF", "1") in ("1", "true", "yes")
+
+ # Non-TTS operation timeout (for blocking calls we choose to limit)
+ MODEL_TIMEOUT = float(os.environ.get("MODEL_TIMEOUT", "10"))
+
  # Logging
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger("justicebrain")
 
+ # Heartbeat & startup
  last_heartbeat = {"time": datetime.utcnow().replace(tzinfo=timezone.utc).isoformat(), "ok": True}
  app_start_time = time.time()
 
+ # Engines (separate DBs)
  engine_user = create_engine(
      DATABASE_URL,
      poolclass=NullPool,
 
 
  # --- Database schema setup ---
  def ensure_tables():
+     # knowledge table
      dialect_k = engine_knowledge.dialect.name
      with engine_knowledge.begin() as conn:
          if dialect_k == "sqlite":
 
                  updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
              );
          """))
+     # user memory table
      dialect_u = engine_user.dialect.name
      with engine_user.begin() as conn:
          if dialect_u == "sqlite":
 
  ensure_tables()
 
  def ensure_column_exists(table: str, column: str, col_def_sql: str):
      dialect = engine_user.dialect.name
      try:
 
          score += 0.1
      return max(-1.0, min(1.0, score / max(1, len(emojis))))
 
+ # --- Language detection & translation ---
  _translation_model_cache: Dict[str, Any] = {}
 
  def detect_language_safe(text: str) -> str:
 
              return out
      except Exception:
          pass
      src_code = (src or "und").split("-")[0].lower()
      tgt_code = (tgt or "und").split("-")[0].lower()
      if not re.fullmatch(r"[a-z]{2,3}", src_code) or not re.fullmatch(r"[a-z]{2,3}", tgt_code):
 
          return text
      return translate_text(text, "en", tgt)
 
+ # --- Embeddings utilities ---
  embed_model = None
  def try_load_embed():
      global embed_model
 
      except Exception:
          return None
 
+ # --- Helpers for running blocking code with a timeout (for non-TTS operations) ---
+ async def run_blocking_with_timeout(func, *args, timeout: float = MODEL_TIMEOUT):
+     loop = asyncio.get_running_loop()
+     fut = loop.run_in_executor(None, lambda: func(*args))
+     return await asyncio.wait_for(fut, timeout=timeout)
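A usage sketch for the helper above (the slow_job function and the 5-second timeout are illustrative, not part of this diff):

    import asyncio
    import time

    def slow_job(n: int) -> int:
        # stand-in for a blocking model call
        time.sleep(n)
        return n * 2

    async def main():
        try:
            result = await run_blocking_with_timeout(slow_job, 1, timeout=5.0)
            print(result)  # 2
        except asyncio.TimeoutError:
            print("timed out")

    asyncio.run(main())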
 
+ # --- Ollama helpers (HTTP & CLI) ---
  def ollama_cli_available() -> bool:
      return shutil.which("ollama") is not None
 
      except Exception:
          return False
 
+ def call_ollama_http(prompt: str, model: str = OLLAMA_MODEL, timeout_s: int = MODEL_TIMEOUT) -> Optional[str]:
      try:
          url = f"{OLLAMA_HTTP_URL}/api/generate"
          payload = {"model": model, "prompt": prompt, "max_tokens": 256}
          headers = {"Content-Type": "application/json"}
+         r = requests.post(url, json=payload, headers=headers, timeout=min(timeout_s, MODEL_TIMEOUT))
          if r.status_code == 200:
              try:
                  obj = r.json()
+                 for key in ("output", "text", "result", "generations"):
+                     if key in obj:
+                         return obj[key] if isinstance(obj[key], str) else json.dumps(obj[key])
                  return r.text
              except Exception:
                  return r.text
 
          logger.debug(f"ollama HTTP call failed: {e}")
      return None
 
+ def call_ollama_cli(prompt: str, model: str = OLLAMA_MODEL, timeout_s: int = MODEL_TIMEOUT) -> Optional[str]:
      if not ollama_cli_available():
          return None
      try:
+         proc = subprocess.run(["ollama", "run", model, "--prompt", prompt], capture_output=True, text=True, timeout=min(timeout_s, MODEL_TIMEOUT))
          if proc.returncode == 0:
              return proc.stdout.strip() or proc.stderr.strip()
          else:
 
          logger.debug(f"ollama CLI call exception: {e}")
      return None
 
+ def infer_topic_with_ollama(msg: str, topics: List[str], model: str = OLLAMA_MODEL, timeout_s: int = MODEL_TIMEOUT) -> Optional[str]:
      if not msg or not topics:
          return None
      topics_escaped = [t.replace('"','\\"') for t in topics]
 
          pass
      return None
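The body of infer_topic_with_ollama is elided by the hunk above; a sketch of how such a classification prompt could be driven through call_ollama_http (the prompt wording and topic list are illustrative, not the file's actual prompt):

    topics = ["contracts", "criminal-law", "general"]
    prompt = (
        "Pick the single best topic for the message below from this list: "
        + ", ".join(topics)
        + "\nMessage: Is a verbal agreement binding?\nAnswer with the topic name only."
    )
    raw = call_ollama_http(prompt, model=OLLAMA_MODEL, timeout_s=8)
    topic = raw.strip().lower() if raw else None
    print(topic if topic in topics else "general")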
 
+ # --- Boilerplate detection & reply synthesis helpers ---
+ def is_boilerplate_candidate(s: str) -> bool:
+     s_low = (s or "").strip().lower()
+     generic = ["i don't know", "not sure", "maybe", "perhaps", "justiceai is a unified intelligence dashboard"]
+     if len(s_low) < 8:
+         return True
+     return any(g in s_low for g in generic)
+
+ def generate_creative_reply(candidates: List[str]) -> str:
+     all_sent = []
+     seen = set()
+     for c in candidates:
+         for s in re.split(r'(?<=[.?!])\s+', c):
+             st = s.strip()
+             if not st or st in seen or is_boilerplate_candidate(st):
+                 continue
+             seen.add(st)
+             all_sent.append(st)
+     if not all_sent:
+         return "I don't have enough context yet — can you give more details?"
+     return "\n".join(all_sent[:5])
+
+ # --- TTS: optimized loader, caching speaker files ---
+ _tts_model = None
+ _tts_lock = threading.Lock()
+ _speaker_hash_cache: Dict[str, str] = {}
+ _tts_loaded_event = threading.Event()
+
+ def compute_file_sha256(path: str) -> str:
+     h = hashlib.sha256()
+     with open(path, "rb") as f:
+         while True:
+             b = f.read(8192)
+             if not b:
+                 break
+             h.update(b)
+     return h.hexdigest()
+
+ def get_tts_model_blocking():
+     global _tts_model
+     if not TTS_AVAILABLE:
+         raise RuntimeError("TTS.api not available on server")
+     with _tts_lock:
+         if _tts_model is None:
+             model_name = os.environ.get("TTS_MODEL_NAME", TTS_MODEL_NAME)
+             device = os.environ.get("TTS_DEVICE", TTS_DEVICE)
+             logger.info(f"[TTS] Loading model {model_name} on device {device}")
+             _tts_model = TTS(model_name)
+             try:
+                 if device and torch is not None:
+                     if device.startswith("cuda") and torch.cuda.is_available():
+                         try:
+                             _tts_model.to(device)
+                         except Exception:
+                             pass
+                         try:
+                             torch.backends.cudnn.benchmark = True
+                         except Exception:
+                             pass
+                         if TTS_USE_HALF:
+                             try:
+                                 if hasattr(_tts_model, "model") and hasattr(_tts_model.model, "half"):
+                                     _tts_model.model.half()
+                             except Exception:
+                                 pass
+                         try:
+                             torch.set_num_threads(int(os.environ.get("TORCH_NUM_THREADS", "4")))
+                         except Exception:
+                             pass
+                     else:
+                         try:
+                             torch.set_num_threads(int(os.environ.get("TORCH_NUM_THREADS", "4")))
+                         except Exception:
+                             pass
+             except Exception as e:
+                 logger.debug(f"[TTS] model device tuning warning: {e}")
+             logger.info("[TTS] model loaded")
+             _tts_loaded_event.set()
+         return _tts_model
+
+ def _save_upload_file_tmp(upload_file: UploadFile) -> str:
+     suffix = os.path.splitext(upload_file.filename)[1] or ".wav"
+     fd, tmp_path = tempfile.mkstemp(suffix=suffix, prefix="tts_speaker_")
+     os.close(fd)
+     with open(tmp_path, "wb") as f:
+         content = upload_file.file.read()
+         f.write(content)
+     return tmp_path
+
+ # Preload TTS in background at process start
+ if TTS_AVAILABLE:
+     threading.Thread(target=lambda: (get_tts_model_blocking()), daemon=True).start()
+
+ @app.post("/speak")
+ async def speak(
+     background_tasks: BackgroundTasks,
+     text: str = Form(...),
+     voice_wav: Optional[UploadFile] = File(None),
+     language: Optional[str] = Form(None),
+ ):
+     """
+     Generate speech for `text`. Optionally use an uploaded `voice_wav` (WAV) file as speaker sample.
+     This endpoint aims for speed by using a preloaded model and GPU/half precision if configured.
+     """
+     if not text or not text.strip():
+         raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Field 'text' is required")
+     if not TTS_AVAILABLE:
+         raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail="TTS engine not available")
+
+     speaker_path = None
+     speaker_hash = None
+     if voice_wav is not None:
+         try:
+             speaker_path = _save_upload_file_tmp(voice_wav)
+             speaker_hash = compute_file_sha256(speaker_path)
+             cached = _speaker_hash_cache.get(speaker_hash)
+             if cached and os.path.exists(cached):
+                 try:
+                     os.remove(speaker_path)
+                 except Exception:
+                     pass
+                 speaker_path = cached
+             else:
+                 _speaker_hash_cache[speaker_hash] = speaker_path
+         except Exception as e:
+             logger.exception("Failed to save uploaded voice sample")
+             raise HTTPException(status_code=500, detail="Failed to process uploaded voice sample")
+
+     out_fd, out_path = tempfile.mkstemp(suffix=".wav", prefix="tts_out_")
+     os.close(out_fd)
+     background_tasks.add_task(lambda p: os.path.exists(p) and os.remove(p), out_path)
+
+     try:
+         tts = get_tts_model_blocking()
+     except Exception as e:
+         logger.exception("[TTS] model load failed")
+         try:
+             if os.path.exists(out_path):
+                 os.remove(out_path)
+         except Exception:
+             pass
+         raise HTTPException(status_code=500, detail="Failed to load TTS model")
+
+     kwargs = {}
+     if speaker_path:
+         kwargs["speaker_wav"] = speaker_path
+     if language:
+         kwargs["language"] = language
+
+     try:
+         if torch is not None and torch.cuda.is_available() and TTS_USE_HALF:
+             try:
+                 with torch.inference_mode():
+                     with torch.cuda.amp.autocast():
+                         tts.tts_to_file(text=text, file_path=out_path, **kwargs)
+             except Exception as e:
+                 logger.debug(f"[TTS] autocast path failed: {e}, falling back")
+                 with torch.inference_mode():
+                     tts.tts_to_file(text=text, file_path=out_path, **kwargs)
+         else:
+             if torch is not None:
+                 with torch.inference_mode():
+                     tts.tts_to_file(text=text, file_path=out_path, **kwargs)
+             else:
+                 tts.tts_to_file(text=text, file_path=out_path, **kwargs)
+     except Exception as e:
+         logger.exception("[TTS] synthesis failed")
+         try:
+             if os.path.exists(out_path):
+                 os.remove(out_path)
+         except Exception:
+             pass
+         raise HTTPException(status_code=500, detail="TTS synthesis failed")
+
+     filename = f"speech-{uuid.uuid4().hex}.wav"
+     return FileResponse(path=out_path, filename=filename, media_type="audio/wav", background=background_tasks)
+
+ # --- Metrics & caches ---
  recent_request_times = deque()
  recent_learning_timestamps = deque()
  response_time_ema: Optional[float] = None
  EMA_ALPHA = 0.2
 
  def record_request(duration_s: float):
      global response_time_ema
 
      while recent_learning_timestamps and recent_learning_timestamps[0] < ts - 3600:
          recent_learning_timestamps.popleft()
 
+ # --- Startup event: warm up optional components ---
  @app.on_event("startup")
  async def startup_event():
+     logger.info("[JusticeAI] startup event beginning")
+     # Try to warmup embedding model quickly in background
+     if SentenceTransformer is not None:
+         def _warm_embed():
+             try:
+                 try_load_embed()
+                 logger.info("[startup] embed model warmup complete")
+             except Exception as e:
+                 logger.debug(f"[startup] embed warmup issue: {e}")
+         threading.Thread(target=_warm_embed, daemon=True).start()
+     # Optionally attempt ollama pull (best-effort)
      if OLLAMA_AUTO_PULL and ollama_cli_available():
          try:
              subprocess.run(["ollama", "pull", OLLAMA_MODEL], timeout=300)
 
              logger.debug(f"[startup] ollama pull failed: {e}")
      logger.info("[JusticeAI] startup complete")
 
+ # --- Knowledge management endpoints ---
  @app.post("/add")
  async def add_knowledge(data: dict = Body(...)):
      if not isinstance(data, dict):
 
          return JSONResponse(status_code=400, content={"error": "translation failed"})
      emb_bytes = None
      if embed_model is not None:
+         try:
+             emb_bytes = await run_blocking_with_timeout(lambda: embed_to_bytes(text_data), timeout=MODEL_TIMEOUT)
+         except Exception:
+             emb_bytes = None
      try:
          with engine_knowledge.begin() as conn:
              if emb_bytes:
 
                  conn.execute(sql_text(
                      "INSERT INTO knowledge (text, reply, language, category, topic, confidence, meta) "
                      "VALUES (:t, :r, :lang, 'manual', :topic, :conf, :meta)"
                  ), {"t": text_data, "r": reply, "lang": detected, "topic": topic, "conf": 0.9, "meta": json.dumps({"manual": True})})
          record_learn_event()
+         return {"status": "✅ Knowledge added", "text": text_data, "topic": topic, "language": detected}
      except Exception as e:
          logger.exception("add failed")
          return JSONResponse(status_code=500, content={"error": "failed to store knowledge", "details": str(e)})
 
          detected = detect_language_safe(text_data) or "und"
          if detected not in ("en", "eng", "und"):
              errors.append({"index": i, "error": "non-english; skip"}); continue
+         emb_bytes = None
+         if embed_model is not None:
+             try:
+                 emb_bytes = await run_blocking_with_timeout(lambda: embed_to_bytes(text_data), timeout=MODEL_TIMEOUT)
+             except Exception:
+                 emb_bytes = None
          with engine_knowledge.begin() as conn:
              if emb_bytes:
                  conn.execute(sql_text(
+                     "INSERT INTO knowledge (text, reply, language, embedding, category, topic) VALUES (:t, :r, :lang, :e, 'manual', :topic)"
                  ), {"t": text_data, "r": reply, "lang": "en", "e": emb_bytes, "topic": topic})
              else:
                  conn.execute(sql_text(
+                     "INSERT INTO knowledge (text, reply, language, category, topic) VALUES (:t, :r, :lang, 'manual', :topic)"
                  ), {"t": text_data, "r": reply, "lang": "en", "topic": topic})
          added += 1
      except Exception as e:
          logger.exception("add-bulk item error")
          errors.append({"index": i, "error": str(e)})
  if added:
 
      record_learn_event()
  return {"added": added, "errors": errors}
 
+ @app.get("/leaderboard")
+ async def leaderboard(topic: str = Query("general")):
+     t = str(topic or "general").strip() or "general"
+     try:
+         with engine_knowledge.begin() as conn:
+             rows = conn.execute(sql_text("""
+                 SELECT id, text, reply, language, category, confidence, created_at
+                 FROM knowledge
+                 WHERE topic = :topic
+                 ORDER BY confidence DESC, created_at DESC
+                 LIMIT 20
+             """), {"topic": t}).fetchall()
+         out = []
+         for r in rows:
+             text_en = r[1] or ""
+             lang = r[3] or "und"
+             display_text = text_en
+             if lang and lang not in ("en", "eng", "", "und"):
+                 try:
+                     display_text = translate_to_english(text_en, lang)
+                 except Exception:
+                     display_text = text_en
+             created_at = r[6]
+             out.append({
+                 "id": r[0],
+                 "text": display_text,
+                 "reply": r[2],
+                 "language": lang,
+                 "category": r[4],
+                 "confidence": round(r[5] or 0.0, 2),
+                 "created_at": created_at.isoformat() if hasattr(created_at, "isoformat") else str(created_at)
+             })
+         return {"topic": t, "top_20": out}
+     except Exception as e:
+         logger.exception("leaderboard failed")
+         return JSONResponse(status_code=500, content={"error": "failed to fetch leaderboard", "details": str(e)})
+
+ @app.post("/reembed")
+ async def reembed_all(data: dict = Body(...), x_admin_key: str = Header(None, alias="X-Admin-Key")):
+     if ADMIN_KEY is None:
+         return JSONResponse(status_code=403, content={"error": "Server not configured for admin operations."})
+     if x_admin_key != ADMIN_KEY:
+         return JSONResponse(status_code=403, content={"error": "Invalid admin key."})
+     if embed_model is None:
+         return JSONResponse(status_code=503, content={"error": "Embedding model not ready."})
+     confirm = str(data.get("confirm", "") or "").strip()
+     if confirm != "REEMBED":
+         return JSONResponse(status_code=400, content={"error": "confirm token required."})
+     batch_size = int(data.get("batch_size", 100))
+     try:
+         with engine_knowledge.begin() as conn:
+             rows = conn.execute(sql_text("SELECT id, text FROM knowledge ORDER BY id")).fetchall()
+         ids_texts = [(r[0], r[1]) for r in rows]
+         total = len(ids_texts)
+         updated = 0
+         for i in range(0, total, batch_size):
+             batch = ids_texts[i:i+batch_size]
+             texts = [t for _, t in batch]
+             try:
+                 embs = await run_blocking_with_timeout(lambda: embed_model.encode(texts, convert_to_tensor=True), timeout=MODEL_TIMEOUT)
+             except Exception:
+                 embs = None
+             if embs is None:
+                 continue
+             for j, (kid, _) in enumerate(batch):
+                 emb_bytes = embs[j].cpu().numpy().tobytes()
+                 with engine_knowledge.begin() as conn:
+                     conn.execute(sql_text("UPDATE knowledge SET embedding = :e, updated_at = CURRENT_TIMESTAMP WHERE id = :id"), {"e": emb_bytes, "id": kid})
+                 updated += 1
+         return {"status": "✅ Re-embed complete", "total_rows": total, "updated": updated}
+     except Exception as e:
+         logger.exception("reembed failed")
+         return JSONResponse(status_code=500, content={"error": "reembed failed", "details": str(e)})
+
+ @app.get("/model-status")
+ async def model_status():
+     return {
+         "embed_loaded": embed_model is not None,
+         "ollama_cli": ollama_cli_available(),
+         "ollama_http": ollama_http_available(),
+         "moderator": moderator is not None,
+         "language_module": LANGUAGE_MODULE_AVAILABLE,
+         "tts_available": TTS_AVAILABLE
+     }
+
+ @app.get("/health")
+ async def health():
+     try:
+         with engine_knowledge.connect() as c:
+             k = c.execute(sql_text("SELECT COUNT(*) FROM knowledge")).scalar() or 0
+     except Exception:
+         k = -1
+     try:
+         with engine_user.connect() as c:
+             u = c.execute(sql_text("SELECT COUNT(*) FROM user_memory")).scalar() or 0
+     except Exception:
+         u = -1
+     return {"ok": True, "knowledge_count": int(k), "user_memory_count": int(u), "uptime_s": round(time.time() - app_start_time, 2), "heartbeat": last_heartbeat}
+
  @app.post("/chat")
  async def chat(request: Request, data: dict = Body(...)):
      t0 = time.time()
+     # Accept "message" or "text"
      if isinstance(data, dict):
          raw_msg = str(data.get("message", "") or data.get("text", "") or "").strip()
      else:
 
      detected_lang = detect_language_safe(raw_msg)
      reply_lang = detected_lang if detected_lang and detected_lang != "und" else "en"
 
+     # Translate incoming to English for retrieval if needed
      en_msg = raw_msg
      if detected_lang not in ("en", "eng", "", "und"):
          try:
 
          except Exception:
              en_msg = raw_msg
 
+     # Determine topic: Ollama first, then embedding, then keyword
      topic = "general"
      try:
          if not topic_hint:
 
      except Exception:
          topic = topic_hint or "general"
 
+     # Moderation
      flags = {}
      try:
          if moderator is not None:
 
      except Exception:
          pass
 
+     # Load topic-scoped knowledge
      try:
          with engine_knowledge.begin() as conn:
+             rows = conn.execute(sql_text("SELECT id, text, reply, language, embedding FROM knowledge WHERE topic = :topic ORDER BY created_at DESC"), {"topic": topic}).fetchall()
      except Exception as e:
          record_request(time.time() - t0)
          return JSONResponse(status_code=500, content={"error": "failed to read knowledge", "details": str(e)})
 
      knowledge_rows = [{"id": r[0], "text": r[1] or "", "reply": r[2] or "", "lang": r[3] or "und", "embedding": r[4]} for r in rows]
 
+     # Retrieval (embedding-first)
      matches: List[str] = []
      confidence = 0.0
      try:
          if embed_model is not None and knowledge_rows:
              texts = [kr["text"] for kr in knowledge_rows]
+             try:
+                 embs = await run_blocking_with_timeout(lambda: embed_model.encode(texts, convert_to_tensor=True), timeout=MODEL_TIMEOUT)
+                 q_emb = await run_blocking_with_timeout(lambda: embed_model.encode([en_msg], convert_to_tensor=True)[0], timeout=MODEL_TIMEOUT)
+                 import torch as _torch
+                 scores = _torch.nn.functional.cosine_similarity(q_emb.unsqueeze(0), embs)
+                 cand = []
+                 for i in range(scores.shape[0]):
+                     s = float(scores[i])
+                     kr = knowledge_rows[i]
+                     candidate_text = (kr["reply"] or kr["text"]).strip()
+                     if is_boilerplate_candidate(candidate_text):
+                         continue
+                     if s >= 0.30:
+                         cand.append({"text": candidate_text, "lang": kr["lang"], "score": s})
+                 cand = sorted(cand, key=lambda x: -x["score"])
+                 matches = [c["text"] for c in cand]
+                 confidence = cand[0]["score"] if cand else 0.0
+             except asyncio.TimeoutError:
+                 logger.warning("[retrieval] embedding encode timed out")
+                 matches = []
+             except Exception as e:
+                 logger.warning(f"[retrieval] embedding error: {e}")
+                 matches = []
          else:
              cand = []
              for kr in knowledge_rows:
 
          logger.warning(f"[retrieval] error: {e}")
          matches = []
 
+     # Compose reply strictly from topic matches
      if matches and confidence >= 0.6:
          reply_en = matches[0]
      elif matches:
 
          except Exception:
              pass
          reply_final = base
          try:
              if not flags.get('toxic', False):
                  with engine_user.begin() as conn:
 
                          "VALUES (:uid, :uname, :ip, :text, :reply, :lang, :mood, :conf, :topic, :source)"
                      ), {"uid": user_id, "uname": username, "ip": user_ip, "text": raw_msg, "reply": reply_final, "lang": detected_lang,
                          "mood": detect_mood(raw_msg + " " + reply_final), "conf": float(confidence), "topic": topic, "source": "chat"})
                      conn.execute(sql_text(
+                         "DELETE FROM user_memory WHERE id NOT IN (SELECT id FROM user_memory WHERE user_id = :uid ORDER BY created_at DESC LIMIT 10) AND user_id = :uid"
                      ), {"uid": user_id})
          except Exception as e:
              logger.debug(f"user_memory store error: {e}")
          record_request(time.time() - t0)
+         return {"reply": reply_final, "topic": topic, "language": reply_lang, "emoji": "", "confidence": round(confidence,2), "flags": flags}
 
+     # Postprocess reply_en
      reply_en = dedupe_sentences(reply_en)
 
+     # Translate to user's language if needed
      reply_final = reply_en
      lang_code = (reply_lang or "und").split("-")[0].lower()
      if lang_code not in ("en", "eng", "und", ""):
 
              logger.warning(f"[translation] failed to translate reply_en -> {lang_code}: {exc}")
              reply_final = reply_en
 
+     # Mood & emoji
      emoji = ""
      try:
          mood = detect_mood(raw_msg + " " + reply_final)
 
      except Exception:
          emoji = ""
 
+     # Persist user memory (only in user DB) and prune to last 10
      try:
          if not flags.get('toxic', False):
              with engine_user.begin() as conn:
 
                      "VALUES (:uid, :uname, :ip, :text, :reply, :lang, :mood, :conf, :topic, :source)"
                  ), {"uid": user_id, "uname": username, "ip": user_ip, "text": raw_msg, "reply": reply_final, "lang": detected_lang,
                      "mood": detect_mood(raw_msg + " " + reply_final), "conf": float(confidence), "topic": topic, "source": "chat"})
                  conn.execute(sql_text(
+                     "DELETE FROM user_memory WHERE id NOT IN (SELECT id FROM user_memory WHERE user_id = :uid ORDER BY created_at DESC LIMIT 10) AND user_id = :uid"
                  ), {"uid": user_id})
      except Exception as e:
          logger.debug(f"user_memory persist error: {e}")
 
      if include_steps:
          reply_final = f"{reply_final}\n\n[Debug: topic={topic} confidence={round(confidence,2)}]"
 
+     return {"reply": reply_final, "topic": topic, "language": reply_lang, "emoji": emoji, "confidence": round(confidence,2), "flags": flags}
 
  @app.post("/response")
  async def response_wrapper(request: Request, data: dict = Body(...)):
      return await chat(request, data)
 
  @app.post("/verify-admin")
  async def verify_admin(x_admin_key: str = Header(None, alias="X-Admin-Key")):
      if ADMIN_KEY is None:
 
      html = html.replace("%%STARTUP_TIME%%", str(startup_time_local))
      return HTMLResponse(html)
 
+ # --- Start app ---
+ if __name__ == "__main__":
+     # preload embed and TTS in background
+     if TTS_AVAILABLE:
          try:
+             threading.Thread(target=lambda: get_tts_model_blocking(), daemon=True).start()
          except Exception:
+             pass
+     if SentenceTransformer is not None:
+         try:
+             threading.Thread(target=try_load_embed, daemon=True).start()
+         except Exception:
+             pass
      app_start_time = time.time()
      import uvicorn
      port = int(os.environ.get("PORT", 7860))