Princeaka committed on
Commit 491f92c · verified · 1 parent: 9f95176

Update app.py

Files changed (1):
  1. app.py  +333 −111

app.py CHANGED
@@ -111,6 +111,13 @@ try:
 except Exception:
     detect_lang = None
 
+# Optional fuzzy matching for spell tolerance
+try:
+    from difflib import SequenceMatcher
+    FUZZY_AVAILABLE = True
+except Exception:
+    FUZZY_AVAILABLE = False
+
 # Moderator pipeline (optional)
 moderator = None
 try:
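Note on the new import guard: difflib is part of the Python standard library, so SequenceMatcher should always be importable on a normal CPython install; the try/except mainly protects stripped-down runtimes. For reference, a minimal sketch of what the guard enables (ratio values are approximate):

from difflib import SequenceMatcher

# ratio() = 2*M / (len(a) + len(b)), where M counts the characters in the
# longest matching blocks found by difflib's heuristic.
print(SequenceMatcher(None, "install", "instal").ratio())   # ~0.92
print(SequenceMatcher(None, "billing", "payment").ratio())  # ~0.14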
@@ -643,6 +650,15 @@ def infer_topic_with_ollama(msg: str, topics: List[str], model: str = OLLAMA_MOD
 # -------------------------
 # Simple fallback topic inference (NEW)
 # -------------------------
+def fuzzy_match_score(s1: str, s2: str) -> float:
+    """
+    Calculate fuzzy match score between two strings (0.0 to 1.0).
+    Handles spell errors and variations.
+    """
+    if not FUZZY_AVAILABLE:
+        return 1.0 if s1.lower() == s2.lower() else 0.0
+    return SequenceMatcher(None, s1.lower(), s2.lower()).ratio()
+
 def infer_topic_from_message(msg: str, topics: List[str]) -> Optional[str]:
     """
     Fallback topic inference: tries keyword matching against topic names and
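How fuzzy_match_score behaves in practice, as a quick sketch (assumes FUZZY_AVAILABLE is True; the numbers are approximate SequenceMatcher ratios):

fuzzy_match_score("billing", "biling")     # ~0.92: one dropped letter still matches
fuzzy_match_score("security", "secuirty")  # ~0.88: transposed letters
fuzzy_match_score("billing", "install")    # ~0.43: unrelated words stay well below the 0.7/0.75 thresholds used below

When difflib is unavailable the function degrades to exact case-insensitive equality, so misspellings simply stop matching rather than raising.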
@@ -651,7 +667,8 @@ def infer_topic_from_message(msg: str, topics: List[str]) -> Optional[str]:
     if not msg or not topics:
         return None
     low = msg.lower()
-    # Try exact topic token matches
+
+    # Try exact topic token matches first
     for t in topics:
         if not t:
             continue
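For context, the exact-token pass splits each topic name on whitespace, hyphens, and underscores, then looks for each piece as a whole word in the lowercased message. A standalone illustration (the topic name is hypothetical):

import re

topic = "account-billing"
low = "i have a question about billing"
for w in re.split(r'[\s\-_]+', topic.lower()):
    if w and re.search(r'\b' + re.escape(w) + r'\b', low):
        print("matched token:", w)  # -> matched token: billing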
@@ -662,12 +679,36 @@ def infer_topic_from_message(msg: str, topics: List[str]) -> Optional[str]:
         for w in re.split(r'[\s\-_]+', token):
             if w and re.search(r'\b' + re.escape(w) + r'\b', low):
                 return t
+
+    # Try fuzzy matching for spell tolerance
+    if FUZZY_AVAILABLE:
+        best_match = None
+        best_score = 0.0
+        for t in topics:
+            if not t:
+                continue
+            token = str(t).lower()
+            # Check fuzzy match against whole message
+            score = fuzzy_match_score(token, low)
+            if score > 0.7 and score > best_score:
+                best_score = score
+                best_match = t
+            # Check fuzzy match against individual words
+            for word in low.split():
+                if len(word) > 3:  # Only check meaningful words
+                    score = fuzzy_match_score(token, word)
+                    if score > 0.75 and score > best_score:
+                        best_score = score
+                        best_match = t
+        if best_match:
+            return best_match
+
     # If no direct match, try heuristics: map some keywords to topics
     heuristics = {
-        "security": ["security", "vulnerability", "exploit", "attack", "auth"],
-        "billing": ["bill", "invoice", "payment", "charge"],
-        "installation": ["install", "setup", "deploy", "deployment"],
-        "general": ["help", "question", "how", "what", "why"]
+        "security": ["security", "vulnerability", "exploit", "attack", "auth", "password", "login"],
+        "billing": ["bill", "invoice", "payment", "charge", "price", "cost"],
+        "installation": ["install", "setup", "deploy", "deployment", "configure"],
+        "general": ["help", "question", "how", "what", "why", "issue", "problem"]
     }
     for topic, kws in heuristics.items():
         for kw in kws:
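The fuzzy pass above uses two thresholds: 0.7 against the whole message and a stricter 0.75 against individual words longer than three characters. A self-contained sketch of the per-word branch (topics and message are hypothetical):

topics = ["billing", "security"]
low = "problem with my biling"
best_match, best_score = None, 0.0
for t in topics:
    for word in low.split():
        if len(word) > 3:  # skip short stopwords
            score = fuzzy_match_score(t.lower(), word)
            if score > 0.75 and score > best_score:
                best_match, best_score = t, score
print(best_match, round(best_score, 2))  # -> billing 0.92

so the misspelled "biling" still routes to the billing topic.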
@@ -677,6 +718,70 @@ def infer_topic_from_message(msg: str, topics: List[str]) -> Optional[str]:
                 return topic
     return None
 
+def infer_topic_with_embeddings(msg: str, topics: List[str], knowledge_rows: List[dict]) -> Optional[str]:
+    """
+    Use cosine similarity on embeddings to infer the best matching topic.
+    This provides semantic understanding instead of just keyword matching.
+    """
+    if not embed_model or not topics or not knowledge_rows:
+        return None
+
+    try:
+        # Compute query embedding
+        q_emb = embed_model.encode([msg], convert_to_tensor=True, show_progress_bar=False)[0]
+
+        # Group knowledge by topic and compute average embedding per topic
+        topic_embeddings = {}
+        topic_counts = {}
+
+        for kr in knowledge_rows:
+            t = kr.get("topic", "general")
+            if t not in topics:
+                continue
+            emb_bytes = kr.get("embedding")
+            if emb_bytes is None:
+                continue
+            emb_tensor = bytes_to_tensor(emb_bytes)
+            if emb_tensor is None:
+                continue
+
+            if t not in topic_embeddings:
+                topic_embeddings[t] = emb_tensor
+                topic_counts[t] = 1
+            else:
+                topic_embeddings[t] = topic_embeddings[t] + emb_tensor
+                topic_counts[t] += 1
+
+        # Average the embeddings
+        for t in topic_embeddings:
+            topic_embeddings[t] = topic_embeddings[t] / topic_counts[t]
+
+        if not topic_embeddings:
+            return None
+
+        # Compute cosine similarity with each topic
+        best_topic = None
+        best_score = 0.0
+
+        for t, t_emb in topic_embeddings.items():
+            try:
+                score = float(torch.nn.functional.cosine_similarity(q_emb.unsqueeze(0), t_emb.unsqueeze(0), dim=1)[0])
+                if score > best_score:
+                    best_score = score
+                    best_topic = t
+            except Exception:
+                continue
+
+        # Only return if confidence is high enough
+        if best_score > 0.4:
+            logger.info(f"[topic inference] embedding-based: {best_topic} (score={best_score:.2f})")
+            return best_topic
+
+    except Exception as e:
+        logger.debug(f"[topic inference] embedding error: {e}")
+
+    return None
+
 # -------------------------
 # Boilerplate detection & reply helpers
 # -------------------------
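infer_topic_with_embeddings builds one centroid per topic (running sum divided by count over the stored vectors) and picks the topic whose centroid is most cosine-similar to the query, with a 0.4 floor. A self-contained sketch of that centroid-and-compare step with toy 4-dimensional tensors (real vectors come from embed_model and bytes_to_tensor):

import torch
import torch.nn.functional as F

rows = [
    {"topic": "billing",  "emb": torch.tensor([0.9, 0.1, 0.0, 0.0])},
    {"topic": "billing",  "emb": torch.tensor([0.8, 0.2, 0.0, 0.0])},
    {"topic": "security", "emb": torch.tensor([0.0, 0.1, 0.9, 0.2])},
]

# Running sum + count, then divide: the same averaging scheme as the diff.
sums, counts = {}, {}
for r in rows:
    sums[r["topic"]] = sums.get(r["topic"], torch.zeros(4)) + r["emb"]
    counts[r["topic"]] = counts.get(r["topic"], 0) + 1
centroids = {t: sums[t] / counts[t] for t in sums}

q = torch.tensor([0.85, 0.15, 0.0, 0.0])  # hypothetical query embedding
scores = {t: float(F.cosine_similarity(q.unsqueeze(0), c.unsqueeze(0), dim=1)[0])
          for t, c in centroids.items()}
best = max(scores, key=scores.get)
print(best, round(scores[best], 2))  # -> billing ~1.0

One caveat worth noting: averaging raw, unnormalized embeddings lets long or atypical entries dominate a centroid; normalizing each vector before summing is a common refinement.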
@@ -1440,29 +1545,59 @@ async def chat(request: Request, data: dict = Body(...)):
     except Exception:
         en_msg = raw_msg
 
-    # Determine topic: Ollama first, then embeddings, then keyword
+    # Load ALL knowledge entries first (needed for embedding-based topic inference)
+    try:
+        with engine_knowledge.begin() as conn:
+            all_rows = conn.execute(sql_text("SELECT id, text, reply, language, embedding, topic FROM knowledge ORDER BY created_at DESC")).fetchall()
+    except Exception as e:
+        record_request(time.time() - t0)
+        return JSONResponse(status_code=500, content={"error": "failed to read knowledge", "details": str(e)})
+
+    all_knowledge_rows = [{"id": r[0], "text": r[1] or "", "reply": r[2] or "", "lang": r[3] or "und", "embedding": r[4], "topic": r[5] or "general"} for r in all_rows]
+
+    # Get list of known topics
+    known_topics = list(set([kr.get("topic", "general") for kr in all_knowledge_rows if kr.get("topic")]))
+
+    # Determine topic: Embeddings first (best), then Ollama, then keyword matching
     topic = "general"
     try:
         if not topic_hint:
-            with engine_knowledge.begin() as conn:
-                rows = conn.execute(sql_text("SELECT DISTINCT topic FROM knowledge")).fetchall()
-            known_topics = [r[0] for r in rows if r and r[0]]
             chosen = None
-            try:
-                if (ollama_http_available() or ollama_cli_available()) and known_topics:
-                    possible = infer_topic_with_ollama(en_msg, known_topics)
-                    if possible:
-                        chosen = possible
-            except Exception:
-                chosen = None
+
+            # 1. Try embedding-based topic inference (BEST - semantic understanding)
+            if embed_model is not None and all_knowledge_rows:
+                try:
+                    chosen = infer_topic_with_embeddings(en_msg, known_topics, all_knowledge_rows)
+                    if chosen:
+                        logger.info(f"[topic] Selected via embeddings: {chosen}")
+                except Exception as e:
+                    logger.debug(f"[topic] embedding inference failed: {e}")
+
+            # 2. Fallback to Ollama if embeddings didn't work
+            if not chosen:
+                try:
+                    if (ollama_http_available() or ollama_cli_available()) and known_topics:
+                        possible = infer_topic_with_ollama(en_msg, known_topics)
+                        if possible:
+                            chosen = possible
+                            logger.info(f"[topic] Selected via Ollama: {chosen}")
+                except Exception as e:
+                    logger.debug(f"[topic] ollama inference failed: {e}")
+
+            # 3. Final fallback to keyword/fuzzy matching
             if not chosen:
-                # use our local fallback inference
                 chosen = infer_topic_from_message(en_msg, known_topics)
+                if chosen:
+                    logger.info(f"[topic] Selected via keyword/fuzzy: {chosen}")
+
             topic = chosen or "general"
         else:
            topic = topic_hint or "general"
-    except Exception:
+    except Exception as e:
+        logger.warning(f"[topic] inference error: {e}")
        topic = topic_hint or "general"
+
+    logger.info(f"[chat] Final topic: {topic}")
 
     # Moderation
     flags = {}
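The topic selection is now a three-stage fallback chain (embeddings, then Ollama, then keyword/fuzzy) in which the first non-empty answer wins and every stage is allowed to fail silently. The same pattern as a generic helper, with purely illustrative wiring:

from typing import Callable, List, Optional

def first_topic(msg: str, strategies: List[Callable[[str], Optional[str]]],
                default: str = "general") -> str:
    # Try each strategy in priority order; a raised exception or a
    # falsy result simply moves on to the next strategy.
    for strategy in strategies:
        try:
            result = strategy(msg)
            if result:
                return result
        except Exception:
            continue
    return default

# topic = first_topic(en_msg, [
#     lambda m: infer_topic_with_embeddings(m, known_topics, all_knowledge_rows),
#     lambda m: infer_topic_with_ollama(m, known_topics),
#     lambda m: infer_topic_from_message(m, known_topics),
# ])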
@@ -1477,107 +1612,179 @@ async def chat(request: Request, data: dict = Body(...)):
     except Exception:
         pass
 
-    # Load knowledge entries for this topic only
-    try:
-        with engine_knowledge.begin() as conn:
-            rows = conn.execute(sql_text("SELECT id, text, reply, language, embedding FROM knowledge WHERE topic = :topic ORDER BY created_at DESC"), {"topic": topic}).fetchall()
-    except Exception as e:
-        record_request(time.time() - t0)
-        return JSONResponse(status_code=500, content={"error": "failed to read knowledge", "details": str(e)})
-
-    knowledge_rows = [{"id": r[0], "text": r[1] or "", "reply": r[2] or "", "lang": r[3] or "und", "embedding": r[4]} for r in rows]
+    # Filter knowledge entries for this topic only
+    knowledge_rows = [kr for kr in all_knowledge_rows if kr.get("topic") == topic]
 
-    # Retrieval (embedding-first) - Optimized and now uses stored embeddings when available
+    # Retrieval using cosine similarity with spell tolerance
     matches: List[str] = []
     confidence = 0.0
+    match_lang = "en"
+
     try:
-        # If we have an embed model and any stored embeddings, prefer using stored embeddings to avoid recomputing
-        stored_embs = []
-        stored_indices = []
-        for i, kr in enumerate(knowledge_rows):
-            if kr.get("embedding") is not None:
-                t = bytes_to_tensor(kr["embedding"])
-                if t is not None:
-                    stored_embs.append(t)
-                    stored_indices.append(i)
-        # If we have stored embeddings and torch is available, compute similarity using them
-        if torch is not None and stored_embs and embed_model is not None:
-            try:
-                # Stack stored embeddings into a single tensor
-                embs_tensor = torch.stack(stored_embs)
-                # Compute query embedding
-                q_emb = await run_blocking_with_timeout(lambda: embed_model.encode([en_msg], convert_to_tensor=True, show_progress_bar=False)[0], timeout=MODEL_TIMEOUT)
-                if not isinstance(q_emb, torch.Tensor):
-                    try:
-                        q_emb = torch.from_numpy(q_emb.cpu().numpy())
-                    except Exception:
-                        pass
-                # Ensure shapes align: embs_tensor (N, dim), q_emb (dim,) -> unsqueeze
-                try:
-                    scores = torch.nn.functional.cosine_similarity(q_emb.unsqueeze(0), embs_tensor, dim=1)
-                except Exception:
-                    # Try alternative orientation
-                    scores = torch.nn.functional.cosine_similarity(embs_tensor, q_emb.unsqueeze(0), dim=1)
-                cand = []
-                for idx, s in enumerate(scores):
-                    i_orig = stored_indices[idx]
-                    kr = knowledge_rows[i_orig]
-                    candidate_text = (kr["reply"] or kr["text"]).strip()
-                    if is_boilerplate_candidate(candidate_text):
-                        continue
-                    s_float = float(s)
-                    if s_float >= 0.30:
-                        cand.append({"text": candidate_text, "lang": kr["lang"], "score": s_float})
-                cand = sorted(cand, key=lambda x: -x["score"])
-                matches = [c["text"] for c in cand]
-                confidence = float(cand[0]["score"]) if cand else 0.0
-            except asyncio.TimeoutError:
-                logger.warning("[retrieval] embedding encode timed out (query)")
-                matches = []
-            except Exception as e:
-                logger.warning(f"[retrieval] embedding (stored) error: {e}")
-                matches = []
-        # If we didn't find matches via stored embeddings, fallback to computing embeddings for all texts if embed_model available
-        if not matches and embed_model is not None and knowledge_rows:
-            texts = [kr["text"] for kr in knowledge_rows]
-            try:
-                # compute embeddings for texts and query
-                embs = await run_blocking_with_timeout(lambda: embed_model.encode(texts, convert_to_tensor=True, show_progress_bar=False), timeout=MODEL_TIMEOUT)
-                q_emb = await run_blocking_with_timeout(lambda: embed_model.encode([en_msg], convert_to_tensor=True, show_progress_bar=False)[0], timeout=MODEL_TIMEOUT)
-                import torch as _torch
-                try:
-                    scores = _torch.nn.functional.cosine_similarity(q_emb.unsqueeze(0), embs, dim=1)
-                except Exception:
-                    scores = _torch.nn.functional.cosine_similarity(embs, q_emb.unsqueeze(0), dim=1)
-                cand = []
-                for i in range(scores.shape[0]):
-                    s = float(scores[i])
-                    kr = knowledge_rows[i]
-                    candidate_text = (kr["reply"] or kr["text"]).strip()
-                    if is_boilerplate_candidate(candidate_text):
-                        continue
-                    if s >= 0.30:
-                        cand.append({"text": candidate_text, "lang": kr["lang"], "score": s})
-                cand = sorted(cand, key=lambda x: -x["score"])
-                matches = [c["text"] for c in cand]
-                confidence = float(cand[0]["score"]) if cand else 0.0
-            except asyncio.TimeoutError:
-                logger.warning("[retrieval] embedding encode timed out")
-                matches = []
-            except Exception as e:
-                logger.warning(f"[retrieval] embedding error: {e}")
-                matches = []
-        else:
-            # No embed model: fallback to simple keyword substring matching inside replies/text
-            cand = []
-            for kr in knowledge_rows:
-                txt = (kr["reply"] or kr["text"]) or ""
-                if en_msg.lower() in txt.lower():
-                    if is_boilerplate_candidate(txt):
-                        continue
-                    cand.append({"text": txt, "lang": kr["lang"], "score": 0.0})
-            matches = [c["text"] for c in cand]
-            confidence = 0.0
+        # If we have an embed model, use semantic similarity (BEST approach)
+        if embed_model is not None and knowledge_rows:
+            stored_embs = []
+            stored_indices = []
+
+            # Collect stored embeddings
+            for i, kr in enumerate(knowledge_rows):
+                if kr.get("embedding") is not None:
+                    t = bytes_to_tensor(kr["embedding"])
+                    if t is not None:
+                        stored_embs.append(t)
+                        stored_indices.append(i)
+
+            # Use stored embeddings if available
+            if torch is not None and stored_embs:
+                try:
+                    # Stack stored embeddings
+                    embs_tensor = torch.stack(stored_embs)
+
+                    # Compute query embedding
+                    q_emb = await run_blocking_with_timeout(
+                        lambda: embed_model.encode([en_msg], convert_to_tensor=True, show_progress_bar=False)[0],
+                        timeout=MODEL_TIMEOUT
+                    )
+
+                    if not isinstance(q_emb, torch.Tensor):
+                        q_emb = torch.from_numpy(q_emb.cpu().numpy())
+
+                    # Compute cosine similarity
+                    try:
+                        scores = torch.nn.functional.cosine_similarity(q_emb.unsqueeze(0), embs_tensor, dim=1)
+                    except Exception:
+                        scores = torch.nn.functional.cosine_similarity(embs_tensor, q_emb.unsqueeze(0), dim=1)
+
+                    # Collect candidates with scores
+                    cand = []
+                    for idx, s in enumerate(scores):
+                        i_orig = stored_indices[idx]
+                        kr = knowledge_rows[i_orig]
+                        candidate_text = (kr["reply"] or kr["text"]).strip()
+
+                        if is_boilerplate_candidate(candidate_text):
+                            continue
+
+                        s_float = float(s)
+                        # Lower threshold for better recall
+                        if s_float >= 0.25:
+                            cand.append({
+                                "text": candidate_text,
+                                "lang": kr["lang"],
+                                "score": s_float
+                            })
+
+                    # Sort by score
+                    cand = sorted(cand, key=lambda x: -x["score"])
+                    matches = [c["text"] for c in cand[:5]]  # Top 5 matches
+                    confidence = float(cand[0]["score"]) if cand else 0.0
+                    match_lang = cand[0]["lang"] if cand else "en"
+
+                    logger.info(f"[retrieval] Found {len(matches)} matches via embeddings, best score: {confidence:.2f}")
+
+                except asyncio.TimeoutError:
+                    logger.warning("[retrieval] embedding encode timed out")
+                except Exception as e:
+                    logger.warning(f"[retrieval] embedding error: {e}")
+
+            # Fallback: compute embeddings on the fly if no stored embeddings
+            if not matches and knowledge_rows:
+                try:
+                    texts = [kr["text"] for kr in knowledge_rows]
+                    embs = await run_blocking_with_timeout(
+                        lambda: embed_model.encode(texts, convert_to_tensor=True, show_progress_bar=False),
+                        timeout=MODEL_TIMEOUT
+                    )
+                    q_emb = await run_blocking_with_timeout(
+                        lambda: embed_model.encode([en_msg], convert_to_tensor=True, show_progress_bar=False)[0],
+                        timeout=MODEL_TIMEOUT
+                    )
+
+                    try:
+                        scores = torch.nn.functional.cosine_similarity(q_emb.unsqueeze(0), embs, dim=1)
+                    except Exception:
+                        scores = torch.nn.functional.cosine_similarity(embs, q_emb.unsqueeze(0), dim=1)
+
+                    cand = []
+                    for i in range(scores.shape[0]):
+                        s = float(scores[i])
+                        kr = knowledge_rows[i]
+                        candidate_text = (kr["reply"] or kr["text"]).strip()
+
+                        if is_boilerplate_candidate(candidate_text):
+                            continue
+
+                        if s >= 0.25:
+                            cand.append({
+                                "text": candidate_text,
+                                "lang": kr["lang"],
+                                "score": s
+                            })
+
+                    cand = sorted(cand, key=lambda x: -x["score"])
+                    matches = [c["text"] for c in cand[:5]]
+                    confidence = float(cand[0]["score"]) if cand else 0.0
+                    match_lang = cand[0]["lang"] if cand else "en"
+
+                    logger.info(f"[retrieval] Found {len(matches)} matches via on-the-fly embeddings, best score: {confidence:.2f}")
+
+                except asyncio.TimeoutError:
+                    logger.warning("[retrieval] embedding encode timed out")
+                except Exception as e:
+                    logger.warning(f"[retrieval] embedding error: {e}")
+
+        # Final fallback: fuzzy keyword matching with spell tolerance
+        if not matches and knowledge_rows:
+            logger.info("[retrieval] Using fuzzy keyword matching fallback")
+            cand = []
+
+            for kr in knowledge_rows:
+                txt = (kr["reply"] or kr["text"]) or ""
+                txt_lower = txt.lower()
+                msg_lower = en_msg.lower()
+
+                # Exact substring match
+                if msg_lower in txt_lower:
+                    if not is_boilerplate_candidate(txt):
+                        cand.append({"text": txt, "lang": kr["lang"], "score": 0.8})
+                    continue
+
+                # Fuzzy matching for spell tolerance
+                if FUZZY_AVAILABLE and len(en_msg) > 3:
+                    # Check fuzzy match against text
+                    fuzzy_score = fuzzy_match_score(en_msg, txt)
+                    if fuzzy_score > 0.6:
+                        if not is_boilerplate_candidate(txt):
+                            cand.append({"text": txt, "lang": kr["lang"], "score": fuzzy_score * 0.7})
+                        continue
+
+                    # Check fuzzy match against individual words
+                    msg_words = [w for w in msg_lower.split() if len(w) > 3]
+                    txt_words = [w for w in txt_lower.split() if len(w) > 3]
+
+                    for msg_word in msg_words:
+                        for txt_word in txt_words:
+                            word_score = fuzzy_match_score(msg_word, txt_word)
+                            if word_score > 0.75:
+                                if not is_boilerplate_candidate(txt):
+                                    cand.append({"text": txt, "lang": kr["lang"], "score": word_score * 0.5})
+                                break
+
+            # Remove duplicates and sort
+            seen = set()
+            unique_cand = []
+            for c in cand:
+                if c["text"] not in seen:
+                    seen.add(c["text"])
+                    unique_cand.append(c)
+
+            cand = sorted(unique_cand, key=lambda x: -x["score"])
+            matches = [c["text"] for c in cand[:5]]
+            confidence = float(cand[0]["score"]) if cand else 0.0
+            match_lang = cand[0]["lang"] if cand else "en"
+
+            logger.info(f"[retrieval] Found {len(matches)} matches via fuzzy matching, best score: {confidence:.2f}")
+
     except Exception as e:
         logger.warning(f"[retrieval] error: {e}")
         matches = []
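In the fuzzy fallback a single knowledge row can be appended more than once (a substring hit at 0.8 plus word-level hits at word_score * 0.5), so the dedupe step matters. The diff deduplicates in append order and then sorts; sorting first instead guarantees the surviving duplicate is the best-scoring one, as in this standalone sketch with hypothetical candidates:

cand = [
    {"text": "Reset your password from the login page.", "score": 0.80},
    {"text": "Invoices are emailed monthly.",            "score": 0.41},
    {"text": "Reset your password from the login page.", "score": 0.38},
]
cand = sorted(cand, key=lambda x: -x["score"])  # rank before deduping
seen, unique_cand = set(), []
for c in cand:
    if c["text"] not in seen:
        seen.add(c["text"])
        unique_cand.append(c)
print([round(c["score"], 2) for c in unique_cand])  # -> [0.8, 0.41]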
@@ -1612,17 +1819,32 @@ async def chat(request: Request, data: dict = Body(...)):
         record_request(time.time() - t0)
         return {"reply": reply_final, "topic": topic, "language": reply_lang, "emoji": "", "confidence": round(confidence,2), "flags": flags}
 
-    # Post-process and translate
+    # Post-process and translate back to user's language
     reply_en = dedupe_sentences(reply_en)
     reply_final = reply_en
-    lang_code = (reply_lang or "und").split("-")[0].lower()
-    if lang_code not in ("en", "eng", "und", ""):
+
+    # Determine target language for translation
+    target_lang = reply_lang if reply_lang and reply_lang not in ("en", "eng", "und", "") else None
+
+    # If match was in a different language, try to use that
+    if match_lang and match_lang not in ("en", "eng", "und", ""):
+        # If user's language matches the match language, use it
+        if target_lang and target_lang.split("-")[0].lower() == match_lang.split("-")[0].lower():
+            target_lang = match_lang
+
+    # Translate to user's language
+    if target_lang:
+        lang_code = target_lang.split("-")[0].lower()
         try:
+            logger.info(f"[translation] Translating reply from en to {lang_code}")
             reply_final = translate_from_english(reply_en, lang_code)
             reply_final = dedupe_sentences(reply_final)
+            logger.info(f"[translation] Translation successful")
         except Exception as exc:
             logger.warning(f"[translation] failed to translate reply_en -> {lang_code}: {exc}")
             reply_final = reply_en
+    else:
+        logger.info("[translation] No translation needed, using English")
 
     # Mood & emoji append
     emoji = ""
 
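One quirk in the new translation guard: target_lang is tested against ("en", "eng", "und", "") before the region subtag is stripped, so a full tag like "en-US" passes the filter and gets routed to an English-to-English translation, whereas the deleted code normalized the tag first. A small sketch of the normalize-first variant:

def needs_translation(tag: str) -> bool:
    # Reduce a BCP-47-style tag to its primary subtag before testing.
    primary = (tag or "und").split("-")[0].lower()
    return primary not in ("en", "eng", "und", "")

print(needs_translation("pt-BR"))  # True
print(needs_translation("en-US"))  # False (the raw-tag check above would say True)
print(needs_translation("und"))    # False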