chouchouvs committed (verified)
Commit 75159bc · 1 Parent(s): edeec3c

Update main.py

Files changed (1)
  1. main.py +133 -68
main.py CHANGED
@@ -7,9 +7,8 @@ import uuid
 import random
 import logging
 import hashlib
+import re
 import json
-import tempfile
-import threading
 from typing import List, Optional, Dict, Any, Tuple
 
 import numpy as np
@@ -72,15 +71,10 @@ WIPE_BEFORE_INDEX = os.getenv("WIPE_BEFORE_INDEX", "false").lower() in ("1","tru
 # --- Auth d’API de ce service (simple header) ---
 AUTH_TOKEN = os.getenv("REMOTE_INDEX_TOKEN", "").strip()
 
-# --- Jobs persistence / runtime behavior ---
-JOBS_FILE = os.getenv("JOBS_FILE", "/tmp/remote_index_jobs.json")
-RUN_INDEX_BLOCKING = os.getenv("RUN_INDEX_BLOCKING", "false").lower() in ("1","true","yes","on")
-
 LOG.info(f"Embeddings backend order = {EMB_BACKEND_ORDER}")
 LOG.info(f"HF pipeline URL = {HF_URL_PIPE}")
 LOG.info(f"HF models URL = {HF_URL_MODL}")
 LOG.info(f"VECTOR_STORE = {VECTOR_STORE}")
-LOG.info(f"JOBS_FILE = {JOBS_FILE} RUN_INDEX_BLOCKING = {RUN_INDEX_BLOCKING}")
 
 if "deepinfra" in EMB_BACKEND_ORDER and not DI_TOKEN:
     LOG.warning("DEEPINFRA_API_KEY manquant — tentatives DeepInfra échoueront.")
@@ -301,59 +295,19 @@ class StatusBody(BaseModel):
     job_id: str
 
 # ======================================================================================
-# Jobs store (persistence simple)
+# Jobs store (mémoire)
 # ======================================================================================
 JOBS: Dict[str, Dict[str, Any]] = {} # {job_id: {"status": "...", "logs": [...], "created": ts}}
-_jobs_lock = threading.Lock()
-
-def _atomic_write(path: str, data: str):
-    dirn = os.path.dirname(path) or "/tmp"
-    fd, tmp_path = tempfile.mkstemp(dir=dirn)
-    try:
-        with os.fdopen(fd, "w", encoding="utf-8") as f:
-            f.write(data)
-        os.replace(tmp_path, path)
-    finally:
-        if os.path.exists(tmp_path):
-            try:
-                os.unlink(tmp_path)
-            except Exception:
-                pass
-
-def _load_jobs():
-    global JOBS
-    try:
-        if os.path.exists(JOBS_FILE):
-            with open(JOBS_FILE, "r", encoding="utf-8") as f:
-                JOBS = json.load(f)
-            LOG.info("Jobs loaded from %s (%d entries)", JOBS_FILE, len(JOBS))
-    except Exception as e:
-        LOG.warning("Could not load jobs file %s: %s", JOBS_FILE, e)
-
-def _save_jobs():
-    try:
-        with _jobs_lock:
-            _atomic_write(JOBS_FILE, json.dumps(JOBS, ensure_ascii=False))
-    except Exception as e:
-        LOG.warning("Could not save jobs file %s: %s", JOBS_FILE, e)
-
-# load existing jobs at startup
-_load_jobs()
 
 def _append_log(job_id: str, line: str):
     job = JOBS.get(job_id)
-    if job is not None:
-        job.setdefault("logs", []).append(line)
-        # keep logs bounded to avoid huge files
-        if len(job["logs"]) > 5000:
-            job["logs"] = job["logs"][-5000:]
-        _save_jobs()
+    if job:
+        job["logs"].append(line)
 
 def _set_status(job_id: str, status: str):
     job = JOBS.get(job_id)
-    if job is not None:
+    if job:
         job["status"] = status
-        _save_jobs()
 
 def _auth(x_auth: Optional[str]):
     if AUTH_TOKEN and (x_auth or "") != AUTH_TOKEN:
@@ -531,8 +485,50 @@ def _chunk_with_spans(text: str, size: int, overlap: int):
         if i >= n:
             break
 
+def _clean_chunk_text(text: str) -> str:
+    """
+    Nettoyage simple des fragments JSON / artefacts:
+    - supprime un champ "indexed_at" tronqué à la fin,
+    - supprime accolades/caractères isolés en début/fin,
+    - compacte sauts de ligne multiples,
+    - tente d'extraire des valeurs textuelles si le chunk ressemble fortement à du JSON.
+    """
+    if not text:
+        return text
+    t = text.strip()
+
+    # retirer un suffixe typique: , "indexed_at": "2025-..."}}
+    t = re.sub(r',\s*"indexed_at"\s*:\s*"[^"]*"\s*}+\s*$', '', t, flags=re.IGNORECASE)
+
+    # retirer d'autres clés timestamps communes à la fin si tronquées
+    t = re.sub(r',\s*"(created_at|timestamp|time|date)"\s*:\s*"[^"]*"\s*}+\s*$', '', t, flags=re.IGNORECASE)
+
+    # retirer accolades ou crochets seuls en début/fin
+    t = re.sub(r'^[\s\]\}\,]+', '', t)
+    t = re.sub(r'[\s\]\}\,]+$', '', t)
+
+    # si le chunk ressemble majoritairement à du JSON (beaucoup de ":" ou "{"), essayer d'en extraire les valeurs textuelles
+    if t.count(':') >= 3 and (t.count('{') + t.count('}')) >= 1:
+        try:
+            j = json.loads(t)
+            if isinstance(j, dict):
+                # concatène les valeurs textuelles pertinentes
+                vals = []
+                for v in j.values():
+                    if isinstance(v, (str, int, float)):
+                        vals.append(str(v))
+                if vals:
+                    t = " ".join(vals)
+        except Exception:
+            # ignore, on garde t tel quel
+            pass
+
+    # compacter sauts de ligne
+    t = re.sub(r'\n{3,}', '\n\n', t)
+    return t.strip()
+
 # ======================================================================================
-# Background task : indexation — VERSION CORRIGÉE (persistance JOBS)
+# Background task : indexation — VERSION CORRIGÉE
 # ======================================================================================
 
 def run_index_job(job_id: str, req: IndexRequest):
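A quick way to see what the new helper is meant to strip; a minimal sketch, assuming main.py is importable as a module named main (the module name and the sample strings are illustrative, not from the commit):

from main import _clean_chunk_text  # assumption: main.py is on the import path

# Truncated trailing "indexed_at" field is removed
assert _clean_chunk_text('Installation du projet, "indexed_at": "2025-06-01T12:00:00Z"}}') == "Installation du projet"

# Stray braces at the edges are dropped and blank-line runs are compacted
assert _clean_chunk_text('}} \nREADME content\n\n\n\nMore text') == "README content\n\nMore text"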
@@ -672,21 +668,97 @@ def run_index_job(job_id: str, req: IndexRequest):
                 continue
 
             ext = os.path.splitext(path)[1].lower()
-            if ext not in TEXT_EXTS:
-                _append_log(job_id, f"📁 Ignored: {path} (extension non supportée: {ext})")
-                continue
-
-            text = f.text.strip()
-            if len(text) < 50: # ✅ Ignorer les fichiers trop courts
-                _append_log(job_id, f"📄 Ignored: {path} (texte trop court: {len(text)} chars)")
+            text = f.text or ""
+            if len(text.strip()) < 50: # Ignorer les fichiers trop courts
+                _append_log(job_id, f"📄 Ignored: {path} (texte trop court: {len(text.strip())} chars)")
                 continue
 
             _append_log(job_id, f"📄 Processing: {path} ({len(text)} chars)")
 
+            # --- traitement spécial JSON / NDJSON ---
+            if ext in {".json"} or path.lower().endswith(".ndjson"):
+                # essayer JSON complet
+                handled = False
+                try:
+                    parsed = json.loads(text)
+                    # si c'est une liste -> indexer chaque entrée séparément
+                    if isinstance(parsed, list):
+                        for idx, obj in enumerate(parsed):
+                            if isinstance(obj, dict):
+                                s = " ".join(str(v) for v in obj.values() if isinstance(v, (str, int, float)))
+                            else:
+                                s = str(obj)
+                            s = _clean_chunk_text(s)
+                            if len(s) < 30:
+                                continue
+                            meta = {"path": path, "chunk": idx, "start": 0, "end": len(s)}
+                            if req.store_text:
+                                meta["text"] = s
+                            buf_chunks.append(s); buf_metas.append(meta)
+                            if len(buf_chunks) >= req.batch_size:
+                                _flush()
+                        handled = True
+                    elif isinstance(parsed, dict):
+                        s = " ".join(str(v) for v in parsed.values() if isinstance(v, (str, int, float)))
+                        s = _clean_chunk_text(s)
+                        if len(s) >= 30:
+                            meta = {"path": path, "chunk": 0, "start": 0, "end": len(s)}
+                            if req.store_text:
+                                meta["text"] = s
+                            buf_chunks.append(s); buf_metas.append(meta)
+                            if len(buf_chunks) >= req.batch_size:
+                                _flush()
+                        handled = True
+                except Exception:
+                    # fallback NDJSON: une ligne == un JSON
+                    try:
+                        lines = [L for L in text.splitlines() if L.strip()]
+                        for li, line in enumerate(lines):
+                            try:
+                                obj = json.loads(line)
+                                if isinstance(obj, dict):
+                                    s = " ".join(str(v) for v in obj.values() if isinstance(v, (str, int, float)))
+                                else:
+                                    s = str(obj)
+                                s = _clean_chunk_text(s)
+                                if len(s) < 30:
+                                    continue
+                                meta = {"path": path, "chunk": li, "start": 0, "end": len(s)}
+                                if req.store_text:
+                                    meta["text"] = s
+                                buf_chunks.append(s); buf_metas.append(meta)
+                                if len(buf_chunks) >= req.batch_size:
+                                    _flush()
+                            except Exception:
+                                # ligne non JSON -> indexer comme texte si longue
+                                sl = line.strip()
+                                if len(sl) >= 30:
+                                    sl = _clean_chunk_text(sl)
+                                    meta = {"path": path, "chunk": li, "start": 0, "end": len(sl)}
+                                    if req.store_text:
+                                        meta["text"] = sl
+                                    buf_chunks.append(sl); buf_metas.append(meta)
+                                    if len(buf_chunks) >= req.batch_size:
+                                        _flush()
+                        handled = True
+                    except Exception:
+                        handled = False
+
+                if handled:
+                    _flush()
+                    _append_log(job_id, f"File done: {path}")
+                    continue # passe au fichier suivant
+
+            # --- traitement normal pour fichiers texte ---
             for ci, (start, end, chunk_txt) in enumerate(_chunk_with_spans(text, req.chunk_size, req.overlap)):
                 chunk_txt = chunk_txt.strip()
                 if len(chunk_txt) < 30: # ✅ Ignorer les chunks trop courts
                     continue
+                # nettoyage pour éviter artefacts JSON / timestamps
+                chunk_txt = _clean_chunk_text(chunk_txt)
+                if len(chunk_txt) < 30:
+                    continue
+
                 buf_chunks.append(chunk_txt)
                 meta = {
                     "path": path,
@@ -754,16 +826,9 @@ def start_index(req: IndexRequest, background_tasks: BackgroundTasks, x_auth_tok
     _check_backend_ready()
     job_id = uuid.uuid4().hex[:12]
     JOBS[job_id] = {"status": "queued", "logs": [], "created": time.time()}
-    _save_jobs()
     LOG.info(f"Created job {job_id} for project {req.project_id}")
     _append_log(job_id, f"Job created: {job_id} project={req.project_id}")
-
-    if RUN_INDEX_BLOCKING:
-        # utile pour environnements où les BackgroundTasks ne persistent pas / sont killés
-        run_index_job(job_id, req)
-    else:
-        background_tasks.add_task(run_index_job, job_id, req)
-
+    background_tasks.add_task(run_index_job, job_id, req)
     return {"job_id": job_id}
 
 # --- 3 variantes pour /status ---
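For orientation, a hedged client-side sketch of creating a job against this endpoint. The base URL and token are placeholders, the x-auth-token header name follows FastAPI's default mapping of the x_auth_token parameter, and the payload fields other than project_id are inferred from the attributes run_index_job reads; the exact IndexRequest schema is not shown in this diff:

import requests  # assumption: requests is available on the client side

BASE = "http://localhost:7860"              # placeholder URL
HEADERS = {"x-auth-token": "my-secret"}     # only enforced when REMOTE_INDEX_TOKEN is set

payload = {
    "project_id": "demo",
    "files": [{"path": "docs/readme.md", "text": "contenu du fichier..."}],  # field names inferred, not confirmed by this diff
    "chunk_size": 1000,
    "overlap": 100,
    "batch_size": 64,
    "store_text": True,
}
r = requests.post(f"{BASE}/index", json=payload, headers=HEADERS, timeout=30)
job_id = r.json()["job_id"]   # response shape confirmed by the diff
print("job:", job_id)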
@@ -774,7 +839,7 @@ def status_path(job_id: str, x_auth_token: Optional[str] = Header(default=None))
     if not j:
         # Response JSON plus explicite pour faciliter le debug côté client
         raise HTTPException(status_code=404, detail={"error": "job inconnu", "advice": "POST /index to create a new job"})
-    return {"status": j["status"], "logs": j.get("logs", [])[-1500:]}
+    return {"status": j["status"], "logs": j["logs"][-1500:]}
 
 @app.get("/status")
 def status_query(job_id: str = Query(...), x_auth_token: Optional[str] = Header(default=None)):
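A matching polling sketch against the query-parameter variant shown above (GET /status with job_id in the query string). The "queued" status and the 1500-line log cap come from the diff; the other status names are assumptions:

import time
import requests  # assumption: requests is available on the client side

def wait_for_job(base: str, job_id: str, token: str = "") -> dict:
    headers = {"x-auth-token": token} if token else {}
    while True:
        r = requests.get(f"{base}/status", params={"job_id": job_id}, headers=headers, timeout=10)
        r.raise_for_status()
        body = r.json()  # {"status": "...", "logs": [...]} with logs capped to the last 1500 lines
        if body["status"] not in ("queued", "running"):  # "running" and terminal names are assumed
            return body
        time.sleep(2)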
 
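The JSON/NDJSON branch added to run_index_job above boils down to a three-step parse order: try the whole payload as JSON, then one JSON object per line, then keep raw text lines. A standalone sketch of that order only (it omits the value flattening and the _clean_chunk_text pass that the committed code applies):

import json

def iter_json_records(text: str):
    """Illustrative parse order: full JSON document, then NDJSON (one object per line), then raw lines."""
    try:
        parsed = json.loads(text)
        # a top-level list yields one record per entry, anything else is a single record
        yield from (parsed if isinstance(parsed, list) else [parsed])
        return
    except Exception:
        pass
    for line in text.splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            yield json.loads(line)  # NDJSON record
        except Exception:
            yield line  # keep non-JSON lines as raw text

# Hypothetical NDJSON payload
sample = '{"title": "Doc A", "body": "Bonjour"}\n{"title": "Doc B", "body": "Monde"}'
print(list(iter_json_records(sample)))  # two dicts, one per line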