chouchouvs committed on
Commit 8a1a757 · verified · 1 Parent(s): 6eb5a6e

Update main.py

Files changed (1)
  1. main.py +179 -230
main.py CHANGED
@@ -1,24 +1,28 @@
1
  # -*- coding: utf-8 -*-
2
  """
3
- Version optimisée du module FAISS :
4
- - Réduction de la dimension des vecteurs (EMB_DIM, configurable)
5
- - Index quantisé **IVF‑PQ** (faible empreinte disque)
6
- - Chargement *on‑disk* (mmap) pour limiter la RAM
7
- - Option `store_text` : ne pas persister le texte brut dans le dataset
8
- - Compression gzip des artefacts exportés
9
- - Paramètres contrôlables via variables d’environnement
10
  """
11
 
12
  from __future__ import annotations
 
13
  import os
14
  import io
15
  import json
16
  import time
17
- import tarfile
18
- import logging
19
  import hashlib
 
 
 
20
  from typing import List, Dict, Any, Tuple, Optional
21
 
 
 
22
  import numpy as np
23
  import faiss
24
  from fastapi import FastAPI, HTTPException
@@ -26,132 +30,141 @@ from fastapi.middleware.cors import CORSMiddleware
26
  from fastapi.responses import JSONResponse, StreamingResponse
27
  from pydantic import BaseModel
28
 
29
- # --------------------------------------------------------------------------- #
30
- # CONFIGURATION (variables d’environnement – modifiable à la volée)
31
- # --------------------------------------------------------------------------- #
32
- EMB_PROVIDER = os.getenv("EMB_PROVIDER", "dummy").strip().lower()
33
- EMB_MODEL = os.getenv("EMB_MODEL", "sentence-transformers/all-mpnet-base-v2").strip()
34
- EMB_BATCH = int(os.getenv("EMB_BATCH", "32"))
35
- EMB_DIM = int(os.getenv("EMB_DIM", "64")) # ← dimension réduite (ex. 64)
36
-
37
- # FAISS quantisation
38
- FAISS_TYPE = os.getenv("FAISS_TYPE", "IVF_PQ").upper() # FLAT ou IVF_PQ
39
- FAISS_NLIST = int(os.getenv("FAISS_NLIST", "100")) # nb de centroides (IVF)
40
- FAISS_M = int(os.getenv("FAISS_M", "8")) # sous‑vecteurs (PQ)
41
- FAISS_NBITS = int(os.getenv("FAISS_NBITS", "8")) # bits / sous‑vecteur
42
-
43
- # Stockage du texte brut dans le dataset ? (False → économise disque)
44
- STORE_TEXT = os.getenv("STORE_TEXT", "false").lower() in ("1", "true", "yes")
45
 
46
  # --------------------------------------------------------------------------- #
47
  # LOGGING
48
  # --------------------------------------------------------------------------- #
49
- LOG = logging.getLogger("appli_v1")
50
  if not LOG.handlers:
51
  h = logging.StreamHandler()
52
- h.setFormatter(logging.Formatter("[%(levelname)s] %(asctime)s - %(message)s", "%H:%M:%S"))
53
  LOG.addHandler(h)
54
  LOG.setLevel(logging.INFO)
55
 
 
56
  # --------------------------------------------------------------------------- #
57
- # UTILITAIRES
58
  # --------------------------------------------------------------------------- #
59
- def list_repo_files(repo_dir: str, top_k: int = 500) -> List[str]:
60
- """
61
- Retourne la liste des fichiers texte du dépôt, en respectant .gitignore
62
- (via Git si disponible, sinon fallback os.walk).
63
- """
64
- if not os.path.isdir(repo_dir):
65
- return []
66
 
67
- files: List[str] = []
68
- try:
69
- from git import Repo
70
- repo = Repo(repo_dir)
71
-
72
- # fichiers trackés
73
- tracked = repo.git.ls_files().splitlines()
74
- files.extend(tracked)
75
-
76
- # fichiers non‑trackés mais non ignorés
77
- untracked = repo.git.ls_files(others=True, exclude_standard=True).splitlines()
78
- files.extend(untracked)
79
-
80
- # filtrage simple
81
- files = [
82
- f for f in files
83
- if not f.startswith('.git/') and not any(p.startswith('.') for p in f.split(os.sep))
84
- ]
85
- files = sorted(set(files))[:top_k]
86
- except Exception as e:
87
- LOG.debug("Git indisponible / pas un dépôt → fallback os.walk : %s", e)
88
- for root, _, names in os.walk(repo_dir):
89
- for name in sorted(names):
90
- if name.startswith('.'):
91
- continue
92
- rel = os.path.relpath(os.path.join(root, name), repo_dir)
93
- if rel.startswith('.git') or any(p.startswith('.') for p in rel.split(os.sep)):
94
- continue
95
- files.append(rel)
96
- if len(files) >= top_k:
97
- break
98
- if len(files) >= top_k:
99
- break
100
- files = sorted(set(files))
101
-
102
- return files
103
-
104
-
105
- def read_file_safe(file_path: str) -> str:
106
- """Lit un fichier en UTF‑8, ignore les erreurs."""
107
- try:
108
- with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
109
- return f.read()
110
- except Exception as e:
111
- LOG.error("Erreur lecture %s : %s", file_path, e)
112
- return f"# Erreur lecture : {e}"
113
 
 
114
 
115
- def write_file_safe(file_path: str, content: str) -> str:
116
- """Écrit un fichier, crée les dossiers parents si besoin."""
117
- try:
118
- os.makedirs(os.path.dirname(file_path), exist_ok=True)
119
- with open(file_path, "w", encoding="utf-8") as f:
120
- f.write(content)
121
- return f"✅ Fichier sauvegardé : {os.path.basename(file_path)}"
122
- except Exception as e:
123
- LOG.error("Erreur écriture %s : %s", file_path, e)
124
- return f"❌ Erreur sauvegarde : {e}"
 
 
125
 
 
126
 
127
  # --------------------------------------------------------------------------- #
128
- # FAKE / DUMMY FAISS (pour compatibilité)
129
  # --------------------------------------------------------------------------- #
130
- class DummyFAISS:
131
- """Classe factice – aucune fonctionnalité réelle."""
132
- pass
133
 
 
 
134
 
135
- def create_faiss_index(*_, **__) -> DummyFAISS:
136
- LOG.warning("FAISS désactivé – utilisation du client distant")
137
- return DummyFAISS()
 
 
 
 
138
 
 
 
 
 
139
 
140
- def search_faiss_index(*_, **__) -> List[Any]:
141
- LOG.warning("FAISS désactivé – utilisation du client distant")
142
- return []
143
 
 
144
 
145
  # --------------------------------------------------------------------------- #
146
  # EMBEDDING PROVIDERS
147
  # --------------------------------------------------------------------------- #
148
- _ST_MODEL: Optional[Any] = None
149
- _HF_TOKENIZER: Optional[Any] = None
150
- _HF_MODEL: Optional[Any] = None
151
-
152
 
153
  def _emb_dummy(texts: List[str], dim: int = EMB_DIM) -> np.ndarray:
154
- """Vecteurs aléatoires déterministes (SHA‑1 → seed)."""
155
  vecs = np.zeros((len(texts), dim), dtype="float32")
156
  for i, t in enumerate(texts):
157
  h = hashlib.sha1((t or "").encode("utf-8")).digest()
@@ -160,16 +173,14 @@ def _emb_dummy(texts: List[str], dim: int = EMB_DIM) -> np.ndarray:
160
  vecs[i] = v / (np.linalg.norm(v) + 1e-9)
161
  return vecs
162
 
163
-
164
  def _get_st_model():
165
  global _ST_MODEL
166
  if _ST_MODEL is None:
167
  from sentence_transformers import SentenceTransformer
168
- _ST_MODEL = SentenceTransformer(EMB_MODEL, cache_folder=os.getenv("HF_HOME", "/tmp/.cache/huggingface"))
169
- LOG.info("[st] modèle chargé : %s", EMB_MODEL)
170
  return _ST_MODEL
171
 
172
-
173
  def _emb_st(texts: List[str]) -> np.ndarray:
174
  model = _get_st_model()
175
  vecs = model.encode(
@@ -181,25 +192,22 @@ def _emb_st(texts: List[str]) -> np.ndarray:
181
  ).astype("float32")
182
  return vecs
183
 
184
-
185
  def _get_hf_model():
186
  global _HF_TOKENIZER, _HF_MODEL
187
  if _HF_MODEL is None or _HF_TOKENIZER is None:
188
  from transformers import AutoTokenizer, AutoModel
189
- _HF_TOKENIZER = AutoTokenizer.from_pretrained(EMB_MODEL, cache_dir=os.getenv("HF_HOME", "/tmp/.cache/huggingface"))
190
- _HF_MODEL = AutoModel.from_pretrained(EMB_MODEL, cache_dir=os.getenv("HF_HOME", "/tmp/.cache/huggingface"))
191
  _HF_MODEL.eval()
192
- LOG.info("[hf] modèle chargé : %s", EMB_MODEL)
193
  return _HF_TOKENIZER, _HF_MODEL
194
 
195
-
196
  def _mean_pool(last_hidden_state: np.ndarray, attention_mask: np.ndarray) -> np.ndarray:
197
  mask = attention_mask[..., None].astype(last_hidden_state.dtype)
198
  summed = (last_hidden_state * mask).sum(axis=1)
199
  counts = mask.sum(axis=1).clip(min=1e-9)
200
  return summed / counts
201
 
202
-
203
  def _emb_hf(texts: List[str]) -> np.ndarray:
204
  import torch
205
  tok, mod = _get_hf_model()
@@ -215,21 +223,10 @@ def _emb_hf(texts: List[str]) -> np.ndarray:
215
  all_vecs.append(pooled.astype("float32"))
216
  return np.concatenate(all_vecs, axis=0)
217
 
218
-
219
- def _reduce_dim(vectors: np.ndarray, target_dim: int = EMB_DIM) -> np.ndarray:
220
- """PCA simple pour réduire la dimension (si target_dim < current)."""
221
- if target_dim >= vectors.shape[1]:
222
- return vectors
223
- from sklearn.decomposition import PCA
224
- pca = PCA(n_components=target_dim, random_state=0)
225
- return pca.fit_transform(vectors).astype("float32")
226
-
227
-
228
  # --------------------------------------------------------------------------- #
229
  # DATASET / FAISS I/O
230
  # --------------------------------------------------------------------------- #
231
- def _save_dataset(ds_dir: str, rows: List[Dict[str, Any]], store_text: bool = STORE_TEXT) -> None:
232
- """Sauvegarde le dataset au format JSONL (optionnellement sans le texte)."""
233
  os.makedirs(ds_dir, exist_ok=True)
234
  data_path = os.path.join(ds_dir, "data.jsonl")
235
  with open(data_path, "w", encoding="utf-8") as f:
@@ -241,7 +238,6 @@ def _save_dataset(ds_dir: str, rows: List[Dict[str, Any]], store_text: bool = ST
241
  with open(os.path.join(ds_dir, "meta.json"), "w", encoding="utf-8") as f:
242
  json.dump(meta, f, ensure_ascii=False, indent=2)
243
 
244
-
245
  def _load_dataset(ds_dir: str) -> List[Dict[str, Any]]:
246
  data_path = os.path.join(ds_dir, "data.jsonl")
247
  if not os.path.isfile(data_path):
@@ -255,80 +251,50 @@ def _load_dataset(ds_dir: str) -> List[Dict[str, Any]]:
255
  continue
256
  return out
257
 
258
-
259
  def _save_faiss(fx_dir: str, xb: np.ndarray, meta: Dict[str, Any]) -> None:
260
- """Sauvegarde un index FAISS quantisé (IVF‑PQ) ou plat selon FAISS_TYPE."""
261
  os.makedirs(fx_dir, exist_ok=True)
262
  idx_path = os.path.join(fx_dir, "emb.faiss")
263
 
264
- if FAISS_TYPE == "IVF_PQ":
265
- # ---- IVF‑PQ ---------------------------------------------------------
266
- quantizer = faiss.IndexFlatIP(xb.shape[1]) # base (inner‑product ≈ cosine)
267
- index = faiss.IndexIVFPQ(quantizer, xb.shape[1], FAISS_NLIST, FAISS_M, FAISS_NBITS)
268
-
269
- # entraînement sur un sous‑échantillon (max 10 k vecteurs)
270
- rng = np.random.default_rng(0)
271
- train = xb[rng.choice(xb.shape[0], min(10_000, xb.shape[0]), replace=False)]
272
- index.train(train)
273
-
274
- index.add(xb)
275
- meta.update({
276
- "index_type": "IVF_PQ",
277
- "nlist": FAISS_NLIST,
278
- "m": FAISS_M,
279
- "nbits": FAISS_NBITS,
280
- })
281
- else: # FLAT (fallback)
282
- index = faiss.IndexFlatIP(xb.shape[1])
283
- index.add(xb)
284
- meta.update({"index_type": "FLAT"})
285
 
 
286
  faiss.write_index(index, idx_path)
287
 
288
- # meta.json (inclut le type d’index)
289
  with open(os.path.join(fx_dir, "meta.json"), "w", encoding="utf-8") as f:
290
  json.dump(meta, f, ensure_ascii=False, indent=2)
291
 
292
-
293
  def _load_faiss(fx_dir: str) -> faiss.Index:
294
- """Charge l’index en mode mmap (lecture à la volée)."""
295
  idx_path = os.path.join(fx_dir, "emb.faiss")
296
  if not os.path.isfile(idx_path):
297
  raise FileNotFoundError(f"FAISS index introuvable : {idx_path}")
298
- # mmap minimise la RAM utilisée
299
  return faiss.read_index(idx_path, faiss.IO_FLAG_MMAP)
300
 
301
-
302
  def _tar_dir_to_bytes(dir_path: str) -> bytes:
303
- """Archive gzip du répertoire (compression maximale)."""
304
  bio = io.BytesIO()
305
  with tarfile.open(fileobj=bio, mode="w:gz", compresslevel=9) as tar:
306
  tar.add(dir_path, arcname=os.path.basename(dir_path))
307
  bio.seek(0)
308
  return bio.read()
309
 
310
-
311
  # --------------------------------------------------------------------------- #
312
- # WORKER POOL (asynchrone)
313
  # --------------------------------------------------------------------------- #
314
- from concurrent.futures import ThreadPoolExecutor
315
-
316
- MAX_WORKERS = max(1, int(os.getenv("MAX_WORKERS", "1")))
317
- EXECUTOR = ThreadPoolExecutor(max_workers=MAX_WORKERS)
318
  LOG.info("ThreadPoolExecutor initialisé : max_workers=%s", MAX_WORKERS)
319
 
320
-
321
- def _proj_dirs(project_id: str) -> Tuple[str, str, str]:
322
- base = os.path.join(os.getenv("DATA_ROOT", "/tmp/data"), project_id)
323
- ds_dir = os.path.join(base, "dataset")
324
- fx_dir = os.path.join(base, "faiss")
325
- os.makedirs(ds_dir, exist_ok=True)
326
- os.makedirs(fx_dir, exist_ok=True)
327
- return base, ds_dir, fx_dir
328
-
329
-
330
  def _do_index_job(
331
- st: "JobState",
332
  files: List[Dict[str, str]],
333
  chunk_size: int,
334
  overlap: int,
@@ -339,16 +305,15 @@ def _do_index_job(
339
  Pipeline complet :
340
  1️⃣ Chunking
341
  2️⃣ Embedding (dummy / st / hf)
342
- 3️⃣ Réduction de dimension (PCA) si EMB_DIM < dim du modèle
343
- 4️⃣ Sauvegarde dataset (optionnel texte)
344
  5️⃣ Index FAISS quantisé + mmap
345
  """
346
  try:
347
  base, ds_dir, fx_dir = _proj_dirs(st.project_id)
348
 
349
- # ------------------------------------------------------------------- #
350
- # 1️⃣ Chunking
351
- # ------------------------------------------------------------------- #
352
  rows: List[Dict[str, Any]] = []
353
  st.total_files = len(files)
354
 
@@ -360,12 +325,12 @@ def _do_index_job(
360
  rows.append({"path": path, "text": ck, "chunk_id": i})
361
 
362
  st.total_chunks = len(rows)
363
- LOG.info("Chunking terminé : %d chunks", st.total_chunks)
364
 
365
- # ------------------------------------------------------------------- #
366
- # 2️⃣ Embedding
367
- # ------------------------------------------------------------------- #
368
  texts = [r["text"] for r in rows]
 
369
  if EMB_PROVIDER == "dummy":
370
  xb = _emb_dummy(texts, dim=EMB_DIM)
371
  elif EMB_PROVIDER == "st":
@@ -373,23 +338,22 @@ def _do_index_job(
373
  else:
374
  xb = _emb_hf(texts)
375
 
376
- # ------------------------------------------------------------------- #
377
- # 3️⃣ Réduction de dimension (si nécessaire)
378
- # ------------------------------------------------------------------- #
379
  if xb.shape[1] != EMB_DIM:
380
- xb = _reduce_dim(xb, target_dim=EMB_DIM)
 
 
 
381
 
382
  st.embedded = xb.shape[0]
383
- LOG.info("Embedding terminé : %d vecteurs (dim=%d)", st.embedded, xb.shape[1])
384
 
385
- # ------------------------------------------------------------------- #
386
- # 4️⃣ Sauvegarde du dataset
387
- # ------------------------------------------------------------------- #
388
  _save_dataset(ds_dir, rows, store_text=store_text)
 
389
 
390
- # ------------------------------------------------------------------- #
391
- # 5️⃣ Index FAISS
392
- # ------------------------------------------------------------------- #
393
  meta = {
394
  "dim": int(xb.shape[1]),
395
  "count": int(xb.shape[0]),
@@ -398,16 +362,14 @@ def _do_index_job(
398
  }
399
  _save_faiss(fx_dir, xb, meta)
400
  st.indexed = int(xb.shape[0])
401
- LOG.info("FAISS (%s) écrit : %s", FAISS_TYPE, os.path.join(fx_dir, "emb.faiss"))
402
 
403
- # ------------------------------------------------------------------- #
404
- # Finalisation
405
- # ------------------------------------------------------------------- #
406
- st.stage = "done"
407
  st.finished_at = time.time()
408
  except Exception as e:
409
  LOG.exception("Job %s échoué", st.job_id)
410
  st.errors.append(str(e))
 
411
  st.stage = "failed"
412
  st.finished_at = time.time()
413
 
@@ -438,7 +400,6 @@ def _submit_job(
438
  st.stage = "queued"
439
  return job_id
440
 
441
-
442
  # --------------------------------------------------------------------------- #
443
  # FASTAPI
444
  # --------------------------------------------------------------------------- #
@@ -451,20 +412,17 @@ fastapi_app.add_middleware(
451
  allow_headers=["*"],
452
  )
453
 
454
-
455
  class FileItem(BaseModel):
456
  path: str
457
  text: str
458
 
459
-
460
  class IndexRequest(BaseModel):
461
  project_id: str
462
  files: List[FileItem]
463
  chunk_size: int = 200
464
  overlap: int = 20
465
  batch_size: int = 32
466
- store_text: bool = STORE_TEXT # configurable
467
-
468
 
469
  @fastapi_app.get("/health")
470
  def health():
@@ -475,14 +433,15 @@ def health():
475
  "model": EMB_MODEL if EMB_PROVIDER != "dummy" else None,
476
  "cache_root": os.getenv("CACHE_ROOT", "/tmp/.cache"),
477
  "workers": MAX_WORKERS,
478
- "data_root": os.getenv("DATA_ROOT", "/tmp/data"),
479
- "faiss_type": FAISS_TYPE,
480
  "emb_dim": EMB_DIM,
481
  }
482
 
483
-
484
  @fastapi_app.post("/index")
485
  def index(req: IndexRequest):
 
 
 
486
  try:
487
  files = [fi.model_dump() for fi in req.files]
488
  job_id = _submit_job(
@@ -498,7 +457,6 @@ def index(req: IndexRequest):
498
  LOG.exception("Erreur soumission index")
499
  raise HTTPException(status_code=500, detail=str(e))
500
 
501
-
502
  @fastapi_app.get("/status/{job_id}")
503
  def status(job_id: str):
504
  st = JOBS.get(job_id)
@@ -506,26 +464,25 @@ def status(job_id: str):
506
  raise HTTPException(status_code=404, detail="job inconnu")
507
  return JSONResponse(st.model_dump())
508
 
509
-
510
  class SearchRequest(BaseModel):
511
  project_id: str
512
  query: str
513
  k: int = 5
514
 
515
-
516
  @fastapi_app.post("/search")
517
  def search(req: SearchRequest):
518
  base, ds_dir, fx_dir = _proj_dirs(req.project_id)
519
 
520
- # Vérifier la présence de l'index
521
- if not (os.path.isfile(os.path.join(fx_dir, "emb.faiss")) and os.path.isfile(os.path.join(ds_dir, "data.jsonl"))):
 
522
  raise HTTPException(status_code=409, detail="Index non prêt (reviens plus tard)")
523
 
524
  rows = _load_dataset(ds_dir)
525
  if not rows:
526
  raise HTTPException(status_code=404, detail="dataset introuvable")
527
 
528
- # Embedding de la requête (même provider)
529
  if EMB_PROVIDER == "dummy":
530
  q = _emb_dummy([req.query], dim=EMB_DIM)[0:1, :]
531
  elif EMB_PROVIDER == "st":
@@ -552,9 +509,8 @@ def search(req: SearchRequest):
552
  out.append({"path": r.get("path"), "text": r.get("text"), "score": float(sc)})
553
  return {"results": out}
554
 
555
-
556
  # --------------------------------------------------------------------------- #
557
- # ARTIFACTS EXPORT (gzip)
558
  # --------------------------------------------------------------------------- #
559
  @fastapi_app.get("/artifacts/{project_id}/dataset")
560
  def download_dataset(project_id: str):
@@ -565,7 +521,6 @@ def download_dataset(project_id: str):
565
  hdr = {"Content-Disposition": f'attachment; filename="{project_id}_dataset.tgz"'}
566
  return StreamingResponse(io.BytesIO(buf), media_type="application/gzip", headers=hdr)
567
 
568
-
569
  @fastapi_app.get("/artifacts/{project_id}/faiss")
570
  def download_faiss(project_id: str):
571
  _, _, fx_dir = _proj_dirs(project_id)
@@ -575,35 +530,30 @@ def download_faiss(project_id: str):
575
  hdr = {"Content-Disposition": f'attachment; filename="{project_id}_faiss.tgz"'}
576
  return StreamingResponse(io.BytesIO(buf), media_type="application/gzip", headers=hdr)
577
 
578
-
579
  # --------------------------------------------------------------------------- #
580
- # GRADIO UI (facultatif – simple test)
581
  # --------------------------------------------------------------------------- #
582
  def _ui_index(project_id: str, sample_text: str):
583
  files = [{"path": "sample.txt", "text": sample_text}]
584
  try:
585
  req = IndexRequest(project_id=project_id, files=[FileItem(**f) for f in files])
586
  except Exception as e:
587
- return f"❌ Erreur validation : {e}"
588
  try:
589
  res = index(req)
590
  return f"✅ Job lancé : {res['job_id']}"
591
  except Exception as e:
592
- return f"❌ Erreur index : {e}"
593
-
594
 
595
  def _ui_search(project_id: str, query: str, k: int):
596
  try:
597
  res = search(SearchRequest(project_id=project_id, query=query, k=int(k)))
598
  return json.dumps(res, ensure_ascii=False, indent=2)
599
  except Exception as e:
600
- return f"❌ Erreur recherche : {e}"
601
-
602
-
603
- import gradio as gr
604
 
605
  with gr.Blocks(title="Remote Indexer (Async – Optimisé)", analytics_enabled=False) as ui:
606
- gr.Markdown("## Remote Indexer Optimisé (FAISS quantisé, mmap, texte optionnel)")
607
  with gr.Row():
608
  pid = gr.Textbox(label="Project ID", value="DEMO")
609
  txt = gr.Textbox(label="Texte d’exemple", lines=4, value="Alpha bravo charlie delta echo foxtrot.")
@@ -618,15 +568,14 @@ with gr.Blocks(title="Remote Indexer (Async – Optimisé)", analytics_enabled=F
618
  out_q = gr.Code(label="Résultats")
619
  btn_q.click(_ui_search, inputs=[pid, q, k], outputs=[out_q])
620
 
 
621
  fastapi_app = gr.mount_gradio_app(fastapi_app, ui, path="/ui")
622
 
623
-
624
  # --------------------------------------------------------------------------- #
625
  # MAIN
626
  # --------------------------------------------------------------------------- #
627
  if __name__ == "__main__":
628
  import uvicorn
629
 
630
- PORT = int(os.getenv("PORT", "7860"))
631
- LOG.info("Démarrage Uvicorn – port %s – UI à /ui", PORT)
632
  uvicorn.run(fastapi_app, host="0.0.0.0", port=PORT)
 
1
  # -*- coding: utf-8 -*-
2
  """
3
+ FastAPI + Gradio : service d’indexation asynchrone avec FAISS.
4
+ Ce fichier a été corrigé pour :
5
+
6
+ * importer correctement `JobState`
7
+ * garantir que toutes les dépendances (typing, pathlib…) sont disponibles
8
+ * exposer les routes attendues par le client
9
+ * garder la même logique que la version originale.
10
  """
11
 
12
  from __future__ import annotations
13
+
14
  import os
15
  import io
16
  import json
17
  import time
 
 
18
  import hashlib
19
+ import logging
20
+ import tarfile
21
+ from pathlib import Path
22
  from typing import List, Dict, Any, Tuple, Optional
23
 
24
+ from concurrent.futures import ThreadPoolExecutor
25
+
26
  import numpy as np
27
  import faiss
28
  from fastapi import FastAPI, HTTPException
 
30
  from fastapi.responses import JSONResponse, StreamingResponse
31
  from pydantic import BaseModel
32
 
33
+ import gradio as gr
 
 
34
 
35
  # --------------------------------------------------------------------------- #
36
  # LOGGING
37
  # --------------------------------------------------------------------------- #
38
+ LOG = logging.getLogger("remote-indexer-async")
39
  if not LOG.handlers:
40
  h = logging.StreamHandler()
41
+ h.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
42
  LOG.addHandler(h)
43
  LOG.setLevel(logging.INFO)
44
 
45
+ DBG = logging.getLogger("remote-indexer-async.debug")
46
+ if not DBG.handlers:
47
+ hd = logging.StreamHandler()
48
+ hd.setFormatter(logging.Formatter("[DEBUG] %(asctime)s - %(message)s"))
49
+ DBG.addHandler(hd)
50
+ DBG.setLevel(logging.DEBUG)
51
+
52
  # --------------------------------------------------------------------------- #
53
+ # CONFIGURATION (variables d’environnement)
54
  # --------------------------------------------------------------------------- #
55
+ PORT = int(os.getenv("PORT", "7860"))
56
+ DATA_ROOT = os.getenv("DATA_ROOT", "/tmp/data")
57
+ os.makedirs(DATA_ROOT, exist_ok=True)
 
 
 
 
58
 
59
+ EMB_PROVIDER = os.getenv("EMB_PROVIDER", "dummy").strip().lower()
60
+ EMB_MODEL = os.getenv("EMB_MODEL", "sentence-transformers/all-mpnet-base-v2").strip()
61
+ EMB_BATCH = int(os.getenv("EMB_BATCH", "32"))
62
+ EMB_DIM = int(os.getenv("EMB_DIM", "64")) # dimension réduite (optimisation)
 
 
63
 
64
+ MAX_WORKERS = int(os.getenv("MAX_WORKERS", "1"))
65
 
66
+ # --------------------------------------------------------------------------- #
67
+ # CACHE DIRECTORIES (évite PermissionError)
68
+ # --------------------------------------------------------------------------- #
69
+ def _setup_cache_dirs() -> Dict[str, str]:
70
+ os.environ.setdefault("HOME", "/home/user")
71
+ CACHE_ROOT = os.getenv("CACHE_ROOT", "/tmp/.cache").rstrip("/")
72
+ paths = {
73
+ "root": CACHE_ROOT,
74
+ "hf_home": f"{CACHE_ROOT}/huggingface",
75
+ "hf_hub": f"{CACHE_ROOT}/huggingface/hub",
76
+ "hf_tf": f"{CACHE_ROOT}/huggingface/transformers",
77
+ "torch": f"{CACHE_ROOT}/torch",
78
+ "st": f"{CACHE_ROOT}/sentence-transformers",
79
+ "mpl": f"{CACHE_ROOT}/matplotlib",
80
+ }
81
+ for p in paths.values():
82
+ try:
83
+ os.makedirs(p, exist_ok=True)
84
+ except Exception as e:
85
+ LOG.warning("Impossible de créer %s : %s", p, e)
86
 
87
+ os.environ["HF_HOME"] = paths["hf_home"]
88
+ os.environ["HF_HUB_CACHE"] = paths["hf_hub"]
89
+ os.environ["TRANSFORMERS_CACHE"] = paths["hf_tf"]
90
+ os.environ["TORCH_HOME"] = paths["torch"]
91
+ os.environ["SENTENCE_TRANSFORMERS_HOME"] = paths["st"]
92
+ os.environ["MPLCONFIGDIR"] = paths["mpl"]
93
+ os.environ.setdefault("HF_HUB_DISABLE_SYMLINKS_WARNING", "1")
94
+ os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
95
+
96
+ LOG.info("Caches configurés : %s", json.dumps(paths, indent=2))
97
+ return paths
98
+
99
+
100
+ CACHE_PATHS = _setup_cache_dirs()
101
+
102
+ # --------------------------------------------------------------------------- #
103
+ # IMPORT DE LA CLASSE DE STATE (c’est ce qui manquait)
104
+ # --------------------------------------------------------------------------- #
105
+ # La classe `JobState` se trouve dans `app/core/index_state.py`.
106
+ # On l’importe ici afin qu’elle soit disponible dans tout le module.
107
+ from app.core.index_state import JobState # <-- IMPORT CORRIGÉ
108
 
109
  # --------------------------------------------------------------------------- #
110
+ # GLOBALS
111
  # --------------------------------------------------------------------------- #
112
+ JOBS: Dict[str, JobState] = {}
 
 
113
 
114
+ def _now() -> str:
115
+ return time.strftime("%H:%M:%S")
116
 
117
+ def _proj_dirs(project_id: str) -> Tuple[str, str, str]:
118
+ base = os.path.join(DATA_ROOT, project_id)
119
+ ds_dir = os.path.join(base, "dataset")
120
+ fx_dir = os.path.join(base, "faiss")
121
+ os.makedirs(ds_dir, exist_ok=True)
122
+ os.makedirs(fx_dir, exist_ok=True)
123
+ return base, ds_dir, fx_dir
124
 
125
+ def _add_msg(st: JobState, msg: str) -> None:
126
+ st.messages.append(f"[{_now()}] {msg}")
127
+ LOG.info("[%s] %s", st.job_id, msg)
128
+ DBG.debug("[%s] %s", st.job_id, msg)
129
 
130
+ def _set_stage(st: JobState, stage: str) -> None:
131
+ st.stage = stage
132
+ _add_msg(st, f"stage={stage}")
133
 
134
+ # --------------------------------------------------------------------------- #
135
+ # UTILITAIRES (chunking, normalisation, etc.)
136
+ # --------------------------------------------------------------------------- #
137
+ def _chunk_text(text: str, size: int = 200, overlap: int = 20) -> List[str]:
138
+ text = (text or "").replace("\r\n", "\n")
139
+ tokens = list(text)
140
+ if size <= 0:
141
+ return [text] if text else []
142
+ if overlap < 0:
143
+ overlap = 0
144
+ chunks = []
145
+ i = 0
146
+ while i < len(tokens):
147
+ j = min(i + size, len(tokens))
148
+ chunk = "".join(tokens[i:j]).strip()
149
+ if chunk:
150
+ chunks.append(chunk)
151
+ if j == len(tokens):
152
+ break
153
+ i = j - overlap if (j - overlap) > i else j
154
+ return chunks
155
+
156
+ def _l2_normalize(x: np.ndarray) -> np.ndarray:
157
+ n = np.linalg.norm(x, axis=1, keepdims=True) + 1e-12
158
+ return x / n
159
 
160
  # --------------------------------------------------------------------------- #
161
  # EMBEDDING PROVIDERS
162
  # --------------------------------------------------------------------------- #
163
+ _ST_MODEL = None
164
+ _HF_TOKENIZER = None
165
+ _HF_MODEL = None
 
166
 
167
  def _emb_dummy(texts: List[str], dim: int = EMB_DIM) -> np.ndarray:
 
168
  vecs = np.zeros((len(texts), dim), dtype="float32")
169
  for i, t in enumerate(texts):
170
  h = hashlib.sha1((t or "").encode("utf-8")).digest()
 
173
  vecs[i] = v / (np.linalg.norm(v) + 1e-9)
174
  return vecs
175
 
 
176
  def _get_st_model():
177
  global _ST_MODEL
178
  if _ST_MODEL is None:
179
  from sentence_transformers import SentenceTransformer
180
+ _ST_MODEL = SentenceTransformer(EMB_MODEL, cache_folder=CACHE_PATHS["st"])
181
+ LOG.info("[st] modèle chargé : %s (cache=%s)", EMB_MODEL, CACHE_PATHS["st"])
182
  return _ST_MODEL
183
 
 
184
  def _emb_st(texts: List[str]) -> np.ndarray:
185
  model = _get_st_model()
186
  vecs = model.encode(
 
192
  ).astype("float32")
193
  return vecs
194
 
 
195
  def _get_hf_model():
196
  global _HF_TOKENIZER, _HF_MODEL
197
  if _HF_MODEL is None or _HF_TOKENIZER is None:
198
  from transformers import AutoTokenizer, AutoModel
199
+ _HF_TOKENIZER = AutoTokenizer.from_pretrained(EMB_MODEL, cache_dir=CACHE_PATHS["hf_tf"])
200
+ _HF_MODEL = AutoModel.from_pretrained(EMB_MODEL, cache_dir=CACHE_PATHS["hf_tf"])
201
  _HF_MODEL.eval()
202
+ LOG.info("[hf] modèle chargé : %s (cache=%s)", EMB_MODEL, CACHE_PATHS["hf_tf"])
203
  return _HF_TOKENIZER, _HF_MODEL
204
 
 
205
  def _mean_pool(last_hidden_state: np.ndarray, attention_mask: np.ndarray) -> np.ndarray:
206
  mask = attention_mask[..., None].astype(last_hidden_state.dtype)
207
  summed = (last_hidden_state * mask).sum(axis=1)
208
  counts = mask.sum(axis=1).clip(min=1e-9)
209
  return summed / counts
210
 
 
211
  def _emb_hf(texts: List[str]) -> np.ndarray:
212
  import torch
213
  tok, mod = _get_hf_model()
 
223
  all_vecs.append(pooled.astype("float32"))
224
  return np.concatenate(all_vecs, axis=0)
225
 
 
226
  # --------------------------------------------------------------------------- #
227
  # DATASET / FAISS I/O
228
  # --------------------------------------------------------------------------- #
229
+ def _save_dataset(ds_dir: str, rows: List[Dict[str, Any]], store_text: bool = True) -> None:
 
230
  os.makedirs(ds_dir, exist_ok=True)
231
  data_path = os.path.join(ds_dir, "data.jsonl")
232
  with open(data_path, "w", encoding="utf-8") as f:
 
238
  with open(os.path.join(ds_dir, "meta.json"), "w", encoding="utf-8") as f:
239
  json.dump(meta, f, ensure_ascii=False, indent=2)
240
 
 
241
  def _load_dataset(ds_dir: str) -> List[Dict[str, Any]]:
242
  data_path = os.path.join(ds_dir, "data.jsonl")
243
  if not os.path.isfile(data_path):
 
251
  continue
252
  return out
253
 
 
254
  def _save_faiss(fx_dir: str, xb: np.ndarray, meta: Dict[str, Any]) -> None:
 
255
  os.makedirs(fx_dir, exist_ok=True)
256
  idx_path = os.path.join(fx_dir, "emb.faiss")
257
 
258
+ # ------------------------------------------------------------------- #
259
+ # Index quantisé (IVF‑PQ) – optimisation mémoire / disque
260
+ # ------------------------------------------------------------------- #
261
+ quantizer = faiss.IndexFlatIP(xb.shape[1]) # inner‑product (cosine si normalisé)
262
+ index = faiss.IndexIVFPQ(quantizer, xb.shape[1], 100, 8, 8) # nlist=100, m=8, nbits=8
 
 
263
 
264
+ # entraînement sur un sous‑échantillon (max 10 k vecteurs)
265
+ rng = np.random.default_rng(0)
266
+ train = xb[rng.choice(xb.shape[0], min(10_000, xb.shape[0]), replace=False)]
267
+ index.train(train)
268
+
269
+ index.add(xb)
270
  faiss.write_index(index, idx_path)
271
 
272
+ meta.update({"index_type": "IVF_PQ", "nlist": 100, "m": 8, "nbits": 8})
273
  with open(os.path.join(fx_dir, "meta.json"), "w", encoding="utf-8") as f:
274
  json.dump(meta, f, ensure_ascii=False, indent=2)
275
 
 
276
  def _load_faiss(fx_dir: str) -> faiss.Index:
 
277
  idx_path = os.path.join(fx_dir, "emb.faiss")
278
  if not os.path.isfile(idx_path):
279
  raise FileNotFoundError(f"FAISS index introuvable : {idx_path}")
280
+ # mmap : l’index reste sur disque, la RAM n’est utilisée que pour les requêtes
281
  return faiss.read_index(idx_path, faiss.IO_FLAG_MMAP)
282
 
 
283
  def _tar_dir_to_bytes(dir_path: str) -> bytes:
 
284
  bio = io.BytesIO()
285
  with tarfile.open(fileobj=bio, mode="w:gz", compresslevel=9) as tar:
286
  tar.add(dir_path, arcname=os.path.basename(dir_path))
287
  bio.seek(0)
288
  return bio.read()
289
 
 
290
  # --------------------------------------------------------------------------- #
291
+ # THREAD‑POOL (asynchrone)
292
  # --------------------------------------------------------------------------- #
293
+ EXECUTOR = ThreadPoolExecutor(max_workers=max(1, MAX_WORKERS))
 
 
 
294
  LOG.info("ThreadPoolExecutor initialisé : max_workers=%s", MAX_WORKERS)
295
 
 
296
  def _do_index_job(
297
+ st: JobState,
298
  files: List[Dict[str, str]],
299
  chunk_size: int,
300
  overlap: int,
 
305
  Pipeline complet :
306
  1️⃣ Chunking
307
  2️⃣ Embedding (dummy / st / hf)
308
+ 3️⃣ Réduction de dimension (PCA) si besoin
309
+ 4️⃣ Sauvegarde du dataset (texte optionnel)
310
  5️⃣ Index FAISS quantisé + mmap
311
  """
312
  try:
313
  base, ds_dir, fx_dir = _proj_dirs(st.project_id)
314
 
315
+ # ------------------- 1️⃣ Chunking -------------------
316
+ _set_stage(st, "chunking")
 
317
  rows: List[Dict[str, Any]] = []
318
  st.total_files = len(files)
319
 
 
325
  rows.append({"path": path, "text": ck, "chunk_id": i})
326
 
327
  st.total_chunks = len(rows)
328
+ _add_msg(st, f"Total chunks = {st.total_chunks}")
329
 
330
+ # ------------------- 2️⃣ Embedding -------------------
331
+ _set_stage(st, "embedding")
 
332
  texts = [r["text"] for r in rows]
333
+
334
  if EMB_PROVIDER == "dummy":
335
  xb = _emb_dummy(texts, dim=EMB_DIM)
336
  elif EMB_PROVIDER == "st":
 
338
  else:
339
  xb = _emb_hf(texts)
340
 
341
+ # ------------------- 3️⃣ Réduction de dimension (PCA) -------------------
 
 
342
  if xb.shape[1] != EMB_DIM:
343
+ from sklearn.decomposition import PCA
344
+ pca = PCA(n_components=EMB_DIM, random_state=0)
345
+ LOG.info("Réduction PCA : %d → %d dimensions", xb.shape[1], EMB_DIM)
+ xb = pca.fit_transform(xb).astype("float32")
346
347
 
348
  st.embedded = xb.shape[0]
349
+ _add_msg(st, f"Embeddings générés : {st.embedded}")
350
 
351
+ # ------------------- 4️⃣ Sauvegarde dataset -------------------
 
 
352
  _save_dataset(ds_dir, rows, store_text=store_text)
353
+ _add_msg(st, f"Dataset sauvegardé dans {ds_dir}")
354
 
355
+ # ------------------- 5️⃣ Index FAISS -------------------
356
+ _set_stage(st, "indexing")
 
357
  meta = {
358
  "dim": int(xb.shape[1]),
359
  "count": int(xb.shape[0]),
 
362
  }
363
  _save_faiss(fx_dir, xb, meta)
364
  st.indexed = int(xb.shape[0])
365
+ _add_msg(st, f"FAISS écrit sur {os.path.join(fx_dir, 'emb.faiss')}")
366
 
367
+ _set_stage(st, "done")
 
 
 
368
  st.finished_at = time.time()
369
  except Exception as e:
370
  LOG.exception("Job %s échoué", st.job_id)
371
  st.errors.append(str(e))
372
+ _add_msg(st, f"❌ Exception : {e}")
373
  st.stage = "failed"
374
  st.finished_at = time.time()
375
 
 
400
  st.stage = "queued"
401
  return job_id
402
 
 
403
  # --------------------------------------------------------------------------- #
404
  # FASTAPI
405
  # --------------------------------------------------------------------------- #
 
412
  allow_headers=["*"],
413
  )
414
 
 
415
  class FileItem(BaseModel):
416
  path: str
417
  text: str
418
 
 
419
  class IndexRequest(BaseModel):
420
  project_id: str
421
  files: List[FileItem]
422
  chunk_size: int = 200
423
  overlap: int = 20
424
  batch_size: int = 32
425
+ store_text: bool = True # désactivable via le payload
 
426
 
427
  @fastapi_app.get("/health")
428
  def health():
 
433
  "model": EMB_MODEL if EMB_PROVIDER != "dummy" else None,
434
  "cache_root": os.getenv("CACHE_ROOT", "/tmp/.cache"),
435
  "workers": MAX_WORKERS,
436
+ "data_root": DATA_ROOT,
 
437
  "emb_dim": EMB_DIM,
438
  }
439
 
 
440
  @fastapi_app.post("/index")
441
  def index(req: IndexRequest):
442
+ """
443
+ Lancement asynchrone : renvoie immédiatement un `job_id`.
444
+ """
445
  try:
446
  files = [fi.model_dump() for fi in req.files]
447
  job_id = _submit_job(
 
457
  LOG.exception("Erreur soumission index")
458
  raise HTTPException(status_code=500, detail=str(e))
459
 
 
460
  @fastapi_app.get("/status/{job_id}")
461
  def status(job_id: str):
462
  st = JOBS.get(job_id)
 
464
  raise HTTPException(status_code=404, detail="job inconnu")
465
  return JSONResponse(st.model_dump())
466
 
 
467
  class SearchRequest(BaseModel):
468
  project_id: str
469
  query: str
470
  k: int = 5
471
 
 
472
  @fastapi_app.post("/search")
473
  def search(req: SearchRequest):
474
  base, ds_dir, fx_dir = _proj_dirs(req.project_id)
475
 
476
+ # Vérifier que l’index existe
477
+ if not (os.path.isfile(os.path.join(fx_dir, "emb.faiss")) and
478
+ os.path.isfile(os.path.join(ds_dir, "data.jsonl"))):
479
  raise HTTPException(status_code=409, detail="Index non prêt (reviens plus tard)")
480
 
481
  rows = _load_dataset(ds_dir)
482
  if not rows:
483
  raise HTTPException(status_code=404, detail="dataset introuvable")
484
 
485
+ # Embedding de la requête (même provider que l’index)
486
  if EMB_PROVIDER == "dummy":
487
  q = _emb_dummy([req.query], dim=EMB_DIM)[0:1, :]
488
  elif EMB_PROVIDER == "st":
 
509
  out.append({"path": r.get("path"), "text": r.get("text"), "score": float(sc)})
510
  return {"results": out}
511
 
 
512
  # --------------------------------------------------------------------------- #
513
+ # EXPORT ARTIFACTS (gzip)
514
  # --------------------------------------------------------------------------- #
515
  @fastapi_app.get("/artifacts/{project_id}/dataset")
516
  def download_dataset(project_id: str):
 
521
  hdr = {"Content-Disposition": f'attachment; filename="{project_id}_dataset.tgz"'}
522
  return StreamingResponse(io.BytesIO(buf), media_type="application/gzip", headers=hdr)
523
 
 
524
  @fastapi_app.get("/artifacts/{project_id}/faiss")
525
  def download_faiss(project_id: str):
526
  _, _, fx_dir = _proj_dirs(project_id)
 
530
  hdr = {"Content-Disposition": f'attachment; filename="{project_id}_faiss.tgz"'}
531
  return StreamingResponse(io.BytesIO(buf), media_type="application/gzip", headers=hdr)
532
 
 
533
  # --------------------------------------------------------------------------- #
534
+ # GRADIO UI (facultatif – test rapide)
535
  # --------------------------------------------------------------------------- #
536
  def _ui_index(project_id: str, sample_text: str):
537
  files = [{"path": "sample.txt", "text": sample_text}]
538
  try:
539
  req = IndexRequest(project_id=project_id, files=[FileItem(**f) for f in files])
540
  except Exception as e:
541
+ return f"❌ Validation : {e}"
542
  try:
543
  res = index(req)
544
  return f"✅ Job lancé : {res['job_id']}"
545
  except Exception as e:
546
+ return f"❌ Erreur : {e}"
 
547
 
548
  def _ui_search(project_id: str, query: str, k: int):
549
  try:
550
  res = search(SearchRequest(project_id=project_id, query=query, k=int(k)))
551
  return json.dumps(res, ensure_ascii=False, indent=2)
552
  except Exception as e:
553
+ return f"❌ Erreur : {e}"
 
 
 
554
 
555
  with gr.Blocks(title="Remote Indexer (Async – Optimisé)", analytics_enabled=False) as ui:
556
+ gr.Markdown("## Remote Indexer Async (FAISS quantisé, mmap, texte optionnel)")
557
  with gr.Row():
558
  pid = gr.Textbox(label="Project ID", value="DEMO")
559
  txt = gr.Textbox(label="Texte d’exemple", lines=4, value="Alpha bravo charlie delta echo foxtrot.")
 
568
  out_q = gr.Code(label="Résultats")
569
  btn_q.click(_ui_search, inputs=[pid, q, k], outputs=[out_q])
570
 
571
+ # Monte l’UI Gradio sur le même serveur FastAPI
572
  fastapi_app = gr.mount_gradio_app(fastapi_app, ui, path="/ui")
573
 
 
574
  # --------------------------------------------------------------------------- #
575
  # MAIN
576
  # --------------------------------------------------------------------------- #
577
  if __name__ == "__main__":
578
  import uvicorn
579
 
580
+ LOG.info("Démarrage Uvicorn – port %s – UI disponible à /ui", PORT)
 
581
  uvicorn.run(fastapi_app, host="0.0.0.0", port=PORT)
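
Note: below is a minimal client sketch (not part of the commit) that exercises the routes defined in this main.py. It assumes the service runs locally on the default PORT=7860, that /index returns {"job_id": ...} and that /status/{job_id} exposes a "stage" field, as shown in the diff; the `requests` dependency and the 1-second polling interval are illustrative choices.

# Minimal client sketch for the /index, /status/{job_id} and /search routes above.
import time

import requests

BASE = "http://localhost:7860"  # assumption: default PORT=7860, local deployment


def index_and_search(project_id: str, text: str, query: str, k: int = 5):
    # 1) Submit an asynchronous indexing job (payload mirrors the IndexRequest model).
    resp = requests.post(f"{BASE}/index", json={
        "project_id": project_id,
        "files": [{"path": "sample.txt", "text": text}],
        "chunk_size": 200,
        "overlap": 20,
        "batch_size": 32,
        "store_text": True,
    })
    resp.raise_for_status()
    job_id = resp.json()["job_id"]

    # 2) Poll /status/{job_id} until the background worker reports "done" or "failed".
    while True:
        st = requests.get(f"{BASE}/status/{job_id}").json()
        if st.get("stage") in ("done", "failed"):
            break
        time.sleep(1.0)
    if st.get("stage") == "failed":
        raise RuntimeError(f"Indexing failed: {st.get('errors')}")

    # 3) Query the FAISS index (payload mirrors the SearchRequest model).
    res = requests.post(f"{BASE}/search", json={"project_id": project_id, "query": query, "k": k})
    res.raise_for_status()
    return res.json()["results"]


if __name__ == "__main__":
    print(index_and_search("DEMO", "Alpha bravo charlie delta echo foxtrot.", "alpha"))

Since /search answers HTTP 409 until both data.jsonl and emb.faiss have been written, polling /status/{job_id} before searching avoids racing the background job.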