Spaces:

chouchouvs
/

DeepIndex

Running

App Files Community

chouchouvs committed on Sep 13

Commit

7a8bf2f

verified ·

1 Parent(s): e08033d

Update main.py

Browse files

Files changed (1) hide show

main.py +265 -300

main.py CHANGED Viewed

@@ -1,91 +1,87 @@
 # -*- coding: utf-8 -*-
 from __future__ import annotations
-import os, time, uuid, logging, random
 from typing import List, Optional, Dict, Any, Tuple
 import numpy as np
 import requests
-from fastapi import FastAPI, BackgroundTasks, Header, HTTPException, Query
 from pydantic import BaseModel, Field
-# Qdrant (optionnel si VECTOR_STORE=memory)
-try:
-    from qdrant_client import QdrantClient
-    from qdrant_client.http.models import VectorParams, Distance, PointStruct
-except Exception:  # si non installé, on retombe en mémoire
-    QdrantClient = None
-    VectorParams = Distance = PointStruct = None
-# ---------- logging ----------
 logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(name)s:%(message)s")
 LOG = logging.getLogger("remote_indexer")
-# ---------- ENV (config) ----------
-# Choix du store: "qdrant" (par défaut) ou "memory"
-VECTOR_STORE = os.getenv("VECTOR_STORE", "qdrant").strip().lower()
-# Ordre des backends d'embeddings à essayer. Par défaut: DeepInfra, puis HF.
-DEFAULT_BACKENDS = "deepinfra,hf"
-EMB_BACKEND_ORDER = [s.strip().lower()
-                     for s in os.getenv("EMB_BACKEND_ORDER", os.getenv("EMB_BACKEND", DEFAULT_BACKENDS)).split(",")
-                     if s.strip()]
-ALLOW_DI_AUTOFALLBACK = os.getenv("ALLOW_DI_AUTOFALLBACK", "true").lower() in ("1","true","yes","on")
-# HF Inference API
-HF_TOKEN   = os.getenv("HF_API_TOKEN", "").strip()
-HF_MODEL   = os.getenv("HF_EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2").strip()
-HF_API_URL_USER      = os.getenv("HF_API_URL", "").strip()
-HF_API_URL_PIPELINE  = os.getenv("HF_API_URL_PIPELINE", "").strip()
-HF_API_URL_MODELS    = os.getenv("HF_API_URL_MODELS", "").strip()
-if HF_API_URL_USER:
-    if "/pipeline" in HF_API_URL_USER:
-        HF_API_URL_PIPELINE = HF_API_URL_USER
-    else:
-        HF_API_URL_MODELS = HF_API_URL_USER
-HF_URL_PIPELINE = (HF_API_URL_PIPELINE or f"https://api-inference.huggingface.co/pipeline/feature-extraction/{HF_MODEL}")
-HF_URL_MODELS   = (HF_API_URL_MODELS   or f"https://api-inference.huggingface.co/models/{HF_MODEL}")
-HF_TIMEOUT = float(os.getenv("EMB_TIMEOUT_SEC", "120"))
-HF_WAIT    = os.getenv("HF_WAIT_FOR_MODEL", "true").lower() in ("1","true","yes","on")
-HF_PIPELINE_FIRST = os.getenv("HF_PIPELINE_FIRST", "true").lower() in ("1","true","yes","on")
-# DeepInfra (OpenAI-compatible embeddings)
 DI_TOKEN   = os.getenv("DEEPINFRA_API_KEY", "").strip()
 DI_MODEL   = os.getenv("DEEPINFRA_EMBED_MODEL", "BAAI/bge-m3").strip()
 DI_URL     = os.getenv("DEEPINFRA_EMBED_URL", "https://api.deepinfra.com/v1/openai/embeddings").strip()
 DI_TIMEOUT = float(os.getenv("EMB_TIMEOUT_SEC", "120"))
-# Retries embeddings
-RETRY_MAX      = int(os.getenv("EMB_RETRY_MAX", "6"))
-RETRY_BASE_SEC = float(os.getenv("EMB_RETRY_BASE", "1.5"))
-RETRY_JITTER   = float(os.getenv("EMB_RETRY_JITTER", "0.35"))
-# Qdrant
-QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333").strip()
-QDRANT_API = os.getenv("QDRANT_API_KEY", "").strip()
-# Auth d’API du service (simple header)
 AUTH_TOKEN = os.getenv("REMOTE_INDEX_TOKEN", "").strip()
 LOG.info(f"Embeddings backend order = {EMB_BACKEND_ORDER}")
-LOG.info(f"HF pipeline URL = {HF_URL_PIPELINE}")
-LOG.info(f"HF models   URL = {HF_URL_MODELS}")
 LOG.info(f"VECTOR_STORE = {VECTOR_STORE}")
-if "hf" in EMB_BACKEND_ORDER and not HF_TOKEN:
-    LOG.warning("HF_API_TOKEN manquant — tentatives HF échoueront.")
 if "deepinfra" in EMB_BACKEND_ORDER and not DI_TOKEN:
     LOG.warning("DEEPINFRA_API_KEY manquant — tentatives DeepInfra échoueront.")
-# ---------- Vector store abstraction ----------
-# ---------- Vector Stores ----------
-from typing import Dict, Any, List, Optional, Tuple
-import numpy as np
-import logging
-LOG = logging.getLogger("remote_indexer")
 try:
     from qdrant_client import QdrantClient
@@ -94,19 +90,16 @@ except Exception:
     QdrantClient = None
     PointStruct = None
 class BaseStore:
     def ensure_collection(self, name: str, dim: int): ...
     def upsert(self, name: str, vectors: np.ndarray, payloads: List[Dict[str, Any]]) -> int: ...
     def search(self, name: str, query_vec: np.ndarray, top_k: int) -> List[Dict[str, Any]]: ...
     def wipe(self, name: str): ...
 class MemoryStore(BaseStore):
-    """Store en mémoire (volatile)."""
     def __init__(self):
-        # { collection: {"vecs": [np.ndarray], "payloads": [dict]} }
-        self.db: Dict[str, Dict[str, List[Any]]] = {}
     def ensure_collection(self, name: str, dim: int):
         self.db.setdefault(name, {"vecs": [], "payloads": [], "dim": dim})
@@ -116,18 +109,17 @@ class MemoryStore(BaseStore):
             raise RuntimeError(f"MemoryStore: collection {name} inconnue")
         if len(vectors) != len(payloads):
             raise ValueError("MemoryStore.upsert: tailles vectors/payloads incohérentes")
-        self.db[name]["vecs"].extend([v.astype(np.float32) for v in vectors])
         self.db[name]["payloads"].extend(payloads)
         return len(vectors)
     def search(self, name: str, query_vec: np.ndarray, top_k: int) -> List[Dict[str, Any]]:
         if name not in self.db or not self.db[name]["vecs"]:
             return []
-        mat = np.vstack(self.db[name]["vecs"])  # [N, dim]
-        q = query_vec.reshape(1, -1).astype(np.float32)      # [1, dim]
-        # cosine similarity sur vecteurs normalisés
-        # (on suppose que les embeddings sont déjà normalisés en amont)
-        sims = (mat @ q.T).ravel()  # [N]
         top_idx = np.argsort(-sims)[:top_k]
         out = []
         for i in top_idx:
@@ -139,27 +131,22 @@ class MemoryStore(BaseStore):
     def wipe(self, name: str):
         self.db.pop(name, None)
 class QdrantStore(BaseStore):
-    """Store Qdrant avec gestion d'IDs séquentiels par collection."""
     def __init__(self, url: str, api_key: Optional[str] = None):
         if QdrantClient is None or PointStruct is None:
             raise RuntimeError("qdrant_client non disponible")
         self.client = QdrantClient(url=url, api_key=api_key if api_key else None)
-        # compteur d'IDs par collection
         self._next_ids: Dict[str, int] = {}
     def _init_next_id(self, name: str):
-        # on cherche le count exact des points existants pour démarrer l'ID à count
         try:
             cnt = self.client.count(collection_name=name, exact=True).count
         except Exception:
-            # si count échoue (collection vide juste créée), on démarre à 0
             cnt = 0
         self._next_ids[name] = int(cnt)
     def ensure_collection(self, name: str, dim: int):
-        # si existe déjà, rien à faire ; sinon, création
         try:
             self.client.get_collection(name)
         except Exception:
@@ -167,7 +154,6 @@ class QdrantStore(BaseStore):
                 collection_name=name,
                 vectors_config=VectorParams(size=dim, distance=Distance.COSINE),
             )
-        # initialiser le prochain id si absent
         if name not in self._next_ids:
             self._init_next_id(name)
@@ -176,14 +162,14 @@ class QdrantStore(BaseStore):
             return 0
         if len(vectors) != len(payloads):
             raise ValueError("QdrantStore.upsert: tailles vectors/payloads incohérentes")
         if name not in self._next_ids:
             self._init_next_id(name)
         start = self._next_ids[name]
-        # construction des points avec IDs séquentiels (int)
         pts = [
-            PointStruct(id=start + i, vector=v.astype(np.float32).tolist(), payload=payloads[i])
             for i, v in enumerate(vectors)
         ]
         self.client.upsert(collection_name=name, points=pts)
@@ -201,7 +187,10 @@ class QdrantStore(BaseStore):
         out = []
         for p in res:
             pl = p.payload or {}
-            pl["_score"] = float(p.score) if hasattr(p, "score") else None
             out.append(pl)
         return out
@@ -212,47 +201,25 @@ class QdrantStore(BaseStore):
             pass
         self._next_ids.pop(name, None)
-# ---------- Initialisation du store actif ----------
-import os
-VECTOR_STORE = os.getenv("VECTOR_STORE", "qdrant").strip().lower()
-QDRANT_URL = os.getenv("QDRANT_URL", "").strip()
-QDRANT_API = os.getenv("QDRANT_API_KEY", "").strip()
 try:
     if VECTOR_STORE == "qdrant" and QDRANT_URL:
-        STORE: BaseStore = QdrantStore(QDRANT_URL, api_key=QDRANT_API)
-        # test léger: liste des collections
-        _ = STORE.client.get_collections()
         LOG.info("Connecté à Qdrant.")
         VECTOR_STORE_ACTIVE = "QdrantStore"
     else:
         raise RuntimeError("Qdrant non configuré, fallback mémoire.")
 except Exception as e:
-    LOG.error(f"Qdrant indisponible ({e}) — fallback en mémoire.")
     STORE = MemoryStore()
     VECTOR_STORE_ACTIVE = "MemoryStore"
     LOG.warning("Vector store: MEMORY (fallback). Les données sont volatiles (perdues au restart).")
-# Sélection / auto-fallback du store
-STORE: VectorStoreBase
-def _init_store() -> VectorStoreBase:
-    prefer = VECTOR_STORE
-    if prefer == "memory":
-        return MemoryStore()
-    # prefer qdrant
-    try:
-        return QdrantStore(QDRANT_URL, QDRANT_API if QDRANT_API else None)
-    except Exception as e:
-        LOG.error(f"Qdrant indisponible ({e}) — fallback en mémoire.")
-        return MemoryStore()
-STORE = _init_store()
-# ---------- Pydantic ----------
 class FileIn(BaseModel):
     path: str
     text: str
@@ -270,103 +237,47 @@ class QueryRequest(BaseModel):
     query: str
     top_k: int = 6
-# ---------- Jobs store ----------
-JOBS: Dict[str, Dict[str, Any]] = {}
 def _append_log(job_id: str, line: str):
     job = JOBS.get(job_id)
-    if job: job["logs"].append(line)
 def _set_status(job_id: str, status: str):
     job = JOBS.get(job_id)
-    if job: job["status"] = status
 def _auth(x_auth: Optional[str]):
     if AUTH_TOKEN and (x_auth or "") != AUTH_TOKEN:
-        raise HTTPException(status_code=401, detail="Unauthorized")
-# ---------- Helpers retry ----------
-def _retry_sleep(attempt: int):
     back = (RETRY_BASE_SEC ** attempt)
     jitter = 1.0 + random.uniform(-RETRY_JITTER, RETRY_JITTER)
     return max(0.25, back * jitter)
-def _with_task_param(url: str, task: str = "feature-extraction") -> str:
-    return url + ("&" if "?" in url else "?") + f"task={task}"
-# ---------- HF embeddings ----------
-def _hf_http(url: str, payload: Dict[str, Any], headers_extra: Optional[Dict[str, str]] = None) -> Tuple[np.ndarray, int]:
-    if not HF_TOKEN:
-        raise RuntimeError("HF_API_TOKEN manquant (backend=hf).")
-    headers = {
-        "Authorization": f"Bearer {HF_TOKEN}",
-        "Content-Type": "application/json",
-        "Accept": "application/json",
-    }
-    if HF_WAIT:
-        payload.setdefault("options", {})["wait_for_model"] = True
-        headers["X-Wait-For-Model"] = "true"
-        headers["X-Use-Cache"] = "true"
-    if headers_extra:
-        headers.update(headers_extra)
-    r = requests.post(url, headers=headers, json=payload, timeout=HF_TIMEOUT)
-    size = int(r.headers.get("Content-Length", "0"))
-    if r.status_code >= 400:
-        LOG.error(f"HF error {r.status_code}: {r.text[:1000]}")
-        r.raise_for_status()
-    data = r.json()
-    arr = np.array(data, dtype=np.float32)
-    if arr.ndim == 3:
-        arr = arr.mean(axis=1)
-    elif arr.ndim == 1:
-        arr = arr.reshape(1, -1)
-    if arr.ndim != 2:
-        raise RuntimeError(f"HF: unexpected embeddings shape: {arr.shape}")
     norms = np.linalg.norm(arr, axis=1, keepdims=True) + 1e-12
-    arr = arr / norms
-    return arr.astype(np.float32), size
-def _hf_post_embeddings_once(batch: List[str]) -> Tuple[np.ndarray, int]:
-    payload: Dict[str, Any] = {"inputs": (batch if len(batch) > 1 else batch[0])}
-    urls = [HF_URL_PIPELINE, HF_URL_MODELS] if HF_PIPELINE_FIRST else [HF_URL_MODELS, HF_URL_PIPELINE]
-    last_exc: Optional[Exception] = None
-    for idx, url in enumerate(urls, 1):
-        try:
-            if "/models/" in url:
-                return _hf_http(url, payload, headers_extra={"X-Task": "feature-extraction"})
-            else:
-                return _hf_http(url, payload, headers_extra=None)
-        except requests.HTTPError as he:
-            code = he.response.status_code if he.response is not None else 0
-            body = he.response.text if he.response is not None else ""
-            last_exc = he
-            if code in (404, 405, 501) and idx < len(urls):
-                LOG.warning(f"HF endpoint {url} non dispo ({code}), fallback vers alternative ...")
-                continue
-            if "/models/" in url and "SentenceSimilarityPipeline" in (body or ""):
-                try:
-                    forced_url = _with_task_param(url, "feature-extraction")
-                    LOG.warning("HF MODELS a choisi Similarity -> retry avec %s + X-Task", forced_url)
-                    return _hf_http(forced_url, payload, headers_extra={"X-Task": "feature-extraction"})
-                except Exception as he2:
-                    last_exc = he2
-            raise
-        except Exception as e:
-            last_exc = e
-            raise
-    raise RuntimeError(f"HF: aucun endpoint utilisable ({last_exc})")
-# ---------- DeepInfra embeddings ----------
 def _di_post_embeddings_once(batch: List[str]) -> Tuple[np.ndarray, int]:
     if not DI_TOKEN:
         raise RuntimeError("DEEPINFRA_API_KEY manquant (backend=deepinfra).")
-    headers = {"Authorization": f"Bearer {DI_TOKEN}", "Content-Type": "application/json", "Accept": "application/json"}
     payload = {"model": DI_MODEL, "input": batch}
     r = requests.post(DI_URL, headers=headers, json=payload, timeout=DI_TIMEOUT)
-    size = int(r.headers.get("Content-Length", "0"))
     if r.status_code >= 400:
         LOG.error(f"DeepInfra error {r.status_code}: {r.text[:1000]}")
         r.raise_for_status()
@@ -378,15 +289,92 @@ def _di_post_embeddings_once(batch: List[str]) -> Tuple[np.ndarray, int]:
     arr = np.asarray(embs, dtype=np.float32)
     if arr.ndim != 2:
         raise RuntimeError(f"DeepInfra: unexpected embeddings shape: {arr.shape}")
-    norms = np.linalg.norm(arr, axis=1, keepdims=True) + 1e-12
-    arr = arr / norms
-    return arr.astype(np.float32), size
-# ---------- Retry orchestrator ----------
-def _retry_sleep(attempt: int):
-    back = (RETRY_BASE_SEC ** attempt)
-    jitter = 1.0 + random.uniform(-RETRY_JITTER, RETRY_JITTER)
-    return max(0.25, back * jitter)
 def _call_with_retries(func, batch: List[str], label: str, job_id: Optional[str] = None) -> Tuple[np.ndarray, int]:
     last_exc = None
@@ -414,35 +402,43 @@ def _call_with_retries(func, batch: List[str], label: str, job_id: Optional[str]
     raise RuntimeError(f"{label}: retries exhausted: {last_exc}")
 def _post_embeddings(batch: List[str], job_id: Optional[str] = None) -> Tuple[np.ndarray, int]:
     last_err = None
-    similarity_misroute = False
     for b in EMB_BACKEND_ORDER:
-        if b == "hf":
-            try:
-                return _call_with_retries(_hf_post_embeddings_once, batch, "HF", job_id)
-            except requests.HTTPError as he:
-                body = he.response.text if getattr(he, "response", None) is not None else ""
-                if "SentenceSimilarityPipeline.__call__()" in (body or ""):
-                    similarity_misroute = True
-                last_err = he
-                _append_log(job_id, f"HF failed: {he}.")
-                LOG.error(f"HF failed: {he}")
-        elif b == "deepinfra":
             try:
                 return _call_with_retries(_di_post_embeddings_once, batch, "DeepInfra", job_id)
             except Exception as e:
                 last_err = e
                 _append_log(job_id, f"DeepInfra failed: {e}.")
                 LOG.error(f"DeepInfra failed: {e}")
         else:
             _append_log(job_id, f"Backend inconnu ignoré: {b}")
-    if ALLOW_DI_AUTOFALLBACK and similarity_misroute and DI_TOKEN:
-        LOG.warning("HF a routé sur SentenceSimilarity => auto-fallback DeepInfra (override ordre).")
-        _append_log(job_id, "Auto-fallback DeepInfra (HF => SentenceSimilarity).")
-        return _call_with_retries(_di_post_embeddings_once, batch, "DeepInfra", job_id)
     raise RuntimeError(f"Tous les backends ont échoué: {last_err}")
-# ---------- Chunking ----------
 def _chunk_with_spans(text: str, size: int, overlap: int):
     n = len(text or "")
     if size <= 0:
@@ -452,66 +448,71 @@ def _chunk_with_spans(text: str, size: int, overlap: int):
         j = min(n, i + size)
         yield (i, j, text[i:j])
         i = max(0, j - overlap)
-        if i >= n: break
-# ---------- Background task ----------
 def run_index_job(job_id: str, req: IndexRequest):
     try:
         _set_status(job_id, "running")
-        total_chunks = 0
         _append_log(job_id, f"Start project={req.project_id} files={len(req.files)} | backends={EMB_BACKEND_ORDER} | store={VECTOR_STORE}")
         LOG.info(f"[{job_id}] Index start project={req.project_id} files={len(req.files)}")
         # Warmup -> dimension
-        warm = "warmup"
-        if req.files:
-            for _, _, chunk_txt in _chunk_with_spans(req.files[0].text or "", req.chunk_size, req.overlap):
-                if (chunk_txt or "").strip():
-                    warm = chunk_txt; break
         embs, _ = _post_embeddings([warm], job_id=job_id)
         dim = embs.shape[1]
         col = f"proj_{req.project_id}"
         STORE.ensure_collection(col, dim)
         _append_log(job_id, f"Collection ready: {col} (dim={dim})")
-        # loop fichiers
         for fi, f in enumerate(req.files, 1):
-            if not (f.text or "").strip():
-                _append_log(job_id, f"file {fi}: vide — ignoré")
-                continue
-            batch_txts, metas = [], []
-            def _flush():
-                nonlocal batch_txts, metas, total_chunks
-                if not batch_txts: return
-                vecs, sz = _post_embeddings(batch_txts, job_id=job_id)
-                added = STORE.upsert(col, vecs, metas)
-                total_chunks += added
-                _append_log(job_id, f"file {fi}/{len(req.files)}: +{added} chunks (total={total_chunks})")
-                batch_txts, metas = [], []
             for ci, (start, end, chunk_txt) in enumerate(_chunk_with_spans(f.text, req.chunk_size, req.overlap)):
-                if not (chunk_txt or "").strip():
-                    continue
-                batch_txts.append(chunk_txt)
                 meta = {"path": f.path, "chunk": ci, "start": start, "end": end}
                 if req.store_text:
                     meta["text"] = chunk_txt
-                metas.append(meta)
-                if len(batch_txts) >= req.batch_size:
                     _flush()
             _flush()
         _append_log(job_id, f"Done. chunks={total_chunks}")
         _set_status(job_id, "done")
         LOG.info(f"[{job_id}] Index finished. chunks={total_chunks}")
     except Exception as e:
         LOG.exception("Index job failed")
         _append_log(job_id, f"ERROR: {e}")
         _set_status(job_id, "error")
-# ---------- API ----------
 app = FastAPI()
 @app.get("/")
@@ -520,18 +521,18 @@ def root():
         "ok": True,
         "service": "remote-indexer",
         "backends": EMB_BACKEND_ORDER,
-        "hf_url_pipeline": HF_URL_PIPELINE if "hf" in EMB_BACKEND_ORDER else None,
-        "hf_url_models": HF_URL_MODELS if "hf" in EMB_BACKEND_ORDER else None,
         "di_url": DI_URL if "deepinfra" in EMB_BACKEND_ORDER else None,
         "di_model": DI_MODEL if "deepinfra" in EMB_BACKEND_ORDER else None,
         "vector_store": VECTOR_STORE,
-        "vector_store_active": type(STORE).__name__,
-        "docs": "/health, /index, /status/{job_id}, /query, /wipe"
     }
 @app.get("/health")
 def health():
-    return {"ok": True}
 def _check_backend_ready():
     if "hf" in EMB_BACKEND_ORDER and not HF_TOKEN:
@@ -541,14 +542,8 @@ def _check_backend_ready():
 @app.post("/index")
 def start_index(req: IndexRequest, background_tasks: BackgroundTasks, x_auth_token: Optional[str] = Header(default=None)):
-    if AUTH_TOKEN and (x_auth_token or "") != AUTH_TOKEN:
-        raise HTTPException(401, "Unauthorized")
     _check_backend_ready()
-    non_empty = [f for f in req.files if (f.text or "").strip()]
-    if not non_empty:
-        raise HTTPException(422, "Aucun fichier non vide à indexer.")
-    req.files = non_empty
     job_id = uuid.uuid4().hex[:12]
     JOBS[job_id] = {"status": "queued", "logs": [], "created": time.time()}
     background_tasks.add_task(run_index_job, job_id, req)
@@ -556,58 +551,25 @@ def start_index(req: IndexRequest, background_tasks: BackgroundTasks, x_auth_tok
 @app.get("/status/{job_id}")
 def status(job_id: str, x_auth_token: Optional[str] = Header(default=None)):
-    if AUTH_TOKEN and (x_auth_token or "") != AUTH_TOKEN:
-        raise HTTPException(401, "Unauthorized")
-    j = JOBS.get(job_id)
-    if not j:
-        raise HTTPException(404, "job inconnu")
-    return {"status": j["status"], "logs": j["logs"][-800:]}
-# Legacy compat
-@app.get("/status")
-def status_qp(job_id: str = Query(None), x_auth_token: Optional[str] = Header(default=None)):
-    if AUTH_TOKEN and (x_auth_token or "") != AUTH_TOKEN:
-        raise HTTPException(401, "Unauthorized")
-    if not job_id:
-        raise HTTPException(404, "job inconnu")
     j = JOBS.get(job_id)
     if not j:
         raise HTTPException(404, "job inconnu")
-    return {"status": j["status"], "logs": j["logs"][-800:]}
-class _StatusBody(BaseModel):
-    job_id: str
-@app.post("/status")
-def status_post(body: _StatusBody, x_auth_token: Optional[str] = Header(default=None)):
-    if AUTH_TOKEN and (x_auth_token or "") != AUTH_TOKEN:
-        raise HTTPException(401, "Unauthorized")
-    j = JOBS.get(body.job_id)
-    if not j:
-        raise HTTPException(404, "job inconnu")
-    return {"status": j["status"], "logs": j["logs"][-800:]}
 @app.post("/query")
 def query(req: QueryRequest, x_auth_token: Optional[str] = Header(default=None)):
-    if AUTH_TOKEN and (x_auth_token or "") != AUTH_TOKEN:
-        raise HTTPException(401, "Unauthorized")
     _check_backend_ready()
-    k = int(max(1, min(50, req.top_k or 6)))
     vecs, _ = _post_embeddings([req.query])
     col = f"proj_{req.project_id}"
-    # Recherche selon le store actif
     try:
-        hits = STORE.search(col, vecs[0], k)
     except Exception as e:
         raise HTTPException(400, f"Search failed: {e}")
     out = []
-    # Qdrant renvoie des objets avec .score, .payload
-    for p in hits:
-        pl = getattr(p, "payload", None) or {}
-        score = float(getattr(p, "score", 0.0))
         txt = pl.get("text")
         if txt and len(txt) > 800:
             txt = txt[:800] + "..."
@@ -617,21 +579,24 @@ def query(req: QueryRequest, x_auth_token: Optional[str] = Header(default=None))
             "start": pl.get("start"),
             "end": pl.get("end"),
             "text": txt,
-            "score": score,
         })
     return {"results": out}
 @app.post("/wipe")
 def wipe_collection(project_id: str, x_auth_token: Optional[str] = Header(default=None)):
-    if AUTH_TOKEN and (x_auth_token or "") != AUTH_TOKEN:
-        raise HTTPException(401, "Unauthorized")
     col = f"proj_{project_id}"
     try:
-        STORE.wipe(col); return {"ok": True}
     except Exception as e:
         raise HTTPException(400, f"wipe failed: {e}")
-# ---------- Entrypoint ----------
 if __name__ == "__main__":
     import uvicorn
     port = int(os.getenv("PORT", "7860"))

 # -*- coding: utf-8 -*-
 from __future__ import annotations
+import os
+import time
+import uuid
+import math
+import random
+import logging
 from typing import List, Optional, Dict, Any, Tuple
 import numpy as np
 import requests
+from fastapi import FastAPI, BackgroundTasks, Header, HTTPException
 from pydantic import BaseModel, Field
+# ======================================================================================
+# Logging
+# ======================================================================================
 logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(name)s:%(message)s")
 LOG = logging.getLogger("remote_indexer")
+# ======================================================================================
+# ENV (config)
+# ======================================================================================
+# Ordre des backends d'embeddings à essayer (séparés par des virgules). Ex: "deepinfra,hf"
+EMB_BACKEND_ORDER = [
+    s.strip().lower()
+    for s in os.getenv("EMB_BACKEND_ORDER", os.getenv("EMB_BACKEND", "deepinfra,hf")).split(",")
+    if s.strip()
+]
+# --- DeepInfra Embeddings (OpenAI-like) ---
+# API: POST https://api.deepinfra.com/v1/openai/embeddings
+# Body: {"model":"BAAI/bge-m3","input":[text1,text2,...]}
 DI_TOKEN   = os.getenv("DEEPINFRA_API_KEY", "").strip()
 DI_MODEL   = os.getenv("DEEPINFRA_EMBED_MODEL", "BAAI/bge-m3").strip()
 DI_URL     = os.getenv("DEEPINFRA_EMBED_URL", "https://api.deepinfra.com/v1/openai/embeddings").strip()
 DI_TIMEOUT = float(os.getenv("EMB_TIMEOUT_SEC", "120"))
+# --- Hugging Face Inference API ---
+# Deux endpoints possibles :
+# 1) Pipeline feature-extraction (souvent 404 selon le modèle)
+# 2) Models (parfois route sur SentenceSimilarity => besoin de forcer feature-extraction)
+HF_TOKEN    = os.getenv("HF_API_TOKEN", "").strip()
+HF_MODEL    = os.getenv("HF_EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2").strip()
+HF_URL_PIPE = os.getenv("HF_API_URL_PIPELINE", "").strip() or (
+    f"https://api-inference.huggingface.co/pipeline/feature-extraction/{HF_MODEL}"
+)
+HF_URL_MODL = os.getenv("HF_API_URL_MODELS", "").strip() or (
+    f"https://api-inference.huggingface.co/models/{HF_MODEL}"
+)
+HF_TIMEOUT  = float(os.getenv("EMB_TIMEOUT_SEC", "120"))
+HF_WAIT     = os.getenv("HF_WAIT_FOR_MODEL", "true").lower() in ("1", "true", "yes", "on")
+# --- Retries / backoff ---
+RETRY_MAX      = int(os.getenv("EMB_RETRY_MAX", "6"))         # tentatives max par backend
+RETRY_BASE_SEC = float(os.getenv("EMB_RETRY_BASE", "1.6"))    # base du backoff exponentiel
+RETRY_JITTER   = float(os.getenv("EMB_RETRY_JITTER", "0.35")) # jitter fraction (0..1)
+# --- Vector store (Qdrant / Memory fallback) ---
+VECTOR_STORE = os.getenv("VECTOR_STORE", "qdrant").strip().lower()
+QDRANT_URL   = os.getenv("QDRANT_URL", "").strip()
+QDRANT_API   = os.getenv("QDRANT_API_KEY", "").strip()
+# --- Auth d’API de ce service (simple header) ---
+# Si défini, le client doit envoyer X-Auth-Token:{REMOTE_INDEX_TOKEN}
 AUTH_TOKEN = os.getenv("REMOTE_INDEX_TOKEN", "").strip()
 LOG.info(f"Embeddings backend order = {EMB_BACKEND_ORDER}")
+LOG.info(f"HF pipeline URL = {HF_URL_PIPE}")
+LOG.info(f"HF models   URL = {HF_URL_MODL}")
 LOG.info(f"VECTOR_STORE = {VECTOR_STORE}")
 if "deepinfra" in EMB_BACKEND_ORDER and not DI_TOKEN:
     LOG.warning("DEEPINFRA_API_KEY manquant — tentatives DeepInfra échoueront.")
+if "hf" in EMB_BACKEND_ORDER and not HF_TOKEN:
+    LOG.warning("HF_API_TOKEN manquant — tentatives HF échoueront.")
+# ======================================================================================
+# Vector Stores (Memory + Qdrant)
+# ======================================================================================
+from typing import Iterable
 try:
     from qdrant_client import QdrantClient
     QdrantClient = None
     PointStruct = None
 class BaseStore:
     def ensure_collection(self, name: str, dim: int): ...
     def upsert(self, name: str, vectors: np.ndarray, payloads: List[Dict[str, Any]]) -> int: ...
     def search(self, name: str, query_vec: np.ndarray, top_k: int) -> List[Dict[str, Any]]: ...
     def wipe(self, name: str): ...
 class MemoryStore(BaseStore):
+    """Store en mémoire (volatile) — pour fallback et tests."""
     def __init__(self):
+        self.db: Dict[str, Dict[str, Any]] = {}  # name -> {"vecs":[np.ndarray], "payloads":[dict], "dim":int}
     def ensure_collection(self, name: str, dim: int):
         self.db.setdefault(name, {"vecs": [], "payloads": [], "dim": dim})
             raise RuntimeError(f"MemoryStore: collection {name} inconnue")
         if len(vectors) != len(payloads):
             raise ValueError("MemoryStore.upsert: tailles vectors/payloads incohérentes")
+        self.db[name]["vecs"].extend([np.asarray(v, dtype=np.float32) for v in vectors])
         self.db[name]["payloads"].extend(payloads)
         return len(vectors)
     def search(self, name: str, query_vec: np.ndarray, top_k: int) -> List[Dict[str, Any]]:
         if name not in self.db or not self.db[name]["vecs"]:
             return []
+        mat = np.vstack(self.db[name]["vecs"]).astype(np.float32)  # [N, dim]
+        q = query_vec.reshape(1, -1).astype(np.float32)             # [1, dim]
+        # cosine similarity (embeddings déjà normalisés en amont)
+        sims = (mat @ q.T).ravel()
         top_idx = np.argsort(-sims)[:top_k]
         out = []
         for i in top_idx:
     def wipe(self, name: str):
         self.db.pop(name, None)
 class QdrantStore(BaseStore):
+    """Store Qdrant avec gestion d'IDs séquentiels (requis par PointStruct)."""
     def __init__(self, url: str, api_key: Optional[str] = None):
         if QdrantClient is None or PointStruct is None:
             raise RuntimeError("qdrant_client non disponible")
         self.client = QdrantClient(url=url, api_key=api_key if api_key else None)
         self._next_ids: Dict[str, int] = {}
     def _init_next_id(self, name: str):
         try:
             cnt = self.client.count(collection_name=name, exact=True).count
         except Exception:
             cnt = 0
         self._next_ids[name] = int(cnt)
     def ensure_collection(self, name: str, dim: int):
         try:
             self.client.get_collection(name)
         except Exception:
                 collection_name=name,
                 vectors_config=VectorParams(size=dim, distance=Distance.COSINE),
             )
         if name not in self._next_ids:
             self._init_next_id(name)
             return 0
         if len(vectors) != len(payloads):
             raise ValueError("QdrantStore.upsert: tailles vectors/payloads incohérentes")
         if name not in self._next_ids:
             self._init_next_id(name)
         start = self._next_ids[name]
         pts = [
+            PointStruct(id=start + i,
+                        vector=np.asarray(v, dtype=np.float32).tolist(),
+                        payload=payloads[i])
             for i, v in enumerate(vectors)
         ]
         self.client.upsert(collection_name=name, points=pts)
         out = []
         for p in res:
             pl = p.payload or {}
+            try:
+                pl["_score"] = float(p.score)
+            except Exception:
+                pl["_score"] = None
             out.append(pl)
         return out
             pass
         self._next_ids.pop(name, None)
+# Initialisation du store actif (avec test de connexion)
 try:
     if VECTOR_STORE == "qdrant" and QDRANT_URL:
+        STORE: BaseStore = QdrantStore(QDRANT_URL, api_key=QDRANT_API if QDRANT_API else None)
+        _ = STORE.client.get_collections()  # ping
         LOG.info("Connecté à Qdrant.")
         VECTOR_STORE_ACTIVE = "QdrantStore"
     else:
         raise RuntimeError("Qdrant non configuré, fallback mémoire.")
 except Exception as e:
+    LOG.error(f"Qdrant indisponible (Connexion Qdrant impossible: {e}) — fallback en mémoire.")
     STORE = MemoryStore()
     VECTOR_STORE_ACTIVE = "MemoryStore"
     LOG.warning("Vector store: MEMORY (fallback). Les données sont volatiles (perdues au restart).")
+# ======================================================================================
+# Pydantic I/O
+# ======================================================================================
 class FileIn(BaseModel):
     path: str
     text: str
     query: str
     top_k: int = 6
+# ======================================================================================
+# Jobs store (mémoire)
+# ======================================================================================
+JOBS: Dict[str, Dict[str, Any]] = {}  # {job_id: {"status": "...", "logs": [...], "created": ts}}
 def _append_log(job_id: str, line: str):
     """Append ``line`` to the log list of job ``job_id``.

     Does nothing when ``JOBS`` has no (truthy) entry for ``job_id``.
     """
     entry = JOBS.get(job_id)
     if not entry:
         return
     entry["logs"].append(line)
 def _set_status(job_id: str, status: str):
     """Set the ``"status"`` field of job ``job_id`` to ``status``.

     Does nothing when ``JOBS`` has no (truthy) entry for ``job_id``.
     """
     entry = JOBS.get(job_id)
     if not entry:
         return
     entry["status"] = status
 def _auth(x_auth: Optional[str]):
     """Reject the request unless ``x_auth`` matches the configured token.

     When ``AUTH_TOKEN`` is empty, authentication is disabled and every request
     is accepted. Otherwise a missing header is treated as an empty string.

     Args:
         x_auth: Token supplied by the client, or ``None`` when absent.

     Raises:
         HTTPException: 401 when a token is configured and ``x_auth`` differs.
     """
     # Local import keeps this block self-contained.
     import hmac

     if not AUTH_TOKEN:
         return
     # assumes AUTH_TOKEN is a str (e.g. read from the environment) — TODO confirm
     supplied = (x_auth or "").encode("utf-8")
     expected = AUTH_TOKEN.encode("utf-8")
     # Constant-time comparison: a plain != can leak, through timing, how many
     # leading characters of the secret the caller guessed correctly.
     if not hmac.compare_digest(supplied, expected):
         raise HTTPException(401, "Unauthorized")
+# ======================================================================================
+# Embeddings backends + retry/fallback
+# ======================================================================================
def _retry_sleep(attempt: int) -> float:
    """Return the delay to wait before retry number ``attempt``.

    The delay is ``RETRY_BASE_SEC ** attempt`` multiplied by a random factor
    drawn uniformly from ``1 ± RETRY_JITTER``, and never less than 0.25.
    """
    base_delay = RETRY_BASE_SEC ** attempt
    spread = random.uniform(-RETRY_JITTER, RETRY_JITTER)
    delay = base_delay * (1.0 + spread)
    return max(0.25, delay)
+def _normalize_rows(arr: np.ndarray) -> np.ndarray:
+    arr = np.asarray(arr, dtype=np.float32)
     norms = np.linalg.norm(arr, axis=1, keepdims=True) + 1e-12
+    return (arr / norms).astype(np.float32)
 def _di_post_embeddings_once(batch: List[str]) -> Tuple[np.ndarray, int]:
     if not DI_TOKEN:
         raise RuntimeError("DEEPINFRA_API_KEY manquant (backend=deepinfra).")
+    headers = {"Authorization": f"Bearer {DI_TOKEN}", "Content-Type": "application/json"}
     payload = {"model": DI_MODEL, "input": batch}
     r = requests.post(DI_URL, headers=headers, json=payload, timeout=DI_TIMEOUT)
+    size = int(r.headers.get("Content-Length", "0") or 0)
     if r.status_code >= 400:
         LOG.error(f"DeepInfra error {r.status_code}: {r.text[:1000]}")
         r.raise_for_status()
     arr = np.asarray(embs, dtype=np.float32)
     if arr.ndim != 2:
         raise RuntimeError(f"DeepInfra: unexpected embeddings shape: {arr.shape}")
+    return _normalize_rows(arr), size
def _hf_embeddings_from_json(data: Any) -> np.ndarray:
    """Turn a decoded HF feature-extraction response into a 2-D float32 matrix.

    Accepted layouts:
      * 1-D ``[dim]``                -> reshaped to ``[1, dim]``
      * 2-D ``[batch, dim]``         -> returned as is
      * 3-D ``[batch, tokens, dim]`` -> averaged over the token axis
      * a ragged list whose items cannot form one rectangular array
        (presumably per-token outputs with different token counts — confirm
        against the model in use): each item is averaged over its first axis,
        then the results are stacked.

    Raises:
        RuntimeError: if the result is not 2-D after the conversions above.
        ValueError: if the payload cannot be converted to numbers at all.
    """
    try:
        arr = np.array(data, dtype=np.float32)
    except ValueError:
        # NumPy refuses to build a rectangular float array from ragged input;
        # pool each item separately instead of failing the whole batch.
        if not isinstance(data, list) or not data:
            raise
        arr = np.stack([np.asarray(item, dtype=np.float32).mean(axis=0) for item in data])
    if arr.ndim == 3:  # [batch, tokens, dim]
        arr = arr.mean(axis=1)
    if arr.ndim == 1:
        arr = arr.reshape(1, -1)
    if arr.ndim != 2:
        raise RuntimeError(f"HF: unexpected embeddings shape: {arr.shape}")
    return arr


def _hf_post_embeddings_once(batch: List[str]) -> Tuple[np.ndarray, int]:
    """Request embeddings for ``batch`` from the Hugging Face endpoints, once.

    Strategy:
      1. Try the PIPELINE feature-extraction URL (``HF_URL_PIPE``).
      2. On 404, fall back to the MODELS URL (``HF_URL_MODL``).
         2a. If that route answers 400 and the body mentions
             ``SentenceSimilarityPipeline``, retry the MODELS URL with
             ``?task=feature-extraction`` and an ``X-Task`` header.

    Returns:
        ``(embeddings, size)``: row-normalized float32 embeddings and the
        ``Content-Length`` of the response that produced them (0 if absent).

    Raises:
        RuntimeError: if ``HF_TOKEN`` is empty or the response has an
            unexpected shape.
        requests.HTTPError: via ``raise_for_status`` on a failing response.
    """
    if not HF_TOKEN:
        raise RuntimeError("HF_API_TOKEN manquant (backend=hf).")
    headers = {
        "Authorization": f"Bearer {HF_TOKEN}",
        "Content-Type": "application/json",
    }
    if HF_WAIT:
        headers["X-Wait-For-Model"] = "true"
        headers["X-Use-Cache"] = "true"

    # A single text is sent as a bare string, several texts as a list.
    payload = {"inputs": batch if len(batch) > 1 else batch[0]}

    def _post(url: str, extra_headers: Optional[Dict[str, str]] = None):
        """POST the payload to ``url``; return the response and its Content-Length."""
        merged = dict(headers)
        if extra_headers:
            merged.update(extra_headers)
        resp = requests.post(url, headers=merged, json=payload, timeout=HF_TIMEOUT)
        return resp, int(resp.headers.get("Content-Length", "0") or 0)

    def _result(resp, size: int) -> Tuple[np.ndarray, int]:
        """Decode, reshape and normalize the embeddings carried by ``resp``."""
        return _normalize_rows(_hf_embeddings_from_json(resp.json())), size

    # 1) Pipeline
    resp, size = _post(HF_URL_PIPE)
    if resp.status_code < 400:
        return _result(resp, size)
    if resp.status_code == 404:
        LOG.error("HF error 404: Not Found")
        LOG.warning(f"HF endpoint {HF_URL_PIPE} non dispo (404), fallback vers alternative ...")
    else:
        LOG.error(f"HF error {resp.status_code}: {resp.text[:1000]}")
        resp.raise_for_status()

    # 2) MODELS
    resp, size = _post(HF_URL_MODL)
    if resp.status_code >= 400:
        LOG.error(f"HF error {resp.status_code}: {resp.text[:1000]}")
        if resp.status_code == 400 and "SentenceSimilarityPipeline" in (resp.text or ""):
            # The route picked the similarity task: force feature-extraction explicitly.
            LOG.warning("HF MODELS a choisi Similarity -> retry avec ?task=feature-extraction + X-Task")
            resp, size = _post(
                HF_URL_MODL + "?task=feature-extraction",
                extra_headers={"X-Task": "feature-extraction"},
            )
            if resp.status_code >= 400:
                LOG.error(f"HF error {resp.status_code}: {resp.text[:1000]}")
                resp.raise_for_status()
        else:
            resp.raise_for_status()
    return _result(resp, size)
 def _call_with_retries(func, batch: List[str], label: str, job_id: Optional[str] = None) -> Tuple[np.ndarray, int]:
     last_exc = None
     raise RuntimeError(f"{label}: retries exhausted: {last_exc}")
 def _post_embeddings(batch: List[str], job_id: Optional[str] = None) -> Tuple[np.ndarray, int]:
     """Embed ``batch`` with the first backend of ``EMB_BACKEND_ORDER`` that succeeds.

     Each known backend ("deepinfra", "hf") is called through
     ``_call_with_retries``. Unknown names are reported in the job log and
     skipped. When HF fails with an error mentioning
     ``SentenceSimilarityPipeline`` and DeepInfra is not in the configured
     order, DeepInfra is tried once as an automatic fallback.

     Example: ``EMB_BACKEND_ORDER=deepinfra,hf``.

     Raises:
         RuntimeError: when every attempted backend failed; the message carries
             the last error seen.
     """
     known = {
         "deepinfra": (_di_post_embeddings_once, "DeepInfra"),
         "hf": (_hf_post_embeddings_once, "HF"),
     }
     failure = None
     for backend in EMB_BACKEND_ORDER:
         entry = known.get(backend)
         if entry is None:
             _append_log(job_id, f"Backend inconnu ignoré: {backend}")
             continue
         once, label = entry
         try:
             return _call_with_retries(once, batch, label, job_id)
         except Exception as exc:
             failure = exc
             _append_log(job_id, f"{label} failed: {exc}.")
             LOG.error(f"{label} failed: {exc}")
             if backend != "hf":
                 continue
             # Auto-fallback applies only to the HF "SentenceSimilarity" routing
             # error, and only when DeepInfra is not already part of the order.
             if "SentenceSimilarityPipeline" not in str(exc) or "deepinfra" in EMB_BACKEND_ORDER:
                 continue
             _append_log(job_id, "Auto-fallback DeepInfra (HF => SentenceSimilarity).")
             try:
                 return _call_with_retries(_di_post_embeddings_once, batch, "DeepInfra", job_id)
             except Exception as exc2:
                 failure = exc2
                 _append_log(job_id, f"DeepInfra failed after HF: {exc2}.")
                 LOG.error(f"DeepInfra failed after HF: {exc2}")
     raise RuntimeError(f"Tous les backends ont échoué: {failure}")
+# ======================================================================================
+# Helpers chunking
+# ======================================================================================
 def _chunk_with_spans(text: str, size: int, overlap: int):
     n = len(text or "")
     if size <= 0:
         j = min(n, i + size)
         yield (i, j, text[i:j])
         i = max(0, j - overlap)
+        if i >= n:
+            break
+# ======================================================================================
+# Background task : indexation
+# ======================================================================================
 def run_index_job(job_id: str, req: IndexRequest):
     """Background task: chunk, embed and store every file of ``req``.

     Steps:
       1. Embed one warm-up text to discover the embedding dimension.
       2. Ensure the collection ``proj_<project_id>`` exists with that dimension.
       3. Chunk each file, buffer chunks with their metadata, and flush the
          buffer (embed + upsert) every ``req.batch_size`` chunks and at the
          end of each file.

     Progress and errors are written to the job log. The job status moves to
     "running", then "done" or "error". Exceptions are caught and logged here,
     never re-raised.

     Args:
         job_id: Key of the job entry to update.
         req: Indexing request. This function reads ``project_id``, ``files``
             (each with ``path`` and ``text``), ``chunk_size``, ``overlap``,
             ``batch_size`` and ``store_text``.
     """
     try:
         _set_status(job_id, "running")
         _append_log(job_id, f"Start project={req.project_id} files={len(req.files)} | backends={EMB_BACKEND_ORDER} | store={VECTOR_STORE}")
         LOG.info(f"[{job_id}] Index start project={req.project_id} files={len(req.files)}")

         # Warm-up -> embedding dimension. Prefer the first chunk of the first
         # file. next() gets a default so that a first file producing no chunk
         # (e.g. empty text) falls back to the literal "warmup" instead of
         # raising StopIteration and failing the whole job.
         warm = "warmup"
         if req.files:
             first = next(_chunk_with_spans(req.files[0].text, req.chunk_size, req.overlap), None)
             if first is not None and first[2]:
                 warm = first[2]
         embs, _ = _post_embeddings([warm], job_id=job_id)
         dim = embs.shape[1]
         col = f"proj_{req.project_id}"

         # Create the collection if needed.
         STORE.ensure_collection(col, dim)
         _append_log(job_id, f"Collection ready: {col} (dim={dim})")

         total_chunks = 0
         buf_chunks: List[str] = []
         buf_metas: List[Dict[str, Any]] = []

         def _flush():
             """Embed and upsert the buffered chunks, then reset the buffers."""
             nonlocal buf_chunks, buf_metas, total_chunks
             if not buf_chunks:
                 return
             vecs, sz = _post_embeddings(buf_chunks, job_id=job_id)
             added = STORE.upsert(col, vecs, buf_metas)
             total_chunks += added
             _append_log(job_id, f"+{added} chunks (total={total_chunks}) ~{(sz/1024.0):.1f}KiB")
             buf_chunks, buf_metas = [], []

         # Files + chunks loop.
         for fi, f in enumerate(req.files, 1):
             for ci, (start, end, chunk_txt) in enumerate(_chunk_with_spans(f.text, req.chunk_size, req.overlap)):
                 buf_chunks.append(chunk_txt)
                 meta = {"path": f.path, "chunk": ci, "start": start, "end": end}
                 if req.store_text:
                     meta["text"] = chunk_txt
                 buf_metas.append(meta)
                 if len(buf_chunks) >= req.batch_size:
                     _flush()
                     _append_log(job_id, f"file {fi}/{len(req.files)}: +{req.batch_size} chunks (total={total_chunks})")
             # End-of-file flush so batches never mix chunks from two files.
             _flush()
             _append_log(job_id, f"file {fi}/{len(req.files)} processed.")

         _append_log(job_id, f"Done. chunks={total_chunks}")
         _set_status(job_id, "done")
         LOG.info(f"[{job_id}] Index finished. chunks={total_chunks}")
     except Exception as e:
         LOG.exception("Index job failed")
         _append_log(job_id, f"ERROR: {e}")
         _set_status(job_id, "error")
+# ======================================================================================
+# API
+# ======================================================================================
 app = FastAPI()
 @app.get("/")
         "ok": True,
         "service": "remote-indexer",
         "backends": EMB_BACKEND_ORDER,
+        "hf_url_pipeline": HF_URL_PIPE if "hf" in EMB_BACKEND_ORDER else None,
+        "hf_url_models": HF_URL_MODL if "hf" in EMB_BACKEND_ORDER else None,
         "di_url": DI_URL if "deepinfra" in EMB_BACKEND_ORDER else None,
         "di_model": DI_MODEL if "deepinfra" in EMB_BACKEND_ORDER else None,
         "vector_store": VECTOR_STORE,
+        "vector_store_active": VECTOR_STORE_ACTIVE,
+        "docs": "/health, /index, /status/{job_id}, /query, /wipe",
     }
 @app.get("/health")
 def health():
     """Liveness endpoint: report OK together with the active vector store name."""
     report = {"ok": True, "store": VECTOR_STORE_ACTIVE}
     return report
 def _check_backend_ready():
     if "hf" in EMB_BACKEND_ORDER and not HF_TOKEN:
 @app.post("/index")
 def start_index(req: IndexRequest, background_tasks: BackgroundTasks, x_auth_token: Optional[str] = Header(default=None)):
+    _auth(x_auth_token)
     _check_backend_ready()
     job_id = uuid.uuid4().hex[:12]
     JOBS[job_id] = {"status": "queued", "logs": [], "created": time.time()}
     background_tasks.add_task(run_index_job, job_id, req)
 @app.get("/status/{job_id}")
 def status(job_id: str, x_auth_token: Optional[str] = Header(default=None)):
     """Return the status and most recent log lines of an indexing job.

     Raises:
         HTTPException: 401 from ``_auth`` on a bad token; 404 when ``JOBS``
             has no (truthy) entry for ``job_id``.
     """
     _auth(x_auth_token)
     job = JOBS.get(job_id)
     if job:
         current = job["status"]
         # Cap the payload: only the last 1200 log lines are returned.
         recent_logs = job["logs"][-1200:]
         return {"status": current, "logs": recent_logs}
     raise HTTPException(404, "job inconnu")
 @app.post("/query")
 def query(req: QueryRequest, x_auth_token: Optional[str] = Header(default=None)):
+    _auth(x_auth_token)
     _check_backend_ready()
     vecs, _ = _post_embeddings([req.query])
     col = f"proj_{req.project_id}"
     try:
+        results = STORE.search(col, vecs[0], int(req.top_k))
     except Exception as e:
         raise HTTPException(400, f"Search failed: {e}")
     out = []
+    for pl in results:
         txt = pl.get("text")
         if txt and len(txt) > 800:
             txt = txt[:800] + "..."
             "start": pl.get("start"),
             "end": pl.get("end"),
             "text": txt,
+            "score": float(pl.get("_score")) if pl.get("_score") is not None else None
         })
     return {"results": out}
 @app.post("/wipe")
 def wipe_collection(project_id: str, x_auth_token: Optional[str] = Header(default=None)):
     """Delete the vector collection ``proj_<project_id>`` from the active store.

     Returns:
         ``{"ok": True}`` when the store's ``wipe`` call completes.

     Raises:
         HTTPException: 401 from ``_auth`` on a bad token; 400 carrying the
             underlying error text when the wipe fails.
     """
     _auth(x_auth_token)
     target = f"proj_{project_id}"
     try:
         STORE.wipe(target)
     except Exception as e:
         raise HTTPException(400, f"wipe failed: {e}")
     return {"ok": True}
+# ======================================================================================
+# Entrypoint
+# ======================================================================================
 if __name__ == "__main__":
     import uvicorn
     port = int(os.getenv("PORT", "7860"))