chouchouvs committed
Commit f1ee128 · verified · 1 Parent(s): 903baeb

Update main.py

Files changed (1):
  1. main.py +257 -248
main.py CHANGED
@@ -1,51 +1,58 @@
  # -*- coding: utf-8 -*-
  """
- Remote Indexer (HF Space) Qdrant + embeddings (HF or dummy)
- Version: worker thread + ensure_collection_hard (retries + verify) + tolerant count.
-
- Endpoints:
- - GET /   redirects to UI_PATH (default: /ui)
- - GET /ui   Gradio UI
- - GET /health   healthcheck
- - GET /api → service info
- - GET /debug/env → config overview (without secrets)
- - POST /wipe?project_id=XXX
- - POST /index
- - GET /status/{job_id}
- - GET /collections/{project_id}/count
- - POST /query
- - POST /collections/{project_id}/ensure?dim=XXX (debug)

  ENV:
- - QDRANT_URL, QDRANT_API_KEY (required for upsert)
- - COLLECTION_PREFIX (default "proj_")
- - EMB_PROVIDER ("hf" | "dummy"; default "hf")
- - HF_EMBED_MODEL (default "BAAI/bge-m3")
- - HUGGINGFACEHUB_API_TOKEN (if EMB_PROVIDER=hf)
- - EMB_FALLBACK_TO_DUMMY (true/false) falls back to dummy if HF fails
- - LOG_LEVEL (default DEBUG)
- - UI_PATH (default "/ui")
- - PORT (default 7860)
  """

  from __future__ import annotations
  import os
  import time
  import uuid
  import hashlib
  import logging
- import threading
  import asyncio
  from typing import List, Dict, Any, Optional, Tuple

  import numpy as np
  import httpx
  import uvicorn
  from pydantic import BaseModel, Field, ValidationError
  from fastapi import FastAPI, HTTPException, Query
  from fastapi.middleware.cors import CORSMiddleware
- from fastapi.responses import RedirectResponse
- import gradio as gr

  # ------------------------------------------------------------------------------
  # Config & logs
@@ -55,26 +62,24 @@ logging.basicConfig(
      level=getattr(logging, LOG_LEVEL, logging.DEBUG),
      format="%(asctime)s - %(levelname)s - %(message)s",
  )
- LOG = logging.getLogger("remote_indexer_min")
-
- QDRANT_URL = os.getenv("QDRANT_URL", "").rstrip("/")
- QDRANT_API_KEY = os.getenv("QDRANT_API_KEY", "")
- COLLECTION_PREFIX = os.getenv("COLLECTION_PREFIX", "proj_").strip() or "proj_"

- EMB_PROVIDER = os.getenv("EMB_PROVIDER", "hf").lower()
  HF_EMBED_MODEL = os.getenv("HF_EMBED_MODEL", "BAAI/bge-m3")
  HF_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN", "")
  EMB_FALLBACK_TO_DUMMY = os.getenv("EMB_FALLBACK_TO_DUMMY", "false").lower() in ("1","true","yes","on")

  UI_PATH = os.getenv("UI_PATH", "/ui")

- if not QDRANT_URL or not QDRANT_API_KEY:
-     LOG.warning("QDRANT_URL / QDRANT_API_KEY non fournis : l'upsert échouera.")
  if EMB_PROVIDER == "hf" and not HF_TOKEN and not EMB_FALLBACK_TO_DUMMY:
-     LOG.warning("EMB_PROVIDER=hf sans HUGGINGFACEHUB_API_TOKEN (pas de fallback) préférer EMB_PROVIDER=dummy ou EMB_FALLBACK_TO_DUMMY=true.")

  # ------------------------------------------------------------------------------
- # Models
  # ------------------------------------------------------------------------------
  class FileItem(BaseModel):
      path: str
@@ -96,11 +101,11 @@ class QueryRequest(BaseModel):
  class JobState(BaseModel):
      job_id: str
      project_id: str
-     stage: str = "pending"  # pending -> embedding -> upserting -> done/failed
      total_files: int = 0
      total_chunks: int = 0
      embedded: int = 0
-     upserted: int = 0
      errors: List[str] = Field(default_factory=list)
      messages: List[str] = Field(default_factory=list)
      started_at: float = Field(default_factory=time.time)
@@ -114,6 +119,9 @@ class JobState(BaseModel):

  JOBS: Dict[str, JobState] = {}

  # ------------------------------------------------------------------------------
  # Utils
  # ------------------------------------------------------------------------------
@@ -151,90 +159,37 @@ def chunk_text(text: str, chunk_size: int, overlap: int) -> List[Tuple[int, int,
          i = j
      return res

- # ------------------------------------------------------------------------------
- # Qdrant
- # ------------------------------------------------------------------------------
- async def ensure_collection_hard(client: httpx.AsyncClient, coll: str, vector_size: int, job: Optional[JobState] = None) -> None:
-     """
-     Creates the collection if absent, or recreates it if the dimension differs, then
-     verifies that it really exists (GET polling with retries).
-     """
-     def _j(msg: str):
-         if job: job.log(msg)
-         else: LOG.debug(msg)
-
-     url = f"{QDRANT_URL}/collections/{coll}"
-     r = await client.get(url, headers={"api-key": QDRANT_API_KEY}, timeout=20)
-     recreate = False
-     if r.status_code == 200:
-         data = r.json()
-         existing_size = data.get("result", {}).get("vectors", {}).get("size")
-         _j(f"GET collection '{coll}' → 200 dim={existing_size}")
-         if existing_size and int(existing_size) != int(vector_size):
-             _j(f"Dimension ≠ ({existing_size} ≠ {vector_size}) → suppression + recréation")
-             rd = await client.delete(url, headers={"api-key": QDRANT_API_KEY}, timeout=20)
-             _j(f"DELETE collection '{coll}' → {rd.status_code}")
-             recreate = True
-     elif r.status_code == 404:
-         _j(f"GET collection '{coll}' → 404 (à créer)")
-     else:
-         _j(f"GET collection '{coll}' → {r.status_code} {r.text}")
-
-     if r.status_code != 200 or recreate:
-         body = {"vectors": {"size": vector_size, "distance": "Cosine"}}
-         r2 = await client.put(url, headers={"api-key": QDRANT_API_KEY}, json=body, timeout=30)
-         _j(f"PUT create collection '{coll}' dim={vector_size} → {r2.status_code}")
-         if r2.status_code not in (200, 201):
-             raise HTTPException(status_code=500, detail=f"Qdrant PUT collection a échoué: {r2.text}")
-
-     # Poll until the collection is visible
-     for i in range(10):
-         rg = await client.get(url, headers={"api-key": QDRANT_API_KEY}, timeout=20)
-         if rg.status_code == 200:
-             _j(f"Collection '{coll}' disponible (try={i+1})")
-             return
-         await asyncio.sleep(0.3)
-     raise HTTPException(status_code=500, detail=f"Collection '{coll}' non visible après création.")
-
- async def qdrant_upsert(client: httpx.AsyncClient, coll: str, points: List[Dict[str, Any]], job: Optional[JobState] = None) -> int:
-     if not points:
-         return 0
-     url = f"{QDRANT_URL}/collections/{coll}/points?wait=true"
-     body = {"points": points}
-     r = await client.put(url, headers={"api-key": QDRANT_API_KEY}, json=body, timeout=60)
-     if r.status_code not in (200, 202):
-         if job: job.log(f"Upsert → {r.status_code}: {r.text[:200]}")
-         raise HTTPException(status_code=500, detail=f"Qdrant upsert échoué: {r.text}")
-     return len(points)
-
- async def qdrant_count(client: httpx.AsyncClient, coll: str) -> int:
-     url = f"{QDRANT_URL}/collections/{coll}/points/count"
-     r = await client.post(url, headers={"api-key": QDRANT_API_KEY}, json={"exact": True}, timeout=20)
-     if r.status_code == 404 and "doesn't exist" in (r.text or ""):
-         # Tolerant: missing collection → 0 (useful for the UI)
-         return 0
-     if r.status_code != 200:
-         raise HTTPException(status_code=500, detail=f"Qdrant count échoué: {r.text}")
-     return int(r.json().get("result", {}).get("count", 0))
-
- async def qdrant_search(client: httpx.AsyncClient, coll: str, vector: List[float], limit: int = 5) -> Dict[str, Any]:
-     url = f"{QDRANT_URL}/collections/{coll}/points/search"
-     r = await client.post(
-         url,
-         headers={"api-key": QDRANT_API_KEY},
-         json={"vector": vector, "limit": limit, "with_payload": True},
-         timeout=30,
-     )
-     if r.status_code != 200:
-         raise HTTPException(status_code=500, detail=f"Qdrant search échoué: {r.text}")
-     return r.json()

  # ------------------------------------------------------------------------------
- # Embeddings
  # ------------------------------------------------------------------------------
  def _maybe_prefix_for_model(texts: List[str], model_name: str) -> List[str]:
      m = (model_name or "").lower()
      if "e5" in m:
          return [("query: " + t) for t in texts]
      return texts

@@ -284,9 +239,14 @@ async def embed_texts(client: httpx.AsyncClient, texts: List[str]) -> List[List[
      return embed_dummy(texts, dim=128)

  # ------------------------------------------------------------------------------
- # Core: run_index_job (async) + worker thread wrapper
  # ------------------------------------------------------------------------------
- async def run_index_job(job: JobState, req: IndexRequest) -> None:
      try:
          job.stage = "embedding"
          job.total_files = len(req.files)
@@ -306,7 +266,10 @@ async def run_index_job(job: JobState, req: IndexRequest) -> None:
                  payload = {"path": f.path, "chunk": idx, "start": start, "end": end}
                  if req.store_text:
                      payload["text"] = ch
-                 records.append({"payload": payload, "raw": ch})
          job.total_chunks = len(records)
          job.log(f"Total chunks = {job.total_chunks}")
          if job.total_chunks == 0:
@@ -315,53 +278,72 @@ async def run_index_job(job: JobState, req: IndexRequest) -> None:
              job.finished_at = time.time()
              return

          async with httpx.AsyncClient(timeout=180) as client:
-             # Warmup dim
-             warmup_vec = (await embed_texts(client, [records[0]["raw"]]))[0]
-             vec_dim = len(warmup_vec)
-             job.log(f"Warmup embeddings dim={vec_dim}")
-
-             # Collection (hard ensure)
-             coll = f"{COLLECTION_PREFIX}{req.project_id}"
-             await ensure_collection_hard(client, coll, vector_size=vec_dim, job=job)
-             job.log(f"Collection prête: {coll} (dim={vec_dim})")
-
-             # Upsert
-             job.stage = "upserting"
-             batch_points: List[Dict[str, Any]] = []
-
-             async def flush_batch():
-                 nonlocal batch_points
-                 if not batch_points:
-                     return 0
-                 added = await qdrant_upsert(client, coll, batch_points, job=job)
-                 job.upserted += added
-                 job.log(f"+{added} points upsert (total={job.upserted})")
-                 batch_points = []
-                 return added
-
-             EMB_BATCH = max(8, min(64, req.batch_size * 2))
              i = 0
              while i < len(records):
-                 sub = records[i : i + EMB_BATCH]
                  texts = [r["raw"] for r in sub]
                  vecs = await embed_texts(client, texts)
                  if len(vecs) != len(sub):
                      raise HTTPException(status_code=500, detail="Embedding batch size mismatch")
                  job.embedded += len(vecs)

-                 for r, v in zip(sub, vecs):
-                     point = {"id": str(uuid.uuid4()), "vector": v, "payload": r["payload"]}
-                     batch_points.append(point)
-                     if len(batch_points) >= req.batch_size:
-                         await flush_batch()
-                 i += EMB_BATCH

-             await flush_batch()

          job.stage = "done"
          job.finished_at = time.time()
-         job.log("Index job terminé.")
      except Exception as e:
          job.stage = "failed"
          job.errors.append(str(e))
@@ -369,10 +351,9 @@ async def run_index_job(job: JobState, req: IndexRequest) -> None:
          job.log(f"❌ Exception: {e}")

  def _run_job_in_thread(job: JobState, req: IndexRequest) -> None:
-     """Runs the async run_index_job in a dedicated thread with its own event loop."""
      def _runner():
          try:
-             asyncio.run(run_index_job(job, req))
          except Exception as e:
              job.stage = "failed"
              job.errors.append(str(e))
@@ -389,41 +370,49 @@ def create_and_start_job(req: IndexRequest) -> JobState:
      _run_job_in_thread(job, req)
      return job

  # ------------------------------------------------------------------------------
  # FastAPI app
  # ------------------------------------------------------------------------------
- fastapi_app = FastAPI(title="Remote Indexer - Minimal Test Space")
  fastapi_app.add_middleware(
-     CORSMiddleware,
-     allow_origins=["*"],
-     allow_methods=["*"],
-     allow_headers=["*"],
  )

  @fastapi_app.get("/health")
  async def health():
-     return {"status": "ok"}

  @fastapi_app.get("/api")
  async def api_info():
      return {
-         "ok": True, "service": "remote-indexer-min",
-         "qdrant": bool(QDRANT_URL),
          "emb_provider": EMB_PROVIDER, "hf_model": HF_EMBED_MODEL,
          "fallback_to_dummy": EMB_FALLBACK_TO_DUMMY,
-         "ui_path": UI_PATH,
-     }
-
- @fastapi_app.get("/debug/env")
- async def debug_env():
-     return {
-         "qdrant_url_set": bool(QDRANT_URL),
-         "qdrant_key_set": bool(QDRANT_API_KEY),
-         "emb_provider": EMB_PROVIDER,
-         "hf_model": HF_EMBED_MODEL,
-         "hf_token_set": bool(HF_TOKEN),
-         "fallback_to_dummy": EMB_FALLBACK_TO_DUMMY,
-         "collection_prefix": COLLECTION_PREFIX,
      }

  @fastapi_app.get("/")
@@ -432,19 +421,15 @@ async def root_redirect():

  @fastapi_app.post("/wipe")
  async def wipe(project_id: str = Query(..., min_length=1)):
-     if not QDRANT_URL or not QDRANT_API_KEY:
-         raise HTTPException(status_code=400, detail="QDRANT_URL / QDRANT_API_KEY requis")
-     coll = f"{COLLECTION_PREFIX}{project_id}"
-     async with httpx.AsyncClient() as client:
-         r = await client.delete(f"{QDRANT_URL}/collections/{coll}", headers={"api-key": QDRANT_API_KEY}, timeout=30)
-         if r.status_code not in (200, 202, 404):
-             raise HTTPException(status_code=500, detail=f"Echec wipe: {r.text}")
-     return {"ok": True, "collection": coll, "wiped": True}

  @fastapi_app.post("/index")
  async def index(req: IndexRequest):
-     if not QDRANT_URL or not QDRANT_API_KEY:
-         raise HTTPException(status_code=400, detail="QDRANT_URL / QDRANT_API_KEY requis")
      job = create_and_start_job(req)
      return {"job_id": job.job_id, "project_id": job.project_id}

@@ -457,39 +442,70 @@ async def status(job_id: str):

  @fastapi_app.get("/collections/{project_id}/count")
  async def coll_count(project_id: str):
-     if not QDRANT_URL or not QDRANT_API_KEY:
-         raise HTTPException(status_code=400, detail="QDRANT_URL / QDRANT_API_KEY requis")
-     coll = f"{COLLECTION_PREFIX}{project_id}"
-     async with httpx.AsyncClient() as client:
-         try:
-             cnt = await qdrant_count(client, coll)
-             return {"project_id": project_id, "collection": coll, "count": cnt}
-         except HTTPException as he:
-             # Re-raise the 500 (anything other than the 404 already handled in qdrant_count)
-             raise he

  @fastapi_app.post("/query")
  async def query(req: QueryRequest):
-     if not QDRANT_URL or not QDRANT_API_KEY:
-         raise HTTPException(status_code=400, detail="QDRANT_URL / QDRANT_API_KEY requis")
-     coll = f"{COLLECTION_PREFIX}{req.project_id}"
-     async with httpx.AsyncClient() as client:
-         vec = (await embed_texts(client, [req.text]))[0]
-         data = await qdrant_search(client, coll, vec, limit=req.top_k)
-     return data
-
- @fastapi_app.post("/collections/{project_id}/ensure")
- async def http_ensure(project_id: str, dim: int = Query(..., ge=16, le=8192)):
-     """Debug endpoint to force the creation of a collection with a given dimension."""
-     if not QDRANT_URL or not QDRANT_API_KEY:
-         raise HTTPException(status_code=400, detail="QDRANT_URL / QDRANT_API_KEY requis")
-     coll = f"{COLLECTION_PREFIX}{project_id}"
-     async with httpx.AsyncClient() as client:
-         await ensure_collection_hard(client, coll, vector_size=dim, job=None)
-     return {"ok": True, "collection": coll, "dim": dim}

  # ------------------------------------------------------------------------------
- # Gradio UI (with auto-refresh + a debug Ensure button)
  # ------------------------------------------------------------------------------
  def _default_two_docs() -> List[Dict[str, str]]:
      a = "Alpha bravo charlie delta echo foxtrot golf hotel india. " * 3
@@ -498,8 +514,8 @@ def _default_two_docs() -> List[Dict[str, str]]:

  async def ui_wipe(project: str):
      try:
-         resp = await wipe(project)
-         return f"✅ Wipe ok — collection {resp['collection']} supprimée."
      except Exception as e:
          LOG.exception("wipe UI error")
          return f"❌ Wipe erreur: {e}"
@@ -528,8 +544,8 @@ async def ui_status(job_id: str):
          return "⚠️ Renseigne un job_id"
      try:
          st = await status(job_id)
-         lines = [f"Job {st['job_id']} — stage={st['stage']} files={st['total_files']} chunks={st['total_chunks']} embedded={st['embedded']} upserted={st['upserted']}"]
-         lines += st.get("messages", [])[-50:]
          if st.get("errors"):
              lines.append("Erreurs:")
              lines += [f" - {e}" for e in st['errors']]
@@ -539,11 +555,8 @@ async def ui_status(job_id: str):

  async def ui_count(project: str):
      try:
-         # Tolerant count (0 if the collection is absent)
-         async with httpx.AsyncClient() as client:
-             coll = f"{COLLECTION_PREFIX}{project}"
-             cnt = await qdrant_count(client, coll)
-         return f"📊 Count — collection={coll} → {cnt} points"
      except Exception as e:
          LOG.exception("count UI error")
          return f"❌ Count erreur: {e}"
@@ -556,37 +569,31 @@ async def ui_query(project: str, text: str, topk: int):
              return "Aucun résultat."
          out = []
          for h in hits:
-             score = h.get("score")
-             payload = h.get("payload", {})
-             path = payload.get("path")
-             chunk = payload.get("chunk")
-             preview = (payload.get("text") or "")[:120].replace("\n", " ")
-             out.append(f"{score:.4f} — {path} [chunk {chunk}] — {preview}…")
          return "\n".join(out)
      except Exception as e:
          LOG.exception("query UI error")
          return f"❌ Query erreur: {e}"

- async def ui_ensure(project: str, dim: int):
      try:
-         resp = await http_ensure(project, dim)
-         return f"🛠️ Ensure collection={resp['collection']} dim={resp['dim']} OK"
      except Exception as e:
-         LOG.exception("ensure UI error")
-         return f"❌ Ensure erreur: {e}"

- with gr.Blocks(title="Remote Indexer — Tests sans console", analytics_enabled=False) as ui:
-     gr.Markdown("## 🔬 Remote Indexer — Tests sans console\n"
                  "Wipe → Index 2 docs → Status → Count → Query\n"
-                 f"- **Embeddings**: `{EMB_PROVIDER}` (model: `{HF_EMBED_MODEL}`)\n"
-                 f"- **Token HF présent**: `{'oui' if bool(HF_TOKEN) else 'non'}` — "
-                 f"**Fallback dummy**: `{'on' if EMB_FALLBACK_TO_DUMMY else 'off'}`\n"
-                 f"- **Qdrant**: `{'OK' if QDRANT_URL else 'ABSENT'}`")
      with gr.Row():
          project_tb = gr.Textbox(label="Project ID", value="DEEPWEB")
          jobid_tb = gr.Textbox(label="Job ID", value="", interactive=True)
      with gr.Row():
-         wipe_btn = gr.Button("🧨 Wipe collection", variant="stop")
          index_btn = gr.Button("🚀 Indexer 2 documents", variant="primary")
          count_btn = gr.Button("📊 Count points", variant="secondary")
      with gr.Row():
@@ -600,16 +607,16 @@ with gr.Blocks(title="Remote Indexer — Tests sans console", analytics_enabled=
          status_btn = gr.Button("📡 Status (refresh)")
          auto_chk = gr.Checkbox(False, label="⏱️ Auto-refresh status (2 s)")

-     with gr.Row():
-         ensure_dim = gr.Slider(16, 2048, value=128, step=16, label="ensure dim (debug)")
-         ensure_btn = gr.Button("🛠️ Ensure collection (debug)")
-
      with gr.Row():
          query_tb = gr.Textbox(label="Query text", value="alpha bravo")
          topk = gr.Slider(1, 20, value=5, step=1, label="top_k")
          query_btn = gr.Button("🔎 Query")
      query_out = gr.Textbox(lines=10, label="Résultats Query", interactive=False)
      wipe_btn.click(ui_wipe, inputs=[project_tb], outputs=[out_log])
      index_btn.click(ui_index_sample, inputs=[project_tb, chunk_size, overlap, batch_size, store_text], outputs=[out_log, jobid_tb])
      count_btn.click(ui_count, inputs=[project_tb], outputs=[out_log])
@@ -619,9 +626,11 @@ with gr.Blocks(title="Remote Indexer — Tests sans console", analytics_enabled=
      timer.tick(ui_status, inputs=[jobid_tb], outputs=[out_log])
      auto_chk.change(lambda x: gr.update(active=x), inputs=auto_chk, outputs=timer)

-     ensure_btn.click(ui_ensure, inputs=[project_tb, ensure_dim], outputs=[out_log])

- # Mount the Gradio UI
  app = gr.mount_gradio_app(fastapi_app, ui, path=UI_PATH)

  if __name__ == "__main__":

  # -*- coding: utf-8 -*-
  """
+ HF Space - Remote Indexer (No-Qdrant)
+ Vector storage & search with 🤗 datasets + FAISS (local), Gradio UI.
+
+ Pipeline:
+ - /index: chunk embeddings (HF Inference or dummy) → Dataset.from_dict → add_faiss_index(IP) → save_to_disk
+ - /count: reads the dataset from disk (if not already loaded), returns the number of rows
+ - /query: embed the query → dataset.get_nearest_examples('embedding', query, k)
+ - /wipe: removes the project folder
+ - /export_hub (optional): pushes the project folder to a Dataset repo on the Hub

  ENV:
+ - EMB_PROVIDER ("hf" | "dummy", default "hf")
+ - HF_EMBED_MODEL (e.g. "BAAI/bge-m3" | "intfloat/e5-base-v2")
+ - HUGGINGFACEHUB_API_TOKEN (required if EMB_PROVIDER=hf)
+ - EMB_FALLBACK_TO_DUMMY (true/false)
+ - DATA_DIR (default "/data") → local per-project storage
+ - HF_DATASET_REPO (optional "username/my_proj_vectors") for export
+ - LOG_LEVEL (DEBUG by default)
+ - UI_PATH ("/ui")
+ - PORT (7860)
  """

  from __future__ import annotations
  import os
+ import io
+ import re
+ import json
  import time
  import uuid
+ import shutil
  import hashlib
  import logging
  import asyncio
+ import threading
  from typing import List, Dict, Any, Optional, Tuple

  import numpy as np
  import httpx
  import uvicorn
+ import gradio as gr
+ import faiss  # type: ignore
  from pydantic import BaseModel, Field, ValidationError
  from fastapi import FastAPI, HTTPException, Query
  from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.responses import RedirectResponse, StreamingResponse
+
+ from datasets import Dataset, Features, Sequence, Value, load_from_disk
+
+ try:
+     from huggingface_hub import HfApi, create_repo
+ except Exception:
+     HfApi = None
+     create_repo = None

  # ------------------------------------------------------------------------------
  # Config & logs
      level=getattr(logging, LOG_LEVEL, logging.DEBUG),
      format="%(asctime)s - %(levelname)s - %(message)s",
  )
+ LOG = logging.getLogger("remote_indexer_noqdrant")

+ EMB_PROVIDER = os.getenv("EMB_PROVIDER", "hf").lower()  # "hf" | "dummy"
  HF_EMBED_MODEL = os.getenv("HF_EMBED_MODEL", "BAAI/bge-m3")
  HF_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN", "")
  EMB_FALLBACK_TO_DUMMY = os.getenv("EMB_FALLBACK_TO_DUMMY", "false").lower() in ("1","true","yes","on")

+ DATA_DIR = os.getenv("DATA_DIR", "/data")
+ os.makedirs(DATA_DIR, exist_ok=True)
+
  UI_PATH = os.getenv("UI_PATH", "/ui")
+ HF_DATASET_REPO = os.getenv("HF_DATASET_REPO", "").strip()  # optional

  if EMB_PROVIDER == "hf" and not HF_TOKEN and not EMB_FALLBACK_TO_DUMMY:
+     LOG.warning("EMB_PROVIDER=hf sans HUGGINGFACEHUB_API_TOKEN (pas de fallback). Mets EMB_PROVIDER=dummy ou EMB_FALLBACK_TO_DUMMY=true pour tester.")

  # ------------------------------------------------------------------------------
+ # Pydantic models
  # ------------------------------------------------------------------------------
  class FileItem(BaseModel):
      path: str

  class JobState(BaseModel):
      job_id: str
      project_id: str
+     stage: str = "pending"  # pending -> embedding -> indexing -> done/failed
      total_files: int = 0
      total_chunks: int = 0
      embedded: int = 0
+     indexed: int = 0
      errors: List[str] = Field(default_factory=list)
      messages: List[str] = Field(default_factory=list)
      started_at: float = Field(default_factory=time.time)

  JOBS: Dict[str, JobState] = {}

+ # In-memory cache {project_id: (Dataset, dim)}
+ DATASETS: Dict[str, Tuple[Dataset, int]] = {}
+
  # ------------------------------------------------------------------------------
  # Utils
  # ------------------------------------------------------------------------------
          i = j
      return res

+ def project_paths(project_id: str) -> Dict[str, str]:
+     base = os.path.join(DATA_DIR, project_id)
+     return {
+         "base": base,
+         "ds_dir": os.path.join(base, "dataset"),
+         "faiss_dir": os.path.join(base, "faiss"),
+         "faiss_file": os.path.join(base, "faiss", "emb.faiss"),
+         "meta_file": os.path.join(base, "meta.json"),
+     }
+
+ def save_meta(meta_path: str, data: Dict[str, Any]) -> None:
+     os.makedirs(os.path.dirname(meta_path), exist_ok=True)
+     with open(meta_path, "w", encoding="utf-8") as f:
+         json.dump(data, f, indent=2, ensure_ascii=False)
+
+ def load_meta(meta_path: str) -> Dict[str, Any]:
+     if not os.path.exists(meta_path):
+         return {}
+     try:
+         with open(meta_path, "r", encoding="utf-8") as f:
+             return json.load(f)
+     except Exception:
+         return {}

  # ------------------------------------------------------------------------------
+ # Embeddings (HF Inference or dummy)
  # ------------------------------------------------------------------------------
  def _maybe_prefix_for_model(texts: List[str], model_name: str) -> List[str]:
      m = (model_name or "").lower()
      if "e5" in m:
+         # E5 expects "query: ..." / "passage: ..." prefixes; here we use the simple uniform form.
          return [("query: " + t) for t in texts]
      return texts

      return embed_dummy(texts, dim=128)

  # ------------------------------------------------------------------------------
+ # Indexing (datasets + FAISS)
  # ------------------------------------------------------------------------------
+ async def build_dataset_with_faiss(job: JobState, req: IndexRequest) -> None:
+     """
+     Builds a Hugging Face dataset with columns:
+     - path (str), text (optional), chunk (int), start (int), end (int), embedding (float32[])
+     Adds a FAISS index (Inner Product) and persists everything to disk.
+     """
+ """
250
  try:
251
  job.stage = "embedding"
252
  job.total_files = len(req.files)
 
266
  payload = {"path": f.path, "chunk": idx, "start": start, "end": end}
267
  if req.store_text:
268
  payload["text"] = ch
269
+ else:
270
+ payload["text"] = None
271
+ payload["raw"] = ch
272
+ records.append(payload)
273
  job.total_chunks = len(records)
274
  job.log(f"Total chunks = {job.total_chunks}")
275
  if job.total_chunks == 0:
 
278
  job.finished_at = time.time()
279
  return
280
 
281
+ # Embeddings par batch
282
  async with httpx.AsyncClient(timeout=180) as client:
283
+ all_vecs: List[List[float]] = []
284
+ B = max(8, min(64, req.batch_size * 2))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
  i = 0
286
  while i < len(records):
287
+ sub = records[i : i + B]
288
  texts = [r["raw"] for r in sub]
289
  vecs = await embed_texts(client, texts)
290
  if len(vecs) != len(sub):
291
  raise HTTPException(status_code=500, detail="Embedding batch size mismatch")
292
+ all_vecs.extend(vecs)
293
  job.embedded += len(vecs)
294
+ job.log(f"Embeddings {job.embedded}/{job.total_chunks}")
295
+ i += B
296
+
297
+ vec_dim = len(all_vecs[0])
298
+ job.log(f"Embeddings dim={vec_dim}")
299
+
300
+ # Prépare colonnes du dataset
301
+ paths = [r["path"] for r in records]
302
+ chunks = [int(r["chunk"]) for r in records]
303
+ starts = [int(r["start"]) for r in records]
304
+ ends = [int(r["end"]) for r in records]
305
+ texts = [r.get("text") for r in records]
306
+
307
+ features = Features({
308
+ "path": Value("string"),
309
+ "chunk": Value("int32"),
310
+ "start": Value("int32"),
311
+ "end": Value("int32"),
312
+ "text": Value("string"), # peut contenir None -> sera "None" si None ; OK pour tests
313
+ "embedding": Sequence(Value("float32")),
314
+ })
315
+
316
+ ds = Dataset.from_dict(
317
+ {
318
+ "path": paths,
319
+ "chunk": chunks,
320
+ "start": starts,
321
+ "end": ends,
322
+ "text": texts,
323
+ "embedding": [np.array(v, dtype=np.float32) for v in all_vecs],
324
+ },
325
+ features=features,
326
+ )
327
 
328
+ # Ajoute index FAISS (Inner Product sur vecteurs normalisés ~ cosine)
329
+ job.stage = "indexing"
330
+ ds.add_faiss_index(column="embedding", metric_type=faiss.METRIC_INNER_PRODUCT)
331
+ job.indexed = ds.num_rows
332
+ job.log(f"FAISS index ajouté ({ds.num_rows} points)")
 
333
 
334
+ # Persistance disque
335
+ p = project_paths(req.project_id)
336
+ os.makedirs(p["faiss_dir"], exist_ok=True)
337
+ ds.save_to_disk(p["ds_dir"])
338
+ ds.save_faiss_index("embedding", p["faiss_file"])
339
+ save_meta(p["meta_file"], {"dim": vec_dim, "rows": ds.num_rows, "model": HF_EMBED_MODEL, "ts": time.time()})
340
+
341
+ # Cache mémoire
342
+ DATASETS[req.project_id] = (ds, vec_dim)
343
 
344
  job.stage = "done"
345
  job.finished_at = time.time()
346
+ job.log(f"Dataset sauvegardé dans {p['ds_dir']}, index FAISS → {p['faiss_file']}")
347
  except Exception as e:
348
  job.stage = "failed"
349
  job.errors.append(str(e))
 
351
  job.log(f"❌ Exception: {e}")

  def _run_job_in_thread(job: JobState, req: IndexRequest) -> None:
      def _runner():
          try:
+             asyncio.run(build_dataset_with_faiss(job, req))
          except Exception as e:
              job.stage = "failed"
              job.errors.append(str(e))

      _run_job_in_thread(job, req)
      return job

+ # ------------------------------------------------------------------------------
+ # Loading / Query helpers
+ # ------------------------------------------------------------------------------
+ def ensure_loaded(project_id: str) -> Tuple[Dataset, int]:
+     """Loads the dataset + faiss index from disk if not already in the in-memory cache."""
+     if project_id in DATASETS:
+         return DATASETS[project_id]
+     p = project_paths(project_id)
+     if not os.path.exists(p["ds_dir"]):
+         raise HTTPException(status_code=404, detail=f"Dataset absent pour projet {project_id}")
+     ds = load_from_disk(p["ds_dir"])
+     if os.path.exists(p["faiss_file"]):
+         ds.load_faiss_index("embedding", p["faiss_file"])
+     meta = load_meta(p["meta_file"])
+     vec_dim = int(meta.get("dim", 0)) or len(ds[0]["embedding"])
+     DATASETS[project_id] = (ds, vec_dim)
+     return ds, vec_dim
+
+ async def embed_query(text: str) -> List[float]:
+     async with httpx.AsyncClient(timeout=60) as client:
+         vec = (await embed_texts(client, [text]))[0]
+     return vec
+
  # ------------------------------------------------------------------------------
  # FastAPI app
  # ------------------------------------------------------------------------------
+ fastapi_app = FastAPI(title="Remote Indexer - NoQdrant (Datasets+FAISS)")
  fastapi_app.add_middleware(
+     CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"]
  )

  @fastapi_app.get("/health")
  async def health():
+     return {"status": "ok", "emb_provider": EMB_PROVIDER, "model": HF_EMBED_MODEL}

  @fastapi_app.get("/api")
  async def api_info():
      return {
+         "ok": True, "service": "remote-indexer-noqdrant",
          "emb_provider": EMB_PROVIDER, "hf_model": HF_EMBED_MODEL,
          "fallback_to_dummy": EMB_FALLBACK_TO_DUMMY,
+         "data_dir": DATA_DIR, "ui_path": UI_PATH,
+         "hub_export_enabled": bool(HF_DATASET_REPO and HfApi),
      }

  @fastapi_app.get("/")

  @fastapi_app.post("/wipe")
  async def wipe(project_id: str = Query(..., min_length=1)):
+     p = project_paths(project_id)
+     if os.path.exists(p["base"]):
+         shutil.rmtree(p["base"], ignore_errors=True)
+     if project_id in DATASETS:
+         DATASETS.pop(project_id, None)
+     return {"ok": True, "project_id": project_id, "removed": True}

  @fastapi_app.post("/index")
  async def index(req: IndexRequest):
      job = create_and_start_job(req)
      return {"job_id": job.job_id, "project_id": job.project_id}

  @fastapi_app.get("/collections/{project_id}/count")
  async def coll_count(project_id: str):
+     try:
+         ds, _ = ensure_loaded(project_id)
+         return {"project_id": project_id, "count": ds.num_rows}
+     except Exception as e:
+         return {"project_id": project_id, "count": 0, "note": f"{e}"}

  @fastapi_app.post("/query")
  async def query(req: QueryRequest):
+     ds, vec_dim = ensure_loaded(req.project_id)
+     qvec = await embed_query(req.text)
+     if len(qvec) != vec_dim:
+         raise HTTPException(status_code=400, detail=f"Dim requête {len(qvec)} ≠ dim index {vec_dim}")
+     # get_nearest_examples returns (scores, examples)
+     scores, ex = ds.get_nearest_examples("embedding", np.array(qvec, dtype=np.float32), k=req.top_k)
+     results = []
+     for s, path, chunk, text in zip(scores, ex["path"], ex["chunk"], ex["text"]):
+         preview = ((text or "")[:160]).replace("\n", " ")
+         results.append({"score": float(s), "path": path, "chunk": int(chunk), "preview": preview})
+     return {"result": results, "k": req.top_k}
+
+ @fastapi_app.post("/export_hub")
+ async def export_hub(project_id: str = Query(..., min_length=1), repo_id: Optional[str] = None):
+     """
+     Optional: push the project folder (dataset + faiss + meta) to a Dataset repo on the Hub.
+     - HF_DATASET_REPO or ?repo_id=... (e.g. "chourmovs/deepweb_vectors")
+     """
+     if not HfApi or not HF_TOKEN:
+         raise HTTPException(status_code=400, detail="huggingface_hub non dispo ou HF token absent.")
+     p = project_paths(project_id)
+     if not os.path.exists(p["ds_dir"]):
+         raise HTTPException(status_code=404, detail="Aucun dataset local à exporter.")
+     rid = (repo_id or HF_DATASET_REPO or "").strip()
+     if not rid:
+         raise HTTPException(status_code=400, detail="repo_id requis (ou HF_DATASET_REPO).")
+
+     api = HfApi(token=HF_TOKEN)
+     try:
+         create_repo(rid, repo_type="dataset", exist_ok=True, token=HF_TOKEN)
+     except Exception:
+         pass
+
+     # Zip the project folder for a quick upload
+     buf = io.BytesIO()
+     base_dir = p["base"]
+     zip_name = f"{project_id}_vectors.zip"
+     import zipfile
+     with zipfile.ZipFile(buf, "w", compression=zipfile.ZIP_DEFLATED) as z:
+         for root, _, files in os.walk(base_dir):
+             for fn in files:
+                 full = os.path.join(root, fn)
+                 rel = os.path.relpath(full, base_dir)
+                 z.write(full, arcname=rel)
+     buf.seek(0)
+
+     api.upload_file(
+         path_or_fileobj=buf,
+         path_in_repo=zip_name,
+         repo_id=rid,
+         repo_type="dataset",
+     )
+     return {"ok": True, "repo_id": rid, "file": zip_name}

  # ------------------------------------------------------------------------------
+ # Gradio UI
  # ------------------------------------------------------------------------------
  def _default_two_docs() -> List[Dict[str, str]]:
      a = "Alpha bravo charlie delta echo foxtrot golf hotel india. " * 3

  async def ui_wipe(project: str):
      try:
+         resp = await wipe(project)  # calls the internal route
+         return f"✅ Wipe ok — projet {resp['project_id']} vidé."
      except Exception as e:
          LOG.exception("wipe UI error")
          return f"❌ Wipe erreur: {e}"

          return "⚠️ Renseigne un job_id"
      try:
          st = await status(job_id)
+         lines = [f"Job {st['job_id']} — stage={st['stage']} files={st['total_files']} chunks={st['total_chunks']} embedded={st['embedded']} indexed={st['indexed']}"]
+         lines += st.get("messages", [])[-80:]
          if st.get("errors"):
              lines.append("Erreurs:")
              lines += [f" - {e}" for e in st['errors']]

  async def ui_count(project: str):
      try:
+         data = await coll_count(project)
+         return f"📊 Count — project={project} → {data['count']} points" + (f" ({data.get('note')})" if 'note' in data else "")
      except Exception as e:
          LOG.exception("count UI error")
          return f"❌ Count erreur: {e}"

              return "Aucun résultat."
          out = []
          for h in hits:
+             out.append(f"{h['score']:.4f} {h['path']} [chunk {h['chunk']}] — {h['preview']}…")
          return "\n".join(out)
      except Exception as e:
          LOG.exception("query UI error")
          return f"❌ Query erreur: {e}"

+ async def ui_export(project: str, repo_id: str):
      try:
+         resp = await export_hub(project, repo_id or None)
+         return f"📤 Export dataset repo={resp['repo_id']} file={resp['file']}"
      except Exception as e:
+         LOG.exception("export UI error")
+         return f"❌ Export erreur: {e}"

+ with gr.Blocks(title="Remote Indexer — No-Qdrant (datasets+FAISS)", analytics_enabled=False) as ui:
+     gr.Markdown("## 🧪 Remote Indexer — No-Qdrant (datasets+FAISS)\n"
                  "Wipe → Index 2 docs → Status → Count → Query\n"
+                 f"- **Embeddings**: `{EMB_PROVIDER}` (model: `{HF_EMBED_MODEL}`)"
+                 f"HF token présent: `{'oui' if bool(HF_TOKEN) else 'non'}` — Fallback dummy: `{'on' if EMB_FALLBACK_TO_DUMMY else 'off'}`\n"
+                 f"- **Data dir**: `{DATA_DIR}` — **Hub export**: `{'on' if (HF_DATASET_REPO and HfApi) else 'off'}`")
      with gr.Row():
          project_tb = gr.Textbox(label="Project ID", value="DEEPWEB")
          jobid_tb = gr.Textbox(label="Job ID", value="", interactive=True)
      with gr.Row():
+         wipe_btn = gr.Button("🧨 Wipe project", variant="stop")
          index_btn = gr.Button("🚀 Indexer 2 documents", variant="primary")
          count_btn = gr.Button("📊 Count points", variant="secondary")
      with gr.Row():

          status_btn = gr.Button("📡 Status (refresh)")
          auto_chk = gr.Checkbox(False, label="⏱️ Auto-refresh status (2 s)")

      with gr.Row():
          query_tb = gr.Textbox(label="Query text", value="alpha bravo")
          topk = gr.Slider(1, 20, value=5, step=1, label="top_k")
          query_btn = gr.Button("🔎 Query")
      query_out = gr.Textbox(lines=10, label="Résultats Query", interactive=False)

+     with gr.Row():
+         repo_tb = gr.Textbox(label="Hub dataset repo (ex: user/deepweb_vectors)", value=os.getenv("HF_DATASET_REPO", ""))
+         export_btn = gr.Button("📤 Export to Hub", variant="secondary")
+
      wipe_btn.click(ui_wipe, inputs=[project_tb], outputs=[out_log])
      index_btn.click(ui_index_sample, inputs=[project_tb, chunk_size, overlap, batch_size, store_text], outputs=[out_log, jobid_tb])
      count_btn.click(ui_count, inputs=[project_tb], outputs=[out_log])

      timer.tick(ui_status, inputs=[jobid_tb], outputs=[out_log])
      auto_chk.change(lambda x: gr.update(active=x), inputs=auto_chk, outputs=timer)

+     query_btn.click(ui_query, inputs=[project_tb, query_tb, topk], outputs=[query_out])
+
+     export_btn.click(ui_export, inputs=[project_tb, repo_tb], outputs=[out_log])

+ # Mount the UI
  app = gr.mount_gradio_app(fastapi_app, ui, path=UI_PATH)

  if __name__ == "__main__":
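
For reference, a minimal sketch (not part of this commit) of the datasets + FAISS round-trip that the new build_dataset_with_faiss() / query() path relies on. It assumes `datasets` and `faiss-cpu` are installed; the toy `texts` column and random vectors stand in for real embeddings.

```python
# Illustrative only — shows Dataset.from_dict → add_faiss_index → get_nearest_examples.
import numpy as np
import faiss
from datasets import Dataset, Features, Sequence, Value

# Dummy vectors standing in for real embeddings (dim=4 for brevity).
texts = ["alpha bravo", "charlie delta"]
vecs = np.random.rand(2, 4).astype(np.float32)

ds = Dataset.from_dict(
    {"text": texts, "embedding": [v for v in vecs]},
    features=Features({"text": Value("string"),
                       "embedding": Sequence(Value("float32"))}),
)

# Inner-product index; with L2-normalised vectors this approximates cosine similarity.
ds.add_faiss_index(column="embedding", metric_type=faiss.METRIC_INNER_PRODUCT)

# Query: returns (scores, examples) ordered by similarity.
scores, examples = ds.get_nearest_examples("embedding", vecs[0], k=1)
print(scores[0], examples["text"][0])
```

With real embeddings, L2-normalising the vectors before indexing makes the inner-product scores behave like cosine similarity, which is what the commit's METRIC_INNER_PRODUCT choice appears to assume.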