Update main.py

main.py (CHANGED)
@@ -3,19 +3,17 @@
 HF Space - Remote Indexer (No-Qdrant)
 Vector storage & search with 🤗 datasets + FAISS (local), Gradio UI.
 
-…
- - …
- - …
- - …
- - /wipe: deletes the project folder
- - /export_hub (optional): pushes the project folder to a Dataset repo on the Hub
+Improvements:
+ - "Fail-safe" chunking: if no chunk is produced, take 1 chunk = the whole text.
+ - Detailed logs: text size per file, number of chunks per file.
+ - UI: an "Indexer depuis textarea" button to test with 2 large texts injected from the UI.
 
 ENV:
  - EMB_PROVIDER ("hf" | "dummy", default "hf")
  - HF_EMBED_MODEL (e.g. "BAAI/bge-m3" | "intfloat/e5-base-v2")
  - HUGGINGFACEHUB_API_TOKEN (required if EMB_PROVIDER=hf)
  - EMB_FALLBACK_TO_DUMMY (true/false)
- - DATA_DIR (…)
+ - DATA_DIR (auto-pick writable: $DATA_DIR, ./data, /home/user/app/data, /home/user/data, /tmp/data)
  - HF_DATASET_REPO (optional "username/my_proj_vectors") for export
  - LOG_LEVEL (DEBUG by default)
  - UI_PATH ("/ui")
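With these variables, the Space can also be run locally without any HF token by switching the provider to "dummy"; a minimal sketch (values are illustrative, not part of this commit):

    import os

    os.environ["EMB_PROVIDER"] = "dummy"   # bypass the HF Inference API entirely
    os.environ["DATA_DIR"] = "/tmp/data"   # first entry tried by the auto-pick below
    os.environ["LOG_LEVEL"] = "DEBUG"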
@@ -25,7 +23,6 @@ ENV:
 from __future__ import annotations
 import os
 import io
-import re
 import json
 import time
 import uuid
@@ -65,7 +62,7 @@ logging.basicConfig(
 LOG = logging.getLogger("remote_indexer_noqdrant")
 
 EMB_PROVIDER = os.getenv("EMB_PROVIDER", "hf").lower()  # "hf" | "dummy"
-HF_EMBED_MODEL = os.getenv("HF_EMBED_MODEL", "…")
+HF_EMBED_MODEL = os.getenv("HF_EMBED_MODEL", "intfloat/e5-base-v2")
 HF_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN", "")
 EMB_FALLBACK_TO_DUMMY = os.getenv("EMB_FALLBACK_TO_DUMMY", "false").lower() in ("1","true","yes","on")
 
@@ -80,11 +77,11 @@ if EMB_PROVIDER == "hf" and not HF_TOKEN and not EMB_FALLBACK_TO_DUMMY:
 # ------------------------------------------------------------------------------
 def pick_data_dir() -> str:
     candidates = [
-        os.getenv("DATA_DIR", "").strip(),
-        os.path.join(os.getcwd(), "data"),
-        "/home/user/app/data",
+        os.getenv("DATA_DIR", "").strip(),
+        os.path.join(os.getcwd(), "data"),
+        "/home/user/app/data",
         "/home/user/data",
-        "/tmp/data",
+        "/tmp/data",
     ]
     for p in candidates:
         if not p:
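The rest of the loop body is cut off in this view. A plausible sketch of a first-writable-candidate probe (hypothetical helper, not the file's actual code; first_writable is an invented name):

    import os

    def first_writable(candidates: list) -> str:
        for p in candidates:
            if not p:
                continue  # empty string when DATA_DIR is unset
            try:
                os.makedirs(p, exist_ok=True)
                probe = os.path.join(p, ".write_test")
                with open(probe, "w") as fh:
                    fh.write("ok")
                os.remove(probe)
                return p  # first directory we can create and write to
            except OSError:
                continue
        return "/tmp/data"  # last-resort default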
@@ -113,8 +110,8 @@ class FileItem(BaseModel):
 class IndexRequest(BaseModel):
     project_id: str = Field(..., min_length=1)
     files: List[FileItem] = Field(default_factory=list)
-    chunk_size: int = Field(200, ge=…)
-    overlap: int = Field(20, ge=0, le=…)
+    chunk_size: int = Field(200, ge=32, le=8192)
+    overlap: int = Field(20, ge=0, le=1024)
     batch_size: int = Field(32, ge=1, le=1024)
     store_text: bool = True
 
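With the new bounds, out-of-range requests fail fast at validation time instead of producing degenerate chunking; standard pydantic behavior:

    from pydantic import ValidationError

    try:
        IndexRequest(project_id="demo", chunk_size=8)  # below the new floor ge=32
    except ValidationError as err:
        print(err)  # reports that chunk_size must be >= 32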
@@ -143,45 +140,43 @@ class JobState(BaseModel):
         LOG.debug(f"[{self.job_id}] {msg}")
 
 JOBS: Dict[str, JobState] = {}
-
-# In-memory cache {project_id: (Dataset, dim)}
-DATASETS: Dict[str, Tuple[Dataset, int]] = {}
+DATASETS: Dict[str, Tuple[Dataset, int]] = {}  # in-memory cache {project_id: (Dataset, dim)}
 
 # ------------------------------------------------------------------------------
-# Utils
+# Chunking utils
 # ------------------------------------------------------------------------------
-def l2_normalize(vec: List[float]) -> List[float]:
-    …
-    …
-    …
-    arr = np.array(vec, dtype=np.float32)
-    n = float(np.linalg.norm(arr))
-    if n > 0:
-        arr = arr / n
-    return arr.astype(np.float32).tolist()
-
-def flatten_any(x: Any) -> List[float]:
-    if isinstance(x, (list, tuple)):
-        if len(x) > 0 and isinstance(x[0], (list, tuple)):
-            return flatten_any(x[0])
-        return list(map(float, x))
-    raise ValueError("Embedding vector mal formé")
-
-def chunk_text(text: str, chunk_size: int, overlap: int) -> List[Tuple[int, int, str]]:
+def chunk_text_fail_safe(text: str, chunk_size: int, overlap: int, min_keep_chars: int = 1) -> List[Tuple[int, int, str]]:
+    """
+    Splits the text into overlapping windows. If no "useful" chunk is produced
+    but the text has at least min_keep_chars non-blank characters, return 1 chunk = 100% of the text.
+    """
     text = text or ""
-    if not text.strip():
+    base = text.strip("\n\r\t ")
+    nclean = len(base)
+    if nclean < min_keep_chars:
         return []
-    res = []
+
     n = len(text)
+    res: List[Tuple[int, int, str]] = []
     i = 0
+    # Clamp out-of-range parameters
+    chunk_size = max(32, int(chunk_size))
+    overlap = max(0, min(int(overlap), chunk_size - 1))
+
     while i < n:
         j = min(i + chunk_size, n)
         chunk = text[i:j]
-        if len(chunk.strip()) >= …:
+        if len(chunk.strip()) >= min_keep_chars:
             res.append((i, j, chunk))
+        if j == n:
+            break
         i = j - overlap
         if i <= 0:
             i = j
+
+    if not res:
+        # fail-safe: 1 chunk covering the whole text
+        res = [(0, n, text)]
     return res
 
 def project_paths(project_id: str) -> Dict[str, str]:
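A quick sanity check of the new function's three behaviors (illustrative calls against the function as added above):

    text = "x" * 100
    spans = chunk_text_fail_safe(text, chunk_size=50, overlap=20)
    print([(s, e) for s, e, _ in spans])             # [(0, 50), (30, 80), (60, 100)]

    print(chunk_text_fail_safe("   \n  ", 200, 20))  # [] (whitespace-only stays empty)

    # fail-safe: each 50-char window holds only 30 non-blank chars (< min_keep_chars=40),
    # yet the text itself is non-blank, so one chunk covering everything is returned
    blocky = ("z" * 30 + " " * 20) * 2
    print(chunk_text_fail_safe(blocky, 50, 0, min_keep_chars=40))  # [(0, 100, <whole text>)]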
@@ -211,10 +206,23 @@ def load_meta(meta_path: str) -> Dict[str, Any]:
 # ------------------------------------------------------------------------------
 # Embeddings (HF Inference or dummy)
 # ------------------------------------------------------------------------------
+def l2_normalize(vec: List[float]) -> List[float]:
+    arr = np.array(vec, dtype=np.float32)
+    n = float(np.linalg.norm(arr))
+    if n > 0:
+        arr = arr / n
+    return arr.astype(np.float32).tolist()
+
+def flatten_any(x: Any) -> List[float]:
+    if isinstance(x, (list, tuple)):
+        if len(x) > 0 and isinstance(x[0], (list, tuple)):
+            return flatten_any(x[0])
+        return list(map(float, x))
+    raise ValueError("Embedding vector mal formé")
+
 def _maybe_prefix_for_model(texts: List[str], model_name: str) -> List[str]:
     m = (model_name or "").lower()
     if "e5" in m:
-        # E5: "query: ..." / "passage: ..." etc. Here we simply use one uniform prefix.
         return [("query: " + t) for t in texts]
     return texts
 
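These two helpers feed the inner-product FAISS index built further down: on unit-length vectors, inner product equals cosine similarity, and flatten_any collapses a nested API response into one flat vector. Illustrative usage:

    v = l2_normalize([3.0, 4.0])
    print(v)                               # ≈ [0.6, 0.8], unit length
    print(flatten_any([[0.1, 0.2, 0.3]]))  # [0.1, 0.2, 0.3], unwraps one nesting level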
@@ -269,7 +277,7 @@ async def embed_texts(client: httpx.AsyncClient, texts: List[str]) -> List[List[float]]:
 async def build_dataset_with_faiss(job: JobState, req: IndexRequest) -> None:
     """
     Builds a HuggingFace dataset with the columns:
-    - path (str), text (…)
+    - path (str), text (str), chunk (int), start (int), end (int), embedding (float32[])
     Adds a FAISS index (Inner Product) and persists it to disk.
     """
     try:
@@ -281,32 +289,32 @@ async def build_dataset_with_faiss(job: JobState, req: IndexRequest) -> None:
             f"provider={EMB_PROVIDER} model={HF_EMBED_MODEL}"
         )
 
-        # Chunking
+        # Chunking + per-file logs
         records: List[Dict[str, Any]] = []
         for f in req.files:
-            …
-            …
-            …
+            t = f.text or ""
+            tlen = len(t)
+            job.log(f"{f.path}: len(text)={tlen}")
+            chunks = chunk_text_fail_safe(t, req.chunk_size, req.overlap, min_keep_chars=1)
+            job.log(f"{f.path}: chunks créés={len(chunks)}")
             for idx, (start, end, ch) in enumerate(chunks):
                 payload = {"path": f.path, "chunk": idx, "start": start, "end": end}
-                if req.store_text:
-                    payload["text"] = ch
-                else:
-                    payload["text"] = None
+                payload["text"] = ch if req.store_text else ""
                 payload["raw"] = ch
                 records.append(payload)
+
        job.total_chunks = len(records)
        job.log(f"Total chunks = {job.total_chunks}")
        if job.total_chunks == 0:
            job.stage = "failed"
-            job.errors.append("Aucun chunk à indexer…")
+            job.errors.append("Aucun chunk à indexer (textes vides ?)")
            job.finished_at = time.time()
            return
 
        # Embeddings per batch
        async with httpx.AsyncClient(timeout=180) as client:
            all_vecs: List[List[float]] = []
-            B = max(8, min(…))
+            B = max(8, min(128, req.batch_size * 2))
            i = 0
            while i < len(records):
                sub = records[i : i + B]
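The clamp keeps every HF Inference request between 8 and 128 texts regardless of the requested batch_size:

    for bs in (1, 32, 512):
        print(bs, "->", max(8, min(128, bs * 2)))  # 1 -> 8, 32 -> 64, 512 -> 128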
@@ -322,12 +330,12 @@ async def build_dataset_with_faiss(job: JobState, req: IndexRequest) -> None:
             vec_dim = len(all_vecs[0])
             job.log(f"Embeddings dim={vec_dim}")
 
-        # …
+        # Dataset columns
         paths = [r["path"] for r in records]
         chunks = [int(r["chunk"]) for r in records]
         starts = [int(r["start"]) for r in records]
         ends = [int(r["end"]) for r in records]
-        texts = [r.get("text") for r in records]
+        texts = [r.get("text", "") for r in records]
 
         features = Features({
             "path": Value("string"),
@@ -350,20 +358,19 @@ async def build_dataset_with_faiss(job: JobState, req: IndexRequest) -> None:
             features=features,
         )
 
-        # …
+        # FAISS index (Inner Product ≈ cosine after normalization)
         job.stage = "indexing"
         ds.add_faiss_index(column="embedding", metric_type=faiss.METRIC_INNER_PRODUCT)
         job.indexed = ds.num_rows
         job.log(f"FAISS index ajouté ({ds.num_rows} points)")
 
-        # Persistence
+        # Persistence
         p = project_paths(req.project_id)
         os.makedirs(p["faiss_dir"], exist_ok=True)
         ds.save_to_disk(p["ds_dir"])
         ds.save_faiss_index("embedding", p["faiss_file"])
         save_meta(p["meta_file"], {"dim": vec_dim, "rows": ds.num_rows, "model": HF_EMBED_MODEL, "ts": time.time()})
 
-        # In-memory cache
         DATASETS[req.project_id] = (ds, vec_dim)
 
         job.stage = "done"
@@ -399,7 +406,6 @@ def create_and_start_job(req: IndexRequest) -> JobState:
 # Loading / Query helpers
 # ------------------------------------------------------------------------------
 def ensure_loaded(project_id: str) -> Tuple[Dataset, int]:
-    """Loads the dataset+faiss from disk if not already in the in-memory cache."""
     if project_id in DATASETS:
         return DATASETS[project_id]
     p = project_paths(project_id)
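For context, a persisted project can be queried back through the datasets FAISS integration. A minimal sketch (search is an invented name; it assumes a query vector from the same embedding provider and reuses project_paths/l2_normalize from this file):

    import numpy as np
    from datasets import load_from_disk

    def search(project_id: str, query_vec: list, k: int = 5):
        p = project_paths(project_id)
        ds = load_from_disk(p["ds_dir"])                   # dataset written by save_to_disk
        ds.load_faiss_index("embedding", p["faiss_file"])  # index written by save_faiss_index
        q = np.array(l2_normalize(query_vec), dtype=np.float32)
        scores, rows = ds.get_nearest_examples("embedding", q, k=k)
        return list(zip(scores, rows["path"], rows["chunk"]))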
@@ -440,13 +446,6 @@ async def api_info():
         "hub_export_enabled": bool(HF_DATASET_REPO and HfApi),
     }
 
-@fastapi_app.get("/debug/paths")
-async def debug_paths(project_id: Optional[str] = None):
-    res = {"DATA_DIR": DATA_DIR, "cwd": os.getcwd()}
-    if project_id:
-        res["project_paths"] = project_paths(project_id)
-    return res
-
 @fastapi_app.get("/")
 async def root_redirect():
     return RedirectResponse(url=UI_PATH, status_code=307)
@@ -510,7 +509,6 @@ async def export_hub(project_id: str = Query(..., min_length=1), repo_id: Optional…
     except Exception:
         pass
 
-    # zip the project folder
     buf = io.BytesIO()
     base_dir = p["base"]
     zip_name = f"{project_id}_vectors.zip"
@@ -535,8 +533,10 @@
 # Gradio UI
 # ------------------------------------------------------------------------------
 def _default_two_docs() -> List[Dict[str, str]]:
-    a = "Alpha bravo charlie delta echo foxtrot golf hotel india…"
-    b = …
+    a = ("Alpha bravo charlie delta echo foxtrot golf hotel india juliett kilo lima mike november oscar papa "
+         "quebec romeo sierra tango uniform victor whiskey xray yankee zulu. ") * 5
+    b = ("Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet "
+         "dolore magna aliquam erat volutpat. ") * 5
     return [{"path": "a.txt", "text": a}, {"path": "b.txt", "text": b}]
 
 async def ui_wipe(project: str):
@@ -566,13 +566,33 @@ async def ui_index_sample(project: str, chunk_size: int, overlap: int, batch_size: int, store_text: bool):
         LOG.exception("index UI error")
         return f"❌ Index erreur: {e}", ""
 
+async def ui_index_from_textarea(project: str, text1: str, text2: str, chunk_size: int, overlap: int, batch_size: int, store_text: bool):
+    files = [
+        {"path": "ui_text_1.txt", "text": text1 or ""},
+        {"path": "ui_text_2.txt", "text": text2 or ""},
+    ]
+    req = IndexRequest(
+        project_id=project,
+        files=[FileItem(**f) for f in files],
+        chunk_size=chunk_size,
+        overlap=overlap,
+        batch_size=batch_size,
+        store_text=store_text,
+    )
+    try:
+        job = create_and_start_job(req)
+        return f"🚀 Job (textarea) lancé: {job.job_id}", job.job_id
+    except Exception as e:
+        LOG.exception("index-from-text UI error")
+        return f"❌ Index (textarea) erreur: {e}", ""
+
 async def ui_status(job_id: str):
     if not job_id.strip():
         return "⚠️ Renseigne un job_id"
     try:
         st = await status(job_id)
         lines = [f"Job {st['job_id']} — stage={st['stage']} files={st['total_files']} chunks={st['total_chunks']} embedded={st['embedded']} indexed={st['indexed']}"]
-        lines += st.get("messages", [])[-…:]
+        lines += st.get("messages", [])[-100:]
         if st.get("errors"):
             lines.append("Erreurs:")
             lines += [f" - {e}" for e in st['errors']]
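The textarea path goes through the same IndexRequest/create_and_start_job pipeline as file-based indexing, so it can also be exercised without the UI (illustrative values):

    req = IndexRequest(
        project_id="demo",
        files=[FileItem(path="ui_text_1.txt", text="Un long texte de test. " * 50)],
        chunk_size=200,
        overlap=20,
    )
    job = create_and_start_job(req)
    print(job.job_id, job.stage)  # then poll ui_status / the status endpoint with this id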
@@ -621,7 +641,7 @@ with gr.Blocks(title="Remote Indexer — No-Qdrant (datasets+FAISS)", analytics_…
     jobid_tb = gr.Textbox(label="Job ID", value="", interactive=True)
     with gr.Row():
         wipe_btn = gr.Button("🧨 Wipe project", variant="stop")
-        index_btn = gr.Button("🚀 Indexer 2 documents", variant="primary")
+        index_btn = gr.Button("🚀 Indexer 2 documents (démo)", variant="primary")
         count_btn = gr.Button("📊 Count points", variant="secondary")
     with gr.Row():
         chunk_size = gr.Slider(64, 1024, value=200, step=8, label="chunk_size")
@@ -630,6 +650,11 @@ with gr.Blocks(title="Remote Indexer — No-Qdrant (datasets+FAISS)", analytics_…
     store_text = gr.Checkbox(value=True, label="store_text (payload)")
     out_log = gr.Textbox(lines=18, label="Logs / Résultats", interactive=False)
 
+    with gr.Accordion("Indexer depuis textarea (bypass fichiers)", open=False):
+        txt1 = gr.Textbox(label="Texte 1", value="Ceci est un texte de test assez long pour produire des chunks. " * 10, lines=6)
+        txt2 = gr.Textbox(label="Texte 2", value="Deuxième texte de test pour vérifier l'indexation et la recherche. " * 10, lines=6)
+        index_txt_btn = gr.Button("📝 Indexer ces 2 textes")
+
     with gr.Row():
         status_btn = gr.Button("📡 Status (refresh)")
         auto_chk = gr.Checkbox(False, label="⏱️ Auto-refresh status (2 s)")
@@ -646,6 +671,7 @@ with gr.Blocks(title="Remote Indexer — No-Qdrant (datasets+FAISS)", analytics_…
 
     wipe_btn.click(ui_wipe, inputs=[project_tb], outputs=[out_log])
     index_btn.click(ui_index_sample, inputs=[project_tb, chunk_size, overlap, batch_size, store_text], outputs=[out_log, jobid_tb])
+    index_txt_btn.click(ui_index_from_textarea, inputs=[project_tb, txt1, txt2, chunk_size, overlap, batch_size, store_text], outputs=[out_log, jobid_tb])
     count_btn.click(ui_count, inputs=[project_tb], outputs=[out_log])
 
     status_btn.click(ui_status, inputs=[jobid_tb], outputs=[out_log])