chouchouvs committed
Commit 82a76f0 · verified · 1 Parent(s): aebd89f

Update main.py

Files changed (1): main.py (+96, -70)
main.py CHANGED
@@ -3,19 +3,17 @@
 HF Space - Remote Indexer (No-Qdrant)
 Vector storage & search with 🤗 datasets + FAISS (local), Gradio UI.
 
-Pipeline:
-- /index: chunk → embeddings (HF Inference or dummy) → Dataset.from_dict → add_faiss_index(IP) → save_to_disk
-- /count: reads the on-disk dataset (if not loaded) → returns the row count
-- /query: embeds the query → dataset.get_nearest_examples('embedding', query, k)
-- /wipe: deletes the project folder
-- /export_hub (optional): pushes the project folder to a Dataset repo on the Hub
+Improvements:
+- "Fail-safe" chunking: if no chunk is produced, fall back to 1 chunk = the whole text.
+- Detailed logs: text length per file, chunk count per file.
+- UI: an "Indexer depuis textarea" button to test with 2 large texts injected from the UI.
 
 ENV:
 - EMB_PROVIDER ("hf" | "dummy", default "hf")
 - HF_EMBED_MODEL (e.g. "BAAI/bge-m3" | "intfloat/e5-base-v2")
 - HUGGINGFACEHUB_API_TOKEN (required if EMB_PROVIDER=hf)
 - EMB_FALLBACK_TO_DUMMY (true/false)
-- DATA_DIR (default: auto-pick writable: $DATA_DIR, ./data, /home/user/app/data, /home/user/data, /tmp/data)
+- DATA_DIR (auto-pick writable: $DATA_DIR, ./data, /home/user/app/data, /home/user/data, /tmp/data)
 - HF_DATASET_REPO (optional "username/my_proj_vectors") for export
 - LOG_LEVEL (DEBUG by default)
 - UI_PATH ("/ui")
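
For orientation, the endpoints in the removed Pipeline block map almost one-to-one onto the 🤗 datasets FAISS API. A minimal self-contained sketch of the /index → /query path, with a dummy 8-dim embedder standing in for HF Inference (illustration only, not part of the commit):

    import faiss
    import numpy as np
    from datasets import Dataset

    def embed(texts):
        # stand-in for the HF Inference call; returns L2-normalized float32 vectors
        rng = np.random.default_rng(0)
        vecs = rng.normal(size=(len(texts), 8)).astype("float32")
        return vecs / np.linalg.norm(vecs, axis=1, keepdims=True)

    texts = ["hello world", "bonjour le monde"]
    ds = Dataset.from_dict({"text": texts, "embedding": embed(texts).tolist()})
    ds.add_faiss_index(column="embedding", metric_type=faiss.METRIC_INNER_PRODUCT)
    scores, hits = ds.get_nearest_examples("embedding", embed(["hello"])[0], k=1)
    print(scores, hits["text"])
    ds.save_faiss_index("embedding", "index.faiss")  # /index persists this next to save_to_disk(...)
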
@@ -25,7 +23,6 @@ ENV:
 from __future__ import annotations
 import os
 import io
-import re
 import json
 import time
 import uuid
@@ -65,7 +62,7 @@ logging.basicConfig(
 LOG = logging.getLogger("remote_indexer_noqdrant")
 
 EMB_PROVIDER = os.getenv("EMB_PROVIDER", "hf").lower()  # "hf" | "dummy"
-HF_EMBED_MODEL = os.getenv("HF_EMBED_MODEL", "BAAI/bge-m3")
+HF_EMBED_MODEL = os.getenv("HF_EMBED_MODEL", "intfloat/e5-base-v2")
 HF_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN", "")
 EMB_FALLBACK_TO_DUMMY = os.getenv("EMB_FALLBACK_TO_DUMMY", "false").lower() in ("1","true","yes","on")
 
@@ -80,11 +77,11 @@ if EMB_PROVIDER == "hf" and not HF_TOKEN and not EMB_FALLBACK_TO_DUMMY:
 # ------------------------------------------------------------------------------
 def pick_data_dir() -> str:
     candidates = [
-        os.getenv("DATA_DIR", "").strip(),   # env takes priority if provided
-        os.path.join(os.getcwd(), "data"),   # ./data in the WORKDIR (/app)
-        "/home/user/app/data",               # typical HF Spaces paths
+        os.getenv("DATA_DIR", "").strip(),
+        os.path.join(os.getcwd(), "data"),
+        "/home/user/app/data",
         "/home/user/data",
-        "/tmp/data",                         # always writable
+        "/tmp/data",
     ]
     for p in candidates:
         if not p:
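
The hunk cuts off before the body of the probe loop; presumably a candidate is kept once it proves writable, roughly like this (a sketch; first_writable is a hypothetical name, not the commit's code):

    import os
    import tempfile

    def first_writable(candidates: list[str]) -> str:
        for p in candidates:
            if not p:
                continue
            try:
                os.makedirs(p, exist_ok=True)
                with tempfile.TemporaryFile(dir=p):
                    pass                  # we can actually create files here
                return p
            except OSError:
                continue                  # read-only or forbidden, try the next one
        return "/tmp/data"                # last resort, always writable on Spaces
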
@@ -113,8 +110,8 @@ class FileItem(BaseModel):
 class IndexRequest(BaseModel):
     project_id: str = Field(..., min_length=1)
     files: List[FileItem] = Field(default_factory=list)
-    chunk_size: int = Field(200, ge=64, le=4096)
-    overlap: int = Field(20, ge=0, le=512)
+    chunk_size: int = Field(200, ge=32, le=8192)
+    overlap: int = Field(20, ge=0, le=1024)
     batch_size: int = Field(32, ge=1, le=1024)
     store_text: bool = True
 
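The widened bounds are still hard limits: pydantic rejects out-of-range values at request time rather than clamping them (clamping only happens later, inside chunk_text_fail_safe). For instance:

    from pydantic import ValidationError

    try:
        IndexRequest(project_id="p", chunk_size=16)   # below the new ge=32 floor
    except ValidationError as e:
        print(e.errors()[0]["loc"])                   # ('chunk_size',)
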
@@ -143,45 +140,43 @@ class JobState(BaseModel):
         LOG.debug(f"[{self.job_id}] {msg}")
 
 JOBS: Dict[str, JobState] = {}
-
-# In-memory cache {project_id: (Dataset, dim)}
-DATASETS: Dict[str, Tuple[Dataset, int]] = {}
+DATASETS: Dict[str, Tuple[Dataset, int]] = {}  # in-memory cache {project_id: (Dataset, dim)}
 
 # ------------------------------------------------------------------------------
-# Utils
+# Chunking utils
 # ------------------------------------------------------------------------------
-def hash8(s: str) -> str:
-    return hashlib.sha256(s.encode("utf-8")).hexdigest()[:16]
-
-def l2_normalize(vec: List[float]) -> List[float]:
-    arr = np.array(vec, dtype=np.float32)
-    n = float(np.linalg.norm(arr))
-    if n > 0:
-        arr = arr / n
-    return arr.astype(np.float32).tolist()
-
-def flatten_any(x: Any) -> List[float]:
-    if isinstance(x, (list, tuple)):
-        if len(x) > 0 and isinstance(x[0], (list, tuple)):
-            return flatten_any(x[0])
-        return list(map(float, x))
-    raise ValueError("Embedding vector mal formé")
-
-def chunk_text(text: str, chunk_size: int, overlap: int) -> List[Tuple[int, int, str]]:
+def chunk_text_fail_safe(text: str, chunk_size: int, overlap: int, min_keep_chars: int = 1) -> List[Tuple[int, int, str]]:
+    """
+    Split the text into overlapping windows. If no "useful" chunk is produced
+    but the text holds at least min_keep_chars non-blank chars, return 1 chunk = 100% of the text.
+    """
     text = text or ""
-    if not text.strip():
+    base = text.strip("\n\r\t ")
+    nclean = len(base)
+    if nclean < min_keep_chars:
         return []
-    res = []
+
     n = len(text)
+    res: List[Tuple[int, int, str]] = []
     i = 0
+    # Normalize out-of-range parameters
+    chunk_size = max(32, int(chunk_size))
+    overlap = max(0, min(int(overlap), chunk_size - 1))
+
     while i < n:
         j = min(i + chunk_size, n)
         chunk = text[i:j]
-        if len(chunk.strip()) >= 30:
+        if len(chunk.strip()) >= min_keep_chars:
            res.append((i, j, chunk))
+        if j == n:
+            break
         i = j - overlap
         if i <= 0:
             i = j
+
+    if not res:
+        # fail-safe: 1 chunk covering the whole text
+        res = [(0, n, text)]
     return res
 
 def project_paths(project_id: str) -> Dict[str, str]:
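
The fail-safe guarantee is easy to sanity-check; expected behaviour of the function above (illustrative, not part of the commit):

    # Overlapping windows advance by chunk_size - overlap characters
    chunk_text_fail_safe("abcdefghij" * 10, chunk_size=32, overlap=8)
    # -> [(0, 32, ...), (24, 56, ...), (48, 80, ...), (72, 100, ...)]

    # Fail-safe: a 2-char text now yields one chunk instead of zero
    assert chunk_text_fail_safe("ok", chunk_size=200, overlap=20) == [(0, 2, "ok")]

    # Whitespace-only input is still dropped entirely
    assert chunk_text_fail_safe(" \n\t ", chunk_size=200, overlap=20) == []
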
@@ -211,10 +206,23 @@ def load_meta(meta_path: str) -> Dict[str, Any]:
 # ------------------------------------------------------------------------------
 # Embeddings (HF Inference or dummy)
 # ------------------------------------------------------------------------------
+def l2_normalize(vec: List[float]) -> List[float]:
+    arr = np.array(vec, dtype=np.float32)
+    n = float(np.linalg.norm(arr))
+    if n > 0:
+        arr = arr / n
+    return arr.astype(np.float32).tolist()
+
+def flatten_any(x: Any) -> List[float]:
+    if isinstance(x, (list, tuple)):
+        if len(x) > 0 and isinstance(x[0], (list, tuple)):
+            return flatten_any(x[0])
+        return list(map(float, x))
+    raise ValueError("Embedding vector mal formé")
+
 def _maybe_prefix_for_model(texts: List[str], model_name: str) -> List[str]:
     m = (model_name or "").lower()
     if "e5" in m:
-        # E5: "query: ..." / "passage: ..." etc. Kept simple and uniform here.
         return [("query: " + t) for t in texts]
     return texts
 
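Note that the E5 family is trained with asymmetric prefixes, "query: " for queries and "passage: " for documents, while the code above applies "query: " to both sides. A side-aware variant could look like this (hypothetical helper, not in the commit):

    def _prefix_for_e5(texts: List[str], is_query: bool) -> List[str]:
        # E5 convention: queries and passages get different prefixes
        tag = "query: " if is_query else "passage: "
        return [tag + t for t in texts]
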
@@ -269,7 +277,7 @@ async def embed_texts(client: httpx.AsyncClient, texts: List[str]) -> List[List[
 async def build_dataset_with_faiss(job: JobState, req: IndexRequest) -> None:
     """
     Builds a HuggingFace dataset with the columns:
-    - path (str), text (optional), chunk (int), start (int), end (int), embedding (float32[])
+    - path (str), text (str), chunk (int), start (int), end (int), embedding (float32[])
     Adds a FAISS index (inner product) and persists it to disk.
     """
     try:
@@ -281,32 +289,32 @@
             f"provider={EMB_PROVIDER} model={HF_EMBED_MODEL}"
         )
 
-        # Chunking
+        # Chunking + per-file logs
         records: List[Dict[str, Any]] = []
         for f in req.files:
-            chunks = chunk_text(f.text, req.chunk_size, req.overlap)
-            if not chunks:
-                job.log(f"{f.path}: 0 chunk (trop court ou vide)")
+            t = f.text or ""
+            tlen = len(t)
+            job.log(f"{f.path}: len(text)={tlen}")
+            chunks = chunk_text_fail_safe(t, req.chunk_size, req.overlap, min_keep_chars=1)
+            job.log(f"{f.path}: chunks créés={len(chunks)}")
             for idx, (start, end, ch) in enumerate(chunks):
                 payload = {"path": f.path, "chunk": idx, "start": start, "end": end}
-                if req.store_text:
-                    payload["text"] = ch
-                else:
-                    payload["text"] = None
+                payload["text"] = ch if req.store_text else ""
                 payload["raw"] = ch
                 records.append(payload)
+
         job.total_chunks = len(records)
         job.log(f"Total chunks = {job.total_chunks}")
         if job.total_chunks == 0:
             job.stage = "failed"
-            job.errors.append("Aucun chunk à indexer.")
+            job.errors.append("Aucun chunk à indexer (textes vides ?)")
             job.finished_at = time.time()
             return
 
         # Batch embeddings
         async with httpx.AsyncClient(timeout=180) as client:
             all_vecs: List[List[float]] = []
-            B = max(8, min(64, req.batch_size * 2))
+            B = max(8, min(128, req.batch_size * 2))
             i = 0
             while i < len(records):
                 sub = records[i : i + B]
@@ -322,12 +330,12 @@
         vec_dim = len(all_vecs[0])
         job.log(f"Embeddings dim={vec_dim}")
 
-        # Prepare dataset columns
+        # Dataset columns
         paths = [r["path"] for r in records]
         chunks = [int(r["chunk"]) for r in records]
         starts = [int(r["start"]) for r in records]
         ends = [int(r["end"]) for r in records]
-        texts = [r.get("text") for r in records]
+        texts = [r.get("text", "") for r in records]
 
         features = Features({
             "path": Value("string"),
@@ -350,20 +358,19 @@
             features=features,
         )
 
-        # Add FAISS index (Inner Product on normalized vectors ~ cosine)
+        # FAISS index (Inner Product = cosine after normalization)
         job.stage = "indexing"
         ds.add_faiss_index(column="embedding", metric_type=faiss.METRIC_INNER_PRODUCT)
         job.indexed = ds.num_rows
         job.log(f"FAISS index ajouté ({ds.num_rows} points)")
 
-        # Persist to disk
+        # Persistence
         p = project_paths(req.project_id)
         os.makedirs(p["faiss_dir"], exist_ok=True)
         ds.save_to_disk(p["ds_dir"])
         ds.save_faiss_index("embedding", p["faiss_file"])
         save_meta(p["meta_file"], {"dim": vec_dim, "rows": ds.num_rows, "model": HF_EMBED_MODEL, "ts": time.time()})
 
-        # In-memory cache
         DATASETS[req.project_id] = (ds, vec_dim)
 
         job.stage = "done"
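
METRIC_INNER_PRODUCT acts as cosine similarity here only because every vector has been through l2_normalize: on unit vectors the inner product equals the cosine. A quick numeric check (illustration):

    import numpy as np

    a = np.random.rand(8).astype("float32"); a /= np.linalg.norm(a)
    b = np.random.rand(8).astype("float32"); b /= np.linalg.norm(b)
    cosine = float(a @ b) / float(np.linalg.norm(a) * np.linalg.norm(b))
    assert abs(float(a @ b) - cosine) < 1e-6   # norms are 1, so IP == cosine
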
@@ -399,7 +406,6 @@
 # Loading / Query helpers
 # ------------------------------------------------------------------------------
 def ensure_loaded(project_id: str) -> Tuple[Dataset, int]:
-    """Load the dataset+faiss from disk if not already in the in-memory cache."""
     if project_id in DATASETS:
         return DATASETS[project_id]
     p = project_paths(project_id)
@@ -440,13 +446,6 @@ async def api_info():
         "hub_export_enabled": bool(HF_DATASET_REPO and HfApi),
     }
 
-@fastapi_app.get("/debug/paths")
-async def debug_paths(project_id: Optional[str] = None):
-    res = {"DATA_DIR": DATA_DIR, "cwd": os.getcwd()}
-    if project_id:
-        res["project_paths"] = project_paths(project_id)
-    return res
-
 @fastapi_app.get("/")
 async def root_redirect():
     return RedirectResponse(url=UI_PATH, status_code=307)
@@ -510,7 +509,6 @@ async def export_hub(project_id: str = Query(..., min_length=1), repo_id: Option
     except Exception:
         pass
 
-    # zip the project folder
     buf = io.BytesIO()
     base_dir = p["base"]
     zip_name = f"{project_id}_vectors.zip"
@@ -535,8 +533,10 @@
 # Gradio UI
 # ------------------------------------------------------------------------------
 def _default_two_docs() -> List[Dict[str, str]]:
-    a = "Alpha bravo charlie delta echo foxtrot golf hotel india. " * 3
-    b = "Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy. " * 3
+    a = ("Alpha bravo charlie delta echo foxtrot golf hotel india juliett kilo lima mike november oscar papa "
+         "quebec romeo sierra tango uniform victor whiskey xray yankee zulu. ") * 5
+    b = ("Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet "
+         "dolore magna aliquam erat volutpat. ") * 5
     return [{"path": "a.txt", "text": a}, {"path": "b.txt", "text": b}]
 
 async def ui_wipe(project: str):
@@ -566,13 +566,33 @@ async def ui_index_sample(project: str, chunk_size: int, overlap: int, batch_siz
         LOG.exception("index UI error")
         return f"❌ Index erreur: {e}", ""
 
+async def ui_index_from_textarea(project: str, text1: str, text2: str, chunk_size: int, overlap: int, batch_size: int, store_text: bool):
+    files = [
+        {"path": "ui_text_1.txt", "text": text1 or ""},
+        {"path": "ui_text_2.txt", "text": text2 or ""},
+    ]
+    req = IndexRequest(
+        project_id=project,
+        files=[FileItem(**f) for f in files],
+        chunk_size=chunk_size,
+        overlap=overlap,
+        batch_size=batch_size,
+        store_text=store_text,
+    )
+    try:
+        job = create_and_start_job(req)
+        return f"🚀 Job (textarea) lancé: {job.job_id}", job.job_id
+    except Exception as e:
+        LOG.exception("index-from-text UI error")
+        return f"❌ Index (textarea) erreur: {e}", ""
+
 async def ui_status(job_id: str):
     if not job_id.strip():
         return "⚠️ Renseigne un job_id"
     try:
         st = await status(job_id)
         lines = [f"Job {st['job_id']} — stage={st['stage']} files={st['total_files']} chunks={st['total_chunks']} embedded={st['embedded']} indexed={st['indexed']}"]
-        lines += st.get("messages", [])[-80:]
+        lines += st.get("messages", [])[-100:]
         if st.get("errors"):
             lines.append("Erreurs:")
             lines += [f" - {e}" for e in st['errors']]
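
The textarea handler goes through the same IndexRequest model as the HTTP API. Assuming the /index route accepts that model as a JSON body and the Space listens on the usual port 7860 (both assumptions, not shown in this diff), the same job could be launched programmatically:

    import httpx

    payload = {
        "project_id": "demo",
        "files": [{"path": "ui_text_1.txt", "text": "Ceci est un texte de test. " * 10}],
        "chunk_size": 200, "overlap": 20, "batch_size": 32, "store_text": True,
    }
    r = httpx.post("http://localhost:7860/index", json=payload, timeout=60)
    print(r.json())  # presumably returns a job_id to poll, as ui_status does
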
@@ -621,7 +641,7 @@ with gr.Blocks(title="Remote Indexer — No-Qdrant (datasets+FAISS)", analytics_
     jobid_tb = gr.Textbox(label="Job ID", value="", interactive=True)
     with gr.Row():
         wipe_btn = gr.Button("🧨 Wipe project", variant="stop")
-        index_btn = gr.Button("🚀 Indexer 2 documents", variant="primary")
+        index_btn = gr.Button("🚀 Indexer 2 documents (démo)", variant="primary")
         count_btn = gr.Button("📊 Count points", variant="secondary")
     with gr.Row():
         chunk_size = gr.Slider(64, 1024, value=200, step=8, label="chunk_size")
@@ -630,6 +650,11 @@ with gr.Blocks(title="Remote Indexer — No-Qdrant (datasets+FAISS)", analytics_
     store_text = gr.Checkbox(value=True, label="store_text (payload)")
     out_log = gr.Textbox(lines=18, label="Logs / Résultats", interactive=False)
 
+    with gr.Accordion("Indexer depuis textarea (bypass fichiers)", open=False):
+        txt1 = gr.Textbox(label="Texte 1", value="Ceci est un texte de test assez long pour produire des chunks. " * 10, lines=6)
+        txt2 = gr.Textbox(label="Texte 2", value="Deuxième texte de test pour vérifier l'indexation et la recherche. " * 10, lines=6)
+        index_txt_btn = gr.Button("📝 Indexer ces 2 textes")
+
     with gr.Row():
         status_btn = gr.Button("📡 Status (refresh)")
         auto_chk = gr.Checkbox(False, label="⏱️ Auto-refresh status (2 s)")
@@ -646,6 +671,7 @@ with gr.Blocks(title="Remote Indexer — No-Qdrant (datasets+FAISS)", analytics_
 
     wipe_btn.click(ui_wipe, inputs=[project_tb], outputs=[out_log])
     index_btn.click(ui_index_sample, inputs=[project_tb, chunk_size, overlap, batch_size, store_text], outputs=[out_log, jobid_tb])
+    index_txt_btn.click(ui_index_from_textarea, inputs=[project_tb, txt1, txt2, chunk_size, overlap, batch_size, store_text], outputs=[out_log, jobid_tb])
     count_btn.click(ui_count, inputs=[project_tb], outputs=[out_log])
 
     status_btn.click(ui_status, inputs=[jobid_tb], outputs=[out_log])
 