chouchouvs committed
Commit c6a5264 · verified · 1 Parent(s): 2624013

Update main.py

Files changed (1): main.py (+313, -659)
main.py CHANGED
@@ -1,705 +1,359 @@
1
  # -*- coding: utf-8 -*-
2
- """
3
- HF Space - Remote Indexer (No-Qdrant)
4
- Stockage & recherche vectorielle avec 🤗 datasets + FAISS (local), UI Gradio.
5
-
6
- Améliorations clés de cette version :
7
- - ✅ Sauvegarde d'abord le dataset SANS index -> save_to_disk()
8
- - ✅ Puis ajoute FAISS et sauvegarde l'index séparément -> save_faiss_index()
9
- - ✅ L'index reste chargé en mémoire pour la session courante
10
- - ✅ Découpage "fail-safe" (au moins 1 chunk si texte non vide)
11
- - ✅ Logs détaillés par fichier et par étape
12
- - ✅ UI: bouton "Indexer depuis textarea" pour tester facilement
13
-
14
- ENV:
15
- - EMB_PROVIDER ("hf" | "dummy", défaut "hf")
16
- - HF_EMBED_MODEL (ex: "BAAI/bge-m3" | "intfloat/e5-base-v2")
17
- - HUGGINGFACEHUB_API_TOKEN (requis si EMB_PROVIDER=hf)
18
- - EMB_FALLBACK_TO_DUMMY (true/false)
19
- - DATA_DIR (auto-pick writable: $DATA_DIR, ./data, /home/user/app/data, /home/user/data, /tmp/data)
20
- - HF_DATASET_REPO (optionnel "username/my_proj_vectors") pour export
21
- - LOG_LEVEL (DEBUG par défaut)
22
- - UI_PATH ("/ui")
23
- - PORT (7860)
24
- """
25
-
26
  from __future__ import annotations
 
27
  import os
28
  import io
29
  import json
30
  import time
31
- import uuid
32
- import shutil
33
- import hashlib
34
  import logging
35
- import asyncio
36
- import threading
37
- from typing import List, Dict, Any, Optional, Tuple
38
 
39
  import numpy as np
40
- import httpx
41
- import uvicorn
42
- import gradio as gr
43
- import faiss # type: ignore
44
- from pydantic import BaseModel, Field, ValidationError
45
- from fastapi import FastAPI, HTTPException, Query
46
  from fastapi.middleware.cors import CORSMiddleware
47
- from fastapi.responses import RedirectResponse
48
-
49
- from datasets import Dataset, Features, Sequence, Value, load_from_disk
50
-
51
- try:
52
- from huggingface_hub import HfApi, create_repo
53
- except Exception:
54
- HfApi = None
55
- create_repo = None
56
-
57
- # ------------------------------------------------------------------------------
58
- # Config & logs
59
- # ------------------------------------------------------------------------------
60
- LOG_LEVEL = os.getenv("LOG_LEVEL", "DEBUG").upper()
61
- logging.basicConfig(
62
- level=getattr(logging, LOG_LEVEL, logging.DEBUG),
63
- format="%(asctime)s - %(levelname)s - %(message)s",
64
- )
65
- LOG = logging.getLogger("remote_indexer_noqdrant")
66
-
67
- EMB_PROVIDER = os.getenv("EMB_PROVIDER", "hf").lower() # "hf" | "dummy"
68
- HF_EMBED_MODEL = os.getenv("HF_EMBED_MODEL", "intfloat/e5-base-v2")
69
- HF_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN", "")
70
- EMB_FALLBACK_TO_DUMMY = os.getenv("EMB_FALLBACK_TO_DUMMY", "false").lower() in ("1","true","yes","on")
71
-
72
- UI_PATH = os.getenv("UI_PATH", "/ui")
73
- HF_DATASET_REPO = os.getenv("HF_DATASET_REPO", "").strip() # optionnel
74
-
75
- if EMB_PROVIDER == "hf" and not HF_TOKEN and not EMB_FALLBACK_TO_DUMMY:
76
- LOG.warning("EMB_PROVIDER=hf sans HUGGINGFACEHUB_API_TOKEN (pas de fallback). Mets EMB_PROVIDER=dummy ou EMB_FALLBACK_TO_DUMMY=true pour tester.")
77
-
78
- # ------------------------------------------------------------------------------
79
- # Sélection robuste d'un DATA_DIR writable
80
- # ------------------------------------------------------------------------------
81
- def pick_data_dir() -> str:
82
- candidates = [
83
- os.getenv("DATA_DIR", "").strip(),
84
- os.path.join(os.getcwd(), "data"),
85
- "/home/user/app/data",
86
- "/home/user/data",
87
- "/tmp/data",
88
- ]
89
- for p in candidates:
90
- if not p:
91
- continue
92
- try:
93
- os.makedirs(p, exist_ok=True)
94
- testp = os.path.join(p, ".rw_test")
95
- with open(testp, "w", encoding="utf-8") as f:
96
- f.write("ok")
97
- os.remove(testp)
98
- LOG.info(f"[DATA_DIR] Utilisation de: {p}")
99
- return p
100
- except Exception as e:
101
- LOG.warning(f"[DATA_DIR] Candidat non writable '{p}': {e}")
102
- raise RuntimeError("Aucun répertoire DATA_DIR accessible en écriture.")
103
-
104
- DATA_DIR = pick_data_dir()
105
-
106
- # ------------------------------------------------------------------------------
107
- # Modèles Pydantic
108
- # ------------------------------------------------------------------------------
109
- class FileItem(BaseModel):
110
- path: str
111
- text: str
112
 
113
- class IndexRequest(BaseModel):
114
- project_id: str = Field(..., min_length=1)
115
- files: List[FileItem] = Field(default_factory=list)
116
- chunk_size: int = Field(200, ge=32, le=8192)
117
- overlap: int = Field(20, ge=0, le=1024)
118
- batch_size: int = Field(32, ge=1, le=1024)
119
- store_text: bool = True
120
-
121
- class QueryRequest(BaseModel):
122
- project_id: str
123
- text: str
124
- top_k: int = Field(5, ge=1, le=100)
125
 
126
  class JobState(BaseModel):
127
  job_id: str
128
  project_id: str
129
- stage: str = "pending" # pending -> embedding -> indexing -> done/failed
130
  total_files: int = 0
131
  total_chunks: int = 0
132
  embedded: int = 0
133
  indexed: int = 0
134
- errors: List[str] = Field(default_factory=list)
135
- messages: List[str] = Field(default_factory=list)
136
- started_at: float = Field(default_factory=time.time)
137
  finished_at: Optional[float] = None
138
 
139
- def log(self, msg: str) -> None:
140
- stamp = time.strftime("%H:%M:%S")
141
- line = f"[{stamp}] {msg}"
142
- self.messages.append(line)
143
- LOG.debug(f"[{self.job_id}] {msg}")
144
-
145
  JOBS: Dict[str, JobState] = {}
146
- DATASETS: Dict[str, Tuple[Dataset, int]] = {} # cache mémoire {project_id: (Dataset, dim)}
147
-
148
- # ------------------------------------------------------------------------------
149
- # Utils découpage
150
- # ------------------------------------------------------------------------------
151
- def chunk_text_fail_safe(text: str, chunk_size: int, overlap: int, min_keep_chars: int = 1) -> List[Tuple[int, int, str]]:
152
- """
153
- Découpe le texte en fenêtres chevauchantes. Si aucun chunk "utile" n'est produit
154
- mais que le texte contient au moins min_keep_chars non-blanc, on retourne 1 chunk = 100% du texte.
155
- """
156
- text = text or ""
157
- base = text.strip("\n\r\t ")
158
- nclean = len(base)
159
- if nclean < min_keep_chars:
160
- return []
161
 
162
- n = len(text)
163
- res: List[Tuple[int, int, str]] = []
 
164
  i = 0
165
- # Normalisation des paramètres absurdes
166
- chunk_size = max(32, int(chunk_size))
167
- overlap = max(0, min(int(overlap), chunk_size - 1))
168
-
169
- while i < n:
170
- j = min(i + chunk_size, n)
171
- chunk = text[i:j]
172
- if len(chunk.strip()) >= min_keep_chars:
173
- res.append((i, j, chunk))
174
- if j == n:
175
  break
176
- i = j - overlap
177
- if i <= 0:
178
- i = j
179
-
180
- if not res:
181
- # fail-safe : 1 chunk couvrant tout le texte
182
- res = [(0, n, text)]
183
- return res
184
-
185
- def project_paths(project_id: str) -> Dict[str, str]:
186
- base = os.path.join(DATA_DIR, project_id)
187
- return {
188
- "base": base,
189
- "ds_dir": os.path.join(base, "dataset"),
190
- "faiss_dir": os.path.join(base, "faiss"),
191
- "faiss_file": os.path.join(base, "faiss", "emb.faiss"),
192
- "meta_file": os.path.join(base, "meta.json"),
193
- }
194
-
195
- def save_meta(meta_path: str, data: Dict[str, Any]) -> None:
196
- os.makedirs(os.path.dirname(meta_path), exist_ok=True)
197
- with open(meta_path, "w", encoding="utf-8") as f:
198
- json.dump(data, f, indent=2, ensure_ascii=False)
199
-
200
- def load_meta(meta_path: str) -> Dict[str, Any]:
201
- if not os.path.exists(meta_path):
202
- return {}
203
- try:
204
- with open(meta_path, "r", encoding="utf-8") as f:
205
- return json.load(f)
206
- except Exception:
207
- return {}
208
-
209
- # ------------------------------------------------------------------------------
210
- # Embeddings (HF Inference ou dummy)
211
- # ------------------------------------------------------------------------------
212
- def l2_normalize(vec: List[float]) -> List[float]:
213
- arr = np.array(vec, dtype=np.float32)
214
- n = float(np.linalg.norm(arr))
215
- if n > 0:
216
- arr = arr / n
217
- return arr.astype(np.float32).tolist()
218
-
219
- def flatten_any(x: Any) -> List[float]:
220
- if isinstance(x, (list, tuple)):
221
- if len(x) > 0 and isinstance(x[0], (list, tuple)):
222
- return flatten_any(x[0])
223
- return list(map(float, x))
224
- raise ValueError("Embedding vector mal formé")
225
-
226
- def _maybe_prefix_for_model(texts: List[str], model_name: str) -> List[str]:
227
- m = (model_name or "").lower()
228
- if "e5" in m:
229
- return [("query: " + t) for t in texts]
230
- return texts
231
-
232
- async def embed_hf(client: httpx.AsyncClient, texts: List[str], model: str = HF_EMBED_MODEL, token: str = HF_TOKEN) -> List[List[float]]:
233
- if not token:
234
- raise HTTPException(status_code=400, detail="HUGGINGFACEHUB_API_TOKEN manquant pour EMB_PROVIDER=hf")
235
- url = f"https://api-inference.huggingface.co/models/{model}"
236
- headers = {"Authorization": f"Bearer {token}"}
237
- inputs = _maybe_prefix_for_model(texts, model)
238
- payload = {"inputs": inputs, "options": {"wait_for_model": True}}
239
- LOG.debug(f"HF POST model={model} n_texts={len(texts)}")
240
- r = await client.post(url, headers=headers, json=payload, timeout=180)
241
- if r.status_code != 200:
242
- detail = r.text
243
- LOG.error(f"HF Inference error {r.status_code}: {detail[:400]}")
244
- raise HTTPException(status_code=502, detail=f"HF Inference error {r.status_code}: {detail}")
245
- data = r.json()
246
- embeddings: List[List[float]] = []
247
- if isinstance(data, list):
248
- for row in data:
249
- vec = flatten_any(row)
250
- embeddings.append(l2_normalize(vec))
251
- else:
252
- vec = flatten_any(data)
253
- embeddings.append(l2_normalize(vec))
254
- return embeddings
255
-
256
- def embed_dummy(texts: List[str], dim: int = 128) -> List[List[float]]:
257
- out: List[List[float]] = []
258
- for t in texts:
259
- h = hashlib.sha256(t.encode("utf-8")).digest()
260
- arr = np.frombuffer((h * ((dim // len(h)) + 1))[:dim], dtype=np.uint8).astype(np.float32)
261
- arr = (arr - 127.5) / 127.5
262
- arr = arr / (np.linalg.norm(arr) + 1e-9)
263
- out.append(arr.astype(np.float32).tolist())
264
  return out
265
 
266
- async def embed_texts(client: httpx.AsyncClient, texts: List[str]) -> List[List[float]]:
267
- if EMB_PROVIDER == "hf":
268
- try:
269
- return await embed_hf(client, texts)
270
- except Exception as e:
271
- if EMB_FALLBACK_TO_DUMMY:
272
- LOG.warning(f"Fallback embeddings → dummy (cause: {e})")
273
- return embed_dummy(texts, dim=128)
274
- raise
275
- return embed_dummy(texts, dim=128)
276
-
277
- # ------------------------------------------------------------------------------
278
- # Indexation (datasets + FAISS)
279
- # ------------------------------------------------------------------------------
280
- async def build_dataset_with_faiss(job: JobState, req: IndexRequest) -> None:
281
- """
282
- Construit un dataset HuggingFace avec colonnes:
283
- - path (str), text (str), chunk (int), start (int), end (int), embedding (float32[])
284
- Sauvegarde le dataset SANS index (save_to_disk), puis crée un index FAISS
285
- et l'écrit séparément (save_faiss_index). En mémoire, on garde l'index chargé.
286
- """
287
- try:
288
- job.stage = "embedding"
289
- job.total_files = len(req.files)
290
- job.log(
291
- f"Index start project={req.project_id} files={len(req.files)} "
292
- f"chunk_size={req.chunk_size} overlap={req.overlap} batch_size={req.batch_size} store_text={req.store_text} "
293
- f"provider={EMB_PROVIDER} model={HF_EMBED_MODEL}"
294
- )
295
-
296
- # Chunking + logs par fichier
297
- records: List[Dict[str, Any]] = []
298
- for f in req.files:
299
- t = f.text or ""
300
- tlen = len(t)
301
- job.log(f"{f.path}: len(text)={tlen}")
302
- chunks = chunk_text_fail_safe(t, req.chunk_size, req.overlap, min_keep_chars=1)
303
- job.log(f"{f.path}: chunks créés={len(chunks)}")
304
- for idx, (start, end, ch) in enumerate(chunks):
305
- payload = {"path": f.path, "chunk": idx, "start": start, "end": end}
306
- payload["text"] = ch if req.store_text else ""
307
- payload["raw"] = ch
308
- records.append(payload)
309
-
310
- job.total_chunks = len(records)
311
- job.log(f"Total chunks = {job.total_chunks}")
312
- if job.total_chunks == 0:
313
- job.stage = "failed"
314
- job.errors.append("Aucun chunk à indexer (textes vides ?)")
315
- job.finished_at = time.time()
316
- return
317
-
318
- # Embeddings par batch
319
- async with httpx.AsyncClient(timeout=180) as client:
320
- all_vecs: List[List[float]] = []
321
- B = max(8, min(128, req.batch_size * 2))
322
- i = 0
323
- while i < len(records):
324
- sub = records[i : i + B]
325
- texts = [r["raw"] for r in sub]
326
- vecs = await embed_texts(client, texts)
327
- if len(vecs) != len(sub):
328
- raise HTTPException(status_code=500, detail="Embedding batch size mismatch")
329
- all_vecs.extend(vecs)
330
- job.embedded += len(vecs)
331
- job.log(f"Embeddings {job.embedded}/{job.total_chunks}")
332
- i += B
333
-
334
- vec_dim = len(all_vecs[0])
335
- job.log(f"Embeddings dim={vec_dim}")
336
-
337
- # Dataset columns
338
- paths = [r["path"] for r in records]
339
- chunks = [int(r["chunk"]) for r in records]
340
- starts = [int(r["start"]) for r in records]
341
- ends = [int(r["end"]) for r in records]
342
- texts = [r.get("text", "") for r in records]
343
-
344
- features = Features({
345
- "path": Value("string"),
346
- "chunk": Value("int32"),
347
- "start": Value("int32"),
348
- "end": Value("int32"),
349
- "text": Value("string"),
350
- "embedding": Sequence(Value("float32")),
351
- })
352
-
353
- ds = Dataset.from_dict(
354
- {
355
- "path": paths,
356
- "chunk": chunks,
357
- "start": starts,
358
- "end": ends,
359
- "text": texts,
360
- "embedding": [np.array(v, dtype=np.float32) for v in all_vecs],
361
- },
362
- features=features,
363
- )
364
-
365
- # *** IMPORTANT ***
366
- # 1) Sauvegarder le dataset SANS index (sinon save_to_disk lève une exception)
367
- p = project_paths(req.project_id)
368
- os.makedirs(p["faiss_dir"], exist_ok=True)
369
- ds.save_to_disk(p["ds_dir"])
370
- job.log(f"Dataset (sans index) sauvegardé dans {p['ds_dir']}")
371
-
372
- # 2) Ajouter l'index FAISS en mémoire, puis le sauvegarder séparément
373
- job.stage = "indexing"
374
- ds.add_faiss_index(column="embedding", metric_type=faiss.METRIC_INNER_PRODUCT)
375
- job.indexed = ds.num_rows
376
- job.log(f"FAISS index ajouté ({ds.num_rows} points)")
377
- ds.save_faiss_index("embedding", p["faiss_file"])
378
- job.log(f"FAISS écrit sur {p['faiss_file']}")
379
-
380
- # 3) Métadonnées + cache mémoire (garde l'index attaché pour cette session)
381
- save_meta(p["meta_file"], {"dim": vec_dim, "rows": ds.num_rows, "model": HF_EMBED_MODEL, "ts": time.time()})
382
- DATASETS[req.project_id] = (ds, vec_dim)
383
-
384
- job.stage = "done"
385
- job.finished_at = time.time()
386
- job.log(f"OK — dataset+index prêts (projet={req.project_id})")
387
- except Exception as e:
388
- job.stage = "failed"
389
- job.errors.append(str(e))
390
- job.finished_at = time.time()
391
- job.log(f"❌ Exception: {e}")
392
-
393
- def _run_job_in_thread(job: JobState, req: IndexRequest) -> None:
394
- def _runner():
395
- try:
396
- asyncio.run(build_dataset_with_faiss(job, req))
397
- except Exception as e:
398
- job.stage = "failed"
399
- job.errors.append(str(e))
400
- job.finished_at = time.time()
401
- job.log(f"❌ Thread exception: {e}")
402
- t = threading.Thread(target=_runner, daemon=True)
403
- t.start()
404
-
405
- def create_and_start_job(req: IndexRequest) -> JobState:
406
- job_id = uuid.uuid4().hex[:12]
407
- job = JobState(job_id=job_id, project_id=req.project_id)
408
- JOBS[job_id] = job
409
- job.log(f"Job {job_id} créé pour project {req.project_id}")
410
- _run_job_in_thread(job, req)
411
- return job
412
-
413
- # ------------------------------------------------------------------------------
414
- # Chargement / Query helpers
415
- # ------------------------------------------------------------------------------
416
- def ensure_loaded(project_id: str) -> Tuple[Dataset, int]:
417
- """
418
- Charge ou recharge le dataset et l'index FAISS depuis disque si nécessaire.
419
- - Le dataset sur disque n'a pas d'index (par design), on recharge FAISS via .load_faiss_index().
420
- """
421
- if project_id in DATASETS:
422
- return DATASETS[project_id]
423
- p = project_paths(project_id)
424
- if not os.path.exists(p["ds_dir"]):
425
- raise HTTPException(status_code=404, detail=f"Dataset absent pour projet {project_id}")
426
- ds = load_from_disk(p["ds_dir"])
427
- if os.path.exists(p["faiss_file"]):
428
- ds.load_faiss_index("embedding", p["faiss_file"])
429
- meta = load_meta(p["meta_file"])
430
- vec_dim = int(meta.get("dim", 0)) or len(ds[0]["embedding"])
431
- DATASETS[project_id] = (ds, vec_dim)
432
- return ds, vec_dim
433
-
434
- async def embed_query(text: str) -> List[float]:
435
- async with httpx.AsyncClient(timeout=60) as client:
436
- vec = (await embed_texts(client, [text]))[0]
437
- return vec
438
-
439
- # ------------------------------------------------------------------------------
440
- # FastAPI app
441
- # ------------------------------------------------------------------------------
442
- fastapi_app = FastAPI(title="Remote Indexer - NoQdrant (Datasets+FAISS)")
443
  fastapi_app.add_middleware(
444
- CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"]
445
  )
446
 
447
  @fastapi_app.get("/health")
448
- async def health():
449
- return {"status": "ok", "emb_provider": EMB_PROVIDER, "model": HF_EMBED_MODEL, "data_dir": DATA_DIR}
450
-
451
- @fastapi_app.get("/api")
452
- async def api_info():
453
- return {
454
- "ok": True, "service": "remote-indexer-noqdrant",
455
- "emb_provider": EMB_PROVIDER, "hf_model": HF_EMBED_MODEL,
456
- "fallback_to_dummy": EMB_FALLBACK_TO_DUMMY,
457
- "data_dir": DATA_DIR, "ui_path": UI_PATH,
458
- "hub_export_enabled": bool(HF_DATASET_REPO and HfApi),
459
- }
460
 
461
  @fastapi_app.get("/")
462
- async def root_redirect():
463
- return RedirectResponse(url=UI_PATH, status_code=307)
464
-
465
- @fastapi_app.post("/wipe")
466
- async def wipe(project_id: str = Query(..., min_length=1)):
467
- p = project_paths(project_id)
468
- if os.path.exists(p["base"]):
469
- shutil.rmtree(p["base"], ignore_errors=True)
470
- if project_id in DATASETS:
471
- DATASETS.pop(project_id, None)
472
- return {"ok": True, "project_id": project_id, "removed": True}
473
 
474
  @fastapi_app.post("/index")
475
- async def index(req: IndexRequest):
476
- job = create_and_start_job(req)
477
- return {"job_id": job.job_id, "project_id": job.project_id}
478
-
479
- @fastapi_app.get("/status/{job_id}")
480
- async def status(job_id: str):
481
- job = JOBS.get(job_id)
482
- if not job:
483
- raise HTTPException(status_code=404, detail="job_id inconnu")
484
- return job.model_dump()
485
-
486
- @fastapi_app.get("/collections/{project_id}/count")
487
- async def coll_count(project_id: str):
488
- try:
489
- ds, _ = ensure_loaded(project_id)
490
- return {"project_id": project_id, "count": ds.num_rows}
491
- except Exception as e:
492
- return {"project_id": project_id, "count": 0, "note": f"{e}"}
493
-
494
- @fastapi_app.post("/query")
495
- async def query(req: QueryRequest):
496
- ds, vec_dim = ensure_loaded(req.project_id)
497
- qvec = await embed_query(req.text)
498
- if len(qvec) != vec_dim:
499
- raise HTTPException(status_code=400, detail=f"Dim requête {len(qvec)} ≠ dim index {vec_dim}")
500
- scores, ex = ds.get_nearest_examples("embedding", np.array(qvec, dtype=np.float32), k=req.top_k)
501
- results = []
502
- for s, path, chunk, text in zip(scores, ex["path"], ex["chunk"], ex["text"]):
503
- preview = ((text or "")[:160]).replace("\n", " ")
504
- results.append({"score": float(s), "path": path, "chunk": int(chunk), "preview": preview})
505
- return {"result": results, "k": req.top_k}
506
-
507
- @fastapi_app.post("/export_hub")
508
- async def export_hub(project_id: str = Query(..., min_length=1), repo_id: Optional[str] = None):
509
- if not HfApi or not HF_TOKEN:
510
- raise HTTPException(status_code=400, detail="huggingface_hub non dispo ou HF token absent.")
511
- p = project_paths(project_id)
512
- if not os.path.exists(p["ds_dir"]):
513
- raise HTTPException(status_code=404, detail="Aucun dataset local à exporter.")
514
- rid = (repo_id or HF_DATASET_REPO or "").strip()
515
- if not rid:
516
- raise HTTPException(status_code=400, detail="repo_id requis (ou HF_DATASET_REPO).")
517
-
518
- api = HfApi(token=HF_TOKEN)
519
- try:
520
- create_repo(rid, repo_type="dataset", exist_ok=True, token=HF_TOKEN)
521
- except Exception:
522
- pass
523
-
524
- # Zip du dossier projet
525
- buf = io.BytesIO()
526
- base_dir = p["base"]
527
- zip_name = f"{project_id}_vectors.zip"
528
- import zipfile
529
- with zipfile.ZipFile(buf, "w", compression=zipfile.ZIP_DEFLATED) as z:
530
- for root, _, files in os.walk(base_dir):
531
- for fn in files:
532
- full = os.path.join(root, fn)
533
- rel = os.path.relpath(full, base_dir)
534
- z.write(full, arcname=rel)
535
- buf.seek(0)
536
-
537
- api.upload_file(
538
- path_or_fileobj=buf,
539
- path_in_repo=zip_name,
540
- repo_id=rid,
541
- repo_type="dataset",
542
- )
543
- return {"ok": True, "repo_id": rid, "file": zip_name}
544
-
545
- # ------------------------------------------------------------------------------
546
- # Gradio UI
547
- # ------------------------------------------------------------------------------
548
- def _default_two_docs() -> List[Dict[str, str]]:
549
- a = ("Alpha bravo charlie delta echo foxtrot golf hotel india juliett kilo lima mike november oscar papa "
550
- "quebec romeo sierra tango uniform victor whiskey xray yankee zulu. ") * 5
551
- b = ("Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet "
552
- "dolore magna aliquam erat volutpat. ") * 5
553
- return [{"path": "a.txt", "text": a}, {"path": "b.txt", "text": b}]
554
-
555
- async def ui_wipe(project: str):
556
- try:
557
- resp = await wipe(project)
558
- return f"✅ Wipe ok — projet {resp['project_id']} vidé."
559
- except Exception as e:
560
- LOG.exception("wipe UI error")
561
- return f"❌ Wipe erreur: {e}"
562
-
563
- async def ui_index_sample(project: str, chunk_size: int, overlap: int, batch_size: int, store_text: bool):
564
- files = _default_two_docs()
565
- req = IndexRequest(
566
- project_id=project,
567
- files=[FileItem(**f) for f in files],
568
- chunk_size=chunk_size,
569
- overlap=overlap,
570
- batch_size=batch_size,
571
- store_text=store_text,
572
- )
573
- try:
574
- job = create_and_start_job(req)
575
- return f"🚀 Job lancé: {job.job_id}", job.job_id
576
- except ValidationError as ve:
577
- return f"❌ Payload invalide: {ve}", ""
578
- except Exception as e:
579
- LOG.exception("index UI error")
580
- return f"❌ Index erreur: {e}", ""
581
-
582
- async def ui_index_from_textarea(project: str, text1: str, text2: str, chunk_size: int, overlap: int, batch_size: int, store_text: bool):
583
- files = [
584
- {"path": "ui_text_1.txt", "text": text1 or ""},
585
- {"path": "ui_text_2.txt", "text": text2 or ""},
586
- ]
587
- req = IndexRequest(
588
- project_id=project,
589
- files=[FileItem(**f) for f in files],
590
- chunk_size=chunk_size,
591
- overlap=overlap,
592
- batch_size=batch_size,
593
- store_text=store_text,
594
- )
595
  try:
596
- job = create_and_start_job(req)
597
- return f"🚀 Job (textarea) lancé: {job.job_id}", job.job_id
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
598
  except Exception as e:
599
- LOG.exception("index-from-text UI error")
600
- return f"❌ Index (textarea) erreur: {e}", ""
 
 
 
 
601
 
602
- async def ui_status(job_id: str):
603
- if not job_id.strip():
604
- return "⚠️ Renseigne un job_id"
605
- try:
606
- st = await status(job_id)
607
- lines = [f"Job {st['job_id']} — stage={st['stage']} files={st['total_files']} chunks={st['total_chunks']} embedded={st['embedded']} indexed={st['indexed']}"]
608
- lines += st.get("messages", [])[-100:]
609
- if st.get("errors"):
610
- lines.append("Erreurs:")
611
- lines += [f" - {e}" for e in st['errors']]
612
- return "\n".join(lines)
613
- except Exception as e:
614
- return f"❌ Status erreur: {e}"
615
 
616
- async def ui_count(project: str):
 
 
617
  try:
618
- data = await coll_count(project)
619
- return f"📊 Count — project={project} → {data['count']} points" + (f" ({data.get('note')})" if 'note' in data else "")
620
- except Exception as e:
621
- LOG.exception("count UI error")
622
- return f"❌ Count erreur: {e}"
623
-
624
- async def ui_query(project: str, text: str, topk: int):
625
  try:
626
- data = await query(QueryRequest(project_id=project, text=text, top_k=topk))
627
- hits = data.get("result", [])
628
- if not hits:
629
- return "Aucun résultat."
630
- out = []
631
- for h in hits:
632
- out.append(f"{h['score']:.4f} — {h['path']} [chunk {h['chunk']}] — {h['preview']}…")
633
- return "\n".join(out)
634
  except Exception as e:
635
- LOG.exception("query UI error")
636
- return f"❌ Query erreur: {e}"
637
 
638
- async def ui_export(project: str, repo_id: str):
639
  try:
640
- resp = await export_hub(project, repo_id or None)
641
- return f"📤 Export → dataset repo={resp['repo_id']} file={resp['file']}"
642
  except Exception as e:
643
- LOG.exception("export UI error")
644
- return f"❌ Export erreur: {e}"
645
-
646
- with gr.Blocks(title="Remote Indexer — No-Qdrant (datasets+FAISS)", analytics_enabled=False) as ui:
647
- gr.Markdown("## 🧪 Remote Indexer — No-Qdrant (datasets+FAISS)\n"
648
- "Wipe Index 2 docs → Status → Count → Query\n"
649
- f"- **Embeddings**: `{EMB_PROVIDER}` (model: `{HF_EMBED_MODEL}`) "
650
- f"HF token présent: `{'oui' if bool(HF_TOKEN) else 'non'}` — Fallback dummy: `{'on' if EMB_FALLBACK_TO_DUMMY else 'off'}`\n"
651
- f"- **Data dir**: `{DATA_DIR}` — **Hub export**: `{'on' if (HF_DATASET_REPO and HfApi) else 'off'}`")
652
- with gr.Row():
653
- project_tb = gr.Textbox(label="Project ID", value="DEEPWEB")
654
- jobid_tb = gr.Textbox(label="Job ID", value="", interactive=True)
655
- with gr.Row():
656
- wipe_btn = gr.Button("🧨 Wipe project", variant="stop")
657
- index_btn = gr.Button("🚀 Indexer 2 documents (démo)", variant="primary")
658
- count_btn = gr.Button("📊 Count points", variant="secondary")
659
- with gr.Row():
660
- chunk_size = gr.Slider(64, 1024, value=200, step=8, label="chunk_size")
661
- overlap = gr.Slider(0, 256, value=20, step=2, label="overlap")
662
- batch_size = gr.Slider(1, 128, value=32, step=1, label="batch_size")
663
- store_text = gr.Checkbox(value=True, label="store_text (payload)")
664
- out_log = gr.Textbox(lines=18, label="Logs / Résultats", interactive=False)
665
-
666
- with gr.Accordion("Indexer depuis textarea (bypass fichiers)", open=False):
667
- txt1 = gr.Textbox(label="Texte 1", value="Ceci est un texte de test assez long pour produire des chunks. " * 10, lines=6)
668
- txt2 = gr.Textbox(label="Texte 2", value="Deuxième texte de test pour vérifier l'indexation et la recherche. " * 10, lines=6)
669
- index_txt_btn = gr.Button("📝 Indexer ces 2 textes")
670
-
671
- with gr.Row():
672
- status_btn = gr.Button("📡 Status (refresh)")
673
- auto_chk = gr.Checkbox(False, label="⏱️ Auto-refresh status (2 s)")
674
-
675
- with gr.Row():
676
- query_tb = gr.Textbox(label="Query text", value="alpha bravo")
677
- topk = gr.Slider(1, 20, value=5, step=1, label="top_k")
678
- query_btn = gr.Button("🔎 Query")
679
- query_out = gr.Textbox(lines=10, label="Résultats Query", interactive=False)
680
-
681
- with gr.Row():
682
- repo_tb = gr.Textbox(label="Hub dataset repo (ex: user/deepweb_vectors)", value=os.getenv("HF_DATASET_REPO", ""))
683
- export_btn = gr.Button("📤 Export to Hub", variant="secondary")
684
-
685
- wipe_btn.click(ui_wipe, inputs=[project_tb], outputs=[out_log])
686
- index_btn.click(ui_index_sample, inputs=[project_tb, chunk_size, overlap, batch_size, store_text], outputs=[out_log, jobid_tb])
687
- index_txt_btn.click(ui_index_from_textarea, inputs=[project_tb, txt1, txt2, chunk_size, overlap, batch_size, store_text], outputs=[out_log, jobid_tb])
688
- count_btn.click(ui_count, inputs=[project_tb], outputs=[out_log])
689
-
690
- status_btn.click(ui_status, inputs=[jobid_tb], outputs=[out_log])
691
- timer = gr.Timer(2.0, active=False)
692
- timer.tick(ui_status, inputs=[jobid_tb], outputs=[out_log])
693
- auto_chk.change(lambda x: gr.update(active=x), inputs=auto_chk, outputs=timer)
694
-
695
- query_btn.click(ui_query, inputs=[project_tb, query_tb, topk], outputs=[query_out])
696
-
697
- export_btn.click(ui_export, inputs=[project_tb, repo_tb], outputs=[out_log])
698
-
699
- # Monte l'UI
700
- app = gr.mount_gradio_app(fastapi_app, ui, path=UI_PATH)
701
-
702
  if __name__ == "__main__":
703
- port = int(os.getenv("PORT", "7860"))
704
- LOG.info(f"Démarrage Uvicorn sur 0.0.0.0:{port} (UI_PATH={UI_PATH})")
705
- uvicorn.run(app, host="0.0.0.0", port=port)
 
1
  # -*- coding: utf-8 -*-
 
2
  from __future__ import annotations
3
+
4
  import os
5
  import io
6
  import json
7
  import time
8
+ import tarfile
9
  import logging
10
+ import hashlib
11
+ from typing import Dict, Any, List, Tuple, Optional
 
12
 
13
  import numpy as np
14
+ import faiss
15
+ from fastapi import FastAPI, HTTPException, Body
16
  from fastapi.middleware.cors import CORSMiddleware
17
+ from fastapi.responses import JSONResponse, PlainTextResponse, StreamingResponse
18
+ from pydantic import BaseModel
 
19
 
20
+ import gradio as gr
 
21
 
22
+ # =============================================================================
23
+ # LOGGING
24
+ # =============================================================================
25
+ LOG = logging.getLogger("remote-indexer-space")
26
+ if not LOG.handlers:
27
+ h = logging.StreamHandler()
28
+ h.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
29
+ LOG.addHandler(h)
30
+ LOG.setLevel(logging.INFO)
31
+
32
+ # =============================================================================
33
+ # CONFIG
34
+ # =============================================================================
35
+ PORT = int(os.getenv("PORT", "7860"))
36
+ DATA_ROOT = os.getenv("DATA_ROOT", "/tmp/data") # persists for the lifetime of the Space container
37
+ os.makedirs(DATA_ROOT, exist_ok=True)
38
+
39
+ # Embeddings "provider"
40
+ # - "dummy": hash → vecteurs 128D (léger pour un Space Free)
41
+ # (An HF Transformers provider can be plugged in later if needed.)
42
+ EMB_PROVIDER = os.getenv("EMB_PROVIDER", "dummy").strip().lower()
43
+ EMB_DIM = int(os.getenv("EMB_DIM", "128"))
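The values above are read once at import time via os.getenv, so any override has to be in place before the module is loaded. A minimal sketch, with illustrative values rather than required defaults:

import os
# must be set before this module is imported, since the config is read at import time
os.environ["DATA_ROOT"] = "./data"      # writable directory for dataset + FAISS artifacts
os.environ["EMB_PROVIDER"] = "dummy"    # only the hash-based "dummy" provider is implemented here
os.environ["EMB_DIM"] = "256"           # dimension of the generated vectors (default 128)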
44
+
45
+ # =============================================================================
46
+ # JOB STATE
47
+ # =============================================================================
48
  class JobState(BaseModel):
49
  job_id: str
50
  project_id: str
51
+ stage: str = "pending" # pending -> chunking -> embedding -> indexing -> done/failed
52
  total_files: int = 0
53
  total_chunks: int = 0
54
  embedded: int = 0
55
  indexed: int = 0
56
+ errors: List[str] = []
57
+ messages: List[str] = []
58
+ started_at: float = time.time()
59
  finished_at: Optional[float] = None
60
 
 
 
 
 
 
 
61
  JOBS: Dict[str, JobState] = {}
 
62
 
63
+ def _now() -> str:
64
+ return time.strftime("%H:%M:%S")
65
+
66
+ def _proj_dirs(project_id: str) -> Tuple[str, str, str]:
67
+ base = os.path.join(DATA_ROOT, project_id)
68
+ ds_dir = os.path.join(base, "dataset")
69
+ fx_dir = os.path.join(base, "faiss")
70
+ os.makedirs(ds_dir, exist_ok=True)
71
+ os.makedirs(fx_dir, exist_ok=True)
72
+ return base, ds_dir, fx_dir
73
+
74
+ def _add_msg(st: JobState, msg: str):
75
+ st.messages.append(f"[{_now()}] {msg}")
76
+ LOG.info("[%s] %s", st.job_id, msg)
77
+
78
+ def _set_stage(st: JobState, stage: str):
79
+ st.stage = stage
80
+ _add_msg(st, f"stage={stage}")
81
+
82
+ # =============================================================================
83
+ # UTILS
84
+ # =============================================================================
85
+ def _chunk_text(text: str, size: int = 200, overlap: int = 20) -> List[str]:
86
+ text = (text or "").replace("\r\n", "\n")
87
+ tokens = list(text)
88
+ if size <= 0:
89
+ return [text] if text else []
90
+ if overlap < 0:
91
+ overlap = 0
92
+ chunks = []
93
  i = 0
94
+ while i < len(tokens):
95
+ j = min(i + size, len(tokens))
96
+ chunk = "".join(tokens[i:j]).strip()
97
+ if chunk:
98
+ chunks.append(chunk)
99
+ if j == len(tokens):
100
  break
101
+ i = j - overlap if (j - overlap) > i else j
102
+ return chunks
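For reference, a quick sanity check of the sliding-window chunking above (illustrative values, assuming _chunk_text is imported from this module):

demo = "abcdefghijklmnopqrst"              # 20 characters
print(_chunk_text(demo, size=10, overlap=3))
# ['abcdefghij', 'hijklmnopq', 'opqrst']   # consecutive chunks share `overlap` characters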
103
+
104
+ def _emb_dummy(texts: List[str], dim: int = EMB_DIM) -> np.ndarray:
105
+ # deterministic vectors seeded from a hash of the text
106
+ vecs = np.zeros((len(texts), dim), dtype="float32")
107
+ for i, t in enumerate(texts):
108
+ h = hashlib.sha1((t or "").encode("utf-8")).digest()
109
+ rng = np.random.default_rng(int.from_bytes(h[:8], "little", signed=False))
110
+ v = rng.standard_normal(dim).astype("float32")
111
+ # normalise to unit length
112
+ n = np.linalg.norm(v) + 1e-9
113
+ vecs[i] = v / n
114
+ return vecs
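A small check of the two properties the rest of the code relies on: the vectors are deterministic (same text, same vector) and unit-normalised, which is what makes inner product equivalent to cosine similarity later. Sketch assuming _emb_dummy is imported from this module:

import numpy as np
a = _emb_dummy(["alpha bravo"], dim=128)
b = _emb_dummy(["alpha bravo"], dim=128)
assert np.allclose(a, b)                              # same SHA-1 seed -> same vector
assert abs(float(np.linalg.norm(a[0])) - 1.0) < 1e-5  # unit norm -> inner product == cosine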
115
+
116
+ def _save_dataset(ds_dir: str, rows: List[Dict[str, Any]]):
117
+ # Simple format: a JSONL file plus a JSON manifest
118
+ os.makedirs(ds_dir, exist_ok=True)
119
+ data_path = os.path.join(ds_dir, "data.jsonl")
120
+ with open(data_path, "w", encoding="utf-8") as f:
121
+ for r in rows:
122
+ f.write(json.dumps(r, ensure_ascii=False) + "\n")
123
+ meta = {"format": "jsonl", "columns": ["path", "text", "chunk_id"], "count": len(rows)}
124
+ with open(os.path.join(ds_dir, "meta.json"), "w", encoding="utf-8") as f:
125
+ json.dump(meta, f, ensure_ascii=False, indent=2)
126
+
127
+ def _load_dataset(ds_dir: str) -> List[Dict[str, Any]]:
128
+ data_path = os.path.join(ds_dir, "data.jsonl")
129
+ if not os.path.isfile(data_path):
130
+ return []
131
+ out = []
132
+ with open(data_path, "r", encoding="utf-8") as f:
133
+ for line in f:
134
+ try:
135
+ out.append(json.loads(line))
136
+ except Exception:
137
+ continue
 
138
  return out
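The on-disk layout these two helpers produce is deliberately simple; an illustrative sketch (paths assume the default DATA_ROOT and the demo project id, and are not fixed by the code):

# <DATA_ROOT>/<project_id>/dataset/data.jsonl  — one JSON object per chunk, e.g.
#   {"path": "sample.txt", "text": "Alpha bravo charlie...", "chunk_id": 0}
# <DATA_ROOT>/<project_id>/dataset/meta.json   — {"format": "jsonl", "columns": [...], "count": N}
rows = _load_dataset("/tmp/data/DEEPWEB/dataset")
for r in rows[:3]:
    print(r["path"], r["chunk_id"], r["text"][:40])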
139
 
140
+ def _save_faiss(fx_dir: str, xb: np.ndarray):
141
+ os.makedirs(fx_dir, exist_ok=True)
142
+ idx_path = os.path.join(fx_dir, "emb.faiss")
143
+ index = faiss.IndexFlatIP(xb.shape[1]) # cosine ~ inner product when vectors are normalised
144
+ # the _emb_dummy embeddings are already normalised
145
+ index.add(xb)
146
+ faiss.write_index(index, idx_path)
147
+ with open(os.path.join(fx_dir, "meta.json"), "w", encoding="utf-8") as f:
148
+ json.dump({"dim": xb.shape[1], "count": int(index.ntotal), "provider": EMB_PROVIDER}, f)
149
+
150
+ def _load_faiss(fx_dir: str) -> faiss.Index:
151
+ idx_path = os.path.join(fx_dir, "emb.faiss")
152
+ if not os.path.isfile(idx_path):
153
+ raise FileNotFoundError(f"FAISS index introuvable: {idx_path}")
154
+ return faiss.read_index(idx_path)
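Because the stored vectors are unit-normalised, IndexFlatIP scores are cosine similarities. A sketch of querying the saved index directly, assuming one has already been built for the demo project:

import os
index = _load_faiss(os.path.join(DATA_ROOT, "DEEPWEB", "faiss"))
q = _emb_dummy(["alpha bravo"], dim=EMB_DIM)   # shape (1, EMB_DIM), float32, unit norm
scores, ids = index.search(q, 5)               # returns (scores, row ids), best match first
print(list(zip(ids[0].tolist(), scores[0].tolist())))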
155
+
156
+ def _tar_dir_to_bytes(dir_path: str) -> bytes:
157
+ bio = io.BytesIO()
158
+ with tarfile.open(fileobj=bio, mode="w:gz") as tar:
159
+ tar.add(dir_path, arcname=os.path.basename(dir_path))
160
+ bio.seek(0)
161
+ return bio.read()
162
+
163
+ # =============================================================================
164
+ # FASTAPI
165
+ # =============================================================================
166
+ fastapi_app = FastAPI(title="remote-indexer-min", version="1.0.0")
 
 
167
  fastapi_app.add_middleware(
168
+ CORSMiddleware,
169
+ allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"],
170
  )
171
 
172
+ class FileItem(BaseModel):
173
+ path: str
174
+ text: str
175
+
176
+ class IndexRequest(BaseModel):
177
+ project_id: str
178
+ files: List[FileItem]
179
+ chunk_size: int = 200
180
+ overlap: int = 20
181
+ batch_size: int = 32
182
+ store_text: bool = True
183
+
184
  @fastapi_app.get("/health")
185
+ def health():
186
+ return {"ok": True, "service": "remote-indexer-min", "emb_provider": EMB_PROVIDER}
 
 
 
 
 
 
 
 
 
 
187
 
188
  @fastapi_app.get("/")
189
+ def root_redirect():
190
+ # small status payload + pointer to /ui
191
+ return {"ok": True, "service": "remote-indexer-min", "ui": "/ui"}
 
 
 
 
 
 
 
 
192
 
193
  @fastapi_app.post("/index")
194
+ def index(req: IndexRequest):
195
+ job_id = hashlib.sha1(f"{req.project_id}{time.time()}".encode()).hexdigest()[:12]
196
+ st = JobState(job_id=job_id, project_id=req.project_id, stage="pending", messages=[])
197
+ JOBS[job_id] = st
198
+ _add_msg(st, f"Job {job_id} créé pour project {req.project_id}")
199
+ _add_msg(st, f"Index start project={req.project_id} files={len(req.files)} chunk_size={req.chunk_size} overlap={req.overlap} batch_size={req.batch_size} store_text={req.store_text}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  try:
201
+ base, ds_dir, fx_dir = _proj_dirs(req.project_id)
202
+
203
+ # 1) Chunking
204
+ _set_stage(st, "chunking")
205
+ rows: List[Dict[str, Any]] = []
206
+ st.total_files = len(req.files)
207
+ for it in req.files:
208
+ txt = it.text or ""
209
+ chunks = _chunk_text(txt, size=req.chunk_size, overlap=req.overlap)
210
+ _add_msg(st, f"{it.path}: len(text)={len(txt)} chunks={len(chunks)}")
211
+ for ci, ck in enumerate(chunks):
212
+ rows.append({"path": it.path, "text": ck, "chunk_id": ci})
213
+ st.total_chunks = len(rows)
214
+ _add_msg(st, f"Total chunks = {st.total_chunks}")
215
+
216
+ # 2) Embedding
217
+ _set_stage(st, "embedding")
218
+ if EMB_PROVIDER == "dummy":
219
+ xb = _emb_dummy([r["text"] for r in rows], dim=EMB_DIM)
220
+ else:
221
+ # fall back to dummy until another provider is wired in
222
+ xb = _emb_dummy([r["text"] for r in rows], dim=EMB_DIM)
223
+ st.embedded = xb.shape[0]
224
+ _add_msg(st, f"Embeddings {st.embedded}/{st.total_chunks}")
225
+ _add_msg(st, f"Embeddings dim={xb.shape[1]}")
226
+
227
+ # 3) Save dataset (text)
228
+ _save_dataset(ds_dir, rows)
229
+ _add_msg(st, f"Dataset (sans index) sauvegardé dans {ds_dir}")
230
+
231
+ # 4) FAISS
232
+ _set_stage(st, "indexing")
233
+ _save_faiss(fx_dir, xb)
234
+ st.indexed = int(xb.shape[0])
235
+ _add_msg(st, f"FAISS écrit sur {os.path.join(fx_dir, 'emb.faiss')}")
236
+ _add_msg(st, f"OK — dataset+index prêts (projet={req.project_id})")
237
+
238
+ _set_stage(st, "done")
239
+ st.finished_at = time.time()
240
+ return {"job_id": job_id}
241
  except Exception as e:
242
+ LOG.exception("index failed")
243
+ st.errors.append(str(e))
244
+ _add_msg(st, f"❌ Exception: {e}")
245
+ st.stage = "failed"
246
+ st.finished_at = time.time()
247
+ raise HTTPException(status_code=500, detail=str(e))
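Note that this endpoint does all the work synchronously, so the job is already in its final state (done or failed) by the time the response arrives. A client-side sketch, where the base URL is a placeholder for the actual Space:

import requests
BASE = "https://<user>-<space>.hf.space"   # placeholder
payload = {
    "project_id": "DEEPWEB",
    "files": [{"path": "a.txt", "text": "Alpha bravo charlie delta echo foxtrot. " * 20}],
    "chunk_size": 200,
    "overlap": 20,
}
job_id = requests.post(f"{BASE}/index", json=payload, timeout=60).json()["job_id"]
st = requests.get(f"{BASE}/status/{job_id}", timeout=30).json()
print(st["stage"], st["total_chunks"], st["indexed"])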
248
 
249
+ @fastapi_app.get("/status/{job_id}")
250
+ def status(job_id: str):
251
+ st = JOBS.get(job_id)
252
+ if not st:
253
+ raise HTTPException(status_code=404, detail="job inconnu")
254
+ return JSONResponse(st.model_dump())
 
255
 
256
+ class SearchRequest(BaseModel):
257
+ project_id: str
258
+ query: str
259
+ k: int = 5
260
+
261
+ @fastapi_app.post("/search")
262
+ def search(req: SearchRequest):
263
+ base, ds_dir, fx_dir = _proj_dirs(req.project_id)
264
+ rows = _load_dataset(ds_dir)
265
+ if not rows:
266
+ raise HTTPException(status_code=404, detail="dataset introuvable (index pas encore construit ?)")
267
+
268
+ # embed the query (dummy provider)
269
+ q = _emb_dummy([req.query], dim=EMB_DIM)[0:1, :] # shape (1, d)
270
+
271
+ # faiss
272
+ index = _load_faiss(fx_dir)
273
+ if index.d != q.shape[1]:
274
+ raise HTTPException(status_code=500, detail=f"dim incompatibles: index.d={index.d} vs query={q.shape[1]}")
275
+ scores, ids = index.search(q, int(max(1, req.k)))
276
+ ids = ids[0].tolist()
277
+ scores = scores[0].tolist()
278
+
279
+ # assemble results
280
+ out = []
281
+ for idx, sc in zip(ids, scores):
282
+ if idx < 0 or idx >= len(rows):
283
+ continue
284
+ r = rows[idx]
285
+ out.append({"path": r.get("path"), "text": r.get("text"), "score": float(sc)})
286
+ return {"results": out}
287
+
288
+ # ----------- ARTIFACTS EXPORT (the endpoints whose absence caused the 404) -----------
289
+ @fastapi_app.get("/artifacts/{project_id}/dataset")
290
+ def download_dataset(project_id: str):
291
+ base, ds_dir, _ = _proj_dirs(project_id)
292
+ if not os.path.isdir(ds_dir):
293
+ raise HTTPException(status_code=404, detail="Dataset introuvable")
294
+ buf = _tar_dir_to_bytes(ds_dir)
295
+ headers = {
296
+ "Content-Disposition": f'attachment; filename="{project_id}_dataset.tgz"'
297
+ }
298
+ return StreamingResponse(io.BytesIO(buf), media_type="application/gzip", headers=headers)
299
+
300
+ @fastapi_app.get("/artifacts/{project_id}/faiss")
301
+ def download_faiss(project_id: str):
302
+ base, _, fx_dir = _proj_dirs(project_id)
303
+ if not os.path.isdir(fx_dir):
304
+ raise HTTPException(status_code=404, detail="FAISS introuvable")
305
+ buf = _tar_dir_to_bytes(fx_dir)
306
+ headers = {
307
+ "Content-Disposition": f'attachment; filename="{project_id}_faiss.tgz"'
308
+ }
309
+ return StreamingResponse(io.BytesIO(buf), media_type="application/gzip", headers=headers)
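Both artifact endpoints stream a gzipped tar of the corresponding project directory; a sketch of downloading and unpacking the FAISS artifact on the client side (placeholder base URL):

import io, tarfile, requests
BASE = "https://<user>-<space>.hf.space"   # placeholder
resp = requests.get(f"{BASE}/artifacts/DEEPWEB/faiss", timeout=60)
resp.raise_for_status()
with tarfile.open(fileobj=io.BytesIO(resp.content), mode="r:gz") as tar:
    tar.extractall("restored")             # yields restored/faiss/emb.faiss and meta.json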
310
+
311
+ # =============================================================================
312
+ # GRADIO (optional UI to trigger / test quickly)
313
+ # =============================================================================
314
+ def _ui_index(project_id: str, sample_text: str):
315
+ files = [{"path": "sample.txt", "text": sample_text}]
316
+ from pydantic import ValidationError
317
  try:
318
+ req = IndexRequest(project_id=project_id, files=[FileItem(**f) for f in files])
319
+ except ValidationError as e:
320
+ return f"Erreur: {e}"
 
 
 
 
321
  try:
322
+ res = index(req)
323
+ return f"Job lancé: {res['job_id']}"
 
 
 
 
 
 
324
  except Exception as e:
325
+ return f"Erreur index: {e}"
 
326
 
327
+ def _ui_search(project_id: str, query: str, k: int):
328
  try:
329
+ res = search(SearchRequest(project_id=project_id, query=query, k=int(k)))
330
+ return json.dumps(res, ensure_ascii=False, indent=2)
331
  except Exception as e:
332
+ return f"Erreur search: {e}"
333
+
334
+ with gr.Blocks(title="Remote Indexer (FAISS mini)", analytics_enabled=False) as ui:
335
+ gr.Markdown("## Remote Indexer — demo UI (les vraies API sont sur `/index`, `/status/{job}`, `/search`, `/artifacts/...`).")
336
+ with gr.Tab("Index"):
337
+ pid = gr.Textbox(label="Project ID", value="DEEPWEB")
338
+ sample = gr.Textbox(label="Texte d’exemple", value="Alpha bravo charlie delta echo foxtrot.", lines=4)
339
+ btn = gr.Button("Lancer index (sample)")
340
+ out = gr.Textbox(label="Résultat")
341
+ btn.click(_ui_index, inputs=[pid, sample], outputs=[out])
342
+
343
+ with gr.Tab("Search"):
344
+ pid2 = gr.Textbox(label="Project ID", value="DEEPWEB")
345
+ q = gr.Textbox(label="Query", value="alpha")
346
+ k = gr.Slider(1, 20, value=5, step=1, label="k")
347
+ btn2 = gr.Button("Rechercher")
348
+ out2 = gr.Code(label="Résultats")
349
+ btn2.click(_ui_search, inputs=[pid2, q, k], outputs=[out2])
350
+
351
+ fastapi_app = gr.mount_gradio_app(fastapi_app, ui, path="/ui")
352
+
353
+ # =============================================================================
354
+ # MAIN (the HF Space launches this module with python -u main.py)
355
+ # =============================================================================
 
356
  if __name__ == "__main__":
357
+ import uvicorn
358
+ LOG.info("Démarrage Uvicorn sur 0.0.0.0:%s (UI_PATH=/ui)", PORT)
359
+ uvicorn.run(fastapi_app, host="0.0.0.0", port=PORT)