chouchouvs committed
Commit 073e326 · verified · 1 Parent(s): 849d996

Update main.py

Files changed (1):
main.py +241 -915
main.py CHANGED
@@ -1,937 +1,263 @@
  # -*- coding: utf-8 -*-
  from __future__ import annotations
-
  import os
  import time
- import uuid
- import random
  import logging
- import hashlib
- import re
- import json
- from typing import List, Optional, Dict, Any, Tuple
-
- import numpy as np
- import requests
- from fastapi import FastAPI, BackgroundTasks, Header, HTTPException, Query
- from pydantic import BaseModel, Field
-
- # ======================================================================================
- # Logging
- # ======================================================================================
- logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(name)s:%(message)s")
- LOG = logging.getLogger("remote_indexer")
-
- # ======================================================================================
- # ENV (config)
- # ======================================================================================
-
- # Order of the embedding backends. E.g.: "deepinfra,hf"
- EMB_BACKEND_ORDER = [
-     s.strip().lower()
-     for s in os.getenv("EMB_BACKEND_ORDER", os.getenv("EMB_BACKEND", "deepinfra,hf")).split(",")
-     if s.strip()
- ]
-
- # --- DeepInfra Embeddings (OpenAI-like) ---
- DI_TOKEN = os.getenv("DEEPINFRA_API_KEY", "").strip()
- DI_MODEL = os.getenv("DEEPINFRA_EMBED_MODEL", "BAAI/bge-m3").strip()
- DI_URL = os.getenv("DEEPINFRA_EMBED_URL", "https://api.deepinfra.com/v1/openai/embeddings").strip()
- DI_TIMEOUT = float(os.getenv("EMB_TIMEOUT_SEC", "120"))
-
- # --- Hugging Face Inference API ---
- HF_TOKEN = os.getenv("HF_API_TOKEN", "").strip()
- HF_MODEL = os.getenv("HF_EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2").strip()
- HF_URL_PIPE = os.getenv("HF_API_URL_PIPELINE", "").strip() or (
-     f"https://api-inference.huggingface.co/pipeline/feature-extraction/{HF_MODEL}"
  )
- HF_URL_MODL = os.getenv("HF_API_URL_MODELS", "").strip() or (
-     f"https://api-inference.huggingface.co/models/{HF_MODEL}"
- )
- HF_TIMEOUT = float(os.getenv("EMB_TIMEOUT_SEC", "120"))
- HF_WAIT = os.getenv("HF_WAIT_FOR_MODEL", "true").lower() in ("1", "true", "yes", "on")
-
- # --- Retries / backoff ---
- RETRY_MAX = int(os.getenv("EMB_RETRY_MAX", "6"))
- RETRY_BASE_SEC = float(os.getenv("EMB_RETRY_BASE", "1.6"))
- RETRY_JITTER = float(os.getenv("EMB_RETRY_JITTER", "0.35"))
-
- # --- Vector store (Qdrant / Memory fallback) ---
- VECTOR_STORE = os.getenv("VECTOR_STORE", "qdrant").strip().lower()
- QDRANT_URL = os.getenv("QDRANT_URL", "").strip()
- QDRANT_API = os.getenv("QDRANT_API_KEY", "").strip()
-
- # Deterministic IDs enabled?
- QDRANT_DETERMINISTIC_IDS = os.getenv("QDRANT_DETERMINISTIC_IDS", "true").lower() in ("1", "true", "yes", "on")
- QDRANT_ID_MODE = os.getenv("QDRANT_ID_MODE", "uuid").strip().lower()  # uuid|int
-
- # Automatic wipe before each /index (optional)
- WIPE_BEFORE_INDEX = os.getenv("WIPE_BEFORE_INDEX", "false").lower() in ("1", "true", "yes", "on")
-
- # --- API auth for this service (simple header) ---
- AUTH_TOKEN = os.getenv("REMOTE_INDEX_TOKEN", "").strip()
-
- LOG.info(f"Embeddings backend order = {EMB_BACKEND_ORDER}")
- LOG.info(f"HF pipeline URL = {HF_URL_PIPE}")
- LOG.info(f"HF models URL = {HF_URL_MODL}")
- LOG.info(f"VECTOR_STORE = {VECTOR_STORE}")
-
- if "deepinfra" in EMB_BACKEND_ORDER and not DI_TOKEN:
-     LOG.warning("DEEPINFRA_API_KEY manquant — tentatives DeepInfra échoueront.")
- if "hf" in EMB_BACKEND_ORDER and not HF_TOKEN:
-     LOG.warning("HF_API_TOKEN manquant — tentatives HF échoueront.")
-
- # ======================================================================================
- # Vector Stores (Memory + Qdrant)
- # ======================================================================================
  try:
-     from qdrant_client import QdrantClient
-     from qdrant_client.http.models import VectorParams, Distance, PointStruct
  except Exception:
-     QdrantClient = None
-     PointStruct = None
-
- class BaseStore:
-     def ensure_collection(self, name: str, dim: int): ...
-     def upsert(self, name: str, vectors: np.ndarray, payloads: List[Dict[str, Any]]) -> int: ...
-     def search(self, name: str, query_vec: np.ndarray, top_k: int) -> List[Dict[str, Any]]: ...
-     def wipe(self, name: str): ...
-
- class MemoryStore(BaseStore):
-     """In-memory store (volatile) — fallback/tests."""
-     def __init__(self):
-         self.db: Dict[str, Dict[str, Any]] = {}  # name -> {"vecs": [np.ndarray], "payloads": [dict], "dim": int}
-
-     def ensure_collection(self, name: str, dim: int):
-         self.db.setdefault(name, {"vecs": [], "payloads": [], "dim": dim})
-
-     def upsert(self, name: str, vectors: np.ndarray, payloads: List[Dict[str, Any]]) -> int:
-         if name not in self.db:
-             raise RuntimeError(f"MemoryStore: collection {name} inconnue")
-         if len(vectors) != len(payloads):
-             raise ValueError("MemoryStore.upsert: tailles vectors/payloads incohérentes")
-         self.db[name]["vecs"].extend([np.asarray(v, dtype=np.float32) for v in vectors])
-         self.db[name]["payloads"].extend(payloads)
-         return len(vectors)
-
-     def search(self, name: str, query_vec: np.ndarray, top_k: int) -> List[Dict[str, Any]]:
-         if name not in self.db or not self.db[name]["vecs"]:
-             return []
-         mat = np.vstack(self.db[name]["vecs"]).astype(np.float32)  # [N, dim]
-         q = query_vec.reshape(1, -1).astype(np.float32)
-         sims = (mat @ q.T).ravel()  # cosine (embeddings normalized upstream)
-         top_idx = np.argsort(-sims)[:top_k]
-         out = []
-         for i in top_idx:
-             pl = dict(self.db[name]["payloads"][i]); pl["_score"] = float(sims[i])
-             out.append(pl)
-         return out
-
-     def wipe(self, name: str):
-         self.db.pop(name, None)
-
- def _stable_point_id_uuid(collection: str, payload: Dict[str, Any]) -> str:
-     """
-     Deterministic UUID v5: uuid5(NAMESPACE_URL, 'collection|path|chunk|start|end|BLAKE8(text)')
-     """
-     path = str(payload.get("path", ""))
-     chunk = str(payload.get("chunk", ""))
-     start = str(payload.get("start", ""))
-     end = str(payload.get("end", ""))
-     text = payload.get("text", "")
-     # short hash of the text to stabilize the fingerprint without concatenating everything
-     h = hashlib.blake2b((text or "").encode("utf-8", "ignore"), digest_size=8).hexdigest()
-     base = f"{collection}|{path}|{chunk}|{start}|{end}|{h}"
-     return str(uuid.uuid5(uuid.NAMESPACE_URL, base))
-
- class QdrantStore(BaseStore):
-     """Qdrant store — deterministic UUID ids (default) or sequential integers."""
-     def __init__(self, url: str, api_key: Optional[str] = None,
-                  deterministic_ids: bool = True, id_mode: str = "uuid"):
-         if QdrantClient is None or PointStruct is None:
-             raise RuntimeError("qdrant_client non disponible")
-         self.client = QdrantClient(url=url, api_key=api_key if api_key else None)
-         self._next_ids: Dict[str, int] = {}
-         self._deterministic = deterministic_ids
-         self._id_mode = id_mode if id_mode in ("uuid", "int") else "uuid"
-
-     def _init_next_id(self, name: str):
-         try:
-             cnt = self.client.count(collection_name=name, exact=True).count
-         except Exception:
-             cnt = 0
-         self._next_ids[name] = int(cnt)
-
-     def ensure_collection(self, name: str, dim: int):
-         try:
-             self.client.get_collection(name)
-         except Exception:
-             self.client.create_collection(
-                 collection_name=name,
-                 vectors_config=VectorParams(size=dim, distance=Distance.COSINE),
-             )
-         if name not in self._next_ids:
-             self._init_next_id(name)
-
-     def upsert(self, name: str, vectors: np.ndarray, payloads: List[Dict[str, Any]]) -> int:
-         if vectors is None or len(vectors) == 0:
-             return 0
-         if len(vectors) != len(payloads):
-             raise ValueError("QdrantStore.upsert: tailles vectors/payloads incohérentes")
-
-         points: List[PointStruct] = []
-         added = 0
-
-         if self._deterministic and self._id_mode == "uuid":
-             # deterministic UUIDs => OK on Qdrant Cloud, replaces existing points
-             seen = set()
-             for v, pl in zip(vectors, payloads):
-                 pid = _stable_point_id_uuid(name, pl)
-                 if pid in seen:
-                     continue  # intra-batch dedup
-                 seen.add(pid)
-                 points.append(PointStruct(id=pid,
-                                           vector=np.asarray(v, dtype=np.float32).tolist(),
-                                           payload=pl))
-             if points:
-                 self.client.upsert(collection_name=name, points=points)
-                 added = len(points)
-
-         elif self._deterministic and self._id_mode == "int":
-             # deterministic ints (not very useful if the goal is replacement)
-             seen = set()
-             for v, pl in zip(vectors, payloads):
-                 pid_str = _stable_point_id_uuid(name, pl)
-                 pid_int = uuid.UUID(pid_str).int >> 64
-                 if pid_int in seen:
-                     continue
-                 seen.add(pid_int)
-                 points.append(PointStruct(id=int(pid_int),
-                                           vector=np.asarray(v, dtype=np.float32).tolist(),
-                                           payload=pl))
-             if points:
-                 self.client.upsert(collection_name=name, points=points)
-                 added = len(points)
-
-         else:
-             # sequential IDs -> append-only
-             if name not in self._next_ids:
-                 self._init_next_id(name)
-             start = self._next_ids[name]
-             for i, (v, pl) in enumerate(zip(vectors, payloads)):
-                 points.append(PointStruct(id=start + i,
-                                           vector=np.asarray(v, dtype=np.float32).tolist(),
-                                           payload=pl))
-             if points:
-                 self.client.upsert(collection_name=name, points=points)
-                 added = len(points)
-             self._next_ids[name] += added
-
-         LOG.debug(f"QdrantStore.upsert: +{added} points (deterministic={self._deterministic}, mode={self._id_mode})")
-         return added
-
-     def search(self, name: str, query_vec: np.ndarray, top_k: int) -> List[Dict[str, Any]]:
-         qv = query_vec[0].astype(np.float32).tolist() if query_vec.ndim == 2 else query_vec.astype(np.float32).tolist()
-         res = self.client.search(collection_name=name, query_vector=qv, limit=int(top_k))
-         out = []
-         for p in res:
-             pl = p.payload or {}
-             try:
-                 pl["_score"] = float(p.score)
-             except Exception:
-                 pl["_score"] = None
-             out.append(pl)
-         return out
-
-     def wipe(self, name: str):
-         try:
-             self.client.delete_collection(name)
-         except Exception:
-             pass
-         self._next_ids.pop(name, None)
-
- # Initialize the active store
  try:
-     if VECTOR_STORE == "qdrant" and QDRANT_URL:
-         STORE: BaseStore = QdrantStore(
-             QDRANT_URL,
-             api_key=QDRANT_API if QDRANT_API else None,
-             deterministic_ids=QDRANT_DETERMINISTIC_IDS,
-             id_mode=QDRANT_ID_MODE,
-         )
-         _ = STORE.client.get_collections()  # ping
-         LOG.info("Connecté à Qdrant.")
-         VECTOR_STORE_ACTIVE = "QdrantStore"
      else:
-         raise RuntimeError("Qdrant non configuré, fallback mémoire.")
- except Exception as e:
-     LOG.error(f"Qdrant indisponible (Connexion Qdrant impossible: {e}) — fallback en mémoire.")
-     STORE = MemoryStore()
-     VECTOR_STORE_ACTIVE = "MemoryStore"
-     LOG.warning("Vector store: MEMORY (fallback). Les données sont volatiles (perdues au restart).")
-
- # ======================================================================================
- # Pydantic I/O
- # ======================================================================================
-
- class FileIn(BaseModel):
-     path: Optional[str] = ""  # major tolerance: accepts None
-     text: Optional[str] = ""  # same
-
- class IndexRequest(BaseModel):
-     project_id: str = Field(..., min_length=1)
-     files: List[FileIn]
-     chunk_size: int = 1200
-     overlap: int = 200
-     batch_size: int = 8
-     store_text: bool = True
-
- class QueryRequest(BaseModel):
-     project_id: str
-     query: str
-     top_k: int = 6
-
- class StatusBody(BaseModel):
-     job_id: str
-
- # ======================================================================================
- # Jobs store (in memory)
- # ======================================================================================
- JOBS: Dict[str, Dict[str, Any]] = {}  # {job_id: {"status": "...", "logs": [...], "created": ts}}
-
- def _append_log(job_id: str, line: str):
-     job = JOBS.get(job_id)
-     if job:
-         job["logs"].append(line)
-
- def _set_status(job_id: str, status: str):
-     job = JOBS.get(job_id)
-     if job:
-         job["status"] = status
-
- def _auth(x_auth: Optional[str]):
-     if AUTH_TOKEN and (x_auth or "") != AUTH_TOKEN:
-         raise HTTPException(401, "Unauthorized")
-
- # ======================================================================================
- # Embeddings backends + retry/fallback
- # ======================================================================================
-
- def _retry_sleep(attempt: int) -> float:
-     back = (RETRY_BASE_SEC ** attempt)
-     jitter = 1.0 + random.uniform(-RETRY_JITTER, RETRY_JITTER)
-     return max(0.25, back * jitter)
-
- def _normalize_rows(arr: np.ndarray) -> np.ndarray:
-     arr = np.asarray(arr, dtype=np.float32)
-     norms = np.linalg.norm(arr, axis=1, keepdims=True) + 1e-12
-     return (arr / norms).astype(np.float32)
-
- def _di_post_embeddings_once(batch: List[str]) -> Tuple[np.ndarray, int]:
-     if not DI_TOKEN:
-         raise RuntimeError("DEEPINFRA_API_KEY manquant (backend=deepinfra).")
-     headers = {"Authorization": f"Bearer {DI_TOKEN}", "Content-Type": "application/json"}
-     payload = {"model": DI_MODEL, "input": batch}
-     r = requests.post(DI_URL, headers=headers, json=payload, timeout=DI_TIMEOUT)
-     size = int(r.headers.get("Content-Length", "0") or 0)
-     if r.status_code >= 400:
-         LOG.error(f"DeepInfra error {r.status_code}: {r.text[:1000]}")
-     r.raise_for_status()
-     js = r.json()
-     data = js.get("data")
-     if not isinstance(data, list) or not data:
-         raise RuntimeError(f"DeepInfra embeddings: réponse invalide {js}")
-     embs = [d.get("embedding") for d in data]
-     arr = np.asarray(embs, dtype=np.float32)
-     if arr.ndim != 2:
-         raise RuntimeError(f"DeepInfra: unexpected embeddings shape: {arr.shape}")
-     return _normalize_rows(arr), size
-
- def _hf_post_embeddings_once(batch: List[str]) -> Tuple[np.ndarray, int]:
-     if not HF_TOKEN:
-         raise RuntimeError("HF_API_TOKEN manquant (backend=hf).")
-     headers = {
-         "Authorization": f"Bearer {HF_TOKEN}",
-         "Content-Type": "application/json",
-     }
-     if HF_WAIT:
-         headers["X-Wait-For-Model"] = "true"
-         headers["X-Use-Cache"] = "true"
-
-     def _call(url: str, payload: Dict[str, Any], extra_headers: Optional[Dict[str, str]] = None):
-         h = dict(headers)
-         if extra_headers:
-             h.update(extra_headers)
-         r = requests.post(url, headers=h, json=payload, timeout=HF_TIMEOUT)
-         return r
-
-     payload = {"inputs": batch if len(batch) > 1 else batch[0]}
-     r = _call(HF_URL_PIPE, payload)
-     size = int(r.headers.get("Content-Length", "0") or 0)
-     if r.status_code == 404:
-         LOG.error("HF error 404: Not Found")
-         LOG.warning(f"HF endpoint {HF_URL_PIPE} non dispo (404), fallback vers alternative ...")
-     elif r.status_code >= 400:
-         LOG.error(f"HF error {r.status_code}: {r.text[:1000]}")
-         r.raise_for_status()
-     else:
-         data = r.json()
-         arr = np.array(data, dtype=np.float32)
-         if arr.ndim == 3:
-             arr = arr.mean(axis=1)
-         if arr.ndim == 1:
-             arr = arr.reshape(1, -1)
-         if arr.ndim != 2:
-             raise RuntimeError(f"HF: unexpected embeddings shape: {arr.shape}")
-         return _normalize_rows(arr), size
-
-     r2 = _call(HF_URL_MODL, payload)
-     size2 = int(r2.headers.get("Content-Length", "0") or 0)
-     if r2.status_code >= 400:
-         LOG.error(f"HF error {r2.status_code}: {r2.text[:1000]}")
-         if r2.status_code == 400 and "SentenceSimilarityPipeline" in (r2.text or ""):
-             LOG.warning("HF MODELS a choisi Similarity -> retry avec ?task=feature-extraction + X-Task")
-             r3 = _call(
-                 HF_URL_MODL + "?task=feature-extraction",
-                 payload,
-                 extra_headers={"X-Task": "feature-extraction"}
-             )
-             size3 = int(r3.headers.get("Content-Length", "0") or 0)
-             if r3.status_code >= 400:
-                 LOG.error(f"HF error {r3.status_code}: {r3.text[:1000]}")
-                 r3.raise_for_status()
-             data3 = r3.json()
-             arr3 = np.array(data3, dtype=np.float32)
-             if arr3.ndim == 3:
-                 arr3 = arr3.mean(axis=1)
-             if arr3.ndim == 1:
-                 arr3 = arr3.reshape(1, -1)
-             if arr3.ndim != 2:
-                 raise RuntimeError(f"HF: unexpected embeddings shape: {arr3.shape}")
-             return _normalize_rows(arr3), size3
-         else:
-             r2.raise_for_status()
-     data2 = r2.json()
-     arr2 = np.array(data2, dtype=np.float32)
-     if arr2.ndim == 3:
-         arr2 = arr2.mean(axis=1)
-     if arr2.ndim == 1:
-         arr2 = arr2.reshape(1, -1)
-     if arr2.ndim != 2:
-         raise RuntimeError(f"HF: unexpected embeddings shape: {arr2.shape}")
-     return _normalize_rows(arr2), size2
-
- def _call_with_retries(func, batch: List[str], label: str, job_id: Optional[str] = None) -> Tuple[np.ndarray, int]:
-     last_exc = None
-     for attempt in range(RETRY_MAX):
-         try:
-             if job_id:
-                 _append_log(job_id, f"{label}: try {attempt+1}/{RETRY_MAX} (batch={len(batch)})")
-             return func(batch)
-         except requests.HTTPError as he:
-             code = he.response.status_code if he.response is not None else "HTTP"
-             retriable = code in (429, 500, 502, 503, 504)
-             if not retriable:
-                 raise
-             sleep_s = _retry_sleep(attempt)
-             msg = f"{label}: HTTP {code}, retry in {sleep_s:.1f}s"
-             LOG.warning(msg); _append_log(job_id, msg)
-             time.sleep(sleep_s)
-             last_exc = he
-         except Exception as e:
-             sleep_s = _retry_sleep(attempt)
-             msg = f"{label}: error {type(e).__name__}: {e}, retry in {sleep_s:.1f}s"
-             LOG.warning(msg); _append_log(job_id, msg)
-             time.sleep(sleep_s)
-             last_exc = e
-     raise RuntimeError(f"{label}: retries exhausted: {last_exc}")
-
- def _post_embeddings(batch: List[str], job_id: Optional[str] = None) -> Tuple[np.ndarray, int]:
-     last_err = None
-     for b in EMB_BACKEND_ORDER:
-         if b == "deepinfra":
-             try:
-                 return _call_with_retries(_di_post_embeddings_once, batch, "DeepInfra", job_id)
-             except Exception as e:
-                 last_err = e; _append_log(job_id, f"DeepInfra failed: {e}."); LOG.error(f"DeepInfra failed: {e}")
-         elif b == "hf":
-             try:
-                 return _call_with_retries(_hf_post_embeddings_once, batch, "HF", job_id)
-             except Exception as e:
-                 last_err = e; _append_log(job_id, f"HF failed: {e}."); LOG.error(f"HF failed: {e}")
-                 if "SentenceSimilarityPipeline" in str(e) and "deepinfra" not in EMB_BACKEND_ORDER:
-                     _append_log(job_id, "Auto-fallback DeepInfra (HF => SentenceSimilarity).")
-                     try:
-                         return _call_with_retries(_di_post_embeddings_once, batch, "DeepInfra", job_id)
-                     except Exception as e2:
-                         last_err = e2; _append_log(job_id, f"DeepInfra failed after HF: {e2}."); LOG.error(f"DeepInfra failed after HF: {e2}")
-         else:
-             _append_log(job_id, f"Backend inconnu ignoré: {b}")
-     raise RuntimeError(f"Tous les backends ont échoué: {last_err}")
-
- # ======================================================================================
- # Chunking helpers
- # ======================================================================================
-
- def _chunk_with_spans(text: str, size: int, overlap: int):
-     n = len(text or "")
-     if size <= 0:
-         yield (0, n, text); return
-     i = 0
-     while i < n:
-         j = min(n, i + size)
-         yield (i, j, text[i:j])
-         i = max(0, j - overlap)
-         if i >= n:
-             break
-
- def _clean_chunk_text(text: str) -> str:
-     """
-     Simple cleanup of JSON fragments / artifacts:
-     - strips a truncated "indexed_at" field at the end,
-     - strips stray braces/characters at the start/end,
-     - collapses runs of newlines,
-     - tries to extract the text values if the chunk looks strongly like JSON.
-     """
-     if not text:
-         return text
-     t = (text or "").strip()
-
-     # strip a typical suffix: , "indexed_at": "2025-..."}}
-     t = re.sub(r',\s*"indexed_at"\s*:\s*"[^"]*"\s*}+\s*$', '', t, flags=re.IGNORECASE)
-
-     # strip other common timestamp keys at the end when truncated
-     t = re.sub(r',\s*"(created_at|timestamp|time|date)"\s*:\s*"[^"]*"\s*}+\s*$', '', t, flags=re.IGNORECASE)
-
-     # strip lone braces or brackets at the start/end
-     t = re.sub(r'^[\s\]\}\,]+', '', t)
-     t = re.sub(r'[\s\]\}\,]+$', '', t)
-
-     # if the chunk looks mostly like JSON (many ":" or "{"), try to extract its text values
-     if t.count(':') >= 3 and (t.count('{') + t.count('}')) >= 1:
-         try:
-             j = json.loads(t)
-             if isinstance(j, dict):
-                 # concatenate the relevant text values
-                 vals = []
-                 for v in j.values():
-                     if isinstance(v, (str, int, float)):
-                         vals.append(str(v))
-                 if vals:
-                     t = " ".join(vals)
-         except Exception:
-             # ignore, keep t as-is
-             pass
-
-     # collapse newlines
-     t = re.sub(r'\n{3,}', '\n\n', t)
-     return t.strip()
-
- # ======================================================================================
- # Background task: indexing — FIXED VERSION (anti-dup additions & robustness)
- # ======================================================================================
-
- def run_index_job(job_id: str, req: IndexRequest):
  try:
-         _set_status(job_id, "running")
-         _append_log(job_id, f"Start project={req.project_id} files={len(req.files)} | backends={EMB_BACKEND_ORDER} | store={VECTOR_STORE} (deterministic_ids={QDRANT_DETERMINISTIC_IDS}, mode={QDRANT_ID_MODE})")
-         LOG.info(f"[{job_id}] Index start project={req.project_id} files={len(req.files)}")
-
-         # set of chunk hashes already seen in this job (intra-job dedup)
-         seen_chunk_hashes = set()
-
-         # --- DEBUG DIAGNOSTIC (INSERTED HERE) ---
-         try:
-             N_SAMPLE = 6
-             sample = req.files[:N_SAMPLE]
-             seen_hashes = {}
-             for fidx, fi in enumerate(sample, 1):
-                 p = (getattr(fi, "path", "") or "") or ""
-                 t = (getattr(fi, "text", "") or "") or ""
-                 h = hashlib.blake2b((t or "").encode("utf-8", "ignore"), digest_size=8).hexdigest()
-                 seen_hashes.setdefault(h, []).append(p)
-                 LOG.info(f"[{job_id}] recv file #{fidx}: path={p!r} len_text={len(t)} hash8={h} preview={repr(t[:120])}")
-             if len(req.files) > N_SAMPLE:
-                 LOG.info(f"[{job_id}] ... and {len(req.files)-N_SAMPLE} more files")
-             if len(seen_hashes) == 1 and len(req.files) > 1:
-                 _append_log(job_id, "⚠️ All received files appear IDENTICAL (same hash). Possible client-side bug.")
-                 LOG.warning("[%s] All files identical by hash8=%s", job_id, list(seen_hashes.keys())[0])
-         except Exception as _e:
-             LOG.exception("Debug sample failed: %s", _e)
-         # --- end debug block ---
-
-         col = f"proj_{req.project_id}"
-
-         # Option: wipe before indexing
-         if WIPE_BEFORE_INDEX:
-             try:
-                 STORE.wipe(col)
-                 _append_log(job_id, f"Wiped existing collection: {col}")
-             except Exception as e:
-                 _append_log(job_id, f"Wipe failed (ignored): {e}")
-
-         # --- WARMUP: compute a test embedding to determine the dimension (dim) ---
-         # Take an initial chunk (or the string 'warmup' if there are no files)
-         if req.files:
-             warm_text = next(_chunk_with_spans(req.files[0].text or "", req.chunk_size, req.overlap))[2]
-         else:
-             warm_text = "warmup"
-         try:
-             embs, sz = _post_embeddings([warm_text], job_id=job_id)
-             if embs is None or embs.ndim != 2:
-                 raise RuntimeError("Warmup embeddings invalid shape")
-             dim = int(embs.shape[1])
-             LOG.info(f"[{job_id}] warmup embeddings shape = {embs.shape} dtype={embs.dtype}")
-             _append_log(job_id, f"warmup embeddings shape = {embs.shape} dim={dim}")
-         except Exception as e:
-             LOG.exception("[%s] Warmup embeddings failed: %s", job_id, e)
-             _append_log(job_id, f"Warmup embeddings failed: {e}")
-             _set_status(job_id, "error")
-             return
-
-         # If using QdrantStore: check existing collection vector size and warn if mismatch
-         if isinstance(STORE, QdrantStore):
-             try:
-                 # client.get_collection throws if not exists
-                 info = STORE.client.get_collection(collection_name=col)
-                 existing_size = None
-                 # depending on qdrant client version, structure might be different:
-                 if hasattr(info, "result") and isinstance(info.result, dict):
-                     cfg = info.result.get("params") or {}
-                     vectors = cfg.get("vectors") or {}
-                     existing_size = int(vectors.get("size")) if vectors.get("size") else None
-                 elif isinstance(info, dict):
-                     cfg = info.get("result", info)
-                     vectors = cfg.get("params", {}).get("vectors", {})
-                     existing_size = int(vectors.get("size")) if vectors else None
-
-                 if existing_size and existing_size != dim:
-                     msg = (f"Qdrant collection {col} already exists with dim={existing_size} but embeddings dim={dim}. "
-                            "This will likely cause vectors to be rejected. Consider wiping or recreating collection.")
-                     LOG.error("[%s] %s", job_id, msg)
-                     _append_log(job_id, msg)
-                     # Optional: if WIPE_BEFORE_INDEX True, recreate:
-                     if WIPE_BEFORE_INDEX:
-                         try:
-                             STORE.wipe(col)
-                             STORE.ensure_collection(col, dim)
-                             _append_log(job_id, f"Recreated collection {col} with dim={dim} (WIPE_BEFORE_INDEX).")
-                         except Exception as e:
-                             _append_log(job_id, f"Failed recreate collection: {e}")
-             except Exception as e:
-                 # collection not present or unable to introspect -> ok, ensure_collection will create
-                 LOG.debug("[%s] Could not introspect collection: %s", job_id, e)
-
-         STORE.ensure_collection(col, dim)
-         _append_log(job_id, f"Collection ready: {col} (dim={dim})")
-
-         total_chunks = 0
-         buf_chunks: List[str] = []
-         buf_metas: List[Dict[str, Any]] = []
-
-         def _flush():
-             nonlocal buf_chunks, buf_metas, total_chunks
-             if not buf_chunks:
-                 return
-
-             # ✅ INTRA-BATCH DEDUP (same text → same ID)
-             if QDRANT_DETERMINISTIC_IDS:
-                 before = len(buf_metas)
-                 seen = set()
-                 dedup_chunks, dedup_metas = [], []
-                 for txt, meta in zip(buf_chunks, buf_metas):
-                     pid = _stable_point_id_uuid(col, meta) if QDRANT_ID_MODE == "uuid" else uuid.UUID(_stable_point_id_uuid(col, meta)).int >> 64
-                     if pid in seen:
-                         continue
-                     seen.add(pid)
-                     dedup_chunks.append(txt); dedup_metas.append(meta)
-                 buf_chunks, buf_metas = dedup_chunks, dedup_metas
-                 skipped = before - len(buf_metas)
-                 if skipped > 0:
-                     _append_log(job_id, f"Dedup intra-batch: skipped {skipped} duplicates")
-
-             try:
-                 vecs, sz = _post_embeddings(buf_chunks, job_id=job_id)
-             except Exception as e:
-                 # failure -> log it and fail the job cleanly (buffer kept for debugging, but we stop)
-                 LOG.exception("[%s] Embeddings failed during flush: %s", job_id, e)
-                 _append_log(job_id, f"Embeddings failed during flush: {e}")
-                 _set_status(job_id, "error")
-                 raise
-
-             added = STORE.upsert(col, vecs, buf_metas)
-             total_chunks += added
-             _append_log(job_id, f"+{added} chunks (total={total_chunks}) ~{(sz/1024.0):.1f}KiB")
-             # clear the buffers ONLY after success
-             buf_chunks, buf_metas = [], []
-
-         # ✅ Filter to relevant files
-         TEXT_EXTS = {".py", ".md", ".txt", ".yaml", ".yml", ".json", ".sh", ".dockerfile", ".ini", ".cfg", ".toml", ".env"}
-         IGNORE_PREFIXES = {".git", "__pycache__", ".vscode", ".idea", "node_modules", "build", "dist", "venv", ".env", ".log", ".tmp"}
-
-         for fi, f in enumerate(req.files, 1):
-             # defensive: path/text may be None -> use a fallback
-             path_raw = (getattr(f, "path", "") or "")  # may be None
-             path = (path_raw or "").strip()
-             text_raw = (getattr(f, "text", "") or "")
-             text = text_raw or ""
-
-             if not path:
-                 # stable fallback path based on the text hash (to avoid None collisions)
-                 h8 = hashlib.blake2b((text or "").encode("utf-8", "ignore"), digest_size=8).hexdigest()
-                 path = f"__no_path__{h8}"
-
-             if any(path.startswith(p) for p in IGNORE_PREFIXES):
-                 _append_log(job_id, f"📁 Ignored: {path} (dossier ignoré)")
-                 continue
-
-             ext = os.path.splitext(path)[1].lower()
-             if ext not in TEXT_EXTS:
-                 _append_log(job_id, f"📁 Ignored: {path} (extension non supportée: {ext})")
-                 continue
-
-             if len((text or "").strip()) < 50:  # ✅ skip files that are too short
-                 _append_log(job_id, f"📄 Ignored: {path} (texte trop court: {len((text or '').strip())} chars)")
-                 continue
-
-             _append_log(job_id, f"📄 Processing: {path} ({len(text)} chars)")
-
-             # --- special JSON / NDJSON handling ---
-             if ext in {".json"} or path.lower().endswith(".ndjson"):
-                 handled = False
-                 try:
-                     parsed = json.loads(text)
-                     # if it is a list -> index each entry separately
-                     if isinstance(parsed, list):
-                         for idx, obj in enumerate(parsed):
-                             if isinstance(obj, dict):
-                                 s = " ".join(str(v) for v in obj.values() if isinstance(v, (str, int, float)))
-                             else:
-                                 s = str(obj)
-                             s = _clean_chunk_text(s)
-                             if len(s) < 30:
-                                 continue
-                             # global intra-job dedup
-                             chash = hashlib.blake2b(s.encode("utf-8", "ignore"), digest_size=8).hexdigest()
-                             if chash in seen_chunk_hashes:
-                                 continue
-                             seen_chunk_hashes.add(chash)
-
-                             meta = {"path": path, "chunk": idx, "start": 0, "end": len(s)}
-                             if req.store_text:
-                                 meta["text"] = s
-                             buf_chunks.append(s); buf_metas.append(meta)
-                             if len(buf_chunks) >= req.batch_size:
-                                 _flush()
-                         handled = True
-                     elif isinstance(parsed, dict):
-                         s = " ".join(str(v) for v in parsed.values() if isinstance(v, (str, int, float)))
-                         s = _clean_chunk_text(s)
-                         if len(s) >= 30:
-                             chash = hashlib.blake2b(s.encode("utf-8", "ignore"), digest_size=8).hexdigest()
-                             if chash not in seen_chunk_hashes:
-                                 seen_chunk_hashes.add(chash)
-                                 meta = {"path": path, "chunk": 0, "start": 0, "end": len(s)}
-                                 if req.store_text:
-                                     meta["text"] = s
-                                 buf_chunks.append(s); buf_metas.append(meta)
-                                 if len(buf_chunks) >= req.batch_size:
-                                     _flush()
-                         handled = True
-                 except Exception:
-                     # NDJSON fallback: one line == one JSON object or plain text
-                     try:
-                         lines = [L for L in (text or "").splitlines() if L.strip()]
-                         for li, line in enumerate(lines):
-                             try:
-                                 obj = json.loads(line)
-                                 if isinstance(obj, dict):
-                                     s = " ".join(str(v) for v in obj.values() if isinstance(v, (str, int, float)))
-                                 else:
-                                     s = str(obj)
-                                 s = _clean_chunk_text(s)
-                                 if len(s) < 30:
-                                     continue
-                                 chash = hashlib.blake2b(s.encode("utf-8", "ignore"), digest_size=8).hexdigest()
-                                 if chash in seen_chunk_hashes:
-                                     continue
-                                 seen_chunk_hashes.add(chash)
-                                 meta = {"path": path, "chunk": li, "start": 0, "end": len(s)}
-                                 if req.store_text:
-                                     meta["text"] = s
-                                 buf_chunks.append(s); buf_metas.append(meta)
-                                 if len(buf_chunks) >= req.batch_size:
-                                     _flush()
-                             except Exception:
-                                 # non-JSON line -> index as plain text if long enough
-                                 sl = (line or "").strip()
-                                 if len(sl) >= 30:
-                                     sl = _clean_chunk_text(sl)
-                                     chash = hashlib.blake2b(sl.encode("utf-8", "ignore"), digest_size=8).hexdigest()
-                                     if chash in seen_chunk_hashes:
-                                         continue
-                                     seen_chunk_hashes.add(chash)
-                                     meta = {"path": path, "chunk": li, "start": 0, "end": len(sl)}
-                                     if req.store_text:
-                                         meta["text"] = sl
-                                     buf_chunks.append(sl); buf_metas.append(meta)
-                                     if len(buf_chunks) >= req.batch_size:
-                                         _flush()
-                         handled = True
-                     except Exception:
-                         handled = False
-
-                 if handled:
-                     _flush()
-                     _append_log(job_id, f"File done: {path}")
-                     continue  # move on to the next file
-
-             # --- normal handling for text files ---
-             for ci, (start, end, chunk_txt) in enumerate(_chunk_with_spans(text or "", req.chunk_size, req.overlap)):
-                 chunk_txt = (chunk_txt or "").strip()
-                 if len(chunk_txt) < 30:  # ✅ skip chunks that are too short
-                     continue
-                 # cleanup to avoid JSON artifacts / timestamps
-                 chunk_txt = _clean_chunk_text(chunk_txt)
-                 if len(chunk_txt) < 30:
-                     continue
-
-                 # global intra-job dedup (prevents repetitions)
-                 chash = hashlib.blake2b(chunk_txt.encode("utf-8", "ignore"), digest_size=8).hexdigest()
-                 if chash in seen_chunk_hashes:
-                     continue
-                 seen_chunk_hashes.add(chash)
-
-                 buf_chunks.append(chunk_txt)
-                 meta = {
-                     "path": path,
-                     "chunk": ci,
-                     "start": start,
-                     "end": end,
-                 }
-                 if req.store_text:
-                     meta["text"] = chunk_txt
-                 buf_metas.append(meta)
-
-                 if len(buf_chunks) >= req.batch_size:
-                     _flush()
-
-             # flush at end of file
-             _flush()
-             _append_log(job_id, f"File done: {path}")
-
-         _append_log(job_id, f"Done. chunks={total_chunks}")
-         _set_status(job_id, "done")
-         LOG.info(f"[{job_id}] Index finished. chunks={total_chunks}")
-
  except Exception as e:
-         LOG.exception("Index job failed")
-         _append_log(job_id, f"ERROR: {e}")
-         _set_status(job_id, "error")
-
- # ======================================================================================
- # API
- # ======================================================================================
-
- app = FastAPI()
-
- @app.get("/")
- def root():
-     return {
-         "ok": True,
-         "service": "remote-indexer",
-         "backends": EMB_BACKEND_ORDER,
-         "hf_url_pipeline": HF_URL_PIPE if "hf" in EMB_BACKEND_ORDER else None,
-         "hf_url_models": HF_URL_MODL if "hf" in EMB_BACKEND_ORDER else None,
-         "di_url": DI_URL if "deepinfra" in EMB_BACKEND_ORDER else None,
-         "di_model": DI_MODEL if "deepinfra" in EMB_BACKEND_ORDER else None,
-         "vector_store": VECTOR_STORE,
-         "vector_store_active": "QdrantStore" if isinstance(STORE, QdrantStore) else "MemoryStore",
-         "qdrant_deterministic_ids": QDRANT_DETERMINISTIC_IDS,
-         "qdrant_id_mode": QDRANT_ID_MODE,
-         "wipe_before_index": WIPE_BEFORE_INDEX,
-         "docs": "/health, /index, /status/{job_id} | /status?job_id= | POST /status, /query, /wipe",
-     }
-
- @app.get("/health")
- def health():
-     return {"ok": True, "store": "QdrantStore" if isinstance(STORE, QdrantStore) else "MemoryStore"}
-
- def _check_backend_ready():
-     if "hf" in EMB_BACKEND_ORDER and not HF_TOKEN:
-         raise HTTPException(400, "HF_API_TOKEN manquant côté serveur (backend=hf).")
-     if "deepinfra" in EMB_BACKEND_ORDER and not DI_TOKEN and EMB_BACKEND_ORDER == ["deepinfra"]:
-         raise HTTPException(400, "DEEPINFRA_API_KEY manquant côté serveur (backend=deepinfra).")
-
- @app.post("/index")
- def start_index(req: IndexRequest, background_tasks: BackgroundTasks, x_auth_token: Optional[str] = Header(default=None)):
-     _auth(x_auth_token)
-     _check_backend_ready()
-     job_id = uuid.uuid4().hex[:12]
-     JOBS[job_id] = {"status": "queued", "logs": [], "created": time.time()}
-     LOG.info(f"Created job {job_id} for project {req.project_id}")
-     _append_log(job_id, f"Job created: {job_id} project={req.project_id}")
-     background_tasks.add_task(run_index_job, job_id, req)
-     return {"job_id": job_id}
-
- # --- 3 variants for /status ---
- @app.get("/status/{job_id}")
- def status_path(job_id: str, x_auth_token: Optional[str] = Header(default=None)):
-     _auth(x_auth_token)
-     j = JOBS.get(job_id)
-     if not j:
-         # more explicit JSON response to ease client-side debugging
-         raise HTTPException(status_code=404, detail={"error": "job inconnu", "advice": "POST /index to create a new job"})
-     return {"status": j["status"], "logs": j["logs"][-1500:]}
-
- @app.get("/status")
- def status_query(job_id: str = Query(...), x_auth_token: Optional[str] = Header(default=None)):
-     return status_path(job_id, x_auth_token)
-
- @app.post("/status")
- def status_post(body: StatusBody, x_auth_token: Optional[str] = Header(default=None)):
-     return status_path(body.job_id, x_auth_token)
-
- @app.post("/query")
- def query(req: QueryRequest, x_auth_token: Optional[str] = Header(default=None)):
-     _auth(x_auth_token)
-     _check_backend_ready()
-     vecs, _ = _post_embeddings([req.query])
-     col = f"proj_{req.project_id}"
      try:
-         results = STORE.search(col, vecs[0], int(req.top_k))
      except Exception as e:
-         raise HTTPException(400, f"Search failed: {e}")
-     out = []
-     for pl in results:
-         txt = pl.get("text")
-         if txt and len(txt) > 800:
-             txt = txt[:800] + "..."
-         out.append({
-             "path": pl.get("path"),
-             "chunk": pl.get("chunk"),
-             "start": pl.get("start"),
-             "end": pl.get("end"),
-             "text": txt,
-             "score": float(pl.get("_score")) if pl.get("_score") is not None else None
-         })
-     return {"results": out}
-
- @app.post("/wipe")
- def wipe_collection(project_id: str, x_auth_token: Optional[str] = Header(default=None)):
-     _auth(x_auth_token)
-     col = f"proj_{project_id}"
      try:
-         STORE.wipe(col); return {"ok": True}
      except Exception as e:
-         raise HTTPException(400, f"wipe failed: {e}")
-
- # ======================================================================================
- # Entrypoint
- # ======================================================================================

  if __name__ == "__main__":
-     import uvicorn
-     port = int(os.getenv("PORT", "7860"))
-     LOG.info(f"===== Application Startup on PORT {port} =====")
-     uvicorn.run(app, host="0.0.0.0", port=port)
+ # main.py
  # -*- coding: utf-8 -*-
  from __future__ import annotations
  import os
+ import sys
  import time
  import logging
+ import tempfile
+ from typing import List, Dict, Any
+
+ import gradio as gr
+
+ # Import the utilities (files provided in your repo / upload)
+ from app.env_tools import load_config, config_markdown
+ from app.remote_index_client import start_remote_index, stream_remote_status, remote_query
+ from app.persistence import (
+     list_workspaces,
+     load_workspace,
+     save_workspace,
+     create_workspace,
+     delete_workspace,
+     workspace_repo_dir,
+     load_last_workspace_name,
+     set_last_workspace_name,
+     default_config,
+     list_repo_files as persistence_list_repo_files,
  )
+ # Optional: helper that decides which files to index (if present)
  try:
+     from app.index_state import get_files_to_index
  except Exception:
+     get_files_to_index = None  # fallback if the module is unavailable
+
+ # Load the global config
+ CFG_GLOBAL = load_config()
+
+ # --- Main logging (stdout console for containers) ---
+ LOG = logging.getLogger("appli_v1")
+ if not LOG.handlers:
+     handler = logging.StreamHandler(sys.stdout)
+     handler.setFormatter(logging.Formatter("[%(levelname)s] %(asctime)s - %(message)s", "%H:%M:%S"))
+     LOG.addHandler(handler)
+ LOG.setLevel(logging.DEBUG if os.getenv("DEBUG", "0") == "1" else logging.INFO)
+
+ DBG = logging.getLogger("appli_v1.debug")
+ if not DBG.handlers:
+     dbg_h = logging.StreamHandler(sys.stdout)
+     dbg_h.setFormatter(logging.Formatter("[DEBUG] %(asctime)s - %(message)s", "%H:%M:%S"))
+     DBG.addHandler(dbg_h)
+ DBG.setLevel(logging.DEBUG)
+
+ # Configure the DeepInfra key if present (compatibility with your code)
  try:
+     from app.deepinfra_client import set_api_key_runtime
+     if CFG_GLOBAL.get("DEEPINFRA_API_KEY"):
+         set_api_key_runtime(CFG_GLOBAL["DEEPINFRA_API_KEY"])
+         LOG.debug("Clé API DeepInfra configurée")
      else:
+         LOG.debug("DEEPINFRA_API_KEY non fournie")
+ except Exception:
+     LOG.debug("deepinfra_client absent ou non nécessaire")
+
+ # --- Local helpers for the indexing UI ---
+ TEXT_EXTS = {".py", ".md", ".txt", ".json", ".yaml", ".yml", ".toml", ".ini", ".cfg", ".csv", ".tsv"}
+
+ def _repo_dir(ws: str) -> str:
+     return workspace_repo_dir(ws or "default")
+
+ def _is_text_like(path: str) -> bool:
+     ext = os.path.splitext(path.lower())[1]
+     return ext in TEXT_EXTS and not os.path.basename(path).startswith(".")
+
+ def _get_repo_text_files(ws: str) -> List[str]:
+     repo_dir = _repo_dir(ws)
      try:
+         files = persistence_list_repo_files(repo_dir)
+         return [f for f in files if _is_text_like(f)]
      except Exception as e:
+         LOG.debug("persistence.list_repo_files failed: %s", e)
+     out = []
+     for root, _, names in os.walk(repo_dir):
+         for nm in names:
+             rel = os.path.relpath(os.path.join(root, nm), repo_dir)
+             if rel.startswith(".git") or any(p.startswith(".") for p in rel.split(os.sep)):
+                 continue
+             if _is_text_like(rel):
+                 out.append(rel)
+     return sorted(out)
+
+ def _load_ws_choices() -> gr.Update:
+     ws = list_workspaces()
+     cur = load_last_workspace_name() or next(iter(ws.keys()), "default")
+     return gr.update(choices=list(ws.keys()), value=cur)
+
+ def _on_ws_changed(ws_name: str):
+     # returns (chunk_size, overlap, batch_size, store_text, remote_index_url)
+     _, cfg = load_workspace(ws_name)
+     cfg = {**default_config(), **(cfg or {})}
+     return (
+         cfg.get("index_chunk_size", 512),
+         cfg.get("index_overlap", 50),
+         cfg.get("index_batch_size", 8),
+         cfg.get("index_store_text", True),
+         cfg.get("remote_index_url", os.getenv("REMOTE_INDEX_URL", "https://chouchouvs-deepindex.hf.space")),
+     )
+
+ # Called to refresh the file list (UI)
+ def _list_files_for_ws(ws_name: str):
      try:
+         files = _get_repo_text_files(ws_name)
+         return gr.update(choices=files, value=[])
      except Exception as e:
+         LOG.exception("Erreur _list_files_for_ws: %s", e)
+         return gr.update(choices=[], value=[])
+
+ # Launch a remote indexing run (simple wrapper)
+ def _launch_index_remote(
+     ws_name: str,
+     use_selected: bool,
+     selected_files: List[str],
+     chunk_size: int,
+     overlap: int,
+     batch_size: int,
+     store_text: bool,
+     base_url: str,
+ ) -> str:
      try:
+         repo_dir = _repo_dir(ws_name)
+         # if the user explicitly selected files, use those
+         if use_selected and selected_files:
+             files_to_send = selected_files
+         else:
+             # if get_files_to_index is available, use it (hash/diff logic)
+             if get_files_to_index:
+                 files_to_send = get_files_to_index(repo_dir)
+             else:
+                 files_to_send = _get_repo_text_files(ws_name)
+
+         # Prepare a minimal payload: path + text (read the files)
+         payload_files = []
+         for rel in files_to_send:
+             full = os.path.join(repo_dir, rel)
+             try:
+                 with open(full, "r", encoding="utf-8", errors="ignore") as f:
+                     text = f.read()
+                 if text and rel:
+                     payload_files.append({"path": rel, "text": text})
+             except Exception as e:
+                 LOG.debug("skip file %s : %s", full, e)
+
+         if not payload_files:
+             return "❌ Aucun fichier texte valide à indexer."
+
+         LOG.info("Lancement index distant: %d fichiers -> %s", len(payload_files), base_url or os.getenv("REMOTE_INDEX_URL"))
+         # start_remote_index is imported from app.remote_index_client
+         job_id = start_remote_index(
+             project_id=ws_name,
+             files=payload_files,
+             chunk_size=chunk_size,
+             overlap=overlap,
+             batch_size=batch_size,
+             store_text=bool(store_text),
+             timeout=600.0,
+             extra_headers={"X-Remote-Url": base_url} if base_url else None,
+         )
+         return f"✅ Index lancé (job_id={job_id})"
      except Exception as e:
+         LOG.exception("Erreur lancement index distant: %s", e)
+         return f"❌ Erreur: {e}"
+
+ # --- Build the Gradio UI ---
+ def build_demo() -> gr.Blocks:
+     demo = gr.Blocks(title="appli_v1", css=None)
+
+     with demo:
+         with gr.Row():
+             gr.Markdown("## appli_v1 — interface")
+         with gr.Tabs():
+             # --- Indexing tab ---
+             with gr.Tab("📊 Index"):
+                 gr.Markdown("### 🚀 Indexation intelligente des fichiers textuels")
+                 gr.Markdown(
+                     "Indexe automatiquement **uniquement les fichiers modifiés**. "
+                     "Extensions prises: " + ", ".join(sorted(TEXT_EXTS))
+                 )
+
+                 with gr.Row():
+                     index_use_selected = gr.Checkbox(label="Indexer uniquement les fichiers sélectionnés ci-dessous", value=False)
+                     index_base_url = gr.Textbox(label="URL du service d'indexation (optionnel)", placeholder="https://your-indexer.hf.space", value=os.getenv("REMOTE_INDEX_URL", "https://chouchouvs-deepindex.hf.space"))
+
+                 with gr.Row():
+                     index_files_dd = gr.Dropdown(label="Fichiers à indexer", choices=[], multiselect=True, scale=2)
+                     index_refresh_btn = gr.Button("🔄 Rafraîchir la liste")
+
+                 with gr.Row():
+                     chunk_size = gr.Slider(128, 2048, value=512, step=64, label="Taille des chunks")
+                     overlap = gr.Slider(0, 256, value=50, step=10, label="Chevauchement")
+                     batch_size = gr.Slider(1, 16, value=8, step=1, label="Taille batch")
+                     store_text = gr.Checkbox(label="Stocker le texte brut", value=True)
+
+                 index_btn = gr.Button("🚀 Lancer l'indexation intelligente", variant="primary")
+                 index_log = gr.Textbox(label="Journal d'indexation", lines=15, interactive=False)
+
+                 # Callbacks
+                 index_refresh_btn.click(fn=lambda ws: _list_files_for_ws(ws), inputs=[gr.State(load_last_workspace_name())], outputs=index_files_dd)
+                 # for initialization, one action is exposed: the workspace dropdown load (below, in the status bar)
+                 index_btn.click(
+                     _launch_index_remote,
+                     inputs=[
+                         gr.State(load_last_workspace_name()),  # active workspace
+                         index_use_selected,
+                         index_files_dd,
+                         chunk_size,
+                         overlap,
+                         batch_size,
+                         store_text,
+                         index_base_url,
+                     ],
+                     outputs=index_log,
+                 )
+
+             # --- Workspaces / Git tab (simplified) ---
+             with gr.Tab("🧰 Workspaces"):
+                 gr.Markdown("Gestion des workspaces locaux (création / suppression / navigation).")
+                 ws_dd = gr.Dropdown(label="Workspace", choices=[], value=None)
+                 refresh_ws = gr.Button("🔄 Rafraîchir")
+                 ws_new = gr.Textbox(label="Nouveau nom de workspace", placeholder="nom")
+                 ws_create_btn = gr.Button("Créer workspace")
+                 ws_delete_btn = gr.Button("Supprimer workspace (attention)")
+                 ws_status_md = gr.Markdown("")
+
+                 refresh_ws.click(_load_ws_choices, outputs=ws_dd)
+                 ws_create_btn.click(lambda name: create_workspace(name) and "✅ Workspace créé" or "❌", inputs=ws_new, outputs=ws_status_md)
+                 ws_delete_btn.click(lambda name: (delete_workspace(name), "✅ Suppression demandée")[1], inputs=ws_dd, outputs=ws_status_md)
+
+             # --- About / Config tab ---
+             with gr.Tab("⚙️ Config"):
+                 gr.Markdown("Configuration et variables d'environnement")
+                 cfg_md = gr.Markdown(config_markdown(load_config()))
+                 reload_env_btn = gr.Button("Recharger config")
+                 reload_env_btn.click(lambda: config_markdown(load_config()), None, cfg_md)
+
+     return demo

  if __name__ == "__main__":
+     port = int(os.getenv("GRADIO_SERVER_PORT", "7860"))
+     demo = build_demo()
+     # enable the queue if needed (long-running tasks)
+     try:
+         demo.queue()
+     except Exception:
+         LOG.debug("queue() non disponible / déjà activée")
+
+     workspaces_root = os.getenv("WORKSPACES_ROOT", "/workspace/workspaces")
+     tmp_dir = tempfile.gettempdir()
+     demo.launch(
+         server_name="0.0.0.0",
+         server_port=port,
+         show_api=False,
+         share=False,
+         show_error=True,
+         allowed_paths=[workspaces_root, tmp_dir, os.getcwd()],
+     )