minhvtt commited on
Commit
5aa7215
·
verified ·
1 Parent(s): 680a4ae

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +629 -230
main.py CHANGED
@@ -13,6 +13,9 @@ from huggingface_hub import InferenceClient
13
 
14
  from embedding_service import JinaClipEmbeddingService
15
  from qdrant_service import QdrantVectorService
 
 
 
16
 
17
  # Initialize FastAPI app
18
  app = FastAPI(
@@ -54,6 +57,29 @@ hf_token = os.getenv("HUGGINGFACE_TOKEN")
54
  if hf_token:
55
  print("✓ Hugging Face token configured")
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  print("✓ Services initialized successfully")
58
 
59
 
@@ -88,12 +114,19 @@ class ChatRequest(BaseModel):
88
  temperature: float = 0.7
89
  top_p: float = 0.95
90
  hf_token: Optional[str] = None
 
 
 
 
 
 
91
 
92
 
93
  class ChatResponse(BaseModel):
94
  response: str
95
  context_used: List[Dict]
96
  timestamp: str
 
97
 
98
 
99
  class AddDocumentRequest(BaseModel):
@@ -107,234 +140,292 @@ class AddDocumentResponse(BaseModel):
107
  message: str
108
 
109
 
 
 
 
 
 
 
 
 
110
  @app.get("/")
111
  async def root():
112
  """Health check endpoint with comprehensive API documentation"""
113
  return {
114
  "status": "running",
115
- "service": "ChatbotRAG API",
116
- "version": "2.0.0",
117
  "vector_db": "Qdrant",
118
- "document_db": "MongoDB",
 
 
 
 
 
 
 
 
119
  "endpoints": {
120
- "chatbot_rag": {
121
- "API endpoint": "https://minhvtt-ChatbotRAG.hf.space/",
122
- "POST /chat": {
123
- "description": "Chat với AI sử dụng RAG (Retrieval-Augmented Generation)",
124
- "request": {
125
- "method": "POST",
126
- "content_type": "application/json",
127
- "body": {
128
- "message": "string (required) - User message/question",
129
- "use_rag": "boolean (optional, default: true) - Enable RAG context retrieval",
130
- "top_k": "integer (optional, default: 3) - Number of context documents to retrieve",
131
- "system_message": "string (optional) - Custom system prompt",
132
- "max_tokens": "integer (optional, default: 512) - Max response length",
133
- "temperature": "float (optional, default: 0.7, range: 0-1) - Creativity level",
134
- "top_p": "float (optional, default: 0.95) - Nucleus sampling",
135
- "hf_token": "string (optional) - Hugging Face token (fallback to env)"
136
- }
137
  },
 
138
  "response": {
139
- "response": "string - AI generated response",
140
- "context_used": [
141
- {
142
- "id": "string - Document ID",
143
- "confidence": "float - Relevance score",
144
- "metadata": {
145
- "text": "string - Retrieved context"
146
- }
147
- }
148
- ],
149
- "timestamp": "string - ISO 8601 timestamp"
150
- },
151
- "example_request": {
152
- "message": "Dao có nguy hiểm không?",
153
- "use_rag": True,
154
- "top_k": 3,
155
- "temperature": 0.7
156
- },
157
- "example_response": {
158
- "response": "Dựa trên thông tin trong database, dao được phân loại là vũ khí nguy hiểm. Dao sắc có thể gây thương tích nghiêm trọng nếu không sử dụng đúng cách. Cần tuân thủ các quy định an toàn khi sử dụng.",
159
- "context_used": [
160
- {
161
- "id": "68a3fc14c853d7621e8977b5",
162
- "confidence": 0.92,
163
- "metadata": {
164
- "text": "Vũ khí"
165
- }
166
- },
167
- {
168
- "id": "68a3fc4cc853d7621e8977b6",
169
- "confidence": 0.85,
170
- "metadata": {
171
- "text": "Con dao sắc"
172
- }
173
- }
174
- ],
175
- "timestamp": "2025-10-13T10:30:45.123456"
176
- },
177
- "notes": [
178
- "RAG retrieves relevant context from vector DB before generating response",
179
- "LLM uses context to provide accurate, grounded answers",
180
- "Requires HUGGINGFACE_TOKEN environment variable or hf_token in request"
181
- ]
182
  },
183
  "POST /documents": {
184
- "description": "Add document to knowledge base for RAG",
185
- "request": {
186
- "method": "POST",
187
- "content_type": "application/json",
188
- "body": {
189
- "text": "string (required) - Document text content",
190
- "metadata": "object (optional) - Additional metadata (source, category, etc.)"
191
- }
192
  },
193
- "response": {
194
- "success": "boolean",
195
- "doc_id": "string - MongoDB ObjectId",
196
- "message": "string - Status message"
197
- },
198
- "example_request": {
199
- "text": "Để tạo event mới: Click nút 'Tạo Event' ở góc trên bên phải màn hình. Điền thông tin sự kiện bao gồm tên, ngày giờ, địa điểm. Click Lưu để hoàn tất.",
200
- "metadata": {
201
- "source": "user_guide.pdf",
202
- "section": "create_event",
203
- "page": 5,
204
- "category": "tutorial"
205
- }
206
- },
207
- "example_response": {
208
- "success": True,
209
- "doc_id": "67a9876543210fedcba98765",
210
- "message": "Document added successfully with ID: 67a9876543210fedcba98765"
211
  }
212
  },
213
- "POST /rag/search": {
214
- "description": "Search in knowledge base (similar to /search/text but for RAG documents)",
215
- "request": {
216
- "method": "POST",
217
- "content_type": "multipart/form-data",
218
- "body": {
219
- "query": "string (required) - Search query",
220
- "top_k": "integer (optional, default: 5) - Number of results",
221
- "score_threshold": "float (optional, default: 0.5) - Minimum relevance score"
222
- }
223
  },
224
- "response": [
225
- {
226
- "id": "string",
227
- "confidence": "float",
228
- "metadata": {
229
- "text": "string",
230
- "source": "string"
231
- }
232
- }
 
 
 
233
  ],
234
- "example_request": {
235
- "query": "cách tạo sự kiện mới",
236
- "top_k": 3,
237
- "score_threshold": 0.6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
  }
239
  },
240
- "GET /history": {
241
- "description": "Get chat conversation history",
242
- "request": {
243
- "method": "GET",
244
- "query_params": {
245
- "limit": "integer (optional, default: 10) - Number of messages",
246
- "skip": "integer (optional, default: 0) - Pagination offset"
247
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  },
249
  "response": {
250
- "history": [
251
- {
252
- "user_message": "string",
253
- "assistant_response": "string",
254
- "context_used": "array",
255
- "timestamp": "string - ISO 8601"
256
- }
257
- ],
258
- "total": "integer - Total messages count"
 
 
259
  },
260
- "example_request": "GET /history?limit=5&skip=0",
261
- "example_response": {
262
- "history": [
263
  {
264
- "user_message": "Dao có nguy hiểm không?",
265
- "assistant_response": "Dao được phân loại là vũ khí...",
266
- "context_used": [],
267
- "timestamp": "2025-10-13T10:30:45.123456"
 
 
 
 
 
 
 
 
268
  }
269
  ],
270
- "total": 15
271
- }
272
- },
273
- "DELETE /documents/{doc_id}": {
274
- "description": "Delete document from knowledge base",
275
- "request": {
276
- "method": "DELETE",
277
- "path_params": {
278
- "doc_id": "string - MongoDB ObjectId"
279
  }
280
  },
281
- "response": {
282
- "success": "boolean",
283
- "message": "string"
284
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
  }
286
  }
287
  },
288
- "usage_examples": {
289
- "curl_chat": "curl -X POST 'http://localhost:8000/chat' -H 'Content-Type: application/json' -d '{\"message\": \"Dao có nguy hiểm không?\", \"use_rag\": true}'",
290
- "python_chat": """
291
- import requests
292
-
293
- response = requests.post(
294
- 'http://localhost:8000/chat',
295
- json={
296
- 'message': 'Nút tạo event ở đâu?',
297
- 'use_rag': True,
298
- 'top_k': 3
299
- }
300
- )
301
- print(response.json()['response'])
302
- """
303
- },
304
- "authentication": {
305
- "embeddings_apis": "No authentication required",
306
- "chat_api": "Requires HUGGINGFACE_TOKEN (env variable or request body)"
307
  },
308
- "rate_limits": {
309
- "embeddings": "No limit",
310
- "chat_with_llm": "Limited by Hugging Face API (free tier: ~1000 requests/hour)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
  },
312
- "error_codes": {
313
- "400": "Bad Request - Missing required fields or invalid input",
314
- "401": "Unauthorized - Invalid Hugging Face token",
315
- "404": "Not Found - Document ID not found",
316
- "500": "Internal Server Error - Server or database error"
 
 
 
 
 
 
 
 
 
 
 
 
317
  },
318
  "links": {
319
  "docs": "http://localhost:8000/docs",
320
  "redoc": "http://localhost:8000/redoc",
321
- "openapi": "http://localhost:8000/openapi.json"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
322
  }
323
  }
324
 
325
  @app.post("/index", response_model=IndexResponse)
326
  async def index_data(
327
  id: str = Form(...),
328
- text: str = Form(...),
329
- image: Optional[UploadFile] = File(None)
330
  ):
331
  """
332
- Index data vào vector database
333
 
334
  Body:
335
  - id: Document ID (event ID, post ID, etc.)
336
- - text: Text content (tiếng Việt supported)
337
- - image: Image file (optional)
338
 
339
  Returns:
340
  - success: True/False
@@ -342,39 +433,64 @@ async def index_data(
342
  - message: Status message
343
  """
344
  try:
345
- # Prepare embeddings
346
- text_embedding = None
347
- image_embedding = None
348
 
349
- # Encode text (tiếng Việt)
350
- if text and text.strip():
351
- text_embedding = embedding_service.encode_text(text)
352
 
353
- # Encode image nếu
354
- if image:
355
- image_bytes = await image.read()
356
- pil_image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
357
- image_embedding = embedding_service.encode_image(pil_image)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
358
 
359
  # Combine embeddings
360
- if text_embedding is not None and image_embedding is not None:
361
- # Average của text và image embeddings
362
- combined_embedding = np.mean([text_embedding, image_embedding], axis=0)
363
- elif text_embedding is not None:
364
- combined_embedding = text_embedding
365
- elif image_embedding is not None:
366
- combined_embedding = image_embedding
367
- else:
368
- raise HTTPException(status_code=400, detail="Phải cung cấp ít nhất text hoặc image")
 
 
 
 
 
 
 
 
369
 
370
  # Normalize
371
  combined_embedding = combined_embedding / np.linalg.norm(combined_embedding, axis=1, keepdims=True)
372
 
373
  # Index vào Qdrant
374
  metadata = {
375
- "text": text,
376
- "has_image": image is not None,
377
- "image_filename": image.filename if image else None
 
378
  }
379
 
380
  result = qdrant_service.index_data(
@@ -386,9 +502,11 @@ async def index_data(
386
  return IndexResponse(
387
  success=True,
388
  id=result["original_id"], # Trả về MongoDB ObjectId
389
- message=f"Đã index thành công document {result['original_id']} (Qdrant UUID: {result['qdrant_id']})"
390
  )
391
 
 
 
392
  except Exception as e:
393
  raise HTTPException(status_code=500, detail=f"Lỗi khi index: {str(e)}")
394
 
@@ -611,7 +729,7 @@ async def get_stats():
611
  @app.post("/chat", response_model=ChatResponse)
612
  async def chat(request: ChatRequest):
613
  """
614
- Chat endpoint với RAG
615
 
616
  Body:
617
  - message: User message
@@ -621,38 +739,79 @@ async def chat(request: ChatRequest):
621
  - max_tokens: Max tokens for response (default: 512)
622
  - temperature: Temperature for generation (default: 0.7)
623
  - hf_token: Hugging Face token (optional, sẽ dùng env nếu không truyền)
 
 
 
 
 
624
 
625
  Returns:
626
  - response: Generated response
627
  - context_used: Retrieved context documents
628
  - timestamp: Response timestamp
 
629
  """
630
  try:
631
  # Retrieve context if RAG enabled
632
  context_used = []
 
 
633
  if request.use_rag:
634
- # Generate query embedding
635
- query_embedding = embedding_service.encode_text(request.message)
636
-
637
- # Search in Qdrant
638
- results = qdrant_service.search(
639
- query_embedding=query_embedding,
640
- limit=request.top_k,
641
- score_threshold=0.5
642
- )
643
- context_used = results
644
-
645
- # Build context text
646
- context_text = ""
647
- if context_used:
648
- context_text = "\n\nRelevant Context:\n"
649
- for i, doc in enumerate(context_used, 1):
650
- doc_text = doc["metadata"].get("text", "")
651
- confidence = doc["confidence"]
652
- context_text += f"\n[{i}] (Confidence: {confidence:.2f})\n{doc_text}\n"
653
-
654
- # Add context to system message
655
- system_message = f"{request.system_message}\n{context_text}\n\nPlease use the above context to answer the user's question when relevant."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
656
  else:
657
  system_message = request.system_message
658
 
@@ -716,7 +875,8 @@ Example:
716
  return ChatResponse(
717
  response=response,
718
  context_used=context_used,
719
- timestamp=datetime.utcnow().isoformat()
 
720
  )
721
 
722
  except Exception as e:
@@ -876,6 +1036,245 @@ async def delete_document_from_kb(doc_id: str):
876
  raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
877
 
878
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
879
  if __name__ == "__main__":
880
  import uvicorn
881
  uvicorn.run(
 
13
 
14
  from embedding_service import JinaClipEmbeddingService
15
  from qdrant_service import QdrantVectorService
16
+ from advanced_rag import AdvancedRAG
17
+ from pdf_parser import PDFIndexer
18
+ from multimodal_pdf_parser import MultimodalPDFIndexer
19
 
20
  # Initialize FastAPI app
21
  app = FastAPI(
 
57
  if hf_token:
58
  print("✓ Hugging Face token configured")
59
 
60
+ # Initialize Advanced RAG
61
+ advanced_rag = AdvancedRAG(
62
+ embedding_service=embedding_service,
63
+ qdrant_service=qdrant_service
64
+ )
65
+ print("✓ Advanced RAG pipeline initialized")
66
+
67
+ # Initialize PDF Indexer
68
+ pdf_indexer = PDFIndexer(
69
+ embedding_service=embedding_service,
70
+ qdrant_service=qdrant_service,
71
+ documents_collection=documents_collection
72
+ )
73
+ print("✓ PDF Indexer initialized")
74
+
75
+ # Initialize Multimodal PDF Indexer (for PDFs with images)
76
+ multimodal_pdf_indexer = MultimodalPDFIndexer(
77
+ embedding_service=embedding_service,
78
+ qdrant_service=qdrant_service,
79
+ documents_collection=documents_collection
80
+ )
81
+ print("✓ Multimodal PDF Indexer initialized")
82
+
83
  print("✓ Services initialized successfully")
84
 
85
 
 
114
  temperature: float = 0.7
115
  top_p: float = 0.95
116
  hf_token: Optional[str] = None
117
+ # Advanced RAG options
118
+ use_advanced_rag: bool = True
119
+ use_query_expansion: bool = True
120
+ use_reranking: bool = True
121
+ use_compression: bool = True
122
+ score_threshold: float = 0.5
123
 
124
 
125
  class ChatResponse(BaseModel):
126
  response: str
127
  context_used: List[Dict]
128
  timestamp: str
129
+ rag_stats: Optional[Dict] = None # Stats from advanced RAG pipeline
130
 
131
 
132
  class AddDocumentRequest(BaseModel):
 
140
  message: str
141
 
142
 
143
+ class UploadPDFResponse(BaseModel):
144
+ success: bool
145
+ document_id: str
146
+ filename: str
147
+ chunks_indexed: int
148
+ message: str
149
+
150
+
151
  @app.get("/")
152
  async def root():
153
  """Health check endpoint with comprehensive API documentation"""
154
  return {
155
  "status": "running",
156
+ "service": "ChatbotRAG API - Advanced RAG with Multimodal Support",
157
+ "version": "3.0.0",
158
  "vector_db": "Qdrant",
159
+ "document_db": "MongoDB",
160
+ "features": {
161
+ "multiple_inputs": "Index up to 10 texts + 10 images per request",
162
+ "advanced_rag": "Query expansion, reranking, contextual compression",
163
+ "pdf_support": "Upload PDFs and chat about their content",
164
+ "multimodal_pdf": "PDFs with text and image URLs - perfect for user guides",
165
+ "chat_history": "Track conversation history",
166
+ "hybrid_search": "Text + image search with Jina CLIP v2"
167
+ },
168
  "endpoints": {
169
+ "indexing": {
170
+ "POST /index": {
171
+ "description": "Index multiple texts and images (NEW: up to 10 each)",
172
+ "content_type": "multipart/form-data",
173
+ "body": {
174
+ "id": "string (required) - Document ID",
175
+ "texts": "List[string] (optional) - Up to 10 texts",
176
+ "images": "List[UploadFile] (optional) - Up to 10 images"
 
 
 
 
 
 
 
 
 
177
  },
178
+ "example": "curl -X POST '/index' -F 'id=doc1' -F 'texts=Text 1' -F 'texts=Text 2' -F 'images=@img1.jpg'",
179
  "response": {
180
+ "success": True,
181
+ "id": "doc1",
182
+ "message": "Indexed successfully with 2 texts and 1 images"
183
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  },
185
  "POST /documents": {
186
+ "description": "Add text document to knowledge base",
187
+ "content_type": "application/json",
188
+ "body": {
189
+ "text": "string (required) - Document content",
190
+ "metadata": "object (optional) - Additional metadata"
 
 
 
191
  },
192
+ "example": {
193
+ "text": "How to create event: Click 'Create Event' button...",
194
+ "metadata": {"category": "tutorial", "source": "user_guide"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
  }
196
  },
197
+ "POST /upload-pdf": {
198
+ "description": "Upload PDF file (text only)",
199
+ "content_type": "multipart/form-data",
200
+ "body": {
201
+ "file": "UploadFile (required) - PDF file",
202
+ "title": "string (optional) - Document title",
203
+ "category": "string (optional) - Category",
204
+ "description": "string (optional) - Description"
 
 
205
  },
206
+ "example": "curl -X POST '/upload-pdf' -F 'file=@guide.pdf' -F 'title=User Guide'"
207
+ },
208
+ "POST /upload-pdf-multimodal": {
209
+ "description": "Upload PDF with text and image URLs (RECOMMENDED for user guides)",
210
+ "content_type": "multipart/form-data",
211
+ "features": [
212
+ "Extracts text from PDF",
213
+ "Detects image URLs (http://, https://)",
214
+ "Supports markdown: ![alt](url)",
215
+ "Supports HTML: <img src='url'>",
216
+ "Links images to text chunks",
217
+ "Returns images with context in chat"
218
  ],
219
+ "body": {
220
+ "file": "UploadFile (required) - PDF file with image URLs",
221
+ "title": "string (optional) - Document title",
222
+ "category": "string (optional) - e.g. 'user_guide', 'tutorial'",
223
+ "description": "string (optional)"
224
+ },
225
+ "example": "curl -X POST '/upload-pdf-multimodal' -F 'file=@guide_with_images.pdf' -F 'category=user_guide'",
226
+ "response": {
227
+ "success": True,
228
+ "document_id": "pdf_multimodal_20251029_150000",
229
+ "chunks_indexed": 25,
230
+ "message": "PDF indexed with 25 chunks and 15 images"
231
+ },
232
+ "use_case": "Perfect for user guides with screenshots, tutorials with diagrams"
233
+ }
234
+ },
235
+ "search": {
236
+ "POST /search": {
237
+ "description": "Hybrid search with text and/or image",
238
+ "body": {
239
+ "text": "string (optional) - Query text",
240
+ "image": "UploadFile (optional) - Query image",
241
+ "limit": "int (default: 10)",
242
+ "score_threshold": "float (optional, 0-1)",
243
+ "text_weight": "float (default: 0.5)",
244
+ "image_weight": "float (default: 0.5)"
245
  }
246
  },
247
+ "POST /search/text": {
248
+ "description": "Text-only search",
249
+ "body": {"text": "string", "limit": "int", "score_threshold": "float"}
250
+ },
251
+ "POST /search/image": {
252
+ "description": "Image-only search",
253
+ "body": {"image": "UploadFile", "limit": "int", "score_threshold": "float"}
254
+ },
255
+ "POST /rag/search": {
256
+ "description": "Search in RAG knowledge base",
257
+ "body": {"query": "string", "top_k": "int (default: 5)", "score_threshold": "float (default: 0.5)"}
258
+ }
259
+ },
260
+ "chat": {
261
+ "POST /chat": {
262
+ "description": "Chat với Advanced RAG (Query expansion + Reranking + Compression)",
263
+ "content_type": "application/json",
264
+ "body": {
265
+ "message": "string (required) - User question",
266
+ "use_rag": "bool (default: true) - Enable RAG retrieval",
267
+ "use_advanced_rag": "bool (default: true) - Use advanced RAG pipeline (RECOMMENDED)",
268
+ "use_query_expansion": "bool (default: true) - Expand query with variations",
269
+ "use_reranking": "bool (default: true) - Rerank results for accuracy",
270
+ "use_compression": "bool (default: true) - Compress context to relevant parts",
271
+ "top_k": "int (default: 3) - Number of documents to retrieve",
272
+ "score_threshold": "float (default: 0.5) - Min relevance score (0-1)",
273
+ "max_tokens": "int (default: 512) - Max response tokens",
274
+ "temperature": "float (default: 0.7) - Creativity (0-1)",
275
+ "hf_token": "string (optional) - Hugging Face token"
276
  },
277
  "response": {
278
+ "response": "string - AI answer",
279
+ "context_used": "array - Retrieved documents with metadata",
280
+ "timestamp": "string",
281
+ "rag_stats": "object - RAG pipeline statistics (query variants, retrieval counts)"
282
+ },
283
+ "example_advanced": {
284
+ "message": "Làm sao để upload PDF có hình ảnh?",
285
+ "use_advanced_rag": True,
286
+ "use_reranking": True,
287
+ "top_k": 5,
288
+ "score_threshold": 0.5
289
  },
290
+ "example_response_with_images": {
291
+ "response": "Để upload PDF có hình ảnh, sử dụng endpoint /upload-pdf-multimodal...",
292
+ "context_used": [
293
  {
294
+ "id": "pdf_multimodal_...._p2_c1",
295
+ "confidence": 0.89,
296
+ "metadata": {
297
+ "text": "Bước 1: Chuẩn bị PDF với image URLs...",
298
+ "has_images": True,
299
+ "image_urls": [
300
+ "https://example.com/screenshot1.png",
301
+ "https://example.com/diagram.jpg"
302
+ ],
303
+ "num_images": 2,
304
+ "page": 2
305
+ }
306
  }
307
  ],
308
+ "rag_stats": {
309
+ "original_query": "Làm sao để upload PDF có hình ảnh?",
310
+ "expanded_queries": ["upload PDF hình ảnh", "PDF có ảnh"],
311
+ "initial_results": 10,
312
+ "after_rerank": 5,
313
+ "after_compression": 5
 
 
 
314
  }
315
  },
316
+ "notes": [
317
+ "Advanced RAG significantly improves answer quality",
318
+ "When multimodal PDF is used, images are returned in metadata",
319
+ "Requires HUGGINGFACE_TOKEN for actual LLM generation"
320
+ ]
321
+ },
322
+ "GET /history": {
323
+ "description": "Get chat history",
324
+ "query_params": {"limit": "int (default: 10)", "skip": "int (default: 0)"},
325
+ "response": {"history": "array", "total": "int"}
326
+ }
327
+ },
328
+ "management": {
329
+ "GET /documents/pdf": {
330
+ "description": "List all PDF documents",
331
+ "response": {"documents": "array", "total": "int"}
332
+ },
333
+ "DELETE /documents/pdf/{document_id}": {
334
+ "description": "Delete PDF and all its chunks",
335
+ "response": {"success": "bool", "message": "string"}
336
+ },
337
+ "GET /document/{doc_id}": {
338
+ "description": "Get document by ID",
339
+ "response": {"success": "bool", "data": "object"}
340
+ },
341
+ "DELETE /delete/{doc_id}": {
342
+ "description": "Delete document by ID",
343
+ "response": {"success": "bool", "message": "string"}
344
+ },
345
+ "GET /stats": {
346
+ "description": "Get Qdrant collection statistics",
347
+ "response": {"vectors_count": "int", "segments": "int", "indexed_vectors_count": "int"}
348
  }
349
  }
350
  },
351
+ "quick_start": {
352
+ "1_upload_multimodal_pdf": "curl -X POST '/upload-pdf-multimodal' -F 'file=@user_guide.pdf' -F 'title=Guide'",
353
+ "2_verify_upload": "curl '/documents/pdf'",
354
+ "3_chat_with_rag": "curl -X POST '/chat' -H 'Content-Type: application/json' -d '{\"message\": \"How to...?\", \"use_advanced_rag\": true}'",
355
+ "4_see_images_in_context": "response['context_used'][0]['metadata']['image_urls']"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
356
  },
357
+ "use_cases": {
358
+ "user_guide_with_screenshots": {
359
+ "endpoint": "/upload-pdf-multimodal",
360
+ "description": "PDFs with text instructions + image URLs for visual guidance",
361
+ "benefits": ["Images linked to text chunks", "Chatbot returns relevant screenshots", "Perfect for step-by-step guides"]
362
+ },
363
+ "simple_text_docs": {
364
+ "endpoint": "/upload-pdf",
365
+ "description": "Simple PDFs with text only (FAQ, policies, etc.)"
366
+ },
367
+ "social_media_posts": {
368
+ "endpoint": "/index",
369
+ "description": "Index multiple posts with texts (up to 10) and images (up to 10)"
370
+ },
371
+ "complex_queries": {
372
+ "endpoint": "/chat",
373
+ "description": "Use advanced RAG for better accuracy on complex questions",
374
+ "settings": {"use_advanced_rag": True, "use_reranking": True, "use_compression": True}
375
+ }
376
  },
377
+ "best_practices": {
378
+ "pdf_format": [
379
+ "Include image URLs in text (http://, https://)",
380
+ "Use markdown format: ![alt](url) or HTML: <img src='url'>",
381
+ "Clear structure with headings and sections",
382
+ "Link images close to their related text"
383
+ ],
384
+ "chat_settings": {
385
+ "for_accuracy": {"temperature": 0.3, "use_advanced_rag": True, "use_reranking": True},
386
+ "for_creativity": {"temperature": 0.8, "use_advanced_rag": False},
387
+ "for_factual_answers": {"temperature": 0.3, "use_compression": True, "score_threshold": 0.6}
388
+ },
389
+ "retrieval_tuning": {
390
+ "not_finding_info": "Lower score_threshold to 0.3-0.4, increase top_k to 7-10",
391
+ "too_much_context": "Increase score_threshold to 0.6-0.7, decrease top_k to 3-5",
392
+ "slow_responses": "Disable compression, use basic RAG, decrease top_k"
393
+ }
394
  },
395
  "links": {
396
  "docs": "http://localhost:8000/docs",
397
  "redoc": "http://localhost:8000/redoc",
398
+ "openapi": "http://localhost:8000/openapi.json",
399
+ "guides": {
400
+ "multimodal_pdf": "See MULTIMODAL_PDF_GUIDE.md",
401
+ "advanced_rag": "See ADVANCED_RAG_GUIDE.md",
402
+ "pdf_general": "See PDF_RAG_GUIDE.md",
403
+ "quick_start": "See QUICK_START_PDF.md"
404
+ }
405
+ },
406
+ "system_info": {
407
+ "embedding_model": "Jina CLIP v2 (multimodal)",
408
+ "vector_db": "Qdrant with HNSW index",
409
+ "document_db": "MongoDB",
410
+ "rag_pipeline": "Advanced RAG with query expansion, reranking, compression",
411
+ "pdf_parser": "pypdfium2 with URL extraction",
412
+ "max_inputs": "10 texts + 10 images per /index request"
413
  }
414
  }
415
 
416
  @app.post("/index", response_model=IndexResponse)
417
  async def index_data(
418
  id: str = Form(...),
419
+ texts: Optional[List[str]] = Form(None),
420
+ images: Optional[List[UploadFile]] = File(None)
421
  ):
422
  """
423
+ Index data vào vector database (hỗ trợ nhiều texts và images)
424
 
425
  Body:
426
  - id: Document ID (event ID, post ID, etc.)
427
+ - texts: List of text contents (tiếng Việt supported) - Tối đa 10 texts
428
+ - images: List of image files (optional) - Tối đa 10 images
429
 
430
  Returns:
431
  - success: True/False
 
433
  - message: Status message
434
  """
435
  try:
436
+ # Validation
437
+ if texts is None and images is None:
438
+ raise HTTPException(status_code=400, detail="Phải cung cấp ít nhất texts hoặc images")
439
 
440
+ if texts and len(texts) > 10:
441
+ raise HTTPException(status_code=400, detail="Tối đa 10 texts")
 
442
 
443
+ if images and len(images) > 10:
444
+ raise HTTPException(status_code=400, detail="Tối đa 10 images")
445
+
446
+ # Prepare embeddings
447
+ text_embeddings = []
448
+ image_embeddings = []
449
+
450
+ # Encode multiple texts (tiếng Việt)
451
+ if texts:
452
+ for text in texts:
453
+ if text and text.strip():
454
+ text_emb = embedding_service.encode_text(text)
455
+ text_embeddings.append(text_emb)
456
+
457
+ # Encode multiple images
458
+ if images:
459
+ for image in images:
460
+ if image.filename: # Check if image is provided
461
+ image_bytes = await image.read()
462
+ pil_image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
463
+ image_emb = embedding_service.encode_image(pil_image)
464
+ image_embeddings.append(image_emb)
465
 
466
  # Combine embeddings
467
+ all_embeddings = []
468
+
469
+ if text_embeddings:
470
+ # Average all text embeddings
471
+ avg_text_embedding = np.mean(text_embeddings, axis=0)
472
+ all_embeddings.append(avg_text_embedding)
473
+
474
+ if image_embeddings:
475
+ # Average all image embeddings
476
+ avg_image_embedding = np.mean(image_embeddings, axis=0)
477
+ all_embeddings.append(avg_image_embedding)
478
+
479
+ if not all_embeddings:
480
+ raise HTTPException(status_code=400, detail="Không có embedding nào được tạo từ texts hoặc images")
481
+
482
+ # Final combined embedding
483
+ combined_embedding = np.mean(all_embeddings, axis=0)
484
 
485
  # Normalize
486
  combined_embedding = combined_embedding / np.linalg.norm(combined_embedding, axis=1, keepdims=True)
487
 
488
  # Index vào Qdrant
489
  metadata = {
490
+ "texts": texts if texts else [],
491
+ "text_count": len(texts) if texts else 0,
492
+ "image_count": len(images) if images else 0,
493
+ "image_filenames": [img.filename for img in images] if images else []
494
  }
495
 
496
  result = qdrant_service.index_data(
 
502
  return IndexResponse(
503
  success=True,
504
  id=result["original_id"], # Trả về MongoDB ObjectId
505
+ message=f"Đã index thành công document {result['original_id']} với {len(texts) if texts else 0} texts và {len(images) if images else 0} images (Qdrant UUID: {result['qdrant_id']})"
506
  )
507
 
508
+ except HTTPException:
509
+ raise
510
  except Exception as e:
511
  raise HTTPException(status_code=500, detail=f"Lỗi khi index: {str(e)}")
512
 
 
729
  @app.post("/chat", response_model=ChatResponse)
730
  async def chat(request: ChatRequest):
731
  """
732
+ Chat endpoint với Advanced RAG
733
 
734
  Body:
735
  - message: User message
 
739
  - max_tokens: Max tokens for response (default: 512)
740
  - temperature: Temperature for generation (default: 0.7)
741
  - hf_token: Hugging Face token (optional, sẽ dùng env nếu không truyền)
742
+ - use_advanced_rag: Use advanced RAG pipeline (default: true)
743
+ - use_query_expansion: Enable query expansion (default: true)
744
+ - use_reranking: Enable reranking (default: true)
745
+ - use_compression: Enable context compression (default: true)
746
+ - score_threshold: Minimum relevance score (default: 0.5)
747
 
748
  Returns:
749
  - response: Generated response
750
  - context_used: Retrieved context documents
751
  - timestamp: Response timestamp
752
+ - rag_stats: Statistics from RAG pipeline
753
  """
754
  try:
755
  # Retrieve context if RAG enabled
756
  context_used = []
757
+ rag_stats = None
758
+
759
  if request.use_rag:
760
+ if request.use_advanced_rag:
761
+ # Use Advanced RAG Pipeline
762
+ documents, stats = advanced_rag.hybrid_rag_pipeline(
763
+ query=request.message,
764
+ top_k=request.top_k,
765
+ score_threshold=request.score_threshold,
766
+ use_reranking=request.use_reranking,
767
+ use_compression=request.use_compression,
768
+ max_context_tokens=500
769
+ )
770
+
771
+ # Convert to dict format for compatibility
772
+ context_used = [
773
+ {
774
+ "id": doc.id,
775
+ "confidence": doc.confidence,
776
+ "metadata": doc.metadata
777
+ }
778
+ for doc in documents
779
+ ]
780
+ rag_stats = stats
781
+
782
+ # Format context using advanced RAG formatter
783
+ context_text = advanced_rag.format_context_for_llm(documents)
784
+
785
+ else:
786
+ # Use basic RAG (original implementation)
787
+ query_embedding = embedding_service.encode_text(request.message)
788
+
789
+ results = qdrant_service.search(
790
+ query_embedding=query_embedding,
791
+ limit=request.top_k,
792
+ score_threshold=request.score_threshold
793
+ )
794
+ context_used = results
795
+
796
+ # Build context text (basic format)
797
+ context_text = "\n\nRelevant Context:\n"
798
+ for i, doc in enumerate(context_used, 1):
799
+ doc_text = doc["metadata"].get("text", "")
800
+ confidence = doc["confidence"]
801
+ context_text += f"\n[{i}] (Confidence: {confidence:.2f})\n{doc_text}\n"
802
+
803
+ # Build system message with context
804
+ if request.use_rag and context_used:
805
+ if request.use_advanced_rag:
806
+ # Use advanced prompt builder
807
+ system_message = advanced_rag.build_rag_prompt(
808
+ query=request.message,
809
+ context=context_text,
810
+ system_message=request.system_message
811
+ )
812
+ else:
813
+ # Basic prompt
814
+ system_message = f"{request.system_message}\n{context_text}\n\nPlease use the above context to answer the user's question when relevant."
815
  else:
816
  system_message = request.system_message
817
 
 
875
  return ChatResponse(
876
  response=response,
877
  context_used=context_used,
878
+ timestamp=datetime.utcnow().isoformat(),
879
+ rag_stats=rag_stats
880
  )
881
 
882
  except Exception as e:
 
1036
  raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
1037
 
1038
 
1039
@app.post("/upload-pdf", response_model=UploadPDFResponse)
async def upload_pdf(
    file: UploadFile = File(...),
    document_id: Optional[str] = Form(None),
    title: Optional[str] = Form(None),
    description: Optional[str] = Form(None),
    category: Optional[str] = Form(None)
):
    """
    Upload and index a PDF file into the knowledge base.

    Body (multipart/form-data):
        file: PDF file (required)
        document_id: Custom document ID (optional, auto-generated if not provided)
        title: Document title (optional)
        description: Document description (optional)
        category: Document category (optional, e.g., "user_guide", "faq")

    Returns:
        UploadPDFResponse with:
        - success: True/False
        - document_id: Document ID
        - filename: Original filename
        - chunks_indexed: Number of chunks created
        - message: Status message

    Raises:
        HTTPException 400: when the uploaded file is not a PDF.
        HTTPException 500: when reading or indexing the PDF fails.

    Example:
    ```bash
    curl -X POST "http://localhost:8000/upload-pdf" \
        -F "file=@user_guide.pdf" \
        -F "title=Hướng dẫn sử dụng ChatbotRAG" \
        -F "category=user_guide"
    ```
    """
    try:
        # Validate file type. Case-insensitive so ".PDF" is accepted, and
        # None-safe: a multipart part without a filename must yield a 400,
        # not an AttributeError that surfaces as a 500.
        if not file.filename or not file.filename.lower().endswith('.pdf'):
            raise HTTPException(status_code=400, detail="Only PDF files are allowed")

        # Generate a timestamp-based document ID if the client did not
        # provide one. `datetime` is already imported at module level.
        if not document_id:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            document_id = f"pdf_{timestamp}"

        # Read the uploaded PDF into memory
        pdf_bytes = await file.read()

        # Only include metadata fields the client actually supplied
        metadata = {}
        if title:
            metadata['title'] = title
        if description:
            metadata['description'] = description
        if category:
            metadata['category'] = category

        # Chunk, embed and index the PDF into the vector store
        result = pdf_indexer.index_pdf_bytes(
            pdf_bytes=pdf_bytes,
            document_id=document_id,
            filename=file.filename,
            document_metadata=metadata
        )

        return UploadPDFResponse(
            success=True,
            document_id=result['document_id'],
            filename=result['filename'],
            chunks_indexed=result['chunks_indexed'],
            message=f"PDF '{file.filename}' đã được index thành công với {result['chunks_indexed']} chunks"
        )

    except HTTPException:
        # Re-raise FastAPI errors (e.g. the 400 above) with their status intact
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error uploading PDF: {str(e)}")
1115
+
1116
+
1117
@app.get("/documents/pdf")
async def list_pdf_documents():
    """
    List all PDF documents in the knowledge base.

    Returns:
        dict with "documents" (PDF document records, MongoDB "_id"
        projected out) and "total" (number of documents found).
    """
    try:
        # Query only PDF-typed documents; exclude the non-serializable
        # Mongo ObjectId from the response payload.
        cursor = documents_collection.find({"type": "pdf"}, {"_id": 0})
        pdf_docs = [record for record in cursor]
        return {"documents": pdf_docs, "total": len(pdf_docs)}
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"Error: {str(exc)}")
1133
+
1134
+
1135
@app.delete("/documents/pdf/{document_id}")
async def delete_pdf_document(document_id: str):
    """
    Delete a PDF document and all its chunks from the knowledge base.

    Args:
        document_id: Document ID of the PDF to remove.

    Returns:
        dict with:
        - success: True/False
        - message: Status message

    Raises:
        HTTPException 404: when no PDF document with this ID exists.
        HTTPException 500: on any other failure.
    """
    try:
        # Look up the document record to get its chunk IDs
        doc = documents_collection.find_one({"document_id": document_id, "type": "pdf"})

        if not doc:
            raise HTTPException(status_code=404, detail=f"PDF document {document_id} not found")

        # Best-effort deletion of each chunk from Qdrant. Narrow to
        # Exception (not a bare except) so KeyboardInterrupt/SystemExit
        # still propagate; a chunk missing from the index is fine.
        chunk_ids = doc.get('chunk_ids', [])
        for chunk_id in chunk_ids:
            try:
                qdrant_service.delete_by_id(chunk_id)
            except Exception:
                pass  # Chunk might already be deleted

        # Remove the document record itself from MongoDB
        documents_collection.delete_one({"document_id": document_id})

        return {
            "success": True,
            "message": f"PDF document {document_id} and {len(chunk_ids)} chunks deleted"
        }

    except HTTPException:
        # Re-raise the 404 above with its status intact
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
1174
+
1175
+
1176
@app.post("/upload-pdf-multimodal", response_model=UploadPDFResponse)
async def upload_pdf_multimodal(
    file: UploadFile = File(...),
    document_id: Optional[str] = Form(None),
    title: Optional[str] = Form(None),
    description: Optional[str] = Form(None),
    category: Optional[str] = Form(None)
):
    """
    Upload PDF with text and image URLs (for user guides with screenshots).

    This endpoint is optimized for PDFs containing:
    - Text instructions
    - Image URLs (http://... or https://...)
    - Markdown images: ![alt](url)
    - HTML images: <img src="url">

    The system will:
    1. Extract text from PDF
    2. Detect all image URLs in the text
    3. Link images to their corresponding text chunks
    4. Store image URLs in metadata
    5. Return images along with text during chat

    Body (multipart/form-data):
        file: PDF file (required)
        document_id: Custom document ID (optional, auto-generated if not provided)
        title: Document title (optional)
        description: Document description (optional)
        category: Document category (optional, e.g., "user_guide", "tutorial")

    Returns:
        UploadPDFResponse with:
        - success: True/False
        - document_id: Document ID
        - filename: Original filename
        - chunks_indexed: Number of chunks created
        - message: Status message (includes image count)

    Raises:
        HTTPException 400: when the uploaded file is not a PDF.
        HTTPException 500: when reading or indexing the PDF fails.

    Example:
    ```bash
    curl -X POST "http://localhost:8000/upload-pdf-multimodal" \
        -F "file=@user_guide_with_images.pdf" \
        -F "title=Hướng dẫn có ảnh minh họa" \
        -F "category=user_guide"
    ```

    Example Response:
    ```json
    {
        "success": true,
        "document_id": "pdf_20251029_150000",
        "filename": "user_guide_with_images.pdf",
        "chunks_indexed": 25,
        "message": "PDF 'user_guide_with_images.pdf' indexed with 25 chunks and 15 images"
    }
    ```
    """
    try:
        # Validate file type. Case-insensitive so ".PDF" is accepted, and
        # None-safe: a multipart part without a filename must yield a 400,
        # not an AttributeError that surfaces as a 500.
        if not file.filename or not file.filename.lower().endswith('.pdf'):
            raise HTTPException(status_code=400, detail="Only PDF files are allowed")

        # Generate a timestamp-based document ID if the client did not
        # provide one. `datetime` is already imported at module level.
        if not document_id:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            document_id = f"pdf_multimodal_{timestamp}"

        # Read the uploaded PDF into memory
        pdf_bytes = await file.read()

        # Mark the record as multimodal; add only the metadata fields
        # the client actually supplied.
        metadata = {'type': 'multimodal'}
        if title:
            metadata['title'] = title
        if description:
            metadata['description'] = description
        if category:
            metadata['category'] = category

        # Index the PDF with the multimodal parser (text + image URLs)
        result = multimodal_pdf_indexer.index_pdf_bytes(
            pdf_bytes=pdf_bytes,
            document_id=document_id,
            filename=file.filename,
            document_metadata=metadata
        )

        return UploadPDFResponse(
            success=True,
            document_id=result['document_id'],
            filename=result['filename'],
            chunks_indexed=result['chunks_indexed'],
            message=f"PDF '{file.filename}' indexed successfully with {result['chunks_indexed']} chunks and {result.get('images_found', 0)} images"
        )

    except HTTPException:
        # Re-raise FastAPI errors (e.g. the 400 above) with their status intact
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error uploading multimodal PDF: {str(e)}")
1276
+
1277
+
1278
  if __name__ == "__main__":
1279
  import uvicorn
1280
  uvicorn.run(