Spaces:
Running
Running
| """ | |
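Batch script to index PDF files into the RAG knowledge base.

Usage: python batch_index_pdfs.py <pdf_directory> [--category=<category>] [--force]

Options:
    --category=<category>  Category stored with each PDF (default: user_guide)
    --force                Reindex PDFs that are already indexed

Environment variables read: MONGODB_URI, MONGODB_DB_NAME, COLLECTION_NAME.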
| """ | |
| import os | |
| import sys | |
| from pathlib import Path | |
| from pymongo import MongoClient | |
| from embedding_service import JinaClipEmbeddingService | |
| from qdrant_service import QdrantVectorService | |
| from pdf_parser import PDFIndexer | |
def index_pdf_directory(
    pdf_dir: str,
    category: str = "user_guide",
    force: bool = False
) -> None:
    """
    Index all PDF files in a directory into the RAG knowledge base.

    Each PDF is given the document ID ``pdf_<file stem>``. Unless ``force``
    is set, a PDF whose ID already exists in the MongoDB ``documents``
    collection is skipped. A failure while indexing one PDF is reported and
    counted, and processing continues with the next file. A summary of the
    indexed / skipped / failed counts is printed at the end.

    Configuration is read from the environment:
        MONGODB_URI: MongoDB connection string (required; credentials must
            never be hard-coded in this file).
        MONGODB_DB_NAME: Database name (default: "chatbot_rag").
        COLLECTION_NAME: Collection name passed to the Qdrant service
            (default: "event_social_media").

    Args:
        pdf_dir: Directory containing PDF files
        category: Category for the PDFs (default: "user_guide")
        force: Force reindex even if already indexed (default: False)

    Raises:
        RuntimeError: If MONGODB_URI is not set in the environment.
    """
    print("=" * 60)
    print("PDF Batch Indexer")
    print("=" * 60)

    # Fail fast, before any service is constructed. The connection string
    # carries credentials, so it must come from the environment rather than
    # from a default baked into the source.
    mongodb_uri = os.getenv("MONGODB_URI")
    if not mongodb_uri:
        raise RuntimeError(
            "MONGODB_URI environment variable is not set; "
            "export it with your MongoDB connection string before running."
        )

    # Initialize services (same as main.py)
    print("\n[1/5] Initializing services...")
    embedding_service = JinaClipEmbeddingService(model_path="jinaai/jina-clip-v2")
    collection_name = os.getenv("COLLECTION_NAME", "event_social_media")
    qdrant_service = QdrantVectorService(
        collection_name=collection_name,
        vector_size=embedding_service.get_embedding_dimension()
    )

    # MongoDB
    mongo_client = MongoClient(mongodb_uri)
    try:
        db = mongo_client[os.getenv("MONGODB_DB_NAME", "chatbot_rag")]
        documents_collection = db["documents"]

        # Initialize PDF indexer
        pdf_indexer = PDFIndexer(
            embedding_service=embedding_service,
            qdrant_service=qdrant_service,
            documents_collection=documents_collection
        )
        print("[OK] Services initialized")

        # Find all PDF files. The suffix is compared case-insensitively so
        # "REPORT.PDF" is picked up on case-sensitive filesystems too, and
        # the list is sorted so runs are reproducible.
        print(f"\n[2/5] Scanning directory: {pdf_dir}")
        pdf_files = sorted(
            path
            for path in Path(pdf_dir).glob("*")
            if path.is_file() and path.suffix.lower() == ".pdf"
        )
        if not pdf_files:
            print("[FAIL] No PDF files found in directory")
            return
        total = len(pdf_files)
        print(f"[OK] Found {total} PDF file(s)")

        # Index each PDF
        print("\n[3/5] Indexing PDFs...")
        indexed_count = 0
        skipped_count = 0
        error_count = 0
        for i, pdf_path in enumerate(pdf_files, 1):
            print(f"\n--- [{i}/{total}] Processing: {pdf_path.name} ---")

            # Generate document ID
            doc_id = f"pdf_{pdf_path.stem}"

            # Check if already indexed
            if not force:
                existing = documents_collection.find_one({"document_id": doc_id})
                if existing is not None:
                    print("[SKIP] Already indexed (use --force to reindex)")
                    skipped_count += 1
                    continue

            metadata = {
                'title': pdf_path.stem.replace('_', ' ').title(),
                'category': category,
                'source_file': str(pdf_path)
            }
            # Deliberately broad: one bad PDF must not abort the whole batch.
            try:
                result = pdf_indexer.index_pdf(
                    pdf_path=str(pdf_path),
                    document_id=doc_id,
                    document_metadata=metadata
                )
                print(f"[OK] Indexed: {result['chunks_indexed']} chunks")
                indexed_count += 1
            except Exception as e:
                print(f"[FAIL] Error: {e}")
                error_count += 1

        # Summary
        print("\n" + "=" * 60)
        print("SUMMARY")
        print("=" * 60)
        print(f"Total PDFs found: {total}")
        print(f"[OK] Successfully indexed: {indexed_count}")
        print(f"[SKIP] Skipped (already indexed): {skipped_count}")
        print(f"[FAIL] Errors: {error_count}")
        if indexed_count > 0:
            print("\n[OK] Knowledge base updated successfully!")
            print("You can now chat with your chatbot about the content in these PDFs.")
    finally:
        # Release the MongoDB connection pool on every exit path.
        mongo_client.close()
def main():
    """Main entry point.

    Reads the command line from ``sys.argv``: the first argument is the PDF
    directory, optionally followed by ``--category=<category>`` and/or
    ``--force``. Exits with status 1 when the directory argument is missing
    or does not name an existing directory. Unrecognized options are
    reported on stderr and otherwise ignored.
    """
    if len(sys.argv) < 2:
        print("Usage: python batch_index_pdfs.py <pdf_directory> [--category=<category>] [--force]")
        print("\nExample:")
        print(" python batch_index_pdfs.py ./docs/guides")
        print(" python batch_index_pdfs.py ./docs/guides --category=user_guide --force")
        sys.exit(1)

    pdf_dir = sys.argv[1]
    if not os.path.isdir(pdf_dir):
        print(f"Error: Directory not found: {pdf_dir}")
        sys.exit(1)

    # Parse options
    category = "user_guide"
    force = False
    for arg in sys.argv[2:]:
        if arg.startswith("--category="):
            # Split on the first "=" only, so a value that itself contains
            # "=" (e.g. --category=a=b) is kept intact.
            category = arg.partition("=")[2]
        elif arg == "--force":
            force = True
        else:
            # Surface typos such as "--forse" instead of silently dropping them.
            print(f"Warning: ignoring unrecognized option: {arg}", file=sys.stderr)

    # Index PDFs
    index_pdf_directory(pdf_dir, category=category, force=force)
# Run the command-line entry point only when this file is executed as a
# script; importing it as a module has no side effects.
if __name__ == "__main__":
    main()