Spaces:
Running
Running
File size: 4,519 Bytes
500cf95 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 |
"""
Batch script to index PDF files into the RAG knowledge base.

Usage: python batch_index_pdfs.py <pdf_directory> [--category=<category>] [--force]
"""
import os
import sys
from pathlib import Path
from pymongo import MongoClient
from embedding_service import JinaClipEmbeddingService
from qdrant_service import QdrantVectorService
from pdf_parser import PDFIndexer
def index_pdf_directory(
    pdf_dir: str,
    category: str = "user_guide",
    force: bool = False
):
    """
    Index all PDF files in a directory into the RAG knowledge base.

    Every PDF directly inside ``pdf_dir`` (extension matched
    case-insensitively, processed in sorted order) is indexed under the
    document ID ``pdf_<file stem>``. Unless ``force`` is set, a PDF whose
    document ID already exists in the MongoDB ``documents`` collection is
    skipped. A failure on one PDF is reported and counted; the remaining
    files are still processed. A summary is printed at the end.

    Environment variables read:
        MONGODB_URI: MongoDB connection string (required).
        MONGODB_DB_NAME: MongoDB database name (default: "chatbot_rag").
        COLLECTION_NAME: Collection name passed to the Qdrant service
            (default: "event_social_media").

    Args:
        pdf_dir: Directory containing PDF files
        category: Category stored in each PDF's metadata
            (default: "user_guide")
        force: Force reindex even if already indexed (default: False)

    Raises:
        RuntimeError: If MONGODB_URI is not set.
    """
    # Fail fast, before loading the embedding model. The connection string
    # carries credentials, so it must come from the environment and never
    # from a default embedded in source code.
    mongodb_uri = os.getenv("MONGODB_URI")
    if not mongodb_uri:
        raise RuntimeError(
            "MONGODB_URI environment variable is not set; "
            "set it to your MongoDB connection string before running."
        )

    print("=" * 60)
    print("PDF Batch Indexer")
    print("=" * 60)

    # Initialize services (same as main.py)
    print("\n[1/5] Initializing services...")
    embedding_service = JinaClipEmbeddingService(model_path="jinaai/jina-clip-v2")
    collection_name = os.getenv("COLLECTION_NAME", "event_social_media")
    qdrant_service = QdrantVectorService(
        collection_name=collection_name,
        vector_size=embedding_service.get_embedding_dimension()
    )

    # MongoDB
    mongo_client = MongoClient(mongodb_uri)
    try:
        db = mongo_client[os.getenv("MONGODB_DB_NAME", "chatbot_rag")]
        documents_collection = db["documents"]

        # Initialize PDF indexer
        pdf_indexer = PDFIndexer(
            embedding_service=embedding_service,
            qdrant_service=qdrant_service,
            documents_collection=documents_collection
        )
        print("✓ Services initialized")

        # Find all PDF files. The suffix is compared case-insensitively so
        # "REPORT.PDF" is picked up on case-sensitive filesystems, and the
        # result is sorted so runs are reproducible.
        print(f"\n[2/5] Scanning directory: {pdf_dir}")
        pdf_files = sorted(
            path
            for path in Path(pdf_dir).glob("*")
            if path.suffix.lower() == ".pdf" and path.is_file()
        )
        if not pdf_files:
            print("✗ No PDF files found in directory")
            return
        print(f"✓ Found {len(pdf_files)} PDF file(s)")

        # Index each PDF
        print("\n[3/5] Indexing PDFs...")
        indexed_count = 0
        skipped_count = 0
        error_count = 0
        for i, pdf_path in enumerate(pdf_files, 1):
            print(f"\n--- [{i}/{len(pdf_files)}] Processing: {pdf_path.name} ---")

            # Generate document ID
            doc_id = f"pdf_{pdf_path.stem}"

            # Check if already indexed
            if not force:
                existing = documents_collection.find_one({"document_id": doc_id})
                if existing:
                    print("⊘ Already indexed (use --force to reindex)")
                    skipped_count += 1
                    continue

            try:
                # Index PDF
                metadata = {
                    'title': pdf_path.stem.replace('_', ' ').title(),
                    'category': category,
                    'source_file': str(pdf_path)
                }
                result = pdf_indexer.index_pdf(
                    pdf_path=str(pdf_path),
                    document_id=doc_id,
                    document_metadata=metadata
                )
                print(f"✓ Indexed: {result['chunks_indexed']} chunks")
                indexed_count += 1
            except Exception as e:
                # Deliberately broad: one bad PDF must not abort the batch.
                print(f"✗ Error: {e}")
                error_count += 1

        # Summary
        print("\n" + "=" * 60)
        print("SUMMARY")
        print("=" * 60)
        print(f"Total PDFs found: {len(pdf_files)}")
        print(f"✓ Successfully indexed: {indexed_count}")
        print(f"⊘ Skipped (already indexed): {skipped_count}")
        print(f"✗ Errors: {error_count}")
        if indexed_count > 0:
            print("\n✓ Knowledge base updated successfully!")
            print("You can now chat with your chatbot about the content in these PDFs.")
    finally:
        # Release the MongoDB connection on every exit path.
        mongo_client.close()
def main():
    """Command-line entry point: parse arguments and run the batch indexer.

    Expected invocation:
        python batch_index_pdfs.py <pdf_directory> [--category=<category>] [--force]

    Exits with status 1 when no directory argument is given or when the
    given path is not an existing directory. Unrecognized options are
    reported on stderr and otherwise ignored.
    """
    if len(sys.argv) < 2:
        print("Usage: python batch_index_pdfs.py <pdf_directory> [--category=<category>] [--force]")
        print("\nExample:")
        print(" python batch_index_pdfs.py ./docs/guides")
        print(" python batch_index_pdfs.py ./docs/guides --category=user_guide --force")
        sys.exit(1)

    pdf_dir = sys.argv[1]
    if not os.path.isdir(pdf_dir):
        print(f"Error: Directory not found: {pdf_dir}")
        sys.exit(1)

    # Parse options
    category = "user_guide"
    force = False
    for arg in sys.argv[2:]:
        if arg.startswith("--category="):
            # Split on the first "=" only, so a value that itself contains
            # "=" (e.g. --category=a=b) is kept intact.
            category = arg.split("=", 1)[1]
        elif arg == "--force":
            force = True
        else:
            # Surface typos such as "--forced" instead of dropping them silently.
            print(f"Warning: ignoring unrecognized option: {arg}", file=sys.stderr)

    # Index PDFs
    index_pdf_directory(pdf_dir, category=category, force=force)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|