# Source: Hugging Face Space minhvtt/ChatbotRAG (status: Running), commit 500cf95, file size 4,519 bytes.

"""
Batch script to index PDF files into RAG knowledge base
Usage: python batch_index_pdfs.py <pdf_directory> [options]
"""

import os
import sys
from pathlib import Path
from pymongo import MongoClient
from embedding_service import JinaClipEmbeddingService
from qdrant_service import QdrantVectorService
from pdf_parser import PDFIndexer


def index_pdf_directory(
    pdf_dir: str,
    category: str = "user_guide",
    force: bool = False
):
    """
    Index all PDF files in a directory

    Every ``*.pdf`` file directly inside ``pdf_dir`` (no recursion) is passed
    to ``PDFIndexer.index_pdf`` under the document ID ``pdf_<file stem>``.
    Files are processed in sorted order so repeated runs are reproducible.
    A failure on one file is reported and counted; the batch continues.

    Environment variables:
        MONGODB_URI: MongoDB connection string (required).
        MONGODB_DB_NAME: MongoDB database name (default: "chatbot_rag").
        COLLECTION_NAME: Vector collection name passed to
            QdrantVectorService (default: "event_social_media").

    Args:
        pdf_dir: Directory containing PDF files
        category: Category for the PDFs (default: "user_guide")
        force: Force reindex even if already indexed (default: False)

    Raises:
        RuntimeError: If the MONGODB_URI environment variable is not set.
    """
    print("="*60)
    print("PDF Batch Indexer")
    print("="*60)

    # Initialize services (same as main.py)
    print("\n[1/5] Initializing services...")

    # The connection string must come from the environment: credentials
    # embedded in source are exposed to anyone who can read the repository.
    # Checked before the embedding model is loaded so a misconfigured run
    # fails fast instead of after an expensive model initialization.
    mongodb_uri = os.getenv("MONGODB_URI")
    if not mongodb_uri:
        raise RuntimeError(
            "MONGODB_URI environment variable is not set; "
            "set it to your MongoDB connection string before running."
        )

    embedding_service = JinaClipEmbeddingService(model_path="jinaai/jina-clip-v2")

    collection_name = os.getenv("COLLECTION_NAME", "event_social_media")
    qdrant_service = QdrantVectorService(
        collection_name=collection_name,
        vector_size=embedding_service.get_embedding_dimension()
    )

    # MongoDB
    mongo_client = MongoClient(mongodb_uri)
    try:
        db = mongo_client[os.getenv("MONGODB_DB_NAME", "chatbot_rag")]
        documents_collection = db["documents"]

        # Initialize PDF indexer
        pdf_indexer = PDFIndexer(
            embedding_service=embedding_service,
            qdrant_service=qdrant_service,
            documents_collection=documents_collection
        )
        print("✓ Services initialized")

        # Find all PDF files (sorted: glob order is filesystem-dependent)
        print(f"\n[2/5] Scanning directory: {pdf_dir}")
        pdf_files = sorted(Path(pdf_dir).glob("*.pdf"))

        if not pdf_files:
            print("✗ No PDF files found in directory")
            return

        print(f"✓ Found {len(pdf_files)} PDF file(s)")

        # Index each PDF
        print("\n[3/5] Indexing PDFs...")
        indexed_count, skipped_count, error_count = _index_pdf_files(
            pdf_files=pdf_files,
            pdf_indexer=pdf_indexer,
            documents_collection=documents_collection,
            category=category,
            force=force,
        )
    finally:
        # Release the MongoDB connection pool on every exit path,
        # including the early return above and unexpected errors.
        mongo_client.close()

    # Summary
    print("\n" + "="*60)
    print("SUMMARY")
    print("="*60)
    print(f"Total PDFs found: {len(pdf_files)}")
    print(f"✓ Successfully indexed: {indexed_count}")
    print(f"⊘ Skipped (already indexed): {skipped_count}")
    print(f"✗ Errors: {error_count}")

    if indexed_count > 0:
        print("\n✓ Knowledge base updated successfully!")
        print("You can now chat with your chatbot about the content in these PDFs.")


def _index_pdf_files(pdf_files, pdf_indexer, documents_collection, category, force):
    """
    Index each PDF in turn and return (indexed, skipped, errors) counts

    Args:
        pdf_files: Paths of the PDF files to process, in processing order
        pdf_indexer: Object whose ``index_pdf`` method performs the indexing
        documents_collection: MongoDB collection queried by ``document_id``
            to detect already-indexed files
        category: Category stored in each document's metadata
        force: When True, skip the already-indexed check
    """
    indexed_count = 0
    skipped_count = 0
    error_count = 0
    total = len(pdf_files)

    for i, pdf_path in enumerate(pdf_files, 1):
        print(f"\n--- [{i}/{total}] Processing: {pdf_path.name} ---")

        # Generate document ID
        doc_id = f"pdf_{pdf_path.stem}"

        # Check if already indexed
        if not force:
            existing = documents_collection.find_one({"document_id": doc_id})
            if existing:
                print("⊘ Already indexed (use --force to reindex)")
                skipped_count += 1
                continue

        try:
            metadata = {
                'title': pdf_path.stem.replace('_', ' ').title(),
                'category': category,
                'source_file': str(pdf_path)
            }

            result = pdf_indexer.index_pdf(
                pdf_path=str(pdf_path),
                document_id=doc_id,
                document_metadata=metadata
            )

            print(f"✓ Indexed: {result['chunks_indexed']} chunks")
            indexed_count += 1

        except Exception as e:
            # Deliberately broad: one bad file must not abort the whole batch.
            print(f"✗ Error: {e}")
            error_count += 1

    return indexed_count, skipped_count, error_count


def main():
    """
    Main entry point

    Parses ``sys.argv``: the first argument is the PDF directory, followed by
    optional ``--category=<category>`` and ``--force`` flags. Exits with
    status 1 when the directory argument is missing or is not a directory.
    Unrecognized options are reported on stderr and otherwise ignored.
    """
    if len(sys.argv) < 2:
        print("Usage: python batch_index_pdfs.py <pdf_directory> [--category=<category>] [--force]")
        print("\nExample:")
        print("  python batch_index_pdfs.py ./docs/guides")
        print("  python batch_index_pdfs.py ./docs/guides --category=user_guide --force")
        sys.exit(1)

    pdf_dir = sys.argv[1]

    if not os.path.isdir(pdf_dir):
        print(f"Error: Directory not found: {pdf_dir}")
        sys.exit(1)

    # Parse options
    category = "user_guide"
    force = False
    category_prefix = "--category="

    for arg in sys.argv[2:]:
        if arg.startswith(category_prefix):
            # Take everything after the prefix so values that themselves
            # contain "=" (e.g. --category=a=b) are not truncated.
            category = arg[len(category_prefix):]
        elif arg == "--force":
            force = True
        else:
            # Surface typos such as "--forse" instead of silently dropping them.
            print(f"Warning: ignoring unrecognized option: {arg}", file=sys.stderr)

    # Index PDFs
    index_pdf_directory(pdf_dir, category=category, force=force)


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()