File size: 4,519 Bytes
500cf95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
"""
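Batch script to index PDF files into the RAG knowledge base.

Usage: python batch_index_pdfs.py <pdf_directory> [--category=<category>] [--force]

Connection settings are read from the MONGODB_URI, MONGODB_DB_NAME and
COLLECTION_NAME environment variables.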
"""

import os
import sys
from pathlib import Path
from pymongo import MongoClient
from embedding_service import JinaClipEmbeddingService
from qdrant_service import QdrantVectorService
from pdf_parser import PDFIndexer


def index_pdf_directory(
    pdf_dir: str,
    category: str = "user_guide",
    force: bool = False
) -> None:
    """
    Index all PDF files in a directory into the RAG knowledge base.

    Every regular file directly inside ``pdf_dir`` whose extension is
    ``.pdf`` (matched case-insensitively, no recursion) is passed to
    ``PDFIndexer.index_pdf`` under the document ID ``pdf_<file stem>``.
    Files are processed in sorted order. Progress and a final summary are
    printed to stdout. A failure on one file is reported and counted, and
    processing continues with the next file.

    Configuration is read from environment variables:
        MONGODB_URI: MongoDB connection string (required).
        MONGODB_DB_NAME: Database name (default: "chatbot_rag").
        COLLECTION_NAME: Collection name given to the Qdrant service
            (default: "event_social_media").

    Args:
        pdf_dir: Directory containing PDF files
        category: Category for the PDFs (default: "user_guide")
        force: Force reindex even if already indexed (default: False)

    Raises:
        RuntimeError: If MONGODB_URI is not set or is empty.
    """
    # A connection string carries credentials, so it must come from the
    # environment and never from a default baked into the source. This is
    # checked first so a missing setting fails before the (expensive)
    # embedding model is loaded.
    mongodb_uri = os.getenv("MONGODB_URI")
    if not mongodb_uri:
        raise RuntimeError(
            "MONGODB_URI environment variable is not set; "
            "set it to your MongoDB connection string before running."
        )

    print("="*60)
    print("PDF Batch Indexer")
    print("="*60)

    # Initialize services (same as main.py)
    print("\n[1/5] Initializing services...")
    embedding_service = JinaClipEmbeddingService(model_path="jinaai/jina-clip-v2")

    collection_name = os.getenv("COLLECTION_NAME", "event_social_media")
    qdrant_service = QdrantVectorService(
        collection_name=collection_name,
        vector_size=embedding_service.get_embedding_dimension()
    )

    # MongoDB
    mongo_client = MongoClient(mongodb_uri)
    try:
        db = mongo_client[os.getenv("MONGODB_DB_NAME", "chatbot_rag")]
        documents_collection = db["documents"]

        # Initialize PDF indexer
        pdf_indexer = PDFIndexer(
            embedding_service=embedding_service,
            qdrant_service=qdrant_service,
            documents_collection=documents_collection
        )
        print("✓ Services initialized")

        # Find all PDF files. The extension is compared case-insensitively so
        # "REPORT.PDF" is not missed on case-sensitive filesystems, and the
        # result is sorted so runs are reproducible.
        print(f"\n[2/5] Scanning directory: {pdf_dir}")
        pdf_files = sorted(
            path for path in Path(pdf_dir).iterdir()
            if path.is_file() and path.suffix.lower() == ".pdf"
        )

        if not pdf_files:
            print("✗ No PDF files found in directory")
            return

        print(f"✓ Found {len(pdf_files)} PDF file(s)")

        # Index each PDF
        print("\n[3/5] Indexing PDFs...")
        indexed_count = 0
        skipped_count = 0
        error_count = 0

        for i, pdf_path in enumerate(pdf_files, 1):
            print(f"\n--- [{i}/{len(pdf_files)}] Processing: {pdf_path.name} ---")

            # Generate document ID
            doc_id = f"pdf_{pdf_path.stem}"

            # Check if already indexed
            if not force:
                existing = documents_collection.find_one({"document_id": doc_id})
                if existing:
                    print("⊘ Already indexed (use --force to reindex)")
                    skipped_count += 1
                    continue

            try:
                # Index PDF
                metadata = {
                    'title': pdf_path.stem.replace('_', ' ').title(),
                    'category': category,
                    'source_file': str(pdf_path)
                }

                result = pdf_indexer.index_pdf(
                    pdf_path=str(pdf_path),
                    document_id=doc_id,
                    document_metadata=metadata
                )

                print(f"✓ Indexed: {result['chunks_indexed']} chunks")
                indexed_count += 1

            except Exception as e:
                # Batch boundary: one bad file must not abort the whole run.
                print(f"✗ Error: {e}")
                error_count += 1

        # Summary
        print("\n" + "="*60)
        print("SUMMARY")
        print("="*60)
        print(f"Total PDFs found: {len(pdf_files)}")
        print(f"✓ Successfully indexed: {indexed_count}")
        print(f"⊘ Skipped (already indexed): {skipped_count}")
        print(f"✗ Errors: {error_count}")

        if indexed_count > 0:
            print("\n✓ Knowledge base updated successfully!")
            print("You can now chat with your chatbot about the content in these PDFs.")
    finally:
        # Release the MongoDB connection pool on every exit path.
        mongo_client.close()


def main() -> None:
    """Main entry point: parse command-line arguments and run the indexer.

    Expects ``sys.argv[1]`` to be an existing directory, optionally followed
    by ``--category=<category>`` and/or ``--force``. Prints a usage or error
    message and exits with status 1 when the directory argument is missing,
    is not a directory, or an option is not recognized.
    """
    usage = "Usage: python batch_index_pdfs.py <pdf_directory> [--category=<category>] [--force]"

    if len(sys.argv) < 2:
        print(usage)
        print("\nExample:")
        print("  python batch_index_pdfs.py ./docs/guides")
        print("  python batch_index_pdfs.py ./docs/guides --category=user_guide --force")
        sys.exit(1)

    pdf_dir = sys.argv[1]

    if not os.path.isdir(pdf_dir):
        print(f"Error: Directory not found: {pdf_dir}")
        sys.exit(1)

    # Parse options
    category = "user_guide"
    force = False

    for arg in sys.argv[2:]:
        if arg.startswith("--category="):
            # Split on the first "=" only so a value that itself contains
            # "=" is kept whole instead of being truncated.
            category = arg.split("=", 1)[1]
        elif arg == "--force":
            force = True
        else:
            # Reject typos such as "--forse" instead of silently ignoring
            # them and running without the intended option.
            print(f"Error: Unrecognized option: {arg}")
            print(usage)
            sys.exit(1)

    # Index PDFs
    index_pdf_directory(pdf_dir, category=category, force=force)


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()