| """ | |
| PDF Parser Service for RAG Chatbot | |
| Extracts text from PDF and splits into chunks for indexing | |
| """ | |
| import pypdfium2 as pdfium | |
| from typing import List, Dict, Optional | |
| import re | |
| from dataclasses import dataclass | |
@dataclass
class PDFChunk:
    """Represents a chunk of text from a PDF"""
    text: str
    page_number: int
    chunk_index: int
    metadata: Dict
class PDFParser:
    """Parse PDF files and prepare them for RAG indexing"""

    def __init__(
        self,
        chunk_size: int = 500,    # words per chunk
        chunk_overlap: int = 50,  # words of overlap between chunks
        min_chunk_size: int = 50  # minimum words in a chunk
    ):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_chunk_size = min_chunk_size
    def extract_text_from_pdf(self, pdf_path: str) -> Dict[int, str]:
        """
        Extract text from a PDF file

        Args:
            pdf_path: Path to PDF file

        Returns:
            Dictionary mapping page number to text content
        """
        pdf_text = {}
        try:
            pdf = pdfium.PdfDocument(pdf_path)
            for page_num in range(len(pdf)):
                page = pdf[page_num]
                textpage = page.get_textpage()
                text = textpage.get_text_range()

                # Clean the raw text before storing it
                pdf_text[page_num + 1] = self._clean_text(text)  # 1-indexed pages

            pdf.close()
            return pdf_text
        except Exception as e:
            raise RuntimeError(f"Error reading PDF: {e}") from e
    def _clean_text(self, text: str) -> str:
        """Clean extracted text"""
        # Collapse runs of whitespace into single spaces
        text = re.sub(r'\s+', ' ', text)

        # Remove null bytes that can break downstream storage
        text = text.replace('\x00', '')

        return text.strip()
    def chunk_text(self, text: str, page_number: int) -> List[PDFChunk]:
        """
        Split text into overlapping chunks

        Args:
            text: Text to chunk
            page_number: Page number this text came from

        Returns:
            List of PDFChunk objects
        """
        # Split into words
        words = text.split()

        if len(words) < self.min_chunk_size:
            # Text too short to split; return it as a single chunk
            if words:
                return [PDFChunk(
                    text=text,
                    page_number=page_number,
                    chunk_index=0,
                    metadata={'page': page_number, 'chunk': 0}
                )]
            return []

        chunks = []
        chunk_index = 0
        start = 0

        while start < len(words):
            # Take up to chunk_size words starting at `start`
            end = min(start + self.chunk_size, len(words))
            chunk_words = words[start:end]

            chunks.append(PDFChunk(
                text=' '.join(chunk_words),
                page_number=page_number,
                chunk_index=chunk_index,
                metadata={
                    'page': page_number,
                    'chunk': chunk_index,
                    'start_word': start,
                    'end_word': end
                }
            ))
            chunk_index += 1

            # Stop once the final words have been emitted; otherwise the
            # overlap step below can loop forever when chunk_overlap exceeds
            # the remaining tail of the page
            if end == len(words):
                break

            # Move the window forward, keeping chunk_overlap words of context
            start = end - self.chunk_overlap

        return chunks
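    # Illustration of the overlap arithmetic above, assuming the default
    # settings (chunk_size=500, chunk_overlap=50): a 1,200-word page yields
    # chunks covering words [0, 500), [450, 950), and [900, 1200), so each
    # consecutive pair shares 50 words of context.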
    def parse_pdf(
        self,
        pdf_path: str,
        document_metadata: Optional[Dict] = None
    ) -> List[PDFChunk]:
        """
        Parse an entire PDF into chunks

        Args:
            pdf_path: Path to PDF file
            document_metadata: Additional metadata for the document

        Returns:
            List of all chunks from the PDF
        """
        # Extract text from all pages
        pages_text = self.extract_text_from_pdf(pdf_path)

        # Chunk each page
        all_chunks = []
        for page_num, text in pages_text.items():
            chunks = self.chunk_text(text, page_num)

            # Attach document-level metadata to every chunk
            if document_metadata:
                for chunk in chunks:
                    chunk.metadata.update(document_metadata)

            all_chunks.extend(chunks)

        return all_chunks
    def parse_pdf_bytes(
        self,
        pdf_bytes: bytes,
        document_metadata: Optional[Dict] = None
    ) -> List[PDFChunk]:
        """
        Parse a PDF from bytes (for uploaded files)

        Args:
            pdf_bytes: PDF file as bytes
            document_metadata: Additional metadata

        Returns:
            List of chunks
        """
        # Write the bytes to a temp file so the path-based parser can read them
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
            tmp.write(pdf_bytes)
            tmp_path = tmp.name

        try:
            return self.parse_pdf(tmp_path, document_metadata)
        finally:
            # Clean up the temp file
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)
    def get_pdf_info(self, pdf_path: str) -> Dict:
        """
        Get basic info about a PDF

        Args:
            pdf_path: Path to PDF file

        Returns:
            Dictionary with PDF information
        """
        try:
            pdf = pdfium.PdfDocument(pdf_path)
            info = {
                'num_pages': len(pdf),
                'file_path': pdf_path,
            }
            pdf.close()
            return info
        except Exception as e:
            raise RuntimeError(f"Error reading PDF info: {e}") from e
class PDFIndexer:
    """Index PDF chunks into the RAG system"""

    def __init__(self, embedding_service, qdrant_service, documents_collection):
        self.embedding_service = embedding_service
        self.qdrant_service = qdrant_service
        self.documents_collection = documents_collection
        self.parser = PDFParser()
    def index_pdf(
        self,
        pdf_path: str,
        document_id: str,
        document_metadata: Optional[Dict] = None
    ) -> Dict:
        """
        Index an entire PDF into the RAG system

        Args:
            pdf_path: Path to PDF file
            document_id: Unique ID for this document
            document_metadata: Additional metadata (title, author, etc.)

        Returns:
            Indexing results
        """
        # Parse PDF
        chunks = self.parser.parse_pdf(pdf_path, document_metadata)

        # Index each chunk
        indexed_count = 0
        chunk_ids = []

        for chunk in chunks:
            # Generate a unique ID for the chunk, e.g. "doc42_p3_c1"
            chunk_id = f"{document_id}_p{chunk.page_number}_c{chunk.chunk_index}"

            # Generate embedding
            embedding = self.embedding_service.encode_text(chunk.text)

            # Prepare metadata
            metadata = {
                'text': chunk.text,
                'document_id': document_id,
                'page': chunk.page_number,
                'chunk_index': chunk.chunk_index,
                'source': 'pdf',
                **chunk.metadata
            }

            # Index to Qdrant
            self.qdrant_service.index_data(
                doc_id=chunk_id,
                embedding=embedding,
                metadata=metadata
            )

            chunk_ids.append(chunk_id)
            indexed_count += 1

        # Save document info to MongoDB
        doc_info = {
            'document_id': document_id,
            'type': 'pdf',
            'file_path': pdf_path,
            'num_chunks': indexed_count,
            'chunk_ids': chunk_ids,
            'metadata': document_metadata or {},
            'pdf_info': self.parser.get_pdf_info(pdf_path)
        }
        self.documents_collection.insert_one(doc_info)

        return {
            'success': True,
            'document_id': document_id,
            'chunks_indexed': indexed_count,
            'chunk_ids': chunk_ids[:5]  # Return the first 5 as a sample
        }
    def index_pdf_bytes(
        self,
        pdf_bytes: bytes,
        document_id: str,
        filename: str,
        document_metadata: Optional[Dict] = None
    ) -> Dict:
        """
        Index a PDF from bytes (for uploaded files)

        Args:
            pdf_bytes: PDF file as bytes
            document_id: Unique ID for this document
            filename: Original filename
            document_metadata: Additional metadata

        Returns:
            Indexing results
        """
        # Parse PDF (copy the caller's metadata so we don't mutate it)
        doc_metadata = dict(document_metadata or {})
        doc_metadata['filename'] = filename

        chunks = self.parser.parse_pdf_bytes(pdf_bytes, doc_metadata)

        # Index each chunk
        indexed_count = 0
        chunk_ids = []

        for chunk in chunks:
            # Generate a unique ID for the chunk
            chunk_id = f"{document_id}_p{chunk.page_number}_c{chunk.chunk_index}"

            # Generate embedding
            embedding = self.embedding_service.encode_text(chunk.text)

            # Prepare per-chunk metadata, kept separate from doc_metadata
            chunk_metadata = {
                'text': chunk.text,
                'document_id': document_id,
                'page': chunk.page_number,
                'chunk_index': chunk.chunk_index,
                'source': 'pdf',
                'filename': filename,
                **chunk.metadata
            }

            # Index to Qdrant
            self.qdrant_service.index_data(
                doc_id=chunk_id,
                embedding=embedding,
                metadata=chunk_metadata
            )

            chunk_ids.append(chunk_id)
            indexed_count += 1

        # Save document info to MongoDB
        doc_info = {
            'document_id': document_id,
            'type': 'pdf',
            'filename': filename,
            'num_chunks': indexed_count,
            'chunk_ids': chunk_ids,
            'metadata': doc_metadata
        }
        self.documents_collection.insert_one(doc_info)

        return {
            'success': True,
            'document_id': document_id,
            'filename': filename,
            'chunks_indexed': indexed_count,
            'chunk_ids': chunk_ids[:5]
        }
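# A minimal wiring sketch for PDFIndexer. The stand-in classes below are
# hypothetical placeholders for the project's real embedding, Qdrant, and
# MongoDB services; the indexer only assumes they expose encode_text(),
# index_data(), and insert_one() as used above.
if __name__ == "__main__":
    class _FakeEmbedding:
        def encode_text(self, text):
            return [0.0] * 384  # placeholder vector; a real service returns model embeddings

    class _FakeQdrant:
        def index_data(self, doc_id, embedding, metadata):
            print(f"indexed {doc_id} ({len(metadata['text'])} chars)")

    class _FakeCollection:
        def insert_one(self, doc):
            print(f"saved doc info for {doc['document_id']}")

    indexer = PDFIndexer(_FakeEmbedding(), _FakeQdrant(), _FakeCollection())
    # indexer.index_pdf("example.pdf", document_id="doc42")  # hypothetical path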