# process_pdf.py
import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pickle


# === Step 1: Extract text from PDF ===
def extract_text_from_pdf(pdf_path):
    """Return the text of every page in the PDF at *pdf_path*, concatenated in page order."""
    doc = fitz.open(pdf_path)
    try:
        # str.join instead of repeated += — avoids quadratic concatenation
        # on documents with many pages.
        return "".join(page.get_text() for page in doc)
    finally:
        doc.close()  # release the underlying file handle even if extraction fails


# === Step 2: Chunk text ===
def chunk_text(text, chunk_size=500):
    """Greedily pack newline-separated paragraphs into chunks of roughly
    *chunk_size* characters.

    A paragraph longer than *chunk_size* becomes its own chunk. Empty
    chunks are never emitted.
    """
    chunks = []
    current_chunk = ""
    for para in text.split("\n"):
        if len(current_chunk) + len(para) < chunk_size:
            current_chunk += para + " "
        else:
            # Guard against flushing an empty accumulator — e.g. when the
            # very first paragraph already exceeds chunk_size, current_chunk
            # is still "" and the original code appended an empty chunk.
            if current_chunk.strip():
                chunks.append(current_chunk.strip())
            current_chunk = para + " "
    # Final flush; strip-check prevents a whitespace-only tail (" " is
    # truthy) from producing an empty chunk.
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks


# === Step 3: Embed and save ===
def build_and_save_index(chunks, embedder, index_path="index.faiss", chunks_path="chunks.pkl"):
    """Embed *chunks* with *embedder*, build a flat L2 FAISS index, and
    persist both the index (*index_path*) and the raw chunks (*chunks_path*).
    """
    embeddings = embedder.encode(chunks, convert_to_numpy=True)
    # FAISS requires contiguous float32 input; coerce defensively in case the
    # encoder ever returns float64.
    embeddings = np.ascontiguousarray(embeddings, dtype="float32")
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    faiss.write_index(index, index_path)
    with open(chunks_path, "wb") as f:
        pickle.dump(chunks, f)
    print(f"✅ Saved FAISS index to {index_path}")
    print(f"✅ Saved chunks to {chunks_path}")


# === Run Once ===
if __name__ == "__main__":
    pdf_path = "input.pdf"  # Replace with your actual PDF
    raw_text = extract_text_from_pdf(pdf_path)
    chunks = chunk_text(raw_text)
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    build_and_save_index(chunks, embedder)