# rag-chatbot / process_pdf.py
# giveaccesstoall's picture
# Upload 6 files
# be8bda0 verified
# process_pdf.py
import fitz # PyMuPDF
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pickle
# === Step 1: Extract text from PDF ===
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text()`` output joined
        with no extra separator between pages.
    """
    # Open the document as a context manager so its file handle is released
    # even when a page fails to extract.
    with fitz.open(pdf_path) as doc:
        # join() avoids repeatedly re-allocating a growing string.
        return "".join(page.get_text() for page in doc)
# === Step 2: Chunk text ===
def chunk_text(text, chunk_size=500):
    """Group the lines of *text* into chunks of roughly *chunk_size* characters.

    The text is split on newlines and the pieces are appended, separated by a
    single space, to the current chunk while the chunk length plus the next
    piece stays below ``chunk_size``. Otherwise the current chunk is closed
    and the piece starts a new one. A single piece longer than ``chunk_size``
    is kept whole, so chunks may exceed the limit.

    Args:
        text: The text to split.
        chunk_size: Soft upper bound on chunk length, in characters.

    Returns:
        A list of whitespace-stripped, non-empty chunk strings. Empty or
        whitespace-only input yields an empty list.
    """
    chunks = []
    current_chunk = ""
    for para in text.split("\n"):
        if len(current_chunk) + len(para) < chunk_size:
            current_chunk += para + " "
            continue
        # The buffer can be empty (first piece already too long) or contain
        # only spaces (a run of blank lines); never emit such chunks.
        finished = current_chunk.strip()
        if finished:
            chunks.append(finished)
        current_chunk = para + " "

    # Flush whatever is left, again skipping whitespace-only remainders.
    tail = current_chunk.strip()
    if tail:
        chunks.append(tail)
    return chunks
# === Step 3: Embed and save ===
def build_and_save_index(chunks, embedder, index_path="index.faiss", chunks_path="chunks.pkl"):
    """Embed *chunks*, store the vectors in a FAISS index, and save both to disk.

    Args:
        chunks: Sequence of text chunks to embed; must not be empty.
        embedder: Object exposing ``encode(chunks, convert_to_numpy=True)``
            that returns a 2-D array with one row per chunk.
        index_path: Destination file for the FAISS index.
        chunks_path: Destination file for the pickled chunk list.

    Raises:
        ValueError: If *chunks* is empty, since there is nothing to index.
    """
    if not chunks:
        # Without this guard the failure surfaces later as an opaque
        # IndexError when reading the embedding dimension.
        raise ValueError("Cannot build an index from an empty list of chunks")

    embeddings = embedder.encode(chunks, convert_to_numpy=True)
    # FAISS expects C-contiguous float32 input; this is a no-op when the
    # embedder already returns that layout.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    faiss.write_index(index, index_path)

    # The chunks are stored alongside the index so that row i of the index
    # can be mapped back to chunks[i] by whoever loads both files.
    with open(chunks_path, "wb") as f:
        pickle.dump(chunks, f)

    print(f"✅ Saved FAISS index to {index_path}")
    print(f"✅ Saved chunks to {chunks_path}")
# === Run Once ===
if __name__ == "__main__":
    import sys

    # Take the PDF path from the first command-line argument when given;
    # otherwise keep the original default so existing invocations still work.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else "input.pdf"

    # Pipeline: extract text -> chunk it -> embed and persist index + chunks.
    raw_text = extract_text_from_pdf(pdf_path)
    chunks = chunk_text(raw_text)
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    build_and_save_index(chunks, embedder)