import os

import PyPDF2
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Sentence-embedding model used for all chunks (384-dim MiniLM).
embedder = SentenceTransformer("all-MiniLM-L6-v2")


def load_pdf(file_path):
    """Extract the text of every page of a PDF.

    Args:
        file_path: Path to a PDF file readable by PyPDF2.

    Returns:
        The concatenated page texts, each followed by a single space.
        Pages with no extractable text are skipped.
    """
    parts = []
    with open(file_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            # extract_text() is expensive — call it once per page,
            # not once to test and again to append.
            page_text = page.extract_text()
            if page_text:
                parts.append(page_text)
    # join is O(n); repeated += on a growing string is quadratic.
    return "".join(p + " " for p in parts)


def load_all_pdfs(folder="notes", chunk_size=500):
    """Load every PDF in *folder* and split its text into fixed-size chunks.

    Args:
        folder: Directory scanned (non-recursively) for ``*.pdf`` files.
        chunk_size: Number of characters per chunk (default 500).

    Returns:
        Tuple ``(all_chunks, sources)`` where ``sources[i]`` is the
        subject (filename without ``.pdf``) that ``all_chunks[i]``
        came from.
    """
    all_chunks = []
    sources = []
    # sorted() makes chunk/source order deterministic; os.listdir order
    # is platform-arbitrary and would make the index non-reproducible.
    for file in sorted(os.listdir(folder)):
        if not file.endswith(".pdf"):
            continue
        subject = file.replace(".pdf", "")
        print(f"📖 Loading {subject} ...")
        text = load_pdf(os.path.join(folder, file))
        # Naive fixed-width character chunking (no overlap).
        chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
        all_chunks.extend(chunks)
        sources.extend([subject] * len(chunks))  # keep track of subject
    return all_chunks, sources


def create_vector_store(chunks):
    """Embed *chunks* and build an exact L2 FAISS index over them.

    Args:
        chunks: Non-empty list of text chunks to embed.

    Returns:
        A ``faiss.IndexFlatL2`` containing one vector per chunk, in order.

    Raises:
        ValueError: If *chunks* is empty (FAISS would fail obscurely).
    """
    if not chunks:
        raise ValueError("No text chunks to index — check the PDF folder.")
    embeddings = embedder.encode(chunks)
    # FAISS requires contiguous float32 input; make that explicit rather
    # than relying on the encoder's default dtype.
    vectors = np.asarray(embeddings, dtype=np.float32)
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index


# Build the corpus and index at import time; downstream code relies on
# the module-level `chunks`, `sources`, and `index` globals.
chunks, sources = load_all_pdfs("notes")
index = create_vector_store(chunks)