Spaces:
Running
Running
import os
import PyPDF2
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
# Load the sentence-embedding model once at import time; this module-level
# instance is shared by create_vector_store below.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
def load_pdf(file_path):
    """Extract text from a PDF file.

    Args:
        file_path: Path to the PDF file to read.

    Returns:
        The concatenated text of all pages; each page that yielded text
        contributes its text followed by a single space (same output as
        before, so downstream fixed-width chunking is unaffected).
    """
    parts = []
    with open(file_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            # Call extract_text() only once per page — the original called
            # it twice (once in the condition, once in the append), doubling
            # the per-page parsing work.
            page_text = page.extract_text()
            if page_text:
                parts.append(page_text + " ")
    # join() avoids quadratic string concatenation on large PDFs.
    return "".join(parts)
def load_all_pdfs(folder="notes", chunk_size=500):
    """Load every PDF in *folder* and split its text into fixed-size chunks.

    Args:
        folder: Directory containing ``.pdf`` files.
        chunk_size: Characters per chunk (previously a hard-coded 500;
            exposed as a parameter with the same default).

    Returns:
        A ``(all_chunks, sources)`` pair of parallel lists: chunk
        ``all_chunks[i]`` was taken from the subject (file name without
        the ``.pdf`` suffix) stored at ``sources[i]``.
    """
    all_chunks = []
    sources = []
    # sorted() makes chunk/source ordering deterministic — os.listdir()
    # order is filesystem-dependent.
    for file in sorted(os.listdir(folder)):
        if not file.endswith(".pdf"):
            continue
        # Strip only the trailing ".pdf" suffix. The original used
        # file.replace(".pdf", ""), which also removed ".pdf" occurring
        # mid-name (e.g. "notes.pdf.backup.pdf").
        subject = file[: -len(".pdf")]
        print(f"π Loading {subject} ...")  # NOTE(review): "π" looks like a garbled emoji — confirm intended glyph
        text = load_pdf(os.path.join(folder, file))
        # Split into fixed-width, non-overlapping character chunks.
        chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
        all_chunks.extend(chunks)
        sources.extend([subject] * len(chunks))  # keep track of subject per chunk
    return all_chunks, sources
def create_vector_store(chunks):
    """Embed *chunks* with the module-level model and index them in FAISS.

    Args:
        chunks: Sequence of text chunks to embed.

    Returns:
        A flat L2 (exact, brute-force) FAISS index holding one vector
        per input chunk, in input order.
    """
    vectors = embedder.encode(chunks)
    dimension = vectors.shape[1]
    store = faiss.IndexFlatL2(dimension)
    store.add(np.array(vectors))
    return store
# Load all PDFs and build the vector store at import time.
# NOTE(review): these module-level names (chunks, sources, index) are
# presumably queried by retrieval code elsewhere — confirm before moving
# this under an `if __name__ == "__main__":` guard.
chunks, sources = load_all_pdfs("notes")
index = create_vector_store(chunks)