# Medstudyeasyai / data_loader.py
# Bman21's picture
# Create data_loader.py
# 72f5b02 verified
import os
import PyPDF2
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
# Load embedding model
# The model is instantiated once, at import time, and kept as a module-level
# global; create_vector_store() uses it to embed the text chunks.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
def load_pdf(file_path):
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages that yielded any text, in page order, each
        followed by a single space. Pages for which no text could be
        extracted are skipped, so the result is "" when no page has text.
    """
    page_texts = []
    with open(file_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            # Extract once per page and reuse the result: calling
            # extract_text() a second time would parse the page again.
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)
    # Join once instead of growing a string with "+=" page by page; the
    # trailing space after each page is kept so the output is unchanged.
    return "".join(page_text + " " for page_text in page_texts)
def load_all_pdfs(folder="notes", chunk_size=500):
    """Load every PDF in a folder and split its text into fixed-size chunks.

    Args:
        folder: Directory to scan (not recursive) for PDF files.
        chunk_size: Maximum number of characters per chunk. Defaults to 500.

    Returns:
        A tuple ``(all_chunks, sources)`` of two parallel lists:
        ``all_chunks[i]`` is a piece of text of at most ``chunk_size``
        characters and ``sources[i]`` is the name (file name without the
        ".pdf" extension) of the PDF it came from. Both lists are empty when
        the folder contains no PDF with extractable text.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
        OSError: If ``folder`` cannot be listed.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    suffix = ".pdf"
    all_chunks = []
    sources = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # chunk order (and therefore the vector ids built from it) reproducible.
    for file in sorted(os.listdir(folder)):
        # Case-insensitive match so "NOTES.PDF" is picked up as well.
        if not file.lower().endswith(suffix):
            continue
        path = os.path.join(folder, file)
        # A directory may also be named "something.pdf"; only read files.
        if not os.path.isfile(path):
            continue
        # Strip only the trailing extension; str.replace() would also remove
        # any ".pdf" occurring in the middle of the name.
        subject = file[:-len(suffix)]
        # "\U0001F4D6" is the open-book emoji the log message is meant to show.
        print(f"\U0001F4D6 Loading {subject} ...")
        text = load_pdf(path)
        # Split into chunks
        chunks = [
            text[start:start + chunk_size]
            for start in range(0, len(text), chunk_size)
        ]
        all_chunks.extend(chunks)
        sources.extend([subject] * len(chunks))  # Keep track of subject
    return all_chunks, sources
def create_vector_store(chunks):
    """Embed text chunks and store the vectors in a FAISS L2 index.

    Args:
        chunks: List of text strings to embed with the module-level
            ``embedder``.

    Returns:
        A ``faiss.IndexFlatL2`` holding one vector per chunk, added in the
        order of ``chunks``. For an empty ``chunks`` list an empty index of
        the model's embedding dimension is returned.

    Raises:
        ValueError: If ``chunks`` is empty and the embedding model does not
            report its output dimension.
    """
    if len(chunks) == 0:
        # With nothing to encode there is no embedding matrix to read the
        # dimension from, so ask the model for it instead of failing on
        # embeddings.shape[1].
        dim = embedder.get_sentence_embedding_dimension()
        if dim is None:
            raise ValueError(
                "cannot build an empty index: embedding dimension is unknown"
            )
        return faiss.IndexFlatL2(dim)

    # FAISS expects a C-contiguous float32 matrix; ascontiguousarray converts
    # only when needed instead of always copying like np.array().
    embeddings = np.ascontiguousarray(embedder.encode(chunks), dtype=np.float32)
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)
    return index
# Load all PDFs
# NOTE: these statements run at import time, so importing this module reads
# and embeds every PDF found in the "notes" folder (relative to the current
# working directory).
# chunks and sources are parallel lists: sources[i] names the PDF that
# chunks[i] was cut from. index is built from chunks in that same order, so
# presumably FAISS id i corresponds to chunks[i] -- confirm against the caller.
chunks, sources = load_all_pdfs("notes")
index = create_vector_store(chunks)