Spaces:

Bman21
/

Medstudyeasyai

Running

Medstudyeasyai / data_loader.py

Create data_loader.py

72f5b02 verified about 1 month ago

1.42 kB

	import os
	import PyPDF2
	from sentence_transformers import SentenceTransformer
	import faiss
	import numpy as np

	# Load the sentence-embedding model once, as a module-level object, so that
	# create_vector_store() reuses the same instance on every call.
	embedder = SentenceTransformer("all-MiniLM-L6-v2")

	def load_pdf(file_path):
	    """Extract the text of every page of a PDF file.

	    Args:
	        file_path: Path of the PDF file to read.

	    Returns:
	        The text of all pages that yielded any text, each followed by a
	        single space, concatenated in page order. An empty string if no
	        page yields text.
	    """
	    with open(file_path, "rb") as f:
	        reader = PyPDF2.PdfReader(f)
	        # Call extract_text() exactly once per page: it does the actual
	        # page parsing, so calling it twice per page doubles the work.
	        # This must happen while the file is still open.
	        page_texts = [page.extract_text() for page in reader.pages]

	    # Pages without extractable text (None or "") are skipped; join once
	    # instead of growing a string with repeated "+=".
	    return "".join(f"{page_text} " for page_text in page_texts if page_text)

	def load_all_pdfs(folder="notes", chunk_size=500):
	    """Load every PDF in a folder and split its text into fixed-size chunks.

	    Args:
	        folder: Directory to scan (not recursively) for entries whose name
	            ends in ".pdf".
	        chunk_size: Maximum number of characters per chunk; must be >= 1.

	    Returns:
	        A tuple ``(all_chunks, sources)`` of two parallel lists:
	        ``all_chunks[i]`` is a text chunk and ``sources[i]`` is the name of
	        the file it came from, without the trailing ".pdf".

	    Raises:
	        ValueError: If ``chunk_size`` is smaller than 1.
	    """
	    if chunk_size < 1:
	        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

	    all_chunks = []
	    sources = []

	    # os.listdir() returns entries in arbitrary order; sort so that chunk
	    # positions (and any index built from them) are reproducible.
	    for file in sorted(os.listdir(folder)):
	        if not file.endswith(".pdf"):
	            continue

	        # Strip only the trailing extension; str.replace() would also remove
	        # a ".pdf" that occurs in the middle of the file name.
	        subject = file[: -len(".pdf")]
	        print(f"📖 Loading {subject} ...")
	        text = load_pdf(os.path.join(folder, file))

	        # Split into consecutive, non-overlapping chunks.
	        chunks = [
	            text[start:start + chunk_size]
	            for start in range(0, len(text), chunk_size)
	        ]
	        all_chunks.extend(chunks)
	        sources.extend([subject] * len(chunks))  # Keep track of subject

	    return all_chunks, sources

	def create_vector_store(chunks):
	    """Embed text chunks and build a FAISS L2 index over the embeddings.

	    Args:
	        chunks: Non-empty sequence of strings to embed.

	    Returns:
	        A ``faiss.IndexFlatL2`` holding one vector per chunk, added in the
	        same order as ``chunks``.

	    Raises:
	        ValueError: If ``chunks`` is empty.
	    """
	    if len(chunks) == 0:
	        # With no embeddings there is no vector dimension to size the index
	        # with; fail with a clear message instead of an IndexError on
	        # ``embeddings.shape[1]`` below.
	        raise ValueError("cannot build a vector store from an empty list of chunks")

	    # FAISS's Python interface works on C-contiguous float32 matrices, so
	    # make that explicit rather than relying on the encoder's output type.
	    embeddings = np.ascontiguousarray(embedder.encode(chunks), dtype=np.float32)
	    dim = embeddings.shape[1]
	    index = faiss.IndexFlatL2(dim)
	    index.add(embeddings)
	    return index

	# Load all PDFs from the "notes" folder and build the search index when this
	# module is executed. `chunks` and `sources` are parallel lists, and the
	# embeddings are added to `index` in the same order as `chunks`.
	chunks, sources = load_all_pdfs("notes")
	index = create_vector_store(chunks)