Spaces:

giveaccesstoall
/

rag-chatbot

Sleeping

rag-chatbot / process_pdf.py

Upload 6 files

be8bda0 verified 4 months ago

1.65 kB

	# process_pdf.py

	import fitz # PyMuPDF
	from sentence_transformers import SentenceTransformer
	import faiss
	import numpy as np
	import pickle

	# === Step 1: Extract text from PDF ===
	def extract_text_from_pdf(pdf_path):
	    """Return the text of every page in the PDF at ``pdf_path``, concatenated.

	    Pages are read in document order and their text is joined with no
	    separator added between pages.

	    Args:
	        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

	    Returns:
	        A single string containing the text of all pages.
	    """
	    # Use the document as a context manager so the underlying file handle
	    # is released even if extracting a page raises.
	    with fitz.open(pdf_path) as doc:
	        # join() avoids rebuilding the whole string once per page.
	        return "".join(page.get_text() for page in doc)

	# === Step 2: Chunk text ===
	def chunk_text(text, chunk_size=500):
	    """Split ``text`` into chunks of roughly ``chunk_size`` characters.

	    The text is split on newlines and consecutive lines are packed into the
	    current chunk (joined by a single space) while the chunk stays shorter
	    than ``chunk_size``. A single line longer than ``chunk_size`` is never
	    split; it becomes a chunk of its own.

	    Args:
	        text: The raw text to split.
	        chunk_size: Soft upper bound, in characters, for one chunk.

	    Returns:
	        A list of non-empty, whitespace-stripped chunks. Empty or
	        whitespace-only input yields an empty list.
	    """
	    chunks = []
	    current_chunk = ""
	    for para in text.split("\n"):
	        if len(current_chunk) + len(para) < chunk_size:
	            current_chunk += para + " "
	            continue
	        # The buffer may still be empty (first line already too long) or
	        # hold only spaces from blank lines; never emit such a chunk.
	        finished = current_chunk.strip()
	        if finished:
	            chunks.append(finished)
	        current_chunk = para + " "
	    # Strip before testing: a buffer holding only separator spaces is truthy
	    # but would otherwise produce an empty chunk.
	    tail = current_chunk.strip()
	    if tail:
	        chunks.append(tail)
	    return chunks

	# === Step 3: Embed and save ===
	def build_and_save_index(chunks, embedder, index_path="index.faiss", chunks_path="chunks.pkl"):
	    """Embed ``chunks``, index the vectors with FAISS, and save both to disk.

	    The embeddings go into a flat L2 index written to ``index_path``; the
	    chunk texts are pickled to ``chunks_path`` in the same order they were
	    added to the index.

	    Args:
	        chunks: Non-empty list of text chunks to embed.
	        embedder: Object exposing ``encode(chunks, convert_to_numpy=True)``;
	            it is expected to return a 2-D array with one row per chunk.
	        index_path: Destination file for the FAISS index.
	        chunks_path: Destination file for the pickled chunk list.

	    Raises:
	        ValueError: If ``chunks`` is empty, since there is nothing to index.
	    """
	    if not chunks:
	        # Fail early with a clear message instead of an IndexError on
	        # ``embeddings.shape[1]`` further down.
	        raise ValueError("chunks is empty; nothing to embed or index")

	    # FAISS works on contiguous float32 matrices; coerce in case the
	    # embedder returns another dtype or a non-contiguous view.
	    embeddings = np.ascontiguousarray(
	        embedder.encode(chunks, convert_to_numpy=True), dtype=np.float32
	    )
	    dimension = embeddings.shape[1]
	    index = faiss.IndexFlatL2(dimension)
	    index.add(embeddings)

	    faiss.write_index(index, index_path)
	    # NOTE: pickle files must only ever be loaded back from trusted sources.
	    with open(chunks_path, "wb") as f:
	        pickle.dump(chunks, f)

	    print(f"✅ Saved FAISS index to {index_path}")
	    print(f"✅ Saved chunks to {chunks_path}")

	# === Run Once ===
	if __name__ == "__main__":
	    import sys

	    # The PDF path may be given as the first command-line argument;
	    # without one the script falls back to "input.pdf" as before.
	    pdf_path = sys.argv[1] if len(sys.argv) > 1 else "input.pdf"
	    raw_text = extract_text_from_pdf(pdf_path)
	    chunks = chunk_text(raw_text)

	    # Load the sentence-embedding model, then embed and persist the chunks.
	    embedder = SentenceTransformer("all-MiniLM-L6-v2")
	    build_and_save_index(chunks, embedder)