Spaces:
Sleeping
Sleeping
| # process_pdf.py | |
| import fitz # PyMuPDF | |
| from sentence_transformers import SentenceTransformer | |
| import faiss | |
| import numpy as np | |
| import pickle | |
| # === Step 1: Extract text from PDF === | |
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_path: Location of the PDF, passed unchanged to ``fitz.open``.

    Returns:
        One string made of each page's ``get_text()`` output, in page
        order, with nothing inserted between pages.
    """
    # Opening through a context manager closes the document even when a
    # page fails to extract, so the underlying file handle is not leaked.
    with fitz.open(pdf_path) as doc:
        # Join once rather than growing a string page by page.
        return "".join(page.get_text() for page in doc)
| # === Step 2: Chunk text === | |
def chunk_text(text, chunk_size=500):
    """Greedily pack the lines of *text* into chunks of roughly *chunk_size*.

    The text is split on ``"\\n"``; consecutive lines are appended to the
    current chunk (each followed by one space) while the running length
    stays below ``chunk_size``. A line that would reach the limit starts a
    new chunk. A single line longer than ``chunk_size`` is kept whole
    rather than being cut.

    Args:
        text: The raw text to split.
        chunk_size: Soft upper bound, in characters, on the length of the
            buffer a chunk is built from.

    Returns:
        A list of stripped, non-empty chunk strings. Empty or
        whitespace-only input yields an empty list.
    """
    chunks = []
    current_chunk = ""
    for para in text.split("\n"):
        if len(current_chunk) + len(para) < chunk_size:
            current_chunk += para + " "
            continue
        # The buffer can be empty (first line already too long) or hold
        # only spaces (blank lines); never emit such a chunk.
        finished = current_chunk.strip()
        if finished:
            chunks.append(finished)
        current_chunk = para + " "

    # Flush whatever is left, again skipping whitespace-only residue.
    remainder = current_chunk.strip()
    if remainder:
        chunks.append(remainder)
    return chunks
| # === Step 3: Embed and save === | |
def build_and_save_index(chunks, embedder, index_path="index.faiss", chunks_path="chunks.pkl"):
    """Embed *chunks*, store them in a flat L2 FAISS index, and save both.

    Args:
        chunks: Sequence of text chunks to embed; must not be empty.
        embedder: Object whose ``encode(chunks, convert_to_numpy=True)``
            returns a 2-D array with one row per chunk (the second axis
            is used as the index dimension).
        index_path: File the FAISS index is written to.
        chunks_path: File the chunk list is pickled to.

    Raises:
        ValueError: If *chunks* is empty, since there would be no
            embedding dimension to build the index from.
    """
    # Fail early with a clear message instead of an opaque shape error
    # after the (expensive) encoding step.
    if not chunks:
        raise ValueError("chunks is empty; nothing to embed or index")

    embeddings = embedder.encode(chunks, convert_to_numpy=True)
    # FAISS works on contiguous float32 matrices; this is a no-op when the
    # embedder already returns that layout.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    faiss.write_index(index, index_path)

    # The chunk list is stored separately so search hits (row ids) can be
    # mapped back to their text.
    with open(chunks_path, "wb") as f:
        pickle.dump(chunks, f)

    print(f"β Saved FAISS index to {index_path}")
    print(f"β Saved chunks to {chunks_path}")
| # === Run Once === | |
if __name__ == "__main__":
    import sys

    # An optional first command-line argument selects the PDF; without it
    # the script falls back to "input.pdf" in the working directory.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else "input.pdf"

    raw_text = extract_text_from_pdf(pdf_path)
    chunks = chunk_text(raw_text)
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    build_and_save_index(chunks, embedder)