# File size: 1,648 Bytes
# be8bda0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# process_pdf.py

import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pickle

# === Step 1: Extract text from PDF ===
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page in the PDF at *pdf_path*, concatenated in page order.

    Args:
        pdf_path: Path to a PDF file readable by PyMuPDF.

    Returns:
        A single string with each page's extracted text appended in order.

    Raises:
        Whatever ``fitz.open`` raises for a missing or unreadable file.
    """
    # Context manager closes the document handle even if extraction fails
    # (the original leaked it); join avoids quadratic string concatenation.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

# === Step 2: Chunk text ===
def chunk_text(text, chunk_size=500):
    """Greedily pack newline-separated paragraphs into chunks of roughly *chunk_size* characters.

    Paragraphs are accumulated (space-joined) until adding the next one would
    reach ``chunk_size``; the accumulated text is then flushed as one chunk.
    A paragraph longer than ``chunk_size`` becomes a chunk on its own
    (it is never split).

    Args:
        text: Raw text; paragraphs are delimited by ``"\n"``.
        chunk_size: Soft upper bound on chunk length, in characters.

    Returns:
        List of non-empty, stripped text chunks ([] for empty input).
    """
    paragraphs = text.split("\n")
    chunks = []
    current_chunk = ""
    for para in paragraphs:
        if len(current_chunk) + len(para) < chunk_size:
            current_chunk += para + " "
        else:
            # Bug fix: only flush a non-empty accumulator. Previously, when a
            # paragraph exceeded chunk_size while nothing was accumulated,
            # an empty string "" was appended as a chunk.
            if current_chunk.strip():
                chunks.append(current_chunk.strip())
            current_chunk = para + " "
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks

# === Step 3: Embed and save ===
def build_and_save_index(chunks, embedder, index_path="index.faiss", chunks_path="chunks.pkl"):
    """Embed *chunks* with *embedder*, build a flat-L2 FAISS index, and persist both to disk.

    Args:
        chunks: List of text chunks to embed.
        embedder: Model exposing ``encode(texts, convert_to_numpy=True)``.
        index_path: Destination file for the serialized FAISS index.
        chunks_path: Destination file for the pickled chunk list.
    """
    vectors = embedder.encode(chunks, convert_to_numpy=True)

    # A flat (exact, brute-force) L2 index sized to the embedding dimension.
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    faiss.write_index(index, index_path)

    # Persist the raw chunks alongside the index so vector hits can be
    # mapped back to their source text at query time.
    with open(chunks_path, "wb") as fh:
        pickle.dump(chunks, fh)

    print(f"βœ… Saved FAISS index to {index_path}")
    print(f"βœ… Saved chunks to {chunks_path}")

# === Run Once ===
if __name__ == "__main__":
    # One-shot ingestion pipeline: PDF -> raw text -> chunks -> FAISS index on disk.
    source_pdf = "input.pdf"  # Replace with your actual PDF
    document_text = extract_text_from_pdf(source_pdf)
    text_chunks = chunk_text(document_text)

    model = SentenceTransformer("all-MiniLM-L6-v2")
    build_and_save_index(text_chunks, model)