# File size: 1,648 Bytes
# be8bda0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# process_pdf.py

import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pickle

# === Step 1: Extract text from PDF ===
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page in the PDF at *pdf_path*, concatenated in page order.

    Args:
        pdf_path: Path to a PDF file readable by PyMuPDF.

    Returns:
        A single string with each page's extracted text appended in order.

    Raises:
        Whatever ``fitz.open`` raises for a missing or unreadable file.
    """
    # Context manager closes the document handle even if extraction fails
    # (the original leaked it); join avoids quadratic string concatenation.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

# === Step 2: Chunk text ===
def chunk_text(text, chunk_size=500):
    """Greedily pack newline-separated paragraphs into chunks of roughly *chunk_size* characters.

    Paragraphs are accumulated (space-joined) until adding the next one would
    reach ``chunk_size``; the accumulated text is then flushed as one chunk.
    A paragraph longer than ``chunk_size`` becomes a chunk on its own
    (it is never split).

    Args:
        text: Raw text; paragraphs are delimited by ``"\n"``.
        chunk_size: Soft upper bound on chunk length, in characters.

    Returns:
        List of non-empty, stripped text chunks ([] for empty input).
    """
    paragraphs = text.split("\n")
    chunks = []
    current_chunk = ""
    for para in paragraphs:
        if len(current_chunk) + len(para) < chunk_size:
            current_chunk += para + " "
        else:
            # Bug fix: only flush a non-empty accumulator. Previously, when a
            # paragraph exceeded chunk_size while nothing was accumulated,
            # an empty string "" was appended as a chunk.
            if current_chunk.strip():
                chunks.append(current_chunk.strip())
            current_chunk = para + " "
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks

# === Step 3: Embed and save ===
def build_and_save_index(chunks, embedder, index_path="index.faiss", chunks_path="chunks.pkl"):
    """Embed *chunks* with *embedder*, build a flat-L2 FAISS index, and persist both to disk.

    Args:
        chunks: List of text chunks to embed.
        embedder: Model exposing ``encode(texts, convert_to_numpy=True)``.
        index_path: Destination file for the serialized FAISS index.
        chunks_path: Destination file for the pickled chunk list.
    """
    vectors = embedder.encode(chunks, convert_to_numpy=True)

    # A flat (exact, brute-force) L2 index sized to the embedding dimension.
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    faiss.write_index(index, index_path)

    # Persist the raw chunks alongside the index so vector hits can be
    # mapped back to their source text at query time.
    with open(chunks_path, "wb") as fh:
        pickle.dump(chunks, fh)

    print(f"βœ… Saved FAISS index to {index_path}")
    print(f"βœ… Saved chunks to {chunks_path}")

# === Run Once ===
if __name__ == "__main__":
    # One-shot ingestion pipeline: PDF -> raw text -> chunks -> FAISS index on disk.
    source_pdf = "input.pdf"  # Replace with your actual PDF
    document_text = extract_text_from_pdf(source_pdf)
    text_chunks = chunk_text(document_text)

    model = SentenceTransformer("all-MiniLM-L6-v2")
    build_and_save_index(text_chunks, model)