PDF-Data_Extractor / extract_pdf_data.py
Shami96's picture
Update extract_pdf_data.py
102bd04 verified
raw
history blame
2.44 kB
# extract_pdf_data.py
import pdfplumber
from pdf2image import convert_from_path
import pytesseract
import os
def _ocr_single_page(pdf_path, page_index, dpi):
    """Render one page of the PDF and return its labelled OCR section.

    Only the requested page is rasterized, so a single page image is alive
    at any time regardless of how long the document is. Rendering or OCR
    errors are reported on stdout and an empty OCR section is returned, so
    one unreadable page never aborts the whole run.

    Args:
        pdf_path: Path of the PDF to render.
        page_index: 0-based index of the page to OCR.
        dpi: Resolution passed to ``convert_from_path``.

    Returns:
        The string ``"\\n--- PAGE <n> OCR TEXT ---\\n"`` followed by the
        stripped OCR text (nothing follows the header when OCR failed).
    """
    page_no = page_index + 1  # first_page/last_page are given as 1-based numbers
    header = f"\n--- PAGE {page_no} OCR TEXT ---\n"
    try:
        page_img = convert_from_path(
            pdf_path, dpi=dpi, first_page=page_no, last_page=page_no
        )[0]
        return header + pytesseract.image_to_string(page_img).strip()
    except Exception as exc:
        # Deliberately broad: this is a best-effort step and the rendering
        # and OCR backends can fail in many different ways.
        print(f" ❌ OCR failed for page {page_no}: {exc}")
        return header


def extract_pdf_full_text(pdf_path, txt_path, dpi=300):
    """Extract the text of every page of a PDF and save it to a text file.

    Each page is first read with pdfplumber. A page that yields no text, or
    whose extraction raises, is rasterized and passed to Tesseract instead.
    Every page contributes one section to the output, introduced by either
    ``--- PAGE <n> RAW TEXT ---`` or ``--- PAGE <n> OCR TEXT ---``; sections
    are written in page order, joined by newlines.

    Pages are rendered for OCR one at a time rather than rasterizing the
    whole document up front, which keeps memory use independent of the page
    count, and a failure on one page does not cause other pages to be
    processed again.

    NOTE: the final status message says "deduped", but no de-duplication is
    performed here; entries are only filtered for emptiness. The message is
    kept unchanged so existing output stays the same.

    Args:
        pdf_path: Path of the PDF to read.
        txt_path: Path of the UTF-8 text file to write. Its parent directory
            is created when it does not exist yet.
        dpi: Resolution used when rendering pages for OCR.

    Returns:
        None. The result is written to ``txt_path``.
    """
    page_sections = []
    need_ocr = []

    # Step 1: take embedded text where available; remember pages without any.
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            print(f"Extracting text from page {i+1}...")
            try:
                text = (page.extract_text() or "").strip()
            except Exception as exc:
                # Do not fail the run; report the error and fall back to OCR.
                print(f" ⚠️ Text extraction failed for page {i+1}: {exc}")
                text = ""
            if text:
                page_sections.append(f"\n--- PAGE {i+1} RAW TEXT ---\n{text}")
            else:
                page_sections.append(None)  # placeholder, filled in by step 2
                need_ocr.append(i)

    # Step 2: OCR only the pages that produced no embedded text.
    if need_ocr:
        print("Running OCR where RAW text is missing...")
        for idx in need_ocr:
            page_sections[idx] = _ocr_single_page(pdf_path, idx, dpi)

    # Step 3: drop any empty entries and write the sections in page order.
    result = [section for section in page_sections if section]
    # dirname is "" for a bare file name; fall back to the current directory.
    os.makedirs(os.path.dirname(txt_path) or ".", exist_ok=True)
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write("\n".join(result))
    print(f"✅ Saved deduped full text to {txt_path}")
if __name__ == "__main__":
    import sys

    # Command-line entry point: exactly two positional arguments are
    # required, the input PDF followed by the output text file.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print("Usage: python extract_pdf_data.py input.pdf output.txt")
        sys.exit(1)

    source_pdf, target_txt = cli_args
    extract_pdf_full_text(source_pdf, target_txt)