# extract_pdf_data.py
"""Dump the text of a PDF to a UTF-8 text file, using OCR for pages without text.

Usage: python extract_pdf_data.py input.pdf output.txt
"""
import os

import pdfplumber
import pytesseract
from pdf2image import convert_from_path


def _extract_raw_pages(pdf_path):
    """Return one entry per page: its stripped embedded text, or None if empty."""
    pages = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_no, page in enumerate(pdf.pages, start=1):
            print(f"Extracting text from page {page_no}...")
            try:
                text = page.extract_text() or ""
            except Exception as exc:
                # Best effort: a page whose text layer cannot be read is
                # treated like a page without text and handed to OCR.
                print(f"  ⚠️ Text extraction failed for page {page_no}: {exc}")
                text = ""
            pages.append(text.strip() or None)
    return pages


def _ocr_page(pdf_path, page_no, dpi):
    """Rasterize the single 1-based page ``page_no`` and return its stripped OCR text."""
    images = convert_from_path(
        pdf_path, dpi=dpi, first_page=page_no, last_page=page_no
    )
    return pytesseract.image_to_string(images[0]).strip()


def extract_pdf_full_text(pdf_path: str, txt_path: str, dpi: int = 300) -> None:
    """Write the text of every page of ``pdf_path`` to ``txt_path``.

    Each page contributes one section, in page order. Pages with embedded
    text are written under a ``--- PAGE n RAW TEXT ---`` header. Pages
    without any are rendered to an image and run through Tesseract, and are
    written under a ``--- PAGE n OCR TEXT ---`` header. If OCR fails for a
    page, the failure is printed and the page gets an empty OCR section, so
    the remaining pages are still processed.

    Args:
        pdf_path: Path of the PDF to read.
        txt_path: Path of the text file to write. Its parent directory is
            created if it does not exist.
        dpi: Resolution used when rendering pages for OCR.
    """
    # Step 1: embedded text for every page (None marks a page needing OCR).
    raw_pages = _extract_raw_pages(pdf_path)

    # Step 2: build the sections, OCR-ing only the pages that need it.
    # Pages are rendered one at a time so that a single scanned page does not
    # force the whole document to be rasterized into memory, and so that a
    # failure on one page never causes other pages to be processed again.
    if any(text is None for text in raw_pages):
        print("Running OCR where RAW text is missing...")

    sections = []
    for page_no, text in enumerate(raw_pages, start=1):
        if text is not None:
            sections.append(f"\n--- PAGE {page_no} RAW TEXT ---\n{text}")
            continue
        try:
            ocr_text = _ocr_page(pdf_path, page_no, dpi)
        except Exception as exc:
            # Deliberate best effort: keep an empty placeholder section
            # rather than aborting the whole export.
            print(f"  ❌ OCR failed for page {page_no}: {exc}")
            ocr_text = ""
        sections.append(f"\n--- PAGE {page_no} OCR TEXT ---\n{ocr_text}")

    # Step 3: save the result. No de-duplication is performed; the word
    # "deduped" in the message below is kept only so the output is unchanged.
    os.makedirs(os.path.dirname(txt_path) or ".", exist_ok=True)
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write("\n".join(sections))
    print(f"✅ Saved deduped full text to {txt_path}")


if __name__ == "__main__":
    import sys

    if len(sys.argv) != 3:
        print("Usage: python extract_pdf_data.py input.pdf output.txt")
        sys.exit(1)
    extract_pdf_full_text(sys.argv[1], sys.argv[2])