Spaces:
Running
Running
| # extract_pdf_data.py | |
| import pdfplumber | |
| from pdf2image import convert_from_path | |
| import pytesseract | |
| import os | |
def extract_pdf_full_text(pdf_path, txt_path, dpi=300):
    """Write the text of every page of a PDF to a UTF-8 text file.

    Each page is first read through pdfplumber's ``extract_text``. Pages
    that yield no text (or whose extraction raises) are rendered to an
    image with pdf2image and passed to pytesseract instead. Every page
    therefore contributes exactly one section to the output, introduced
    by either a ``--- PAGE n RAW TEXT ---`` or a ``--- PAGE n OCR TEXT ---``
    header, in page order.

    Args:
        pdf_path: Path of the PDF file to read.
        txt_path: Path of the text file to write. Missing parent
            directories are created.
        dpi: Resolution passed to pdf2image when rendering pages for OCR.

    Returns:
        None. The result is written to ``txt_path``.

    Raises:
        Whatever ``pdfplumber.open`` or the final file write raise.
        Per-page extraction and OCR failures are reported on stdout and
        do not abort the run.
    """
    page_texts = []
    need_ocr = []

    # Step 1: Try to extract RAW text, record pages needing OCR
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            print(f"Extracting text from page {i+1}...")
            try:
                text = (page.extract_text() or "").strip()
            except Exception as e:
                # Best effort: a page whose text layer cannot be read is
                # handed to OCR, but the reason is no longer hidden.
                print(f" ⚠️ RAW text extraction failed for page {i+1}: {e}")
                text = ""
            if text:
                page_texts.append(f"\n--- PAGE {i+1} RAW TEXT ---\n{text}")
            else:
                # Placeholder keeps list index == page index for Step 2.
                page_texts.append(None)
                need_ocr.append(i)

    # Step 2: OCR only pages that need it
    if need_ocr:
        print("Running OCR where RAW text is missing...")
        for idx in need_ocr:
            try:
                # Render just this page (pdf2image page numbers are
                # 1-based) so at most one page image is held in memory,
                # instead of rasterising the whole document up front.
                page_img = convert_from_path(
                    pdf_path, dpi=dpi, first_page=idx + 1, last_page=idx + 1
                )[0]
                ocr_text = pytesseract.image_to_string(page_img).strip()
            except Exception as e:
                # One bad page must not force the others to be redone;
                # it still gets its header so the gap is visible.
                print(f" ❌ OCR failed for page {idx+1}: {e}")
                ocr_text = ""
            page_texts[idx] = f"\n--- PAGE {idx+1} OCR TEXT ---\n{ocr_text}"

    # Step 3: Save the text (one section per page, RAW or OCR, never both)
    result = [txt for txt in page_texts if txt]
    os.makedirs(os.path.dirname(txt_path) or ".", exist_ok=True)
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write("\n".join(result))
    print(f"✅ Saved deduped full text to {txt_path}")
if __name__ == "__main__":
    # Command-line entry point: expects exactly two positional arguments,
    # the input PDF path and the output text path.
    import sys

    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print("Usage: python extract_pdf_data.py input.pdf output.txt")
        sys.exit(1)

    source_pdf, target_txt = cli_args
    extract_pdf_full_text(source_pdf, target_txt)