Spaces:
Running
Running
Update extract_pdf_data.py
Browse files- extract_pdf_data.py +18 -38
extract_pdf_data.py
CHANGED
|
@@ -1,59 +1,39 @@
|
|
| 1 |
-
# extract_pdf_data.py
|
| 2 |
import pdfplumber
|
| 3 |
from pdf2image import convert_from_path
|
| 4 |
import pytesseract
|
| 5 |
-
import os
|
| 6 |
|
| 7 |
-
def extract_pdf_full_text(pdf_path, txt_path
|
| 8 |
raw_texts = []
|
| 9 |
need_ocr = []
|
| 10 |
-
|
|
|
|
| 11 |
with pdfplumber.open(pdf_path) as pdf:
|
| 12 |
for i, page in enumerate(pdf.pages):
|
| 13 |
print(f"Extracting text from page {i+1}...")
|
| 14 |
-
|
| 15 |
-
text = page.extract_text() or ""
|
| 16 |
-
except Exception:
|
| 17 |
-
text = ""
|
| 18 |
if text.strip():
|
| 19 |
raw_texts.append(f"\n--- PAGE {i+1} RAW TEXT ---\n{text.strip()}")
|
| 20 |
else:
|
| 21 |
raw_texts.append(None)
|
|
|
|
| 22 |
need_ocr.append(i)
|
| 23 |
-
|
| 24 |
-
# Step 2: OCR only pages
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
except Exception:
|
| 33 |
-
page_img = convert_from_path(pdf_path, dpi=dpi, first_page=idx+1, last_page=idx+1)[0]
|
| 34 |
-
ocr_text = pytesseract.image_to_string(page_img)
|
| 35 |
-
raw_texts[idx] = f"\n--- PAGE {idx+1} OCR TEXT ---\n{ocr_text.strip()}"
|
| 36 |
-
except Exception as e:
|
| 37 |
-
print("⚠️ OCR step failed:", e)
|
| 38 |
-
for idx in need_ocr:
|
| 39 |
-
try:
|
| 40 |
-
page_img = convert_from_path(pdf_path, dpi=dpi, first_page=idx+1, last_page=idx+1)[0]
|
| 41 |
-
ocr_text = pytesseract.image_to_string(page_img)
|
| 42 |
-
raw_texts[idx] = f"\n--- PAGE {idx+1} OCR TEXT ---\n{ocr_text.strip()}"
|
| 43 |
-
except Exception as ee:
|
| 44 |
-
print(f" ❌ OCR failed for page {idx+1}: {ee}")
|
| 45 |
-
raw_texts[idx] = f"\n--- PAGE {idx+1} OCR TEXT ---\n"
|
| 46 |
-
|
| 47 |
-
# Step 3: Save deduped text
|
| 48 |
result = [txt for txt in raw_texts if txt]
|
| 49 |
-
os.makedirs(os.path.dirname(txt_path) or ".", exist_ok=True)
|
| 50 |
with open(txt_path, "w", encoding="utf-8") as f:
|
| 51 |
f.write("\n".join(result))
|
| 52 |
print(f"✅ Saved deduped full text to {txt_path}")
|
| 53 |
|
| 54 |
if __name__ == "__main__":
|
| 55 |
import sys
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
extract_pdf_full_text(
|
|
|
|
|
|
|
| 1 |
import pdfplumber
|
| 2 |
from pdf2image import convert_from_path
|
| 3 |
import pytesseract
|
|
|
|
| 4 |
|
| 5 |
+
def extract_pdf_full_text(pdf_path, txt_path, dpi=300):
    """Extract the text of every page of a PDF and write it to a text file.

    Each page is first read with pdfplumber. Pages that yield no text
    (empty or whitespace-only) are rendered to an image and run through
    Tesseract OCR instead. Every page's text is prefixed with a
    ``--- PAGE n RAW TEXT ---`` or ``--- PAGE n OCR TEXT ---`` header.

    Args:
        pdf_path: Path of the PDF file to read.
        txt_path: Path of the UTF-8 text file to write (overwritten).
        dpi: Resolution used when rendering pages for OCR.
    """
    raw_texts = []
    need_ocr = []

    # Step 1: Try to extract RAW text, record which pages need OCR
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            print(f"Extracting text from page {i+1}...")
            text = (page.extract_text() or "").strip()
            if text:
                raw_texts.append(f"\n--- PAGE {i+1} RAW TEXT ---\n{text}")
            else:
                # Placeholder keeps list indices aligned with page indices;
                # it is replaced by the OCR result in step 2.
                raw_texts.append(None)
                need_ocr.append(i)

    # Step 2: OCR only those pages with no RAW text.
    # Render one page at a time instead of rasterising the whole document:
    # a full-document render holds every page image in memory at once and
    # is wasted work when few (or no) pages actually need OCR.
    if need_ocr:
        print("Running OCR where RAW text is missing...")
    for idx in need_ocr:
        page_number = idx + 1  # pdf2image page numbers are 1-based
        page_img = convert_from_path(
            pdf_path,
            dpi=dpi,
            first_page=page_number,
            last_page=page_number,
        )[0]
        ocr_text = pytesseract.image_to_string(page_img)
        raw_texts[idx] = f"\n--- PAGE {idx+1} OCR TEXT ---\n{ocr_text.strip()}"

    # Step 3: Save to file (skip any leftover Nones, but there shouldn't be any)
    result = [txt for txt in raw_texts if txt]
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write("\n".join(result))
    print(f"✅ Saved deduped full text to {txt_path}")
|
| 33 |
|
| 34 |
if __name__ == "__main__":
    import sys

    # Usage: python extract_pdf_data.py input.pdf output.txt
    # Fail with a readable usage message instead of an IndexError traceback
    # when an argument is missing or extra arguments are supplied.
    if len(sys.argv) != 3:
        sys.exit(f"Usage: python {sys.argv[0]} input.pdf output.txt")

    input_pdf = sys.argv[1]
    output_txt = sys.argv[2]
    extract_pdf_full_text(input_pdf, output_txt)
|