Spaces:
Running
Running
Update extract_pdf_data.py
Browse files- extract_pdf_data.py +39 -17
extract_pdf_data.py
CHANGED
|
@@ -1,37 +1,59 @@
|
|
|
|
|
| 1 |
import pdfplumber
|
| 2 |
from pdf2image import convert_from_path
|
| 3 |
import pytesseract
|
|
|
|
| 4 |
|
| 5 |
-
def extract_pdf_full_text(pdf_path, txt_path):
|
| 6 |
raw_texts = []
|
| 7 |
need_ocr = []
|
| 8 |
-
# Step 1: Try to extract RAW text, record
|
| 9 |
with pdfplumber.open(pdf_path) as pdf:
|
| 10 |
for i, page in enumerate(pdf.pages):
|
| 11 |
print(f"Extracting text from page {i+1}...")
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
| 13 |
if text.strip():
|
| 14 |
raw_texts.append(f"\n--- PAGE {i+1} RAW TEXT ---\n{text.strip()}")
|
| 15 |
else:
|
| 16 |
-
raw_texts.append(None)
|
| 17 |
need_ocr.append(i)
|
| 18 |
-
|
| 19 |
-
# Step 2: OCR only
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
result = [txt for txt in raw_texts if txt]
|
|
|
|
| 28 |
with open(txt_path, "w", encoding="utf-8") as f:
|
| 29 |
f.write("\n".join(result))
|
| 30 |
print(f"✅ Saved deduped full text to {txt_path}")
|
| 31 |
|
| 32 |
if __name__ == "__main__":
|
| 33 |
import sys
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
extract_pdf_full_text(
|
|
|
|
| 1 |
+
# extract_pdf_data.py
|
| 2 |
import pdfplumber
|
| 3 |
from pdf2image import convert_from_path
|
| 4 |
import pytesseract
|
| 5 |
+
import os
|
| 6 |
|
| 7 |
+
def extract_pdf_full_text(pdf_path, txt_path, dpi=300):
    """Extract the text of every page of a PDF and save it to a text file.

    Each page is first read with pdfplumber. Pages that yield no text (or
    whose extraction raises) are rendered to an image with pdf2image and
    run through Tesseract OCR instead. Every page is written under a
    ``--- PAGE n RAW TEXT ---`` or ``--- PAGE n OCR TEXT ---`` marker, in
    page order.

    Args:
        pdf_path: Path of the PDF to read.
        txt_path: Path of the text file to write (UTF-8). Its parent
            directory is created if it does not exist.
        dpi: Resolution passed to ``convert_from_path`` when rendering a
            page for OCR.

    Returns:
        None. The result is written to ``txt_path``.
    """
    raw_texts = []
    need_ocr = []

    # Step 1: Try to extract RAW text, record pages needing OCR
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            print(f"Extracting text from page {i+1}...")
            try:
                text = page.extract_text() or ""
            except Exception as e:
                # Best effort: report the failure and let OCR handle the page.
                print(f" ⚠️ Text extraction failed for page {i+1}: {e}")
                text = ""
            text = text.strip()
            if text:
                raw_texts.append(f"\n--- PAGE {i+1} RAW TEXT ---\n{text}")
            else:
                # Placeholder keeps list index == page index for step 2.
                raw_texts.append(None)
                need_ocr.append(i)

    # Step 2: OCR only pages that need it
    if need_ocr:
        print("Running OCR where RAW text is missing...")
        for idx in need_ocr:
            # Render one page at a time so memory use does not grow with the
            # size of the document, and so one bad page cannot force the
            # pages already processed to be OCR'd again.
            try:
                page_img = convert_from_path(
                    pdf_path, dpi=dpi, first_page=idx + 1, last_page=idx + 1
                )[0]
                ocr_text = pytesseract.image_to_string(page_img).strip()
            except Exception as e:
                print(f" ❌ OCR failed for page {idx+1}: {e}")
                ocr_text = ""
            # The marker is written even when OCR produced nothing, so the
            # page is still represented in the output.
            raw_texts[idx] = f"\n--- PAGE {idx+1} OCR TEXT ---\n{ocr_text}"

    # Step 3: Save the collected text (entries still None are dropped)
    result = [txt for txt in raw_texts if txt]
    # dirname is "" for a bare filename; fall back to the current directory.
    os.makedirs(os.path.dirname(txt_path) or ".", exist_ok=True)
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write("\n".join(result))
    print(f"✅ Saved deduped full text to {txt_path}")
|
| 53 |
|
| 54 |
if __name__ == "__main__":
    import sys

    # Exactly two positional arguments are expected:
    # the input PDF path and the output text path.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print("Usage: python extract_pdf_data.py input.pdf output.txt")
        sys.exit(1)

    source_pdf, target_txt = cli_args
    extract_pdf_full_text(source_pdf, target_txt)
|