Spaces:

Shami96
/

PDF-Data_Extractor

Running

App Files Community

Shami96 committed on Aug 22

Commit

102bd04

verified ·

1 Parent(s): 97cac57

Update extract_pdf_data.py

Browse files

Files changed (1) hide show

extract_pdf_data.py +39 -17

extract_pdf_data.py CHANGED Viewed

@@ -1,37 +1,59 @@
 import pdfplumber
 from pdf2image import convert_from_path
 import pytesseract
-def extract_pdf_full_text(pdf_path, txt_path):
     raw_texts = []
     need_ocr = []
-    # Step 1: Try to extract RAW text, record which pages need OCR
     with pdfplumber.open(pdf_path) as pdf:
         for i, page in enumerate(pdf.pages):
             print(f"Extracting text from page {i+1}...")
-            text = page.extract_text() or ""
             if text.strip():
                 raw_texts.append(f"\n--- PAGE {i+1} RAW TEXT ---\n{text.strip()}")
             else:
-                raw_texts.append(None)  # Mark that we need OCR for this page
                 need_ocr.append(i)
-    # Step 2: OCR only those pages with no RAW text
-    print("Running OCR where RAW text is missing...")
-    images = convert_from_path(pdf_path, dpi=300)
-    for idx in need_ocr:
-        ocr_text = pytesseract.image_to_string(images[idx])
-        raw_texts[idx] = f"\n--- PAGE {idx+1} OCR TEXT ---\n{ocr_text.strip()}"
-    # Step 3: Save to file (skip any leftover Nones, but there shouldn't be any)
     result = [txt for txt in raw_texts if txt]
     with open(txt_path, "w", encoding="utf-8") as f:
         f.write("\n".join(result))
     print(f"✅ Saved deduped full text to {txt_path}")
 if __name__ == "__main__":
     import sys
-    # Usage: python extract_pdf_data.py input.pdf output.txt
-    input_pdf = sys.argv[1]
-    output_txt = sys.argv[2]
-    extract_pdf_full_text(input_pdf, output_txt)

+# extract_pdf_data.py
 import pdfplumber
 from pdf2image import convert_from_path
 import pytesseract
+import os
def extract_pdf_full_text(pdf_path, txt_path, dpi=300):
    """Extract the text of every page of a PDF and save it to a text file.

    Each page is first read with pdfplumber. Pages that yield no embedded
    text (or whose extraction raises) are rendered to an image with
    pdf2image and passed through Tesseract OCR instead. Sections are written
    in page order, each under a ``--- PAGE n RAW TEXT ---`` or
    ``--- PAGE n OCR TEXT ---`` header.

    Args:
        pdf_path: Path of the PDF to read.
        txt_path: Path of the UTF-8 text file to write. Missing parent
            directories are created.
        dpi: Resolution used when rendering pages for OCR.

    Returns:
        None. The result is written to ``txt_path``.

    Raises:
        Whatever ``pdfplumber.open`` raises when the PDF cannot be opened,
        and ``OSError`` when the output file cannot be written. Per-page
        extraction and OCR failures are reported on stdout and do not abort
        the run.
    """
    raw_texts = []
    need_ocr = []

    # Step 1: Try to extract RAW text, record pages needing OCR
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            print(f"Extracting text from page {i+1}...")
            try:
                text = page.extract_text() or ""
            except Exception as exc:
                # Best effort: a page that cannot be parsed falls back to OCR,
                # but the failure is reported instead of silently swallowed.
                print(f"  ⚠️ Text extraction failed for page {i+1}: {exc}")
                text = ""
            text = text.strip()
            if text:
                raw_texts.append(f"\n--- PAGE {i+1} RAW TEXT ---\n{text}")
            else:
                # Placeholder keeps list index == page index for Step 2.
                raw_texts.append(None)
                need_ocr.append(i)

    # Step 2: OCR only pages that need it
    if need_ocr:
        print("Running OCR where RAW text is missing...")
        for idx in need_ocr:
            ocr_text = ""
            try:
                # Render just this page (pdf2image page numbers are 1-based)
                # so memory use does not grow with the size of the document.
                page_img = convert_from_path(
                    pdf_path, dpi=dpi, first_page=idx + 1, last_page=idx + 1
                )[0]
                ocr_text = pytesseract.image_to_string(page_img)
            except Exception as exc:
                # One bad page must not discard or repeat work done on others;
                # it is emitted as an empty OCR section below.
                print(f"  ❌ OCR failed for page {idx+1}: {exc}")
            raw_texts[idx] = f"\n--- PAGE {idx+1} OCR TEXT ---\n{ocr_text.strip()}"

    # Step 3: Save the text; each page contributes exactly one section
    # (either RAW or OCR), so nothing is duplicated in the output.
    result = [txt for txt in raw_texts if txt]
    os.makedirs(os.path.dirname(txt_path) or ".", exist_ok=True)
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write("\n".join(result))
    print(f"✅ Saved deduped full text to {txt_path}")
 if __name__ == "__main__":
     import sys
+    if len(sys.argv) != 3:
+        print("Usage: python extract_pdf_data.py input.pdf output.txt")
+        sys.exit(1)
+    extract_pdf_full_text(sys.argv[1], sys.argv[2])