Spaces:

Shami96
/

PDF-Data_Extractor

Running

App Files Files Community

Shami96 committed on Aug 22

Commit

0322667

verified ·

1 Parent(s): 7ca7d73

Update extract_pdf_data.py

Browse files

Files changed (1) hide show

extract_pdf_data.py +18 -38

extract_pdf_data.py CHANGED Viewed

@@ -1,59 +1,39 @@
-# extract_pdf_data.py
 import pdfplumber
 from pdf2image import convert_from_path
 import pytesseract
-import os
def _ocr_single_page(pdf_path, page_index, dpi):
    """Rasterise one page of the PDF and return its OCR text, stripped.

    Args:
        pdf_path: Path of the PDF to read.
        page_index: Zero-based index of the page to OCR.
        dpi: Resolution passed to ``convert_from_path`` for rasterising.
    """
    # convert_from_path takes 1-based page numbers; rendering just this page
    # avoids holding an image of every page of the document in memory.
    page_image = convert_from_path(
        pdf_path, dpi=dpi, first_page=page_index + 1, last_page=page_index + 1
    )[0]
    return pytesseract.image_to_string(page_image).strip()


def extract_pdf_full_text(pdf_path, txt_path, dpi=300):
    """Write the text of every page of a PDF to a UTF-8 text file.

    Each page is first read with pdfplumber. Pages whose extracted text is
    empty (or whose extraction raises) are rasterised and passed through
    Tesseract OCR instead. The whole process is best-effort: a page that
    fails OCR is reported on stdout and written as an empty OCR section.

    Args:
        pdf_path: Path of the PDF to read.
        txt_path: Path of the text file to write; its parent directory is
            created if it does not exist.
        dpi: Resolution used when rasterising pages for OCR.
    """
    raw_texts = []  # one entry per page; None until the page has text
    need_ocr = []   # zero-based indices of pages with no extractable text

    # Step 1: try to extract RAW text, record pages needing OCR.
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            print(f"Extracting text from page {i+1}...")
            try:
                text = page.extract_text() or ""
            except Exception as exc:
                # Best effort: report the failure, then let OCR handle the page.
                print(f"  ⚠️ Text extraction failed for page {i+1}: {exc}")
                text = ""
            text = text.strip()
            if text:
                raw_texts.append(f"\n--- PAGE {i+1} RAW TEXT ---\n{text}")
            else:
                raw_texts.append(None)
                need_ocr.append(i)

    # Step 2: OCR only the pages that need it, one page at a time, so a
    # failure on one page never forces already-finished pages to be redone.
    if need_ocr:
        print("Running OCR where RAW text is missing...")
        for idx in need_ocr:
            try:
                ocr_text = _ocr_single_page(pdf_path, idx, dpi)
            except Exception as exc:
                print(f"  ❌ OCR failed for page {idx+1}: {exc}")
                ocr_text = ""
            raw_texts[idx] = f"\n--- PAGE {idx+1} OCR TEXT ---\n{ocr_text}"

    # Step 3: save the collected text. No de-duplication is performed here,
    # despite the wording of the final status message.
    result = [txt for txt in raw_texts if txt]
    os.makedirs(os.path.dirname(txt_path) or ".", exist_ok=True)
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write("\n".join(result))
    print(f"✅ Saved deduped full text to {txt_path}")
 if __name__ == "__main__":
     import sys
-    if len(sys.argv) != 3:
-        print("Usage: python extract_pdf_data.py input.pdf output.txt")
-        sys.exit(1)
-    extract_pdf_full_text(sys.argv[1], sys.argv[2])

 import pdfplumber
 from pdf2image import convert_from_path
 import pytesseract
def extract_pdf_full_text(pdf_path, txt_path, dpi=300):
    """Write the text of every page of a PDF to a UTF-8 text file.

    Each page is first read with pdfplumber. Pages whose extracted text is
    empty are rasterised and passed through Tesseract OCR instead. Errors
    raised by pdfplumber, pdf2image or pytesseract are not caught here.

    Args:
        pdf_path: Path of the PDF to read.
        txt_path: Path of the text file to write; its parent directory is
            created if it does not exist.
        dpi: Resolution used when rasterising pages for OCR.
    """
    import os  # local import: the module-level imports do not include os

    raw_texts = []  # one entry per page; None until the page has text
    need_ocr = []   # zero-based indices of pages with no extractable text

    # Step 1: try to extract RAW text, record which pages need OCR.
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            print(f"Extracting text from page {i+1}...")
            text = (page.extract_text() or "").strip()
            if text:
                raw_texts.append(f"\n--- PAGE {i+1} RAW TEXT ---\n{text}")
            else:
                raw_texts.append(None)
                need_ocr.append(i)

    # Step 2: OCR only those pages with no RAW text. Nothing is rasterised
    # when every page already has text, and only the pages that need OCR are
    # rendered (one at a time) rather than the whole document.
    if need_ocr:
        print("Running OCR where RAW text is missing...")
        for idx in need_ocr:
            # convert_from_path takes 1-based page numbers.
            page_image = convert_from_path(
                pdf_path, dpi=dpi, first_page=idx + 1, last_page=idx + 1
            )[0]
            ocr_text = pytesseract.image_to_string(page_image)
            raw_texts[idx] = f"\n--- PAGE {idx+1} OCR TEXT ---\n{ocr_text.strip()}"

    # Step 3: save to file (skip any leftover Nones, but there shouldn't be
    # any). Create the target directory first so the work above is not lost
    # to a missing folder.
    result = [txt for txt in raw_texts if txt]
    os.makedirs(os.path.dirname(txt_path) or ".", exist_ok=True)
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write("\n".join(result))
    print(f"✅ Saved deduped full text to {txt_path}")
 if __name__ == "__main__":
     import sys
+    # Usage: python extract_pdf_data.py input.pdf output.txt
+    input_pdf = sys.argv[1]
+    output_txt = sys.argv[2]
+    extract_pdf_full_text(input_pdf, output_txt)