Shami96 committed on
Commit
102bd04
·
verified ·
1 Parent(s): 97cac57

Update extract_pdf_data.py

Browse files
Files changed (1) hide show
  1. extract_pdf_data.py +39 -17
extract_pdf_data.py CHANGED
@@ -1,37 +1,59 @@
 
1
  import pdfplumber
2
  from pdf2image import convert_from_path
3
  import pytesseract
 
4
 
5
- def extract_pdf_full_text(pdf_path, txt_path):
6
  raw_texts = []
7
  need_ocr = []
8
- # Step 1: Try to extract RAW text, record which pages need OCR
9
  with pdfplumber.open(pdf_path) as pdf:
10
  for i, page in enumerate(pdf.pages):
11
  print(f"Extracting text from page {i+1}...")
12
- text = page.extract_text() or ""
 
 
 
13
  if text.strip():
14
  raw_texts.append(f"\n--- PAGE {i+1} RAW TEXT ---\n{text.strip()}")
15
  else:
16
- raw_texts.append(None) # Mark that we need OCR for this page
17
  need_ocr.append(i)
18
-
19
- # Step 2: OCR only those pages with no RAW text
20
- print("Running OCR where RAW text is missing...")
21
- images = convert_from_path(pdf_path, dpi=300)
22
- for idx in need_ocr:
23
- ocr_text = pytesseract.image_to_string(images[idx])
24
- raw_texts[idx] = f"\n--- PAGE {idx+1} OCR TEXT ---\n{ocr_text.strip()}"
25
-
26
- # Step 3: Save to file (skip any leftover Nones, but there shouldn't be any)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  result = [txt for txt in raw_texts if txt]
 
28
  with open(txt_path, "w", encoding="utf-8") as f:
29
  f.write("\n".join(result))
30
  print(f"✅ Saved deduped full text to {txt_path}")
31
 
32
  if __name__ == "__main__":
33
  import sys
34
- # Usage: python extract_pdf_data.py input.pdf output.txt
35
- input_pdf = sys.argv[1]
36
- output_txt = sys.argv[2]
37
- extract_pdf_full_text(input_pdf, output_txt)
 
1
+ # extract_pdf_data.py
2
  import pdfplumber
3
  from pdf2image import convert_from_path
4
  import pytesseract
5
+ import os
6
 
7
def _ocr_page(pdf_path, page_index, dpi):
    """Render a single page of *pdf_path* to an image and return its OCR text.

    *page_index* is 0-based; it is converted to the 1-based page number that
    ``convert_from_path`` expects for ``first_page`` / ``last_page``.
    """
    page_number = page_index + 1
    rendered = convert_from_path(
        pdf_path, dpi=dpi, first_page=page_number, last_page=page_number
    )
    return pytesseract.image_to_string(rendered[0])


def extract_pdf_full_text(pdf_path, txt_path, dpi=300):
    """Extract the text of every page of a PDF and write it to a text file.

    Each page is first read with pdfplumber. Pages that yield no text (or
    whose extraction raises) are rendered to an image and passed through
    Tesseract OCR instead. Every page's text is written under a
    ``--- PAGE n RAW TEXT ---`` or ``--- PAGE n OCR TEXT ---`` header.

    Only the pages that actually need OCR are rendered, one at a time, so
    memory use does not grow with the size of the document, and a failure on
    one page never forces the other pages to be processed again.

    Args:
        pdf_path: Path of the PDF file to read.
        txt_path: Path of the text file to write; missing parent directories
            are created.
        dpi: Resolution used when rendering pages for OCR.
    """
    raw_texts = []
    need_ocr = []

    # Step 1: Try to extract RAW text, record pages needing OCR
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            print(f"Extracting text from page {i+1}...")
            try:
                text = page.extract_text() or ""
            except Exception as e:
                # Best effort: report the problem and fall back to OCR for
                # this page instead of aborting the whole document.
                print(f"⚠️ Text extraction failed for page {i+1}: {e}")
                text = ""
            if text.strip():
                raw_texts.append(f"\n--- PAGE {i+1} RAW TEXT ---\n{text.strip()}")
            else:
                # Placeholder keeps list positions aligned with page indices.
                raw_texts.append(None)
                need_ocr.append(i)

    # Step 2: OCR only pages that need it
    if need_ocr:
        print("Running OCR where RAW text is missing...")
        for idx in need_ocr:
            try:
                ocr_text = _ocr_page(pdf_path, idx, dpi)
            except Exception as e:
                # Keep going: a page that cannot be OCR'd still gets its
                # header so the output shows the page exists.
                print(f" ❌ OCR failed for page {idx+1}: {e}")
                ocr_text = ""
            raw_texts[idx] = f"\n--- PAGE {idx+1} OCR TEXT ---\n{ocr_text.strip()}"

    # Step 3: Save the text, dropping any placeholder that was never filled
    result = [txt for txt in raw_texts if txt]
    os.makedirs(os.path.dirname(txt_path) or ".", exist_ok=True)
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write("\n".join(result))
    print(f"✅ Saved deduped full text to {txt_path}")
53
 
54
if __name__ == "__main__":
    import sys

    # Command-line entry point: exactly two positional arguments are
    # required, the input PDF followed by the output text file.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print("Usage: python extract_pdf_data.py input.pdf output.txt")
        sys.exit(1)

    source_pdf, target_txt = cli_args
    extract_pdf_full_text(source_pdf, target_txt)