Shami96 committed on
Commit
0322667
·
verified ·
1 Parent(s): 7ca7d73

Update extract_pdf_data.py

Browse files
Files changed (1) hide show
  1. extract_pdf_data.py +18 -38
extract_pdf_data.py CHANGED
@@ -1,59 +1,39 @@
1
- # extract_pdf_data.py
2
  import pdfplumber
3
  from pdf2image import convert_from_path
4
  import pytesseract
5
- import os
6
 
7
def extract_pdf_full_text(pdf_path, txt_path, dpi=300):
    """Write the text of every page of a PDF to a text file, using OCR as a fallback.

    Each page is read with pdfplumber first. Pages that yield no text (or whose
    extraction raises) are rasterized with pdf2image and passed to pytesseract.
    Page texts are written under ``--- PAGE n RAW TEXT ---`` or
    ``--- PAGE n OCR TEXT ---`` headers.

    Args:
        pdf_path: Path of the PDF to read.
        txt_path: Path of the UTF-8 text file to write; its parent directory
            is created if missing.
        dpi: Resolution passed to pdf2image when rasterizing pages for OCR.
    """

    def ocr_single_page(page_index):
        # Rasterize only the requested page (pdf2image pages are 1-based) and OCR it.
        rendered = convert_from_path(
            pdf_path, dpi=dpi, first_page=page_index + 1, last_page=page_index + 1
        )
        return pytesseract.image_to_string(rendered[0])

    page_chunks = []
    ocr_queue = []

    # Pass 1: take the embedded text layer where there is one.
    with pdfplumber.open(pdf_path) as pdf:
        for page_index, page in enumerate(pdf.pages):
            print(f"Extracting text from page {page_index+1}...")
            try:
                extracted = page.extract_text() or ""
            except Exception:
                # Best effort: a page that cannot be parsed is treated as empty
                # and therefore queued for OCR.
                extracted = ""
            stripped = extracted.strip()
            if not stripped:
                page_chunks.append(None)
                ocr_queue.append(page_index)
                continue
            page_chunks.append(f"\n--- PAGE {page_index+1} RAW TEXT ---\n{stripped}")

    # Pass 2: OCR the queued pages.
    if ocr_queue:
        print("Running OCR where RAW text is missing...")
        try:
            all_pages = convert_from_path(pdf_path, dpi=dpi)
            for page_index in ocr_queue:
                try:
                    recognized = pytesseract.image_to_string(all_pages[page_index])
                except Exception:
                    # Retry with a fresh single-page render; a failure here
                    # propagates to the outer handler below.
                    recognized = ocr_single_page(page_index)
                page_chunks[page_index] = (
                    f"\n--- PAGE {page_index+1} OCR TEXT ---\n{recognized.strip()}"
                )
        except Exception as bulk_error:
            print("⚠️ OCR step failed:", bulk_error)
            # Last resort: go through every queued page one at a time.
            for page_index in ocr_queue:
                try:
                    recognized = ocr_single_page(page_index)
                    page_chunks[page_index] = (
                        f"\n--- PAGE {page_index+1} OCR TEXT ---\n{recognized.strip()}"
                    )
                except Exception as page_error:
                    print(f" ❌ OCR failed for page {page_index+1}: {page_error}")
                    page_chunks[page_index] = f"\n--- PAGE {page_index+1} OCR TEXT ---\n"

    # Pass 3: drop pages that are still empty and write the rest.
    kept = list(filter(None, page_chunks))
    os.makedirs(os.path.dirname(txt_path) or ".", exist_ok=True)
    with open(txt_path, "w", encoding="utf-8") as out_file:
        out_file.write("\n".join(kept))
    print(f"✅ Saved deduped full text to {txt_path}")
53
 
54
  if __name__ == "__main__":
55
  import sys
56
- if len(sys.argv) != 3:
57
- print("Usage: python extract_pdf_data.py input.pdf output.txt")
58
- sys.exit(1)
59
- extract_pdf_full_text(sys.argv[1], sys.argv[2])
 
 
1
  import pdfplumber
2
  from pdf2image import convert_from_path
3
  import pytesseract
 
4
 
5
def extract_pdf_full_text(pdf_path, txt_path, dpi=300):
    """Write the text of every page of a PDF to a text file, using OCR as a fallback.

    Each page is read with pdfplumber first. Pages whose extracted text is
    empty or whitespace-only are rasterized with pdf2image and passed to
    pytesseract. Page texts are written under ``--- PAGE n RAW TEXT ---`` or
    ``--- PAGE n OCR TEXT ---`` headers, joined by newlines.

    Args:
        pdf_path: Path of the PDF to read.
        txt_path: Path of the UTF-8 text file to write (overwritten).
        dpi: Resolution passed to pdf2image when rasterizing pages for OCR.
    """
    raw_texts = []
    need_ocr = []

    # Step 1: Try to extract RAW text, record which pages need OCR
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            print(f"Extracting text from page {i+1}...")
            text = (page.extract_text() or "").strip()
            if text:
                raw_texts.append(f"\n--- PAGE {i+1} RAW TEXT ---\n{text}")
            else:
                # Placeholder keeps list positions aligned with page indices
                # so the OCR result can be written back by index.
                raw_texts.append(None)
                need_ocr.append(i)

    # Step 2: OCR only those pages with no RAW text. Skipped entirely when
    # every page had a text layer, so the PDF is not rasterized for nothing.
    if need_ocr:
        print("Running OCR where RAW text is missing...")
        # need_ocr is in ascending page order, so its ends bound the span of
        # pages that have to be rendered (pdf2image page numbers are 1-based).
        first = need_ocr[0]
        last = need_ocr[-1]
        images = convert_from_path(
            pdf_path, dpi=dpi, first_page=first + 1, last_page=last + 1
        )
        for idx in need_ocr:
            ocr_text = pytesseract.image_to_string(images[idx - first])
            raw_texts[idx] = f"\n--- PAGE {idx+1} OCR TEXT ---\n{ocr_text.strip()}"

    # Step 3: Save to file (skip any leftover Nones, but there shouldn't be any)
    result = [txt for txt in raw_texts if txt]
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write("\n".join(result))
    print(f"✅ Saved deduped full text to {txt_path}")
33
 
34
  if __name__ == "__main__":
35
  import sys
36
+ # Usage: python extract_pdf_data.py input.pdf output.txt
37
+ input_pdf = sys.argv[1]
38
+ output_txt = sys.argv[2]
39
+ extract_pdf_full_text(input_pdf, output_txt)