Spaces:

Shami96
/

PDF-Data_Extractor

Running

App Files Files Community

PDF-Data_Extractor / extract_pdf_data.py

Shami96

Update extract_pdf_data.py

102bd04 verified 3 months ago

raw

history blame

2.44 kB

	# extract_pdf_data.py
	import pdfplumber
	from pdf2image import convert_from_path
	import pytesseract
	import os

	def _ocr_single_page(pdf_path, page_index, dpi):
	    """Rasterize one PDF page and return its OCR section (header + text).

	    Only the requested page is rendered (``first_page``/``last_page`` are
	    1-based in pdf2image), so memory use stays bounded to a single page
	    image no matter how long the document is.

	    Args:
	        pdf_path: Path of the PDF, passed to ``convert_from_path``.
	        page_index: Zero-based index of the page to OCR.
	        dpi: Rendering resolution passed to ``convert_from_path``.

	    Returns:
	        The string ``"\\n--- PAGE <n> OCR TEXT ---\\n<text>"``. If rendering
	        or OCR fails, the failure is printed and ``<text>`` is empty.
	    """
	    page_number = page_index + 1
	    try:
	        page_img = convert_from_path(
	            pdf_path, dpi=dpi, first_page=page_number, last_page=page_number
	        )[0]
	        ocr_text = pytesseract.image_to_string(page_img).strip()
	    except Exception as ee:
	        # Best-effort: one unreadable page must not abort the whole export.
	        print(f" ❌ OCR failed for page {page_number}: {ee}")
	        ocr_text = ""
	    return f"\n--- PAGE {page_number} OCR TEXT ---\n{ocr_text}"


	def extract_pdf_full_text(pdf_path, txt_path, dpi=300):
	    """Extract the text of every page of a PDF and write it to a text file.

	    Each page is first read with pdfplumber's ``extract_text``. Pages that
	    yield no non-whitespace text (or whose extraction raises) are rendered
	    with pdf2image and read with pytesseract instead. The per-page sections
	    are written to ``txt_path`` in page order, each preceded by a
	    ``--- PAGE <n> RAW TEXT ---`` or ``--- PAGE <n> OCR TEXT ---`` header.

	    Args:
	        pdf_path: Path of the input PDF.
	        txt_path: Path of the output text file; its parent directory is
	            created if it does not exist. The file is written as UTF-8.
	        dpi: Rendering resolution used for pages that need OCR.

	    Returns:
	        None. The result is written to ``txt_path``.
	    """
	    raw_texts = []
	    need_ocr = []

	    # Step 1: Try to extract RAW text, record pages needing OCR
	    with pdfplumber.open(pdf_path) as pdf:
	        for i, page in enumerate(pdf.pages):
	            print(f"Extracting text from page {i+1}...")
	            try:
	                text = (page.extract_text() or "").strip()
	            except Exception as exc:
	                # Report instead of silently swallowing; the page then
	                # falls through to OCR like any other text-less page.
	                print(f"⚠️ Text extraction failed for page {i+1}: {exc}")
	                text = ""
	            if text:
	                raw_texts.append(f"\n--- PAGE {i+1} RAW TEXT ---\n{text}")
	            else:
	                # Placeholder keeps list positions aligned with page indices.
	                raw_texts.append(None)
	                need_ocr.append(i)

	    # Step 2: OCR only pages that need it, rendering one page at a time so
	    # pages with usable RAW text are never rasterized and a failure on one
	    # page cannot force the others to be processed again.
	    if need_ocr:
	        print("Running OCR where RAW text is missing...")
	        for idx in need_ocr:
	            raw_texts[idx] = _ocr_single_page(pdf_path, idx, dpi)

	    # Step 3: Save the combined text, pages in their original order.
	    # NOTE: the status message says "deduped", but no de-duplication is
	    # performed here; the wording is kept unchanged for log compatibility.
	    result = [txt for txt in raw_texts if txt]
	    os.makedirs(os.path.dirname(txt_path) or ".", exist_ok=True)
	    with open(txt_path, "w", encoding="utf-8") as f:
	        f.write("\n".join(result))
	    print(f"✅ Saved deduped full text to {txt_path}")

	if __name__ == "__main__":
	    # Command-line entry point: expects exactly an input PDF path and an
	    # output text path; anything else prints the usage line and exits 1.
	    import sys

	    cli_args = sys.argv[1:]
	    if len(cli_args) != 2:
	        print("Usage: python extract_pdf_data.py input.pdf output.txt")
	        sys.exit(1)

	    source_pdf, target_txt = cli_args
	    extract_pdf_full_text(source_pdf, target_txt)