PDF-Data_Extractor / pdf_extractor.py
Shami96's picture
Rename pdf_parser.py to pdf_extractor.py
f1bab1c verified
raw
history blame
755 Bytes
# pdf_extractor.py
import fitz # PyMuPDF
def extract_text_pdf_raw(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``.

    Each page's text is obtained with ``page.get_text()`` and the pages are
    joined with a single newline between them.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        One string containing the text of all pages, newline-separated.
    """
    # Open the document as a context manager so it is closed (and its file
    # handle released) even if extracting text from a page raises.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text() for page in doc)
def extract_label_value_pairs(pdf_path):
    """Extract ``label: value`` / ``label - value`` pairs from a PDF's text.

    The text returned by ``extract_text_pdf_raw`` is scanned line by line.
    A line containing ``:`` is split at its first colon; otherwise a line
    containing ``-`` is split at its first hyphen. The part before the
    separator becomes the label (stripped and lower-cased) and the part after
    it becomes the value (stripped).

    Lines whose label or value is empty after stripping are ignored, so
    separator rules such as ``-----``, bullet items such as ``- text`` and
    dangling labels such as ``Name -`` neither create an empty-string key nor
    overwrite a previously captured value with an empty string.

    Args:
        pdf_path: Location of the PDF; forwarded to ``extract_text_pdf_raw``.

    Returns:
        A dict mapping lower-cased labels to their values. When a label
        occurs more than once, the last non-empty occurrence wins.
    """
    raw_text = extract_text_pdf_raw(pdf_path)
    label_value_map = {}
    for line in raw_text.split('\n'):
        line = line.strip()
        # A colon takes precedence: a line containing ':' is never split on
        # '-', even if it also contains a hyphen.
        if ':' in line:
            label, _, value = line.partition(':')
        elif '-' in line:
            label, _, value = line.partition('-')
        else:
            continue
        label = label.strip().lower()
        value = value.strip()
        # Apply the same emptiness rule to both separators.
        if label and value:
            label_value_map[label] = value
    return label_value_map