Spaces:
Running
Running
| # pdf_extractor.py | |
| import fitz # PyMuPDF | |
def extract_text_pdf_raw(pdf_path):
    """Return the text of every page in a PDF, joined by newlines.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        A single string made of ``page.get_text()`` for each page, in
        document order, with a newline between consecutive pages.
    """
    # Open the document as a context manager so it is closed even when
    # text extraction raises. The join has to happen inside the ``with``
    # block because the generator reads pages lazily.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text() for page in doc)
def extract_label_value_pairs(pdf_path):
    """Build a label -> value mapping from the text lines of a PDF.

    Each line of the text returned by ``extract_text_pdf_raw`` is stripped
    and split once on the first ':' if it contains one, otherwise on the
    first '-' if it contains one. Lines with neither separator are ignored.

    Labels are stripped and lower-cased, values are stripped. A colon line
    whose value is empty is skipped; a hyphen line is stored even when its
    value is empty. When a label occurs more than once, the last occurrence
    wins.

    Args:
        pdf_path: Location of the PDF; forwarded to ``extract_text_pdf_raw``.

    Returns:
        dict mapping normalized labels to their values.
    """
    pairs = {}
    for raw_line in extract_text_pdf_raw(pdf_path).split('\n'):
        text = raw_line.strip()

        if ':' in text:
            # The colon takes precedence over a hyphen on the same line.
            key, _, val = text.partition(':')
            val = val.strip()
            if not val:
                continue
            pairs[key.strip().lower()] = val
        elif '-' in text:
            key, _, val = text.partition('-')
            pairs[key.strip().lower()] = val.strip()

    return pairs