Spaces:
Running
Running
| # utils.py | |
| import pdfplumber | |
| import re | |
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Location of the PDF; handed unchanged to
            ``pdfplumber.open``.

    Returns:
        str: The text of each page in page order, with a newline appended
        after every page. A page that yields no text contributes only
        its trailing newline.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can come back as None for a page with no
        # extractable text (e.g. a scanned image); coerce that to "" so the
        # concatenation does not raise TypeError.
        return "".join(
            (page.extract_text() or "") + "\n" for page in pdf.pages
        )
def parse_pdf_to_dict(text):
    """Pull labelled fields out of extracted PDF text.

    Each known label is searched for case-insensitively; the first match
    wins and its captured value is stripped of surrounding whitespace.

    Args:
        text: The text to scan.

    Returns:
        dict: Maps each label that was found to its captured value. Labels
        with no match are omitted.
    """
    # Example logic — customize based on actual data format
    field_patterns = (
        ("Operator Name", r"Operator Name[:\s]+(.+)"),
        ("Accreditation Modules", r"Accreditation Modules[:\s]+(.+)"),
        ("Audit Date", r"Audit Date[:\s]+([\d\-\/]+)"),
        ("Auditor Name", r"Auditor Name[:\s]+(.+)"),
    )
    # Pair every label with its search result, then keep only the hits.
    searched = (
        (label, re.search(regex, text, re.IGNORECASE))
        for label, regex in field_patterns
    )
    return {label: hit.group(1).strip() for label, hit in searched if hit}