# utils.py
"""Helpers for pulling text out of a PDF and parsing labelled fields from it."""

import re

import pdfplumber

# Example logic — customize based on actual data format.
# Each pattern expects "<label>" followed by one or more colons/whitespace and
# captures the value in group 1.  "." does not match a newline, so a capture
# stops at the end of the line it starts on; note that "[:\s]+" itself can
# span a line break, so the value may be taken from the line after the label.
_FIELD_PATTERNS = {
    "Operator Name": r"Operator Name[:\s]+(.+)",
    "Accreditation Modules": r"Accreditation Modules[:\s]+(.+)",
    "Audit Date": r"Audit Date[:\s]+([\d\-\/]+)",
    "Auditor Name": r"Auditor Name[:\s]+(.+)",
}


def extract_text_from_pdf(pdf_path) -> str:
    """Return the text of every page in the PDF, one newline after each page.

    Args:
        pdf_path: Passed unchanged to ``pdfplumber.open``.

    Returns:
        The extracted text of each page, in page order, each followed by
        ``"\n"``.  A PDF with no pages yields ``""``.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may yield None for a page with no extractable text
        # (behaviour depends on the pdfplumber version); treat that as an
        # empty page instead of failing on ``None + "\n"``.
        return "".join(
            (page.extract_text() or "") + "\n" for page in pdf.pages
        )


def parse_pdf_to_dict(text: str) -> dict:
    """Extract the known labelled fields from ``text``.

    Each field is located with a case-insensitive regex search; the first
    match wins and its captured value is stripped of surrounding whitespace.

    Args:
        text: Text to search, e.g. the output of ``extract_text_from_pdf``.

    Returns:
        A dict mapping field label to captured value.  Fields whose pattern
        does not match are omitted rather than set to a placeholder.
    """
    data = {}
    for key, pattern in _FIELD_PATTERNS.items():
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            data[key] = match.group(1).strip()
    return data