Spaces:
Running
Running
File size: 766 Bytes
423a437 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 |
# utils.py
import pdfplumber
import re
def extract_text_from_pdf(pdf_path) -> str:
    """Return the text of every page in a PDF, concatenated in page order.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``pdfplumber.open``.

    Returns:
        The text of each page followed by a newline. A page that yields no
        text contributes only its newline, so the result contains exactly
        one trailing "\\n" per page.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may return None (e.g. older pdfplumber releases) for
        # a page with no extractable text; coerce to "" so the concatenation
        # cannot raise TypeError on image-only or blank pages.
        return "".join((page.extract_text() or "") + "\n" for page in pdf.pages)
def parse_pdf_to_dict(text):
    """Extract known audit fields from PDF text into a dictionary.

    Each field is located with a case-insensitive regular expression of the
    form ``<label>`` + one or more colons/whitespace + ``<value>``. Only the
    first occurrence of each label is used.

    Note that the separator class ``[:\\s]`` also matches newlines, so a value
    placed on the line after its label is still captured; conversely, a label
    with no value picks up the start of the next non-blank line.

    Args:
        text: Text to scan (for example the output of
            ``extract_text_from_pdf``). ``None`` or an empty string is
            accepted and produces an empty result.

    Returns:
        A dict mapping each field label that was found to its captured value
        with surrounding whitespace stripped. Labels that do not occur in
        ``text`` are omitted rather than mapped to ``None``.
    """
    data = {}
    # Nothing to search; also avoids re.search raising TypeError on None.
    if not text:
        return data

    # Example logic — customize based on actual data format
    patterns = {
        "Operator Name": r"Operator Name[:\s]+(.+)",
        "Accreditation Modules": r"Accreditation Modules[:\s]+(.+)",
        "Audit Date": r"Audit Date[:\s]+([\d\-\/]+)",
        "Auditor Name": r"Auditor Name[:\s]+(.+)",
    }
    for key, pattern in patterns.items():
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            # "." does not match newlines, so group(1) stops at end of line.
            data[key] = match.group(1).strip()
    return data