File size: 766 Bytes
423a437
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# utils.py

import pdfplumber
import re

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of a PDF.

    The file is opened with ``pdfplumber.open`` and closed again when the
    function returns. Pages are visited in ``pdf.pages`` order and each
    page's text is followed by a single newline.

    Args:
        pdf_path: Value handed straight to ``pdfplumber.open``.

    Returns:
        One string containing each page's extracted text followed by
        ``"\\n"``. A page that yields no text contributes only the newline.
        The result is ``""`` when the document has no pages.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can come back as None for a page with no
        # extractable text (e.g. image-only pages); fall back to "" so one
        # such page does not raise TypeError and abort the whole document.
        return "".join(
            (page.extract_text() or "") + "\n" for page in pdf.pages
        )

def parse_pdf_to_dict(text):
    """Pull labelled fields out of extracted PDF text.

    Each known label is searched for case-insensitively; the first match
    wins and its captured value is stored with surrounding whitespace
    removed. Labels that do not occur in ``text`` are left out of the
    result.

    Args:
        text: The text to scan.

    Returns:
        A dict mapping each label that was found to its captured value,
        in the order the labels are listed below.
    """
    # Example logic — customize based on actual data format
    field_regexes = (
        ("Operator Name", r"Operator Name[:\s]+(.+)"),
        ("Accreditation Modules", r"Accreditation Modules[:\s]+(.+)"),
        ("Audit Date", r"Audit Date[:\s]+([\d\-\/]+)"),
        ("Auditor Name", r"Auditor Name[:\s]+(.+)"),
    )

    extracted = {}
    for label, regex in field_regexes:
        hit = re.search(regex, text, flags=re.IGNORECASE)
        if hit is None:
            # Label not present in the text: omit the key entirely.
            continue
        extracted[label] = hit.group(1).strip()
    return extracted