Spaces:

Shami96
/

PDF-Data_Extractor

Running

File size: 766 Bytes

423a437

# utils.py

import pdfplumber
import re

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Value passed straight to ``pdfplumber.open``.

    Returns:
        A single string holding each page's text in page order, with a
        newline appended after every page. A page that yields no text
        contributes only that newline.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can come back as None when a page has no
        # extractable text (e.g. an image-only page); coalesce to "" so one
        # such page does not raise TypeError and abort the whole document.
        return "".join(
            (page.extract_text() or "") + "\n" for page in pdf.pages
        )

def parse_pdf_to_dict(text):
    """Pull the labelled fields out of extracted PDF text.

    Each known label is searched for case-insensitively; the first
    occurrence wins and its captured value is whitespace-stripped.

    Args:
        text: The text to scan.

    Returns:
        A dict mapping each label that was found to its value. Labels with
        no match are left out, and keys appear in the order the labels are
        listed below.
    """
    # Example logic — customize based on actual data format
    field_regexes = (
        ("Operator Name", r"Operator Name[:\s]+(.+)"),
        ("Accreditation Modules", r"Accreditation Modules[:\s]+(.+)"),
        # Only digits, '-' and '/' are accepted as a date value.
        ("Audit Date", r"Audit Date[:\s]+([\d\-\/]+)"),
        ("Auditor Name", r"Auditor Name[:\s]+(.+)"),
    )

    extracted = {}
    for label, regex in field_regexes:
        found = re.compile(regex, re.IGNORECASE).search(text)
        if found is None:
            continue
        extracted[label] = found.group(1).strip()
    return extracted