Spaces:

Shami96
/

PDF-Data_Extractor

Running

Shami96 committed on Jul 28

Commit

423a437

verified ·

1 Parent(s): eb097b8

Create utils.py

Files changed (1) hide show

utils.py ADDED Viewed

+# utils.py
+import pdfplumber
+import re
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Pages are processed in order and each page's text is followed by a
    newline.

    Args:
        pdf_path: Passed unchanged to ``pdfplumber.open``.

    Returns:
        The concatenated page text; an empty string when the PDF has no
        pages.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can come back as None for a page with no
        # extractable text (e.g. a scanned image); treat that as empty
        # instead of failing on ``None + "\n"``.
        return "".join(
            f"{page.extract_text() or ''}\n" for page in pdf.pages
        )
# Field label -> compiled pattern; group 1 captures the field's value.
# Example logic — customize based on actual data format.
_FIELD_PATTERNS = {
    "Operator Name": re.compile(r"Operator Name[:\s]+(.+)", re.IGNORECASE),
    "Accreditation Modules": re.compile(
        r"Accreditation Modules[:\s]+(.+)", re.IGNORECASE
    ),
    "Audit Date": re.compile(r"Audit Date[:\s]+([\d\-\/]+)", re.IGNORECASE),
    "Auditor Name": re.compile(r"Auditor Name[:\s]+(.+)", re.IGNORECASE),
}


def parse_pdf_to_dict(text):
    """Extract known labelled fields from *text* into a dictionary.

    For each known label the first case-insensitive occurrence is used. The
    value is whatever follows the label and its separator (colons and/or
    whitespace) up to the end of that line, with surrounding whitespace
    stripped. "Audit Date" only captures a run of digits, ``-`` and ``/``.

    Args:
        text: The text to search. ``None`` or an empty string yields an
            empty result.

    Returns:
        A dict mapping each label that was found to its value. Labels that
        do not occur in *text* are omitted.
    """
    # Guard against None: re would raise TypeError on a non-string.
    if not text:
        return {}

    data = {}
    for key, pattern in _FIELD_PATTERNS.items():
        match = pattern.search(text)
        if match:
            data[key] = match.group(1).strip()
    return data