Shami96 committed on
Commit
423a437
·
verified ·
1 Parent(s): eb097b8

Create utils.py

Browse files
Files changed (1) hide show
  1. utils.py +29 -0
utils.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # utils.py
2
+
3
+ import pdfplumber
4
+ import re
5
+
6
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Pages are processed in document order and each page's text is followed
    by a single newline, so a PDF with no pages yields an empty string.

    Args:
        pdf_path: Location of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        str: The text of all pages, one trailing newline per page.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may return None for a page with no extractable text
        # (e.g. a scanned image); treat that as an empty page instead of
        # failing on None + "\n".
        page_texts = [(page.extract_text() or "") + "\n" for page in pdf.pages]
    return "".join(page_texts)
12
+
13
def parse_pdf_to_dict(text):
    """Pull the labelled audit fields out of extracted PDF text.

    Each field is located by its label (case-insensitively), followed by one
    or more colons/whitespace characters, followed by the value. Because the
    separator class includes whitespace, a value placed on the line after its
    label is also picked up. Only the first occurrence of each label is used.

    Args:
        text: The text to search. ``None`` or an empty string is treated as
            "nothing to parse".

    Returns:
        dict: Maps each field name that was found with a non-blank value to
        that value, stripped of surrounding whitespace. Fields that are
        absent or blank are omitted.
    """
    # Nothing to search; also avoids re.search raising TypeError on None.
    if not text:
        return {}

    # Example logic — customize based on actual data format
    patterns = {
        "Operator Name": r"Operator Name[:\s]+(.+)",
        "Accreditation Modules": r"Accreditation Modules[:\s]+(.+)",
        "Audit Date": r"Audit Date[:\s]+([\d\-\/]+)",
        "Auditor Name": r"Auditor Name[:\s]+(.+)",
    }

    data = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, text, re.IGNORECASE)
        if not match:
            continue
        value = match.group(1).strip()
        # A label followed only by spaces can still match (the capture is a
        # lone space); do not report that as an extracted value.
        if value:
            data[key] = value

    return data