PDF-Data_Extractor / utils.py
Shami96's picture
Create utils.py
423a437 verified
raw
history blame
766 Bytes
# utils.py
import pdfplumber
import re
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Value handed straight to ``pdfplumber.open`` to locate
            the PDF.

    Returns:
        A single string holding each page's extracted text in page order,
        with a newline appended after every page. A page that yields no
        text contributes only that newline.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may give None for a page without extractable text
        # (e.g. a scanned image); coerce to "" so concatenation cannot raise
        # TypeError. The join runs inside the `with` so pages are read while
        # the PDF is still open.
        return "".join(
            (page.extract_text() or "") + "\n" for page in pdf.pages
        )
def parse_pdf_to_dict(text):
    """Pull labelled fields out of extracted PDF text.

    Each known label is searched for case-insensitively; the first
    occurrence wins. The label must be followed by at least one colon or
    whitespace character, and the captured value is stripped of
    surrounding whitespace.

    Args:
        text: The text to scan.

    Returns:
        A dict keyed by field label, containing only the fields that were
        found, in the order the labels are declared below.
    """
    # Example logic — customize based on actual data format
    field_patterns = (
        ("Operator Name", r"Operator Name[:\s]+(.+)"),
        ("Accreditation Modules", r"Accreditation Modules[:\s]+(.+)"),
        ("Audit Date", r"Audit Date[:\s]+([\d\-\/]+)"),
        ("Auditor Name", r"Auditor Name[:\s]+(.+)"),
    )

    extracted = {}
    for label, regex in field_patterns:
        found = re.search(regex, text, flags=re.IGNORECASE)
        if found is None:
            # Label absent from the text: leave the key out entirely.
            continue
        extracted[label] = found.group(1).strip()
    return extracted