Spaces:

Shami96
/

PDF-Data_Extractor

Running

PDF-Data_Extractor / utils.py

Create utils.py

423a437 verified 4 months ago

766 Bytes

	# utils.py

	import pdfplumber
	import re

	def extract_text_from_pdf(pdf_path):
	    """Return the concatenated text of every page in a PDF.

	    Args:
	        pdf_path: Path (or other value accepted by ``pdfplumber.open``)
	            identifying the PDF to read.

	    Returns:
	        A single string holding each page's extracted text in page
	        order, with a newline appended after every page. A PDF with no
	        pages yields an empty string.
	    """
	    with pdfplumber.open(pdf_path) as pdf:
	        # A page without a text layer (e.g. a scanned image) may make
	        # extract_text() return None in some pdfplumber versions; treat
	        # that as an empty page rather than failing on `None + "\n"`.
	        return "".join(
	            (page.extract_text() or "") + "\n" for page in pdf.pages
	        )

	def parse_pdf_to_dict(text):
	    """Pull known labelled fields out of extracted PDF text.

	    Each field is located with a case-insensitive regular-expression
	    search; only the first occurrence of a label is used. A label must
	    be followed by one or more colons and/or whitespace characters, and
	    the captured value is stripped of surrounding whitespace.

	    Because the separator class includes ``\\s``, it can span a line
	    break: when nothing follows the label on its own line, the value is
	    taken from the next non-blank line.

	    Args:
	        text: The text to search.

	    Returns:
	        A dict mapping each field name whose label was found to its
	        value. Fields whose label does not appear are omitted, so the
	        result is empty when nothing matches.
	    """
	    # Example logic — customize based on actual data format
	    patterns = {
	        "Operator Name": r"Operator Name[:\s]+(.+)",
	        "Accreditation Modules": r"Accreditation Modules[:\s]+(.+)",
	        # Dates capture only a run of digits, '-' and '/'.
	        "Audit Date": r"Audit Date[:\s]+([\d\-\/]+)",
	        "Auditor Name": r"Auditor Name[:\s]+(.+)",
	    }

	    data = {}
	    for key, pattern in patterns.items():
	        match = re.search(pattern, text, re.IGNORECASE)
	        if match:
	            data[key] = match.group(1).strip()

	    return data