PDF-Data_Extractor / pdf_parser.py
Shami96's picture
Update pdf_parser.py
86d5840 verified
raw
history blame
365 Bytes
# pdf_parser.py
import fitz # PyMuPDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page in a PDF as a single string.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        The result of ``page.get_text()`` for each page, in page order,
        joined with newline characters.

    Raises:
        Whatever ``fitz.open`` raises when the file cannot be opened.
    """
    # Open the document as a context manager so its file handle is released
    # even if text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        # The join consumes the generator before the document is closed.
        return "\n".join(page.get_text() for page in doc)
def parse_data_blocks(text):
    """Build a dict from the ``key: value`` lines found in *text*.

    Each line containing a colon is split at its first colon; the stripped
    left part becomes the key and the stripped remainder the value. Lines
    without a colon are ignored, and a repeated key keeps the last value seen.

    Args:
        text: Multi-line string to scan.

    Returns:
        A dict mapping stripped keys to stripped values.
    """
    # partition() yields an empty separator when the line has no colon,
    # which is what filters such lines out below.
    pieces = (row.partition(':') for row in text.splitlines())
    return {
        name.strip(): value.strip()
        for name, colon, value in pieces
        if colon
    }