PDF-Data_Extractor / pdf_parser.py
Shami96's picture
Create pdf_parser.py
65691ad verified
raw
history blame
459 Bytes
# pdf_parser.py
import fitz # PyMuPDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page in a PDF, concatenated in page order.

    Args:
        pdf_path: Location of the document; passed unchanged to
            ``fitz.open``.

    Returns:
        One string made of each page's ``get_text()`` output, joined with
        no separator. Empty string when the document yields no pages.

    Any exception raised by PyMuPDF while opening or reading the document
    propagates to the caller.
    """
    # The context manager closes the document even when a page fails to
    # extract, so the underlying file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        # join() avoids rebuilding an ever-growing string once per page.
        return "".join(page.get_text() for page in doc)
def parse_data_blocks(text, delimiter=":"):
    """Parse ``key<delimiter>value`` lines from *text* into a dictionary.

    The text is split on ``"\\n"``. Each line containing *delimiter* is cut
    at its first occurrence; the part before becomes the key and the part
    after becomes the value, both stripped of surrounding whitespace.
    Lines without the delimiter are ignored. If a key occurs more than
    once, the last value wins.

    Args:
        text: The raw text to scan, e.g. the output of a PDF extraction.
        delimiter: Separator between key and value. Defaults to ``":"``,
            which reproduces the original hard-coded behaviour.

    Returns:
        A dict mapping each stripped key to its stripped value.

    Raises:
        ValueError: If *delimiter* is empty.
    """
    if not delimiter:
        # An empty separator cannot split anything; fail with a clear message.
        raise ValueError("delimiter must be a non-empty string")

    data = {}
    for line in text.split("\n"):
        # partition() cuts at the first delimiter only, so values may
        # themselves contain the delimiter (e.g. "Time: 10:30").
        key, found, val = line.partition(delimiter)
        if not found:
            continue
        data[key.strip()] = val.strip()
    return data