Shami96 committed on
Commit
65691ad
·
verified ·
1 Parent(s): b48339c

Create pdf_parser.py

Browse files
Files changed (1) hide show
  1. pdf_parser.py +22 -0
pdf_parser.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # pdf_parser.py
2
+
3
+ import fitz # PyMuPDF
4
+
5
def extract_text_from_pdf(pdf_path) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Location of the PDF, passed unchanged to ``fitz.open``.

    Returns:
        The result of ``page.get_text()`` (called with its default options)
        for each page, joined together with no extra separator.

    Raises:
        Whatever ``fitz.open`` or ``page.get_text`` raises; nothing is
        caught here.
    """
    # Open the document as a context manager so the underlying file handle
    # is released even when text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string with += on every page.
        return "".join(page.get_text() for page in doc)
11
+
12
def parse_data_blocks(text):
    """Parse ``key: value`` lines from *text* into a dictionary.

    The text is split on ``"\n"``. Each line that contains a colon is split
    at the first colon only, so a value may itself contain colons (for
    example ``"Time: 10:30"``). Key and value are stripped of surrounding
    whitespace. Lines without a colon, and lines whose key is empty after
    stripping (such as ``": orphan"``), are ignored. When a key occurs more
    than once, the last occurrence wins.

    Args:
        text: The raw text to scan.

    Returns:
        A dict mapping each stripped key to its stripped value.
    """
    # You can customize this logic as needed
    data = {}

    for line in text.split("\n"):
        key, sep, val = line.partition(":")
        key = key.strip()
        # No separator means no colon on this line; an empty key would only
        # produce a meaningless "" entry, so skip both cases.
        if not sep or not key:
            continue
        data[key] = val.strip()

    return data