Shami96 committed on
Commit
f1bab1c
·
verified ·
1 Parent(s): 4b3f51e

Rename pdf_parser.py to pdf_extractor.py

Browse files
Files changed (2) hide show
  1. pdf_extractor.py +24 -0
  2. pdf_parser.py +0 -14
pdf_extractor.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # pdf_extractor.py
2
+ import fitz # PyMuPDF
3
+
4
def extract_text_pdf_raw(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        The ``get_text()`` output of each page, in document order, with a
        newline between consecutive pages.
    """
    # Open the document as a context manager so it is closed (and its file
    # handle released) even if extracting a page raises.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text() for page in doc)
7
+
8
def extract_label_value_pairs(pdf_path):
    """Build a ``label -> value`` mapping from the text lines of a PDF.

    Each line of the extracted text is stripped and split once, at its first
    colon if it contains one, otherwise at its first hyphen. Lines with
    neither separator are ignored. Labels are lower-cased, and both sides
    are stripped of surrounding whitespace.

    Pairs whose label or value is empty after stripping are skipped for both
    separators, so a bullet line such as ``- item`` or a bare ``Heading:``
    does not produce an entry. When a label occurs more than once, the last
    occurrence wins.

    Args:
        pdf_path: Location of the PDF; forwarded to ``extract_text_pdf_raw``.

    Returns:
        A dict mapping each lower-cased label to its value string.
    """
    raw_text = extract_text_pdf_raw(pdf_path)
    label_value_map = {}

    for line in raw_text.split('\n'):
        line = line.strip()

        # A colon takes precedence: a line containing one is never split on
        # a hyphen, even when the colon yields nothing usable.
        if ':' in line:
            label, _, value = line.partition(':')
        elif '-' in line:
            label, _, value = line.partition('-')
        else:
            continue

        label = label.strip().lower()
        value = value.strip()
        if label and value:
            label_value_map[label] = value

    return label_value_map
pdf_parser.py DELETED
@@ -1,14 +0,0 @@
1
- # pdf_parser.py
2
- import fitz # PyMuPDF
3
-
4
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        The ``get_text()`` output of each page, in document order, with a
        newline between consecutive pages.
    """
    # Open the document as a context manager so it is closed (and its file
    # handle released) even if extracting a page raises.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text() for page in doc)
7
-
8
def parse_data_blocks(text):
    """Collect ``key: value`` pairs from *text*, one candidate per line.

    Lines without a colon are ignored. Every other line is split at its
    first colon, and both halves are stripped of surrounding whitespace.
    When a key appears on several lines, the last one wins.

    Args:
        text: The text to scan.

    Returns:
        A dict mapping each stripped key to its stripped value.
    """
    pairs = (
        row.partition(':')
        for row in text.splitlines()
        if ':' in row
    )
    return {name.strip(): content.strip() for name, _, content in pairs}