Shami96 committed on
Commit
de67fe9
·
verified ·
1 Parent(s): cce0884

Delete pdf_extractor.py

Browse files
Files changed (1) hide show
  1. pdf_extractor.py +0 -24
pdf_extractor.py DELETED
@@ -1,24 +0,0 @@
1
- # pdf_extractor.py
2
- import fitz # PyMuPDF
3
-
4
- def extract_text_pdf_raw(pdf_path):
5
- doc = fitz.open(pdf_path)
6
- return "\n".join(page.get_text() for page in doc)
7
-
8
- def extract_label_value_pairs(pdf_path):
9
- raw_text = extract_text_pdf_raw(pdf_path)
10
- label_value_map = {}
11
-
12
- for line in raw_text.split('\n'):
13
- line = line.strip()
14
- if ':' in line:
15
- label, value = line.split(':', 1)
16
- if len(value.strip()) > 0:
17
- label_value_map[label.strip().lower()] = value.strip()
18
- elif '-' in line:
19
- parts = line.split('-', 1)
20
- if len(parts) == 2:
21
- label, value = parts
22
- label_value_map[label.strip().lower()] = value.strip()
23
-
24
- return label_value_map