Shami96 committed on
Commit
86d5840
·
verified ·
1 Parent(s): 1ccec94

Update pdf_parser.py

Browse files
Files changed (1) hide show
  1. pdf_parser.py +4 -12
pdf_parser.py CHANGED
@@ -1,22 +1,14 @@
1
  # pdf_parser.py
2
-
3
  import fitz # PyMuPDF
4
 
5
  def extract_text_from_pdf(pdf_path):
6
  doc = fitz.open(pdf_path)
7
- text = ""
8
- for page in doc:
9
- text += page.get_text()
10
- return text
11
 
12
  def parse_data_blocks(text):
13
- # You can customize this logic as needed
14
  data = {}
15
- lines = text.split("\n")
16
-
17
- for line in lines:
18
- if ":" in line:
19
- key, val = line.split(":", 1)
20
  data[key.strip()] = val.strip()
21
-
22
  return data
 
1
  # pdf_parser.py
 
2
  import fitz # PyMuPDF
3
 
4
  def extract_text_from_pdf(pdf_path):
5
  doc = fitz.open(pdf_path)
6
+ return "\n".join(page.get_text() for page in doc)
 
 
 
7
 
8
  def parse_data_blocks(text):
 
9
  data = {}
10
+ for line in text.splitlines():
11
+ if ':' in line:
12
+ key, val = line.split(':', 1)
 
 
13
  data[key.strip()] = val.strip()
 
14
  return data