Shami96 committed on
Commit
1ccec94
·
verified ·
1 Parent(s): 1804090

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -17
app.py CHANGED
@@ -1,40 +1,47 @@
1
  import gradio as gr
2
- import os
3
- from docx import Document
4
- from utils import extract_text_from_pdf, parse_pdf_to_dict
5
-
6
- from docx.shared import RGBColor
7
-
8
  from docx import Document
9
  from docx.shared import RGBColor
10
  from difflib import get_close_matches
 
11
 
 
12
  def is_red_color(run):
13
  color = run.font.color
14
  if color is None:
15
  return False
16
  if color.rgb:
17
- r, g, b = color.rgb[0], color.rgb[1], color.rgb[2]
18
- return r > 180 and g < 100 and b < 100 # any strong red tint
 
 
 
 
 
 
 
19
  return False
20
-
21
 
22
  def replace_red_text_with_data(doc_path, data_dict):
23
  doc = Document(doc_path)
 
24
 
25
  for para in doc.paragraphs:
26
  for run in para.runs:
27
  if is_red_color(run):
28
  red_text = run.text.strip()
29
- # Try fuzzy match
30
- key_match = get_close_matches(red_text.lower(), [k.lower() for k in data_dict.keys()], n=1, cutoff=0.6)
 
 
 
31
  if key_match:
32
- # Find original key with matching text
33
  for key in data_dict:
34
  if key.lower() == key_match[0]:
 
35
  run.text = data_dict[key]
 
36
  break
37
-
38
  return doc
39
 
40
  def process_files(pdf_file, template_docx):
@@ -42,14 +49,11 @@ def process_files(pdf_file, template_docx):
42
  doc_path = template_docx
43
  output_path = "filled_output.docx"
44
 
45
- # Extract and parse PDF
46
  raw_text = extract_text_from_pdf(pdf_path)
47
  data_dict = parse_pdf_to_dict(raw_text)
48
 
49
- # Replace red text with data
50
  final_doc = replace_red_text_with_data(doc_path, data_dict)
51
 
52
- # Save final document
53
  final_doc.save(output_path)
54
  return output_path
55
 
@@ -61,7 +65,7 @@ demo = gr.Interface(
61
  ],
62
  outputs=gr.File(label="Download Filled Report (.docx)"),
63
  title="Audit Report Generator",
64
- description="Upload a PDF and a Word template. This tool will auto-fill red-highlighted fields with data from the PDF."
65
  )
66
 
67
  if __name__ == "__main__":
 
1
  import gradio as gr
 
 
 
 
 
 
2
  from docx import Document
3
  from docx.shared import RGBColor
4
  from difflib import get_close_matches
5
+ from utils import extract_text_from_pdf, parse_pdf_to_dict
6
 
7
+ # Improved red color detection
8
def is_red_color(run, tolerance=40):
    """Return True when the run's explicit font colour is a shade of red.

    The colour is compared against a small set of reference reds. A colour
    counts as red when every channel is within ``tolerance`` of the same
    channel of at least one reference shade, so slightly-off reds produced
    by different editors or themes are still recognised.

    Args:
        run: Object exposing ``run.font.color.rgb``; the value is indexed as
            an ``(r, g, b)`` sequence of ints (presumably a python-docx
            ``Run`` whose colour is an ``RGBColor`` -- verify against caller).
        tolerance: Maximum allowed per-channel distance from a reference
            shade. ``0`` restricts the check to exact matches.

    Returns:
        bool: True if the colour is close enough to a reference red, False
        when no colour / no RGB value is set or the colour is not red.
    """
    color = run.font.color
    if color is None:
        return False

    rgb = color.rgb
    if not rgb:
        # No explicit RGB value on this run (e.g. inherited colour).
        return False

    # Reference shades kept as plain tuples: element-wise comparison works
    # for any 3-item sequence, so no colour class is needed here.
    red_shades = (
        (255, 0, 0),
        (200, 0, 0),
        (255, 20, 20),
        (192, 0, 0),
        (220, 20, 60),
    )
    return any(
        all(abs(channel - ref) <= tolerance for channel, ref in zip(rgb, shade))
        for shade in red_shades
    )
 
23
 
24
def replace_red_text_with_data(doc_path, data_dict):
    """Replace red-coloured runs in a Word document with values from a dict.

    Each red run's stripped text is fuzzy-matched (case-insensitively, via
    ``difflib.get_close_matches`` with a 0.6 cutoff) against the keys of
    ``data_dict``; on a match the run's text is overwritten with the
    corresponding value. Progress is reported with ``print``.

    NOTE: only the paragraphs returned by ``doc.paragraphs`` are visited;
    nothing else in the document is scanned by this function.

    Args:
        doc_path: Path (or file-like object) handed to ``Document``.
        data_dict: Mapping of field name -> replacement text.

    Returns:
        The modified document object (not saved to disk here).
    """
    doc = Document(doc_path)
    matched = 0

    # Normalise the keys once instead of once per red run. ``setdefault``
    # keeps the first original key when two keys differ only by case, which
    # is the key a first-hit scan over ``data_dict`` would have selected.
    keys_by_lower = {}
    for key in data_dict:
        keys_by_lower.setdefault(key.lower(), key)
    candidates = list(keys_by_lower)

    for para in doc.paragraphs:
        for run in para.runs:
            if not is_red_color(run):
                continue

            red_text = run.text.strip()
            print(f"Found red text: {red_text}")
            if not red_text:
                # Whitespace-only red run: nothing to match against.
                continue

            # Use fuzzy matching so minor wording differences still resolve.
            key_match = get_close_matches(
                red_text.lower(), candidates, n=1, cutoff=0.6
            )
            if not key_match:
                continue

            value = data_dict[keys_by_lower[key_match[0]]]
            print(f"Replacing '{red_text}' with '{value}'")
            run.text = value
            matched += 1

    print(f"Total replacements: {matched}")
    return doc
46
 
47
  def process_files(pdf_file, template_docx):
 
49
  doc_path = template_docx
50
  output_path = "filled_output.docx"
51
 
 
52
  raw_text = extract_text_from_pdf(pdf_path)
53
  data_dict = parse_pdf_to_dict(raw_text)
54
 
 
55
  final_doc = replace_red_text_with_data(doc_path, data_dict)
56
 
 
57
  final_doc.save(output_path)
58
  return output_path
59
 
 
65
  ],
66
  outputs=gr.File(label="Download Filled Report (.docx)"),
67
  title="Audit Report Generator",
68
+ description="Upload a PDF and a Word template. This tool will auto-fill red-colored outdated text with data from the PDF."
69
  )
70
 
71
  if __name__ == "__main__":