Shami96 committed on
Commit
03b2a60
·
verified ·
1 Parent(s): 86d5840

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -37
app.py CHANGED
@@ -2,58 +2,41 @@ import gradio as gr
2
  from docx import Document
3
  from docx.shared import RGBColor
4
  from difflib import get_close_matches
5
- from utils import extract_text_from_pdf, parse_pdf_to_dict
6
 
7
- # Improved red color detection
8
  def is_red_color(run):
9
  color = run.font.color
10
- if color is None:
11
  return False
12
- if color.rgb:
13
- # Try detecting red tones even if slightly off
14
- red_shades = [
15
- RGBColor(255, 0, 0),
16
- RGBColor(200, 0, 0),
17
- RGBColor(255, 20, 20),
18
- RGBColor(192, 0, 0),
19
- RGBColor(220, 20, 60),
20
- ]
21
- return any(color.rgb == shade for shade in red_shades)
22
- return False
23
 
24
  def replace_red_text_with_data(doc_path, data_dict):
25
  doc = Document(doc_path)
26
- matched = 0
27
 
28
  for para in doc.paragraphs:
29
  for run in para.runs:
30
  if is_red_color(run):
31
- red_text = run.text.strip()
32
- print(f"Found red text: {red_text}")
33
- if not red_text:
34
- continue
35
- # Use fuzzy matching
36
- key_match = get_close_matches(red_text.lower(), [k.lower() for k in data_dict], n=1, cutoff=0.6)
37
- if key_match:
38
  for key in data_dict:
39
- if key.lower() == key_match[0]:
40
- print(f"Replacing '{red_text}' with '{data_dict[key]}'")
41
  run.text = data_dict[key]
42
- matched += 1
43
  break
44
- print(f"Total replacements: {matched}")
45
  return doc
46
 
47
- def process_files(pdf_file, template_docx):
48
- pdf_path = pdf_file
49
- doc_path = template_docx
50
- output_path = "filled_output.docx"
51
-
52
- raw_text = extract_text_from_pdf(pdf_path)
53
- data_dict = parse_pdf_to_dict(raw_text)
54
 
55
- final_doc = replace_red_text_with_data(doc_path, data_dict)
 
56
 
 
 
57
  final_doc.save(output_path)
58
  return output_path
59
 
@@ -63,9 +46,9 @@ demo = gr.Interface(
63
  gr.File(label="Upload PDF Report", file_types=[".pdf"]),
64
  gr.File(label="Upload Word Template (.docx)", file_types=[".docx"])
65
  ],
66
- outputs=gr.File(label="Download Filled Report (.docx)"),
67
- title="Audit Report Generator",
68
- description="Upload a PDF and a Word template. This tool will auto-fill red-colored outdated text with data from the PDF."
69
  )
70
 
71
  if __name__ == "__main__":
 
2
  from docx import Document
3
  from docx.shared import RGBColor
4
  from difflib import get_close_matches
5
+ from pdf_parser import extract_text_from_pdf, parse_data_blocks
6
 
 
7
  def is_red_color(run):
8
  color = run.font.color
9
+ if not color or not color.rgb:
10
  return False
11
+ r, g, b = color.rgb[0], color.rgb[1], color.rgb[2]
12
+ return r >= 200 and g <= 100 and b <= 100 # red-dominant
 
 
 
 
 
 
 
 
 
13
 
14
  def replace_red_text_with_data(doc_path, data_dict):
15
  doc = Document(doc_path)
 
16
 
17
  for para in doc.paragraphs:
18
  for run in para.runs:
19
  if is_red_color(run):
20
+ original_text = run.text.strip()
21
+ # Try exact or close match
22
+ match = get_close_matches(original_text.lower(), [k.lower() for k in data_dict.keys()], n=1, cutoff=0.6)
23
+ if match:
 
 
 
24
  for key in data_dict:
25
+ if key.lower() == match[0]:
 
26
  run.text = data_dict[key]
 
27
  break
 
28
  return doc
29
 
30
def process_files(pdf_file, word_template):
    """Produce the filled Word report from a PDF and a Word template.

    The PDF is run through ``extract_text_from_pdf`` and the result through
    ``parse_data_blocks``; that data is then handed to
    ``replace_red_text_with_data`` together with the template. The resulting
    document is saved as ``filled_output.docx`` (a relative path).

    Returns:
        The path the document was saved to.
    """
    parsed_values = parse_data_blocks(extract_text_from_pdf(pdf_file))
    filled_document = replace_red_text_with_data(word_template, parsed_values)

    destination = "filled_output.docx"
    filled_document.save(destination)
    return destination
42
 
 
46
  gr.File(label="Upload PDF Report", file_types=[".pdf"]),
47
  gr.File(label="Upload Word Template (.docx)", file_types=[".docx"])
48
  ],
49
+ outputs=gr.File(label="Download Updated Word (.docx)"),
50
+ title="Audit Report Auto-Filler",
51
+ description="Replaces outdated red text in Word using updated values from a PDF report."
52
  )
53
 
54
  if __name__ == "__main__":