Shami96 committed on
Commit
4b3f51e
·
verified ·
1 Parent(s): b433369

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -43
app.py CHANGED
@@ -1,55 +1,81 @@
1
  import gradio as gr
 
 
 
 
2
  from docx import Document
3
  from docx.shared import RGBColor
4
- from difflib import get_close_matches
5
- from pdf_parser import extract_text_from_pdf, parse_data_blocks
6
 
7
- def is_red_color(run):
8
- color = run.font.color
9
- if not color or not color.rgb:
10
- return False
11
- r, g, b = color.rgb[0], color.rgb[1], color.rgb[2]
12
- return r >= 200 and g <= 100 and b <= 100 # red-dominant
13
 
14
- def replace_red_text_with_data(doc_path, data_dict):
 
 
 
 
 
15
  doc = Document(doc_path)
16
 
17
  for para in doc.paragraphs:
18
  for run in para.runs:
19
- if is_red_color(run):
20
- original_text = run.text.strip()
21
- # Try exact or close match
22
- match = get_close_matches(original_text.lower(), [k.lower() for k in data_dict.keys()], n=1, cutoff=0.6)
23
- if match:
24
- for key in data_dict:
25
- if key.lower() == match[0]:
26
- run.text = data_dict[key]
27
- break
28
- return doc
29
-
30
- def process_files(pdf_file, word_template):
31
- # Extract data from PDF
32
- raw_text = extract_text_from_pdf(pdf_file)
33
- data_dict = parse_data_blocks(raw_text)
34
-
35
- # Replace red text in Word
36
- final_doc = replace_red_text_with_data(word_template, data_dict)
37
-
38
- # Save and return output
39
- output_path = "filled_output.docx"
40
- final_doc.save(output_path)
41
- return output_path
42
-
43
- demo = gr.Interface(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  fn=process_files,
45
  inputs=[
46
- gr.File(label="Upload PDF Report", file_types=[".pdf"]),
47
- gr.File(label="Upload Word Template (.docx)", file_types=[".docx"])
48
  ],
49
- outputs=gr.File(label="Download Updated Word (.docx)"),
50
- title="Audit Report Auto-Filler",
51
- description="Replaces outdated red text in Word using updated values from a PDF report."
52
- )
53
-
54
- if __name__ == "__main__":
55
- demo.launch()
 
1
  import gradio as gr
2
+ import tempfile
3
+ import shutil
4
+ from pdf_extractor import extract_text_pdf_raw
5
+ from word_extractor import extract_red_text_with_labels, is_red_font
6
  from docx import Document
7
  from docx.shared import RGBColor
8
+ import difflib
 
9
 
 
 
 
 
 
 
10
 
11
def find_best_match(target, candidates, cutoff=0.5):
    """Return the entry of *candidates* most similar to *target*, or ``None``.

    Similarity is scored by ``difflib.get_close_matches``; only the single
    best candidate whose ratio reaches *cutoff* is returned.

    Args:
        target: The string to look up.
        candidates: An iterable of candidate strings. A single string is
            also accepted and is treated as multi-line text: it is split
            into its non-empty, stripped lines before matching.
        cutoff: Minimum similarity ratio in ``[0, 1]`` required for a match.

    Returns:
        The best-matching candidate string, or ``None`` when nothing
        reaches the cutoff.
    """
    if isinstance(candidates, str):
        # difflib iterates its possibilities; a bare string would be compared
        # one character at a time, which can never match a realistic target.
        candidates = [
            line.strip() for line in candidates.splitlines() if line.strip()
        ]
    match = difflib.get_close_matches(target, candidates, n=1, cutoff=cutoff)
    return match[0] if match else None
14
+
15
+
16
def _replace_red_runs(paragraphs, replacements):
    """Swap each red run in *paragraphs* for its closest entry in *replacements*."""
    for para in paragraphs:
        for run in para.runs:
            if not is_red_font(run):
                continue
            old_text = run.text.strip()
            if not old_text:
                # A whitespace-only run has nothing to match against.
                continue
            new_text = find_best_match(old_text, replacements)
            if new_text:
                run.text = new_text
                run.font.color.rgb = RGBColor(0, 0, 0)  # Set to black


def replace_red_text_in_doc(doc_path, replacements):
    """Replace red runs in a Word document and save the result to a temp file.

    Every run for which ``is_red_font`` is true, in the body paragraphs and
    in the paragraphs of every table cell, is compared against
    *replacements* with ``find_best_match``. When a match is found the run's
    text is replaced by it and the run is recoloured black; runs without a
    match are left unchanged.

    Args:
        doc_path: Path of the ``.docx`` file to read.
        replacements: Candidate replacement strings.

    Returns:
        Path of the saved copy, ``updated.docx`` inside a freshly created
        temporary directory. The directory is not removed by this function.
    """
    import os  # local import: only needed to build the output path

    doc = Document(doc_path)

    _replace_red_runs(doc.paragraphs, replacements)

    # Table content is not part of doc.paragraphs, so walk the cells too.
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                _replace_red_runs(cell.paragraphs, replacements)

    temp_dir = tempfile.mkdtemp()
    updated_path = os.path.join(temp_dir, "updated.docx")
    doc.save(updated_path)
    return updated_path
44
+
45
+
46
def process_files(pdf_file, word_file):
    """Fill the red text of an uploaded Word file with matching PDF content.

    The red strings reported by ``extract_red_text_with_labels`` are each
    fuzzy-matched against the text returned by ``extract_text_pdf_raw``;
    the matched PDF strings are then handed to ``replace_red_text_in_doc``.

    Args:
        pdf_file: Uploaded PDF, either an object exposing ``.name`` (a path)
            or a plain path string.
        word_file: Uploaded Word document, in the same form as *pdf_file*.

    Returns:
        Path of the updated ``.docx`` produced by ``replace_red_text_in_doc``.
    """
    # Accept both upload objects carrying ``.name`` and plain path strings.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    word_path = getattr(word_file, "name", word_file)

    pdf_text = extract_text_pdf_raw(pdf_path)
    word_data = extract_red_text_with_labels(word_path)

    # Fuzzy matching needs a collection of candidate strings. If the PDF text
    # is one string, match per line; iterating it directly would compare the
    # red values against single characters.
    if isinstance(pdf_text, str):
        pdf_candidates = [
            line.strip() for line in pdf_text.splitlines() if line.strip()
        ]
    else:
        pdf_candidates = list(pdf_text or [])

    # Flatten red text entries; dict keys dedupe while keeping first-seen
    # order, so results are reproducible between runs.
    unique_values = {}
    for values in word_data.values():
        for value in values:
            unique_values.setdefault(value, None)

    # Match red values to PDF
    replacements = []
    for val in unique_values:
        match = find_best_match(val, pdf_candidates)
        if match and match not in replacements:
            replacements.append(match)

    # Replace in Word
    return replace_red_text_in_doc(word_path, replacements)
70
+
71
+
72
# Build the two-upload / one-download UI, then start serving it.
# NOTE(review): type="file" is version-dependent in Gradio; confirm it is
# accepted by the Gradio release this app is pinned to.
demo = gr.Interface(
    fn=process_files,
    inputs=[
        gr.File(label="Upload PDF File", type="file"),
        gr.File(label="Upload Word File", type="file"),
    ],
    outputs=gr.File(label="Download Updated Word File"),
    title="Red Text Replacer",
    description="Upload a PDF and Word document. Red-colored text in the Word doc will be replaced by matching content from the PDF.",
)
demo.launch()