Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -2,58 +2,41 @@ import gradio as gr
|
|
| 2 |
from docx import Document
|
| 3 |
from docx.shared import RGBColor
|
| 4 |
from difflib import get_close_matches
|
| 5 |
-
from
|
| 6 |
|
| 7 |
-
# Improved red color detection
|
| 8 |
def is_red_color(run):
|
| 9 |
color = run.font.color
|
| 10 |
-
if color
|
| 11 |
return False
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
red_shades = [
|
| 15 |
-
RGBColor(255, 0, 0),
|
| 16 |
-
RGBColor(200, 0, 0),
|
| 17 |
-
RGBColor(255, 20, 20),
|
| 18 |
-
RGBColor(192, 0, 0),
|
| 19 |
-
RGBColor(220, 20, 60),
|
| 20 |
-
]
|
| 21 |
-
return any(color.rgb == shade for shade in red_shades)
|
| 22 |
-
return False
|
| 23 |
|
| 24 |
def replace_red_text_with_data(doc_path, data_dict):
|
| 25 |
doc = Document(doc_path)
|
| 26 |
-
matched = 0
|
| 27 |
|
| 28 |
for para in doc.paragraphs:
|
| 29 |
for run in para.runs:
|
| 30 |
if is_red_color(run):
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
# Use fuzzy matching
|
| 36 |
-
key_match = get_close_matches(red_text.lower(), [k.lower() for k in data_dict], n=1, cutoff=0.6)
|
| 37 |
-
if key_match:
|
| 38 |
for key in data_dict:
|
| 39 |
-
if key.lower() ==
|
| 40 |
-
print(f"Replacing '{red_text}' with '{data_dict[key]}'")
|
| 41 |
run.text = data_dict[key]
|
| 42 |
-
matched += 1
|
| 43 |
break
|
| 44 |
-
print(f"Total replacements: {matched}")
|
| 45 |
return doc
|
| 46 |
|
| 47 |
-
def process_files(pdf_file,
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
raw_text = extract_text_from_pdf(pdf_path)
|
| 53 |
-
data_dict = parse_pdf_to_dict(raw_text)
|
| 54 |
|
| 55 |
-
|
|
|
|
| 56 |
|
|
|
|
|
|
|
| 57 |
final_doc.save(output_path)
|
| 58 |
return output_path
|
| 59 |
|
|
@@ -63,9 +46,9 @@ demo = gr.Interface(
|
|
| 63 |
gr.File(label="Upload PDF Report", file_types=[".pdf"]),
|
| 64 |
gr.File(label="Upload Word Template (.docx)", file_types=[".docx"])
|
| 65 |
],
|
| 66 |
-
outputs=gr.File(label="Download
|
| 67 |
-
title="Audit Report
|
| 68 |
-
description="
|
| 69 |
)
|
| 70 |
|
| 71 |
if __name__ == "__main__":
|
|
|
|
| 2 |
from docx import Document
|
| 3 |
from docx.shared import RGBColor
|
| 4 |
from difflib import get_close_matches
|
| 5 |
+
from pdf_parser import extract_text_from_pdf, parse_data_blocks
|
| 6 |
|
|
|
|
| 7 |
def is_red_color(run, *, min_red=200, max_other=100):
    """Return True when *run*'s font colour is red-dominant.

    A colour is treated as red-dominant when its red channel is at least
    ``min_red`` and both its green and blue channels are at most
    ``max_other``.  With the default thresholds this is the same test the
    function has always applied (r >= 200, g <= 100, b <= 100).

    Args:
        run: Object exposing ``font.color.rgb``.  ``rgb`` is read as a
            sequence whose first three items are the red, green and blue
            channel values; a missing/falsy ``color`` or ``rgb`` means
            "no explicit colour".
        min_red: Lowest red-channel value that still counts as red.
        max_other: Highest green/blue-channel value that still counts as red.

    Returns:
        bool: True for a red-dominant colour, False otherwise (including
        when no colour is set).
    """
    color = run.font.color
    if not color:
        return False

    rgb = color.rgb
    if not rgb:
        return False

    # Only the first three channels are consulted, matching index access
    # on positions 0, 1 and 2.
    red, green, blue = rgb[:3]
    return red >= min_red and green <= max_other and blue <= max_other
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
def replace_red_text_with_data(doc_path, data_dict):
    """Replace the text of red runs with fuzzy-matched values from a dict.

    The document is opened with ``Document(doc_path)``.  For every run in
    ``doc.paragraphs`` that ``is_red_color`` accepts, the run's stripped,
    lowercased text is compared against the lowercased keys of
    ``data_dict`` using ``difflib.get_close_matches`` (best single match,
    cutoff 0.6).  When a match is found the run's text is overwritten with
    the value stored under the matching key; runs without a match are left
    untouched.

    Only the paragraphs returned by ``doc.paragraphs`` are visited.

    Args:
        doc_path: Passed straight to ``Document``.
        data_dict: Mapping whose keys are strings (``.lower()`` is called
            on each) and whose values are assigned to ``run.text``.

    Returns:
        The modified document object; nothing is saved here.
    """
    doc = Document(doc_path)

    # Build the lowercase lookup once instead of once per red run.  When
    # several keys collapse to the same lowercase form, the first key in
    # dict order wins -- the same key a front-to-back scan would find.
    lowered_to_key = {}
    for key in data_dict:
        lowered_to_key.setdefault(key.lower(), key)
    candidates = list(lowered_to_key)

    for para in doc.paragraphs:
        for run in para.runs:
            if not is_red_color(run):
                continue

            original_text = run.text.strip()
            # Exact matches score 1.0, so they are covered by the same call.
            match = get_close_matches(
                original_text.lower(), candidates, n=1, cutoff=0.6
            )
            if match:
                run.text = data_dict[lowered_to_key[match[0]]]

    return doc
|
| 29 |
|
| 30 |
+
def process_files(pdf_file, word_template):
    """Fill a Word template from a PDF report and return the saved file path.

    Steps, in order:
      1. ``extract_text_from_pdf(pdf_file)`` produces the raw text.
      2. ``parse_data_blocks`` turns that text into the replacement dict.
      3. ``replace_red_text_with_data`` applies the dict to ``word_template``.
      4. The resulting document is saved as ``filled_output.docx`` (a
         relative path) and that path is returned.

    Args:
        pdf_file: Forwarded unchanged to ``extract_text_from_pdf``.
        word_template: Forwarded unchanged to ``replace_red_text_with_data``.

    Returns:
        str: The path the filled document was written to.
    """
    destination = "filled_output.docx"

    # PDF -> raw text -> key/value data used for the replacements.
    replacements = parse_data_blocks(extract_text_from_pdf(pdf_file))

    # Apply the replacements to the template and persist the result.
    filled_document = replace_red_text_with_data(word_template, replacements)
    filled_document.save(destination)

    return destination
|
| 42 |
|
|
|
|
| 46 |
gr.File(label="Upload PDF Report", file_types=[".pdf"]),
|
| 47 |
gr.File(label="Upload Word Template (.docx)", file_types=[".docx"])
|
| 48 |
],
|
| 49 |
+
outputs=gr.File(label="Download Updated Word (.docx)"),
|
| 50 |
+
title="Audit Report Auto-Filler",
|
| 51 |
+
description="Replaces outdated red text in Word using updated values from a PDF report."
|
| 52 |
)
|
| 53 |
|
| 54 |
if __name__ == "__main__":
|