Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,40 +1,47 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
import os
|
| 3 |
-
from docx import Document
|
| 4 |
-
from utils import extract_text_from_pdf, parse_pdf_to_dict
|
| 5 |
-
|
| 6 |
-
from docx.shared import RGBColor
|
| 7 |
-
|
| 8 |
from docx import Document
|
| 9 |
from docx.shared import RGBColor
|
| 10 |
from difflib import get_close_matches
|
|
|
|
| 11 |
|
|
|
|
| 12 |
def is_red_color(run):
|
| 13 |
color = run.font.color
|
| 14 |
if color is None:
|
| 15 |
return False
|
| 16 |
if color.rgb:
|
| 17 |
-
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
return False
|
| 20 |
-
|
| 21 |
|
| 22 |
def replace_red_text_with_data(doc_path, data_dict):
|
| 23 |
doc = Document(doc_path)
|
|
|
|
| 24 |
|
| 25 |
for para in doc.paragraphs:
|
| 26 |
for run in para.runs:
|
| 27 |
if is_red_color(run):
|
| 28 |
red_text = run.text.strip()
|
| 29 |
-
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
| 31 |
if key_match:
|
| 32 |
-
# Find original key with matching text
|
| 33 |
for key in data_dict:
|
| 34 |
if key.lower() == key_match[0]:
|
|
|
|
| 35 |
run.text = data_dict[key]
|
|
|
|
| 36 |
break
|
| 37 |
-
|
| 38 |
return doc
|
| 39 |
|
| 40 |
def process_files(pdf_file, template_docx):
|
|
@@ -42,14 +49,11 @@ def process_files(pdf_file, template_docx):
|
|
| 42 |
doc_path = template_docx
|
| 43 |
output_path = "filled_output.docx"
|
| 44 |
|
| 45 |
-
# Extract and parse PDF
|
| 46 |
raw_text = extract_text_from_pdf(pdf_path)
|
| 47 |
data_dict = parse_pdf_to_dict(raw_text)
|
| 48 |
|
| 49 |
-
# Replace red text with data
|
| 50 |
final_doc = replace_red_text_with_data(doc_path, data_dict)
|
| 51 |
|
| 52 |
-
# Save final document
|
| 53 |
final_doc.save(output_path)
|
| 54 |
return output_path
|
| 55 |
|
|
@@ -61,7 +65,7 @@ demo = gr.Interface(
|
|
| 61 |
],
|
| 62 |
outputs=gr.File(label="Download Filled Report (.docx)"),
|
| 63 |
title="Audit Report Generator",
|
| 64 |
-
description="Upload a PDF and a Word template. This tool will auto-fill red-
|
| 65 |
)
|
| 66 |
|
| 67 |
if __name__ == "__main__":
|
|
|
|
| 1 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
from docx import Document
|
| 3 |
from docx.shared import RGBColor
|
| 4 |
from difflib import get_close_matches
|
| 5 |
+
from utils import extract_text_from_pdf, parse_pdf_to_dict
|
| 6 |
|
| 7 |
+
# Improved red color detection
|
| 8 |
def is_red_color(run, *, min_red=150, max_other=80):
    """Return True when *run* carries an explicit RGB font colour that reads as red.

    The previous implementation compared the colour against a fixed list of
    five exact shades, so any other red (for example ``EE0000``) was missed.
    This version tests the channels instead: the red channel must be strong
    and the green and blue channels weak. All five shades from the old list
    satisfy the default thresholds.

    Args:
        run: Object exposing ``run.font.color.rgb``; the value is expected to
            be a 3-item sequence of ints (red, green, blue) or ``None``.
        min_red: Smallest red-channel value still treated as red.
        max_other: Largest green/blue-channel value still treated as red.

    Returns:
        bool: True if the run's explicit RGB colour falls in the red range,
        False when there is no colour object, no RGB value, or the colour is
        outside the range.
    """
    color = run.font.color
    if color is None:
        return False

    rgb = color.rgb
    if not rgb:
        # No explicit RGB value is set on this run.
        return False

    red, green, blue = rgb[0], rgb[1], rgb[2]
    return red >= min_red and green <= max_other and blue <= max_other
|
|
|
|
| 23 |
|
| 24 |
def replace_red_text_with_data(doc_path, data_dict):
    """Fill red-coloured runs of a Word document with values from *data_dict*.

    Each run for which ``is_red_color`` is true has its stripped text
    fuzzy-matched (case-insensitively, cutoff 0.6) against the keys of
    *data_dict*; on a match the run's text is replaced by the matching value.
    Progress is reported with ``print``.

    NOTE: only ``doc.paragraphs`` is walked, so red text located elsewhere in
    the document (e.g. inside tables) is not visited by this function.

    Args:
        doc_path: Path (or file-like object) accepted by ``Document``.
        data_dict: Mapping of field name to replacement text. Keys must be
            strings; values are assigned to ``run.text`` as-is.

    Returns:
        The loaded document object with replacements applied (not saved).
    """
    doc = Document(doc_path)
    matched = 0

    # Build the case-insensitive lookup once instead of once per red run.
    # setdefault keeps the FIRST key for a given lowercase form, which is the
    # key the original linear scan would have found.
    key_by_lower = {}
    for key in data_dict:
        key_by_lower.setdefault(key.lower(), key)
    candidates = list(key_by_lower)

    for para in doc.paragraphs:
        for run in para.runs:
            if not is_red_color(run):
                continue

            red_text = run.text.strip()
            print(f"Found red text: {red_text}")
            if not red_text:
                continue

            # Use fuzzy matching
            key_match = get_close_matches(red_text.lower(), candidates, n=1, cutoff=0.6)
            if not key_match:
                continue

            key = key_by_lower[key_match[0]]
            print(f"Replacing '{red_text}' with '{data_dict[key]}'")
            run.text = data_dict[key]
            matched += 1

    print(f"Total replacements: {matched}")
    return doc
|
| 46 |
|
| 47 |
def process_files(pdf_file, template_docx):
|
|
|
|
| 49 |
doc_path = template_docx
|
| 50 |
output_path = "filled_output.docx"
|
| 51 |
|
|
|
|
| 52 |
raw_text = extract_text_from_pdf(pdf_path)
|
| 53 |
data_dict = parse_pdf_to_dict(raw_text)
|
| 54 |
|
|
|
|
| 55 |
final_doc = replace_red_text_with_data(doc_path, data_dict)
|
| 56 |
|
|
|
|
| 57 |
final_doc.save(output_path)
|
| 58 |
return output_path
|
| 59 |
|
|
|
|
| 65 |
],
|
| 66 |
outputs=gr.File(label="Download Filled Report (.docx)"),
|
| 67 |
title="Audit Report Generator",
|
| 68 |
+
description="Upload a PDF and a Word template. This tool will auto-fill red-colored outdated text with data from the PDF."
|
| 69 |
)
|
| 70 |
|
| 71 |
if __name__ == "__main__":
|