Spaces:
Running
Running
File size: 2,388 Bytes
459372e 31d231c 1804090 1ccec94 31d231c 1ccec94 9e5331a 1ccec94 9e5331a 1804090 1ccec94 459372e 1804090 9e5331a 1804090 1ccec94 1804090 1ccec94 1804090 1ccec94 1804090 1ccec94 459372e c19a2c1 459372e c19a2c1 459372e 1ccec94 459372e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
import os
import tempfile
from difflib import get_close_matches

import gradio as gr
from docx import Document
from docx.shared import RGBColor

from utils import extract_text_from_pdf, parse_pdf_to_dict
# Improved red color detection
# Reference red tones as (r, g, b) triples; a colour counts as red when it
# lies within the per-channel tolerance of at least one of them.
_RED_SHADES = (
    (255, 0, 0),
    (200, 0, 0),
    (255, 20, 20),
    (192, 0, 0),
    (220, 20, 60),
)


def is_red_color(run, tolerance=30):
    """Return True when the run's explicit font colour is close to red.

    The colour is read from ``run.font.color.rgb`` and compared, channel by
    channel, against a small set of reference red shades.  Exact equality
    would miss reds that are only slightly off, so each channel may differ
    by up to *tolerance*.

    Args:
        run: Object exposing ``font.color.rgb`` (presumably a python-docx
            ``Run`` -- the rgb value is unpacked as a 3-item sequence).
        tolerance: Maximum absolute difference allowed per channel.  Pass
            ``0`` to require an exact match with one of the reference shades.

    Returns:
        ``True`` if the colour is within tolerance of a reference red,
        ``False`` otherwise (including when no RGB colour is set).
    """
    color = run.font.color
    if color is None:
        return False

    rgb = color.rgb
    if not rgb:
        # No explicit RGB value available for this run.
        return False

    red, green, blue = rgb
    return any(
        abs(red - ref_r) <= tolerance
        and abs(green - ref_g) <= tolerance
        and abs(blue - ref_b) <= tolerance
        for ref_r, ref_g, ref_b in _RED_SHADES
    )
def replace_red_text_with_data(doc_path, data_dict):
    """Replace red placeholder runs in a Word document with extracted values.

    Each run in ``doc.paragraphs`` that ``is_red_color`` accepts is treated
    as a placeholder.  Its stripped text is fuzzy-matched, case-insensitively
    (``difflib.get_close_matches`` with cutoff 0.6), against the keys of
    *data_dict*; on a hit the run's text is overwritten with that key's value.

    NOTE(review): only ``doc.paragraphs`` is walked, so red text elsewhere
    (e.g. inside tables) is presumably not visited -- confirm against the
    templates in use.

    Args:
        doc_path: Passed straight to ``Document`` to open the template.
        data_dict: Mapping of field name to replacement value.

    Returns:
        The modified document object; saving it is left to the caller.
    """
    doc = Document(doc_path)

    # Build the case-insensitive lookup once instead of per run.  setdefault
    # keeps the first key when several keys differ only by case.
    lowered_to_key = {}
    for key in data_dict:
        lowered_to_key.setdefault(key.lower(), key)
    candidates = list(lowered_to_key)

    matched = 0
    for para in doc.paragraphs:
        for run in para.runs:
            if not is_red_color(run):
                continue

            red_text = run.text.strip()
            if not red_text:
                # Whitespace-only red runs carry no placeholder to match.
                continue
            print(f"Found red text: {red_text}")

            key_match = get_close_matches(
                red_text.lower(), candidates, n=1, cutoff=0.6
            )
            if not key_match:
                continue

            value = data_dict[lowered_to_key[key_match[0]]]
            print(f"Replacing '{red_text}' with '{value}'")
            # Convert to str so non-string values can be assigned as run text.
            run.text = str(value)
            matched += 1

    print(f"Total replacements: {matched}")
    return doc
def process_files(pdf_file, template_docx):
    """Fill the Word template with data parsed from the PDF and save the result.

    The PDF text is extracted with ``extract_text_from_pdf``, turned into a
    dictionary by ``parse_pdf_to_dict`` and applied to the template through
    ``replace_red_text_with_data``.

    Args:
        pdf_file: Uploaded PDF, either a path or an object whose ``name``
            attribute holds the path (presumably depends on the Gradio
            version -- verify against the deployed one).
        template_docx: Uploaded ``.docx`` template, same conventions.

    Returns:
        Path of the generated ``filled_output.docx``.

    Raises:
        gr.Error: If either upload is missing.
    """
    if pdf_file is None or template_docx is None:
        raise gr.Error("Please upload both a PDF report and a Word template.")

    # Accept both plain paths and upload wrappers that carry the path in .name.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    doc_path = getattr(template_docx, "name", template_docx)

    raw_text = extract_text_from_pdf(pdf_path)
    data_dict = parse_pdf_to_dict(raw_text)
    final_doc = replace_red_text_with_data(doc_path, data_dict)

    # Write into a fresh directory per call: a single fixed path in the
    # working directory would be overwritten by concurrent requests.  The
    # file name is unchanged so the download keeps the same name.
    # NOTE: the temporary directory is not removed here.
    output_dir = tempfile.mkdtemp(prefix="audit_report_")
    output_path = os.path.join(output_dir, "filled_output.docx")
    final_doc.save(output_path)
    return output_path
# Upload widgets, listed in the positional order of process_files' parameters.
_upload_widgets = [
    gr.File(file_types=[".pdf"], label="Upload PDF Report"),
    gr.File(file_types=[".docx"], label="Upload Word Template (.docx)"),
]

# Single download widget that receives the path returned by process_files.
_download_widget = gr.File(label="Download Filled Report (.docx)")

# Gradio UI wiring the two uploads to process_files.
demo = gr.Interface(
    title="Audit Report Generator",
    description="Upload a PDF and a Word template. This tool will auto-fill red-colored outdated text with data from the PDF.",
    fn=process_files,
    inputs=_upload_widgets,
    outputs=_download_widget,
)
# Launch the Gradio app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()