File size: 2,388 Bytes
459372e
31d231c
 
1804090
1ccec94
31d231c
1ccec94
9e5331a
 
 
 
 
1ccec94
 
 
 
 
 
 
 
 
9e5331a
 
1804090
 
1ccec94
459372e
 
1804090
9e5331a
1804090
1ccec94
 
 
 
 
1804090
 
 
1ccec94
1804090
1ccec94
1804090
1ccec94
459372e
 
 
c19a2c1
 
459372e
 
 
 
 
 
 
 
c19a2c1
459372e
 
 
 
 
 
 
 
 
1ccec94
459372e
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import os
import tempfile
from difflib import get_close_matches

import gradio as gr
from docx import Document
from docx.shared import RGBColor

from utils import extract_text_from_pdf, parse_pdf_to_dict

# Improved red color detection
def is_red_color(run):
    color = run.font.color
    if color is None:
        return False
    if color.rgb:
        # Try detecting red tones even if slightly off
        red_shades = [
            RGBColor(255, 0, 0),
            RGBColor(200, 0, 0),
            RGBColor(255, 20, 20),
            RGBColor(192, 0, 0),
            RGBColor(220, 20, 60),
        ]
        return any(color.rgb == shade for shade in red_shades)
    return False

def replace_red_text_with_data(doc_path, data_dict):
    doc = Document(doc_path)
    matched = 0

    for para in doc.paragraphs:
        for run in para.runs:
            if is_red_color(run):
                red_text = run.text.strip()
                print(f"Found red text: {red_text}")
                if not red_text:
                    continue
                # Use fuzzy matching
                key_match = get_close_matches(red_text.lower(), [k.lower() for k in data_dict], n=1, cutoff=0.6)
                if key_match:
                    for key in data_dict:
                        if key.lower() == key_match[0]:
                            print(f"Replacing '{red_text}' with '{data_dict[key]}'")
                            run.text = data_dict[key]
                            matched += 1
                            break
    print(f"Total replacements: {matched}")
    return doc

def process_files(pdf_file, template_docx):
    pdf_path = pdf_file
    doc_path = template_docx
    output_path = "filled_output.docx"

    raw_text = extract_text_from_pdf(pdf_path)
    data_dict = parse_pdf_to_dict(raw_text)

    final_doc = replace_red_text_with_data(doc_path, data_dict)

    final_doc.save(output_path)
    return output_path

demo = gr.Interface(
    fn=process_files,
    inputs=[
        gr.File(label="Upload PDF Report", file_types=[".pdf"]),
        gr.File(label="Upload Word Template (.docx)", file_types=[".docx"])
    ],
    outputs=gr.File(label="Download Filled Report (.docx)"),
    title="Audit Report Generator",
    description="Upload a PDF and a Word template. This tool will auto-fill red-colored outdated text with data from the PDF."
)

if __name__ == "__main__":
    demo.launch()