File size: 2,152 Bytes
5fb575a
459372e
4b3f51e
5fb575a
4b3f51e
31d231c
 
4b3f51e
31d231c
9e5331a
5fb575a
 
 
4b3f51e
 
 
5fb575a
 
4b3f51e
 
 
5fb575a
 
 
 
 
 
 
 
 
 
4b3f51e
 
5fb575a
 
4b3f51e
 
 
 
 
 
 
 
 
 
 
5fb575a
 
4b3f51e
5fb575a
4b3f51e
 
 
 
459372e
 
4b3f51e
 
459372e
4b3f51e
5fb575a
 
4b3f51e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# app.py
import gradio as gr
import tempfile
from pdf_extractor import extract_label_value_pairs
from word_extractor import extract_red_text_with_labels, is_red_font
from docx import Document
from docx.shared import RGBColor
import difflib


def find_best_match_label(target_label, pdf_data):
    """Return the key of *pdf_data* that most closely resembles *target_label*.

    The comparison is case-insensitive: both the target and the candidate
    keys are lowercased before fuzzy matching, and the key is returned in
    its original spelling so it can be used to index *pdf_data* directly.

    Args:
        target_label: Label text to look up.
        pdf_data: Mapping whose string keys are the candidate labels.

    Returns:
        The best-matching original key, or ``None`` when no key reaches the
        similarity cutoff (0.4) or *pdf_data* is empty.
    """
    # Map each lowercased key back to its original spelling. If two keys
    # differ only by case, the first one encountered wins.
    original_by_lower = {}
    for key in pdf_data:
        original_by_lower.setdefault(key.lower(), key)

    matches = difflib.get_close_matches(
        target_label.lower(), list(original_by_lower), n=1, cutoff=0.4
    )
    return original_by_lower[matches[0]] if matches else None


def replace_red_text_by_label(word_path, label_value_map):
    """Replace red text in a Word document's tables with mapped values.

    For every table row with at least two cells, the first cell's text is
    treated as a label (colons removed, newlines turned into spaces) and
    fuzzy-matched against the keys of *label_value_map*. When a match is
    found, the runs in the second cell that ``is_red_font`` reports as red
    are replaced by the mapped value and recoloured black.

    Red text is frequently split across several runs. The value is written
    into the first red run only and the remaining red runs in that cell are
    emptied, so the value appears once instead of once per run.

    Args:
        word_path: Path of the ``.docx`` file to read.
        label_value_map: Mapping of label -> replacement text.

    Returns:
        Path of the updated document, saved as ``updated.docx`` inside a
        freshly created temporary directory.
    """
    doc = Document(word_path)

    for table in doc.tables:
        for row in table.rows:
            cells = row.cells
            if len(cells) < 2:
                continue

            label = cells[0].text.strip().replace(":", "").replace("\n", " ")
            matched_label = find_best_match_label(label, label_value_map)
            if not matched_label:
                continue

            new_value = label_value_map[matched_label]
            value_written = False

            for para in cells[1].paragraphs:
                for run in para.runs:
                    if not is_red_font(run):
                        continue
                    # Only the first red run carries the value; later red
                    # runs are blanked to avoid repeating it.
                    run.text = "" if value_written else new_value
                    run.font.color.rgb = RGBColor(0, 0, 0)  # make black
                    value_written = True

    # The directory is not cleaned up here: the caller needs the file to
    # exist after this function returns.
    temp_dir = tempfile.mkdtemp()
    updated_path = f"{temp_dir}/updated.docx"
    doc.save(updated_path)
    return updated_path


def process_files(pdf_file, word_file):
    """Fill the uploaded Word document with values extracted from the PDF.

    Args:
        pdf_file: Uploaded PDF, either a filesystem path or an object
            exposing the path as ``.name``.
        word_file: Uploaded Word document, in the same form as *pdf_file*.

    Returns:
        Path of the updated Word document.

    Raises:
        ValueError: If either upload is missing.
    """
    if pdf_file is None or word_file is None:
        raise ValueError("Both a PDF file and a Word file must be uploaded.")

    # Depending on the Gradio version / `type` setting, an upload arrives
    # either as a file-like wrapper with `.name` or as a plain path string;
    # accept both.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    word_path = getattr(word_file, "name", word_file)

    pdf_data = extract_label_value_pairs(pdf_path)  # {label: value}

    # The red-text extraction result was previously computed here but never
    # used; the replacement step re-reads the document itself.
    return replace_red_text_by_label(word_path, pdf_data)


# Build the two upload widgets, wire them to process_files, and start the
# web UI as soon as this module is executed.
pdf_input = gr.File(label="Upload PDF File", type="file")
word_input = gr.File(label="Upload Word File", type="file")

demo = gr.Interface(
    fn=process_files,
    inputs=[pdf_input, word_input],
    outputs=gr.File(label="Download Updated Word File"),
    title="Red Text Replacer (Label-Aware)",
    description="Upload a PDF and Word document. Red-colored text in the Word doc will be replaced by matching values from the PDF using label-based matching.",
)
demo.launch()