# Shami96's picture
# Update app.py
# 5fb575a verified
# raw
# history blame
# 2.15 kB
# app.py
import gradio as gr
import tempfile
from pdf_extractor import extract_label_value_pairs
from word_extractor import extract_red_text_with_labels, is_red_font
from docx import Document
from docx.shared import RGBColor
import difflib
def find_best_match_label(target_label, pdf_data):
    """Return the key of ``pdf_data`` that best fuzzy-matches ``target_label``.

    The comparison is case-insensitive on both sides: the target and every
    candidate key are lower-cased before being scored, and the key is
    returned in its original spelling so it can be used to index
    ``pdf_data`` directly.

    Args:
        target_label: Label text to look up.
        pdf_data: Mapping whose string keys are the candidate labels.

    Returns:
        The best-matching key of ``pdf_data``, or ``None`` when no key
        reaches the 0.4 similarity cutoff (or the mapping is empty).
    """
    # Lower-cased key -> original key. ``setdefault`` keeps the first key
    # when two keys differ only by case.
    original_by_lowered = {}
    for key in pdf_data:
        original_by_lowered.setdefault(key.lower(), key)

    match = difflib.get_close_matches(
        target_label.lower(), list(original_by_lowered), n=1, cutoff=0.4
    )
    return original_by_lowered[match[0]] if match else None
def replace_red_text_by_label(word_path, label_value_map):
    """Fill red placeholder text in a Word document's tables and save a copy.

    For every table row with at least two cells, the text of the first cell
    is treated as a label and fuzzy-matched against the keys of
    ``label_value_map``. When a match is found, the runs in the second cell
    for which ``is_red_font`` is true are replaced by the mapped value and
    recoloured black.

    Args:
        word_path: Path of the .docx file to read.
        label_value_map: Mapping of label -> replacement value.

    Returns:
        Path of the updated document, written as ``updated.docx`` inside a
        newly created temporary directory.
    """
    doc = Document(word_path)
    for table in doc.tables:
        for row in table.rows:
            cells = row.cells
            if len(cells) < 2:
                continue

            label = cells[0].text.strip().replace(":", "").replace("\n", " ")
            if not label:
                # A blank label cell carries nothing to match against.
                continue

            matched_label = find_best_match_label(label, label_value_map)
            if not matched_label:
                continue
            new_value = label_value_map[matched_label]

            # A red placeholder may be split across several runs. Write the
            # value once, into the first red run, and blank the remaining
            # red runs so the value is not repeated once per run.
            value_written = False
            for para in cells[1].paragraphs:
                for run in para.runs:
                    if not is_red_font(run):
                        continue
                    run.text = "" if value_written else new_value
                    run.font.color.rgb = RGBColor(0, 0, 0)  # make black
                    value_written = True

    temp_dir = tempfile.mkdtemp()
    updated_path = f"{temp_dir}/updated.docx"
    doc.save(updated_path)
    return updated_path
def process_files(pdf_file, word_file):
    """Gradio callback: fill the Word document's red text from the PDF.

    Args:
        pdf_file: Uploaded PDF, either as an object exposing the path in
            ``.name`` or as a plain path string.
        word_file: Uploaded Word document, in the same form as ``pdf_file``.

    Returns:
        Path of the updated Word document produced by
        ``replace_red_text_by_label``.

    Raises:
        gr.Error: If either file is missing.
    """
    if pdf_file is None or word_file is None:
        raise gr.Error("Please upload both a PDF file and a Word file.")

    # Accept both upload shapes: objects carrying the path in ``.name`` and
    # bare path strings.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    word_path = getattr(word_file, "name", word_file)

    pdf_data = extract_label_value_pairs(pdf_path)  # {label: value}
    return replace_red_text_by_label(word_path, pdf_data)
# Build and launch the web UI: two file uploads in, one downloadable file out.
# The File inputs rely on the component's default ``type`` rather than
# hard-coding "file", which newer Gradio releases no longer accept.
demo = gr.Interface(
    fn=process_files,
    inputs=[
        gr.File(label="Upload PDF File"),
        gr.File(label="Upload Word File"),
    ],
    outputs=gr.File(label="Download Updated Word File"),
    title="Red Text Replacer (Label-Aware)",
    description="Upload a PDF and Word document. Red-colored text in the Word doc will be replaced by matching values from the PDF using label-based matching.",
)
demo.launch()