Spaces:
Running
Running
File size: 2,593 Bytes
459372e 4b3f51e 31d231c 4b3f51e 31d231c 9e5331a 4b3f51e 1804090 459372e 1804090 4b3f51e 459372e 4b3f51e 459372e 4b3f51e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
import gradio as gr
import tempfile
import shutil
from pdf_extractor import extract_text_pdf_raw
from word_extractor import extract_red_text_with_labels, is_red_font
from docx import Document
from docx.shared import RGBColor
import difflib
def find_best_match(target, candidates):
    """Return the candidate most similar to ``target``, or ``None``.

    Similarity is scored by ``difflib.get_close_matches`` with a cutoff of
    0.5, and only the single best-scoring candidate is requested.

    Args:
        target: The string to look up.
        candidates: The strings to compare ``target`` against.

    Returns:
        The closest candidate, or ``None`` when nothing reaches the cutoff.
    """
    closest = difflib.get_close_matches(target, candidates, n=1, cutoff=0.5)
    if not closest:
        return None
    return closest[0]
def _replace_red_runs(paragraphs, replacements):
    """Overwrite each red run in ``paragraphs`` with its closest replacement.

    A run is touched only when ``is_red_font`` accepts it and
    ``find_best_match`` finds a replacement for its stripped text; the run
    is then recoloured black.
    """
    for para in paragraphs:
        for run in para.runs:
            if not is_red_font(run):
                continue
            new_text = find_best_match(run.text.strip(), replacements)
            if new_text:
                run.text = new_text
                run.font.color.rgb = RGBColor(0, 0, 0)  # Set to black


def replace_red_text_in_doc(doc_path, replacements):
    """Replace red text in a Word document and save an updated copy.

    Both the document's body paragraphs and the paragraphs inside the cells
    of ``doc.tables`` are processed. Tables nested inside a cell are not
    visited, because only ``cell.paragraphs`` is traversed.

    Args:
        doc_path: Path of the ``.docx`` file to read.
        replacements: Candidate replacement strings; each red run is
            fuzzy-matched against this collection.

    Returns:
        Path of the saved copy, ``updated.docx`` inside a freshly created
        temporary directory. The directory is not removed here, so the
        caller owns its cleanup.
    """
    doc = Document(doc_path)

    _replace_red_runs(doc.paragraphs, replacements)
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                _replace_red_runs(cell.paragraphs, replacements)

    temp_dir = tempfile.mkdtemp()
    updated_path = f"{temp_dir}/updated.docx"
    doc.save(updated_path)
    return updated_path
def process_files(pdf_file, word_file):
    """Replace red text in the uploaded Word file with matching PDF content.

    Args:
        pdf_file: Uploaded PDF, either an object exposing the file path as
            ``.name`` or the path string itself.
        word_file: Uploaded Word document, in the same form as ``pdf_file``.

    Returns:
        Path of the updated Word document produced by
        ``replace_red_text_in_doc``.
    """
    # Accept both upload shapes: an object carrying ``.name`` or a bare path.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    word_path = getattr(word_file, "name", word_file)

    pdf_text = extract_text_pdf_raw(pdf_path)
    word_data = extract_red_text_with_labels(word_path)

    # difflib scores the target against each *element* of the candidates.
    # A single string would be iterated character by character, so split it
    # into non-empty lines first.
    # NOTE(review): presumably extract_text_pdf_raw returns one str; any
    # other iterable is passed through unchanged - confirm against the helper.
    if isinstance(pdf_text, str):
        stripped_lines = (line.strip() for line in pdf_text.splitlines())
        pdf_candidates = [line for line in stripped_lines if line]
    else:
        pdf_candidates = pdf_text

    # Flatten red text entries and dedupe, keeping first-seen order so the
    # result does not depend on set iteration order.
    red_values = list(
        dict.fromkeys(
            value for values in word_data.values() for value in values
        )
    )

    # Match red values to PDF
    matches = (find_best_match(val, pdf_candidates) for val in red_values)
    replacements = [match for match in matches if match]

    # Replace in Word
    return replace_red_text_in_doc(word_path, replacements)
# Build the two-upload / one-download UI around process_files and serve it.
# NOTE(review): type="file" looks like the Gradio 3.x spelling; newer
# releases appear to expect "filepath" - confirm against the pinned version.
demo = gr.Interface(
    fn=process_files,
    inputs=[
        gr.File(label="Upload PDF File", type="file"),
        gr.File(label="Upload Word File", type="file"),
    ],
    outputs=gr.File(label="Download Updated Word File"),
    title="Red Text Replacer",
    description="Upload a PDF and Word document. Red-colored text in the Word doc will be replaced by matching content from the PDF.",
)
demo.launch()