Spaces:

Shami96
/

PDF-Data_Extractor

Running

File size: 2,593 Bytes

import gradio as gr
import tempfile
import shutil
from pdf_extractor import extract_text_pdf_raw
from word_extractor import extract_red_text_with_labels, is_red_font
from docx import Document
from docx.shared import RGBColor
import difflib


def find_best_match(target, candidates):
    """Return the candidate most similar to *target*, or ``None``.

    Similarity is judged by ``difflib.get_close_matches`` with a cutoff of
    0.5; only the single best-scoring candidate is considered.

    Args:
        target: The string to look up.
        candidates: Iterable of strings to compare against.

    Returns:
        The closest candidate, or ``None`` when nothing reaches the cutoff.
    """
    closest = difflib.get_close_matches(target, candidates, n=1, cutoff=0.5)
    if not closest:
        return None
    return closest[0]


def _replace_red_runs(paragraphs, replacements):
    """Replace the text of each red run in *paragraphs* with its closest match.

    Every run is matched on its own against *replacements*; a run that finds a
    match gets the matched text and is recoloured black.
    """
    for para in paragraphs:
        for run in para.runs:
            if not is_red_font(run):
                continue
            old_text = run.text.strip()
            if not old_text:
                # A whitespace-only run has nothing to match against.
                continue
            new_text = find_best_match(old_text, replacements)
            if new_text:
                run.text = new_text
                run.font.color.rgb = RGBColor(0, 0, 0)  # Set to black


def replace_red_text_in_doc(doc_path, replacements):
    """Write a copy of a Word document with its red text replaced.

    Red runs in the document's body paragraphs and in the cells of its
    tables are swapped for their closest entry in *replacements* and turned
    black. Runs without a close enough match are left untouched.

    Args:
        doc_path: Path of the ``.docx`` file to read.
        replacements: Candidate replacement strings.

    Returns:
        Path of the updated document, saved as ``updated.docx`` inside a
        freshly created temporary directory. The directory is not removed
        here; the returned file must outlive this call.
    """
    import os

    doc = Document(doc_path)

    _replace_red_runs(doc.paragraphs, replacements)

    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                _replace_red_runs(cell.paragraphs, replacements)

    temp_dir = tempfile.mkdtemp()
    updated_path = os.path.join(temp_dir, "updated.docx")
    doc.save(updated_path)
    return updated_path


def _uploaded_path(upload, label):
    """Return the filesystem path for an uploaded file value.

    Accepts either a plain path string or an object exposing the path as
    ``.name`` (the form this app's ``type="file"`` inputs are read with).

    Raises:
        ValueError: If *upload* is ``None``, i.e. no file was provided.
    """
    if upload is None:
        raise ValueError(f"No {label} file was uploaded.")
    if isinstance(upload, str):
        return upload
    return upload.name


def process_files(pdf_file, word_file):
    """Replace red text in a Word document with matching text from a PDF.

    Args:
        pdf_file: Uploaded PDF (path string or object with ``.name``).
        word_file: Uploaded Word document (path string or object with
            ``.name``).

    Returns:
        Path of the updated Word document.

    Raises:
        ValueError: If either upload is missing.
    """
    pdf_path = _uploaded_path(pdf_file, "PDF")
    word_path = _uploaded_path(word_file, "Word")

    pdf_text = extract_text_pdf_raw(pdf_path)
    if isinstance(pdf_text, str):
        # difflib iterates its candidates; a bare string would be compared
        # one character at a time, so match against its non-empty lines.
        # NOTE(review): confirm what extract_text_pdf_raw actually returns.
        pdf_text = [line.strip() for line in pdf_text.splitlines() if line.strip()]
    word_data = extract_red_text_with_labels(word_path)

    # Flatten red text entries, dropping duplicates but keeping first-seen
    # order so runs are reproducible.
    red_values = list(
        dict.fromkeys(value for values in word_data.values() for value in values)
    )

    # Match red values to PDF
    replacements = []
    for val in red_values:
        match = find_best_match(val, pdf_text)
        if match:
            replacements.append(match)

    # Replace in Word
    return replace_red_text_in_doc(word_path, replacements)


# Build the two-upload web UI around process_files and start serving it.
gr.Interface(
    title="Red Text Replacer",
    description="Upload a PDF and Word document. Red-colored text in the Word doc will be replaced by matching content from the PDF.",
    fn=process_files,
    # Listed in the same order as process_files(pdf_file, word_file).
    inputs=[
        gr.File(type="file", label="Upload PDF File"),
        gr.File(type="file", label="Upload Word File"),
    ],
    outputs=gr.File(label="Download Updated Word File"),
).launch()