File size: 2,051 Bytes
459372e
4b3f51e
cce0884
 
55e3c9a
4b3f51e
 
55e3c9a
 
 
 
 
cce0884
 
55e3c9a
 
cce0884
55e3c9a
cce0884
55e3c9a
 
 
 
cce0884
55e3c9a
cce0884
55e3c9a
 
 
 
cce0884
55e3c9a
cce0884
55e3c9a
 
 
 
cce0884
55e3c9a
cce0884
55e3c9a
 
 
 
cce0884
55e3c9a
cce0884
4b3f51e
55e3c9a
459372e
 
55e3c9a
 
459372e
4b3f51e
6afedff
 
55e3c9a
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import os
import shutil
import subprocess
import sys
import tempfile
import uuid

import gradio as gr

def _run_step(script, *args):
    """Run one helper script with the given arguments, raising on failure."""
    # sys.executable keeps the helper in the same interpreter/virtualenv as
    # this app; fall back to a PATH lookup only if it is unavailable.
    subprocess.run([sys.executable or "python", script, *args], check=True)


def process_files(pdf_file, word_file):
    """Run the four-step red-text replacement pipeline on one upload pair.

    Args:
        pdf_file: Filesystem path (str) of the uploaded PDF.
        word_file: Filesystem path (str) of the uploaded Word document.

    Returns:
        Path of the updated .docx file, located inside a per-run temp
        directory.

    Raises:
        ValueError: If either upload is missing (None or empty).
        OSError: If an upload cannot be copied into the working directory.
        subprocess.CalledProcessError: If any helper script exits non-zero.
    """
    # Fail fast with a readable message before creating anything on disk.
    if not pdf_file or not word_file:
        raise ValueError("Both a PDF file and a Word file must be uploaded.")

    # Create a unique temp directory for each run (prevents parallel collision)
    temp_dir = tempfile.mkdtemp(prefix="hf_redtext_")
    try:
        # Copy user-uploaded files into temp directory with standard names.
        # Copying (not renaming) works across filesystems and leaves the
        # caller's upload in place in case the same path is submitted again.
        pdf_path = os.path.join(temp_dir, "input.pdf")
        word_path = os.path.join(temp_dir, "input.docx")
        shutil.copy(pdf_file, pdf_path)
        shutil.copy(word_file, word_path)

        # Step 1: Extract PDF data to txt
        pdf_txt_path = os.path.join(temp_dir, "pdf_data.txt")
        _run_step("extract_pdf_data.py", pdf_path, pdf_txt_path)

        # Step 2: Extract red text from Word to JSON
        word_json_path = os.path.join(temp_dir, "word_data.json")
        _run_step("extract_red_text.py", word_path, word_json_path)

        # Step 3: Update docx JSON with PDF txt, output updated JSON
        updated_json_path = os.path.join(temp_dir, "updated_word_data.json")
        _run_step(
            "update_docx_with_pdf.py",
            word_json_path,
            pdf_txt_path,
            updated_json_path,
        )

        # Step 4: Compare word file with updated JSON and update docx
        final_docx_path = os.path.join(temp_dir, "updated.docx")
        _run_step("updated_word.py", word_path, updated_json_path, final_docx_path)
    except BaseException:
        # A failed run produces nothing worth keeping; remove the working
        # directory so errors do not accumulate temp files, then re-raise.
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise

    # Return final updated docx file (temp_dir is kept so it can be served)
    return final_docx_path

# Gradio UI: two file uploads in, one downloadable Word document out.
iface = gr.Interface(
    title="Red Text Replacer",
    description="Upload a PDF and Word document. Red-colored text in the Word doc will be replaced by matching content from the PDF.",
    fn=process_files,
    inputs=[
        # type="filepath" passes each upload to process_files as a path string
        gr.File(type="filepath", label="Upload PDF File"),
        gr.File(type="filepath", label="Upload Word File"),
    ],
    outputs=gr.File(label="Download Updated Word File"),
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()