Spaces:

Shami96
/

PDF-Data_Extractor

Running

App Files Community

Shami96 committed on Aug 1

Commit

21ae957

verified ·

1 Parent(s): 688144c

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -32

app.py CHANGED Viewed

@@ -1,57 +1,45 @@
 import gradio as gr
 import tempfile
 import os
-import subprocess
-import uuid
 import shutil
 def process_files(pdf_file, word_file):
-    # Each upload returns a path (str) with type="filepath"
-    # Create a unique temp directory for each run (prevents parallel collision)
     temp_dir = tempfile.mkdtemp(prefix="hf_redtext_")
-    # Copy user-uploaded files into temp directory with standard names
     pdf_path = os.path.join(temp_dir, "input.pdf")
     word_path = os.path.join(temp_dir, "input.docx")
     shutil.copy(pdf_file, pdf_path)
-shutil.copy(word_file, word_path)
-    # Step 1: Extract PDF data to txt
-    pdf_txt_path = os.path.join(temp_dir, "pdf_data.txt")
-    subprocess.run(
-        ["python", "extract_pdf_data.py", pdf_path, pdf_txt_path],
-        check=True
-    )
-    # Step 2: Extract red text from Word to JSON
-    word_json_path = os.path.join(temp_dir, "word_data.json")
-    subprocess.run(
-        ["python", "extract_red_text.py", word_path, word_json_path],
-        check=True
-    )
-    # Step 3: Update docx JSON with PDF txt, output updated JSON
-    updated_json_path = os.path.join(temp_dir, "updated_word_data.json")
-    subprocess.run(
-        ["python", "update_docx_with_pdf.py", word_json_path, pdf_txt_path, updated_json_path],
-        check=True
-    )
-    # Step 4: Compare word file with updated JSON and update docx
-    final_docx_path = os.path.join(temp_dir, "updated.docx")
-    subprocess.run(
-        ["python", "updated_word.py", word_path, updated_json_path, final_docx_path],
-        check=True
-    )
-    # Return final updated docx file
     return final_docx_path
 iface = gr.Interface(
     fn=process_files,
     inputs=[
         gr.File(label="Upload PDF File", type="filepath"),
-        gr.File(label="Upload Word File", type="filepath"),
     ],
     outputs=gr.File(label="Download Updated Word File"),
     title="Red Text Replacer",

 import gradio as gr
 import tempfile
 import os
 import shutil
+import subprocess
 def process_files(pdf_file, word_file):
     """Run the PDF-to-Word update pipeline and return the resulting .docx path.

     Both uploads are copied into a fresh temporary directory under fixed
     names. Four helper scripts are then executed in order as child
     processes, each one reading the files written by the previous step.

     Args:
         pdf_file: Filesystem path of the uploaded PDF.
         word_file: Filesystem path of the uploaded Word document.

     Returns:
         Path of ``updated.docx`` inside this run's temporary directory.

     Raises:
         ValueError: If either upload is missing (``None`` or empty).
         subprocess.CalledProcessError: If a helper script exits non-zero.
     """
     # Imported locally so this function does not depend on a file-level
     # ``import sys`` being present.
     import sys
     # Fail early with a readable message instead of the TypeError that
     # shutil.copy raises when handed None; nothing has been created yet.
     if not pdf_file or not word_file:
         raise ValueError("Both a PDF file and a Word file must be uploaded.")
     # Use the interpreter that is running this app so the helpers see the
     # same installed packages; a bare "python" depends on PATH.
     python = sys.executable or "python"
     # Unique directory per run, so concurrent requests cannot collide.
     temp_dir = tempfile.mkdtemp(prefix="hf_redtext_")
     # Standard filenames shared by every pipeline stage.
     pdf_path = os.path.join(temp_dir, "input.pdf")
     word_path = os.path.join(temp_dir, "input.docx")
     pdf_txt_path = os.path.join(temp_dir, "pdf_data.txt")
     word_json_path = os.path.join(temp_dir, "word_data.json")
     updated_json_path = os.path.join(temp_dir, "updated_word_data.json")
     final_docx_path = os.path.join(temp_dir, "updated.docx")
     # Script names are relative, so they resolve against the current
     # working directory, exactly as before.
     pipeline = (
         # Step 1: extract text from the PDF.
         ["extract_pdf_data.py", pdf_path, pdf_txt_path],
         # Step 2: extract red text from the Word document.
         ["extract_red_text.py", word_path, word_json_path],
         # Step 3: update the Word JSON using the PDF text (noted in the
         # original comments as the step that calls OpenAI).
         ["update_docx_with_pdf.py", word_json_path, pdf_txt_path, updated_json_path],
         # Step 4: apply the updated JSON to the Word doc to build the output.
         ["updated_word.py", word_path, updated_json_path, final_docx_path],
     )
     try:
         shutil.copy(pdf_file, pdf_path)
         shutil.copy(word_file, word_path)
         for step_args in pipeline:
             subprocess.run([python, *step_args], check=True)
     except BaseException:
         # Do not leak the working directory when a copy or a step fails.
         shutil.rmtree(temp_dir, ignore_errors=True)
         raise
     # On success the directory must outlive this call, because the caller
     # still has to read the returned file from it.
     return final_docx_path
 iface = gr.Interface(
     fn=process_files,
     inputs=[
         gr.File(label="Upload PDF File", type="filepath"),
+        gr.File(label="Upload Word File", type="filepath")
     ],
     outputs=gr.File(label="Download Updated Word File"),
     title="Red Text Replacer",