Spaces:

Shami96
/

PDF-Data_Extractor

Running

App Files Community

Shami96 committed on Jul 29

Commit

5fb575a

verified ·

1 Parent(s): 88a026a

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -42

app.py CHANGED Viewed

@@ -1,41 +1,38 @@
 import gradio as gr
 import tempfile
-import shutil
-from pdf_extractor import extract_text_pdf_raw
 from word_extractor import extract_red_text_with_labels, is_red_font
 from docx import Document
 from docx.shared import RGBColor
 import difflib
-def find_best_match(target, candidates):
-    match = difflib.get_close_matches(target, candidates, n=1, cutoff=0.5)
     return match[0] if match else None
-def replace_red_text_in_doc(doc_path, replacements):
-    doc = Document(doc_path)
-    for para in doc.paragraphs:
-        for run in para.runs:
-            if is_red_font(run):
-                old_text = run.text.strip()
-                new_text = find_best_match(old_text, replacements)
-                if new_text:
-                    run.text = new_text
-                    run.font.color.rgb = RGBColor(0, 0, 0)  # Set to black
     for table in doc.tables:
         for row in table.rows:
-            for cell in row.cells:
-                for para in cell.paragraphs:
                     for run in para.runs:
                         if is_red_font(run):
-                            old_text = run.text.strip()
-                            new_text = find_best_match(old_text, replacements)
-                            if new_text:
-                                run.text = new_text
-                                run.font.color.rgb = RGBColor(0, 0, 0)  # Set to black
     temp_dir = tempfile.mkdtemp()
     updated_path = f"{temp_dir}/updated.docx"
@@ -47,25 +44,10 @@ def process_files(pdf_file, word_file):
     pdf_path = pdf_file.name
     word_path = word_file.name
-    pdf_text = extract_text_pdf_raw(pdf_path)
-    word_data = extract_red_text_with_labels(word_path)
-    # Flatten red text entries
-    red_values = []
-    for values in word_data.values():
-        red_values.extend(values)
-    red_values = list(set(red_values))  # dedupe
-    # Match red values to PDF
-    replacements = []
-    for val in red_values:
-        match = find_best_match(val, pdf_text)
-        if match:
-            replacements.append(match)
-    # Replace in Word
-    updated_doc_path = replace_red_text_in_doc(word_path, replacements)
     return updated_doc_path
@@ -76,6 +58,6 @@ gr.Interface(
         gr.File(label="Upload Word File", type="file")
     ],
     outputs=gr.File(label="Download Updated Word File"),
-    title="Red Text Replacer",
-    description="Upload a PDF and Word document. Red-colored text in the Word doc will be replaced by matching content from the PDF."
 ).launch()

+# app.py
 import gradio as gr
 import tempfile
+from pdf_extractor import extract_label_value_pairs
 from word_extractor import extract_red_text_with_labels, is_red_font
 from docx import Document
 from docx.shared import RGBColor
 import difflib
+def find_best_match_label(target_label, pdf_data):
+    keys = list(pdf_data.keys())
+    match = difflib.get_close_matches(target_label.lower(), keys, n=1, cutoff=0.4)
     return match[0] if match else None
+def replace_red_text_by_label(word_path, label_value_map):
+    doc = Document(word_path)
     for table in doc.tables:
         for row in table.rows:
+            cells = row.cells
+            if len(cells) >= 2:
+                label = cells[0].text.strip().replace(":", "").replace("\n", " ")
+                matched_label = find_best_match_label(label, label_value_map)
+                if not matched_label:
+                    continue
+                new_value = label_value_map[matched_label]
+                for para in cells[1].paragraphs:
                     for run in para.runs:
                         if is_red_font(run):
+                            run.text = new_value
+                            run.font.color.rgb = RGBColor(0, 0, 0)  # make black
     temp_dir = tempfile.mkdtemp()
     updated_path = f"{temp_dir}/updated.docx"
     pdf_path = pdf_file.name
     word_path = word_file.name
+    pdf_data = extract_label_value_pairs(pdf_path)           # {label: value}
+    word_data = extract_red_text_with_labels(word_path)      # {label: [red_texts]}
+    updated_doc_path = replace_red_text_by_label(word_path, pdf_data)
     return updated_doc_path
         gr.File(label="Upload Word File", type="file")
     ],
     outputs=gr.File(label="Download Updated Word File"),
+    title="Red Text Replacer (Label-Aware)",
+    description="Upload a PDF and Word document. Red-colored text in the Word doc will be replaced by matching values from the PDF using label-based matching."
 ).launch()