Spaces:

Shami96
/

PDF-Data_Extractor

Running

App Files Files Community

Shami96 committed on Aug 20

Commit

6c1e37b

verified ·

1 Parent(s): 8df4ecc

Update updated_word.py

Browse files

Files changed (1) hide show

updated_word.py +155 -2

updated_word.py CHANGED Viewed

@@ -1124,8 +1124,157 @@ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
     return replacements_made
 def process_hf(json_file, docx_file, output_file):
-    """Your original main function (unchanged)"""
     try:
         # Load JSON
         if hasattr(json_file, "read"):
@@ -1154,7 +1303,10 @@ def process_hf(json_file, docx_file, output_file):
         paragraph_replacements = process_paragraphs(doc, flat_json)
         heading_replacements = process_headings(doc, flat_json)
-        total_replacements = table_replacements + paragraph_replacements + heading_replacements
         # Save output
         if hasattr(output_file, "write"):
@@ -1167,6 +1319,7 @@ def process_hf(json_file, docx_file, output_file):
         print(f"   📊 Tables: {table_replacements}")
         print(f"   📝 Paragraphs: {paragraph_replacements}")
         print(f"   📋 Headings: {heading_replacements}")
         print(f"🎉 Processing complete!")
     except FileNotFoundError as e:

     return replacements_made
+def force_red_text_replacement(document, flat_json):
+    """Force replacement of any remaining red text by trying ALL JSON values"""
+    replacements_made = 0
+    print(f"\n🎯 FORCE FIX: Scanning for any remaining red text...")
+    # Collect ALL possible replacement values from JSON
+    all_values = {}
+    for key, value in flat_json.items():
+        if value and str(value).strip():
+            # Store both the key and variations of the value
+            value_str = get_value_as_string(value, key)
+            if value_str and value_str.strip():
+                all_values[key] = value_str
+                # Also store individual words/parts for partial matching
+                if isinstance(value, list):
+                    for item in value:
+                        if str(item).strip():
+                            all_values[f"{key}_item"] = str(item).strip()
+    print(f"    Found {len(all_values)} potential replacement values")
+    # Process all tables
+    for table_idx, table in enumerate(document.tables):
+        for row_idx, row in enumerate(table.rows):
+            for cell_idx, cell in enumerate(row.cells):
+                if has_red_text(cell):
+                    print(f"    🔍 Found red text in Table {table_idx + 1}, Row {row_idx + 1}, Cell {cell_idx + 1}")
+                    # Extract all red text from this cell
+                    red_text_parts = []
+                    for paragraph in cell.paragraphs:
+                        for run in paragraph.runs:
+                            if is_red(run) and run.text.strip():
+                                red_text_parts.append(run.text.strip())
+                    combined_red_text = " ".join(red_text_parts).strip()
+                    print(f"        Red text: '{combined_red_text}'")
+                    # Try to find a match
+                    best_match = None
+                    best_key = None
+                    # First try exact matching
+                    for key, value in all_values.items():
+                        if combined_red_text.lower() == value.lower():
+                            best_match = value
+                            best_key = key
+                            break
+                    # If no exact match, try partial matching
+                    if not best_match:
+                        for key, value in all_values.items():
+                            # Try if red text contains this value or vice versa
+                            if (len(value) > 3 and value.lower() in combined_red_text.lower()) or \
+                               (len(combined_red_text) > 3 and combined_red_text.lower() in value.lower()):
+                                best_match = value
+                                best_key = key
+                                break
+                    # If still no match, try word-by-word matching for names/dates
+                    if not best_match:
+                        red_words = set(word.lower() for word in combined_red_text.split() if len(word) > 2)
+                        best_score = 0
+                        for key, value in all_values.items():
+                            value_words = set(word.lower() for word in str(value).split() if len(word) > 2)
+                            if red_words and value_words:
+                                common_words = red_words.intersection(value_words)
+                                if common_words:
+                                    score = len(common_words) / len(red_words)
+                                    if score > best_score and score >= 0.5:  # At least 50% match
+                                        best_score = score
+                                        best_match = value
+                                        best_key = key
+                    # Replace if we found a match
+                    if best_match:
+                        print(f"        ✅ Replacing with: '{best_match}' (from key: '{best_key}')")
+                        cell_replacements = replace_red_text_in_cell(cell, best_match)
+                        replacements_made += cell_replacements
+                        print(f"        Made {cell_replacements} replacements")
+                    else:
+                        print(f"        ❌ No suitable replacement found")
+    # Process all paragraphs
+    for para_idx, paragraph in enumerate(document.paragraphs):
+        if has_red_text_in_paragraph(paragraph):
+            red_text_parts = []
+            for run in paragraph.runs:
+                if is_red(run) and run.text.strip():
+                    red_text_parts.append(run.text.strip())
+            combined_red_text = " ".join(red_text_parts).strip()
+            if combined_red_text:
+                print(f"    🔍 Found red text in Paragraph {para_idx + 1}: '{combined_red_text}'")
+                # Same matching logic as above
+                best_match = None
+                best_key = None
+                # Exact match
+                for key, value in all_values.items():
+                    if combined_red_text.lower() == value.lower():
+                        best_match = value
+                        best_key = key
+                        break
+                # Partial match
+                if not best_match:
+                    for key, value in all_values.items():
+                        if (len(value) > 3 and value.lower() in combined_red_text.lower()) or \
+                           (len(combined_red_text) > 3 and combined_red_text.lower() in value.lower()):
+                            best_match = value
+                            best_key = key
+                            break
+                # Word match
+                if not best_match:
+                    red_words = set(word.lower() for word in combined_red_text.split() if len(word) > 2)
+                    best_score = 0
+                    for key, value in all_values.items():
+                        value_words = set(word.lower() for word in str(value).split() if len(word) > 2)
+                        if red_words and value_words:
+                            common_words = red_words.intersection(value_words)
+                            if common_words:
+                                score = len(common_words) / len(red_words)
+                                if score > best_score and score >= 0.5:
+                                    best_score = score
+                                    best_match = value
+                                    best_key = key
+                # Replace if found
+                if best_match:
+                    print(f"        ✅ Replacing with: '{best_match}' (from key: '{best_key}')")
+                    red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
+                    if red_runs:
+                        red_runs[0].text = best_match
+                        red_runs[0].font.color.rgb = RGBColor(0, 0, 0)
+                        for run in red_runs[1:]:
+                            run.text = ''
+                        replacements_made += 1
+                        print(f"        Made 1 paragraph replacement")
+                else:
+                    print(f"        ❌ No suitable replacement found")
+    return replacements_made
 def process_hf(json_file, docx_file, output_file):
+    """Your original main function with force fix added at the end"""
     try:
         # Load JSON
         if hasattr(json_file, "read"):
         paragraph_replacements = process_paragraphs(doc, flat_json)
         heading_replacements = process_headings(doc, flat_json)
+        # 🎯 ADD THIS: Force fix for any remaining red text
+        force_replacements = force_red_text_replacement(doc, flat_json)
+        total_replacements = table_replacements + paragraph_replacements + heading_replacements + force_replacements
         # Save output
         if hasattr(output_file, "write"):
         print(f"   📊 Tables: {table_replacements}")
         print(f"   📝 Paragraphs: {paragraph_replacements}")
         print(f"   📋 Headings: {heading_replacements}")
+        print(f"   🎯 Force fixes: {force_replacements}")
         print(f"🎉 Processing complete!")
     except FileNotFoundError as e: