Spaces:

Shami96
/

PDF-Data_Extractor

Running

App Files Files Community

Shami96 committed on Aug 22

Commit

c60ddb7

verified ·

1 Parent(s): f5393f7

Update updated_word.py

Browse files

Files changed (1) hide show

updated_word.py +136 -32

updated_word.py CHANGED Viewed

@@ -570,13 +570,16 @@ def handle_attendance_list_table_enhanced(table, flat_json):
     return replacements_made
 def fix_management_summary_details_column(table, flat_json):
-    """Preserve behavior but prefer scoped mgmt dicts."""
     replacements_made = 0
     print(f"    🎯 FIX: Management Summary DETAILS column processing")
     table_text = ""
     for row in table.rows[:3]:
         for cell in row.cells:
             table_text += get_clean_text(cell).lower() + " "
     mgmt_types = []
     if "mass management" in table_text or "mass" in table_text:
         mgmt_types.append("Mass Management Summary")
@@ -584,65 +587,125 @@ def fix_management_summary_details_column(table, flat_json):
         mgmt_types.append("Maintenance Management Summary")
     if "fatigue management" in table_text or "fatigue" in table_text:
         mgmt_types.append("Fatigue Management Summary")
     if not mgmt_types:
         if any("std 5" in get_clean_text(c).lower() for r in table.rows for c in r.cells):
             mgmt_types.append("Mass Management Summary")
     if not mgmt_types:
         return 0
     for mgmt_type in mgmt_types:
         print(f"    ✅ Confirmed {mgmt_type} table processing")
-        mgmt_data = flat_json.get(mgmt_type)
-        if not isinstance(mgmt_data, dict):
             for key in flat_json.keys():
-                if mgmt_type.split()[0].lower() in key.lower() and "summary" in key.lower():
-                    mgmt_data = flat_json.get(key)
                     break
-        if not isinstance(mgmt_data, dict):
             print(f"    ⚠️ No JSON management dict found for {mgmt_type}, skipping this type")
             continue
         for row_idx, row in enumerate(table.rows):
             if len(row.cells) >= 2:
                 standard_cell = row.cells[0]
                 details_cell = row.cells[1]
                 standard_text = get_clean_text(standard_cell).strip().lower()
                 if "std 5" in standard_text or "verification" in standard_text:
                     if has_red_text(details_cell):
-                        std_val = None
-                        for candidate in ("Std 5. Verification", "Std 5 Verification", "Std 5", "Verification"):
-                            std_val = mgmt_data.get(candidate)
-                            if std_val is not None:
-                                break
-                        if std_val is None:
-                            for k, v in mgmt_data.items():
-                                if 'std 5' in k.lower() or 'verification' in k.lower():
-                                    std_val = v
-                                    break
-                        if std_val is not None:
                             replacement_text = get_value_as_string(std_val, "Std 5. Verification")
                             cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
                             replacements_made += cell_replacements
                             if cell_replacements:
                                 print(f"      ✅ Replaced Std 5. Verification details for {mgmt_type}")
-                if "std 6" in standard_text or "internal review" in standard_text:
                     if has_red_text(details_cell):
-                        std_val = None
-                        for candidate in ("Std 6. Internal Review", "Std 6 Internal Review", "Std 6", "Internal Review"):
-                            std_val = mgmt_data.get(candidate)
-                            if std_val is not None:
-                                break
-                        if std_val is None:
-                            for k, v in mgmt_data.items():
-                                if 'std 6' in k.lower() or 'internal review' in k.lower():
-                                    std_val = v
-                                    break
-                        if std_val is not None:
                             replacement_text = get_value_as_string(std_val, "Std 6. Internal Review")
                             cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
                             replacements_made += cell_replacements
                             if cell_replacements:
                                 print(f"      ✅ Replaced Std 6. Internal Review details for {mgmt_type}")
     return replacements_made
 # ============================================================================
 # Canonical operator declaration fixer — SAFER
 # ============================================================================
@@ -1282,13 +1345,40 @@ def process_paragraphs(document, flat_json):
     return replacements_made
 def process_headings(document, flat_json):
     replacements_made = 0
     print(f"\n🔍 Processing headings:")
     paragraphs = document.paragraphs
     for para_idx, paragraph in enumerate(paragraphs):
         paragraph_text = paragraph.text.strip()
         if not paragraph_text:
             continue
         matched_heading = None
         for category, patterns in HEADING_PATTERNS.items():
             for pattern in patterns:
@@ -1297,20 +1387,29 @@ def process_headings(document, flat_json):
                     break
             if matched_heading:
                 break
         if matched_heading:
             print(f"  📌 Found heading at paragraph {para_idx + 1}: '{paragraph_text}'")
             if has_red_text_in_paragraph(paragraph):
                 print(f"    🔴 Found red text in heading itself")
-                heading_replacements = process_red_text_in_paragraph(paragraph, paragraph_text, flat_json)
                 replacements_made += heading_replacements
             for next_para_offset in range(1, 6):
                 next_para_idx = para_idx + next_para_offset
                 if next_para_idx >= len(paragraphs):
                     break
                 next_paragraph = paragraphs[next_para_idx]
                 next_text = next_paragraph.text.strip()
                 if not next_text:
                     continue
                 is_another_heading = False
                 for category, patterns in HEADING_PATTERNS.items():
                     for pattern in patterns:
@@ -1319,18 +1418,23 @@ def process_headings(document, flat_json):
                             break
                     if is_another_heading:
                         break
                 if is_another_heading:
                     break
                 if has_red_text_in_paragraph(next_paragraph):
                     print(f"    🔴 Found red text in paragraph {next_para_idx + 1} after heading")
-                    context_replacements = process_red_text_in_paragraph(
                         next_paragraph,
                         paragraph_text,
-                        flat_json
                     )
                     replacements_made += context_replacements
     return replacements_made
 def process_red_text_in_paragraph(paragraph, context_text, flat_json):
     replacements_made = 0
     red_text_segments = []

     return replacements_made
 def fix_management_summary_details_column(table, flat_json):
+    """Enhanced management summary processing with better data matching"""
     replacements_made = 0
     print(f"    🎯 FIX: Management Summary DETAILS column processing")
+    # Determine which type of management summary this is
     table_text = ""
     for row in table.rows[:3]:
         for cell in row.cells:
             table_text += get_clean_text(cell).lower() + " "
     mgmt_types = []
     if "mass management" in table_text or "mass" in table_text:
         mgmt_types.append("Mass Management Summary")
         mgmt_types.append("Maintenance Management Summary")
     if "fatigue management" in table_text or "fatigue" in table_text:
         mgmt_types.append("Fatigue Management Summary")
+    # Fallback detection
     if not mgmt_types:
         if any("std 5" in get_clean_text(c).lower() for r in table.rows for c in r.cells):
             mgmt_types.append("Mass Management Summary")
     if not mgmt_types:
+        print(f"    ⚠️ Could not determine management summary type")
         return 0
     for mgmt_type in mgmt_types:
         print(f"    ✅ Confirmed {mgmt_type} table processing")
+        # Look for management data in the JSON
+        mgmt_data = None
+        # Try direct key match first
+        if mgmt_type in flat_json:
+            mgmt_data = flat_json[mgmt_type]
+        # Try variations of the key
+        if not mgmt_data:
             for key in flat_json.keys():
+                key_lower = key.lower()
+                mgmt_lower = mgmt_type.lower()
+                if mgmt_lower in key_lower or key_lower in mgmt_lower:
+                    mgmt_data = flat_json[key]
+                    print(f"    ✅ Found data using key variation: '{key}'")
                     break
+        # If still no data, look for individual standard data
+        if not mgmt_data:
+            # Collect individual standard entries
+            mgmt_data = {}
+            for key, value in flat_json.items():
+                key_lower = key.lower()
+                # Look for standard entries related to this management type
+                if ("std " in key_lower and
+                    (("mass" in mgmt_type.lower() and any(term in key_lower for term in ["verification", "internal review"])) or
+                     ("maintenance" in mgmt_type.lower() and any(term in key_lower for term in ["daily check", "internal review"])) or
+                     ("fatigue" in mgmt_type.lower() and any(term in key_lower for term in ["internal review"])))):
+                    mgmt_data[key] = value
+            if mgmt_data:
+                print(f"    ✅ Collected individual standard data: {list(mgmt_data.keys())}")
+        if not mgmt_data or not isinstance(mgmt_data, dict):
             print(f"    ⚠️ No JSON management dict found for {mgmt_type}, skipping this type")
             continue
+        # Process the table rows
         for row_idx, row in enumerate(table.rows):
             if len(row.cells) >= 2:
                 standard_cell = row.cells[0]
                 details_cell = row.cells[1]
                 standard_text = get_clean_text(standard_cell).strip().lower()
+                # Skip header rows
+                if "standard" in standard_text or "requirement" in standard_text or "details" in standard_text:
+                    continue
+                # Look for specific standards
                 if "std 5" in standard_text or "verification" in standard_text:
                     if has_red_text(details_cell):
+                        std_val = find_best_standard_value(mgmt_data, ["Std 5. Verification", "Std 5 Verification", "Std 5", "Verification"])
+                        if std_val:
                             replacement_text = get_value_as_string(std_val, "Std 5. Verification")
                             cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
                             replacements_made += cell_replacements
                             if cell_replacements:
                                 print(f"      ✅ Replaced Std 5. Verification details for {mgmt_type}")
+                elif "std 6" in standard_text or "internal review" in standard_text:
                     if has_red_text(details_cell):
+                        std_val = find_best_standard_value(mgmt_data, ["Std 6. Internal Review", "Std 6 Internal Review", "Std 6", "Internal Review"])
+                        if std_val:
                             replacement_text = get_value_as_string(std_val, "Std 6. Internal Review")
                             cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
                             replacements_made += cell_replacements
                             if cell_replacements:
                                 print(f"      ✅ Replaced Std 6. Internal Review details for {mgmt_type}")
+                elif "std 1" in standard_text or "daily check" in standard_text:
+                    if has_red_text(details_cell):
+                        std_val = find_best_standard_value(mgmt_data, ["Std 1. Daily Check", "Std 1 Daily Check", "Std 1", "Daily Check"])
+                        if std_val:
+                            replacement_text = get_value_as_string(std_val, "Std 1. Daily Check")
+                            cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
+                            replacements_made += cell_replacements
+                            if cell_replacements:
+                                print(f"      ✅ Replaced Std 1. Daily Check details for {mgmt_type}")
+                elif "std 7" in standard_text:
+                    if has_red_text(details_cell):
+                        std_val = find_best_standard_value(mgmt_data, ["Std 7. Internal Review", "Std 7 Internal Review", "Std 7"])
+                        if std_val:
+                            replacement_text = get_value_as_string(std_val, "Std 7. Internal Review")
+                            cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
+                            replacements_made += cell_replacements
+                            if cell_replacements:
+                                print(f"      ✅ Replaced Std 7. Internal Review details for {mgmt_type}")
     return replacements_made
+def find_best_standard_value(mgmt_data, candidate_keys):
+    """Return the value in ``mgmt_data`` that best matches ``candidate_keys``.
+
+    ``candidate_keys`` is treated as ordered from most to least preferred,
+    so an earlier candidate always wins over a later one. Lookup passes:
+
+    1. exact key match;
+    2. case-insensitive exact match (surrounding whitespace ignored);
+    3. case-insensitive substring match in either direction.
+
+    Args:
+        mgmt_data: dict mapping standard names to their values.
+        candidate_keys: iterable of key spellings to try, best first.
+
+    Returns:
+        The matched value, or ``None`` when nothing matches or when
+        ``mgmt_data`` is not a dict.
+    """
+    if not isinstance(mgmt_data, dict) or not candidate_keys:
+        return None
+    for candidate in candidate_keys:
+        if candidate in mgmt_data:
+            return mgmt_data[candidate]
+    # Normalise once. Blank keys are dropped because an empty string is a
+    # substring of every candidate and would otherwise match anything;
+    # non-string keys are dropped because they cannot be compared as text.
+    normalised = [
+        (key.strip().lower(), value)
+        for key, value in mgmt_data.items()
+        if isinstance(key, str) and key.strip()
+    ]
+    wanted = [
+        candidate.strip().lower()
+        for candidate in candidate_keys
+        if isinstance(candidate, str) and candidate.strip()
+    ]
+    for candidate in wanted:
+        for key, value in normalised:
+            if key == candidate:
+                return value
+    # Candidates drive the outer loop so the most specific spelling is
+    # honoured before a generic one such as "Verification".
+    for candidate in wanted:
+        for key, value in normalised:
+            if candidate in key or key in candidate:
+                return value
+    return None
 # ============================================================================
 # Canonical operator declaration fixer — SAFER
 # ============================================================================
     return replacements_made
 def process_headings(document, flat_json):
+    """
+    IMPROVED: Better heading processing that avoids mixing company data
+    """
     replacements_made = 0
     print(f"\n🔍 Processing headings:")
     paragraphs = document.paragraphs
+    # Extract the correct operator name from the JSON data
+    operator_name = None
+    for key, value in flat_json.items():
+        if "operator name" in key.lower() and "legal entity" in key.lower():
+            if isinstance(value, list) and value:
+                operator_name = str(value[0]).strip()
+            else:
+                operator_name = str(value).strip()
+            break
+    if not operator_name:
+        # Fallback - try other operator name keys
+        for key, value in flat_json.items():
+            if ("operator" in key.lower() and "name" in key.lower()) or key.lower() == "operator name":
+                if isinstance(value, list) and value:
+                    operator_name = str(value[0]).strip()
+                elif value:
+                    operator_name = str(value).strip()
+                break
+    print(f"    📋 Using operator name: '{operator_name}'")
     for para_idx, paragraph in enumerate(paragraphs):
         paragraph_text = paragraph.text.strip()
         if not paragraph_text:
             continue
         matched_heading = None
         for category, patterns in HEADING_PATTERNS.items():
             for pattern in patterns:
                     break
             if matched_heading:
                 break
         if matched_heading:
             print(f"  📌 Found heading at paragraph {para_idx + 1}: '{paragraph_text}'")
+            # Check if the heading itself has red text
             if has_red_text_in_paragraph(paragraph):
                 print(f"    🔴 Found red text in heading itself")
+                heading_replacements = process_red_text_in_heading_paragraph(paragraph, paragraph_text, flat_json, operator_name)
                 replacements_made += heading_replacements
+            # Look for red text in paragraphs immediately following this heading
             for next_para_offset in range(1, 6):
                 next_para_idx = para_idx + next_para_offset
                 if next_para_idx >= len(paragraphs):
                     break
                 next_paragraph = paragraphs[next_para_idx]
                 next_text = next_paragraph.text.strip()
                 if not next_text:
                     continue
+                # Stop if we hit another heading
                 is_another_heading = False
                 for category, patterns in HEADING_PATTERNS.items():
                     for pattern in patterns:
                             break
                     if is_another_heading:
                         break
                 if is_another_heading:
                     break
                 if has_red_text_in_paragraph(next_paragraph):
                     print(f"    🔴 Found red text in paragraph {next_para_idx + 1} after heading")
+                    context_replacements = process_red_text_in_context_paragraph(
                         next_paragraph,
                         paragraph_text,
+                        flat_json,
+                        operator_name
                     )
                     replacements_made += context_replacements
     return replacements_made
 def process_red_text_in_paragraph(paragraph, context_text, flat_json):
     replacements_made = 0
     red_text_segments = []