Spaces:

Shami96
/

PDF-Data_Extractor

Running

App Files Files Community

Shami96 committed on Aug 21

Commit

ef4ff89

verified ·

1 Parent(s): cf7f555

Update extract_red_text.py

Browse files

Files changed (1) hide show

extract_red_text.py +90 -0

extract_red_text.py CHANGED Viewed

@@ -279,6 +279,96 @@ def extract_red_text(input_doc):
         out["paragraphs"] = paras
     return out
 def extract_red_text_filelike(input_file, output_file):
     """
     Accepts:

         out["paragraphs"] = paras
     return out
+def handle_management_summary_table(table, flat_json):
+    """Enhanced function to handle Management Summary tables specifically"""
+    replacements_made = 0
+    # Check if this is a Management Summary table
+    table_text = ""
+    for row in table.rows[:3]:
+        for cell in row.cells:
+            table_text += get_clean_text(cell).lower() + " "
+    # Detect which type of management summary
+    management_type = None
+    if "mass management" in table_text and "details" in table_text:
+        management_type = "Mass Management"
+    elif "maintenance management" in table_text and "details" in table_text:
+        management_type = "Maintenance Management"
+    elif "fatigue management" in table_text and "details" in table_text:
+        management_type = "Fatigue Management"
+    if not management_type:
+        return 0
+    print(f"    📋 Detected {management_type} Summary table with DETAILS column")
+    # Process each row to find standards and update DETAILS column
+    for row_idx, row in enumerate(table.rows):
+        if len(row.cells) < 2:
+            continue
+        # Skip header row
+        if row_idx == 0:
+            continue
+        standard_cell = row.cells[0]
+        details_cell = row.cells[1]
+        standard_text = get_clean_text(standard_cell).strip()
+        # Check if this row contains a standard (Std 1., Std 2., etc.)
+        if not re.match(r'Std \d+\.', standard_text):
+            continue
+        print(f"    📌 Processing {standard_text}")
+        # Only process if DETAILS cell has red text
+        if not has_red_text(details_cell):
+            continue
+        # Try multiple approaches to find matching data
+        json_value = None
+        # Approach 1: Try direct standard match in the base management section
+        base_management_data = flat_json.get(management_type, {})
+        if isinstance(base_management_data, dict):
+            for key, value in base_management_data.items():
+                if standard_text in key and isinstance(value, list) and len(value) > 0:
+                    json_value = value
+                    print(f"        ✅ Found match in {management_type}: '{key}'")
+                    break
+        # Approach 2: Try the summary section
+        if json_value is None:
+            summary_section = flat_json.get(f"{management_type} Summary", {})
+            if isinstance(summary_section, dict):
+                for key, value in summary_section.items():
+                    if standard_text in key and isinstance(value, list) and len(value) > 0:
+                        json_value = value
+                        print(f"        ✅ Found match in {management_type} Summary: '{key}'")
+                        break
+        # Approach 3: Try fuzzy matching with all keys
+        if json_value is None:
+            json_value = find_matching_json_value(standard_text, flat_json)
+        # Replace red text if we found data
+        if json_value is not None:
+            replacement_text = get_value_as_string(json_value, standard_text)
+            if isinstance(json_value, list):
+                replacement_text = "\n".join(str(item) for item in json_value if str(item).strip())
+            cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
+            replacements_made += cell_replacements
+            if cell_replacements > 0:
+                print(f"        ✅ Updated DETAILS for {standard_text}")
+        else:
+            print(f"        ❌ No data found for {standard_text}")
+    return replacements_made
 def extract_red_text_filelike(input_file, output_file):
     """
     Accepts: