Shami96 committed on
Commit
ef4ff89
·
verified ·
1 Parent(s): cf7f555

Update extract_red_text.py

Browse files
Files changed (1) hide show
  1. extract_red_text.py +90 -0
extract_red_text.py CHANGED
@@ -279,6 +279,96 @@ def extract_red_text(input_doc):
279
  out["paragraphs"] = paras
280
  return out
281
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
  def extract_red_text_filelike(input_file, output_file):
283
  """
284
  Accepts:
 
279
  out["paragraphs"] = paras
280
  return out
281
 
282
def handle_management_summary_table(table, flat_json):
    """Fill the DETAILS column of a Management Summary table from JSON data.

    The first three rows are scanned for one of the known management headings
    ("mass management", "maintenance management", "fatigue management")
    together with the word "details" (case-insensitive). If no heading is
    found the table is left untouched and 0 is returned.

    For every non-header row whose first cell starts with ``Std <n>.`` and
    whose second cell passes ``has_red_text``, a value is looked up in this
    order, stopping at the first hit:

    1. ``flat_json[<management type>]``
    2. ``flat_json["<management type> Summary"]``
    3. ``find_matching_json_value(standard_text, flat_json)``

    Args:
        table: Object exposing ``rows``; each row exposes ``cells``.
            Presumably a python-docx table -- confirm against the caller.
        flat_json: Mapping from section names / keys to extracted values.

    Returns:
        The sum of the counts returned by ``replace_red_text_in_cell`` for
        every DETAILS cell that was processed.
    """
    # Build the detection text once; join avoids repeated string reallocation.
    header_text = " ".join(
        get_clean_text(cell).lower()
        for row in table.rows[:3]
        for cell in row.cells
    )

    # Order matters: the first heading found wins, as in an if/elif chain.
    known_types = ("Mass Management", "Maintenance Management", "Fatigue Management")
    management_type = None
    if "details" in header_text:
        management_type = next(
            (name for name in known_types if name.lower() in header_text),
            None,
        )

    if not management_type:
        return 0

    print(f" 📋 Detected {management_type} Summary table with DETAILS column")

    replacements_made = 0
    # Sections searched (in order) before falling back to fuzzy matching.
    section_names = (management_type, f"{management_type} Summary")

    for row_idx, row in enumerate(table.rows):
        # Row 0 is the header; rows without a second cell have no DETAILS column.
        if row_idx == 0:
            continue
        cells = row.cells
        if len(cells) < 2:
            continue

        standard_cell, details_cell = cells[0], cells[1]
        standard_text = get_clean_text(standard_cell).strip()

        # Only rows labelled "Std 1.", "Std 2.", ... are standards.
        if not re.match(r'Std \d+\.', standard_text):
            continue

        print(f" 📌 Processing {standard_text}")

        # Only process if DETAILS cell has red text
        if not has_red_text(details_cell):
            continue

        json_value = None
        for section_name in section_names:
            match = _find_standard_in_section(
                flat_json.get(section_name, {}), standard_text
            )
            if match is not None:
                matched_key, json_value = match
                print(f" ✅ Found match in {section_name}: '{matched_key}'")
                break

        # Last resort: fuzzy matching against every key.
        if json_value is None:
            json_value = find_matching_json_value(standard_text, flat_json)

        if json_value is None:
            print(f" ❌ No data found for {standard_text}")
            continue

        if isinstance(json_value, list):
            # Lists become one line per non-blank item.
            replacement_text = "\n".join(
                text for text in map(str, json_value) if text.strip()
            )
        else:
            replacement_text = get_value_as_string(json_value, standard_text)

        cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
        replacements_made += cell_replacements

        if cell_replacements > 0:
            print(f" ✅ Updated DETAILS for {standard_text}")

    return replacements_made


def _find_standard_in_section(section, standard_text):
    """Return ``(key, value)`` for the first usable entry in *section*, else None.

    An entry is usable when *standard_text* is a substring of its key and its
    value is a non-empty list. A *section* that is not a dict yields None.
    """
    if not isinstance(section, dict):
        return None
    for key, value in section.items():
        if standard_text in key and isinstance(value, list) and value:
            return key, value
    return None
371
+
372
  def extract_red_text_filelike(input_file, output_file):
373
  """
374
  Accepts: