Shami96 committed on
Commit 47f7e99 · verified · 1 Parent(s): ded60cc

Update extract_red_text.py

Files changed (1)
  1. extract_red_text.py +264 -156
extract_red_text.py CHANGED
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 """
-extract_red_text.py
 """

 from __future__ import annotations
@@ -9,7 +9,7 @@ import re
 import sys
 import logging
 from collections import defaultdict
-from typing import List, Dict, Optional, Any

 # attempt to import python-docx (document processing)
 try:
@@ -20,62 +20,19 @@ except Exception as e:
     raise RuntimeError("python-docx is required. Install with: pip install python-docx") from e

 # ------------------------------
-# Import master_key GLOBAL_SETTINGS and optional EXTRA_HEADER_SYNONYMS
 # ------------------------------
 try:
     import master_key as mk
-    GLOBAL_SETTINGS = getattr(mk, "GLOBAL_SETTINGS", {})
-    EXTRA_HEADER_SYNONYMS = getattr(mk, "EXTRA_HEADER_SYNONYMS", None)
-except Exception:
-    GLOBAL_SETTINGS = {
-        "normalize": {
-            "lower": True,
-            "strip_punctuation": True,
-            "collapse_whitespace": True,
-            "replace_smart_dashes": True
-        },
-        "ocr_repair_rules": [
-            (r"\s*\(\s*Yes\s*/\s*No\s*\)", " (Yes/No)"),
-            (r"R[e3]gistrat[i1]on", "Registration"),
-            (r"Prin?t", "Print"),
-            (r"Accredi[ta]tion", "Accreditation"),
-            (r"[^\w\s\-\&\(\)\/:]", " "),
-        ],
-        "split_on": [" – ", " - ", ";", "\n", " / "],
-        "date_like_pattern": r"^\s*(\d{1,2}(st|nd|rd|th)?\s+[A-Za-z]+|\d{1,2}\/\d{1,2}\/\d{2,4}|\d{1,2}\.\d{1,2}\.\d{2,4}|\d{1,2}\s+[A-Za-z]{3,})",
-        "fuzzy_thresholds": {"high_priority": 70, "medium_priority": 60, "low_priority": 45},
-        "fuzzy_algorithm": "token_set_ratio",
-    }
-    EXTRA_HEADER_SYNONYMS = None
-
-# Provide an internal default synonyms map (compact keys -> canonical label)
-# This is used only if master_key.EXTRA_HEADER_SYNONYMS is not defined.
-_DEFAULT_EXTRA_HEADER_SYNONYMS = {
-    # Compact key: canonical label
-    # Examples from your logs (long/noisy headers)
-    "roadworthinesscertificatesapplicableforentryaudit": "Roadworthiness Certificates",
-    "roadworthinesscertificates": "Roadworthiness Certificates",
-    "rfsuspensioncertificationn/aifnotapplicable": "RFS Suspension Certification #",
-    "rfsuspensioncertification": "RFS Suspension Certification #",
-    "maintenanceRecordsrecorddaterangeofrecordsreviewed".lower(): "Maintenance Records",
-    "maintenancerecords": "Maintenance Records",
-    "faultrecordingreportingonsuspensionsystemdaterange".lower(): "Fault Recording/ Reporting",
-    "faultrecordingreporting": "Fault Recording/ Reporting",
-    "faultrepairdaterange": "Fault Repair",
-    "triprecordsdaterange": "Trip Records",
-    # Add common variations
-    "registrationnumber": "Registration Number",
-    "registrationnumbernumber": "Registration Number",
-    "subcontractor(yesno)": "Sub-contractor (Yes/No)",
-    "sub-contractor(yes/no)": "Sub-contractor (Yes/No)",
-    "nhvrorexemplarglobalauditorregistrationnumber": "NHVR or Exemplar Global Auditor Registration Number",
-    "printname": "Print Name",
-    "print": "Print Name",
-}
-
-# If mk provided EXTRA_HEADER_SYNONYMS, use it (but ensure keys are compacted similarly)
-if EXTRA_HEADER_SYNONYMS is None:
-    EXTRA_HEADER_SYNONYMS = _DEFAULT_EXTRA_HEADER_SYNONYMS

 # ------------------------------
 # Logging
@@ -87,6 +44,7 @@ log = logging.getLogger("extract_red_text")
 # Normalization & OCR-repair utilities (aligned to GLOBAL_SETTINGS)
 # ------------------------------
 def _apply_ocr_repair_rules(text: str) -> str:
     s = text or ""
     for pat, repl in GLOBAL_SETTINGS.get("ocr_repair_rules", []):
         try:
@@ -100,6 +58,7 @@ def _normalize_text(text: str) -> str:
     """Normalize text according to GLOBAL_SETTINGS (readable normalized form)."""
     s = _apply_ocr_repair_rules(text or "")
     norm_cfg = GLOBAL_SETTINGS.get("normalize", {})
     if norm_cfg.get("replace_smart_dashes", False):
         s = s.replace("–", "-").replace("—", "-")
     if norm_cfg.get("lower", False):
@@ -109,6 +68,7 @@ def _normalize_text(text: str) -> str:
         s = re.sub(r"[^\w\s\-\&\(\)\/:]", " ", s)
     if norm_cfg.get("collapse_whitespace", False):
         s = re.sub(r"\s+", " ", s)
     return s.strip()

 def _compact_key(text: str) -> str:
@@ -125,182 +85,320 @@ def map_header_using_extra_synonyms(header_text: str) -> Optional[str]:
     """
     if not header_text:
         return None
     normalized = _normalize_text(header_text)
     compact = _compact_key(header_text)
     # try compact key
     if compact in EXTRA_HEADER_SYNONYMS:
         return EXTRA_HEADER_SYNONYMS[compact]
     # try normalized key directly
     if normalized in EXTRA_HEADER_SYNONYMS:
         return EXTRA_HEADER_SYNONYMS[normalized]
     # also try case-insensitive match on keys
     for k, v in EXTRA_HEADER_SYNONYMS.items():
         if k.lower() == normalized.lower() or k.lower() == compact.lower():
             return v
     return None

 # ------------------------------
-# Helpers to detect red font runs robustly
 # ------------------------------
 def _run_is_red(run) -> bool:
     """
-    Detect if a run is red. python-docx represents color by run.font.color.
-    We check RGB if available, or theme color 'red' as fallback.
     """
     try:
-        color = run.font.color
-        if color is None:
-            return False
-        # If RGB is specified
-        rgb = getattr(color, "rgb", None)
-        if rgb is not None:
-            # rgb is a docx.shared.RGBColor or similar. Representable as 'FF0000' or integer tuple
-            hexval = ''.join("{:02X}".format(c) for c in rgb) if isinstance(rgb, (tuple, list)) else str(rgb)
-            # accept strings containing 'FF0000' or '0000FF'? (we want red)
-            # Accept any color where red component is high and others low-ish
             try:
-                # If hex-like 'FF0000' -> interpret
-                hex_clean = re.sub(r"[^0-9A-Fa-f]", "", hexval)
-                if len(hex_clean) >= 6:
-                    r = int(hex_clean[-6:-4], 16)
-                    g = int(hex_clean[-4:-2], 16)
-                    b = int(hex_clean[-2:], 16)
-                    if r >= 150 and g < 120 and b < 120:
                         return True
             except Exception:
                 pass
-        # fallback: theme color or color.theme_color value
-        theme_color = getattr(color, "theme_color", None)
-        if theme_color:
-            try:
-                if str(theme_color).lower().find("red") != -1:
                     return True
-            except Exception:
-                pass
     except Exception:
         pass
-    # final heuristic: if run.font.color.rgb as string contains 'FF' prefix and '00' for others
     try:
         if hasattr(run.font.color, "rgb") and run.font.color.rgb is not None:
             s = str(run.font.color.rgb)
-            if "FF" in s and "0000" in s:
                 return True
     except Exception:
         pass
     return False

 # ------------------------------
 # Extraction: paragraphs, headings, tables
 # ------------------------------
 def extract_from_docx(path: str) -> Dict[str, Any]:
     doc = Document(path)
     headings: List[str] = []
     paragraphs_red: List[Dict[str, Any]] = []
     red_runs: List[Dict[str, Any]] = []
     tables_out: List[Dict[str, Any]] = []

-    # extract headings and paragraphs with red runs
     for p_index, para in enumerate(doc.paragraphs):
         text = para.text or ""
-        # identify heading level from style name if available
         style_name = getattr(para.style, "name", "") if para.style is not None else ""
-        is_heading = bool(re.search(r"Heading\s*\d+|HEADING|TITLE|SUBTITLE", style_name, flags=re.I)) or bool(re.search(r"^(MAINTENANCE|MASS|FATIGUE|NHVAS|Vehicle Registration|CORRECTIVE)", text, flags=re.I))
         if is_heading:
             headings.append(text.strip())

-        # gather red runs in this paragraph
         paragraph_red_texts = []
         char_cursor = 0
         for run in para.runs:
             run_text = run.text or ""
             run_len = len(run_text)
             if _run_is_red(run) and run_text.strip():
-                # store a red run entry
                 rr = {
                     "text": run_text,
                     "paragraph_index": p_index,
                     "char_index": char_cursor,
-                    "style_name": style_name
                 }
                 red_runs.append(rr)
                 paragraph_red_texts.append(run_text)
             char_cursor += run_len
         if paragraph_red_texts:
             paragraphs_red.append({
                 "paragraph_index": p_index,
                 "text": text,
                 "red_texts": paragraph_red_texts,
-                "style_name": style_name
             })

-    # extract tables
     for t_index, table in enumerate(doc.tables):
-        # convert table to simple cell-text matrix
-        nrows = len(table.rows)
-        ncols = max(len(row.cells) for row in table.rows) if nrows > 0 else 0
-        headers = []
-        rows_text = []
-        rows_red_cells = []
-
-        # Attempt to treat first row as header if cells look like headers (bold or all-caps)
-        header_row = table.rows[0] if nrows > 0 else None
-
-        # build header texts & apply header mapping
-        if header_row:
-            for c_idx, cell in enumerate(header_row.cells):
-                cell_text = cell.text.strip()
-                # normalize & map using EXTRA_HEADER_SYNONYMS
-                mapped = map_header_using_extra_synonyms(cell_text)
-                if mapped:
-                    header_label = mapped
-                else:
-                    header_label = cell_text
-                headers.append(header_label)

-        # process all rows -> list of lists
-        for r_i, row in enumerate(table.rows):
-            row_texts = []
-            row_reds = []
-            for c_i, cell in enumerate(row.cells):
-                ct = cell.text.strip()
-                # gather red text from runs in this cell
-                red_in_cell = []
-                # docx cell may have paragraphs
-                for cpara in cell.paragraphs:
-                    for run in cpara.runs:
-                        if _run_is_red(run) and (run.text or "").strip():
-                            red_in_cell.append((run.text or "").strip())
-                # compact red text into a single string if multiple runs present
-                red_text_joined = " ".join(red_in_cell) if red_in_cell else None
-                row_texts.append(ct)
-                row_reds.append(red_text_joined)
-            rows_text.append(row_texts)
-            rows_red_cells.append(row_reds)
-
-        tables_out.append({
-            "table_index": t_index,
-            "nrows": nrows,
-            "ncols": ncols,
-            "headers": headers,
-            "rows": rows_text,
-            "red_cells": rows_red_cells
-        })
-
-    # assemble output structure
     out = {
         "headings": headings,
         "paragraphs": paragraphs_red,
         "tables": tables_out,
         "red_runs": red_runs,
-        # helpful metadata for downstream processing
         "meta": {
             "source_file": path,
             "total_headings": len(headings),
             "total_red_paragraphs": len(paragraphs_red),
             "total_tables": len(tables_out),
-            "total_red_runs": len(red_runs)
         }
     }
     return out

 # ------------------------------
@@ -310,10 +408,14 @@ def main(argv):
     if len(argv) < 3:
         print("Usage: python extract_red_text.py input.docx output.json")
         sys.exit(2)
     input_docx = argv[1]
     output_json = argv[2]

-    log.info("Extracting red text from: %s", input_docx)
     try:
         result = extract_from_docx(input_docx)
     except Exception as exc:
@@ -324,21 +426,27 @@ def main(argv):
     try:
         with open(output_json, "w", encoding="utf-8") as fh:
             json.dump(result, fh, ensure_ascii=False, indent=2)
-        log.info("Saved extracted word JSON to: %s", output_json)
     except Exception:
         log.exception("Failed to write output JSON to %s", output_json)
         raise

-    # Print a short summary for logs / quick verification
-    log.info("Headings found: %d, Red paragraphs: %d, Tables: %d, Red runs: %d",
-             len(result.get("headings", [])),
-             len(result.get("paragraphs", [])),
-             len(result.get("tables", [])),
-             len(result.get("red_runs", []))
-             )

 if __name__ == "__main__":
     main(sys.argv)
-    # ADD THIS LINE:
     if len(sys.argv) >= 3:
-        with open(sys.argv[2], 'r') as f: print(f"\n📄 EXTRACT_RED_TEXT OUTPUT:\n{f.read()}")
 #!/usr/bin/env python3
 """
+extract_red_text.py - Enhanced version with improved red text detection and master key alignment
 """

 from __future__ import annotations
 
 import sys
 import logging
 from collections import defaultdict
+from typing import List, Dict, Optional, Any, Tuple

 # attempt to import python-docx (document processing)
 try:
 
     raise RuntimeError("python-docx is required. Install with: pip install python-docx") from e

 # ------------------------------
+# Import master_key configurations
 # ------------------------------
 try:
     import master_key as mk
+    GLOBAL_SETTINGS = mk.GLOBAL_SETTINGS
+    EXTRA_HEADER_SYNONYMS = mk.EXTRA_HEADER_SYNONYMS
+    TABLE_SCHEMAS = getattr(mk, "TABLE_SCHEMAS", {})
+except ImportError as e:
+    logging.error("Failed to import master_key.py: %s", e)
+    raise RuntimeError("master_key.py is required for configuration") from e
+except AttributeError as e:
+    logging.error("Missing required configuration in master_key.py: %s", e)
+    raise RuntimeError("master_key.py missing required GLOBAL_SETTINGS or EXTRA_HEADER_SYNONYMS") from e

 # ------------------------------
 # Logging
 
 # Normalization & OCR-repair utilities (aligned to GLOBAL_SETTINGS)
 # ------------------------------
 def _apply_ocr_repair_rules(text: str) -> str:
+    """Apply OCR repair rules from GLOBAL_SETTINGS."""
     s = text or ""
     for pat, repl in GLOBAL_SETTINGS.get("ocr_repair_rules", []):
         try:
 
     """Normalize text according to GLOBAL_SETTINGS (readable normalized form)."""
     s = _apply_ocr_repair_rules(text or "")
     norm_cfg = GLOBAL_SETTINGS.get("normalize", {})
+
     if norm_cfg.get("replace_smart_dashes", False):
         s = s.replace("–", "-").replace("—", "-")
     if norm_cfg.get("lower", False):
 
         s = re.sub(r"[^\w\s\-\&\(\)\/:]", " ", s)
     if norm_cfg.get("collapse_whitespace", False):
         s = re.sub(r"\s+", " ", s)
+
     return s.strip()

 def _compact_key(text: str) -> str:
 
     """
     if not header_text:
         return None
+
     normalized = _normalize_text(header_text)
     compact = _compact_key(header_text)
+
     # try compact key
     if compact in EXTRA_HEADER_SYNONYMS:
         return EXTRA_HEADER_SYNONYMS[compact]
+
     # try normalized key directly
     if normalized in EXTRA_HEADER_SYNONYMS:
         return EXTRA_HEADER_SYNONYMS[normalized]
+
     # also try case-insensitive match on keys
     for k, v in EXTRA_HEADER_SYNONYMS.items():
         if k.lower() == normalized.lower() or k.lower() == compact.lower():
             return v
+
     return None

 # ------------------------------
+# Enhanced red font detection using hf_utils pattern
 # ------------------------------
 def _run_is_red(run) -> bool:
     """
+    Enhanced red color detection for docx.run objects.
+    Uses multiple methods to detect red text robustly.
     """
     try:
+        # Method 1: Check run.font.color.rgb
+        col = getattr(run.font, "color", None)
+        if col is not None and getattr(col, "rgb", None):
+            rgb = col.rgb
             try:
+                # rgb may be sequence-like or have attributes
+                if hasattr(rgb, '__getitem__'):  # sequence-like
+                    r, g, b = rgb[0], rgb[1], rgb[2]
+                else:  # attribute access
+                    r = getattr(rgb, "r", None) or getattr(rgb, "red", None)
+                    g = getattr(rgb, "g", None) or getattr(rgb, "green", None)
+                    b = getattr(rgb, "b", None) or getattr(rgb, "blue", None)
+
+                if r is not None and g is not None and b is not None:
+                    # Tolerant heuristic: red must be noticeably higher than green/blue
+                    if r >= 160 and g <= 120 and b <= 120 and (r - g) >= 30 and (r - b) >= 30:
                         return True
             except Exception:
                 pass
+    except Exception:
+        pass
+
+    # Method 2: Check raw XML color code
+    try:
+        rPr = run._element.rPr
+        if rPr is not None:
+            clr = rPr.find('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}color')
+            if clr is not None:
+                val = clr.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val')
+                if val and re.fullmatch(r"[0-9A-Fa-f]{6}", val):
+                    rr = int(val[:2], 16)
+                    gg = int(val[2:4], 16)
+                    bb = int(val[4:], 16)
+                    if rr >= 160 and gg <= 120 and bb <= 120 and (rr - gg) >= 30 and (rr - bb) >= 30:
+                        return True
+    except Exception:
+        pass
+
+    # Method 3: Check theme color
+    try:
+        color = run.font.color
+        if color is not None:
+            theme_color = getattr(color, "theme_color", None)
+            if theme_color:
+                theme_str = str(theme_color).lower()
+                if "red" in theme_str or "accent_2" in theme_str:  # Common red theme
                     return True
     except Exception:
         pass
+
+    # Method 4: String representation fallback
     try:
         if hasattr(run.font.color, "rgb") and run.font.color.rgb is not None:
             s = str(run.font.color.rgb)
+            # Look for patterns like "FF0000" or similar high-red values
+            if re.search(r"[Ff]{2}0{4}|[Ee]{2}0{4}|[Dd]{2}0{4}", s):
                 return True
     except Exception:
         pass
+
     return False

+def _extract_red_text_segments(cell):
+    """Extract red text segments from a table cell."""
+    segments = []
+    for p_idx, paragraph in enumerate(cell.paragraphs):
+        current_text = ""
+        current_runs = []
+
+        for r_idx, run in enumerate(paragraph.runs):
+            if _run_is_red(run) and run.text.strip():
+                current_text += run.text
+                current_runs.append((p_idx, r_idx, run))
+            else:
+                # End of red segment
+                if current_runs:
+                    segments.append({
+                        'text': current_text.strip(),
+                        'runs': current_runs.copy(),
+                        'paragraph_idx': p_idx
+                    })
+                current_text = ""
+                current_runs = []
+
+        # Handle segment at end of paragraph
+        if current_runs:
+            segments.append({
+                'text': current_text.strip(),
+                'runs': current_runs.copy(),
+                'paragraph_idx': p_idx
+            })
+
+    return segments
+
+def _has_red_text(cell) -> bool:
+    """Check if a cell contains any red text."""
+    for paragraph in cell.paragraphs:
+        for run in paragraph.runs:
+            if _run_is_red(run) and run.text.strip():
+                return True
+    return False
+
+# ------------------------------
+# Enhanced table processing with schema-aware header mapping
+# ------------------------------
+def _process_table_with_schema_mapping(table, t_index: int) -> Dict[str, Any]:
+    """Process table with enhanced header mapping using master key schemas."""
+    nrows = len(table.rows)
+    ncols = max(len(row.cells) for row in table.rows) if nrows > 0 else 0
+
+    if nrows == 0:
+        return {
+            "table_index": t_index,
+            "nrows": 0,
+            "ncols": 0,
+            "headers": [],
+            "rows": [],
+            "red_cells": [],
+            "mapped_headers": []
+        }
+
+    # Process headers from first row
+    header_row = table.rows[0]
+    headers = []
+    mapped_headers = []
+
+    for c_idx, cell in enumerate(header_row.cells[:ncols]):
+        cell_text = cell.text.strip()
+
+        # Try mapping using EXTRA_HEADER_SYNONYMS first
+        mapped = map_header_using_extra_synonyms(cell_text)
+        if mapped:
+            header_label = mapped
+            log.debug(f"Mapped header '{cell_text}' -> '{mapped}'")
+        else:
+            header_label = cell_text
+
+        headers.append(cell_text)  # Original header
+        mapped_headers.append(header_label)  # Mapped header
+
+    # Process all rows
+    rows_text = []
+    rows_red_cells = []
+    rows_red_metadata = []
+
+    for r_i, row in enumerate(table.rows):
+        row_texts = []
+        row_reds = []
+        row_red_meta = []
+
+        for c_i, cell in enumerate(row.cells[:ncols]):
+            cell_text = cell.text.strip()
+
+            # Extract red text segments with metadata
+            red_segments = _extract_red_text_segments(cell)
+
+            if red_segments:
+                # Join all red text segments
+                red_text_parts = [seg['text'] for seg in red_segments if seg['text']]
+                red_text_joined = " ".join(red_text_parts).strip()
+
+                # Store metadata about red text location
+                red_metadata = {
+                    "has_red": True,
+                    "red_text": red_text_joined,
+                    "segments": len(red_segments),
+                    "total_red_runs": sum(len(seg['runs']) for seg in red_segments)
+                }
+            else:
+                red_text_joined = None
+                red_metadata = {"has_red": False}
+
+            row_texts.append(cell_text)
+            row_reds.append(red_text_joined)
+            row_red_meta.append(red_metadata)
+
+        rows_text.append(row_texts)
+        rows_red_cells.append(row_reds)
+        rows_red_metadata.append(row_red_meta)
+
+    return {
+        "table_index": t_index,
+        "nrows": nrows,
+        "ncols": ncols,
+        "headers": headers,  # Original headers
+        "mapped_headers": mapped_headers,  # Mapped headers
+        "rows": rows_text,
+        "red_cells": rows_red_cells,
+        "red_metadata": rows_red_metadata  # Additional red text metadata
+    }
+
 # ------------------------------
 # Extraction: paragraphs, headings, tables
 # ------------------------------
 def extract_from_docx(path: str) -> Dict[str, Any]:
+    """Extract content from DOCX with enhanced red text detection and schema mapping."""
+    log.info(f"Opening document: {path}")
     doc = Document(path)
+
     headings: List[str] = []
     paragraphs_red: List[Dict[str, Any]] = []
     red_runs: List[Dict[str, Any]] = []
     tables_out: List[Dict[str, Any]] = []

+    # Extract headings and paragraphs with red runs
+    log.info("Processing paragraphs and headings...")
     for p_index, para in enumerate(doc.paragraphs):
         text = para.text or ""
+
+        # Identify heading level from style name if available
         style_name = getattr(para.style, "name", "") if para.style is not None else ""
+        is_heading = bool(re.search(r"Heading\s*\d+|HEADING|TITLE|SUBTITLE", style_name, flags=re.I)) or \
+                     bool(re.search(r"^(MAINTENANCE|MASS|FATIGUE|NHVAS|Vehicle Registration|CORRECTIVE)", text, flags=re.I))
+
         if is_heading:
             headings.append(text.strip())
+            log.debug(f"Found heading: {text.strip()}")

+        # Gather red runs in this paragraph
         paragraph_red_texts = []
         char_cursor = 0
+
         for run in para.runs:
             run_text = run.text or ""
             run_len = len(run_text)
+
             if _run_is_red(run) and run_text.strip():
+                # Store a red run entry
                 rr = {
                     "text": run_text,
                     "paragraph_index": p_index,
                     "char_index": char_cursor,
+                    "style_name": style_name,
+                    "normalized_text": _normalize_text(run_text)
                 }
                 red_runs.append(rr)
                 paragraph_red_texts.append(run_text)
+                log.debug(f"Found red text in paragraph {p_index}: '{run_text.strip()}'")
+
             char_cursor += run_len
+
         if paragraph_red_texts:
             paragraphs_red.append({
                 "paragraph_index": p_index,
                 "text": text,
                 "red_texts": paragraph_red_texts,
+                "style_name": style_name,
+                "red_text_joined": " ".join(paragraph_red_texts).strip()
             })

+    # Extract tables with enhanced processing
+    log.info(f"Processing {len(doc.tables)} tables...")
     for t_index, table in enumerate(doc.tables):
+        table_data = _process_table_with_schema_mapping(table, t_index)
+        tables_out.append(table_data)
+
+        # Log red text findings
+        red_cell_count = sum(1 for row in table_data["red_cells"] for cell in row if cell)
+        if red_cell_count > 0:
+            log.info(f"Table {t_index}: Found {red_cell_count} cells with red text")

+    # Assemble output structure
     out = {
         "headings": headings,
         "paragraphs": paragraphs_red,
         "tables": tables_out,
         "red_runs": red_runs,
+        # Enhanced metadata
         "meta": {
             "source_file": path,
             "total_headings": len(headings),
             "total_red_paragraphs": len(paragraphs_red),
             "total_tables": len(tables_out),
+            "total_red_runs": len(red_runs),
+            "total_red_cells": sum(
+                1 for table in tables_out for row in table["red_cells"] for cell in row if cell
+            ),
+            "global_settings_used": {
+                "normalization": GLOBAL_SETTINGS.get("normalize", {}),
+                "ocr_repair_rules_count": len(GLOBAL_SETTINGS.get("ocr_repair_rules", [])),
+                "synonyms_count": len(EXTRA_HEADER_SYNONYMS) if EXTRA_HEADER_SYNONYMS else 0
+            }
         }
     }
+
     return out

 # ------------------------------
 
     if len(argv) < 3:
         print("Usage: python extract_red_text.py input.docx output.json")
         sys.exit(2)
+
     input_docx = argv[1]
     output_json = argv[2]

+    log.info("Starting red text extraction from: %s", input_docx)
+    log.info("Using master_key configuration with %d header synonyms",
+             len(EXTRA_HEADER_SYNONYMS) if EXTRA_HEADER_SYNONYMS else 0)
+
     try:
         result = extract_from_docx(input_docx)
     except Exception as exc:
 
     try:
         with open(output_json, "w", encoding="utf-8") as fh:
             json.dump(result, fh, ensure_ascii=False, indent=2)
+        log.info("Saved extracted data to: %s", output_json)
     except Exception:
         log.exception("Failed to write output JSON to %s", output_json)
         raise

+    # Print comprehensive summary
+    meta = result.get("meta", {})
+    log.info("=== EXTRACTION SUMMARY ===")
+    log.info("Headings found: %d", meta.get("total_headings", 0))
+    log.info("Red paragraphs: %d", meta.get("total_red_paragraphs", 0))
+    log.info("Red runs total: %d", meta.get("total_red_runs", 0))
+    log.info("Tables processed: %d", meta.get("total_tables", 0))
+    log.info("Red cells found: %d", meta.get("total_red_cells", 0))
+    log.info("Header synonyms used: %d", meta.get("global_settings_used", {}).get("synonyms_count", 0))

 if __name__ == "__main__":
     main(sys.argv)
+    # Print output for verification
     if len(sys.argv) >= 3:
+        try:
+            with open(sys.argv[2], 'r') as f:
+                print(f"\n📄 EXTRACT_RED_TEXT OUTPUT:\n{f.read()}")
+        except Exception as e:
+            print(f"\n❌ Could not read output file: {e}")