Shami96 committed on
Commit
60df8d0
·
verified ·
1 Parent(s): b0a4dc4

Update extract_red_text.py

Browse files
Files changed (1) hide show
  1. extract_red_text.py +401 -99
extract_red_text.py CHANGED
@@ -7,23 +7,25 @@ from docx.oxml.ns import qn
7
  from master_key import TABLE_SCHEMAS, HEADING_PATTERNS, PARAGRAPH_PATTERNS
8
 
9
  def is_red_font(run):
 
10
  col = run.font.color
11
  if col and col.rgb:
12
  r, g, b = col.rgb
13
- if r>150 and g<100 and b<100 and (r-g)>30 and (r-b)>30:
14
  return True
15
  rPr = getattr(run._element, "rPr", None)
16
  if rPr is not None:
17
  clr = rPr.find(qn('w:color'))
18
  if clr is not None:
19
  val = clr.get(qn('w:val'))
20
- if re.fullmatch(r"[0-9A-Fa-f]{6}", val):
21
- rr, gg, bb = int(val[:2],16), int(val[2:4],16), int(val[4:],16)
22
- if rr>150 and gg<100 and bb<100 and (rr-gg)>30 and (rr-bb)>30:
23
  return True
24
  return False
25
 
26
  def _prev_para_text(tbl):
 
27
  prev = tbl._tbl.getprevious()
28
  while prev is not None and not prev.tag.endswith("}p"):
29
  prev = prev.getprevious()
@@ -31,127 +33,427 @@ def _prev_para_text(tbl):
31
  return ""
32
  return "".join(node.text for node in prev.iter() if node.tag.endswith("}t") and node.text).strip()
33
 
34
- def match_table_schema(tbl):
35
- # look for explicit heading constraint
36
- heading = _prev_para_text(tbl)
37
- headers = [c.text.strip() for c in tbl.rows[0].cells]
38
- col0 = [r.cells[0].text.strip() for r in tbl.rows]
39
-
40
- # 1) exact first-cell name
41
- first = tbl.rows[0].cells[0].text.strip()
42
- if first in TABLE_SCHEMAS:
43
- spec = TABLE_SCHEMAS[first]
44
- if not spec.get("headings") or any(h["text"]==heading for h in spec.get("headings",[])):
45
- return first
46
 
47
- # 2) any other schema with explicit headings
48
- for name, spec in TABLE_SCHEMAS.items():
49
- if any(h["text"]==heading for h in spec.get("headings",[])):
50
- return name
 
 
 
51
 
52
- # 3) by two-column 'columns'
53
- for name, spec in TABLE_SCHEMAS.items():
54
- cols = spec.get("columns")
55
- if cols and all(col in headers for col in cols):
56
- return name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
- # 4) row1 tables
59
- for name, spec in TABLE_SCHEMAS.items():
60
- if spec["orientation"]=="row1" and all(lbl in headers for lbl in spec["labels"]):
61
- return name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
- # 5) left tables
 
 
 
 
64
  for name, spec in TABLE_SCHEMAS.items():
65
- if spec["orientation"]=="left" and all(lbl in col0 for lbl in spec["labels"]):
66
- return name
67
-
 
 
 
68
  return None
69
 
70
- def extract_red_text(path):
71
- doc = Document(path)
72
- out = {}
73
-
74
- # --- TABLES ---
75
- for tbl in doc.tables:
76
- schema = match_table_schema(tbl)
77
- if not schema:
78
- continue
79
- spec = TABLE_SCHEMAS[schema]
80
-
81
- # handle the special split_labels (row1 only)
82
- if spec.get("split_labels") and spec["orientation"]=="row1":
83
- cell_txt = tbl.rows[1].cells[0].text.strip()
84
- first_lbl = spec["split_labels"][0]
85
- narrative, _, tail = cell_txt.partition(first_lbl)
86
- narrative = narrative.strip()
87
- if narrative:
88
- out.setdefault(schema, {}).setdefault(spec["labels"][0], []).append(narrative)
89
 
90
- for i, lbl in enumerate(spec["split_labels"]):
91
- nxt = spec["split_labels"][i+1] if i+1<len(spec["split_labels"]) else None
92
- pattern = rf"{re.escape(lbl)}\s*(.+?)(?={re.escape(nxt)})" if nxt else rf"{re.escape(lbl)}\s*(.+)$"
93
- m = re.search(pattern, cell_txt, flags=re.DOTALL)
94
- if m:
95
- val = m.group(1).strip()
96
- out.setdefault(schema, {}).setdefault(lbl, []).append(val)
97
  continue
98
-
99
- # normal tables
100
- labels = spec["labels"] + [schema]
101
- collected = {lbl: [] for lbl in labels}
102
- seen = {lbl: set() for lbl in labels}
103
- by_col = (spec["orientation"]=="row1")
104
-
105
- rows = tbl.rows[1:]
106
- for ri, row in enumerate(rows):
 
 
 
 
 
 
 
 
107
  for ci, cell in enumerate(row.cells):
108
  red_txt = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red_font(run)).strip()
109
- if not red_txt:
110
- continue
 
 
 
 
 
 
111
 
112
- if by_col:
113
- # column header your defined label
114
- lbl = spec["labels"][ci] if ci < len(spec["labels"]) else schema
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  else:
116
- # first cell in this row → must be one of your labels
117
- raw = row.cells[0].text.strip()
118
- lbl = raw if raw in spec["labels"] else schema
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
- if red_txt not in seen[lbl]:
121
- seen[lbl].add(red_txt)
122
- collected[lbl].append(red_txt)
123
-
124
- # keep only non-empty
125
- data = {k:v for k,v in collected.items() if v}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  if data:
127
- out[schema] = data
128
-
129
- # --- PARAGRAPHS ---
 
 
 
 
 
130
  paras = {}
131
  for idx, para in enumerate(doc.paragraphs):
132
  red_txt = "".join(r.text for r in para.runs if is_red_font(r)).strip()
133
  if not red_txt:
134
  continue
135
-
136
- # find nearest heading above
137
  context = None
138
  for j in range(idx-1, -1, -1):
139
- txt = doc.paragraphs[j].text.strip()
140
- if txt and any(re.search(p, txt) for p in HEADING_PATTERNS["main"]+HEADING_PATTERNS["sub"]):
141
- context = txt
142
- break
143
-
144
- # fallback for date line
145
  if not context and re.fullmatch(PARAGRAPH_PATTERNS["date_line"], red_txt):
146
  context = "Date"
147
-
148
- paras.setdefault(context or "(para)", []).append(red_txt)
149
-
150
  if paras:
151
  out["paragraphs"] = paras
152
-
153
  return out
154
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  if __name__ == "__main__":
156
- fn = sys.argv[1] if len(sys.argv)>1 else "test.docx"
157
- print(json.dumps(extract_red_text(fn), indent=2, ensure_ascii=False))
 
 
 
 
 
 
 
 
 
 
7
  from master_key import TABLE_SCHEMAS, HEADING_PATTERNS, PARAGRAPH_PATTERNS
8
 
9
  def is_red_font(run):
10
+ """Enhanced red font detection with better color checking"""
11
  col = run.font.color
12
  if col and col.rgb:
13
  r, g, b = col.rgb
14
+ if r > 150 and g < 100 and b < 100 and (r-g) > 30 and (r-b) > 30:
15
  return True
16
  rPr = getattr(run._element, "rPr", None)
17
  if rPr is not None:
18
  clr = rPr.find(qn('w:color'))
19
  if clr is not None:
20
  val = clr.get(qn('w:val'))
21
+ if val and re.fullmatch(r"[0-9A-Fa-f]{6}", val):
22
+ rr, gg, bb = int(val[:2], 16), int(val[2:4], 16), int(val[4:], 16)
23
+ if rr > 150 and gg < 100 and bb < 100 and (rr-gg) > 30 and (rr-bb) > 30:
24
  return True
25
  return False
26
 
27
  def _prev_para_text(tbl):
28
+ """Get text from previous paragraph before table"""
29
  prev = tbl._tbl.getprevious()
30
  while prev is not None and not prev.tag.endswith("}p"):
31
  prev = prev.getprevious()
 
33
  return ""
34
  return "".join(node.text for node in prev.iter() if node.tag.endswith("}t") and node.text).strip()
35
 
36
+ def normalize_text(text):
37
+ """Normalize text for better matching"""
38
+ return re.sub(r'\s+', ' ', text.strip())
 
 
 
 
 
 
 
 
 
39
 
40
+ def fuzzy_match_heading(heading, patterns):
41
+ """Check if heading matches any pattern with fuzzy matching"""
42
+ heading_norm = normalize_text(heading.upper())
43
+ for pattern in patterns:
44
+ if re.search(pattern, heading_norm, re.IGNORECASE):
45
+ return True
46
+ return False
47
 
48
+ def get_table_context(tbl):
49
+ """Get comprehensive context information for table"""
50
+ heading = normalize_text(_prev_para_text(tbl))
51
+ headers = [normalize_text(c.text) for c in tbl.rows[0].cells if c.text.strip()]
52
+ col0 = [normalize_text(r.cells[0].text) for r in tbl.rows if r.cells[0].text.strip()]
53
+ first_cell = normalize_text(tbl.rows[0].cells[0].text) if tbl.rows else ""
54
+ all_cells = []
55
+ for row in tbl.rows:
56
+ for cell in row.cells:
57
+ text = normalize_text(cell.text)
58
+ if text:
59
+ all_cells.append(text)
60
+ return {
61
+ 'heading': heading,
62
+ 'headers': headers,
63
+ 'col0': col0,
64
+ 'first_cell': first_cell,
65
+ 'all_cells': all_cells,
66
+ 'num_rows': len(tbl.rows),
67
+ 'num_cols': len(tbl.rows[0].cells) if tbl.rows else 0
68
+ }
69
 
70
+ def calculate_schema_match_score(schema_name, spec, context):
71
+ """Enhanced calculate match score - IMPROVED for Vehicle Registration tables"""
72
+ score = 0
73
+ reasons = []
74
+
75
+ # 🎯 VEHICLE REGISTRATION BOOST
76
+ if "Vehicle Registration" in schema_name:
77
+ vehicle_keywords = ["registration", "vehicle", "sub-contractor", "weight verification", "rfs suspension"]
78
+ table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
79
+
80
+ keyword_matches = sum(1 for keyword in vehicle_keywords if keyword in table_text)
81
+ if keyword_matches >= 2:
82
+ score += 150 # Very high boost for vehicle tables
83
+ reasons.append(f"Vehicle Registration keywords: {keyword_matches}/5")
84
+ elif keyword_matches >= 1:
85
+ score += 75 # Medium boost
86
+ reasons.append(f"Some Vehicle Registration keywords: {keyword_matches}/5")
87
+
88
+ # 🎯 SUMMARY TABLE BOOST (existing logic)
89
+ if "Summary" in schema_name and "details" in " ".join(context['headers']).lower():
90
+ score += 100
91
+ reasons.append(f"Summary schema with DETAILS column - perfect match")
92
+
93
+ if "Summary" not in schema_name and "details" in " ".join(context['headers']).lower():
94
+ score -= 75
95
+ reasons.append(f"Non-summary schema penalized for DETAILS column presence")
96
+
97
+ # Context exclusions
98
+ if spec.get("context_exclusions"):
99
+ table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
100
+ for exclusion in spec["context_exclusions"]:
101
+ if exclusion.lower() in table_text:
102
+ score -= 50
103
+ reasons.append(f"Context exclusion penalty: '{exclusion}' found")
104
+
105
+ # Context keywords
106
+ if spec.get("context_keywords"):
107
+ table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
108
+ keyword_matches = 0
109
+ for keyword in spec["context_keywords"]:
110
+ if keyword.lower() in table_text:
111
+ keyword_matches += 1
112
+
113
+ if keyword_matches > 0:
114
+ score += keyword_matches * 15
115
+ reasons.append(f"Context keyword matches: {keyword_matches}/{len(spec['context_keywords'])}")
116
+
117
+ # Direct first cell match
118
+ if context['first_cell'] and context['first_cell'].upper() == schema_name.upper():
119
+ score += 100
120
+ reasons.append(f"Direct first cell match: '{context['first_cell']}'")
121
+
122
+ # Heading pattern matching
123
+ if spec.get("headings"):
124
+ for h in spec["headings"]:
125
+ if fuzzy_match_heading(context['heading'], [h["text"]]):
126
+ score += 50
127
+ reasons.append(f"Heading match: '{context['heading']}'")
128
+ break
129
+
130
+ # Column header matching
131
+ if spec.get("columns"):
132
+ cols = [normalize_text(col) for col in spec["columns"]]
133
+ matches = 0
134
+ for col in cols:
135
+ if any(col.upper() in h.upper() for h in context['headers']):
136
+ matches += 1
137
+ if matches == len(cols):
138
+ score += 60
139
+ reasons.append(f"All column headers match: {cols}")
140
+ elif matches > 0:
141
+ score += matches * 20
142
+ reasons.append(f"Partial column matches: {matches}/{len(cols)}")
143
+
144
+ # Label matching for left-oriented tables
145
+ if spec.get("orientation") == "left":
146
+ labels = [normalize_text(lbl) for lbl in spec["labels"]]
147
+ matches = 0
148
+ for lbl in labels:
149
+ if any(lbl.upper() in c.upper() or c.upper() in lbl.upper() for c in context['col0']):
150
+ matches += 1
151
+ if matches > 0:
152
+ score += (matches / len(labels)) * 30
153
+ reasons.append(f"Left orientation label matches: {matches}/{len(labels)}")
154
+
155
+ # 🎯 ENHANCED Label matching for row1-oriented tables (Vehicle Registration)
156
+ elif spec.get("orientation") == "row1":
157
+ labels = [normalize_text(lbl) for lbl in spec["labels"]]
158
+ matches = 0
159
+ for lbl in labels:
160
+ # More flexible matching for vehicle tables
161
+ if any(lbl.upper() in h.upper() or h.upper() in lbl.upper() for h in context['headers']):
162
+ matches += 1
163
+ # Also check for partial keyword matches
164
+ elif any(word.upper() in " ".join(context['headers']).upper() for word in lbl.split() if len(word) > 3):
165
+ matches += 0.5 # Partial credit
166
+
167
+ if matches > 0:
168
+ score += (matches / len(labels)) * 40 # Higher weight for row1 tables
169
+ reasons.append(f"Row1 orientation header matches: {matches}/{len(labels)}")
170
+
171
+ # Special handling for Declaration tables (existing logic)
172
+ if schema_name == "Operator Declaration" and context['first_cell'].upper() == "PRINT NAME":
173
+ if "OPERATOR DECLARATION" in context['heading'].upper():
174
+ score += 80
175
+ reasons.append("Operator Declaration context match")
176
+ elif any("MANAGER" in cell.upper() for cell in context['all_cells']):
177
+ score += 60
178
+ reasons.append("Manager found in cells (likely Operator Declaration)")
179
+
180
+ if schema_name == "NHVAS Approved Auditor Declaration" and context['first_cell'].upper() == "PRINT NAME":
181
+ if any("MANAGER" in cell.upper() for cell in context['all_cells']):
182
+ score -= 50
183
+ reasons.append("Penalty: Manager found (not auditor)")
184
+
185
+ return score, reasons
186
 
187
+ def match_table_schema(tbl):
188
+ """Improved table schema matching with scoring system"""
189
+ context = get_table_context(tbl)
190
+ best_match = None
191
+ best_score = 0
192
  for name, spec in TABLE_SCHEMAS.items():
193
+ score, reasons = calculate_schema_match_score(name, spec, context)
194
+ if score > best_score:
195
+ best_score = score
196
+ best_match = name
197
+ if best_score >= 20:
198
+ return best_match
199
  return None
200
 
201
+ def check_multi_schema_table(tbl):
202
+ """Check if table contains multiple schemas and split appropriately"""
203
+ context = get_table_context(tbl)
204
+ operator_labels = ["Operator name (Legal entity)", "NHVAS Accreditation No.", "Registered trading name/s",
205
+ "Australian Company Number", "NHVAS Manual"]
206
+ contact_labels = ["Operator business address", "Operator Postal address", "Email address", "Operator Telephone Number"]
207
+ has_operator = any(any(op_lbl.upper() in cell.upper() for op_lbl in operator_labels) for cell in context['col0'])
208
+ has_contact = any(any(cont_lbl.upper() in cell.upper() for cont_lbl in contact_labels) for cell in context['col0'])
209
+ if has_operator and has_contact:
210
+ return ["Operator Information", "Operator contact details"]
211
+ return None
 
 
 
 
 
 
 
 
212
 
213
+ def extract_multi_schema_table(tbl, schemas):
214
+ """Extract data from table with multiple schemas"""
215
+ result = {}
216
+ for schema_name in schemas:
217
+ if schema_name not in TABLE_SCHEMAS:
 
 
218
  continue
219
+ spec = TABLE_SCHEMAS[schema_name]
220
+ schema_data = {}
221
+ for ri, row in enumerate(tbl.rows):
222
+ if ri == 0:
223
+ continue
224
+ row_label = normalize_text(row.cells[0].text)
225
+ belongs_to_schema = False
226
+ matched_label = None
227
+ for spec_label in spec["labels"]:
228
+ spec_norm = normalize_text(spec_label).upper()
229
+ row_norm = row_label.upper()
230
+ if spec_norm == row_norm or spec_norm in row_norm or row_norm in spec_norm:
231
+ belongs_to_schema = True
232
+ matched_label = spec_label
233
+ break
234
+ if not belongs_to_schema:
235
+ continue
236
  for ci, cell in enumerate(row.cells):
237
  red_txt = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red_font(run)).strip()
238
+ if red_txt:
239
+ if matched_label not in schema_data:
240
+ schema_data[matched_label] = []
241
+ if red_txt not in schema_data[matched_label]:
242
+ schema_data[matched_label].append(red_txt)
243
+ if schema_data:
244
+ result[schema_name] = schema_data
245
+ return result
246
 
247
+ def extract_table_data(tbl, schema_name, spec):
248
+ """Extract red text data from table based on schema - ENHANCED for Vehicle Registration"""
249
+
250
+ # 🎯 SPECIAL HANDLING for Vehicle Registration tables
251
+ if "Vehicle Registration" in schema_name:
252
+ print(f" 🚗 EXTRACTION FIX: Processing Vehicle Registration table")
253
+
254
+ labels = spec["labels"]
255
+ collected = {lbl: [] for lbl in labels}
256
+ seen = {lbl: set() for lbl in labels}
257
+
258
+ # For Vehicle Registration, orientation is "row1" - headers in first row
259
+ if len(tbl.rows) < 2:
260
+ print(f" ❌ Vehicle table has less than 2 rows")
261
+ return {}
262
+
263
+ # Map header cells to labels
264
+ header_row = tbl.rows[0]
265
+ column_mapping = {}
266
+
267
+ print(f" 📋 Mapping {len(header_row.cells)} header cells to labels")
268
+
269
+ for col_idx, cell in enumerate(header_row.cells):
270
+ header_text = normalize_text(cell.text).strip()
271
+ if not header_text:
272
+ continue
273
+
274
+ print(f" Column {col_idx}: '{header_text}'")
275
+
276
+ # Find best matching label
277
+ best_match = None
278
+ best_score = 0
279
+
280
+ for label in labels:
281
+ # Direct match
282
+ if header_text.upper() == label.upper():
283
+ best_match = label
284
+ best_score = 1.0
285
+ break
286
+
287
+ # Partial keyword matching
288
+ header_words = set(word.upper() for word in header_text.split() if len(word) > 2)
289
+ label_words = set(word.upper() for word in label.split() if len(word) > 2)
290
+
291
+ if header_words and label_words:
292
+ common_words = header_words.intersection(label_words)
293
+ if common_words:
294
+ score = len(common_words) / max(len(header_words), len(label_words))
295
+ if score > best_score and score >= 0.4: # Lower threshold for vehicle tables
296
+ best_score = score
297
+ best_match = label
298
+
299
+ if best_match:
300
+ column_mapping[col_idx] = best_match
301
+ print(f" ✅ Mapped to: '{best_match}' (score: {best_score:.2f})")
302
+ else:
303
+ print(f" ⚠️ No mapping found for '{header_text}'")
304
+
305
+ print(f" 📊 Total column mappings: {len(column_mapping)}")
306
+
307
+ # Extract red text from data rows (skip header)
308
+ for row_idx in range(1, len(tbl.rows)):
309
+ row = tbl.rows[row_idx]
310
+ print(f" 📌 Processing data row {row_idx}")
311
+
312
+ for col_idx, cell in enumerate(row.cells):
313
+ if col_idx in column_mapping:
314
+ label = column_mapping[col_idx]
315
+
316
+ # Extract red text
317
+ red_txt = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red_font(run)).strip()
318
+
319
+ if red_txt:
320
+ print(f" 🔴 Found red text in '{label}': '{red_txt}'")
321
+
322
+ if red_txt not in seen[label]:
323
+ seen[label].add(red_txt)
324
+ collected[label].append(red_txt)
325
+
326
+ # Return only non-empty collections
327
+ result = {k: v for k, v in collected.items() if v}
328
+ print(f" ✅ Vehicle Registration extracted: {len(result)} columns with data")
329
+ return result
330
+
331
+ # 🎯 ORIGINAL CODE for all other tables (unchanged)
332
+ labels = spec["labels"] + [schema_name]
333
+ collected = {lbl: [] for lbl in labels}
334
+ seen = {lbl: set() for lbl in labels}
335
+ by_col = (spec["orientation"] == "row1")
336
+ start_row = 1 if by_col else 0
337
+ rows = tbl.rows[start_row:]
338
+
339
+ for ri, row in enumerate(rows):
340
+ for ci, cell in enumerate(row.cells):
341
+ red_txt = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red_font(run)).strip()
342
+ if not red_txt:
343
+ continue
344
+ if by_col:
345
+ if ci < len(spec["labels"]):
346
+ lbl = spec["labels"][ci]
347
  else:
348
+ lbl = schema_name
349
+ else:
350
+ raw_label = normalize_text(row.cells[0].text)
351
+ lbl = None
352
+ for spec_label in spec["labels"]:
353
+ if normalize_text(spec_label).upper() == raw_label.upper():
354
+ lbl = spec_label
355
+ break
356
+ if not lbl:
357
+ for spec_label in spec["labels"]:
358
+ spec_norm = normalize_text(spec_label).upper()
359
+ raw_norm = raw_label.upper()
360
+ if spec_norm in raw_norm or raw_norm in spec_norm:
361
+ lbl = spec_label
362
+ break
363
+ if not lbl:
364
+ lbl = schema_name
365
+ if red_txt not in seen[lbl]:
366
+ seen[lbl].add(red_txt)
367
+ collected[lbl].append(red_txt)
368
+ return {k: v for k, v in collected.items() if v}
369
 
370
+ def extract_red_text(input_doc):
371
+ # input_doc: docx.Document object or file path
372
+ if isinstance(input_doc, str):
373
+ doc = Document(input_doc)
374
+ else:
375
+ doc = input_doc
376
+ out = {}
377
+ table_count = 0
378
+ for tbl in doc.tables:
379
+ table_count += 1
380
+ multi_schemas = check_multi_schema_table(tbl)
381
+ if multi_schemas:
382
+ multi_data = extract_multi_schema_table(tbl, multi_schemas)
383
+ for schema_name, schema_data in multi_data.items():
384
+ if schema_data:
385
+ if schema_name in out:
386
+ for k, v in schema_data.items():
387
+ if k in out[schema_name]:
388
+ out[schema_name][k].extend(v)
389
+ else:
390
+ out[schema_name][k] = v
391
+ else:
392
+ out[schema_name] = schema_data
393
+ continue
394
+ schema = match_table_schema(tbl)
395
+ if not schema:
396
+ continue
397
+ spec = TABLE_SCHEMAS[schema]
398
+ data = extract_table_data(tbl, schema, spec)
399
  if data:
400
+ if schema in out:
401
+ for k, v in data.items():
402
+ if k in out[schema]:
403
+ out[schema][k].extend(v)
404
+ else:
405
+ out[schema][k] = v
406
+ else:
407
+ out[schema] = data
408
  paras = {}
409
  for idx, para in enumerate(doc.paragraphs):
410
  red_txt = "".join(r.text for r in para.runs if is_red_font(r)).strip()
411
  if not red_txt:
412
  continue
 
 
413
  context = None
414
  for j in range(idx-1, -1, -1):
415
+ txt = normalize_text(doc.paragraphs[j].text)
416
+ if txt:
417
+ all_patterns = HEADING_PATTERNS["main"] + HEADING_PATTERNS["sub"]
418
+ if any(re.search(p, txt, re.IGNORECASE) for p in all_patterns):
419
+ context = txt
420
+ break
421
  if not context and re.fullmatch(PARAGRAPH_PATTERNS["date_line"], red_txt):
422
  context = "Date"
423
+ if not context:
424
+ context = "(para)"
425
+ paras.setdefault(context, []).append(red_txt)
426
  if paras:
427
  out["paragraphs"] = paras
 
428
  return out
429
 
430
+ def extract_red_text_filelike(input_file, output_file):
431
+ """
432
+ Accepts:
433
+ input_file: file-like object (BytesIO/File) or path
434
+ output_file: file-like object (opened for writing text) or path
435
+ """
436
+ if hasattr(input_file, "seek"):
437
+ input_file.seek(0)
438
+ doc = Document(input_file)
439
+ result = extract_red_text(doc)
440
+ if hasattr(output_file, "write"):
441
+ json.dump(result, output_file, indent=2, ensure_ascii=False)
442
+ output_file.flush()
443
+ else:
444
+ with open(output_file, "w", encoding="utf-8") as f:
445
+ json.dump(result, f, indent=2, ensure_ascii=False)
446
+ return result
447
+
448
  if __name__ == "__main__":
449
+ # Support both script and app/file-like usage
450
+ if len(sys.argv) == 3:
451
+ input_docx = sys.argv[1]
452
+ output_json = sys.argv[2]
453
+ doc = Document(input_docx)
454
+ word_data = extract_red_text(doc)
455
+ with open(output_json, 'w', encoding='utf-8') as f:
456
+ json.dump(word_data, f, indent=2, ensure_ascii=False)
457
+ print(json.dumps(word_data, indent=2, ensure_ascii=False))
458
+ else:
459
+ print("To use as a module: extract_red_text_filelike(input_file, output_file)")