Shami96 committed
Commit 1055fe1 · verified · 1 Parent(s): 8b6ed83

Update extract_red_text.py

Files changed (1): extract_red_text.py (+288 -312)
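Note: the rewrite below drives all table matching from a new `master_key` module, which is not part of this diff. As a rough guide, here is a minimal hypothetical stub showing the shape the code expects; the keys and patterns are inferred from usage in the new code, and the real module will differ:

# master_key.py — hypothetical minimal stub; shape inferred from extract_red_text.py
TABLE_SCHEMAS = {
    "Audit Information": {
        "orientation": "left",                         # labels sit in column 0 ("row1" = labels in header row)
        "labels": ["Date of Audit", "Location of audit", "Auditor name"],
        "headings": [{"text": r"AUDIT INFORMATION"}],  # optional regex heading hints
        # "columns": [...],                            # optional: all column headers must match
    },
}
HEADING_PATTERNS = {"main": [r"NHVAS\b"], "sub": [r"DECLARATION"]}
PARAGRAPH_PATTERNS = {"date_line": r"\d{1,2}[./-]\d{1,2}[./-]\d{2,4}"}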
extract_red_text.py CHANGED
@@ -1,335 +1,311 @@
  #!/usr/bin/env python3
- import re, json, sys
  from docx import Document
  from docx.oxml.ns import qn

- def is_red_font(run) -> bool:
-     """Return True if this run is coloured red-ish."""
      col = run.font.color
      if col and col.rgb:
-         r, g, b = col.rgb[0], col.rgb[1], col.rgb[2]
-         if r > 150 and g < 100 and b < 100 and (r-g) > 30 and (r-b) > 30:
              return True
-     # fallback: raw <w:color w:val="XXXXXX"/>
      rPr = getattr(run._element, "rPr", None)
      if rPr is not None:
          clr = rPr.find(qn('w:color'))
          if clr is not None:
              val = clr.get(qn('w:val'))
-             if re.fullmatch(r"[0-9A-Fa-f]{6}", val):
-                 rr, gg, bb = int(val[:2], 16), int(val[2:4], 16), int(val[4:], 16)
-                 if rr > 150 and gg < 100 and bb < 100 and (rr-gg) > 30 and (rr-bb) > 30:
                      return True
      return False

- # ─────────────────────────────────────────────────────────────────────────────
- # Your template, mapped 1:1 to doc.tables[0..18]
- MASTER_TABLES = [
-     # Table 0: Tick as appropriate (Mass, Maintenance, etc.)
-     {
-         "name": "Tick as appropriate",
-         "labels_on_row1": True,
-         "labels": [
-             "Mass", "Maintenance", "Basic Fatigue", "Advanced Fatigue",
-             "Entry Audit", "Initial Compliance Audit", "Compliance Audit",
-             "Spot Check", "Triggered Audit"
-         ]
-     },
-
-     # Table 1: Audit Information
-     {
-         "name": "Audit Information",
-         "labels_on_left": True,
-         "labels": [
-             "Date of Audit",
-             "Location of audit",
-             "Auditor name",
-             "Audit Matrix Identifier (Name or Number)",  # Corrected full label
-             "Auditor Exemplar Global Reg No.",
-             "expiry Date:",
-             "NHVR Auditor Registration Number",
-             "expiry Date:"  # Note: Duplicate label, might need special handling
-         ]
-     },
-
-     # Table 2: Operator Information (including contact details)
-     {
-         "name": "Operator Information",
-         "labels_on_left": True,
-         "skip_rows": ["Operator contact details", ""],  # Skip subheading and blank rows
-         "labels": [
-             "Operator name (Legal entity)",
-             "NHVAS Accreditation No. (If applicable)",
-             "Registered trading name/s",
-             "Australian Company Number",
-             "NHVAS Manual (Policies and Procedures) developed by",
-             "Operator business address",
-             "Operator Postal address",
-             "Email address",
-             "Operator Telephone Number"
-         ]
-     },
-
-     # Table 3: Attendance List
-     {
-         "name": "Attendance List (Names and Position Titles)",
-         "labels": ["Attendance List (Names and Position Titles)"]
-     },
-
-     # Table 4: Nature of the Operators Business
-     {
-         "name": "Nature of the Operators Business (Summary)",
-         "labels": [
-             "Nature of the Operators Business (Summary)",
-             "Accreditation Number:",
-             "Expiry Date:"
-         ]
-     },
-
-     # Table 5: Accreditation Vehicle Summary
-     {
-         "name": "Accreditation Vehicle Summary",
-         "labels_on_left": True,
-         "labels": [
-             "Number of powered vehicles",
-             "Number of trailing vehicles"
-         ]
-     },
-
-     # Table 6: Accreditation Driver Summary
-     {
-         "name": "Accreditation Driver Summary",
-         "labels_on_left": True,
-         "labels": [
-             "Number of drivers in BFM",
-             "Number of drivers in AFM"
-         ]
-     },
-
-     # Table 7: Compliance Codes
-     {
-         "name": "Compliance Codes",
-         "labels_on_row1": True,
-         "labels": ["V", "SFI", "NA", "NC", "NAP"]
-     },
-
-     # Table 8: Corrective Action Request Identification
-     {
-         "name": "Corrective Action Request Identification",
-         "labels_on_row1": True,
-         "labels": ["Title", "Abbreviation", "Description"]
-     },
-
-     # Table 9: MASS MANAGEMENT (Standards 1-8)
-     {
-         "name": "MASS MANAGEMENT",
-         "labels_on_left": True,
-         "labels": [
-             "Std 1. Responsibilities",
-             "Std 2. Vehicle Control",
-             "Std 3. Vehicle Use",
-             "Std 4. Records and Documentation",
-             "Std 5. Verification",
-             "Std 6. Internal Review",
-             "Std 7. Training and Education",
-             "Std 8. Maintenance of Suspension"
-         ]
-     },
-
-     # Table 10: Mass Management Summary of Audit findings (Standards 1-8)
-     {
-         "name": "Mass Management Summary of Audit findings",
-         "labels_on_left": True,
-         "labels": [
-             "Std 1. Responsibilities",
-             "Std 2. Vehicle Control",
-             "Std 3. Vehicle Use",
-             "Std 4. Records and Documentation",
-             "Std 5. Verification",
-             "Std 6. Internal Review",
-             "Std 7. Training and Education",
-             "Std 8. Maintenance of Suspension"
-         ]
-     },
-
-     # Table 11: Vehicle Registration Numbers of Records Examined
-     {
-         "name": "Vehicle Registration Numbers of Records Examined",
-         "labels_on_row1": True,
-         "labels": [
-             "No.", "Registration Number",
-             "Sub-contractor (Yes/No)",
-             "Sub-contracted Vehicles Statement of Compliance (Yes/No)",
-             "Weight Verification Records (Date Range)",
-             "RFS Suspension Certification # (N/A if not applicable)",
-             "Suspension System Maintenance (Date Range)",
-             "Trip Records (Date Range)",
-             "Fault Recording/ Reporting on Suspension System (Date Range)"
-         ]
-     },
-
-     # Table 12: Operator's Name (legal entity) - Signature block
-     {
-         "name": "Operator’s Name (legal entity)",
-         "labels": ["Operator’s Name (legal entity)"]
-     },
-
-     # Table 13: Non-conformance type
-     {
-         "name": "Non-conformance type (please tick)",
-         "labels": ["Un-conditional", "Conditional"]
-     },
-
-     # Table 14: Non-conformance Information
-     {
-         "name": "Non-conformance Information",
-         "labels_on_row1": True,
-         "labels": [
-             "Non-conformance agreed close out date",
-             "Module and Standard",
-             "Corrective Action Request (CAR) Number"
-         ]
-     },
-
-     # Table 15: Non-conformance and action taken
-     {
-         "name": "Non-conformance and action taken",
-         "labels_on_row1": True,
-         "labels": [
-             "Observed Non-conformance:",
-             "Corrective Action taken or to be taken by operator:",
-             "Operator or Representative Signature", "Position", "Date"
-         ]
-     },
-
-     # Table 16: Print Name / Auditor Reg Number
-     {
-         "name": "Print Name / Auditor Reg Number",
-         "labels_on_row1": True,
-         "labels": [
-             "Print Name",
-             "NHVR or Exemplar Global Auditor Registration Number"
-         ]
-     },
-
-     # Table 17: Audit Declaration
-     {
-         "name": "Audit Declaration",
-         "labels_on_left": True,
-         "labels": [
-             "Audit was conducted on",
-             "Unconditional CARs closed out on:",
-             "Conditional CARs to be closed out by:"
-         ]
-     },
-
-     # Table 18: print accreditation name
-     {
-         "name": "print accreditation name",
-         "labels": ["print accreditation name"]
-     },
-
-     # Table 19: Operator Declaration
-     {
-         "name": "Operator Declaration",
-         "labels_on_row1": True,
-         "labels": ["Print Name", "Position Title"]
-     }
- ]
-
- def extract_red_text(path):
-     doc = Document(path)
-
-     # debug print
-     print(f"Found {len(doc.tables)} tables:")
-     for i, t in enumerate(doc.tables):
-         print(f" Table#{i}: “{t.rows[0].cells[0].text.strip()[:30]}…”")
-     print()
-
-     out = {}
-     for ti, spec in enumerate(MASTER_TABLES):
-         if ti >= len(doc.tables):
-             break
-         tbl = doc.tables[ti]
-         name = spec["name"]
-
-         # prepare container & dedupe sets
-         collected = {lbl: [] for lbl in spec["labels"]}
-         seen = {lbl: set() for lbl in spec["labels"]}
-
-         # choose orientation
-         if spec.get("labels_on_row1"):
-             headers = spec["labels"]
-             rows = tbl.rows[1:]
-             col_mode = True
-         elif spec.get("labels_on_left"):
-             headers = spec["labels"]
-             # skip any unwanted header/subheading rows
-             rows = [
-                 row for row in tbl.rows[1:]
-                 if row.cells[0].text.strip() not in spec.get("skip_rows", [])
-             ]
-             col_mode = False
-         else:
-             headers = [name]
-             rows = tbl.rows
-             col_mode = None
-
-         # scan each cell
-         for ri, row in enumerate(rows):
-             for ci, cell in enumerate(row.cells):
-                 red = "".join(
-                     run.text for p in cell.paragraphs for run in p.runs
-                     if is_red_font(run)
-                 ).strip()
-                 if not red: continue
-
-                 # assign label
-                 if col_mode is True:
-                     lbl = headers[ci] if ci < len(headers) else name
-                 elif col_mode is False:
-                     lbl = headers[ri] if ri < len(headers) else name
                  else:
-                     lbl = name
-
-                 # dedupe & collect
-                 if red not in seen[lbl]:
-                     seen[lbl].add(red)
-                     collected[lbl].append(red)
-
-         # only keep non-empty labels
-         filtered = {l: collected[l] for l in collected if collected[l]}
-         if filtered:
-             out[name] = filtered
-
-     # paragraphs
      paras = {}
-     for i, para in enumerate(doc.paragraphs):
-         red = "".join(r.text for r in para.runs if is_red_font(r)).strip()
-         if not red: continue
-         # find nearest non-red above
-         lab = None
-         for j in range(i-1, -1, -1):
-             if any(is_red_font(r) for r in doc.paragraphs[j].runs):
-                 continue
-             txt = doc.paragraphs[j].text.strip()
              if txt:
-                 lab = txt; break
-         key = lab or "(para)"
-         paras.setdefault(key, []).append(red)
-
      if paras:
          out["paragraphs"] = paras
      return out

- if __name__ == "__main__":
-     import sys, json
-     # Usage: python extract_red_text.py input.docx output.json
-     input_docx = sys.argv[1]
-     output_json = sys.argv[2]
-     word_data = extract_red_text(input_docx)
-     with open(output_json, 'w', encoding='utf-8') as f:
-         json.dump(word_data, f, indent=2, ensure_ascii=False)
-
-     # still print to console for immediate feedback
-     print(json.dumps(word_data, indent=2, ensure_ascii=False))
  #!/usr/bin/env python3
+ import re
+ import json
+ import sys
  from docx import Document
  from docx.oxml.ns import qn
+ from master_key import TABLE_SCHEMAS, HEADING_PATTERNS, PARAGRAPH_PATTERNS

+ def is_red_font(run):
+     """Enhanced red font detection with better color checking"""
      col = run.font.color
      if col and col.rgb:
+         r, g, b = col.rgb
+         if r > 150 and g < 100 and b < 100 and (r-g) > 30 and (r-b) > 30:
              return True
      rPr = getattr(run._element, "rPr", None)
      if rPr is not None:
          clr = rPr.find(qn('w:color'))
          if clr is not None:
              val = clr.get(qn('w:val'))
+             if val and re.fullmatch(r"[0-9A-Fa-f]{6}", val):
+                 rr, gg, bb = int(val[:2], 16), int(val[2:4], 16), int(val[4:], 16)
+                 if rr > 150 and gg < 100 and bb < 100 and (rr-gg) > 30 and (rr-bb) > 30:
                      return True
      return False

+ def _prev_para_text(tbl):
+     """Get text from previous paragraph before table"""
+     prev = tbl._tbl.getprevious()
+     while prev is not None and not prev.tag.endswith("}p"):
+         prev = prev.getprevious()
+     if prev is None:
+         return ""
+     return "".join(node.text for node in prev.iter() if node.tag.endswith("}t") and node.text).strip()
+
+ def normalize_text(text):
+     """Normalize text for better matching"""
+     return re.sub(r'\s+', ' ', text.strip())
+
+ def fuzzy_match_heading(heading, patterns):
+     """Check if heading matches any pattern with fuzzy matching"""
+     heading_norm = normalize_text(heading.upper())
+     for pattern in patterns:
+         if re.search(pattern, heading_norm, re.IGNORECASE):
+             return True
+     return False
+
+ def get_table_context(tbl):
+     """Get comprehensive context information for table"""
+     heading = normalize_text(_prev_para_text(tbl))
+     headers = [normalize_text(c.text) for c in tbl.rows[0].cells if c.text.strip()]
+     col0 = [normalize_text(r.cells[0].text) for r in tbl.rows if r.cells[0].text.strip()]
+     first_cell = normalize_text(tbl.rows[0].cells[0].text) if tbl.rows else ""
+     all_cells = []
+     for row in tbl.rows:
+         for cell in row.cells:
+             text = normalize_text(cell.text)
+             if text:
+                 all_cells.append(text)
+     return {
+         'heading': heading,
+         'headers': headers,
+         'col0': col0,
+         'first_cell': first_cell,
+         'all_cells': all_cells,
+         'num_rows': len(tbl.rows),
+         'num_cols': len(tbl.rows[0].cells) if tbl.rows else 0
+     }
+
+ def calculate_schema_match_score(schema_name, spec, context):
+     """Calculate match score for a schema against table context"""
+     score = 0
+     reasons = []
+     if context['first_cell'] and context['first_cell'].upper() == schema_name.upper():
+         score += 100
+         reasons.append(f"Direct first cell match: '{context['first_cell']}'")
+     if spec.get("headings"):
+         for h in spec["headings"]:
+             if fuzzy_match_heading(context['heading'], [h["text"]]):
+                 score += 50
+                 reasons.append(f"Heading match: '{context['heading']}'")
+                 break
+     if spec.get("orientation") == "left":
+         labels = [normalize_text(lbl) for lbl in spec["labels"]]
+         matches = 0
+         for lbl in labels:
+             if any(lbl.upper() in c.upper() or c.upper() in lbl.upper() for c in context['col0']):
+                 matches += 1
+         if matches > 0:
+             score += (matches / len(labels)) * 30
+             reasons.append(f"Left orientation label matches: {matches}/{len(labels)}")
+     elif spec.get("orientation") == "row1":
+         labels = [normalize_text(lbl) for lbl in spec["labels"]]
+         matches = 0
+         for lbl in labels:
+             if any(lbl.upper() in h.upper() or h.upper() in lbl.upper() for h in context['headers']):
+                 matches += 1
+         if matches > 0:
+             score += (matches / len(labels)) * 30
+             reasons.append(f"Row1 orientation header matches: {matches}/{len(labels)}")
+     if spec.get("columns"):
+         cols = [normalize_text(col) for col in spec["columns"]]
+         matches = 0
+         for col in cols:
+             if any(col.upper() in h.upper() for h in context['headers']):
+                 matches += 1
+         if matches == len(cols):
+             score += 40
+             reasons.append(f"All column headers match: {cols}")
+     if schema_name == "Operator Declaration" and context['first_cell'].upper() == "PRINT NAME":
+         if "OPERATOR DECLARATION" in context['heading'].upper():
+             score += 80
+             reasons.append("Operator Declaration context match")
+         elif any("MANAGER" in cell.upper() for cell in context['all_cells']):
+             score += 60
+             reasons.append("Manager found in cells (likely Operator Declaration)")
+     if schema_name == "NHVAS Approved Auditor Declaration" and context['first_cell'].upper() == "PRINT NAME":
+         if any("MANAGER" in cell.upper() for cell in context['all_cells']):
+             score -= 50  # Penalty because auditors shouldn't be managers
+             reasons.append("Penalty: Manager found (not auditor)")
+     return score, reasons
+
+ def match_table_schema(tbl):
+     """Improved table schema matching with scoring system"""
+     context = get_table_context(tbl)
+     best_match = None
+     best_score = 0
+     for name, spec in TABLE_SCHEMAS.items():
+         score, reasons = calculate_schema_match_score(name, spec, context)
+         if score > best_score:
+             best_score = score
+             best_match = name
+     if best_score >= 20:
+         return best_match
+     return None
+
+ def check_multi_schema_table(tbl):
+     """Check if table contains multiple schemas and split appropriately"""
+     context = get_table_context(tbl)
+     operator_labels = ["Operator name (Legal entity)", "NHVAS Accreditation No.", "Registered trading name/s",
+                        "Australian Company Number", "NHVAS Manual"]
+     contact_labels = ["Operator business address", "Operator Postal address", "Email address", "Operator Telephone Number"]
+     has_operator = any(any(op_lbl.upper() in cell.upper() for op_lbl in operator_labels) for cell in context['col0'])
+     has_contact = any(any(cont_lbl.upper() in cell.upper() for cont_lbl in contact_labels) for cell in context['col0'])
+     if has_operator and has_contact:
+         return ["Operator Information", "Operator contact details"]
+     return None
+
+ def extract_multi_schema_table(tbl, schemas):
+     """Extract data from table with multiple schemas"""
+     result = {}
+     for schema_name in schemas:
+         if schema_name not in TABLE_SCHEMAS:
+             continue
+         spec = TABLE_SCHEMAS[schema_name]
+         schema_data = {}
+         for ri, row in enumerate(tbl.rows):
+             if ri == 0:
+                 continue
+             row_label = normalize_text(row.cells[0].text)
+             belongs_to_schema = False
+             matched_label = None
+             for spec_label in spec["labels"]:
+                 spec_norm = normalize_text(spec_label).upper()
+                 row_norm = row_label.upper()
+                 if spec_norm == row_norm or spec_norm in row_norm or row_norm in spec_norm:
+                     belongs_to_schema = True
+                     matched_label = spec_label
+                     break
+             if not belongs_to_schema:
+                 continue
+             for ci, cell in enumerate(row.cells):
+                 red_txt = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red_font(run)).strip()
+                 if red_txt:
+                     if matched_label not in schema_data:
+                         schema_data[matched_label] = []
+                     if red_txt not in schema_data[matched_label]:
+                         schema_data[matched_label].append(red_txt)
+         if schema_data:
+             result[schema_name] = schema_data
+     return result
+
+ def extract_table_data(tbl, schema_name, spec):
+     """Extract red text data from table based on schema"""
+     labels = spec["labels"] + [schema_name]
+     collected = {lbl: [] for lbl in labels}
+     seen = {lbl: set() for lbl in labels}
+     by_col = (spec["orientation"] == "row1")
+     start_row = 1 if by_col else 0
+     rows = tbl.rows[start_row:]
+     for ri, row in enumerate(rows):
+         for ci, cell in enumerate(row.cells):
+             red_txt = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red_font(run)).strip()
+             if not red_txt:
+                 continue
+             if by_col:
+                 if ci < len(spec["labels"]):
+                     lbl = spec["labels"][ci]
                  else:
+                     lbl = schema_name
+             else:
+                 raw_label = normalize_text(row.cells[0].text)
+                 lbl = None
+                 for spec_label in spec["labels"]:
+                     if normalize_text(spec_label).upper() == raw_label.upper():
+                         lbl = spec_label
+                         break
+                 if not lbl:
+                     for spec_label in spec["labels"]:
+                         spec_norm = normalize_text(spec_label).upper()
+                         raw_norm = raw_label.upper()
+                         if spec_norm in raw_norm or raw_norm in spec_norm:
+                             lbl = spec_label
+                             break
+                 if not lbl:
+                     lbl = schema_name
+             if red_txt not in seen[lbl]:
+                 seen[lbl].add(red_txt)
+                 collected[lbl].append(red_txt)
+     return {k: v for k, v in collected.items() if v}
+
+ def extract_red_text(input_doc):
+     # input_doc: docx.Document object or file path
+     if isinstance(input_doc, str):
+         doc = Document(input_doc)
+     else:
+         doc = input_doc
+     out = {}
+     table_count = 0
+     for tbl in doc.tables:
+         table_count += 1
+         multi_schemas = check_multi_schema_table(tbl)
+         if multi_schemas:
+             multi_data = extract_multi_schema_table(tbl, multi_schemas)
+             for schema_name, schema_data in multi_data.items():
+                 if schema_data:
+                     if schema_name in out:
+                         for k, v in schema_data.items():
+                             if k in out[schema_name]:
+                                 out[schema_name][k].extend(v)
+                             else:
+                                 out[schema_name][k] = v
+                     else:
+                         out[schema_name] = schema_data
+             continue
+         schema = match_table_schema(tbl)
+         if not schema:
+             continue
+         spec = TABLE_SCHEMAS[schema]
+         data = extract_table_data(tbl, schema, spec)
+         if data:
+             if schema in out:
+                 for k, v in data.items():
+                     if k in out[schema]:
+                         out[schema][k].extend(v)
+                     else:
+                         out[schema][k] = v
+             else:
+                 out[schema] = data
      paras = {}
+     for idx, para in enumerate(doc.paragraphs):
+         red_txt = "".join(r.text for r in para.runs if is_red_font(r)).strip()
+         if not red_txt:
+             continue
+         context = None
+         for j in range(idx-1, -1, -1):
+             txt = normalize_text(doc.paragraphs[j].text)
              if txt:
+                 all_patterns = HEADING_PATTERNS["main"] + HEADING_PATTERNS["sub"]
+                 if any(re.search(p, txt, re.IGNORECASE) for p in all_patterns):
+                     context = txt
+                     break
+         if not context and re.fullmatch(PARAGRAPH_PATTERNS["date_line"], red_txt):
+             context = "Date"
+         if not context:
+             context = "(para)"
+         paras.setdefault(context, []).append(red_txt)
      if paras:
          out["paragraphs"] = paras
      return out

+ def extract_red_text_filelike(input_file, output_file):
+     """
+     Accepts:
+         input_file: file-like object (BytesIO/File) or path
+         output_file: file-like object (opened for writing text) or path
+     """
+     if hasattr(input_file, "seek"):
+         input_file.seek(0)
+     doc = Document(input_file)
+     result = extract_red_text(doc)
+     if hasattr(output_file, "write"):
+         json.dump(result, output_file, indent=2, ensure_ascii=False)
+         output_file.flush()
+     else:
+         with open(output_file, "w", encoding="utf-8") as f:
+             json.dump(result, f, indent=2, ensure_ascii=False)
+     return result
+
+ if __name__ == "__main__":
+     # Support both script and app/file-like usage
+     if len(sys.argv) == 3:
+         input_docx = sys.argv[1]
+         output_json = sys.argv[2]
+         doc = Document(input_docx)
+         word_data = extract_red_text(doc)
+         with open(output_json, 'w', encoding='utf-8') as f:
+             json.dump(word_data, f, indent=2, ensure_ascii=False)
+         print(json.dumps(word_data, indent=2, ensure_ascii=False))
+     else:
+         print("To use as a module: extract_red_text_filelike(input_file, output_file)")