File size: 20,283 Bytes
e8b46b5
24ad2d2
 
 
 
 
 
1055fe1
 
 
e8b46b5
 
24ad2d2
 
1055fe1
e8b46b5
24ad2d2
 
 
 
 
 
 
 
 
 
 
 
 
e8b46b5
24ad2d2
 
 
1055fe1
 
 
 
 
 
 
 
 
24ad2d2
 
 
1055fe1
 
24ad2d2
 
 
1055fe1
24ad2d2
 
 
 
 
 
 
1055fe1
e8b46b5
1055fe1
 
 
24ad2d2
 
 
1055fe1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24ad2d2
 
 
1055fe1
1487325
1055fe1
 
24ad2d2
 
1487325
 
 
 
 
24ad2d2
1487325
 
24ad2d2
1487325
24ad2d2
 
c4e2e43
1487325
c4e2e43
24ad2d2
c4e2e43
1487325
c4e2e43
24ad2d2
1487325
c4e2e43
 
 
 
 
 
24ad2d2
1487325
c4e2e43
 
 
 
 
 
 
1487325
c4e2e43
24ad2d2
c4e2e43
1055fe1
 
 
24ad2d2
c4e2e43
1055fe1
 
24ad2d2
1055fe1
 
 
24ad2d2
1487325
c4e2e43
 
 
 
 
 
 
1487325
c4e2e43
 
1487325
c4e2e43
24ad2d2
c4e2e43
1055fe1
 
 
 
 
 
 
 
 
24ad2d2
 
1055fe1
 
 
 
 
 
1487325
24ad2d2
1055fe1
24ad2d2
1055fe1
24ad2d2
 
1055fe1
 
 
 
 
 
 
24ad2d2
1055fe1
 
1487325
1055fe1
24ad2d2
1055fe1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24ad2d2
 
 
1055fe1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24ad2d2
 
 
1055fe1
1487325
24ad2d2
 
1487325
 
 
 
 
24ad2d2
1487325
 
 
24ad2d2
1487325
 
24ad2d2
1487325
24ad2d2
1487325
 
 
 
24ad2d2
1487325
24ad2d2
1487325
 
24ad2d2
1487325
 
 
 
 
24ad2d2
1487325
 
24ad2d2
1487325
 
 
 
24ad2d2
1487325
 
24ad2d2
1487325
 
 
 
 
24ad2d2
1487325
24ad2d2
1487325
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24ad2d2
 
 
1055fe1
 
24ad2d2
1055fe1
 
24ad2d2
1055fe1
 
 
 
 
 
24ad2d2
1055fe1
e8b46b5
1055fe1
 
 
 
24ad2d2
1055fe1
 
 
 
24ad2d2
1055fe1
 
 
 
 
 
 
 
 
 
 
 
24ad2d2
 
 
1055fe1
24ad2d2
 
 
 
1055fe1
 
 
 
 
 
24ad2d2
1055fe1
 
24ad2d2
1055fe1
 
 
 
 
 
 
 
 
 
 
 
 
 
24ad2d2
1055fe1
 
24ad2d2
1055fe1
 
 
 
 
 
 
 
 
 
 
 
24ad2d2
 
e8b46b5
1055fe1
 
 
 
24ad2d2
 
1055fe1
 
 
e8b46b5
1055fe1
 
 
 
24ad2d2
 
1055fe1
 
24ad2d2
1055fe1
 
 
24ad2d2
e8b46b5
 
 
 
24ad2d2
 
 
1055fe1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24ad2d2
 
 
1055fe1
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
#!/usr/bin/env python3
"""
extract_red_text.py
Improved version that reuses hf_utils for shared heuristics while preserving
the original schema logic, logging and behavior.
"""

import re
import json
import sys
from docx import Document
from docx.oxml.ns import qn

# master schema & patterns (unchanged)
from master_key import TABLE_SCHEMAS, HEADING_PATTERNS, PARAGRAPH_PATTERNS

# canonical helpers (from your new hf_utils.py)
from hf_utils import (
    is_red_font,
    normalize_text,
    normalize_header_text,
    flatten_json,
    find_matching_json_key_and_value,
    get_clean_text,
    has_red_text,
    extract_red_text_segments,
    replace_red_text_in_cell,
    key_is_forbidden_for_position,
)

# -------------------------------------------------------------------
# Small XML helper (kept exactly as before — low-level)
# -------------------------------------------------------------------
def _prev_para_text(tbl):
    """Return the stripped text of the closest paragraph sibling before *tbl*.

    Walks backwards through the previous siblings of ``tbl._tbl`` until an
    element whose tag ends in ``}p`` is found, then joins the text of every
    descendant whose tag ends in ``}t``. Returns ``""`` when no such sibling
    exists.
    """
    sibling = tbl._tbl.getprevious()
    while sibling is not None:
        if sibling.tag.endswith("}p"):
            fragments = []
            for node in sibling.iter():
                # Only text-bearing ``t`` elements with actual content count.
                if node.tag.endswith("}t") and node.text:
                    fragments.append(node.text)
            return "".join(fragments).strip()
        sibling = sibling.getprevious()
    return ""

# -------------------------------------------------------------------
# Table context helpers (use normalize_text from hf_utils)
# -------------------------------------------------------------------
def fuzzy_match_heading(heading, patterns):
    """Return True when *heading* matches at least one entry of *patterns*.

    Args:
        heading: Heading text to test; a falsy value never matches.
        patterns: Iterable of regular-expression strings. An entry that is not
            a valid regular expression is compared as a plain,
            case-insensitive substring instead.

    Returns:
        bool: True on the first matching pattern, False otherwise.
    """
    if not heading:
        return False
    heading_norm = normalize_text(heading).upper()
    for pattern in patterns:
        # An empty pattern matches *every* string under re.search, so a
        # schema entry with no heading text would match any heading at all.
        if not pattern:
            continue
        try:
            if re.search(pattern, heading_norm, re.IGNORECASE):
                return True
        except re.error:
            # Fall back to a simple substring test when the pattern is not a
            # valid regular expression.
            if pattern.upper() in heading_norm:
                return True
    return False

def get_table_context(tbl):
    """Collect the normalized text used to score *tbl* against the schemas.

    Args:
        tbl: Table object exposing ``rows``; each row exposes ``cells`` and
            each cell exposes ``text``.

    Returns:
        dict with the keys ``heading`` (normalized text of the paragraph before
        the table), ``headers`` (non-blank first-row cells), ``col0``
        (non-blank first cell of every row), ``first_cell``, ``all_cells``
        (every non-empty normalized cell), ``num_rows`` and ``num_cols``.
    """
    heading = normalize_text(_prev_para_text(tbl))

    # Snapshot rows and cells once so the ``rows``/``cells`` properties are
    # not re-evaluated for every lookup below.
    row_cells = [list(row.cells) for row in tbl.rows]
    first_row = row_cells[0] if row_cells else []

    headers = [normalize_text(c.text) for c in first_row if c.text.strip()]
    col0 = [
        normalize_text(cells[0].text)
        for cells in row_cells
        if cells and cells[0].text.strip()
    ]
    # Guarded like ``col0``: a first row without cells yields "" instead of
    # raising IndexError.
    first_cell = normalize_text(first_row[0].text) if first_row else ""

    normalized_cells = (normalize_text(c.text) for cells in row_cells for c in cells)
    all_cells = [text for text in normalized_cells if text]

    return {
        'heading': heading,
        'headers': headers,
        'col0': col0,
        'first_cell': first_cell,
        'all_cells': all_cells,
        'num_rows': len(row_cells),
        'num_cols': len(first_row),
    }

# -------------------------------------------------------------------
# Scoring / matching logic (kept your behavior but using normalize_text)
# -------------------------------------------------------------------
def calculate_schema_match_score(schema_name, spec, context):
    """Score how well the table described by *context* fits one schema.

    Args:
        schema_name: Name of the candidate schema.
        spec: Schema definition. The optional keys read here are
            ``context_exclusions``, ``context_keywords``, ``headings``,
            ``columns``, ``orientation`` and ``labels``.
        context: Dict with the keys ``heading``, ``headers``, ``col0``,
            ``first_cell`` and ``all_cells``.

    Returns:
        tuple: ``(score, reasons)`` where ``score`` is a number (it may be a
        float once label ratios contribute) and ``reasons`` is a list of
        human-readable strings, one per adjustment, in the order applied.
    """
    score = 0
    reasons = []

    headers = context['headers']
    heading = context['heading']
    first_cell = context['first_cell']

    # Loop-invariant views of the header row, computed once.
    headers_joined = " ".join(headers)
    headers_lower = headers_joined.lower()
    headers_joined_upper = headers_joined.upper()
    headers_upper = [h.upper() for h in headers]
    table_text = headers_lower + " " + heading.lower()

    # VEHICLE REGISTRATION BOOST
    if "Vehicle Registration" in schema_name:
        vehicle_keywords = (
            "registration",
            "vehicle",
            "sub-contractor",
            "weight verification",
            "rfs suspension",
        )
        keyword_matches = sum(1 for keyword in vehicle_keywords if keyword in table_text)
        if keyword_matches >= 2:
            score += 150
            reasons.append(
                f"Vehicle Registration keywords: {keyword_matches}/{len(vehicle_keywords)}"
            )
        elif keyword_matches >= 1:
            score += 75
            reasons.append(
                f"Some Vehicle Registration keywords: {keyword_matches}/{len(vehicle_keywords)}"
            )

    # SUMMARY TABLE BOOST: a DETAILS column favours summary schemas and
    # penalizes every other schema.
    if "details" in headers_lower:
        if "Summary" in schema_name:
            score += 100
            reasons.append("Summary schema with DETAILS column - perfect match")
        else:
            score -= 75
            reasons.append("Non-summary schema penalized for DETAILS column presence")

    # Context exclusions: every excluded phrase that is present costs 50.
    for exclusion in spec.get("context_exclusions") or []:
        if exclusion.lower() in table_text:
            score -= 50
            reasons.append(f"Context exclusion penalty: '{exclusion}' found")

    # Context keywords: 15 points per keyword present.
    context_keywords = spec.get("context_keywords") or []
    if context_keywords:
        keyword_matches = sum(1 for keyword in context_keywords if keyword.lower() in table_text)
        if keyword_matches > 0:
            score += keyword_matches * 15
            reasons.append(f"Context keyword matches: {keyword_matches}/{len(context_keywords)}")

    # Direct first cell match
    if first_cell and first_cell.upper() == schema_name.upper():
        score += 100
        reasons.append(f"Direct first cell match: '{first_cell}'")

    # Heading pattern matching (only the first matching heading counts).
    for heading_spec in spec.get("headings") or []:
        pattern = heading_spec.get("text", "")
        # A heading spec without text would otherwise act as an empty regex,
        # which matches every heading.
        if pattern and fuzzy_match_heading(heading, [pattern]):
            score += 50
            reasons.append(f"Heading match: '{heading}'")
            break

    # Column header matching
    if spec.get("columns"):
        cols = [normalize_text(col) for col in spec["columns"]]
        matches = sum(1 for col in cols if any(col.upper() in h for h in headers_upper))
        if matches == len(cols):
            score += 60
            reasons.append(f"All column headers match: {cols}")
        elif matches > 0:
            score += matches * 20
            reasons.append(f"Partial column matches: {matches}/{len(cols)}")

    orientation = spec.get("orientation")

    # Label matching for left-oriented tables
    if orientation == "left":
        # ``.get`` keeps a schema without labels from raising KeyError.
        labels = [normalize_text(lbl) for lbl in spec.get("labels") or []]
        col0_upper = [c.upper() for c in context['col0']]
        matches = 0
        for lbl in labels:
            lbl_upper = lbl.upper()
            if any(lbl_upper in c or c in lbl_upper for c in col0_upper):
                matches += 1
        if matches > 0:
            score += (matches / len(labels)) * 30
            reasons.append(f"Left orientation label matches: {matches}/{len(labels)}")

    # Enhanced label matching for row1-oriented tables (Vehicle Registration)
    elif orientation == "row1":
        labels = [normalize_text(lbl) for lbl in spec.get("labels") or []]
        matches = 0
        for lbl in labels:
            lbl_upper = lbl.upper()
            if any(lbl_upper in h or h in lbl_upper for h in headers_upper):
                matches += 1
            elif any(
                word.upper() in headers_joined_upper
                for word in lbl.split()
                if len(word) > 3
            ):
                # A label that only shares a longer word with the headers
                # counts as half a match.
                matches += 0.5
        if matches > 0:
            score += (matches / len(labels)) * 40
            reasons.append(f"Row1 orientation header matches: {matches}/{len(labels)}")

    # Special handling for Declaration tables
    starts_with_print_name = first_cell.upper() == "PRINT NAME"

    if schema_name == "Operator Declaration" and starts_with_print_name:
        if "OPERATOR DECLARATION" in heading.upper():
            score += 80
            reasons.append("Operator Declaration context match")
        elif any("MANAGER" in cell.upper() for cell in context['all_cells']):
            score += 60
            reasons.append("Manager found in cells (likely Operator Declaration)")

    if schema_name == "NHVAS Approved Auditor Declaration" and starts_with_print_name:
        if any("MANAGER" in cell.upper() for cell in context['all_cells']):
            score -= 50
            reasons.append("Penalty: Manager found (not auditor)")

    return score, reasons

def match_table_schema(tbl):
    """Pick the schema from ``TABLE_SCHEMAS`` that scores highest for *tbl*.

    Returns the winning schema name, or ``None`` when the best score is
    below 20. On a tie the schema encountered first wins.
    """
    context = get_table_context(tbl)
    winner, top_score = None, 0
    for schema_name, schema_spec in TABLE_SCHEMAS.items():
        # Only the numeric score matters here; the reasons list is ignored.
        candidate = calculate_schema_match_score(schema_name, schema_spec, context)[0]
        if candidate > top_score:
            winner, top_score = schema_name, candidate
    return winner if top_score >= 20 else None

# -------------------------------------------------------------------
# Multi-schema detection & extraction (kept behavior)
# -------------------------------------------------------------------
def check_multi_schema_table(tbl):
    """Detect a table that mixes operator-information and contact rows.

    Returns ``["Operator Information", "Operator contact details"]`` when the
    first column contains at least one label from each group (compared as
    case-insensitive substrings), otherwise ``None``.
    """
    operator_labels = [
        "Operator name (Legal entity)",
        "NHVAS Accreditation No.",
        "Registered trading name/s",
        "Australian Company Number",
        "NHVAS Manual",
    ]
    contact_labels = [
        "Operator business address",
        "Operator Postal address",
        "Email address",
        "Operator Telephone Number",
    ]
    first_column = get_table_context(tbl)['col0']

    def _column_mentions(label_group):
        # True when any first-column cell contains any label of the group.
        for cell_text in first_column:
            cell_upper = cell_text.upper()
            for label in label_group:
                if label.upper() in cell_upper:
                    return True
        return False

    if _column_mentions(operator_labels) and _column_mentions(contact_labels):
        return ["Operator Information", "Operator contact details"]
    return None

def extract_multi_schema_table(tbl, schemas):
    """Pull red text out of a table whose rows belong to several schemas.

    For each name in *schemas* that exists in ``TABLE_SCHEMAS``, every row
    after the first is assigned to the first schema label that equals,
    contains, or is contained in the row's normalized first cell
    (case-insensitive). Red text found in any cell of that row is stored under
    the label, without duplicates.

    Returns ``{schema_name: {label: [red_text, ...]}}``; schemas with no red
    text are left out.
    """
    extracted = {}
    for schema_name in schemas:
        if schema_name not in TABLE_SCHEMAS:
            continue
        spec = TABLE_SCHEMAS[schema_name]
        by_label = {}
        for row_index, row in enumerate(tbl.rows):
            if row_index == 0:
                # The first row is never treated as data.
                continue
            row_norm = normalize_text(row.cells[0].text).upper()
            # NOTE(review): if the normalized row label is empty it is a
            # substring of every spec label, so such rows attach to the first
            # label — confirm this is intended.
            for spec_label in spec["labels"]:
                spec_norm = normalize_text(spec_label).upper()
                if spec_norm in row_norm or row_norm in spec_norm:
                    break
            else:
                # No label of this schema matches the row: skip the row.
                continue
            for cell in row.cells:
                red_runs = [
                    run.text
                    for para in cell.paragraphs
                    for run in para.runs
                    if is_red_font(run)
                ]
                red_txt = "".join(red_runs).strip()
                if not red_txt:
                    continue
                values = by_label.setdefault(spec_label, [])
                if red_txt not in values:
                    values.append(red_txt)
        if by_label:
            extracted[schema_name] = by_label
    return extracted

# -------------------------------------------------------------------
# Table extraction for schemas (kept your specialized vehicle handling)
# -------------------------------------------------------------------
def _cell_red_text(cell):
    """Return the concatenated, stripped text of every red-font run in *cell*."""
    return "".join(
        run.text for para in cell.paragraphs for run in para.runs if is_red_font(run)
    ).strip()


def _map_vehicle_columns(header_cells, labels):
    """Map header column indexes to the best-matching label, logging each step."""
    column_mapping = {}
    print(f"    📋 Mapping {len(header_cells)} header cells to labels")

    for col_idx, cell in enumerate(header_cells):
        header_text = normalize_text(cell.text).strip()
        if not header_text:
            continue

        print(f"      Column {col_idx}: '{header_text}'")

        header_upper = header_text.upper()
        header_words = {word.upper() for word in header_text.split() if len(word) > 2}
        best_match = None
        best_score = 0

        for label in labels:
            # An exact (case-insensitive) match wins immediately.
            if header_upper == label.upper():
                best_match = label
                best_score = 1.0
                break

            # Otherwise score by shared words longer than two characters;
            # candidates below 0.4 are rejected.
            label_words = {word.upper() for word in label.split() if len(word) > 2}
            common_words = header_words & label_words
            if common_words:
                score = len(common_words) / max(len(header_words), len(label_words))
                if score > best_score and score >= 0.4:
                    best_score = score
                    best_match = label

        if best_match:
            column_mapping[col_idx] = best_match
            print(f"        ✅ Mapped to: '{best_match}' (score: {best_score:.2f})")
        else:
            print(f"        ⚠️ No mapping found for '{header_text}'")

    print(f"    📊 Total column mappings: {len(column_mapping)}")
    return column_mapping


def _extract_vehicle_registration(tbl, labels):
    """Extract red text per mapped header column from a vehicle table."""
    print("    🚗 EXTRACTION FIX: Processing Vehicle Registration table")
    collected = {lbl: [] for lbl in labels}
    seen = {lbl: set() for lbl in labels}

    rows = list(tbl.rows)
    if len(rows) < 2:
        print("    ❌ Vehicle table has less than 2 rows")
        return {}

    column_mapping = _map_vehicle_columns(list(rows[0].cells), labels)

    # Extract red text from data rows (skip header)
    for row_idx, row in enumerate(rows[1:], start=1):
        print(f"      📌 Processing data row {row_idx}")
        for col_idx, cell in enumerate(row.cells):
            label = column_mapping.get(col_idx)
            if label is None:
                continue
            red_txt = _cell_red_text(cell)
            if not red_txt:
                continue
            print(f"        🔴 Found red text in '{label}': '{red_txt}'")
            if red_txt not in seen[label]:
                seen[label].add(red_txt)
                collected[label].append(red_txt)

    result = {k: v for k, v in collected.items() if v}
    print(f"    ✅ Vehicle Registration extracted: {len(result)} columns with data")
    return result


def _resolve_row_label(raw_text, spec_labels, fallback):
    """Resolve a row's first-cell text to a spec label, else *fallback*."""
    raw_norm = normalize_text(raw_text).upper()
    normalized = [(lbl, normalize_text(lbl).upper()) for lbl in spec_labels]
    # Exact matches take priority over substring matches.
    match = next((lbl for lbl, norm in normalized if norm == raw_norm), None)
    if not match:
        # NOTE(review): an empty normalized row label is a substring of every
        # spec label, so it resolves to the first label — confirm intended.
        match = next(
            (lbl for lbl, norm in normalized if norm in raw_norm or raw_norm in norm),
            None,
        )
    return match or fallback


def extract_table_data(tbl, schema_name, spec):
    """Extract red text from *tbl* according to the matched schema.

    Args:
        tbl: Table object exposing ``rows``, whose rows expose ``cells``.
        schema_name: Name of the matched schema. Names containing
            ``"Vehicle Registration"`` use header-to-label column mapping and
            print progress diagnostics.
        spec: Schema definition; ``labels`` and ``orientation`` are read.

    Returns:
        dict: ``{label: [red_text, ...]}`` holding only labels that received
        text, with duplicates per label removed. For non-vehicle schemas, text
        that cannot be tied to a label is stored under *schema_name*.
    """
    spec_labels = spec.get("labels", [])

    # Special handling for vehicle registration tables
    if "Vehicle Registration" in schema_name:
        return _extract_vehicle_registration(tbl, spec_labels)

    # FALLBACK: original extraction logic for other tables
    labels = list(spec_labels) + [schema_name]
    collected = {lbl: [] for lbl in labels}
    seen = {lbl: set() for lbl in labels}
    by_col = spec.get("orientation") == "row1"
    start_row = 1 if by_col else 0

    for row in list(tbl.rows)[start_row:]:
        # Read the cells once per row rather than once per lookup.
        cells = row.cells
        row_label = None  # resolved lazily, at most once per row
        for ci, cell in enumerate(cells):
            red_txt = _cell_red_text(cell)
            if not red_txt:
                continue
            if by_col:
                # Columns map positionally onto the labels; extra columns
                # fall back to the schema name.
                lbl = spec_labels[ci] if ci < len(spec_labels) else schema_name
            else:
                if row_label is None:
                    row_label = _resolve_row_label(cells[0].text, spec_labels, schema_name)
                lbl = row_label
            if red_txt not in seen[lbl]:
                seen[lbl].add(red_txt)
                collected[lbl].append(red_txt)
    return {k: v for k, v in collected.items() if v}

# -------------------------------------------------------------------
# Main extraction: iterate tables & paragraphs
# -------------------------------------------------------------------
def _merge_schema_data(out, schema_name, data):
    """Merge one table's ``{label: [values]}`` mapping into ``out[schema_name]``."""
    if not data:
        return
    if schema_name not in out:
        out[schema_name] = data
        return
    target = out[schema_name]
    for label, values in data.items():
        if label in target:
            target[label].extend(values)
        else:
            target[label] = values


def _collect_red_paragraphs(paragraphs):
    """Group the red text of body paragraphs under their nearest heading."""
    heading_patterns = HEADING_PATTERNS["main"] + HEADING_PATTERNS["sub"]
    grouped = {}
    current_heading = None  # latest preceding paragraph matching a heading pattern
    for para in paragraphs:
        red_txt = "".join(r.text for r in para.runs if is_red_font(r)).strip()
        if red_txt:
            context = current_heading
            # With no heading above, a date-like line is filed under "Date".
            if not context and re.fullmatch(PARAGRAPH_PATTERNS["date_line"], red_txt):
                context = "Date"
            grouped.setdefault(context or "(para)", []).append(red_txt)

        # Update the heading only after handling this paragraph, so a
        # paragraph is never its own context.
        txt = normalize_text(para.text)
        if txt and any(re.search(p, txt, re.IGNORECASE) for p in heading_patterns):
            current_heading = txt
    return grouped


def extract_red_text(input_doc):
    """Extract all red text from a document, grouped by schema and heading.

    Args:
        input_doc: A document object exposing ``tables`` and ``paragraphs``,
            or a file path (``str`` or ``os.PathLike``) to open with
            ``Document``.

    Returns:
        dict: ``{schema_name: {label: [red_text, ...]}}`` for every table that
        matched a schema, plus a ``"paragraphs"`` entry mapping a heading (or
        ``"Date"`` / ``"(para)"``) to the red text found in body paragraphs.
    """
    import os  # local import: only needed for the path check below

    # pathlib paths are file paths too, not document objects.
    if isinstance(input_doc, (str, os.PathLike)):
        doc = Document(input_doc)
    else:
        doc = input_doc
    out = {}

    for tbl in doc.tables:
        # Check multi-schema table first
        multi_schemas = check_multi_schema_table(tbl)
        if multi_schemas:
            multi_data = extract_multi_schema_table(tbl, multi_schemas)
            for schema_name, schema_data in multi_data.items():
                _merge_schema_data(out, schema_name, schema_data)
            continue

        schema = match_table_schema(tbl)
        if not schema:
            # keep scanning for tables even if no schema matched
            continue
        _merge_schema_data(out, schema, extract_table_data(tbl, schema, TABLE_SCHEMAS[schema]))

    # Paragraphs: read the list once and resolve heading context in a single
    # forward pass instead of re-reading ``doc.paragraphs`` in a backward scan.
    paragraphs = list(doc.paragraphs)
    paras = _collect_red_paragraphs(paragraphs) if paragraphs else {}

    if paras:
        out["paragraphs"] = paras
    return out

# -------------------------------------------------------------------
# File-like wrapper (keeps API used elsewhere)
# -------------------------------------------------------------------
def extract_red_text_filelike(input_file, output_file):
    """Run the extraction on *input_file* and write the result as JSON.

    Args:
        input_file: File-like object (rewound first when it has ``seek``) or
            a path accepted by ``Document``.
        output_file: Object with a ``write`` method, or a path that is opened
            for writing as UTF-8 text.

    Returns:
        dict: The extracted data, identical to what was serialized.
    """
    if hasattr(input_file, "seek"):
        input_file.seek(0)
    extracted = extract_red_text(Document(input_file))

    dump_options = {"indent": 2, "ensure_ascii": False}
    if not hasattr(output_file, "write"):
        # Treat anything without ``write`` as a filesystem path.
        with open(output_file, "w", encoding="utf-8") as handle:
            json.dump(extracted, handle, **dump_options)
        return extracted

    json.dump(extracted, output_file, **dump_options)
    output_file.flush()
    return extracted

# -------------------------------------------------------------------
# CLI entrypoint (preserve original UX)
# -------------------------------------------------------------------
if __name__ == "__main__":
    # Usage: extract_red_text.py <input.docx> <output.json>
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print("To use as a module: extract_red_text_filelike(input_file, output_file)")
    else:
        source_path, target_path = cli_args
        extracted = extract_red_text(Document(source_path))
        with open(target_path, 'w', encoding='utf-8') as handle:
            json.dump(extracted, handle, indent=2, ensure_ascii=False)
        # Echo the same JSON to stdout.
        print(json.dumps(extracted, indent=2, ensure_ascii=False))