Shami96 committed on
Commit 704d2a2 · verified · 1 Parent(s): 9880bcc

Update extract_red_text.py

Files changed (1):
  extract_red_text.py (+316 -577)
extract_red_text.py CHANGED
@@ -2,601 +2,340 @@
  """
  extract_red_text.py
  """
- import re
  import json
  import sys
- from io import BytesIO
- from docx import Document
- from docx.oxml.ns import qn
-
- # Import schema constants (TABLE_SCHEMAS, HEADING_PATTERNS, PARAGRAPH_PATTERNS, GLOBAL_SETTINGS)
- # Ensure master_key.py is present in same dir / importable path
- from master_key import TABLE_SCHEMAS, HEADING_PATTERNS, PARAGRAPH_PATTERNS, GLOBAL_SETTINGS
-

- def is_red_font(run):
      """
-     Robust detection of 'red' font in a run.
-     Tries several sources:
-       - python-docx run.font.color.rgb (safe-guarded)
-       - raw XML rPr/w:color value (hex)
-     Returns True if color appears predominantly red.
      """
-     # Quick guard
-     if run is None:
-         return False

-     # 1) Try docx high-level color API if available
      try:
-         col = getattr(run.font, "color", None)
-         if col is not None:
-             rgb_val = getattr(col, "rgb", None)
-             if rgb_val:
-                 # rgb_val might be an RGBColor object or a tuple/list or hex-string
-                 try:
-                     # If it's sequence-like (tuple/list) with 3 ints
-                     if isinstance(rgb_val, (tuple, list)) and len(rgb_val) == 3:
-                         rr, gg, bb = rgb_val
-                     else:
-                         # Try string representation like 'FF0000' or 'ff0000'
-                         hexstr = str(rgb_val).strip()
-                         if re.fullmatch(r"[0-9A-Fa-f]{6}", hexstr):
-                             rr, gg, bb = int(hexstr[0:2], 16), int(hexstr[2:4], 16), int(hexstr[4:6], 16)
-                         else:
-                             # unknown format - fall through to XML check
-                             rr = gg = bb = None
-                     if rr is not None:
-                         # Heuristic thresholds for 'red-ish'
-                         if rr > 150 and gg < 120 and bb < 120 and (rr - gg) > 30 and (rr - bb) > 30:
-                             return True
-                 except Exception:
-                     # fall back to rPr introspection below
-                     pass
      except Exception:
-         # ignore and continue to XML method
          pass
-
-     # 2) Inspect raw XML run properties for <w:color w:val="RRGGBB" />
      try:
-         rPr = getattr(run._element, "rPr", None)
-         if rPr is not None:
-             clr = rPr.find(qn('w:color'))
-             if clr is not None:
-                 val = clr.get(qn('w:val')) or clr.get('w:val') or clr.get('val')
-                 if val and isinstance(val, str):
-                     val = val.strip()
-                     # sometimes color is provided as 'FF0000' hex or shorthand
-                     if re.fullmatch(r"[0-9A-Fa-f]{6}", val):
-                         rr, gg, bb = int(val[0:2], 16), int(val[2:4], 16), int(val[4:6], 16)
-                         if rr > 150 and gg < 120 and bb < 120 and (rr - gg) > 30 and (rr - bb) > 30:
-                             return True
      except Exception:
          pass
-
-     return False
-
-
- def _prev_para_text(tbl):
-     """Return text of previous paragraph node before a given table element."""
-     prev = tbl._tbl.getprevious()
-     while prev is not None and not prev.tag.endswith("}p"):
-         prev = prev.getprevious()
-     if prev is None:
-         return ""
-     # gather all text nodes under the paragraph element
-     return "".join(node.text for node in prev.iter() if node.tag.endswith("}t") and node.text).strip()
-
-
- def normalize_text(text):
-     """Normalize text for more reliable matching (collapse whitespace)."""
-     if text is None:
-         return ""
-     return re.sub(r'\s+', ' ', text.strip())
-
-
- def fuzzy_match_heading(heading, patterns):
-     """
-     Attempt fuzzy matching of heading against regex patterns.
-     patterns is a list of pattern dicts or strings.
-     """
-     heading_norm = normalize_text(heading.upper())
-     for p in patterns:
-         if isinstance(p, dict):
-             pat = p.get("text", "")
-         else:
-             pat = p
-         try:
-             if re.search(pat, heading_norm, re.IGNORECASE):
-                 return True
-         except re.error:
-             # treat as plain substring fallback
-             if pat and pat.upper() in heading_norm:
-                 return True
      return False

-
- def get_table_context(tbl):
-     """Return context metadata for a table to aid schema matching."""
-     heading = normalize_text(_prev_para_text(tbl))
-     headers = []
-     if tbl.rows:
-         # collect header text of first row, keeping cell order
-         headers = [normalize_text(c.text) for c in tbl.rows[0].cells]
-     col0 = [normalize_text(r.cells[0].text) for r in tbl.rows if r.cells and r.cells[0].text.strip()]
-     first_cell = normalize_text(tbl.rows[0].cells[0].text) if tbl.rows else ""
-     all_cells = []
-     for row in tbl.rows:
-         for cell in row.cells:
-             text = normalize_text(cell.text)
-             if text:
-                 all_cells.append(text)
-     return {
-         'heading': heading,
-         'headers': headers,
-         'col0': col0,
-         'first_cell': first_cell,
-         'all_cells': all_cells,
-         'num_rows': len(tbl.rows),
-         'num_cols': len(tbl.rows[0].cells) if tbl.rows else 0
-     }
-
-
- def calculate_schema_match_score(schema_name, spec, context):
-     """
-     Return (score, reasons[]) for how well a table context matches a schema.
-     Heuristic-based scoring; vehicle registration and 'DETAILS' summary boosts added.
-     """
-     score = 0
-     reasons = []
-
-     table_text = " ".join(context.get('headers', [])).lower() + " " + context.get('heading', "").lower()
-
-     # Vehicle Registration specific boost
-     if "Vehicle Registration" in schema_name:
-         vehicle_keywords = ["registration", "vehicle", "sub-contractor", "weight verification", "rfs suspension", "roadworthiness"]
-         keyword_matches = sum(1 for kw in vehicle_keywords if kw in table_text)
-         if keyword_matches >= 2:
-             score += 150
-             reasons.append(f"Vehicle keywords matched: {keyword_matches}")
-         elif keyword_matches >= 1:
-             score += 75
-             reasons.append(f"Some vehicle keywords matched: {keyword_matches}")
-
-     # Summary DETAILS boost
-     if "Summary" in schema_name and "details" in table_text:
-         score += 100
-         reasons.append("Summary with DETAILS found")
-
-     if "Summary" not in schema_name and "details" in table_text:
-         score -= 75
-         reasons.append("Non-summary schema penalized due to DETAILS column presence")
-
-     # Context exclusions
-     for exclusion in spec.get("context_exclusions", []):
-         if exclusion.lower() in table_text:
-             score -= 50
-             reasons.append(f"Context exclusion: {exclusion}")
-
-     # Context keywords positive matches
-     kw_count = 0
-     for kw in spec.get("context_keywords", []):
-         if kw.lower() in table_text:
-             kw_count += 1
-     if kw_count:
-         score += kw_count * 15
-         reasons.append(f"Context keywords matched: {kw_count}")
-
-     # First-cell exact match
-     if context.get('first_cell') and context['first_cell'].upper() == schema_name.upper():
-         score += 100
-         reasons.append("Exact first cell match")
-
-     # Heading pattern match
-     for h in spec.get("headings", []) or []:
-         pat = h.get("text") if isinstance(h, dict) and h.get("text") else h
-         try:
-             if pat and re.search(pat, context.get('heading', ""), re.IGNORECASE):
-                 score += 50
-                 reasons.append(f"Heading regex matched: {pat}")
-                 break
-         except re.error:
-             if pat and pat.lower() in context.get('heading', "").lower():
-                 score += 50
-                 reasons.append(f"Heading substring matched: {pat}")
-                 break
-
-     # Column header matching (strict)
-     if spec.get("columns"):
-         cols = [normalize_text(c) for c in spec["columns"]]
-         matches = 0
-         for col in cols:
-             if any(col.upper() in h.upper() for h in context.get('headers', [])):
-                 matches += 1
-         if matches == len(cols):
-             score += 60
-             reasons.append("All expected columns matched exactly")
-         elif matches > 0:
-             score += matches * 20
-             reasons.append(f"Partial column matches: {matches}/{len(cols)}")
-
-     # Label matching for left-oriented tables
-     if spec.get("orientation") == "left":
-         labels = [normalize_text(lbl) for lbl in spec.get("labels", [])]
-         matches = 0
-         for lbl in labels:
-             if any(lbl.upper() in c.upper() or c.upper() in lbl.upper() for c in context.get('col0', [])):
-                 matches += 1
-         if matches > 0:
-             score += (matches / max(1, len(labels))) * 30
-             reasons.append(f"Left-orientation label matches: {matches}/{len(labels)}")
-
-     # Row1 (header row) flexible matching
-     elif spec.get("orientation") == "row1":
-         labels = [normalize_text(lbl) for lbl in spec.get("labels", [])]
-         matches = 0.0
-         header_texts = " ".join(context.get('headers', [])).upper()
-         for lbl in labels:
-             label_upper = lbl.upper()
-             # exact in any header
-             if any(label_upper in h.upper() for h in context.get('headers', [])):
-                 matches += 1.0
-             else:
-                 # partial words from label in header_texts
-                 for word in label_upper.split():
-                     if len(word) > 3 and word in header_texts:
-                         matches += 0.5
-                         break
-         if matches > 0:
-             score += (matches / max(1.0, len(labels))) * 40
-             reasons.append(f"Row1 header-like matches: {matches}/{len(labels)}")
-
-     # Special handling for declaration schemas
-     if schema_name == "Operator Declaration":
-         # boost if 'print name' first cell and heading indicates operator declaration
-         if context.get('first_cell', "").upper().startswith("PRINT"):
-             if "OPERATOR DECLARATION" in context.get('heading', "").upper():
-                 score += 80
-                 reasons.append("Operator Declaration context & first-cell indicate match")
-             elif any("MANAGER" in c.upper() for c in context.get('all_cells', [])):
-                 score += 60
-                 reasons.append("Manager found in cells for Operator Declaration")
-
-     if schema_name == "NHVAS Approved Auditor Declaration":
-         if context.get('first_cell', "").upper().startswith("PRINT"):
-             # penalize where manager words appear (to reduce false positives)
-             if any("MANAGER" in c.upper() for c in context.get('all_cells', [])):
-                 score -= 50
-                 reasons.append("Penalty: found manager text in auditor declaration table")
-
-     return score, reasons
-
-
- def match_table_schema(tbl):
-     """
-     Iterate TABLE_SCHEMAS and pick best match by score threshold.
-     Returns schema name or None when below threshold.
-     """
-     context = get_table_context(tbl)
-     best_match = None
-     best_score = float("-inf")
-     for name, spec in TABLE_SCHEMAS.items():
-         try:
-             score, reasons = calculate_schema_match_score(name, spec, context)
-         except Exception:
-             score, reasons = 0, ["error computing score"]
-         if score > best_score:
-             best_score = score
-             best_match = name
-     # threshold to avoid spurious picks
-     if best_score >= 20:
-         return best_match
-     return None
-
-
- def check_multi_schema_table(tbl):
-     """
-     Identify tables that contain multiple logical schemas (e.g., Operator Information + Contact Details).
-     Return list of schema names if multi, else None.
-     """
-     context = get_table_context(tbl)
-     operator_labels = ["Operator name (Legal entity)", "NHVAS Accreditation No.", "Registered trading name/s",
-                        "Australian Company Number", "NHVAS Manual"]
-     contact_labels = ["Operator business address", "Operator Postal address", "Email address", "Operator Telephone Number"]
-     has_operator = any(any(op_lbl.upper() in cell.upper() for op_lbl in operator_labels) for cell in context.get('col0', []))
-     has_contact = any(any(cont_lbl.upper() in cell.upper() for cont_lbl in contact_labels) for cell in context.get('col0', []))
-     if has_operator and has_contact:
-         return ["Operator Information", "Operator contact details"]
-     return None
-
-
- def extract_multi_schema_table(tbl, schemas):
-     """
-     For tables that embed multiple schema sections vertically (left orientation), split and extract.
-     Returns a dict mapping schema_name -> {label: [values, ...]}
-     """
-     result = {}
-     for schema_name in schemas:
-         if schema_name not in TABLE_SCHEMAS:
-             continue
-         spec = TABLE_SCHEMAS[schema_name]
-         schema_data = {}
-         # iterate rows and match the left-most cell against spec labels
-         for ri, row in enumerate(tbl.rows):
-             if not row.cells:
-                 continue
-             row_label = normalize_text(row.cells[0].text)
-             belongs = False
-             matched_label = None
-             for spec_label in spec.get("labels", []):
-                 spec_norm = normalize_text(spec_label).upper()
-                 row_norm = row_label.upper()
-                 if spec_norm == row_norm or spec_norm in row_norm or row_norm in spec_norm:
-                     belongs = True
-                     matched_label = spec_label
-                     break
-             if not belongs:
-                 continue
-             # gather red-text from the row's value cells (all others)
-             for ci, cell in enumerate(row.cells[1:], start=1):
-                 red_txt = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red_font(run)).strip()
-                 if red_txt:
-                     schema_data.setdefault(matched_label, []).append(red_txt)
-         if schema_data:
-             result[schema_name] = schema_data
-     return result
-
-
- def extract_table_data(tbl, schema_name, spec):
-     """
-     Extract red text from a table for a given schema.
-     Special handling for Vehicle Registration (row1 header orientation).
-     """
-     # Vehicle Registration special-case (headers in first row)
-     if "Vehicle Registration" in schema_name:
-         print(f" 🚗 EXTRACTION FIX: Processing Vehicle Registration table")
-         labels = spec.get("labels", [])
-         collected = {lbl: [] for lbl in labels}
-         seen = {lbl: set() for lbl in labels}
-
-         if len(tbl.rows) < 2:
-             print(" ❌ Vehicle table has less than 2 rows; skipping")
-             return {}
-
-         header_row = tbl.rows[0]
-         column_mapping = {}
-         print(f" 📋 Mapping {len(header_row.cells)} header cells to labels")
-
-         for col_idx, cell in enumerate(header_row.cells):
-             header_text = normalize_text(cell.text).strip()
-             if not header_text:
-                 continue
-             print(f" Column {col_idx}: '{header_text}'")
-             best_match = None
-             best_score = 0.0
-
-             for label in labels:
-                 # exact match
-                 if header_text.upper() == label.upper():
-                     best_match = label
-                     best_score = 1.0
-                     break
-
-                 # partial token overlap scoring
-                 header_words = set(word.upper() for word in header_text.split() if len(word) > 2)
-                 label_words = set(word.upper() for word in label.split() if len(word) > 2)
-                 if header_words and label_words:
-                     common = header_words.intersection(label_words)
-                     if common:
-                         score = len(common) / max(len(header_words), len(label_words))
-                         if score > best_score and score >= 0.35:  # relaxed threshold for OCR noise
-                             best_score = score
-                             best_match = label
-
-             if best_match:
-                 column_mapping[col_idx] = best_match
-                 print(f" ✅ Mapped to: '{best_match}' (score: {best_score:.2f})")
-             else:
-                 # additional heuristics: simple substring matches
-                 for label in labels:
-                     if label.lower() in header_text.lower() or header_text.lower() in label.lower():
-                         column_mapping[col_idx] = label
-                         print(f" ✅ Mapped by substring to: '{label}'")
-                         break
                  else:
-                     print(f" ⚠️ No mapping found for '{header_text}'")
-
-         print(f" 📊 Total column mappings: {len(column_mapping)}")
-
-         # Extract data rows
-         for row_idx in range(1, len(tbl.rows)):
-             row = tbl.rows[row_idx]
-             print(f" 📌 Processing data row {row_idx}")
-             for col_idx, cell in enumerate(row.cells):
-                 if col_idx not in column_mapping:
-                     continue
-                 label = column_mapping[col_idx]
-                 red_txt = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red_font(run)).strip()
-                 if red_txt:
-                     print(f" 🔴 Found red text in '{label}': '{red_txt}'")
-                     if red_txt not in seen[label]:
-                         seen[label].add(red_txt)
-                         collected[label].append(red_txt)
-
-         result = {k: v for k, v in collected.items() if v}
-         print(f" ✅ Vehicle Registration extracted: {len(result)} columns with data")
-         return result
-
-     # Generic extraction for other table types
-     labels = spec.get("labels", []) + [schema_name]
-     collected = {lbl: [] for lbl in labels}
-     seen = {lbl: set() for lbl in labels}
-     by_col = (spec.get("orientation") == "row1")
-     start_row = 1 if by_col else 0
-     rows = tbl.rows[start_row:]
-
-     for ri, row in enumerate(rows):
-         for ci, cell in enumerate(row.cells):
-             red_txt = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red_font(run)).strip()
-             if not red_txt:
-                 continue
-
-             if by_col:
-                 # column-wise mapping (header labels)
-                 if ci < len(spec.get("labels", [])):
-                     lbl = spec["labels"][ci]
-                 else:
-                     lbl = schema_name
-             else:
-                 # left-oriented: match left label
-                 raw_label = normalize_text(row.cells[0].text)
-                 lbl = None
-                 for spec_label in spec.get("labels", []):
-                     if normalize_text(spec_label).upper() == raw_label.upper():
-                         lbl = spec_label
-                         break
-                 if not lbl:
-                     for spec_label in spec.get("labels", []):
-                         spec_norm = normalize_text(spec_label).upper()
-                         raw_norm = raw_label.upper()
-                         if spec_norm in raw_norm or raw_norm in spec_norm:
-                             lbl = spec_label
-                             break
-                 if not lbl:
-                     lbl = schema_name
-
-             if red_txt not in seen[lbl]:
-                 seen[lbl].add(red_txt)
-                 collected[lbl].append(red_txt)
-
-     return {k: v for k, v in collected.items() if v}
-
-
- def extract_red_text(input_doc):
-     """
-     Main extraction function.
-     Accepts a docx.Document object or a path string (filename).
-     Returns dictionary of extracted red-text organized by schema.
-     """
-     if isinstance(input_doc, str):
-         doc = Document(input_doc)
-     else:
-         doc = input_doc
-
-     out = {}
-     table_count = 0
-
-     for tbl in doc.tables:
-         table_count += 1
-         # Check for multi-schema tables first
-         multi_schemas = check_multi_schema_table(tbl)
-         if multi_schemas:
-             multi_data = extract_multi_schema_table(tbl, multi_schemas)
-             for schema_name, schema_data in multi_data.items():
-                 if schema_data:
-                     if schema_name in out:
-                         for k, v in schema_data.items():
-                             out[schema_name].setdefault(k, []).extend(v)
-                     else:
-                         out[schema_name] = schema_data
-             continue
-
-         # match a single schema
-         schema = match_table_schema(tbl)
-         if not schema:
-             # no confident schema match
-             continue
-         spec = TABLE_SCHEMAS.get(schema, {})
-         data = extract_table_data(tbl, schema, spec)
-         if data:
-             if schema in out:
-                 for k, v in data.items():
-                     out[schema].setdefault(k, []).extend(v)
-             else:
-                 out[schema] = data
-
-     # Paragraph-level red-text extraction (with contextual heading resolution)
-     paras = {}
-     for idx, para in enumerate(doc.paragraphs):
-         red_txt = "".join(r.text for r in para.runs if is_red_font(r)).strip()
-         if not red_txt:
-             continue
-
-         # attempt to find nearest preceding heading paragraph (using HEADING_PATTERNS)
-         context = None
-         for j in range(idx - 1, -1, -1):
-             txt = normalize_text(doc.paragraphs[j].text)
-             if not txt:
-                 continue
-             all_patterns = HEADING_PATTERNS.get("main", []) + HEADING_PATTERNS.get("sub", [])
-             if any(re.search(p, txt, re.IGNORECASE) for p in all_patterns):
-                 context = txt
-                 break
-
-         # fallback: date-line mapping for 'Date' single-line red texts
-         if not context and re.fullmatch(PARAGRAPH_PATTERNS.get("date_line", r"^\s*\d{1,2}(?:st|nd|rd|th)?\s+[A-Za-z]+\s+\d{4}\s*$|^Date$"), red_txt):
-             context = "Date"
-
-         if not context:
-             context = "(para)"
-
-         paras.setdefault(context, []).append(red_txt)
-
-     if paras:
-         out["paragraphs"] = paras
-
      return out


- def extract_red_text_filelike(input_file, output_file):
-     """
-     Accepts:
-       - input_file: file-like object (BytesIO/File) or path
-       - output_file: file-like object (opened for writing text) or path
-     Returns the parsed dictionary.
-     Writes the JSON to output_file if possible.
-     """
-     # Reset file-like if necessary
-     if hasattr(input_file, "seek"):
-         try:
-             input_file.seek(0)
-         except Exception:
-             pass
-
-     # Load Document
-     if isinstance(input_file, (str, bytes)):
-         doc = Document(input_file)
-     else:
-         doc = Document(input_file)
-
-     result = extract_red_text(doc)
-
-     # Write result out
-     if hasattr(output_file, "write"):
-         json.dump(result, output_file, indent=2, ensure_ascii=False)
-         try:
-             output_file.flush()
-         except Exception:
-             pass
-     else:
-         with open(output_file, "w", encoding="utf-8") as f:
-             json.dump(result, f, indent=2, ensure_ascii=False)
-
-     return result

  if __name__ == "__main__":
-     # Backwards-compatible script entry point
-     if len(sys.argv) == 3:
-         input_docx = sys.argv[1]
-         output_json = sys.argv[2]
-         try:
-             doc = Document(input_docx)
-             word_data = extract_red_text(doc)
-             with open(output_json, 'w', encoding='utf-8') as f:
-                 json.dump(word_data, f, indent=2, ensure_ascii=False)
-             print(json.dumps(word_data, indent=2, ensure_ascii=False))
-         except Exception as e:
-             print("Error during extraction:", e)
-             raise
-     else:
-         print("To use as a module: extract_red_text_filelike(input_file, output_file)")
  """
  extract_red_text.py
  """
+
+ from __future__ import annotations
  import json
+ import re
  import sys
+ import logging
+ from collections import defaultdict
+ from typing import List, Dict, Optional, Any
+
+ # attempt to import python-docx (document processing)
+ try:
+     from docx import Document
+     from docx.oxml.ns import qn
+     from docx.shared import RGBColor
+ except Exception as e:
+     raise RuntimeError("python-docx is required. Install with: pip install python-docx") from e
+
+ # ------------------------------
+ # Import master_key GLOBAL_SETTINGS and optional EXTRA_HEADER_SYNONYMS
+ # ------------------------------
+ try:
+     import master_key as mk
+     GLOBAL_SETTINGS = getattr(mk, "GLOBAL_SETTINGS", {})
+     EXTRA_HEADER_SYNONYMS = getattr(mk, "EXTRA_HEADER_SYNONYMS", None)
+ except Exception:
+     GLOBAL_SETTINGS = {
+         "normalize": {
+             "lower": True,
+             "strip_punctuation": True,
+             "collapse_whitespace": True,
+             "replace_smart_dashes": True
+         },
+         "ocr_repair_rules": [
+             (r"\s*\(\s*Yes\s*/\s*No\s*\)", " (Yes/No)"),
+             (r"R[e3]gistrat[i1]on", "Registration"),
+             (r"Prin?t", "Print"),
+             (r"Accredi[ta]tion", "Accreditation"),
+             (r"[^\w\s\-\&\(\)\/:]", " "),
+         ],
+         "split_on": [" – ", " - ", ";", "\n", " / "],
+         "date_like_pattern": r"^\s*(\d{1,2}(st|nd|rd|th)?\s+[A-Za-z]+|\d{1,2}\/\d{1,2}\/\d{2,4}|\d{1,2}\.\d{1,2}\.\d{2,4}|\d{1,2}\s+[A-Za-z]{3,})",
+         "fuzzy_thresholds": {"high_priority": 70, "medium_priority": 60, "low_priority": 45},
+         "fuzzy_algorithm": "token_set_ratio",
+     }
+     EXTRA_HEADER_SYNONYMS = None
+
+ # Provide an internal default synonyms map (compact keys -> canonical label)
+ # This is used only if master_key.EXTRA_HEADER_SYNONYMS is not defined.
+ _DEFAULT_EXTRA_HEADER_SYNONYMS = {
+     # Compact key: canonical label
+     # Examples from your logs (long/noisy headers)
+     "roadworthinesscertificatesapplicableforentryaudit": "Roadworthiness Certificates",
+     "roadworthinesscertificates": "Roadworthiness Certificates",
+     "rfsuspensioncertificationn/aifnotapplicable": "RFS Suspension Certification #",
+     "rfsuspensioncertification": "RFS Suspension Certification #",
+     "maintenanceRecordsrecorddaterangeofrecordsreviewed".lower(): "Maintenance Records",
+     "maintenancerecords": "Maintenance Records",
+     "faultrecordingreportingonsuspensionsystemdaterange".lower(): "Fault Recording/ Reporting",
+     "faultrecordingreporting": "Fault Recording/ Reporting",
+     "faultrepairdaterange": "Fault Repair",
+     "triprecordsdaterange": "Trip Records",
+     # Add common variations
+     "registrationnumber": "Registration Number",
+     "registrationnumbernumber": "Registration Number",
+     "subcontractor(yesno)": "Sub-contractor (Yes/No)",
+     "sub-contractor(yes/no)": "Sub-contractor (Yes/No)",
+     "nhvrorexemplarglobalauditorregistrationnumber": "NHVR or Exemplar Global Auditor Registration Number",
+     "printname": "Print Name",
+     "print": "Print Name",
+ }
+
+ # If mk provided EXTRA_HEADER_SYNONYMS, use it (but ensure keys are compacted similarly)
+ if EXTRA_HEADER_SYNONYMS is None:
+     EXTRA_HEADER_SYNONYMS = _DEFAULT_EXTRA_HEADER_SYNONYMS
+
+ # ------------------------------
+ # Logging
+ # ------------------------------
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
+ log = logging.getLogger("extract_red_text")
+
+ # ------------------------------
+ # Normalization & OCR-repair utilities (aligned to GLOBAL_SETTINGS)
+ # ------------------------------
+ def _apply_ocr_repair_rules(text: str) -> str:
+     s = text or ""
+     for pat, repl in GLOBAL_SETTINGS.get("ocr_repair_rules", []):
+         try:
+             s = re.sub(pat, repl, s, flags=re.I)
+         except re.error:
+             # skip invalid rule
+             continue
+     return s
+
+ def _normalize_text(text: str) -> str:
+     """Normalize text according to GLOBAL_SETTINGS (readable normalized form)."""
+     s = _apply_ocr_repair_rules(text or "")
+     norm_cfg = GLOBAL_SETTINGS.get("normalize", {})
+     if norm_cfg.get("replace_smart_dashes", False):
+         s = s.replace("–", "-").replace("—", "-")
+     if norm_cfg.get("lower", False):
+         s = s.lower()
+     if norm_cfg.get("strip_punctuation", False):
+         # keep hyphen, ampersand, parentheses, slash, colon; drop other punctuation
+         s = re.sub(r"[^\w\s\-\&\(\)\/:]", " ", s)
+     if norm_cfg.get("collapse_whitespace", False):
+         s = re.sub(r"\s+", " ", s)
+     return s.strip()
+
+ def _compact_key(text: str) -> str:
+     """Create compact key (no non-word chars) for deterministic lookup."""
+     if text is None:
+         return ""
+     normalized = _normalize_text(text)
+     return re.sub(r"[^\w]", "", normalized)

+ def map_header_using_extra_synonyms(header_text: str) -> Optional[str]:
      """
+     Try deterministic mapping using EXTRA_HEADER_SYNONYMS.
+     Return canonical label if found, else None.
      """
+     if not header_text:
+         return None
+     normalized = _normalize_text(header_text)
+     compact = _compact_key(header_text)
+     # try compact key
+     if compact in EXTRA_HEADER_SYNONYMS:
+         return EXTRA_HEADER_SYNONYMS[compact]
+     # try normalized key directly
+     if normalized in EXTRA_HEADER_SYNONYMS:
+         return EXTRA_HEADER_SYNONYMS[normalized]
+     # also try case-insensitive match on keys
+     for k, v in EXTRA_HEADER_SYNONYMS.items():
+         if k.lower() == normalized.lower() or k.lower() == compact.lower():
+             return v
+     return None

+ # ------------------------------
+ # Helpers to detect red font runs robustly
+ # ------------------------------
+ def _run_is_red(run) -> bool:
+     """
+     Detect if a run is red. python-docx represents color by run.font.color.
+     We check RGB if available, or theme color 'red' as fallback.
+     """
      try:
+         color = run.font.color
+         if color is None:
+             return False
+         # If RGB is specified
+         rgb = getattr(color, "rgb", None)
+         if rgb is not None:
+             # rgb is a docx.shared.RGBColor or similar; representable as 'FF0000' or an integer tuple
+             hexval = ''.join("{:02X}".format(c) for c in rgb) if isinstance(rgb, (tuple, list)) else str(rgb)
+             # accept hex strings like 'FF0000': red component high, the others low-ish
+             try:
+                 # If hex-like 'FF0000' -> interpret
+                 hex_clean = re.sub(r"[^0-9A-Fa-f]", "", hexval)
+                 if len(hex_clean) >= 6:
+                     r = int(hex_clean[-6:-4], 16)
+                     g = int(hex_clean[-4:-2], 16)
+                     b = int(hex_clean[-2:], 16)
+                     if r >= 150 and g < 120 and b < 120:
+                         return True
+             except Exception:
+                 pass
+         # fallback: theme color or color.theme_color value
+         theme_color = getattr(color, "theme_color", None)
+         if theme_color:
+             try:
+                 if str(theme_color).lower().find("red") != -1:
+                     return True
+             except Exception:
+                 pass
      except Exception:
          pass
+     # final heuristic: if run.font.color.rgb as string contains 'FF' prefix and '00' for others
      try:
+         if hasattr(run.font.color, "rgb") and run.font.color.rgb is not None:
+             s = str(run.font.color.rgb)
+             if "FF" in s and "0000" in s:
+                 return True
      except Exception:
          pass
      return False

+ # ------------------------------
+ # Extraction: paragraphs, headings, tables
+ # ------------------------------
+ def extract_from_docx(path: str) -> Dict[str, Any]:
+     doc = Document(path)
+     headings: List[str] = []
+     paragraphs_red: List[Dict[str, Any]] = []
+     red_runs: List[Dict[str, Any]] = []
+     tables_out: List[Dict[str, Any]] = []
+
+     # extract headings and paragraphs with red runs
+     for p_index, para in enumerate(doc.paragraphs):
+         text = para.text or ""
+         # identify heading level from style name if available
+         style_name = getattr(para.style, "name", "") if para.style is not None else ""
+         is_heading = bool(re.search(r"Heading\s*\d+|HEADING|TITLE|SUBTITLE", style_name, flags=re.I)) or bool(re.search(r"^(MAINTENANCE|MASS|FATIGUE|NHVAS|Vehicle Registration|CORRECTIVE)", text, flags=re.I))
+         if is_heading:
+             headings.append(text.strip())
+
+         # gather red runs in this paragraph
+         paragraph_red_texts = []
+         char_cursor = 0
+         for run in para.runs:
+             run_text = run.text or ""
+             run_len = len(run_text)
+             if _run_is_red(run) and run_text.strip():
+                 # store a red run entry
+                 rr = {
+                     "text": run_text,
+                     "paragraph_index": p_index,
+                     "char_index": char_cursor,
+                     "style_name": style_name
+                 }
+                 red_runs.append(rr)
+                 paragraph_red_texts.append(run_text)
+             char_cursor += run_len
+         if paragraph_red_texts:
+             paragraphs_red.append({
+                 "paragraph_index": p_index,
+                 "text": text,
+                 "red_texts": paragraph_red_texts,
+                 "style_name": style_name
+             })
+
+     # extract tables
+     for t_index, table in enumerate(doc.tables):
+         # convert table to simple cell-text matrix
+         nrows = len(table.rows)
+         ncols = max(len(row.cells) for row in table.rows) if nrows > 0 else 0
+         headers = []
+         rows_text = []
+         rows_red_cells = []
+
+         # Attempt to treat first row as header if cells look like headers (bold or all-caps)
+         header_row = table.rows[0] if nrows > 0 else None
+
+         # build header texts & apply header mapping
+         if header_row:
+             for c_idx, cell in enumerate(header_row.cells):
+                 cell_text = cell.text.strip()
+                 # normalize & map using EXTRA_HEADER_SYNONYMS
+                 mapped = map_header_using_extra_synonyms(cell_text)
+                 if mapped:
+                     header_label = mapped
                  else:
+                     header_label = cell_text
+                 headers.append(header_label)
+
+         # process all rows -> list of lists
+         for r_i, row in enumerate(table.rows):
+             row_texts = []
+             row_reds = []
+             for c_i, cell in enumerate(row.cells):
+                 ct = cell.text.strip()
+                 # gather red text from runs in this cell
+                 red_in_cell = []
+                 # docx cell may have paragraphs
+                 for cpara in cell.paragraphs:
+                     for run in cpara.runs:
+                         if _run_is_red(run) and (run.text or "").strip():
+                             red_in_cell.append((run.text or "").strip())
+                 # compact red text into a single string if multiple runs present
+                 red_text_joined = " ".join(red_in_cell) if red_in_cell else None
+                 row_texts.append(ct)
+                 row_reds.append(red_text_joined)
+             rows_text.append(row_texts)
+             rows_red_cells.append(row_reds)
+
+         tables_out.append({
+             "table_index": t_index,
+             "nrows": nrows,
+             "ncols": ncols,
+             "headers": headers,
+             "rows": rows_text,
+             "red_cells": rows_red_cells
+         })
+
+     # assemble output structure
+     out = {
+         "headings": headings,
+         "paragraphs": paragraphs_red,
+         "tables": tables_out,
+         "red_runs": red_runs,
+         # helpful metadata for downstream processing
+         "meta": {
+             "source_file": path,
+             "total_headings": len(headings),
+             "total_red_paragraphs": len(paragraphs_red),
+             "total_tables": len(tables_out),
+             "total_red_runs": len(red_runs)
+         }
+     }
      return out

+ # ------------------------------
+ # Command-line interface
+ # ------------------------------
+ def main(argv):
+     if len(argv) < 3:
+         print("Usage: python extract_red_text.py input.docx output.json")
+         sys.exit(2)
+     input_docx = argv[1]
+     output_json = argv[2]
+
+     log.info("Extracting red text from: %s", input_docx)
+     try:
+         result = extract_from_docx(input_docx)
+     except Exception as exc:
+         log.exception("Failed to extract from docx: %s", exc)
+         raise

+     # Save JSON pretty-printed for debugging by default
+     try:
+         with open(output_json, "w", encoding="utf-8") as fh:
+             json.dump(result, fh, ensure_ascii=False, indent=2)
+         log.info("Saved extracted word JSON to: %s", output_json)
+     except Exception:
+         log.exception("Failed to write output JSON to %s", output_json)
+         raise

+     # Print a short summary for logs / quick verification
+     log.info("Headings found: %d, Red paragraphs: %d, Tables: %d, Red runs: %d",
+              len(result.get("headings", [])),
+              len(result.get("paragraphs", [])),
+              len(result.get("tables", [])),
+              len(result.get("red_runs", []))
+     )

  if __name__ == "__main__":
+     main(sys.argv)
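
After this change, the module's public surface is `extract_from_docx(path)` plus the two-argument CLI shown above. A minimal usage sketch for quick verification; the `audit.docx` path is a hypothetical placeholder, and the keys accessed are the ones this commit writes into the output dict:

```python
# Hypothetical smoke test of the new structure-first output.
from extract_red_text import extract_from_docx

result = extract_from_docx("audit.docx")  # placeholder input path

# Top-level keys produced by this commit: headings, paragraphs, tables, red_runs, meta
print(result["meta"]["total_red_runs"])
for para in result["paragraphs"]:
    print(para["paragraph_index"], para["red_texts"])
for table in result["tables"]:
    print(table["headers"], table["red_cells"])
```

Equivalently, from the shell: `python extract_red_text.py input.docx output.json`.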