Shami96 committed on
Commit 47f7e99 · verified · 1 Parent(s): ded60cc

Update extract_red_text.py

Files changed (1)
  1. extract_red_text.py +264 -156
extract_red_text.py CHANGED
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 """
-extract_red_text.py
 """

 from __future__ import annotations
@@ -9,7 +9,7 @@ import re
 import sys
 import logging
 from collections import defaultdict
-from typing import List, Dict, Optional, Any

 # attempt to import python-docx (document processing)
 try:
@@ -20,62 +20,19 @@ except Exception as e:
     raise RuntimeError("python-docx is required. Install with: pip install python-docx") from e

 # ------------------------------
-# Import master_key GLOBAL_SETTINGS and optional EXTRA_HEADER_SYNONYMS
 # ------------------------------
 try:
     import master_key as mk
-    GLOBAL_SETTINGS = getattr(mk, "GLOBAL_SETTINGS", {})
-    EXTRA_HEADER_SYNONYMS = getattr(mk, "EXTRA_HEADER_SYNONYMS", None)
-except Exception:
-    GLOBAL_SETTINGS = {
-        "normalize": {
-            "lower": True,
-            "strip_punctuation": True,
-            "collapse_whitespace": True,
-            "replace_smart_dashes": True
-        },
-        "ocr_repair_rules": [
-            (r"\s*\(\s*Yes\s*/\s*No\s*\)", " (Yes/No)"),
-            (r"R[e3]gistrat[i1]on", "Registration"),
-            (r"Prin?t", "Print"),
-            (r"Accredi[ta]tion", "Accreditation"),
-            (r"[^\w\s\-\&\(\)\/:]", " "),
-        ],
-        "split_on": [" – ", " - ", ";", "\n", " / "],
-        "date_like_pattern": r"^\s*(\d{1,2}(st|nd|rd|th)?\s+[A-Za-z]+|\d{1,2}\/\d{1,2}\/\d{2,4}|\d{1,2}\.\d{1,2}\.\d{2,4}|\d{1,2}\s+[A-Za-z]{3,})",
-        "fuzzy_thresholds": {"high_priority": 70, "medium_priority": 60, "low_priority": 45},
-        "fuzzy_algorithm": "token_set_ratio",
-    }
-    EXTRA_HEADER_SYNONYMS = None
-
-# Provide an internal default synonyms map (compact keys -> canonical label)
-# This is used only if master_key.EXTRA_HEADER_SYNONYMS is not defined.
-_DEFAULT_EXTRA_HEADER_SYNONYMS = {
-    # Compact key: canonical label
-    # Examples from your logs (long/noisy headers)
-    "roadworthinesscertificatesapplicableforentryaudit": "Roadworthiness Certificates",
-    "roadworthinesscertificates": "Roadworthiness Certificates",
-    "rfsuspensioncertificationn/aifnotapplicable": "RFS Suspension Certification #",
-    "rfsuspensioncertification": "RFS Suspension Certification #",
-    "maintenanceRecordsrecorddaterangeofrecordsreviewed".lower(): "Maintenance Records",
-    "maintenancerecords": "Maintenance Records",
-    "faultrecordingreportingonsuspensionsystemdaterange".lower(): "Fault Recording/ Reporting",
-    "faultrecordingreporting": "Fault Recording/ Reporting",
-    "faultrepairdaterange": "Fault Repair",
-    "triprecordsdaterange": "Trip Records",
-    # Add common variations
-    "registrationnumber": "Registration Number",
-    "registrationnumbernumber": "Registration Number",
-    "subcontractor(yesno)": "Sub-contractor (Yes/No)",
-    "sub-contractor(yes/no)": "Sub-contractor (Yes/No)",
-    "nhvrorexemplarglobalauditorregistrationnumber": "NHVR or Exemplar Global Auditor Registration Number",
-    "printname": "Print Name",
-    "print": "Print Name",
-}
-
-# If mk provided EXTRA_HEADER_SYNONYMS, use it (but ensure keys are compacted similarly)
-if EXTRA_HEADER_SYNONYMS is None:
-    EXTRA_HEADER_SYNONYMS = _DEFAULT_EXTRA_HEADER_SYNONYMS

 # ------------------------------
 # Logging
@@ -87,6 +44,7 @@ log = logging.getLogger("extract_red_text")
 # Normalization & OCR-repair utilities (aligned to GLOBAL_SETTINGS)
 # ------------------------------
 def _apply_ocr_repair_rules(text: str) -> str:
     s = text or ""
     for pat, repl in GLOBAL_SETTINGS.get("ocr_repair_rules", []):
         try:
@@ -100,6 +58,7 @@ def _normalize_text(text: str) -> str:
     """Normalize text according to GLOBAL_SETTINGS (readable normalized form)."""
     s = _apply_ocr_repair_rules(text or "")
     norm_cfg = GLOBAL_SETTINGS.get("normalize", {})
     if norm_cfg.get("replace_smart_dashes", False):
         s = s.replace("–", "-").replace("—", "-")
     if norm_cfg.get("lower", False):
@@ -109,6 +68,7 @@ def _normalize_text(text: str) -> str:
         s = re.sub(r"[^\w\s\-\&\(\)\/:]", " ", s)
     if norm_cfg.get("collapse_whitespace", False):
         s = re.sub(r"\s+", " ", s)
     return s.strip()

 def _compact_key(text: str) -> str:
@@ -125,182 +85,320 @@ def map_header_using_extra_synonyms(header_text: str) -> Optional[str]:
     """
     if not header_text:
         return None
     normalized = _normalize_text(header_text)
     compact = _compact_key(header_text)
     # try compact key
     if compact in EXTRA_HEADER_SYNONYMS:
         return EXTRA_HEADER_SYNONYMS[compact]
     # try normalized key directly
     if normalized in EXTRA_HEADER_SYNONYMS:
         return EXTRA_HEADER_SYNONYMS[normalized]
     # also try case-insensitive match on keys
     for k, v in EXTRA_HEADER_SYNONYMS.items():
         if k.lower() == normalized.lower() or k.lower() == compact.lower():
             return v
     return None

 # ------------------------------
-# Helpers to detect red font runs robustly
 # ------------------------------
 def _run_is_red(run) -> bool:
     """
-    Detect if a run is red. python-docx represents color by run.font.color.
-    We check RGB if available, or theme color 'red' as fallback.
     """
     try:
-        color = run.font.color
-        if color is None:
-            return False
-        # If RGB is specified
-        rgb = getattr(color, "rgb", None)
-        if rgb is not None:
-            # rgb is a docx.shared.RGBColor or similar. Representable as 'FF0000' or integer tuple
-            hexval = ''.join("{:02X}".format(c) for c in rgb) if isinstance(rgb, (tuple, list)) else str(rgb)
-            # accept strings containing 'FF0000' or '0000FF'? (we want red)
-            # Accept any color where red component is high and others low-ish
             try:
-                # If hex-like 'FF0000' -> interpret
-                hex_clean = re.sub(r"[^0-9A-Fa-f]", "", hexval)
-                if len(hex_clean) >= 6:
-                    r = int(hex_clean[-6:-4], 16)
-                    g = int(hex_clean[-4:-2], 16)
-                    b = int(hex_clean[-2:], 16)
-                    if r >= 150 and g < 120 and b < 120:
                         return True
             except Exception:
                 pass
-        # fallback: theme color or color.theme_color value
-        theme_color = getattr(color, "theme_color", None)
-        if theme_color:
-            try:
-                if str(theme_color).lower().find("red") != -1:
                     return True
-            except Exception:
-                pass
     except Exception:
         pass
-    # final heuristic: if run.font.color.rgb as string contains 'FF' prefix and '00' for others
     try:
         if hasattr(run.font.color, "rgb") and run.font.color.rgb is not None:
             s = str(run.font.color.rgb)
-            if "FF" in s and "0000" in s:
                 return True
     except Exception:
         pass
     return False

 # ------------------------------
 # Extraction: paragraphs, headings, tables
 # ------------------------------
 def extract_from_docx(path: str) -> Dict[str, Any]:
     doc = Document(path)
     headings: List[str] = []
     paragraphs_red: List[Dict[str, Any]] = []
     red_runs: List[Dict[str, Any]] = []
     tables_out: List[Dict[str, Any]] = []

-    # extract headings and paragraphs with red runs
     for p_index, para in enumerate(doc.paragraphs):
         text = para.text or ""
-        # identify heading level from style name if available
         style_name = getattr(para.style, "name", "") if para.style is not None else ""
-        is_heading = bool(re.search(r"Heading\s*\d+|HEADING|TITLE|SUBTITLE", style_name, flags=re.I)) or bool(re.search(r"^(MAINTENANCE|MASS|FATIGUE|NHVAS|Vehicle Registration|CORRECTIVE)", text, flags=re.I))
         if is_heading:
             headings.append(text.strip())

-        # gather red runs in this paragraph
         paragraph_red_texts = []
         char_cursor = 0
         for run in para.runs:
             run_text = run.text or ""
             run_len = len(run_text)
             if _run_is_red(run) and run_text.strip():
-                # store a red run entry
                 rr = {
                     "text": run_text,
                     "paragraph_index": p_index,
                     "char_index": char_cursor,
-                    "style_name": style_name
                 }
                 red_runs.append(rr)
                 paragraph_red_texts.append(run_text)
             char_cursor += run_len
         if paragraph_red_texts:
             paragraphs_red.append({
                 "paragraph_index": p_index,
                 "text": text,
                 "red_texts": paragraph_red_texts,
-                "style_name": style_name
             })

-    # extract tables
     for t_index, table in enumerate(doc.tables):
-        # convert table to simple cell-text matrix
-        nrows = len(table.rows)
-        ncols = max(len(row.cells) for row in table.rows) if nrows > 0 else 0
-        headers = []
-        rows_text = []
-        rows_red_cells = []
-
-        # Attempt to treat first row as header if cells look like headers (bold or all-caps)
-        header_row = table.rows[0] if nrows > 0 else None
-
-        # build header texts & apply header mapping
-        if header_row:
-            for c_idx, cell in enumerate(header_row.cells):
-                cell_text = cell.text.strip()
-                # normalize & map using EXTRA_HEADER_SYNONYMS
-                mapped = map_header_using_extra_synonyms(cell_text)
-                if mapped:
-                    header_label = mapped
-                else:
-                    header_label = cell_text
-                headers.append(header_label)

-        # process all rows -> list of lists
-        for r_i, row in enumerate(table.rows):
-            row_texts = []
-            row_reds = []
-            for c_i, cell in enumerate(row.cells):
-                ct = cell.text.strip()
-                # gather red text from runs in this cell
-                red_in_cell = []
-                # docx cell may have paragraphs
-                for cpara in cell.paragraphs:
-                    for run in cpara.runs:
-                        if _run_is_red(run) and (run.text or "").strip():
-                            red_in_cell.append((run.text or "").strip())
-                # compact red text into a single string if multiple runs present
-                red_text_joined = " ".join(red_in_cell) if red_in_cell else None
-                row_texts.append(ct)
-                row_reds.append(red_text_joined)
-            rows_text.append(row_texts)
-            rows_red_cells.append(row_reds)
-
-        tables_out.append({
-            "table_index": t_index,
-            "nrows": nrows,
-            "ncols": ncols,
-            "headers": headers,
-            "rows": rows_text,
-            "red_cells": rows_red_cells
-        })
-
-    # assemble output structure
     out = {
         "headings": headings,
         "paragraphs": paragraphs_red,
         "tables": tables_out,
         "red_runs": red_runs,
-        # helpful metadata for downstream processing
         "meta": {
             "source_file": path,
             "total_headings": len(headings),
             "total_red_paragraphs": len(paragraphs_red),
             "total_tables": len(tables_out),
-            "total_red_runs": len(red_runs)
         }
     }
     return out

 # ------------------------------
@@ -310,10 +408,14 @@ def main(argv):
     if len(argv) < 3:
         print("Usage: python extract_red_text.py input.docx output.json")
         sys.exit(2)
     input_docx = argv[1]
     output_json = argv[2]

-    log.info("Extracting red text from: %s", input_docx)
     try:
         result = extract_from_docx(input_docx)
     except Exception as exc:
@@ -324,21 +426,27 @@ def main(argv):
     try:
         with open(output_json, "w", encoding="utf-8") as fh:
             json.dump(result, fh, ensure_ascii=False, indent=2)
-        log.info("Saved extracted word JSON to: %s", output_json)
     except Exception:
         log.exception("Failed to write output JSON to %s", output_json)
         raise

-    # Print a short summary for logs / quick verification
-    log.info("Headings found: %d, Red paragraphs: %d, Tables: %d, Red runs: %d",
-             len(result.get("headings", [])),
-             len(result.get("paragraphs", [])),
-             len(result.get("tables", [])),
-             len(result.get("red_runs", []))
-             )

 if __name__ == "__main__":
     main(sys.argv)
-    # ADD THIS LINE:
     if len(sys.argv) >= 3:
-        with open(sys.argv[2], 'r') as f: print(f"\n📄 EXTRACT_RED_TEXT OUTPUT:\n{f.read()}")
 #!/usr/bin/env python3
 """
+extract_red_text.py - Enhanced version with improved red text detection and master key alignment
 """

 from __future__ import annotations
 
 import sys
 import logging
 from collections import defaultdict
+from typing import List, Dict, Optional, Any, Tuple

 # attempt to import python-docx (document processing)
 try:
 
     raise RuntimeError("python-docx is required. Install with: pip install python-docx") from e

 # ------------------------------
+# Import master_key configurations
 # ------------------------------
 try:
     import master_key as mk
+    GLOBAL_SETTINGS = mk.GLOBAL_SETTINGS
+    EXTRA_HEADER_SYNONYMS = mk.EXTRA_HEADER_SYNONYMS
+    TABLE_SCHEMAS = getattr(mk, "TABLE_SCHEMAS", {})
+except ImportError as e:
+    logging.error("Failed to import master_key.py: %s", e)
+    raise RuntimeError("master_key.py is required for configuration") from e
+except AttributeError as e:
+    logging.error("Missing required configuration in master_key.py: %s", e)
+    raise RuntimeError("master_key.py missing required GLOBAL_SETTINGS or EXTRA_HEADER_SYNONYMS") from e

 # ------------------------------
 # Logging
 
 # Normalization & OCR-repair utilities (aligned to GLOBAL_SETTINGS)
 # ------------------------------
 def _apply_ocr_repair_rules(text: str) -> str:
+    """Apply OCR repair rules from GLOBAL_SETTINGS."""
     s = text or ""
     for pat, repl in GLOBAL_SETTINGS.get("ocr_repair_rules", []):
         try:
 
     """Normalize text according to GLOBAL_SETTINGS (readable normalized form)."""
     s = _apply_ocr_repair_rules(text or "")
     norm_cfg = GLOBAL_SETTINGS.get("normalize", {})
+
     if norm_cfg.get("replace_smart_dashes", False):
         s = s.replace("–", "-").replace("—", "-")
     if norm_cfg.get("lower", False):
 
         s = re.sub(r"[^\w\s\-\&\(\)\/:]", " ", s)
     if norm_cfg.get("collapse_whitespace", False):
         s = re.sub(r"\s+", " ", s)
+
     return s.strip()

 def _compact_key(text: str) -> str:
 
     """
     if not header_text:
         return None
+
     normalized = _normalize_text(header_text)
     compact = _compact_key(header_text)
+
     # try compact key
     if compact in EXTRA_HEADER_SYNONYMS:
         return EXTRA_HEADER_SYNONYMS[compact]
+
     # try normalized key directly
     if normalized in EXTRA_HEADER_SYNONYMS:
         return EXTRA_HEADER_SYNONYMS[normalized]
+
     # also try case-insensitive match on keys
     for k, v in EXTRA_HEADER_SYNONYMS.items():
         if k.lower() == normalized.lower() or k.lower() == compact.lower():
             return v
+
     return None

 # ------------------------------
+# Enhanced red font detection using hf_utils pattern
 # ------------------------------
 def _run_is_red(run) -> bool:
     """
+    Enhanced red color detection for docx.run objects.
+    Uses multiple methods to detect red text robustly.
     """
     try:
+        # Method 1: Check run.font.color.rgb
+        col = getattr(run.font, "color", None)
+        if col is not None and getattr(col, "rgb", None):
+            rgb = col.rgb
             try:
+                # rgb may be sequence-like or have attributes
+                if hasattr(rgb, '__getitem__'):  # sequence-like
+                    r, g, b = rgb[0], rgb[1], rgb[2]
+                else:  # attribute access
+                    r = getattr(rgb, "r", None) or getattr(rgb, "red", None)
+                    g = getattr(rgb, "g", None) or getattr(rgb, "green", None)
+                    b = getattr(rgb, "b", None) or getattr(rgb, "blue", None)
+
+                if r is not None and g is not None and b is not None:
+                    # Tolerant heuristic: red must be noticeably higher than green/blue
+                    if r >= 160 and g <= 120 and b <= 120 and (r - g) >= 30 and (r - b) >= 30:
                         return True
             except Exception:
                 pass
+    except Exception:
+        pass
+
+    # Method 2: Check raw XML color code
+    try:
+        rPr = run._element.rPr
+        if rPr is not None:
+            clr = rPr.find('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}color')
+            if clr is not None:
+                val = clr.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val')
+                if val and re.fullmatch(r"[0-9A-Fa-f]{6}", val):
+                    rr = int(val[:2], 16)
+                    gg = int(val[2:4], 16)
+                    bb = int(val[4:], 16)
+                    if rr >= 160 and gg <= 120 and bb <= 120 and (rr - gg) >= 30 and (rr - bb) >= 30:
+                        return True
+    except Exception:
+        pass
+
+    # Method 3: Check theme color
+    try:
+        color = run.font.color
+        if color is not None:
+            theme_color = getattr(color, "theme_color", None)
+            if theme_color:
+                theme_str = str(theme_color).lower()
+                if "red" in theme_str or "accent_2" in theme_str:  # Common red theme
                     return True
     except Exception:
         pass
+
+    # Method 4: String representation fallback
     try:
         if hasattr(run.font.color, "rgb") and run.font.color.rgb is not None:
             s = str(run.font.color.rgb)
+            # Look for patterns like "FF0000" or similar high-red values
+            if re.search(r"[Ff]{2}0{4}|[Ee]{2}0{4}|[Dd]{2}0{4}", s):
                 return True
     except Exception:
         pass
+
     return False

+def _extract_red_text_segments(cell):
+    """Extract red text segments from a table cell."""
+    segments = []
+    for p_idx, paragraph in enumerate(cell.paragraphs):
+        current_text = ""
+        current_runs = []
+
+        for r_idx, run in enumerate(paragraph.runs):
+            if _run_is_red(run) and run.text.strip():
+                current_text += run.text
+                current_runs.append((p_idx, r_idx, run))
+            else:
+                # End of red segment
+                if current_runs:
+                    segments.append({
+                        'text': current_text.strip(),
+                        'runs': current_runs.copy(),
+                        'paragraph_idx': p_idx
+                    })
+                current_text = ""
+                current_runs = []
+
+        # Handle segment at end of paragraph
+        if current_runs:
+            segments.append({
+                'text': current_text.strip(),
+                'runs': current_runs.copy(),
+                'paragraph_idx': p_idx
+            })
+
+    return segments
+
+def _has_red_text(cell) -> bool:
+    """Check if a cell contains any red text."""
+    for paragraph in cell.paragraphs:
+        for run in paragraph.runs:
+            if _run_is_red(run) and run.text.strip():
+                return True
+    return False
+
+# ------------------------------
+# Enhanced table processing with schema-aware header mapping
+# ------------------------------
+def _process_table_with_schema_mapping(table, t_index: int) -> Dict[str, Any]:
+    """Process table with enhanced header mapping using master key schemas."""
+    nrows = len(table.rows)
+    ncols = max(len(row.cells) for row in table.rows) if nrows > 0 else 0
+
+    if nrows == 0:
+        return {
+            "table_index": t_index,
+            "nrows": 0,
+            "ncols": 0,
+            "headers": [],
+            "rows": [],
+            "red_cells": [],
+            "mapped_headers": []
+        }
+
+    # Process headers from first row
+    header_row = table.rows[0]
+    headers = []
+    mapped_headers = []
+
+    for c_idx, cell in enumerate(header_row.cells[:ncols]):
+        cell_text = cell.text.strip()
+
+        # Try mapping using EXTRA_HEADER_SYNONYMS first
+        mapped = map_header_using_extra_synonyms(cell_text)
+        if mapped:
+            header_label = mapped
+            log.debug(f"Mapped header '{cell_text}' -> '{mapped}'")
+        else:
+            header_label = cell_text
+
+        headers.append(cell_text)  # Original header
+        mapped_headers.append(header_label)  # Mapped header
+
+    # Process all rows
+    rows_text = []
+    rows_red_cells = []
+    rows_red_metadata = []
+
+    for r_i, row in enumerate(table.rows):
+        row_texts = []
+        row_reds = []
+        row_red_meta = []
+
+        for c_i, cell in enumerate(row.cells[:ncols]):
+            cell_text = cell.text.strip()
+
+            # Extract red text segments with metadata
+            red_segments = _extract_red_text_segments(cell)
+
+            if red_segments:
+                # Join all red text segments
+                red_text_parts = [seg['text'] for seg in red_segments if seg['text']]
+                red_text_joined = " ".join(red_text_parts).strip()
+
+                # Store metadata about red text location
+                red_metadata = {
+                    "has_red": True,
+                    "red_text": red_text_joined,
+                    "segments": len(red_segments),
+                    "total_red_runs": sum(len(seg['runs']) for seg in red_segments)
+                }
+            else:
+                red_text_joined = None
+                red_metadata = {"has_red": False}
+
+            row_texts.append(cell_text)
+            row_reds.append(red_text_joined)
+            row_red_meta.append(red_metadata)
+
+        rows_text.append(row_texts)
+        rows_red_cells.append(row_reds)
+        rows_red_metadata.append(row_red_meta)
+
+    return {
+        "table_index": t_index,
+        "nrows": nrows,
+        "ncols": ncols,
+        "headers": headers,  # Original headers
+        "mapped_headers": mapped_headers,  # Mapped headers
+        "rows": rows_text,
+        "red_cells": rows_red_cells,
+        "red_metadata": rows_red_metadata  # Additional red text metadata
+    }
+
 # ------------------------------
 # Extraction: paragraphs, headings, tables
 # ------------------------------
 def extract_from_docx(path: str) -> Dict[str, Any]:
+    """Extract content from DOCX with enhanced red text detection and schema mapping."""
+    log.info(f"Opening document: {path}")
     doc = Document(path)
+
     headings: List[str] = []
     paragraphs_red: List[Dict[str, Any]] = []
     red_runs: List[Dict[str, Any]] = []
     tables_out: List[Dict[str, Any]] = []

+    # Extract headings and paragraphs with red runs
+    log.info("Processing paragraphs and headings...")
     for p_index, para in enumerate(doc.paragraphs):
         text = para.text or ""
+
+        # Identify heading level from style name if available
         style_name = getattr(para.style, "name", "") if para.style is not None else ""
+        is_heading = bool(re.search(r"Heading\s*\d+|HEADING|TITLE|SUBTITLE", style_name, flags=re.I)) or \
+                     bool(re.search(r"^(MAINTENANCE|MASS|FATIGUE|NHVAS|Vehicle Registration|CORRECTIVE)", text, flags=re.I))
+
         if is_heading:
             headings.append(text.strip())
+            log.debug(f"Found heading: {text.strip()}")

+        # Gather red runs in this paragraph
         paragraph_red_texts = []
         char_cursor = 0
+
         for run in para.runs:
             run_text = run.text or ""
             run_len = len(run_text)
+
             if _run_is_red(run) and run_text.strip():
+                # Store a red run entry
                 rr = {
                     "text": run_text,
                     "paragraph_index": p_index,
                     "char_index": char_cursor,
+                    "style_name": style_name,
+                    "normalized_text": _normalize_text(run_text)
                 }
                 red_runs.append(rr)
                 paragraph_red_texts.append(run_text)
+                log.debug(f"Found red text in paragraph {p_index}: '{run_text.strip()}'")
+
             char_cursor += run_len
+
         if paragraph_red_texts:
             paragraphs_red.append({
                 "paragraph_index": p_index,
                 "text": text,
                 "red_texts": paragraph_red_texts,
+                "style_name": style_name,
+                "red_text_joined": " ".join(paragraph_red_texts).strip()
             })

+    # Extract tables with enhanced processing
+    log.info(f"Processing {len(doc.tables)} tables...")
     for t_index, table in enumerate(doc.tables):
+        table_data = _process_table_with_schema_mapping(table, t_index)
+        tables_out.append(table_data)
+
+        # Log red text findings
+        red_cell_count = sum(1 for row in table_data["red_cells"] for cell in row if cell)
+        if red_cell_count > 0:
+            log.info(f"Table {t_index}: Found {red_cell_count} cells with red text")

+    # Assemble output structure
     out = {
         "headings": headings,
         "paragraphs": paragraphs_red,
         "tables": tables_out,
         "red_runs": red_runs,
+        # Enhanced metadata
         "meta": {
             "source_file": path,
             "total_headings": len(headings),
             "total_red_paragraphs": len(paragraphs_red),
             "total_tables": len(tables_out),
+            "total_red_runs": len(red_runs),
+            "total_red_cells": sum(
+                1 for table in tables_out for row in table["red_cells"] for cell in row if cell
+            ),
+            "global_settings_used": {
+                "normalization": GLOBAL_SETTINGS.get("normalize", {}),
+                "ocr_repair_rules_count": len(GLOBAL_SETTINGS.get("ocr_repair_rules", [])),
+                "synonyms_count": len(EXTRA_HEADER_SYNONYMS) if EXTRA_HEADER_SYNONYMS else 0
+            }
         }
     }
+
     return out

 # ------------------------------
 
     if len(argv) < 3:
         print("Usage: python extract_red_text.py input.docx output.json")
         sys.exit(2)
+
     input_docx = argv[1]
     output_json = argv[2]

+    log.info("Starting red text extraction from: %s", input_docx)
+    log.info("Using master_key configuration with %d header synonyms",
+             len(EXTRA_HEADER_SYNONYMS) if EXTRA_HEADER_SYNONYMS else 0)
+
     try:
         result = extract_from_docx(input_docx)
     except Exception as exc:
 
     try:
         with open(output_json, "w", encoding="utf-8") as fh:
             json.dump(result, fh, ensure_ascii=False, indent=2)
+        log.info("Saved extracted data to: %s", output_json)
     except Exception:
         log.exception("Failed to write output JSON to %s", output_json)
         raise

+    # Print comprehensive summary
+    meta = result.get("meta", {})
+    log.info("=== EXTRACTION SUMMARY ===")
+    log.info("Headings found: %d", meta.get("total_headings", 0))
+    log.info("Red paragraphs: %d", meta.get("total_red_paragraphs", 0))
+    log.info("Red runs total: %d", meta.get("total_red_runs", 0))
+    log.info("Tables processed: %d", meta.get("total_tables", 0))
+    log.info("Red cells found: %d", meta.get("total_red_cells", 0))
+    log.info("Header synonyms used: %d", meta.get("global_settings_used", {}).get("synonyms_count", 0))

 if __name__ == "__main__":
     main(sys.argv)
+    # Print output for verification
     if len(sys.argv) >= 3:
+        try:
+            with open(sys.argv[2], 'r') as f:
+                print(f"\n📄 EXTRACT_RED_TEXT OUTPUT:\n{f.read()}")
+        except Exception as e:
+            print(f"\n❌ Could not read output file: {e}")