Spaces:
Running
Running
Update extract_red_text.py
Browse files - extract_red_text.py +57 -10
extract_red_text.py
CHANGED
|
@@ -68,18 +68,68 @@ def get_table_context(tbl):
|
|
| 68 |
}
|
| 69 |
|
| 70 |
def calculate_schema_match_score(schema_name, spec, context):
|
| 71 |
-
"""
|
| 72 |
score = 0
|
| 73 |
reasons = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
if context['first_cell'] and context['first_cell'].upper() == schema_name.upper():
|
| 75 |
score += 100
|
| 76 |
reasons.append(f"Direct first cell match: '{context['first_cell']}'")
|
|
|
|
|
|
|
| 77 |
if spec.get("headings"):
|
| 78 |
for h in spec["headings"]:
|
| 79 |
if fuzzy_match_heading(context['heading'], [h["text"]]):
|
| 80 |
score += 50
|
| 81 |
reasons.append(f"Heading match: '{context['heading']}'")
|
| 82 |
break
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
if spec.get("orientation") == "left":
|
| 84 |
labels = [normalize_text(lbl) for lbl in spec["labels"]]
|
| 85 |
matches = 0
|
|
@@ -89,6 +139,8 @@ def calculate_schema_match_score(schema_name, spec, context):
|
|
| 89 |
if matches > 0:
|
| 90 |
score += (matches / len(labels)) * 30
|
| 91 |
reasons.append(f"Left orientation label matches: {matches}/{len(labels)}")
|
|
|
|
|
|
|
| 92 |
elif spec.get("orientation") == "row1":
|
| 93 |
labels = [normalize_text(lbl) for lbl in spec["labels"]]
|
| 94 |
matches = 0
|
|
@@ -98,15 +150,8 @@ def calculate_schema_match_score(schema_name, spec, context):
|
|
| 98 |
if matches > 0:
|
| 99 |
score += (matches / len(labels)) * 30
|
| 100 |
reasons.append(f"Row1 orientation header matches: {matches}/{len(labels)}")
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
matches = 0
|
| 104 |
-
for col in cols:
|
| 105 |
-
if any(col.upper() in h.upper() for h in context['headers']):
|
| 106 |
-
matches += 1
|
| 107 |
-
if matches == len(cols):
|
| 108 |
-
score += 40
|
| 109 |
-
reasons.append(f"All column headers match: {cols}")
|
| 110 |
if schema_name == "Operator Declaration" and context['first_cell'].upper() == "PRINT NAME":
|
| 111 |
if "OPERATOR DECLARATION" in context['heading'].upper():
|
| 112 |
score += 80
|
|
@@ -114,10 +159,12 @@ def calculate_schema_match_score(schema_name, spec, context):
|
|
| 114 |
elif any("MANAGER" in cell.upper() for cell in context['all_cells']):
|
| 115 |
score += 60
|
| 116 |
reasons.append("Manager found in cells (likely Operator Declaration)")
|
|
|
|
| 117 |
if schema_name == "NHVAS Approved Auditor Declaration" and context['first_cell'].upper() == "PRINT NAME":
|
| 118 |
if any("MANAGER" in cell.upper() for cell in context['all_cells']):
|
| 119 |
score -= 50 # Penalty because auditors shouldn't be managers
|
| 120 |
reasons.append("Penalty: Manager found (not auditor)")
|
|
|
|
| 121 |
return score, reasons
|
| 122 |
|
| 123 |
def match_table_schema(tbl):
|
|
|
|
| 68 |
}
|
| 69 |
|
| 70 |
def calculate_schema_match_score(schema_name, spec, context):
|
| 71 |
+
"""Calculate the match score for a schema against a table context, with enhanced Summary-table detection."""
|
| 72 |
score = 0
|
| 73 |
reasons = []
|
| 74 |
+
|
| 75 |
+
# 🎯 CRITICAL: Boost Summary schemas when DETAILS column is detected
|
| 76 |
+
if "Summary" in schema_name and "details" in " ".join(context['headers']).lower():
|
| 77 |
+
score += 100 # Very high boost for summary tables with DETAILS column
|
| 78 |
+
reasons.append(f"Summary schema with DETAILS column - perfect match")
|
| 79 |
+
|
| 80 |
+
# 🎯 CRITICAL: Heavily penalize non-Summary schemas when DETAILS column is present
|
| 81 |
+
if "Summary" not in schema_name and "details" in " ".join(context['headers']).lower():
|
| 82 |
+
score -= 75 # Heavy penalty to prevent basic schemas from matching summary tables
|
| 83 |
+
reasons.append(f"Non-summary schema penalized for DETAILS column presence")
|
| 84 |
+
|
| 85 |
+
# Check for context exclusions (prevents basic Management from matching Summary tables)
|
| 86 |
+
if spec.get("context_exclusions"):
|
| 87 |
+
table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
|
| 88 |
+
for exclusion in spec["context_exclusions"]:
|
| 89 |
+
if exclusion.lower() in table_text:
|
| 90 |
+
score -= 50
|
| 91 |
+
reasons.append(f"Context exclusion penalty: '{exclusion}' found")
|
| 92 |
+
|
| 93 |
+
# Check for context keywords (boosts matching for relevant tables)
|
| 94 |
+
if spec.get("context_keywords"):
|
| 95 |
+
table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
|
| 96 |
+
keyword_matches = 0
|
| 97 |
+
for keyword in spec["context_keywords"]:
|
| 98 |
+
if keyword.lower() in table_text:
|
| 99 |
+
keyword_matches += 1
|
| 100 |
+
|
| 101 |
+
if keyword_matches > 0:
|
| 102 |
+
score += keyword_matches * 15 # Boost for each matching keyword
|
| 103 |
+
reasons.append(f"Context keyword matches: {keyword_matches}/{len(spec['context_keywords'])}")
|
| 104 |
+
|
| 105 |
+
# Direct first cell match
|
| 106 |
if context['first_cell'] and context['first_cell'].upper() == schema_name.upper():
|
| 107 |
score += 100
|
| 108 |
reasons.append(f"Direct first cell match: '{context['first_cell']}'")
|
| 109 |
+
|
| 110 |
+
# Heading pattern matching
|
| 111 |
if spec.get("headings"):
|
| 112 |
for h in spec["headings"]:
|
| 113 |
if fuzzy_match_heading(context['heading'], [h["text"]]):
|
| 114 |
score += 50
|
| 115 |
reasons.append(f"Heading match: '{context['heading']}'")
|
| 116 |
break
|
| 117 |
+
|
| 118 |
+
# Column header matching (important for Summary tables)
|
| 119 |
+
if spec.get("columns"):
|
| 120 |
+
cols = [normalize_text(col) for col in spec["columns"]]
|
| 121 |
+
matches = 0
|
| 122 |
+
for col in cols:
|
| 123 |
+
if any(col.upper() in h.upper() for h in context['headers']):
|
| 124 |
+
matches += 1
|
| 125 |
+
if matches == len(cols):
|
| 126 |
+
score += 60 # High boost for exact column matches
|
| 127 |
+
reasons.append(f"All column headers match: {cols}")
|
| 128 |
+
elif matches > 0:
|
| 129 |
+
score += matches * 20 # Partial column matches
|
| 130 |
+
reasons.append(f"Partial column matches: {matches}/{len(cols)}")
|
| 131 |
+
|
| 132 |
+
# Label matching for left-oriented tables
|
| 133 |
if spec.get("orientation") == "left":
|
| 134 |
labels = [normalize_text(lbl) for lbl in spec["labels"]]
|
| 135 |
matches = 0
|
|
|
|
| 139 |
if matches > 0:
|
| 140 |
score += (matches / len(labels)) * 30
|
| 141 |
reasons.append(f"Left orientation label matches: {matches}/{len(labels)}")
|
| 142 |
+
|
| 143 |
+
# Label matching for row1-oriented tables
|
| 144 |
elif spec.get("orientation") == "row1":
|
| 145 |
labels = [normalize_text(lbl) for lbl in spec["labels"]]
|
| 146 |
matches = 0
|
|
|
|
| 150 |
if matches > 0:
|
| 151 |
score += (matches / len(labels)) * 30
|
| 152 |
reasons.append(f"Row1 orientation header matches: {matches}/{len(labels)}")
|
| 153 |
+
|
| 154 |
+
# Special handling for Declaration tables
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
if schema_name == "Operator Declaration" and context['first_cell'].upper() == "PRINT NAME":
|
| 156 |
if "OPERATOR DECLARATION" in context['heading'].upper():
|
| 157 |
score += 80
|
|
|
|
| 159 |
elif any("MANAGER" in cell.upper() for cell in context['all_cells']):
|
| 160 |
score += 60
|
| 161 |
reasons.append("Manager found in cells (likely Operator Declaration)")
|
| 162 |
+
|
| 163 |
if schema_name == "NHVAS Approved Auditor Declaration" and context['first_cell'].upper() == "PRINT NAME":
|
| 164 |
if any("MANAGER" in cell.upper() for cell in context['all_cells']):
|
| 165 |
score -= 50 # Penalty because auditors shouldn't be managers
|
| 166 |
reasons.append("Penalty: Manager found (not auditor)")
|
| 167 |
+
|
| 168 |
return score, reasons
|
| 169 |
|
| 170 |
def match_table_schema(tbl):
|