Shami96 committed on
Commit
c4e2e43
·
verified ·
1 Parent(s): 3f0b8d7

Update extract_red_text.py

Browse files
Files changed (1) hide show
  1. extract_red_text.py +57 -10
extract_red_text.py CHANGED
@@ -68,18 +68,68 @@ def get_table_context(tbl):
68
  }
69
 
70
  def calculate_schema_match_score(schema_name, spec, context):
71
- """Calculate match score for a schema against table context"""
72
  score = 0
73
  reasons = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  if context['first_cell'] and context['first_cell'].upper() == schema_name.upper():
75
  score += 100
76
  reasons.append(f"Direct first cell match: '{context['first_cell']}'")
 
 
77
  if spec.get("headings"):
78
  for h in spec["headings"]:
79
  if fuzzy_match_heading(context['heading'], [h["text"]]):
80
  score += 50
81
  reasons.append(f"Heading match: '{context['heading']}'")
82
  break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  if spec.get("orientation") == "left":
84
  labels = [normalize_text(lbl) for lbl in spec["labels"]]
85
  matches = 0
@@ -89,6 +139,8 @@ def calculate_schema_match_score(schema_name, spec, context):
89
  if matches > 0:
90
  score += (matches / len(labels)) * 30
91
  reasons.append(f"Left orientation label matches: {matches}/{len(labels)}")
 
 
92
  elif spec.get("orientation") == "row1":
93
  labels = [normalize_text(lbl) for lbl in spec["labels"]]
94
  matches = 0
@@ -98,15 +150,8 @@ def calculate_schema_match_score(schema_name, spec, context):
98
  if matches > 0:
99
  score += (matches / len(labels)) * 30
100
  reasons.append(f"Row1 orientation header matches: {matches}/{len(labels)}")
101
- if spec.get("columns"):
102
- cols = [normalize_text(col) for col in spec["columns"]]
103
- matches = 0
104
- for col in cols:
105
- if any(col.upper() in h.upper() for h in context['headers']):
106
- matches += 1
107
- if matches == len(cols):
108
- score += 40
109
- reasons.append(f"All column headers match: {cols}")
110
  if schema_name == "Operator Declaration" and context['first_cell'].upper() == "PRINT NAME":
111
  if "OPERATOR DECLARATION" in context['heading'].upper():
112
  score += 80
@@ -114,10 +159,12 @@ def calculate_schema_match_score(schema_name, spec, context):
114
  elif any("MANAGER" in cell.upper() for cell in context['all_cells']):
115
  score += 60
116
  reasons.append("Manager found in cells (likely Operator Declaration)")
 
117
  if schema_name == "NHVAS Approved Auditor Declaration" and context['first_cell'].upper() == "PRINT NAME":
118
  if any("MANAGER" in cell.upper() for cell in context['all_cells']):
119
  score -= 50 # Penalty because auditors shouldn't be managers
120
  reasons.append("Penalty: Manager found (not auditor)")
 
121
  return score, reasons
122
 
123
  def match_table_schema(tbl):
 
68
  }
69
 
70
  def calculate_schema_match_score(schema_name, spec, context):
71
+ """Enhanced calculate match score for a schema against table context with Summary table detection"""
72
  score = 0
73
  reasons = []
74
+
75
+ # 🎯 CRITICAL: Boost Summary schemas when DETAILS column is detected
76
+ if "Summary" in schema_name and "details" in " ".join(context['headers']).lower():
77
+ score += 100 # Very high boost for summary tables with DETAILS column
78
+ reasons.append(f"Summary schema with DETAILS column - perfect match")
79
+
80
+ # 🎯 CRITICAL: Heavily penalize non-Summary schemas when DETAILS column is present
81
+ if "Summary" not in schema_name and "details" in " ".join(context['headers']).lower():
82
+ score -= 75 # Heavy penalty to prevent basic schemas from matching summary tables
83
+ reasons.append(f"Non-summary schema penalized for DETAILS column presence")
84
+
85
+ # Check for context exclusions (prevents basic Management from matching Summary tables)
86
+ if spec.get("context_exclusions"):
87
+ table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
88
+ for exclusion in spec["context_exclusions"]:
89
+ if exclusion.lower() in table_text:
90
+ score -= 50
91
+ reasons.append(f"Context exclusion penalty: '{exclusion}' found")
92
+
93
+ # Check for context keywords (boosts matching for relevant tables)
94
+ if spec.get("context_keywords"):
95
+ table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
96
+ keyword_matches = 0
97
+ for keyword in spec["context_keywords"]:
98
+ if keyword.lower() in table_text:
99
+ keyword_matches += 1
100
+
101
+ if keyword_matches > 0:
102
+ score += keyword_matches * 15 # Boost for each matching keyword
103
+ reasons.append(f"Context keyword matches: {keyword_matches}/{len(spec['context_keywords'])}")
104
+
105
+ # Direct first cell match
106
  if context['first_cell'] and context['first_cell'].upper() == schema_name.upper():
107
  score += 100
108
  reasons.append(f"Direct first cell match: '{context['first_cell']}'")
109
+
110
+ # Heading pattern matching
111
  if spec.get("headings"):
112
  for h in spec["headings"]:
113
  if fuzzy_match_heading(context['heading'], [h["text"]]):
114
  score += 50
115
  reasons.append(f"Heading match: '{context['heading']}'")
116
  break
117
+
118
+ # Column header matching (important for Summary tables)
119
+ if spec.get("columns"):
120
+ cols = [normalize_text(col) for col in spec["columns"]]
121
+ matches = 0
122
+ for col in cols:
123
+ if any(col.upper() in h.upper() for h in context['headers']):
124
+ matches += 1
125
+ if matches == len(cols):
126
+ score += 60 # High boost for exact column matches
127
+ reasons.append(f"All column headers match: {cols}")
128
+ elif matches > 0:
129
+ score += matches * 20 # Partial column matches
130
+ reasons.append(f"Partial column matches: {matches}/{len(cols)}")
131
+
132
+ # Label matching for left-oriented tables
133
  if spec.get("orientation") == "left":
134
  labels = [normalize_text(lbl) for lbl in spec["labels"]]
135
  matches = 0
 
139
  if matches > 0:
140
  score += (matches / len(labels)) * 30
141
  reasons.append(f"Left orientation label matches: {matches}/{len(labels)}")
142
+
143
+ # Label matching for row1-oriented tables
144
  elif spec.get("orientation") == "row1":
145
  labels = [normalize_text(lbl) for lbl in spec["labels"]]
146
  matches = 0
 
150
  if matches > 0:
151
  score += (matches / len(labels)) * 30
152
  reasons.append(f"Row1 orientation header matches: {matches}/{len(labels)}")
153
+
154
+ # Special handling for Declaration tables
 
 
 
 
 
 
 
155
  if schema_name == "Operator Declaration" and context['first_cell'].upper() == "PRINT NAME":
156
  if "OPERATOR DECLARATION" in context['heading'].upper():
157
  score += 80
 
159
  elif any("MANAGER" in cell.upper() for cell in context['all_cells']):
160
  score += 60
161
  reasons.append("Manager found in cells (likely Operator Declaration)")
162
+
163
  if schema_name == "NHVAS Approved Auditor Declaration" and context['first_cell'].upper() == "PRINT NAME":
164
  if any("MANAGER" in cell.upper() for cell in context['all_cells']):
165
  score -= 50 # Penalty because auditors shouldn't be managers
166
  reasons.append("Penalty: Manager found (not auditor)")
167
+
168
  return score, reasons
169
 
170
  def match_table_schema(tbl):