Spaces:

Shami96
/

PDF-Data_Extractor

Running

App Files Community

Shami96 committed on Aug 21

Commit

c4e2e43

verified ·

1 Parent(s): 3f0b8d7

Update extract_red_text.py

Browse files

Files changed (1) hide show

extract_red_text.py +57 -10

extract_red_text.py CHANGED Viewed

@@ -68,18 +68,68 @@ def get_table_context(tbl):
     }
 def calculate_schema_match_score(schema_name, spec, context):
-    """Calculate match score for a schema against table context"""
     score = 0
     reasons = []
     if context['first_cell'] and context['first_cell'].upper() == schema_name.upper():
         score += 100
         reasons.append(f"Direct first cell match: '{context['first_cell']}'")
     if spec.get("headings"):
         for h in spec["headings"]:
             if fuzzy_match_heading(context['heading'], [h["text"]]):
                 score += 50
                 reasons.append(f"Heading match: '{context['heading']}'")
                 break
     if spec.get("orientation") == "left":
         labels = [normalize_text(lbl) for lbl in spec["labels"]]
         matches = 0
@@ -89,6 +139,8 @@ def calculate_schema_match_score(schema_name, spec, context):
         if matches > 0:
             score += (matches / len(labels)) * 30
             reasons.append(f"Left orientation label matches: {matches}/{len(labels)}")
     elif spec.get("orientation") == "row1":
         labels = [normalize_text(lbl) for lbl in spec["labels"]]
         matches = 0
@@ -98,15 +150,8 @@ def calculate_schema_match_score(schema_name, spec, context):
         if matches > 0:
             score += (matches / len(labels)) * 30
             reasons.append(f"Row1 orientation header matches: {matches}/{len(labels)}")
-    if spec.get("columns"):
-        cols = [normalize_text(col) for col in spec["columns"]]
-        matches = 0
-        for col in cols:
-            if any(col.upper() in h.upper() for h in context['headers']):
-                matches += 1
-        if matches == len(cols):
-            score += 40
-            reasons.append(f"All column headers match: {cols}")
     if schema_name == "Operator Declaration" and context['first_cell'].upper() == "PRINT NAME":
         if "OPERATOR DECLARATION" in context['heading'].upper():
             score += 80
@@ -114,10 +159,12 @@ def calculate_schema_match_score(schema_name, spec, context):
         elif any("MANAGER" in cell.upper() for cell in context['all_cells']):
             score += 60
             reasons.append("Manager found in cells (likely Operator Declaration)")
     if schema_name == "NHVAS Approved Auditor Declaration" and context['first_cell'].upper() == "PRINT NAME":
         if any("MANAGER" in cell.upper() for cell in context['all_cells']):
             score -= 50  # Penalty because auditors shouldn't be managers
             reasons.append("Penalty: Manager found (not auditor)")
     return score, reasons
 def match_table_schema(tbl):

     }
 def calculate_schema_match_score(schema_name, spec, context):
+    """Enhanced calculate match score for a schema against table context with Summary table detection"""
     score = 0
     reasons = []
+    # 🎯 CRITICAL: Boost Summary schemas when DETAILS column is detected
+    if "Summary" in schema_name and "details" in " ".join(context['headers']).lower():
+        score += 100  # Very high boost for summary tables with DETAILS column
+        reasons.append(f"Summary schema with DETAILS column - perfect match")
+    # 🎯 CRITICAL: Heavily penalize non-Summary schemas when DETAILS column is present
+    if "Summary" not in schema_name and "details" in " ".join(context['headers']).lower():
+        score -= 75  # Heavy penalty to prevent basic schemas from matching summary tables
+        reasons.append(f"Non-summary schema penalized for DETAILS column presence")
+    # Check for context exclusions (prevents basic Management from matching Summary tables)
+    if spec.get("context_exclusions"):
+        table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
+        for exclusion in spec["context_exclusions"]:
+            if exclusion.lower() in table_text:
+                score -= 50
+                reasons.append(f"Context exclusion penalty: '{exclusion}' found")
+    # Check for context keywords (boosts matching for relevant tables)
+    if spec.get("context_keywords"):
+        table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
+        keyword_matches = 0
+        for keyword in spec["context_keywords"]:
+            if keyword.lower() in table_text:
+                keyword_matches += 1
+        if keyword_matches > 0:
+            score += keyword_matches * 15  # Boost for each matching keyword
+            reasons.append(f"Context keyword matches: {keyword_matches}/{len(spec['context_keywords'])}")
+    # Direct first cell match
     if context['first_cell'] and context['first_cell'].upper() == schema_name.upper():
         score += 100
         reasons.append(f"Direct first cell match: '{context['first_cell']}'")
+    # Heading pattern matching
     if spec.get("headings"):
         for h in spec["headings"]:
             if fuzzy_match_heading(context['heading'], [h["text"]]):
                 score += 50
                 reasons.append(f"Heading match: '{context['heading']}'")
                 break
+    # Column header matching (important for Summary tables)
+    if spec.get("columns"):
+        cols = [normalize_text(col) for col in spec["columns"]]
+        matches = 0
+        for col in cols:
+            if any(col.upper() in h.upper() for h in context['headers']):
+                matches += 1
+        if matches == len(cols):
+            score += 60  # High boost for exact column matches
+            reasons.append(f"All column headers match: {cols}")
+        elif matches > 0:
+            score += matches * 20  # Partial column matches
+            reasons.append(f"Partial column matches: {matches}/{len(cols)}")
+    # Label matching for left-oriented tables
     if spec.get("orientation") == "left":
         labels = [normalize_text(lbl) for lbl in spec["labels"]]
         matches = 0
         if matches > 0:
             score += (matches / len(labels)) * 30
             reasons.append(f"Left orientation label matches: {matches}/{len(labels)}")
+    # Label matching for row1-oriented tables
     elif spec.get("orientation") == "row1":
         labels = [normalize_text(lbl) for lbl in spec["labels"]]
         matches = 0
         if matches > 0:
             score += (matches / len(labels)) * 30
             reasons.append(f"Row1 orientation header matches: {matches}/{len(labels)}")
+    # Special handling for Declaration tables
     if schema_name == "Operator Declaration" and context['first_cell'].upper() == "PRINT NAME":
         if "OPERATOR DECLARATION" in context['heading'].upper():
             score += 80
         elif any("MANAGER" in cell.upper() for cell in context['all_cells']):
             score += 60
             reasons.append("Manager found in cells (likely Operator Declaration)")
     if schema_name == "NHVAS Approved Auditor Declaration" and context['first_cell'].upper() == "PRINT NAME":
         if any("MANAGER" in cell.upper() for cell in context['all_cells']):
             score -= 50  # Penalty because auditors shouldn't be managers
             reasons.append("Penalty: Manager found (not auditor)")
     return score, reasons
 def match_table_schema(tbl):