VanguardAI committed
Commit 0fe4722 · verified · 1 Parent(s): 67294ce

Update app.py

Files changed (1)
  1. app.py +198 -35
app.py CHANGED
@@ -749,6 +749,108 @@ def analyze_image_line_characteristics(image: Image.Image) -> Dict[str, float]:
     }
 
 
+def validate_region_contains_text(image: Image.Image, bbox: List[int], min_text_density: float = 0.05) -> bool:
+    """
+    Validate that a bounding box region actually contains text (not empty space).
+
+    Args:
+        image: Original image
+        bbox: Bounding box [x1, y1, x2, y2]
+        min_text_density: Minimum fraction of pixels that should be text (dark pixels)
+
+    Returns:
+        True if region contains sufficient text, False otherwise
+    """
+    try:
+        x1, y1, x2, y2 = bbox
+        x1, y1 = max(0, int(x1)), max(0, int(y1))
+        x2, y2 = min(image.width, int(x2)), min(image.height, int(y2))
+
+        if x2 <= x1 or y2 <= y1:
+            return False
+
+        # Crop region
+        crop = image.crop((x1, y1, x2, y2))
+        gray = crop.convert('L')
+        img_array = np.array(gray)
+
+        if img_array.size == 0:
+            return False
+
+        # Calculate text density (fraction of dark pixels)
+        # For handwriting/text, we expect at least some dark pixels
+        dark_pixels = np.sum(img_array < 128)  # Pixels darker than middle gray
+        total_pixels = img_array.size
+        text_density = dark_pixels / total_pixels if total_pixels > 0 else 0
+
+        # Also check for minimum height/width (avoid tiny regions)
+        height = y2 - y1
+        width = x2 - x1
+        min_dimension = min(height, width)
+
+        # Reject if:
+        # 1. Text density too low (mostly empty space)
+        # 2. Region too small (likely noise)
+        if text_density < min_text_density:
+            return False
+
+        if min_dimension < 15:  # Too small to be a real text line
+            return False
+
+        return True
+
+    except Exception as e:
+        print(f" ⚠️ Error validating region: {e}")
+        return False
+
+
+def filter_empty_regions(image: Image.Image, layout_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """
+    Filter out regions that are empty spaces, noise, or false positives.
+
+    This removes:
+    - Regions with very low text density (empty margins/spaces)
+    - Regions that are too small
+    - Regions that are likely noise artifacts
+    """
+    filtered = []
+    removed_count = 0
+
+    for item in layout_data:
+        bbox = item.get('bbox', [])
+        category = item.get('category', '')
+        text = item.get('text', '').strip()
+
+        # Skip if no bbox
+        if not bbox or len(bbox) != 4:
+            continue
+
+        # For Text/List-item regions, validate they contain actual text
+        if category in ['Text', 'List-item']:
+            if not validate_region_contains_text(image, bbox):
+                print(f" 🗑️ Removing empty region: {category} bbox={bbox}")
+                removed_count += 1
+                continue
+
+        # Even if region passes validation, check if text is meaningful
+        # Remove regions with very short or meaningless text
+        if category in ['Text', 'List-item']:
+            # Remove if text is empty or too short (likely noise)
+            if not text or len(text.strip()) < 2:
+                # But only if it also failed validation
+                if not validate_region_contains_text(image, bbox, min_text_density=0.03):
+                    print(f" 🗑️ Removing empty/noise region: {category} bbox={bbox}")
+                    removed_count += 1
+                    continue
+
+        filtered.append(item)
+
+    if removed_count > 0:
+        print(f"🗑️ Filtered out {removed_count} empty/noise regions")
+
+    return filtered
+
+
 def detect_line_spacing(image: Image.Image, bbox: List[int]) -> Optional[float]:
     """
     Detect average line spacing in a text region using horizontal projection analysis.
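The density check added above is the core of the new validation. A minimal standalone sketch of the same idea (the synthetic image and numbers below are illustrative, not taken from app.py):

# Sketch: how a dark-pixel density check separates a blank band from a text band.
# Assumes Pillow and NumPy, as in app.py; 0.05 mirrors the default threshold above.
import numpy as np
from PIL import Image, ImageDraw

page = Image.new('L', (400, 200), color=255)   # white page
draw = ImageDraw.Draw(page)
draw.rectangle([20, 80, 380, 110], fill=0)     # a dark "text" band

def text_density(img, box):
    arr = np.array(img.crop(box).convert('L'))
    return float(np.sum(arr < 128)) / arr.size if arr.size else 0.0

print(text_density(page, (0, 0, 400, 60)))     # ~0.00 -> below 0.05, would be rejected
print(text_density(page, (0, 70, 400, 120)))   # ~0.56 -> well above 0.05, would be kept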
@@ -817,6 +919,8 @@ def detect_line_spacing(image: Image.Image, bbox: List[int]) -> Optional[float]:
 def detect_actual_line_breaks_in_region(image: Image.Image, bbox: List[int]) -> List[int]:
     """
     Detect actual line break positions within a text region using horizontal projection.
+    Only detects breaks if region has sufficient text density (not empty space).
+
     Returns list of y-coordinates where lines break.
     """
     try:
@@ -828,23 +932,33 @@ def detect_actual_line_breaks_in_region(image: Image.Image, bbox: List[int]) -> List[int]:
         if img_array.size == 0:
             return []
 
+        # FIRST: Validate region has actual text (not empty space)
+        total_pixels = img_array.size
+        dark_pixels = np.sum(img_array < 128)
+        text_density = dark_pixels / total_pixels if total_pixels > 0 else 0
+
+        # Require minimum text density to avoid false positives on empty regions
+        if text_density < 0.03:  # Less than 3% dark pixels = likely empty
+            return []
+
         # Horizontal projection
         row_sums = np.sum(img_array < 128, axis=1)
 
-        if len(row_sums) < 5:
+        if len(row_sums) < 10:  # Need enough rows
            return []
 
         # Find valleys (spaces between lines) and peaks (text lines)
         mean_val = np.mean(row_sums)
         std_val = np.std(row_sums)
-        text_threshold = max(mean_val * 0.25, mean_val - std_val * 0.4)
-        space_threshold = mean_val * 0.15  # Much lower for spaces
+
+        # More aggressive thresholds to avoid false positives
+        text_threshold = max(mean_val * 0.3, mean_val - std_val * 0.5)
+        space_threshold = mean_val * 0.1  # Very low for actual spaces
 
         # Find text rows and space rows
         text_rows = np.where(row_sums > text_threshold)[0]
-        space_rows = np.where(row_sums < space_threshold)[0]
 
-        if len(text_rows) < 2:
+        if len(text_rows) < 5:  # Need substantial text rows
             return []
 
         # Group text rows into lines
@@ -855,23 +969,28 @@ def detect_actual_line_breaks_in_region(image: Image.Image, bbox: List[int]) -> List[int]:
             if text_rows[i] - text_rows[i-1] <= 3:
                 current_group.append(text_rows[i])
             else:
-                if len(current_group) > 0:
+                if len(current_group) >= 3:  # Require minimum group size
                     line_groups.append(current_group)
                 current_group = [text_rows[i]]
 
-        if len(current_group) > 0:
+        if len(current_group) >= 3:
             line_groups.append(current_group)
 
         if len(line_groups) < 2:
             return []  # Single line or can't detect
 
         # Find break points (midpoints between line groups)
+        # Require minimum gap between lines to avoid false splits
         break_points = []
         for i in range(len(line_groups) - 1):
             last_row_of_line1 = max(line_groups[i])
             first_row_of_line2 = min(line_groups[i+1])
-            break_point = (last_row_of_line1 + first_row_of_line2) // 2
-            break_points.append(y1 + break_point)  # Convert to image coordinates
+            gap = first_row_of_line2 - last_row_of_line1
+
+            # Only split if gap is substantial (at least 5 pixels)
+            if gap >= 5:
+                break_point = (last_row_of_line1 + first_row_of_line2) // 2
+                break_points.append(y1 + break_point)  # Convert to image coordinates
 
         return break_points
 
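For reference, the horizontal-projection logic these thresholds tune can be shown in isolation. A simplified sketch (the array, thresholds, and grouping distance below are illustrative; this is not the app.py implementation):

# Sketch: find line breaks from a horizontal projection of dark pixels.
import numpy as np

# Fake binary strip: rows 0-9 and 20-29 contain "ink", rows 10-19 are blank.
img = np.full((30, 100), 255, dtype=np.uint8)
img[0:10, 10:90] = 0
img[20:30, 10:90] = 0

row_sums = np.sum(img < 128, axis=1)                 # dark pixels per row
text_rows = np.where(row_sums > 0.3 * row_sums.mean())[0]

# Group consecutive text rows into lines, then split at the midpoint of large gaps.
groups, current = [], [text_rows[0]]
for r in text_rows[1:]:
    if r - current[-1] <= 3:
        current.append(r)
    else:
        groups.append(current)
        current = [r]
groups.append(current)

breaks = [(max(a) + min(b)) // 2 for a, b in zip(groups, groups[1:]) if min(b) - max(a) >= 5]
print(breaks)  # -> [14], the single break between the two bands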
 
@@ -932,6 +1051,12 @@ def split_text_regions_into_lines(
         height = y2 - y1
         width = x2 - x1
 
+        # FIRST: Validate region actually contains text before trying to split
+        if not validate_region_contains_text(image, bbox, min_text_density=0.03):
+            print(f" Region: {category} (h={height}px) - Empty/noise region, skipping split")
+            # Don't add empty regions - they'll be filtered out later
+            continue
+
         # ALWAYS check if region contains multiple lines, regardless of height
         # Use image analysis to detect actual line breaks
         line_breaks = detect_actual_line_breaks_in_region(image, bbox)
@@ -944,23 +1069,31 @@ def split_text_regions_into_lines(
             current_y = y1
             for i, break_y in enumerate(line_breaks):
                 # Create line from current_y to break_y
+                new_bbox = [x1, int(current_y), x2, int(break_y)]
+
+                # Validate split region contains text before adding
+                if validate_region_contains_text(image, new_bbox, min_text_density=0.03):
+                    new_item = item.copy()
+                    new_item['bbox'] = new_bbox
+                    new_item['text'] = ""  # Will be re-OCR'd
+                    new_item['split_from_parent'] = True
+                    new_item['needs_reocr'] = True
+                    new_item['line_number'] = i + 1
+                    result.append(new_item)
+
+                current_y = break_y
+
+            # Add last line
+            final_bbox = [x1, int(current_y), x2, y2]
+            if validate_region_contains_text(image, final_bbox, min_text_density=0.03):
                 new_item = item.copy()
-                new_item['bbox'] = [x1, int(current_y), x2, int(break_y)]
-                new_item['text'] = ""  # Will be re-OCR'd
+                new_item['bbox'] = final_bbox
+                new_item['text'] = ""
                 new_item['split_from_parent'] = True
                 new_item['needs_reocr'] = True
-                new_item['line_number'] = i + 1
+                new_item['line_number'] = len(line_breaks) + 1
                 result.append(new_item)
-                current_y = break_y
 
-            # Add last line
-            new_item = item.copy()
-            new_item['bbox'] = [x1, int(current_y), x2, y2]
-            new_item['text'] = ""
-            new_item['split_from_parent'] = True
-            new_item['needs_reocr'] = True
-            new_item['line_number'] = len(line_breaks) + 1
-            result.append(new_item)
             split_count += 1
 
         elif height > adaptive_max:
@@ -984,8 +1117,6 @@ def split_text_regions_into_lines(
 
             # Split geometrically
             for i in range(estimated_lines):
-                new_item = item.copy()
-
                 if i == 0:
                     new_y1 = y1
                     new_y2 = y1 + line_height + padding
@@ -1000,12 +1131,17 @@ def split_text_regions_into_lines(
                 new_y2 = min(y2, int(new_y2))
 
                 if new_y2 > new_y1:
-                    new_item['bbox'] = [x1, new_y1, x2, new_y2]
-                    new_item['text'] = ""
-                    new_item['split_from_parent'] = True
-                    new_item['needs_reocr'] = True
-                    new_item['line_number'] = i + 1
-                    result.append(new_item)
+                    new_bbox = [x1, new_y1, x2, new_y2]
+
+                    # Validate split region contains text before adding
+                    if validate_region_contains_text(image, new_bbox, min_text_density=0.03):
+                        new_item = item.copy()
+                        new_item['bbox'] = new_bbox
+                        new_item['text'] = ""
+                        new_item['split_from_parent'] = True
+                        new_item['needs_reocr'] = True
+                        new_item['line_number'] = i + 1
+                        result.append(new_item)
 
                     split_count += 1
 
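The same gate is applied to every candidate slice produced by the geometric fallback: a slice is only appended when it actually contains ink. A toy sketch of that per-slice check (helper name, image, and dimensions are hypothetical):

# Sketch: slice a tall bbox into line-height bands and keep only bands that contain ink.
import numpy as np
from PIL import Image, ImageDraw

page = Image.new('L', (300, 120), color=255)
draw = ImageDraw.Draw(page)
draw.rectangle([10, 10, 290, 30], fill=0)      # line 1
draw.rectangle([10, 90, 290, 110], fill=0)     # line 2 (the middle band stays empty)

def contains_text(img, box, min_density=0.03):
    arr = np.array(img.crop(box).convert('L'))
    return arr.size > 0 and np.sum(arr < 128) / arr.size >= min_density

bbox, line_height = (0, 0, 300, 120), 40
kept = []
for i in range(3):                             # 120 px tall region / 40 px per band
    band = (bbox[0], bbox[1] + i * line_height, bbox[2], bbox[1] + (i + 1) * line_height)
    if contains_text(page, band):
        kept.append(band)

print(kept)  # the empty middle band is dropped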
 
@@ -1246,9 +1382,11 @@ def process_image(
         # Try to parse JSON output
         layout_data = json.loads(raw_output)
 
-        # 📏 LINE-LEVEL SPLITTING: Split large text regions into individual lines
-        # This ensures each line gets its own bounding box for easier verification
+        # 🗑️ FIRST FILTER: Remove empty regions and false positives from initial detection
         print(f"\n📋 Initial layout: {len(layout_data)} regions detected")
+        layout_data = filter_empty_regions(image, layout_data)
+        print(f"✅ After initial filtering: {len(layout_data)} regions remaining")
+
         for idx, item in enumerate(layout_data):
             bbox = item.get('bbox', [])
             text = item.get('text', '')[:50]
@@ -1259,6 +1397,11 @@ def process_image(
         layout_data_before = len(layout_data)
         layout_data = split_text_regions_into_lines(image, layout_data)
         print(f"📏 After splitting: {layout_data_before} → {len(layout_data)} regions")
+
+        # 🗑️ SECOND FILTER: Remove any empty regions created during splitting
+        layout_data = filter_empty_regions(image, layout_data)
+        print(f"✅ After post-split filtering: {len(layout_data)} regions remaining")
+
     except Exception as e:
         print(f"⚠️ Warning: Could not split text regions: {e}")
         traceback.print_exc()
@@ -1268,6 +1411,7 @@ def process_image(
     regions_needing_reocr = [item for item in layout_data if item.get('needs_reocr')]
     if regions_needing_reocr:
         print(f"🔄 Re-OCRing {len(regions_needing_reocr)} split line regions for accurate per-line text...")
+        valid_regions = []
        for idx, item in enumerate(regions_needing_reocr):
             try:
                 bbox = item.get('bbox', [])
@@ -1279,6 +1423,13 @@ def process_image(
                 if x2 <= x1 or y2 <= y1:
                     continue
 
+                # 🚫 VALIDATE BEFORE RE-OCR: Skip empty regions
+                if not validate_region_contains_text(image, bbox, min_text_density=0.03):
+                    print(f" ⚠️ Skipping line {idx+1}: empty region (bbox={bbox})")
+                    # Mark for removal
+                    item['_should_remove'] = True
+                    continue
+
                 # Add small safety margin to ensure we capture full text
                 margin = 2  # Small margin to avoid edge clipping
                 crop_x1 = max(0, x1 - margin)
@@ -1292,8 +1443,7 @@ def process_image(
                 # Validate crop is reasonable size
                 if crop_img.size[0] < 10 or crop_img.size[1] < 10:
                     print(f" ⚠️ Skipping line {idx+1}: crop too small ({crop_img.size})")
-                    item['text'] = "[Crop too small]"
-                    item['confidence'] = 0.0
+                    item['_should_remove'] = True
                     continue
 
                 # Apply preprocessing to enhance handwriting quality
@@ -1321,16 +1471,25 @@ def process_image(
                 # Keep: Arabic letters, Arabic numbers, spaces, basic punctuation
                 line_text = re.sub(r'[a-zA-Z]+', '', line_text).strip()
 
+                # If text is empty or too short after filtering, mark for removal
+                if not line_text or len(line_text.strip()) < 2:
+                    print(f" ⚠️ Skipping line {idx+1}: no meaningful text after filtering")
+                    item['_should_remove'] = True
+                    continue
+
                 item['text'] = line_text
                 item['confidence'] = line_conf
                 item['reocr_completed'] = True
+                valid_regions.append(item)
 
                 print(f" ✓ Line {idx+1}/{len(regions_needing_reocr)}: '{line_text[:50]}...' (conf: {line_conf:.1f}%)")
             except Exception as e:
                 print(f" ✗ Error re-OCRing line {idx}: {e}")
                 traceback.print_exc()
-                item['text'] = "[OCR Failed]"
-                item['confidence'] = 0.0
+                item['_should_remove'] = True
+
+        # Remove regions marked for removal
+        layout_data = [item for item in layout_data if not item.get('_should_remove', False)]
 
         print(f"\n✅ Re-OCR complete. Final layout has {len(layout_data)} regions:")
         for idx, item in enumerate(layout_data):
@@ -1338,6 +1497,10 @@ def process_image(
             conf = item.get('confidence', 0)
             reocr = item.get('reocr_completed', False)
             print(f" Region {idx+1}: '{text}...' (conf={conf:.1f}%, re-OCR={reocr})")
+
+        # 🗑️ FINAL FILTER: Remove any remaining empty/invalid regions
+        layout_data = filter_empty_regions(image, layout_data)
+        print(f"✅ After final filtering: {len(layout_data)} regions remaining")
 
     # 🎯 INTELLIGENT CONFIDENCE SCORING
     # Count text regions to determine if per-region scoring is feasible
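Taken together, process_image now runs filter → split → filter → re-OCR → final filter, and lines that fail re-OCR or come back empty are marked for removal instead of being kept with placeholder text. A small sketch of that mark-then-drop pattern (the data is made up; no OCR is involved):

# Sketch: mark failed/empty entries during iteration, then drop them in one pass,
# mirroring the '_should_remove' flag used above.
regions = [
    {'text': 'first recognised line', 'needs_reocr': True},
    {'text': '',                      'needs_reocr': True},   # empty after re-OCR
    {'text': 'untouched region',      'needs_reocr': False},
]
for item in regions:
    if item['needs_reocr'] and len(item['text'].strip()) < 2:
        item['_should_remove'] = True            # mark instead of mutating the list mid-loop
regions = [item for item in regions if not item.get('_should_remove', False)]
print(len(regions))  # -> 2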
 