VanguardAI committed
Commit 0fe4722 · verified · 1 Parent(s): 67294ce

Update app.py

Files changed (1)
  1. app.py +198 -35
app.py CHANGED
@@ -749,6 +749,108 @@ def analyze_image_line_characteristics(image: Image.Image) -> Dict[str, float]:
     }
 
 
+def validate_region_contains_text(image: Image.Image, bbox: List[int], min_text_density: float = 0.05) -> bool:
+    """
+    Validate that a bounding box region actually contains text (not empty space).
+
+    Args:
+        image: Original image
+        bbox: Bounding box [x1, y1, x2, y2]
+        min_text_density: Minimum fraction of pixels that should be text (dark pixels)
+
+    Returns:
+        True if region contains sufficient text, False otherwise
+    """
+    try:
+        x1, y1, x2, y2 = bbox
+        x1, y1 = max(0, int(x1)), max(0, int(y1))
+        x2, y2 = min(image.width, int(x2)), min(image.height, int(y2))
+
+        if x2 <= x1 or y2 <= y1:
+            return False
+
+        # Crop region
+        crop = image.crop((x1, y1, x2, y2))
+        gray = crop.convert('L')
+        img_array = np.array(gray)
+
+        if img_array.size == 0:
+            return False
+
+        # Calculate text density (fraction of dark pixels)
+        # For handwriting/text, we expect at least some dark pixels
+        dark_pixels = np.sum(img_array < 128)  # Pixels darker than middle gray
+        total_pixels = img_array.size
+        text_density = dark_pixels / total_pixels if total_pixels > 0 else 0
+
+        # Also check for minimum height/width (avoid tiny regions)
+        height = y2 - y1
+        width = x2 - x1
+        min_dimension = min(height, width)
+
+        # Reject if:
+        # 1. Text density too low (mostly empty space)
+        # 2. Region too small (likely noise)
+        if text_density < min_text_density:
+            return False
+
+        if min_dimension < 15:  # Too small to be a real text line
+            return False
+
+        return True
+
+    except Exception as e:
+        print(f" ⚠️ Error validating region: {e}")
+        return False
+
+
+def filter_empty_regions(image: Image.Image, layout_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """
+    Filter out regions that are empty spaces, noise, or false positives.
+
+    This removes:
+    - Regions with very low text density (empty margins/spaces)
+    - Regions that are too small
+    - Regions that are likely noise artifacts
+    """
+    filtered = []
+    removed_count = 0
+
+    for item in layout_data:
+        bbox = item.get('bbox', [])
+        category = item.get('category', '')
+        text = item.get('text', '').strip()
+
+        # Skip if no bbox
+        if not bbox or len(bbox) != 4:
+            continue
+
+        # For Text/List-item regions, validate they contain actual text
+        if category in ['Text', 'List-item']:
+            if not validate_region_contains_text(image, bbox):
+                print(f" 🗑️ Removing empty region: {category} bbox={bbox}")
+                removed_count += 1
+                continue
+
+        # Even if region passes validation, check if text is meaningful
+        # Remove regions with very short or meaningless text
+        if category in ['Text', 'List-item']:
+            # Remove if text is empty or too short (likely noise)
+            if not text or len(text.strip()) < 2:
+                # But only if it also failed validation
+                if not validate_region_contains_text(image, bbox, min_text_density=0.03):
+                    print(f" 🗑️ Removing empty/noise region: {category} bbox={bbox}")
+                    removed_count += 1
+                    continue
+
+        filtered.append(item)
+
+    if removed_count > 0:
+        print(f"🗑️ Filtered out {removed_count} empty/noise regions")
+
+    return filtered
+
+
 def detect_line_spacing(image: Image.Image, bbox: List[int]) -> Optional[float]:
     """
     Detect average line spacing in a text region using horizontal projection analysis.
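The density check added above is the core of the new validation. A minimal standalone sketch of the same idea (the synthetic image and numbers below are illustrative, not taken from app.py):

# Sketch: how a dark-pixel density check separates a blank band from a text band.
# Assumes Pillow and NumPy, as in app.py; 0.05 mirrors the default threshold above.
import numpy as np
from PIL import Image, ImageDraw

page = Image.new('L', (400, 200), color=255)   # white page
draw = ImageDraw.Draw(page)
draw.rectangle([20, 80, 380, 110], fill=0)     # a dark "text" band

def text_density(img, box):
    arr = np.array(img.crop(box).convert('L'))
    return float(np.sum(arr < 128)) / arr.size if arr.size else 0.0

print(text_density(page, (0, 0, 400, 60)))     # ~0.00 -> below 0.05, would be rejected
print(text_density(page, (0, 70, 400, 120)))   # ~0.56 -> well above 0.05, would be kept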
@@ -817,6 +919,8 @@ def detect_line_spacing(image: Image.Image, bbox: List[int]) -> Optional[float]:
 def detect_actual_line_breaks_in_region(image: Image.Image, bbox: List[int]) -> List[int]:
     """
     Detect actual line break positions within a text region using horizontal projection.
+    Only detects breaks if region has sufficient text density (not empty space).
+
     Returns list of y-coordinates where lines break.
     """
     try:
@@ -828,23 +932,33 @@ def detect_actual_line_breaks_in_region(image: Image.Image, bbox: List[int]) -> List[int]:
         if img_array.size == 0:
             return []
 
+        # FIRST: Validate region has actual text (not empty space)
+        total_pixels = img_array.size
+        dark_pixels = np.sum(img_array < 128)
+        text_density = dark_pixels / total_pixels if total_pixels > 0 else 0
+
+        # Require minimum text density to avoid false positives on empty regions
+        if text_density < 0.03:  # Less than 3% dark pixels = likely empty
+            return []
+
         # Horizontal projection
         row_sums = np.sum(img_array < 128, axis=1)
 
-        if len(row_sums) < 5:
+        if len(row_sums) < 10:  # Need enough rows
            return []
 
         # Find valleys (spaces between lines) and peaks (text lines)
         mean_val = np.mean(row_sums)
         std_val = np.std(row_sums)
-        text_threshold = max(mean_val * 0.25, mean_val - std_val * 0.4)
-        space_threshold = mean_val * 0.15  # Much lower for spaces
+
+        # More aggressive thresholds to avoid false positives
+        text_threshold = max(mean_val * 0.3, mean_val - std_val * 0.5)
+        space_threshold = mean_val * 0.1  # Very low for actual spaces
 
         # Find text rows and space rows
         text_rows = np.where(row_sums > text_threshold)[0]
-        space_rows = np.where(row_sums < space_threshold)[0]
 
-        if len(text_rows) < 2:
+        if len(text_rows) < 5:  # Need substantial text rows
             return []
 
         # Group text rows into lines
@@ -855,23 +969,28 @@ def detect_actual_line_breaks_in_region(image: Image.Image, bbox: List[int]) -> List[int]:
             if text_rows[i] - text_rows[i-1] <= 3:
                 current_group.append(text_rows[i])
             else:
-                if len(current_group) > 0:
+                if len(current_group) >= 3:  # Require minimum group size
                     line_groups.append(current_group)
                 current_group = [text_rows[i]]
 
-        if len(current_group) > 0:
+        if len(current_group) >= 3:
             line_groups.append(current_group)
 
         if len(line_groups) < 2:
             return []  # Single line or can't detect
 
         # Find break points (midpoints between line groups)
+        # Require minimum gap between lines to avoid false splits
         break_points = []
         for i in range(len(line_groups) - 1):
             last_row_of_line1 = max(line_groups[i])
             first_row_of_line2 = min(line_groups[i+1])
-            break_point = (last_row_of_line1 + first_row_of_line2) // 2
-            break_points.append(y1 + break_point)  # Convert to image coordinates
+            gap = first_row_of_line2 - last_row_of_line1
+
+            # Only split if gap is substantial (at least 5 pixels)
+            if gap >= 5:
+                break_point = (last_row_of_line1 + first_row_of_line2) // 2
+                break_points.append(y1 + break_point)  # Convert to image coordinates
 
         return break_points
 
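For reference, the horizontal-projection logic these thresholds tune can be shown in isolation. A simplified sketch (the array, thresholds, and grouping distance below are illustrative; this is not the app.py implementation):

# Sketch: find line breaks from a horizontal projection of dark pixels.
import numpy as np

# Fake binary strip: rows 0-9 and 20-29 contain "ink", rows 10-19 are blank.
img = np.full((30, 100), 255, dtype=np.uint8)
img[0:10, 10:90] = 0
img[20:30, 10:90] = 0

row_sums = np.sum(img < 128, axis=1)                 # dark pixels per row
text_rows = np.where(row_sums > 0.3 * row_sums.mean())[0]

# Group consecutive text rows into lines, then split at the midpoint of large gaps.
groups, current = [], [text_rows[0]]
for r in text_rows[1:]:
    if r - current[-1] <= 3:
        current.append(r)
    else:
        groups.append(current)
        current = [r]
groups.append(current)

breaks = [(max(a) + min(b)) // 2 for a, b in zip(groups, groups[1:]) if min(b) - max(a) >= 5]
print(breaks)  # -> [14], the single break between the two bands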
 
@@ -932,6 +1051,12 @@ def split_text_regions_into_lines(
         height = y2 - y1
         width = x2 - x1
 
+        # FIRST: Validate region actually contains text before trying to split
+        if not validate_region_contains_text(image, bbox, min_text_density=0.03):
+            print(f" Region: {category} (h={height}px) - Empty/noise region, skipping split")
+            # Don't add empty regions - they'll be filtered out later
+            continue
+
         # ALWAYS check if region contains multiple lines, regardless of height
         # Use image analysis to detect actual line breaks
         line_breaks = detect_actual_line_breaks_in_region(image, bbox)
@@ -944,23 +1069,31 @@ def split_text_regions_into_lines(
             current_y = y1
             for i, break_y in enumerate(line_breaks):
                 # Create line from current_y to break_y
+                new_bbox = [x1, int(current_y), x2, int(break_y)]
+
+                # Validate split region contains text before adding
+                if validate_region_contains_text(image, new_bbox, min_text_density=0.03):
+                    new_item = item.copy()
+                    new_item['bbox'] = new_bbox
+                    new_item['text'] = ""  # Will be re-OCR'd
+                    new_item['split_from_parent'] = True
+                    new_item['needs_reocr'] = True
+                    new_item['line_number'] = i + 1
+                    result.append(new_item)
+
+                current_y = break_y
+
+            # Add last line
+            final_bbox = [x1, int(current_y), x2, y2]
+            if validate_region_contains_text(image, final_bbox, min_text_density=0.03):
                 new_item = item.copy()
-                new_item['bbox'] = [x1, int(current_y), x2, int(break_y)]
-                new_item['text'] = ""  # Will be re-OCR'd
+                new_item['bbox'] = final_bbox
+                new_item['text'] = ""
                 new_item['split_from_parent'] = True
                 new_item['needs_reocr'] = True
-                new_item['line_number'] = i + 1
+                new_item['line_number'] = len(line_breaks) + 1
                 result.append(new_item)
-                current_y = break_y
 
-            # Add last line
-            new_item = item.copy()
-            new_item['bbox'] = [x1, int(current_y), x2, y2]
-            new_item['text'] = ""
-            new_item['split_from_parent'] = True
-            new_item['needs_reocr'] = True
-            new_item['line_number'] = len(line_breaks) + 1
-            result.append(new_item)
             split_count += 1
 
         elif height > adaptive_max:
@@ -984,8 +1117,6 @@ def split_text_regions_into_lines(
 
             # Split geometrically
             for i in range(estimated_lines):
-                new_item = item.copy()
-
                 if i == 0:
                     new_y1 = y1
                     new_y2 = y1 + line_height + padding
@@ -1000,12 +1131,17 @@ def split_text_regions_into_lines(
                 new_y2 = min(y2, int(new_y2))
 
                 if new_y2 > new_y1:
-                    new_item['bbox'] = [x1, new_y1, x2, new_y2]
-                    new_item['text'] = ""
-                    new_item['split_from_parent'] = True
-                    new_item['needs_reocr'] = True
-                    new_item['line_number'] = i + 1
-                    result.append(new_item)
+                    new_bbox = [x1, new_y1, x2, new_y2]
+
+                    # Validate split region contains text before adding
+                    if validate_region_contains_text(image, new_bbox, min_text_density=0.03):
+                        new_item = item.copy()
+                        new_item['bbox'] = new_bbox
+                        new_item['text'] = ""
+                        new_item['split_from_parent'] = True
+                        new_item['needs_reocr'] = True
+                        new_item['line_number'] = i + 1
+                        result.append(new_item)
 
                     split_count += 1
 
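The same gate is applied to every candidate slice produced by the geometric fallback: a slice is only appended when it actually contains ink. A toy sketch of that per-slice check (helper name, image, and dimensions are hypothetical):

# Sketch: slice a tall bbox into line-height bands and keep only bands that contain ink.
import numpy as np
from PIL import Image, ImageDraw

page = Image.new('L', (300, 120), color=255)
draw = ImageDraw.Draw(page)
draw.rectangle([10, 10, 290, 30], fill=0)      # line 1
draw.rectangle([10, 90, 290, 110], fill=0)     # line 2 (the middle band stays empty)

def contains_text(img, box, min_density=0.03):
    arr = np.array(img.crop(box).convert('L'))
    return arr.size > 0 and np.sum(arr < 128) / arr.size >= min_density

bbox, line_height = (0, 0, 300, 120), 40
kept = []
for i in range(3):                             # 120 px tall region / 40 px per band
    band = (bbox[0], bbox[1] + i * line_height, bbox[2], bbox[1] + (i + 1) * line_height)
    if contains_text(page, band):
        kept.append(band)

print(kept)  # the empty middle band is dropped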
 
@@ -1246,9 +1382,11 @@ def process_image(
         # Try to parse JSON output
         layout_data = json.loads(raw_output)
 
-        # 📏 LINE-LEVEL SPLITTING: Split large text regions into individual lines
-        # This ensures each line gets its own bounding box for easier verification
+        # 🗑️ FIRST FILTER: Remove empty regions and false positives from initial detection
         print(f"\n📋 Initial layout: {len(layout_data)} regions detected")
+        layout_data = filter_empty_regions(image, layout_data)
+        print(f"✅ After initial filtering: {len(layout_data)} regions remaining")
+
         for idx, item in enumerate(layout_data):
             bbox = item.get('bbox', [])
             text = item.get('text', '')[:50]
@@ -1259,6 +1397,11 @@ def process_image(
         layout_data_before = len(layout_data)
         layout_data = split_text_regions_into_lines(image, layout_data)
         print(f"📏 After splitting: {layout_data_before} → {len(layout_data)} regions")
+
+        # 🗑️ SECOND FILTER: Remove any empty regions created during splitting
+        layout_data = filter_empty_regions(image, layout_data)
+        print(f"✅ After post-split filtering: {len(layout_data)} regions remaining")
+
     except Exception as e:
         print(f"⚠️ Warning: Could not split text regions: {e}")
         traceback.print_exc()
@@ -1268,6 +1411,7 @@ def process_image(
     regions_needing_reocr = [item for item in layout_data if item.get('needs_reocr')]
     if regions_needing_reocr:
         print(f"🔄 Re-OCRing {len(regions_needing_reocr)} split line regions for accurate per-line text...")
+        valid_regions = []
        for idx, item in enumerate(regions_needing_reocr):
             try:
                 bbox = item.get('bbox', [])
@@ -1279,6 +1423,13 @@ def process_image(
                 if x2 <= x1 or y2 <= y1:
                     continue
 
+                # 🚫 VALIDATE BEFORE RE-OCR: Skip empty regions
+                if not validate_region_contains_text(image, bbox, min_text_density=0.03):
+                    print(f" ⚠️ Skipping line {idx+1}: empty region (bbox={bbox})")
+                    # Mark for removal
+                    item['_should_remove'] = True
+                    continue
+
                 # Add small safety margin to ensure we capture full text
                 margin = 2  # Small margin to avoid edge clipping
                 crop_x1 = max(0, x1 - margin)
@@ -1292,8 +1443,7 @@ def process_image(
                 # Validate crop is reasonable size
                 if crop_img.size[0] < 10 or crop_img.size[1] < 10:
                     print(f" ⚠️ Skipping line {idx+1}: crop too small ({crop_img.size})")
-                    item['text'] = "[Crop too small]"
-                    item['confidence'] = 0.0
+                    item['_should_remove'] = True
                     continue
 
                 # Apply preprocessing to enhance handwriting quality
@@ -1321,16 +1471,25 @@ def process_image(
                 # Keep: Arabic letters, Arabic numbers, spaces, basic punctuation
                 line_text = re.sub(r'[a-zA-Z]+', '', line_text).strip()
 
+                # If text is empty or too short after filtering, mark for removal
+                if not line_text or len(line_text.strip()) < 2:
+                    print(f" ⚠️ Skipping line {idx+1}: no meaningful text after filtering")
+                    item['_should_remove'] = True
+                    continue
+
                 item['text'] = line_text
                 item['confidence'] = line_conf
                 item['reocr_completed'] = True
+                valid_regions.append(item)
 
                 print(f" ✓ Line {idx+1}/{len(regions_needing_reocr)}: '{line_text[:50]}...' (conf: {line_conf:.1f}%)")
             except Exception as e:
                 print(f" ✗ Error re-OCRing line {idx}: {e}")
                 traceback.print_exc()
-                item['text'] = "[OCR Failed]"
-                item['confidence'] = 0.0
+                item['_should_remove'] = True
+
+        # Remove regions marked for removal
+        layout_data = [item for item in layout_data if not item.get('_should_remove', False)]
 
         print(f"\n✅ Re-OCR complete. Final layout has {len(layout_data)} regions:")
         for idx, item in enumerate(layout_data):
@@ -1338,6 +1497,10 @@ def process_image(
             conf = item.get('confidence', 0)
             reocr = item.get('reocr_completed', False)
             print(f" Region {idx+1}: '{text}...' (conf={conf:.1f}%, re-OCR={reocr})")
+
+        # 🗑️ FINAL FILTER: Remove any remaining empty/invalid regions
+        layout_data = filter_empty_regions(image, layout_data)
+        print(f"✅ After final filtering: {len(layout_data)} regions remaining")
 
     # 🎯 INTELLIGENT CONFIDENCE SCORING
     # Count text regions to determine if per-region scoring is feasible
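Taken together, process_image now runs filter → split → filter → re-OCR → final filter, and lines that fail re-OCR or come back empty are marked for removal instead of being kept with placeholder text. A small sketch of that mark-then-drop pattern (the data is made up; no OCR is involved):

# Sketch: mark failed/empty entries during iteration, then drop them in one pass,
# mirroring the '_should_remove' flag used above.
regions = [
    {'text': 'first recognised line', 'needs_reocr': True},
    {'text': '',                      'needs_reocr': True},   # empty after re-OCR
    {'text': 'untouched region',      'needs_reocr': False},
]
for item in regions:
    if item['needs_reocr'] and len(item['text'].strip()) < 2:
        item['_should_remove'] = True            # mark instead of mutating the list mid-loop
regions = [item for item in regions if not item.get('_should_remove', False)]
print(len(regions))  # -> 2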
 