Update app.py
app.py CHANGED
@@ -749,6 +749,108 @@ def analyze_image_line_characteristics(image: Image.Image) -> Dict[str, float]:
     }
 
 
+def validate_region_contains_text(image: Image.Image, bbox: List[int], min_text_density: float = 0.05) -> bool:
+    """
+    Validate that a bounding box region actually contains text (not empty space).
+
+    Args:
+        image: Original image
+        bbox: Bounding box [x1, y1, x2, y2]
+        min_text_density: Minimum fraction of pixels that should be text (dark pixels)
+
+    Returns:
+        True if region contains sufficient text, False otherwise
+    """
+    try:
+        x1, y1, x2, y2 = bbox
+        x1, y1 = max(0, int(x1)), max(0, int(y1))
+        x2, y2 = min(image.width, int(x2)), min(image.height, int(y2))
+
+        if x2 <= x1 or y2 <= y1:
+            return False
+
+        # Crop region
+        crop = image.crop((x1, y1, x2, y2))
+        gray = crop.convert('L')
+        img_array = np.array(gray)
+
+        if img_array.size == 0:
+            return False
+
+        # Calculate text density (fraction of dark pixels)
+        # For handwriting/text, we expect at least some dark pixels
+        dark_pixels = np.sum(img_array < 128) # Pixels darker than middle gray
+        total_pixels = img_array.size
+        text_density = dark_pixels / total_pixels if total_pixels > 0 else 0
+
+        # Also check for minimum height/width (avoid tiny regions)
+        height = y2 - y1
+        width = x2 - x1
+        min_dimension = min(height, width)
+
+        # Reject if:
+        # 1. Text density too low (mostly empty space)
+        # 2. Region too small (likely noise)
+        if text_density < min_text_density:
+            return False
+
+        if min_dimension < 15: # Too small to be a real text line
+            return False
+
+        return True
+
+    except Exception as e:
+        print(f" β οΈ Error validating region: {e}")
+        return False
+
+
+def filter_empty_regions(image: Image.Image, layout_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """
+    Filter out regions that are empty spaces, noise, or false positives.
+
+    This removes:
+    - Regions with very low text density (empty margins/spaces)
+    - Regions that are too small
+    - Regions that are likely noise artifacts
+    """
+    filtered = []
+    removed_count = 0
+
+    for item in layout_data:
+        bbox = item.get('bbox', [])
+        category = item.get('category', '')
+        text = item.get('text', '').strip()
+
+        # Skip if no bbox
+        if not bbox or len(bbox) != 4:
+            continue
+
+        # For Text/List-item regions, validate they contain actual text
+        if category in ['Text', 'List-item']:
+            if not validate_region_contains_text(image, bbox):
+                print(f" ποΈ Removing empty region: {category} bbox={bbox}")
+                removed_count += 1
+                continue
+
+        # Even if region passes validation, check if text is meaningful
+        # Remove regions with very short or meaningless text
+        if category in ['Text', 'List-item']:
+            # Remove if text is empty or too short (likely noise)
+            if not text or len(text.strip()) < 2:
+                # But only if it also failed validation
+                if not validate_region_contains_text(image, bbox, min_text_density=0.03):
+                    print(f" ποΈ Removing empty/noise region: {category} bbox={bbox}")
+                    removed_count += 1
+                    continue
+
+        filtered.append(item)
+
+    if removed_count > 0:
+        print(f"ποΈ Filtered out {removed_count} empty/noise regions")
+
+    return filtered
+
+
 def detect_line_spacing(image: Image.Image, bbox: List[int]) -> Optional[float]:
     """
     Detect average line spacing in a text region using horizontal projection analysis.
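For reference, the density test applied by validate_region_contains_text() can be exercised on its own. The sketch below is illustrative only: it re-states the check with PIL and NumPy, and region_has_text plus the demo images are not part of app.py.

# Illustrative sketch only: a standalone re-statement of the density test used by
# validate_region_contains_text() above. PIL + NumPy, nothing imported from app.py.
import numpy as np
from PIL import Image

def region_has_text(image: Image.Image, bbox, min_text_density=0.05, min_dimension=15) -> bool:
    x1, y1, x2, y2 = (int(v) for v in bbox)
    x1, y1 = max(0, x1), max(0, y1)
    x2, y2 = min(image.width, x2), min(image.height, y2)
    if x2 <= x1 or y2 <= y1:
        return False
    gray = np.array(image.crop((x1, y1, x2, y2)).convert('L'))
    if gray.size == 0:
        return False
    density = np.mean(gray < 128)                     # fraction of "ink" pixels
    big_enough = min(x2 - x1, y2 - y1) >= min_dimension
    return density >= min_text_density and big_enough

if __name__ == "__main__":
    # A blank white region is rejected, a region with a dark text band is accepted.
    blank = Image.new('L', (200, 60), color=255)
    inked = Image.new('L', (200, 60), color=255)
    inked.paste(0, (0, 20, 200, 40))                  # fake a dark text band
    print(region_has_text(blank.convert('RGB'), [0, 0, 200, 60]))   # False
    print(region_has_text(inked.convert('RGB'), [0, 0, 200, 60]))   # True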
@@ -817,6 +919,8 @@ def detect_line_spacing(image: Image.Image, bbox: List[int]) -> Optional[float]:
 def detect_actual_line_breaks_in_region(image: Image.Image, bbox: List[int]) -> List[int]:
     """
     Detect actual line break positions within a text region using horizontal projection.
+    Only detects breaks if region has sufficient text density (not empty space).
+
     Returns list of y-coordinates where lines break.
     """
     try:
@@ -828,23 +932,33 @@ def detect_actual_line_breaks_in_region(image: Image.Image, bbox: List[int]) ->
         if img_array.size == 0:
             return []
 
+        # FIRST: Validate region has actual text (not empty space)
+        total_pixels = img_array.size
+        dark_pixels = np.sum(img_array < 128)
+        text_density = dark_pixels / total_pixels if total_pixels > 0 else 0
+
+        # Require minimum text density to avoid false positives on empty regions
+        if text_density < 0.03: # Less than 3% dark pixels = likely empty
+            return []
+
         # Horizontal projection
         row_sums = np.sum(img_array < 128, axis=1)
 
+        if len(row_sums) < 10: # Need enough rows
             return []
 
         # Find valleys (spaces between lines) and peaks (text lines)
         mean_val = np.mean(row_sums)
         std_val = np.std(row_sums)
+
+        # More aggressive thresholds to avoid false positives
+        text_threshold = max(mean_val * 0.3, mean_val - std_val * 0.5)
+        space_threshold = mean_val * 0.1 # Very low for actual spaces
 
         # Find text rows and space rows
         text_rows = np.where(row_sums > text_threshold)[0]
 
+        if len(text_rows) < 5: # Need substantial text rows
             return []
 
         # Group text rows into lines
@@ -855,23 +969,28 @@ def detect_actual_line_breaks_in_region(image: Image.Image, bbox: List[int]) ->
             if text_rows[i] - text_rows[i-1] <= 3:
                 current_group.append(text_rows[i])
             else:
+                if len(current_group) >= 3: # Require minimum group size
                     line_groups.append(current_group)
                 current_group = [text_rows[i]]
 
+        if len(current_group) >= 3:
             line_groups.append(current_group)
 
         if len(line_groups) < 2:
             return [] # Single line or can't detect
 
         # Find break points (midpoints between line groups)
+        # Require minimum gap between lines to avoid false splits
         break_points = []
         for i in range(len(line_groups) - 1):
             last_row_of_line1 = max(line_groups[i])
             first_row_of_line2 = min(line_groups[i+1])
+            gap = first_row_of_line2 - last_row_of_line1
+
+            # Only split if gap is substantial (at least 5 pixels)
+            if gap >= 5:
+                break_point = (last_row_of_line1 + first_row_of_line2) // 2
+                break_points.append(y1 + break_point) # Convert to image coordinates
 
         return break_points
 
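The break detection is easiest to see on a synthetic page. The following standalone sketch mirrors the projection, grouping, and minimum-gap rules from the hunk above; projection_breaks is a hypothetical helper, not app.py code.

# Illustrative sketch only: the horizontal-projection idea behind
# detect_actual_line_breaks_in_region(), run on a synthetic two-line "page".
import numpy as np

def projection_breaks(binary: np.ndarray, min_gap: int = 5) -> list:
    """binary: 2D bool array, True where a pixel is 'ink'. Returns row indices of breaks."""
    row_sums = binary.sum(axis=1)
    if len(row_sums) < 10:
        return []
    text_threshold = max(row_sums.mean() * 0.3, row_sums.mean() - row_sums.std() * 0.5)
    text_rows = np.where(row_sums > text_threshold)[0]
    if len(text_rows) < 5:
        return []
    # Group consecutive text rows into line bands (a gap of <= 3 rows stays in one band).
    groups, current = [], [text_rows[0]]
    for r in text_rows[1:]:
        if r - current[-1] <= 3:
            current.append(r)
        else:
            if len(current) >= 3:
                groups.append(current)
            current = [r]
    if len(current) >= 3:
        groups.append(current)
    breaks = []
    for a, b in zip(groups, groups[1:]):
        gap = min(b) - max(a)
        if gap >= min_gap:
            breaks.append((max(a) + min(b)) // 2)   # midpoint of the inter-line gap
    return breaks

# Two 10-row text bands separated by an 8-row gap -> one break at row 18.
page = np.zeros((40, 100), dtype=bool)
page[5:15, :] = True
page[23:33, :] = True
print(projection_breaks(page))   # [18]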
@@ -932,6 +1051,12 @@ def split_text_regions_into_lines(
         height = y2 - y1
         width = x2 - x1
 
+        # FIRST: Validate region actually contains text before trying to split
+        if not validate_region_contains_text(image, bbox, min_text_density=0.03):
+            print(f" Region: {category} (h={height}px) - Empty/noise region, skipping split")
+            # Don't add empty regions - they'll be filtered out later
+            continue
+
         # ALWAYS check if region contains multiple lines, regardless of height
         # Use image analysis to detect actual line breaks
         line_breaks = detect_actual_line_breaks_in_region(image, bbox)
@@ -944,23 +1069,31 @@ def split_text_regions_into_lines(
             current_y = y1
             for i, break_y in enumerate(line_breaks):
                 # Create line from current_y to break_y
+                new_bbox = [x1, int(current_y), x2, int(break_y)]
+
+                # Validate split region contains text before adding
+                if validate_region_contains_text(image, new_bbox, min_text_density=0.03):
+                    new_item = item.copy()
+                    new_item['bbox'] = new_bbox
+                    new_item['text'] = "" # Will be re-OCR'd
+                    new_item['split_from_parent'] = True
+                    new_item['needs_reocr'] = True
+                    new_item['line_number'] = i + 1
+                    result.append(new_item)
+
+                current_y = break_y
+
+            # Add last line
+            final_bbox = [x1, int(current_y), x2, y2]
+            if validate_region_contains_text(image, final_bbox, min_text_density=0.03):
                 new_item = item.copy()
+                new_item['bbox'] = final_bbox
+                new_item['text'] = ""
                 new_item['split_from_parent'] = True
                 new_item['needs_reocr'] = True
+                new_item['line_number'] = len(line_breaks) + 1
                 result.append(new_item)
 
             split_count += 1
 
         elif height > adaptive_max:
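The validated split can be illustrated in isolation: each detected break y-coordinate closes one line box, and a slice is kept only if it still contains ink. split_bbox_at_breaks below is a hypothetical stand-in that mirrors the loop above, not the app's function.

# Illustrative sketch only: turning detected break y-coordinates into per-line
# bounding boxes, keeping a slice only if it still looks non-empty.
import numpy as np
from PIL import Image

def split_bbox_at_breaks(image: Image.Image, bbox, breaks, min_density=0.03):
    x1, y1, x2, y2 = bbox
    def dense_enough(b):
        crop = np.array(image.crop(tuple(b)).convert('L'))
        return crop.size > 0 and np.mean(crop < 128) >= min_density
    lines, current_y = [], y1
    for n, break_y in enumerate(breaks, start=1):
        candidate = [x1, int(current_y), x2, int(break_y)]
        if dense_enough(candidate):
            lines.append({'bbox': candidate, 'line_number': n, 'needs_reocr': True})
        current_y = break_y
    last = [x1, int(current_y), x2, y2]
    if dense_enough(last):
        lines.append({'bbox': last, 'line_number': len(breaks) + 1, 'needs_reocr': True})
    return lines

# Usage: a white canvas with two dark bands; the empty tail slice is dropped.
img = Image.new('L', (300, 90), color=255)
img.paste(0, (10, 10, 290, 30))
img.paste(0, (10, 40, 290, 60))
print(split_bbox_at_breaks(img.convert('RGB'), [0, 0, 300, 90], breaks=[35, 70]))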
@@ -984,8 +1117,6 @@ def split_text_regions_into_lines(
 
             # Split geometrically
             for i in range(estimated_lines):
                 if i == 0:
                     new_y1 = y1
                     new_y2 = y1 + line_height + padding
@@ -1000,12 +1131,17 @@ def split_text_regions_into_lines(
                 new_y2 = min(y2, int(new_y2))
 
                 if new_y2 > new_y1:
+                    new_bbox = [x1, new_y1, x2, new_y2]
+
+                    # Validate split region contains text before adding
+                    if validate_region_contains_text(image, new_bbox, min_text_density=0.03):
+                        new_item = item.copy()
+                        new_item['bbox'] = new_bbox
+                        new_item['text'] = ""
+                        new_item['split_from_parent'] = True
+                        new_item['needs_reocr'] = True
+                        new_item['line_number'] = i + 1
+                        result.append(new_item)
 
             split_count += 1
 
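The geometric fallback divides an over-tall region into evenly spaced slices. estimated_lines, line_height and padding are computed outside the hunks shown, so the sketch below assumes simple values for them; geometric_slices is illustrative, not the app's code.

# Illustrative sketch only: the geometric fallback when no clear projection breaks
# are found -- slice an over-tall region into evenly spaced, slightly padded line boxes.
def geometric_slices(bbox, estimated_lines, padding=2):
    x1, y1, x2, y2 = bbox
    line_height = (y2 - y1) / estimated_lines        # assumed definition for the demo
    slices = []
    for i in range(estimated_lines):
        new_y1 = y1 if i == 0 else int(y1 + i * line_height - padding)
        new_y2 = min(y2, int(y1 + (i + 1) * line_height + padding))
        if new_y2 > new_y1:
            slices.append([x1, new_y1, x2, new_y2])
    return slices

print(geometric_slices([0, 100, 500, 220], estimated_lines=3))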
@@ -1246,9 +1382,11 @@ def process_image(
         # Try to parse JSON output
         layout_data = json.loads(raw_output)
 
+        # ποΈ FIRST FILTER: Remove empty regions and false positives from initial detection
         print(f"\nπ Initial layout: {len(layout_data)} regions detected")
+        layout_data = filter_empty_regions(image, layout_data)
+        print(f"β After initial filtering: {len(layout_data)} regions remaining")
+
         for idx, item in enumerate(layout_data):
             bbox = item.get('bbox', [])
             text = item.get('text', '')[:50]
@@ -1259,6 +1397,11 @@ def process_image(
             layout_data_before = len(layout_data)
             layout_data = split_text_regions_into_lines(image, layout_data)
             print(f"π After splitting: {layout_data_before} β {len(layout_data)} regions")
+
+            # ποΈ SECOND FILTER: Remove any empty regions created during splitting
+            layout_data = filter_empty_regions(image, layout_data)
+            print(f"β After post-split filtering: {len(layout_data)} regions remaining")
+
         except Exception as e:
             print(f"β οΈ Warning: Could not split text regions: {e}")
             traceback.print_exc()
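The ordering matters: filtering runs once before splitting (so empty detections are never split) and once after (so empty slices created by the split are dropped). A runnable toy version of that flow, with trivial stand-ins for the real functions, is sketched below; in app.py the actual filter_empty_regions and split_text_regions_into_lines do this work.

# Illustrative sketch only: filter -> split -> filter, with hypothetical stubs.
def stub_filter(regions):
    return [r for r in regions if r.get('text') or r.get('needs_reocr')]

def stub_split(regions):
    out = []
    for r in regions:
        if r.get('multi_line'):
            out += [{'needs_reocr': True, 'line_number': i + 1} for i in range(2)]
        else:
            out.append(r)
    return out

layout = [{'text': 'header'}, {'text': ''}, {'text': 'body', 'multi_line': True}]
layout = stub_filter(layout)          # 1) drop obviously empty detections
layout = stub_split(layout)           # 2) split multi-line regions into lines
layout = stub_filter(layout)          # 3) drop empty slices created by the split
print(len(layout), layout)            # 3 regions survive in this toy run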
@@ -1268,6 +1411,7 @@ def process_image(
         regions_needing_reocr = [item for item in layout_data if item.get('needs_reocr')]
         if regions_needing_reocr:
             print(f"π Re-OCRing {len(regions_needing_reocr)} split line regions for accurate per-line text...")
+            valid_regions = []
             for idx, item in enumerate(regions_needing_reocr):
                 try:
                     bbox = item.get('bbox', [])
@@ -1279,6 +1423,13 @@ def process_image(
                     if x2 <= x1 or y2 <= y1:
                         continue
 
+                    # π« VALIDATE BEFORE RE-OCR: Skip empty regions
+                    if not validate_region_contains_text(image, bbox, min_text_density=0.03):
+                        print(f" β οΈ Skipping line {idx+1}: empty region (bbox={bbox})")
+                        # Mark for removal
+                        item['_should_remove'] = True
+                        continue
+
                     # Add small safety margin to ensure we capture full text
                     margin = 2 # Small margin to avoid edge clipping
                     crop_x1 = max(0, x1 - margin)
@@ -1292,8 +1443,7 @@ def process_image(
                     # Validate crop is reasonable size
                     if crop_img.size[0] < 10 or crop_img.size[1] < 10:
                         print(f" β οΈ Skipping line {idx+1}: crop too small ({crop_img.size})")
+                        item['_should_remove'] = True
                         continue
 
                     # Apply preprocessing to enhance handwriting quality
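The margin-and-clamp cropping used before re-OCR can be sketched on its own; crop_with_margin below is a hypothetical helper that mirrors the margin, clamping, and minimum-size checks around the hunks above.

# Illustrative sketch only: crop a line bbox with a small safety margin,
# clamped to the image bounds, and reject crops too small to OCR reliably.
from PIL import Image

def crop_with_margin(image: Image.Image, bbox, margin: int = 2):
    x1, y1, x2, y2 = (int(v) for v in bbox)
    crop_box = (max(0, x1 - margin), max(0, y1 - margin),
                min(image.width, x2 + margin), min(image.height, y2 + margin))
    crop = image.crop(crop_box)
    if crop.size[0] < 10 or crop.size[1] < 10:
        return None                      # too small; the caller skips this line
    return crop

img = Image.new('RGB', (400, 120), 'white')
print(crop_with_margin(img, [5, 5, 200, 40]).size)   # (199, 39) including clamped margin
print(crop_with_margin(img, [0, 0, 6, 6]))           # None -> skipped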
@@ -1321,16 +1471,25 @@ def process_image(
                     # Keep: Arabic letters, Arabic numbers, spaces, basic punctuation
                     line_text = re.sub(r'[a-zA-Z]+', '', line_text).strip()
 
+                    # If text is empty or too short after filtering, mark for removal
+                    if not line_text or len(line_text.strip()) < 2:
+                        print(f" β οΈ Skipping line {idx+1}: no meaningful text after filtering")
+                        item['_should_remove'] = True
+                        continue
+
                     item['text'] = line_text
                     item['confidence'] = line_conf
                     item['reocr_completed'] = True
+                    valid_regions.append(item)
 
                     print(f" β Line {idx+1}/{len(regions_needing_reocr)}: '{line_text[:50]}...' (conf: {line_conf:.1f}%)")
                 except Exception as e:
                     print(f" β Error re-OCRing line {idx}: {e}")
                     traceback.print_exc()
+                    item['_should_remove'] = True
+
+            # Remove regions marked for removal
+            layout_data = [item for item in layout_data if not item.get('_should_remove', False)]
 
             print(f"\nβ Re-OCR complete. Final layout has {len(layout_data)} regions:")
             for idx, item in enumerate(layout_data):
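The '_should_remove' flag implements a mark-and-sweep: items are only flagged while the re-OCR loop iterates over them, and layout_data is rebuilt in a single pass afterwards, which avoids mutating the list mid-iteration. A minimal standalone illustration (the sample dicts are hypothetical):

# Illustrative sketch only: mark during the loop, sweep once afterwards.
regions = [
    {'text': 'سطر اول', 'needs_reocr': True},
    {'text': '', 'needs_reocr': True},         # fails the "meaningful text" check
    {'text': 'عنوان'},
]

for item in regions:
    if item.get('needs_reocr') and len(item.get('text', '').strip()) < 2:
        item['_should_remove'] = True           # mark only; never pop mid-iteration

regions = [item for item in regions if not item.get('_should_remove', False)]
print(len(regions))   # 2 -- the empty re-OCR candidate was swept out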
@@ -1338,6 +1497,10 @@ def process_image(
                 conf = item.get('confidence', 0)
                 reocr = item.get('reocr_completed', False)
                 print(f" Region {idx+1}: '{text}...' (conf={conf:.1f}%, re-OCR={reocr})")
+
+            # ποΈ FINAL FILTER: Remove any remaining empty/invalid regions
+            layout_data = filter_empty_regions(image, layout_data)
+            print(f"β After final filtering: {len(layout_data)} regions remaining")
 
             # π― INTELLIGENT CONFIDENCE SCORING
             # Count text regions to determine if per-region scoring is feasible
|