Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -34,16 +34,11 @@ torch.backends.cudnn.benchmark = False
 
 # Constants
 MIN_PIXELS = 3136
-MAX_PIXELS =
+MAX_PIXELS = 11289600
 IMAGE_FACTOR = 28
 
 # Prompts
-prompt = """Please output the layout information from the
-
-CRITICAL REQUIREMENTS:
-- Detect EACH LINE of text as a SEPARATE element. Do NOT group multiple lines together into a single bbox.
-- This document may contain ARABIC HANDWRITTEN or TYPED text. Extract it accurately character-by-character.
-- Output ONLY the text you see - NO translation, NO English words, NO explanations.
+prompt = """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
 
 1. Bbox format: [x1, y1, x2, y2]
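MIN_PIXELS, MAX_PIXELS and IMAGE_FACTOR usually drive a Qwen-VL-style resize step before inference; that step is not visible in this hunk, so the helper below is only a sketch of the convention, with an illustrative name and the new MAX_PIXELS value plugged in:

    import math

    def clamp_resolution(width: int, height: int,
                         min_pixels: int = 3136,       # MIN_PIXELS (56 * 56)
                         max_pixels: int = 11289600,   # MAX_PIXELS, the value added here
                         factor: int = 28) -> tuple:   # IMAGE_FACTOR
        """Rescale (width, height) so the pixel count lands near [min_pixels, max_pixels],
        with both sides rounded to multiples of factor."""
        area = width * height
        if area > max_pixels:
            scale = math.sqrt(max_pixels / area)
        elif area < min_pixels:
            scale = math.sqrt(min_pixels / area)
        else:
            scale = 1.0
        new_w = max(factor, round(width * scale / factor) * factor)
        new_h = max(factor, round(height * scale / factor) * factor)
        return new_w, new_h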
@@ -54,14 +49,10 @@ CRITICAL REQUIREMENTS:
 - Formula: Format its text as LaTeX.
 - Table: Format its text as HTML.
 - All Others (Text, Title, etc.): Format their text as Markdown.
-- For Arabic text (handwritten or typed): Extract exactly as written, character-by-character.
 
 4. Constraints:
-- The output text must be the
-
-- Each text line should have its own bbox and text content.
-- All layout elements must be sorted according to human reading order (right-to-left for Arabic).
-- Focus on ACCURACY over speed - take time to recognize each character correctly.
+- The output text must be the original text from the image, with no translation.
+- All layout elements must be sorted according to human reading order.
 
 5. Final Output: The entire output must be a single JSON object.
 """
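The prompt's final constraint means the model's reply can be fed straight to json.loads, which is exactly what process_image does further down. A small, self-contained illustration of the expected shape (the sample values are invented; the field names bbox, category and text come from the prompt and from the item.get(...) calls later in this diff):

    import json

    sample = '''[
      {"bbox": [40, 32, 560, 78],  "category": "Title", "text": "# Annual Report"},
      {"bbox": [40, 96, 560, 128], "category": "Text",  "text": "First line of body text."}
    ]'''

    layout_data = json.loads(sample)
    for item in layout_data:
        x1, y1, x2, y2 = item["bbox"]
        print(item["category"], (x1, y1, x2, y2), item["text"])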
@@ -500,18 +491,7 @@ def _generate_text_and_confidence_for_crop(
                 {"type": "image", "image": image},
                 {
                     "type": "text",
-                    "text": "
-
-STRICT RULES:
-- Output ONLY Arabic characters you see in the image
-- NO English words whatsoever (no 'Commission', 'Text', etc.)
-- NO translations
-- NO explanations
-- NO additional text
-- If you see handwriting, transcribe it exactly
-- If there is no Arabic text, output nothing
-
-Extract the Arabic text now:""",
+                    "text": "Extract the exact text content from this image region. Output text only without translation or additional words.",
                 },
             ],
         }
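Only the content list is visible in this hunk; the role/user wrapper below is an assumption based on the usual Qwen-VL chat-message layout, shown just to make the shape of the per-crop request concrete:

    from PIL import Image

    def build_crop_messages(crop: Image.Image) -> list:
        # One user turn: the cropped region plus the new, shorter instruction.
        return [{
            "role": "user",
            "content": [
                {"type": "image", "image": crop},
                {"type": "text",
                 "text": "Extract the exact text content from this image region. "
                         "Output text only without translation or additional words."},
            ],
        }]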
@@ -580,43 +560,6 @@ Extract the Arabic text now:""",
         return "", 0.0
 
 
-def preprocess_for_handwriting_ocr(image: Image.Image) -> Image.Image:
-    """
-    Enhance image quality for better handwriting OCR.
-
-    Applies:
-    - Contrast enhancement
-    - Sharpening
-    - Noise reduction (if needed)
-    """
-    try:
-        from PIL import ImageEnhance, ImageFilter
-
-        # Convert to RGB if needed
-        if image.mode != 'RGB':
-            image = image.convert('RGB')
-
-        # 1. Increase contrast to make text more distinct from background
-        enhancer = ImageEnhance.Contrast(image)
-        image = enhancer.enhance(1.5)  # Boost contrast by 50%
-
-        # 2. Increase sharpness to make character edges clearer
-        enhancer = ImageEnhance.Sharpness(image)
-        image = enhancer.enhance(1.8)  # Significant sharpening
-
-        # 3. Slight brightness adjustment if image is too dark
-        enhancer = ImageEnhance.Brightness(image)
-        image = enhancer.enhance(1.1)
-
-        # 4. Apply unsharp mask for better edge definition
-        image = image.filter(ImageFilter.UnsharpMask(radius=1, percent=150, threshold=3))
-
-        return image
-    except Exception as e:
-        print(f"⚠️ Warning: Image preprocessing failed: {e}")
-        return image  # Return original if preprocessing fails
-
-
 def estimate_text_density(image: Image.Image) -> float:
     """
     Estimate text density in image using pixel analysis.
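For context, the call site of this deleted helper (itself removed later in this commit, inside process_image's re-OCR loop) looked like this:

    crop_img = image.crop((crop_x1, crop_y1, crop_x2, crop_y2))
    crop_img = preprocess_for_handwriting_ocr(crop_img)
    line_text, line_conf = _generate_text_and_confidence_for_crop(crop_img)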
@@ -644,518 +587,6 @@ def estimate_text_density(image: Image.Image) -> float:
         return 0.1  # Default to low density
 
 
-def analyze_image_line_characteristics(image: Image.Image) -> Dict[str, float]:
-    """
-    Analyze image to determine optimal line detection parameters.
-    Works adaptively for any image type (sparse, dense, tables, forms).
-
-    Returns dict with: avg_line_height, min_line_height, max_line_height, line_spacing
-    """
-    try:
-        width, height = image.size
-        gray = image.convert('L')
-        img_array = np.array(gray)
-
-        # Horizontal projection: sum of dark pixels per row
-        row_sums = np.sum(img_array < 128, axis=1)
-
-        if len(row_sums) < 10:
-            # Fallback for very small images
-            return {
-                'avg_line_height': height / 10,  # Assume ~10 lines
-                'min_line_height': max(15, height / 20),
-                'max_line_height': height / 3,  # Split if > 1/3 of image height
-                'line_spacing': height / 15
-            }
-
-        # Find text rows (peaks in projection)
-        mean_val = np.mean(row_sums)
-        std_val = np.std(row_sums)
-        threshold = max(mean_val * 0.2, mean_val - std_val * 0.3)
-        text_rows = np.where(row_sums > threshold)[0]
-
-        if len(text_rows) < 2:
-            # No clear text lines detected, use conservative estimates
-            estimated_lines = max(5, height // 50)
-            return {
-                'avg_line_height': height / estimated_lines,
-                'min_line_height': max(15, height / (estimated_lines * 2)),
-                'max_line_height': height / 2,  # Split if > half image
-                'line_spacing': height / estimated_lines
-            }
-
-        # Group consecutive text rows into lines
-        line_centers = []
-        current_group = [text_rows[0]]
-
-        for i in range(1, len(text_rows)):
-            if text_rows[i] - text_rows[i-1] <= 5:  # Consecutive rows
-                current_group.append(text_rows[i])
-            else:
-                line_centers.append(int(np.mean(current_group)))
-                current_group = [text_rows[i]]
-
-        if current_group:
-            line_centers.append(int(np.mean(current_group)))
-
-        if len(line_centers) < 2:
-            # Can't determine spacing
-            estimated_lines = max(3, height // 60)
-            return {
-                'avg_line_height': height / estimated_lines,
-                'min_line_height': max(20, height / (estimated_lines * 2)),
-                'max_line_height': height / 2,
-                'line_spacing': height / estimated_lines
-            }
-
-        # Calculate spacing between lines
-        spacings = []
-        for i in range(len(line_centers) - 1):
-            spacing = line_centers[i+1] - line_centers[i]
-            if spacing > 8:  # Minimum reasonable spacing
-                spacings.append(spacing)
-
-        if spacings:
-            avg_spacing = np.median(spacings)
-            min_spacing = np.percentile(spacings, 25)
-            max_spacing = np.percentile(spacings, 75)
-
-            return {
-                'avg_line_height': float(avg_spacing),
-                'min_line_height': float(max(15, min_spacing * 0.6)),  # 60% of min spacing
-                'max_line_height': float(max_spacing * 1.5),  # 1.5x max spacing = likely multi-line
-                'line_spacing': float(avg_spacing),
-                'num_lines_detected': len(line_centers)
-            }
-
-        # Fallback
-        estimated_lines = max(3, height // 50)
-        return {
-            'avg_line_height': height / estimated_lines,
-            'min_line_height': max(20, height / (estimated_lines * 2)),
-            'max_line_height': height / 2,
-            'line_spacing': height / estimated_lines
-        }
-
-    except Exception as e:
-        print(f" ⚠️ Error analyzing image: {e}")
-        # Ultra-conservative fallback
-        width, height = image.size
-        return {
-            'avg_line_height': 50,
-            'min_line_height': 25,
-            'max_line_height': 100,
-            'line_spacing': 50
-        }
-
-
-def validate_region_contains_text(image: Image.Image, bbox: List[int], min_text_density: float = 0.05) -> bool:
-    """
-    Validate that a bounding box region actually contains text (not empty space).
-
-    Args:
-        image: Original image
-        bbox: Bounding box [x1, y1, x2, y2]
-        min_text_density: Minimum fraction of pixels that should be text (dark pixels)
-
-    Returns:
-        True if region contains sufficient text, False otherwise
-    """
-    try:
-        x1, y1, x2, y2 = bbox
-        x1, y1 = max(0, int(x1)), max(0, int(y1))
-        x2, y2 = min(image.width, int(x2)), min(image.height, int(y2))
-
-        if x2 <= x1 or y2 <= y1:
-            return False
-
-        # Crop region
-        crop = image.crop((x1, y1, x2, y2))
-        gray = crop.convert('L')
-        img_array = np.array(gray)
-
-        if img_array.size == 0:
-            return False
-
-        # Calculate text density (fraction of dark pixels)
-        # For handwriting/text, we expect at least some dark pixels
-        dark_pixels = np.sum(img_array < 128)  # Pixels darker than middle gray
-        total_pixels = img_array.size
-        text_density = dark_pixels / total_pixels if total_pixels > 0 else 0
-
-        # Also check for minimum height/width (avoid tiny regions)
-        height = y2 - y1
-        width = x2 - x1
-        min_dimension = min(height, width)
-
-        # Reject if:
-        # 1. Text density too low (mostly empty space)
-        # 2. Region too small (likely noise)
-        if text_density < min_text_density:
-            return False
-
-        if min_dimension < 15:  # Too small to be a real text line
-            return False
-
-        return True
-
-    except Exception as e:
-        print(f" ⚠️ Error validating region: {e}")
-        return False
-
-
-def filter_empty_regions(image: Image.Image, layout_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-    """
-    Filter out regions that are empty spaces, noise, or false positives.
-
-    This removes:
-    - Regions with very low text density (empty margins/spaces)
-    - Regions that are too small
-    - Regions that are likely noise artifacts
-    """
-    filtered = []
-    removed_count = 0
-
-    for item in layout_data:
-        bbox = item.get('bbox', [])
-        category = item.get('category', '')
-        text = item.get('text', '').strip()
-
-        # Skip if no bbox
-        if not bbox or len(bbox) != 4:
-            continue
-
-        # For Text/List-item regions, validate they contain actual text
-        if category in ['Text', 'List-item']:
-            if not validate_region_contains_text(image, bbox):
-                print(f" 🗑️ Removing empty region: {category} bbox={bbox}")
-                removed_count += 1
-                continue
-
-        # Even if region passes validation, check if text is meaningful
-        # Remove regions with very short or meaningless text
-        if category in ['Text', 'List-item']:
-            # Remove if text is empty or too short (likely noise)
-            if not text or len(text.strip()) < 2:
-                # But only if it also failed validation
-                if not validate_region_contains_text(image, bbox, min_text_density=0.03):
-                    print(f" 🗑️ Removing empty/noise region: {category} bbox={bbox}")
-                    removed_count += 1
-                    continue
-
-        filtered.append(item)
-
-    if removed_count > 0:
-        print(f"🗑️ Filtered out {removed_count} empty/noise regions")
-
-    return filtered
-
-
-def detect_line_spacing(image: Image.Image, bbox: List[int]) -> Optional[float]:
-    """
-    Detect average line spacing in a text region using horizontal projection analysis.
-
-    Returns estimated line height in pixels, or None if detection fails.
-    """
-    try:
-        x1, y1, x2, y2 = bbox
-        crop = image.crop((x1, y1, x2, y2))
-
-        # Convert to grayscale
-        gray = crop.convert('L')
-        img_array = np.array(gray)
-
-        if img_array.size == 0:
-            return None
-
-        # Horizontal projection: sum of dark pixels per row
-        row_sums = np.sum(img_array < 128, axis=1)
-
-        if len(row_sums) < 10:
-            return None
-
-        # Find peaks (text lines) and valleys (spacing between lines)
-        mean_val = np.mean(row_sums)
-        std_val = np.std(row_sums)
-        threshold = max(mean_val * 0.25, mean_val - std_val * 0.4)
-
-        text_rows = np.where(row_sums > threshold)[0]
-
-        if len(text_rows) < 2:
-            return None
-
-        # Group consecutive rows to find line centers
-        line_centers = []
-        current_group = [text_rows[0]]
-
-        for i in range(1, len(text_rows)):
-            if text_rows[i] - text_rows[i-1] <= 3:
-                current_group.append(text_rows[i])
-            else:
-                line_centers.append(int(np.mean(current_group)))
-                current_group = [text_rows[i]]
-
-        if current_group:
-            line_centers.append(int(np.mean(current_group)))
-
-        if len(line_centers) < 2:
-            return None
-
-        # Calculate spacing between line centers
-        spacings = []
-        for i in range(len(line_centers) - 1):
-            spacing = line_centers[i+1] - line_centers[i]
-            if spacing > 10:
-                spacings.append(spacing)
-
-        if spacings:
-            return float(np.median(spacings))
-
-        return None
-    except Exception as e:
-        return None
-
-
-def detect_actual_line_breaks_in_region(image: Image.Image, bbox: List[int]) -> List[int]:
-    """
-    Detect actual line break positions within a text region using horizontal projection.
-    Only detects breaks if region has sufficient text density (not empty space).
-
-    Returns list of y-coordinates where lines break.
-    """
-    try:
-        x1, y1, x2, y2 = bbox
-        crop = image.crop((x1, y1, x2, y2))
-        gray = crop.convert('L')
-        img_array = np.array(gray)
-
-        if img_array.size == 0:
-            return []
-
-        # FIRST: Validate region has actual text (not empty space)
-        total_pixels = img_array.size
-        dark_pixels = np.sum(img_array < 128)
-        text_density = dark_pixels / total_pixels if total_pixels > 0 else 0
-
-        # Require minimum text density to avoid false positives on empty regions
-        if text_density < 0.03:  # Less than 3% dark pixels = likely empty
-            return []
-
-        # Horizontal projection
-        row_sums = np.sum(img_array < 128, axis=1)
-
-        if len(row_sums) < 10:  # Need enough rows
-            return []
-
-        # Find valleys (spaces between lines) and peaks (text lines)
-        mean_val = np.mean(row_sums)
-        std_val = np.std(row_sums)
-
-        # More aggressive thresholds to avoid false positives
-        text_threshold = max(mean_val * 0.3, mean_val - std_val * 0.5)
-        space_threshold = mean_val * 0.1  # Very low for actual spaces
-
-        # Find text rows and space rows
-        text_rows = np.where(row_sums > text_threshold)[0]
-
-        if len(text_rows) < 5:  # Need substantial text rows
-            return []
-
-        # Group text rows into lines
-        line_groups = []
-        current_group = [text_rows[0]]
-
-        for i in range(1, len(text_rows)):
-            if text_rows[i] - text_rows[i-1] <= 3:
-                current_group.append(text_rows[i])
-            else:
-                if len(current_group) >= 3:  # Require minimum group size
-                    line_groups.append(current_group)
-                current_group = [text_rows[i]]
-
-        if len(current_group) >= 3:
-            line_groups.append(current_group)
-
-        if len(line_groups) < 2:
-            return []  # Single line or can't detect
-
-        # Find break points (midpoints between line groups)
-        # Require minimum gap between lines to avoid false splits
-        break_points = []
-        for i in range(len(line_groups) - 1):
-            last_row_of_line1 = max(line_groups[i])
-            first_row_of_line2 = min(line_groups[i+1])
-            gap = first_row_of_line2 - last_row_of_line1
-
-            # Only split if gap is substantial (at least 5 pixels)
-            if gap >= 5:
-                break_point = (last_row_of_line1 + first_row_of_line2) // 2
-                break_points.append(y1 + break_point)  # Convert to image coordinates
-
-        return break_points
-
-    except Exception as e:
-        print(f" ⚠️ Error detecting line breaks: {e}")
-        return []
-
-
-def split_text_regions_into_lines(
-    image: Image.Image,
-    layout_data: List[Dict[str, Any]],
-    min_line_height: Optional[int] = None,
-    max_line_height: Optional[int] = None
-) -> List[Dict[str, Any]]:
-    """
-    Intelligently split text regions into individual lines.
-
-    ADAPTIVE APPROACH:
-    - Analyzes image to determine optimal parameters
-    - Detects actual line breaks using image analysis
-    - Works for any image type (sparse, dense, tables, forms)
-    - No hardcoded thresholds
-
-    Args:
-        image: Original image
-        layout_data: Layout detection results
-        min_line_height: Optional override (auto-detected if None)
-        max_line_height: Optional override (auto-detected if None)
-
-    Returns:
-        Updated layout data with lines split
-    """
-    # Analyze image to get adaptive parameters
-    img_chars = analyze_image_line_characteristics(image)
-    adaptive_min = min_line_height if min_line_height else int(img_chars['min_line_height'])
-    adaptive_max = max_line_height if max_line_height else int(img_chars['max_line_height'])
-    avg_line_height = img_chars['avg_line_height']
-
-    print(f"\n📊 Image analysis: avg_line_height={avg_line_height:.1f}px, "
-          f"min={adaptive_min}px, max={adaptive_max}px")
-    if 'num_lines_detected' in img_chars:
-        print(f" Detected ~{img_chars['num_lines_detected']} lines in image")
-
-    result = []
-    split_count = 0
-
-    for item in layout_data:
-        bbox = item.get('bbox', [])
-        category = item.get('category', '')
-        text_content = item.get('text', '')
-
-        # Only split Text regions (not titles, headers, tables, etc.)
-        if len(bbox) != 4 or category not in ['Text', 'List-item']:
-            result.append(item)
-            continue
-
-        x1, y1, x2, y2 = bbox
-        height = y2 - y1
-        width = x2 - x1
-
-        # FIRST: Validate region actually contains text before trying to split
-        if not validate_region_contains_text(image, bbox, min_text_density=0.03):
-            print(f" Region: {category} (h={height}px) - Empty/noise region, skipping split")
-            # Don't add empty regions - they'll be filtered out later
-            continue
-
-        # ALWAYS check if region contains multiple lines, regardless of height
-        # Use image analysis to detect actual line breaks
-        line_breaks = detect_actual_line_breaks_in_region(image, bbox)
-
-        if len(line_breaks) > 0:
-            # We detected actual line breaks - split at those positions
-            print(f" Region: {category} (h={height}px) - Detected {len(line_breaks)+1} lines via image analysis")
-
-            # Create lines based on detected breaks
-            current_y = y1
-            for i, break_y in enumerate(line_breaks):
-                # Create line from current_y to break_y
-                new_bbox = [x1, int(current_y), x2, int(break_y)]
-
-                # Validate split region contains text before adding
-                if validate_region_contains_text(image, new_bbox, min_text_density=0.03):
-                    new_item = item.copy()
-                    new_item['bbox'] = new_bbox
-                    new_item['text'] = ""  # Will be re-OCR'd
-                    new_item['split_from_parent'] = True
-                    new_item['needs_reocr'] = True
-                    new_item['line_number'] = i + 1
-                    result.append(new_item)
-
-                current_y = break_y
-
-            # Add last line
-            final_bbox = [x1, int(current_y), x2, y2]
-            if validate_region_contains_text(image, final_bbox, min_text_density=0.03):
-                new_item = item.copy()
-                new_item['bbox'] = final_bbox
-                new_item['text'] = ""
-                new_item['split_from_parent'] = True
-                new_item['needs_reocr'] = True
-                new_item['line_number'] = len(line_breaks) + 1
-                result.append(new_item)
-
-            split_count += 1
-
-        elif height > adaptive_max:
-            # No line breaks detected but region is tall - use spacing-based split
-            print(f" Region: {category} (h={height}px) - Tall region, using spacing-based split")
-
-            # Try to detect spacing in this specific region
-            detected_spacing = detect_line_spacing(image, bbox)
-
-            if detected_spacing and detected_spacing > adaptive_min:
-                line_height = detected_spacing
-                estimated_lines = max(2, round(height / line_height))
-            else:
-                line_height = avg_line_height
-                estimated_lines = max(2, round(height / line_height))
-
-            estimated_lines = min(estimated_lines, 15)  # Cap at 15 lines
-
-            # Calculate padding (adaptive: 8% of line height, min 2px)
-            padding = max(2, int(line_height * 0.08))
-
-            # Split geometrically
-            for i in range(estimated_lines):
-                if i == 0:
-                    new_y1 = y1
-                    new_y2 = y1 + line_height + padding
-                elif i == estimated_lines - 1:
-                    new_y1 = y1 + (i * line_height) - padding
-                    new_y2 = y2
-                else:
-                    new_y1 = y1 + (i * line_height) - padding
-                    new_y2 = y1 + ((i + 1) * line_height) + padding
-
-                new_y1 = max(y1, int(new_y1))
-                new_y2 = min(y2, int(new_y2))
-
-                if new_y2 > new_y1:
-                    new_bbox = [x1, new_y1, x2, new_y2]
-
-                    # Validate split region contains text before adding
-                    if validate_region_contains_text(image, new_bbox, min_text_density=0.03):
-                        new_item = item.copy()
-                        new_item['bbox'] = new_bbox
-                        new_item['text'] = ""
-                        new_item['split_from_parent'] = True
-                        new_item['needs_reocr'] = True
-                        new_item['line_number'] = i + 1
-                        result.append(new_item)
-
-            split_count += 1
-
-        else:
-            # Region is reasonably sized - keep as is
-            print(f" Region: {category} (h={height}px) - Keeping as single line")
-            result.append(item)
-
-    if split_count > 0:
-        print(f"📏 Split {split_count} regions into {len(result)} total lines")
-
-    return result
-
-
 def should_chunk_image(image: Image.Image) -> Tuple[bool, str]:
     """
     Intelligently determine if image should be chunked for better accuracy.
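All of the removed helpers above share one idea: a horizontal projection of dark pixels per row, with runs of bright rows treated as gaps between lines. A condensed, self-contained sketch of that idea (the function name and standalone packaging are mine; the thresholds mirror the removed analyze_image_line_characteristics):

    import numpy as np
    from PIL import Image

    def rough_line_count(image: Image.Image, dark_threshold: int = 128) -> int:
        """Estimate the number of text lines via horizontal projection."""
        rows = np.sum(np.array(image.convert("L")) < dark_threshold, axis=1)
        cutoff = max(rows.mean() * 0.2, rows.mean() - rows.std() * 0.3)
        text_rows = np.where(rows > cutoff)[0]
        if text_rows.size == 0:
            return 0
        # A gap of more than 5 bright rows between dark rows starts a new line.
        return int(np.sum(np.diff(text_rows) > 5)) + 1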
@@ -1382,126 +813,6 @@ def process_image(
         # Try to parse JSON output
         layout_data = json.loads(raw_output)
 
-        # 🗑️ FIRST FILTER: Remove empty regions and false positives from initial detection
-        print(f"\n📋 Initial layout: {len(layout_data)} regions detected")
-        layout_data = filter_empty_regions(image, layout_data)
-        print(f"✅ After initial filtering: {len(layout_data)} regions remaining")
-
-        for idx, item in enumerate(layout_data):
-            bbox = item.get('bbox', [])
-            text = item.get('text', '')[:50]
-            cat = item.get('category', '')
-            print(f" Region {idx+1}: {cat} - '{text}...' bbox={bbox}")
-
-        try:
-            layout_data_before = len(layout_data)
-            layout_data = split_text_regions_into_lines(image, layout_data)
-            print(f"📐 After splitting: {layout_data_before} → {len(layout_data)} regions")
-
-            # 🗑️ SECOND FILTER: Remove any empty regions created during splitting
-            layout_data = filter_empty_regions(image, layout_data)
-            print(f"✅ After post-split filtering: {len(layout_data)} regions remaining")
-
-        except Exception as e:
-            print(f"⚠️ Warning: Could not split text regions: {e}")
-            traceback.print_exc()
-            # Continue with original layout data
-
-        # 🔄 RE-OCR SPLIT LINES: For split regions, perform per-line OCR
-        regions_needing_reocr = [item for item in layout_data if item.get('needs_reocr')]
-        if regions_needing_reocr:
-            print(f"🔄 Re-OCRing {len(regions_needing_reocr)} split line regions for accurate per-line text...")
-            valid_regions = []
-            for idx, item in enumerate(regions_needing_reocr):
-                try:
-                    bbox = item.get('bbox', [])
-                    if not bbox or len(bbox) != 4:
-                        continue
-                    x1, y1, x2, y2 = bbox
-                    x1, y1 = max(0, int(x1)), max(0, int(y1))
-                    x2, y2 = min(image.width, int(x2)), min(image.height, int(y2))
-                    if x2 <= x1 or y2 <= y1:
-                        continue
-
-                    # 🚫 VALIDATE BEFORE RE-OCR: Skip empty regions
-                    if not validate_region_contains_text(image, bbox, min_text_density=0.03):
-                        print(f" ⚠️ Skipping line {idx+1}: empty region (bbox={bbox})")
-                        # Mark for removal
-                        item['_should_remove'] = True
-                        continue
-
-                    # Add small safety margin to ensure we capture full text
-                    margin = 2  # Small margin to avoid edge clipping
-                    crop_x1 = max(0, x1 - margin)
-                    crop_y1 = max(0, y1 - margin)
-                    crop_x2 = min(image.width, x2 + margin)
-                    crop_y2 = min(image.height, y2 + margin)
-
-                    # Crop and preprocess the line region
-                    crop_img = image.crop((crop_x1, crop_y1, crop_x2, crop_y2))
-
-                    # Validate crop is reasonable size
-                    if crop_img.size[0] < 10 or crop_img.size[1] < 10:
-                        print(f" ⚠️ Skipping line {idx+1}: crop too small ({crop_img.size})")
-                        item['_should_remove'] = True
-                        continue
-
-                    # Apply preprocessing to enhance handwriting quality
-                    crop_img = preprocess_for_handwriting_ocr(crop_img)
-
-                    # Re-OCR this specific line
-                    line_text, line_conf = _generate_text_and_confidence_for_crop(crop_img)
-
-                    # AGGRESSIVE FILTERING: Remove any English words/hallucinations
-                    line_text = line_text.strip()
-
-                    # Remove common English hallucinations
-                    english_hallucinations = [
-                        'Commission', 'commission', 'COMMISSION',
-                        'The', 'the', 'and', 'or', 'of', 'in', 'to', 'a', 'is',
-                        'Text', 'text', 'Title', 'title', 'Caption', 'caption',
-                        'Page', 'page', 'Document', 'document', 'Image', 'image'
-                    ]
-
-                    for hallucination in english_hallucinations:
-                        line_text = line_text.replace(hallucination, '').strip()
-
-                    # Remove any remaining Latin alphabet (keep only Arabic, numbers, punctuation)
-                    import re
-                    # Keep: Arabic letters, Arabic numbers, spaces, basic punctuation
-                    line_text = re.sub(r'[a-zA-Z]+', '', line_text).strip()
-
-                    # If text is empty or too short after filtering, mark for removal
-                    if not line_text or len(line_text.strip()) < 2:
-                        print(f" ⚠️ Skipping line {idx+1}: no meaningful text after filtering")
-                        item['_should_remove'] = True
-                        continue
-
-                    item['text'] = line_text
-                    item['confidence'] = line_conf
-                    item['reocr_completed'] = True
-                    valid_regions.append(item)
-
-                    print(f" ✓ Line {idx+1}/{len(regions_needing_reocr)}: '{line_text[:50]}...' (conf: {line_conf:.1f}%)")
-                except Exception as e:
-                    print(f" ✗ Error re-OCRing line {idx}: {e}")
-                    traceback.print_exc()
-                    item['_should_remove'] = True
-
-            # Remove regions marked for removal
-            layout_data = [item for item in layout_data if not item.get('_should_remove', False)]
-
-            print(f"\n✅ Re-OCR complete. Final layout has {len(layout_data)} regions:")
-            for idx, item in enumerate(layout_data):
-                text = item.get('text', '')[:50]
-                conf = item.get('confidence', 0)
-                reocr = item.get('reocr_completed', False)
-                print(f" Region {idx+1}: '{text}...' (conf={conf:.1f}%, re-OCR={reocr})")
-
-        # 🗑️ FINAL FILTER: Remove any remaining empty/invalid regions
-        layout_data = filter_empty_regions(image, layout_data)
-        print(f"✅ After final filtering: {len(layout_data)} regions remaining")
-
         # 🎯 INTELLIGENT CONFIDENCE SCORING
         # Count text regions to determine if per-region scoring is feasible
         num_text_regions = sum(1 for item in layout_data
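The removed re-OCR block above couples a word blacklist with a regex pass; the regex alone already strips every Latin-letter run, which makes the blacklist mostly redundant. A tiny standalone version of that final pass:

    import re

    def strip_latin(text: str) -> str:
        """Drop Latin-alphabet runs, keeping Arabic letters, digits and punctuation."""
        return re.sub(r"[a-zA-Z]+", "", text).strip()

    print(strip_latin("Commission النص المكتوب Text 123"))  # Latin words removed, Arabic and digits kept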
@@ -1514,10 +825,6 @@ def process_image(
         # Compute per-region confidence using the model on each cropped region
         for idx, item in enumerate(layout_data):
             try:
-                # Skip if already processed during re-OCR
-                if item.get('reocr_completed'):
-                    continue
-
                 bbox = item.get('bbox', [])
                 text_content = item.get('text', '')
                 category = item.get('category', '')

@@ -1563,10 +870,9 @@ def process_image(
 
     # ✨ ARABIC TEXT CORRECTION: Apply intelligent correction to each text region
     try:
-        print(
+        print("🔧 Applying Arabic text correction...")
         corrector = get_corrector()
 
-        corrections_applied = 0
         for idx, item in enumerate(layout_data):
             text_content = item.get('text', '')
             category = item.get('category', '')

@@ -1575,8 +881,6 @@ def process_image(
             if not text_content or category in ['Picture', 'Formula', 'Table']:
                 continue
 
-            print(f" Correcting region {idx+1}: '{text_content[:40]}...'")
-
             # Apply correction
             correction_result = corrector.correct_text(text_content)
 

@@ -1589,17 +893,13 @@ def process_image(
 
             # Update the text field to use corrected version
             item['text'] = correction_result['corrected']
-
-            if correction_result['corrections_made'] > 0:
-                corrections_applied += correction_result['corrections_made']
-                print(f" → Made {correction_result['corrections_made']} corrections")
 
         # Regenerate markdown with corrected text
         corrected_markdown = layoutjson2md(image, layout_data, text_key='text')
         result['markdown_content_corrected'] = corrected_markdown
         result['markdown_content_original'] = markdown_content
 
-        print(f"✅ Correction complete
+        print(f"✅ Correction complete")
 
     except Exception as e:
         print(f"⚠️ Error during Arabic correction: {e}")
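Both sides of these last hunks keep relying on the corrector object; the only interface visible in the diff is get_corrector(), correct_text(), and a result dict with at least the keys 'corrected' and 'corrections_made'. A minimal usage sketch under that assumption (the sample string is just a placeholder):

    corrector = get_corrector()
    correction_result = corrector.correct_text("النص المستخرج من الصورة")
    print(correction_result["corrections_made"])   # number of edits the corrector made
    item["text"] = correction_result["corrected"]  # keep the corrected text, as the diff does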