VanguardAI committed
Commit 4a32a6e · verified · 1 parent: 0fe4722

Update app.py

Files changed (1):
  1. app.py +7 -707
app.py CHANGED
@@ -34,16 +34,11 @@ torch.backends.cudnn.benchmark = False
 
 # Constants
 MIN_PIXELS = 3136
-MAX_PIXELS = 16000000 # Increased for better line detection (was 11289600)
+MAX_PIXELS = 11289600
 IMAGE_FACTOR = 28
 
 # Prompts
-prompt = """Please output the layout information from the document image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
-
-CRITICAL REQUIREMENTS:
-- Detect EACH LINE of text as a SEPARATE element. Do NOT group multiple lines together into a single bbox.
-- This document may contain ARABIC HANDWRITTEN or TYPED text. Extract it accurately character-by-character.
-- Output ONLY the text you see - NO translation, NO English words, NO explanations.
+prompt = """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
 
 1. Bbox format: [x1, y1, x2, y2]
 
@@ -54,14 +49,10 @@ CRITICAL REQUIREMENTS:
 - Formula: Format its text as LaTeX.
 - Table: Format its text as HTML.
 - All Others (Text, Title, etc.): Format their text as Markdown.
-- For Arabic text (handwritten or typed): Extract exactly as written, character-by-character.
 
 4. Constraints:
-- The output text must be the EXACT original text from the image, with NO translation whatsoever.
-- IMPORTANT: Detect every individual line of text separately - do not merge multiple lines into one element.
-- Each text line should have its own bbox and text content.
-- All layout elements must be sorted according to human reading order (right-to-left for Arabic).
-- Focus on ACCURACY over speed - take time to recognize each character correctly.
+- The output text must be the original text from the image, with no translation.
+- All layout elements must be sorted according to human reading order.
 
 5. Final Output: The entire output must be a single JSON object.
 """
@@ -500,18 +491,7 @@ def _generate_text_and_confidence_for_crop(
             {"type": "image", "image": image},
             {
                 "type": "text",
-                "text": """Extract ONLY the Arabic text from this image line.
-
-STRICT RULES:
-- Output ONLY Arabic characters you see in the image
-- NO English words whatsoever (no 'Commission', 'Text', etc.)
-- NO translations
-- NO explanations
-- NO additional text
-- If you see handwriting, transcribe it exactly
-- If there is no Arabic text, output nothing
-
-Extract the Arabic text now:""",
+                "text": "Extract the exact text content from this image region. Output text only without translation or additional words.",
             },
         ],
     }
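
The replaced string is the text half of a single user turn. For orientation, the surrounding payload in Qwen-VL-style chat processors generally has the shape sketched below; only the two content entries actually appear in this diff, so the role key and outer wrapper are assumptions.

```python
def build_crop_messages(image):
    """Hypothetical reconstruction of the message list around the new prompt."""
    return [{
        "role": "user",  # assumed; not visible in this diff
        "content": [
            {"type": "image", "image": image},
            {
                "type": "text",
                "text": ("Extract the exact text content from this image region. "
                         "Output text only without translation or additional words."),
            },
        ],
    }]
```

The new wording drops the Arabic-only rules, consistent with the removal of the language-specific post-filtering later in this commit.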
@@ -580,43 +560,6 @@ Extract the Arabic text now:""",
         return "", 0.0
 
 
-def preprocess_for_handwriting_ocr(image: Image.Image) -> Image.Image:
-    """
-    Enhance image quality for better handwriting OCR.
-
-    Applies:
-    - Contrast enhancement
-    - Sharpening
-    - Noise reduction (if needed)
-    """
-    try:
-        from PIL import ImageEnhance, ImageFilter
-
-        # Convert to RGB if needed
-        if image.mode != 'RGB':
-            image = image.convert('RGB')
-
-        # 1. Increase contrast to make text more distinct from background
-        enhancer = ImageEnhance.Contrast(image)
-        image = enhancer.enhance(1.5)  # Boost contrast by 50%
-
-        # 2. Increase sharpness to make character edges clearer
-        enhancer = ImageEnhance.Sharpness(image)
-        image = enhancer.enhance(1.8)  # Significant sharpening
-
-        # 3. Slight brightness adjustment if image is too dark
-        enhancer = ImageEnhance.Brightness(image)
-        image = enhancer.enhance(1.1)
-
-        # 4. Apply unsharp mask for better edge definition
-        image = image.filter(ImageFilter.UnsharpMask(radius=1, percent=150, threshold=3))
-
-        return image
-    except Exception as e:
-        print(f"⚠️ Warning: Image preprocessing failed: {e}")
-        return image  # Return original if preprocessing fails
-
-
 def estimate_text_density(image: Image.Image) -> float:
     """
     Estimate text density in image using pixel analysis.
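
Before its removal, this helper was applied to each split line crop ahead of re-OCR (see the process_image hunk further down). A minimal usage sketch, with a hypothetical input path:

```python
from PIL import Image

# Hedged usage sketch of the deleted helper; the file path is illustrative.
crop = Image.open("line_crop.png")
enhanced = preprocess_for_handwriting_ocr(crop)  # contrast, sharpen, brighten
text, conf = _generate_text_and_confidence_for_crop(enhanced)
```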
@@ -644,518 +587,6 @@ def estimate_text_density(image: Image.Image) -> float:
         return 0.1  # Default to low density
 
 
-def analyze_image_line_characteristics(image: Image.Image) -> Dict[str, float]:
-    """
-    Analyze image to determine optimal line detection parameters.
-    Works adaptively for any image type (sparse, dense, tables, forms).
-
-    Returns dict with: avg_line_height, min_line_height, max_line_height, line_spacing
-    """
-    try:
-        width, height = image.size
-        gray = image.convert('L')
-        img_array = np.array(gray)
-
-        # Horizontal projection: sum of dark pixels per row
-        row_sums = np.sum(img_array < 128, axis=1)
-
-        if len(row_sums) < 10:
-            # Fallback for very small images
-            return {
-                'avg_line_height': height / 10,  # Assume ~10 lines
-                'min_line_height': max(15, height / 20),
-                'max_line_height': height / 3,  # Split if > 1/3 of image height
-                'line_spacing': height / 15
-            }
-
-        # Find text rows (peaks in projection)
-        mean_val = np.mean(row_sums)
-        std_val = np.std(row_sums)
-        threshold = max(mean_val * 0.2, mean_val - std_val * 0.3)
-        text_rows = np.where(row_sums > threshold)[0]
-
-        if len(text_rows) < 2:
-            # No clear text lines detected, use conservative estimates
-            estimated_lines = max(5, height // 50)
-            return {
-                'avg_line_height': height / estimated_lines,
-                'min_line_height': max(15, height / (estimated_lines * 2)),
-                'max_line_height': height / 2,  # Split if > half image
-                'line_spacing': height / estimated_lines
-            }
-
-        # Group consecutive text rows into lines
-        line_centers = []
-        current_group = [text_rows[0]]
-
-        for i in range(1, len(text_rows)):
-            if text_rows[i] - text_rows[i-1] <= 5:  # Consecutive rows
-                current_group.append(text_rows[i])
-            else:
-                line_centers.append(int(np.mean(current_group)))
-                current_group = [text_rows[i]]
-
-        if current_group:
-            line_centers.append(int(np.mean(current_group)))
-
-        if len(line_centers) < 2:
-            # Can't determine spacing
-            estimated_lines = max(3, height // 60)
-            return {
-                'avg_line_height': height / estimated_lines,
-                'min_line_height': max(20, height / (estimated_lines * 2)),
-                'max_line_height': height / 2,
-                'line_spacing': height / estimated_lines
-            }
-
-        # Calculate spacing between lines
-        spacings = []
-        for i in range(len(line_centers) - 1):
-            spacing = line_centers[i+1] - line_centers[i]
-            if spacing > 8:  # Minimum reasonable spacing
-                spacings.append(spacing)
-
-        if spacings:
-            avg_spacing = np.median(spacings)
-            min_spacing = np.percentile(spacings, 25)
-            max_spacing = np.percentile(spacings, 75)
-
-            return {
-                'avg_line_height': float(avg_spacing),
-                'min_line_height': float(max(15, min_spacing * 0.6)),  # 60% of min spacing
-                'max_line_height': float(max_spacing * 1.5),  # 1.5x max spacing = likely multi-line
-                'line_spacing': float(avg_spacing),
-                'num_lines_detected': len(line_centers)
-            }
-
-        # Fallback
-        estimated_lines = max(3, height // 50)
-        return {
-            'avg_line_height': height / estimated_lines,
-            'min_line_height': max(20, height / (estimated_lines * 2)),
-            'max_line_height': height / 2,
-            'line_spacing': height / estimated_lines
-        }
-
-    except Exception as e:
-        print(f" ⚠️ Error analyzing image: {e}")
-        # Ultra-conservative fallback
-        width, height = image.size
-        return {
-            'avg_line_height': 50,
-            'min_line_height': 25,
-            'max_line_height': 100,
-            'line_spacing': 50
-        }
-
-
-def validate_region_contains_text(image: Image.Image, bbox: List[int], min_text_density: float = 0.05) -> bool:
-    """
-    Validate that a bounding box region actually contains text (not empty space).
-
-    Args:
-        image: Original image
-        bbox: Bounding box [x1, y1, x2, y2]
-        min_text_density: Minimum fraction of pixels that should be text (dark pixels)
-
-    Returns:
-        True if region contains sufficient text, False otherwise
-    """
-    try:
-        x1, y1, x2, y2 = bbox
-        x1, y1 = max(0, int(x1)), max(0, int(y1))
-        x2, y2 = min(image.width, int(x2)), min(image.height, int(y2))
-
-        if x2 <= x1 or y2 <= y1:
-            return False
-
-        # Crop region
-        crop = image.crop((x1, y1, x2, y2))
-        gray = crop.convert('L')
-        img_array = np.array(gray)
-
-        if img_array.size == 0:
-            return False
-
-        # Calculate text density (fraction of dark pixels)
-        # For handwriting/text, we expect at least some dark pixels
-        dark_pixels = np.sum(img_array < 128)  # Pixels darker than middle gray
-        total_pixels = img_array.size
-        text_density = dark_pixels / total_pixels if total_pixels > 0 else 0
-
-        # Also check for minimum height/width (avoid tiny regions)
-        height = y2 - y1
-        width = x2 - x1
-        min_dimension = min(height, width)
-
-        # Reject if:
-        # 1. Text density too low (mostly empty space)
-        # 2. Region too small (likely noise)
-        if text_density < min_text_density:
-            return False
-
-        if min_dimension < 15:  # Too small to be a real text line
-            return False
-
-        return True
-
-    except Exception as e:
-        print(f" ⚠️ Error validating region: {e}")
-        return False
-
-
-def filter_empty_regions(image: Image.Image, layout_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-    """
-    Filter out regions that are empty spaces, noise, or false positives.
-
-    This removes:
-    - Regions with very low text density (empty margins/spaces)
-    - Regions that are too small
-    - Regions that are likely noise artifacts
-    """
-    filtered = []
-    removed_count = 0
-
-    for item in layout_data:
-        bbox = item.get('bbox', [])
-        category = item.get('category', '')
-        text = item.get('text', '').strip()
-
-        # Skip if no bbox
-        if not bbox or len(bbox) != 4:
-            continue
-
-        # For Text/List-item regions, validate they contain actual text
-        if category in ['Text', 'List-item']:
-            if not validate_region_contains_text(image, bbox):
-                print(f" 🗑️ Removing empty region: {category} bbox={bbox}")
-                removed_count += 1
-                continue
-
-        # Even if region passes validation, check if text is meaningful
-        # Remove regions with very short or meaningless text
-        if category in ['Text', 'List-item']:
-            # Remove if text is empty or too short (likely noise)
-            if not text or len(text.strip()) < 2:
-                # But only if it also failed validation
-                if not validate_region_contains_text(image, bbox, min_text_density=0.03):
-                    print(f" 🗑️ Removing empty/noise region: {category} bbox={bbox}")
-                    removed_count += 1
-                    continue
-
-        filtered.append(item)
-
-    if removed_count > 0:
-        print(f"🗑️ Filtered out {removed_count} empty/noise regions")
-
-    return filtered
-
-
-def detect_line_spacing(image: Image.Image, bbox: List[int]) -> Optional[float]:
-    """
-    Detect average line spacing in a text region using horizontal projection analysis.
-
-    Returns estimated line height in pixels, or None if detection fails.
-    """
-    try:
-        x1, y1, x2, y2 = bbox
-        crop = image.crop((x1, y1, x2, y2))
-
-        # Convert to grayscale
-        gray = crop.convert('L')
-        img_array = np.array(gray)
-
-        if img_array.size == 0:
-            return None
-
-        # Horizontal projection: sum of dark pixels per row
-        row_sums = np.sum(img_array < 128, axis=1)
-
-        if len(row_sums) < 10:
-            return None
-
-        # Find peaks (text lines) and valleys (spacing between lines)
-        mean_val = np.mean(row_sums)
-        std_val = np.std(row_sums)
-        threshold = max(mean_val * 0.25, mean_val - std_val * 0.4)
-
-        text_rows = np.where(row_sums > threshold)[0]
-
-        if len(text_rows) < 2:
-            return None
-
-        # Group consecutive rows to find line centers
-        line_centers = []
-        current_group = [text_rows[0]]
-
-        for i in range(1, len(text_rows)):
-            if text_rows[i] - text_rows[i-1] <= 3:
-                current_group.append(text_rows[i])
-            else:
-                line_centers.append(int(np.mean(current_group)))
-                current_group = [text_rows[i]]
-
-        if current_group:
-            line_centers.append(int(np.mean(current_group)))
-
-        if len(line_centers) < 2:
-            return None
-
-        # Calculate spacing between line centers
-        spacings = []
-        for i in range(len(line_centers) - 1):
-            spacing = line_centers[i+1] - line_centers[i]
-            if spacing > 10:
-                spacings.append(spacing)
-
-        if spacings:
-            return float(np.median(spacings))
-
-        return None
-    except Exception as e:
-        return None
-
-
-def detect_actual_line_breaks_in_region(image: Image.Image, bbox: List[int]) -> List[int]:
-    """
-    Detect actual line break positions within a text region using horizontal projection.
-    Only detects breaks if region has sufficient text density (not empty space).
-
-    Returns list of y-coordinates where lines break.
-    """
-    try:
-        x1, y1, x2, y2 = bbox
-        crop = image.crop((x1, y1, x2, y2))
-        gray = crop.convert('L')
-        img_array = np.array(gray)
-
-        if img_array.size == 0:
-            return []
-
-        # FIRST: Validate region has actual text (not empty space)
-        total_pixels = img_array.size
-        dark_pixels = np.sum(img_array < 128)
-        text_density = dark_pixels / total_pixels if total_pixels > 0 else 0
-
-        # Require minimum text density to avoid false positives on empty regions
-        if text_density < 0.03:  # Less than 3% dark pixels = likely empty
-            return []
-
-        # Horizontal projection
-        row_sums = np.sum(img_array < 128, axis=1)
-
-        if len(row_sums) < 10:  # Need enough rows
-            return []
-
-        # Find valleys (spaces between lines) and peaks (text lines)
-        mean_val = np.mean(row_sums)
-        std_val = np.std(row_sums)
-
-        # More aggressive thresholds to avoid false positives
-        text_threshold = max(mean_val * 0.3, mean_val - std_val * 0.5)
-        space_threshold = mean_val * 0.1  # Very low for actual spaces
-
-        # Find text rows and space rows
-        text_rows = np.where(row_sums > text_threshold)[0]
-
-        if len(text_rows) < 5:  # Need substantial text rows
-            return []
-
-        # Group text rows into lines
-        line_groups = []
-        current_group = [text_rows[0]]
-
-        for i in range(1, len(text_rows)):
-            if text_rows[i] - text_rows[i-1] <= 3:
-                current_group.append(text_rows[i])
-            else:
-                if len(current_group) >= 3:  # Require minimum group size
-                    line_groups.append(current_group)
-                current_group = [text_rows[i]]
-
-        if len(current_group) >= 3:
-            line_groups.append(current_group)
-
-        if len(line_groups) < 2:
-            return []  # Single line or can't detect
-
-        # Find break points (midpoints between line groups)
-        # Require minimum gap between lines to avoid false splits
-        break_points = []
-        for i in range(len(line_groups) - 1):
-            last_row_of_line1 = max(line_groups[i])
-            first_row_of_line2 = min(line_groups[i+1])
-            gap = first_row_of_line2 - last_row_of_line1
-
-            # Only split if gap is substantial (at least 5 pixels)
-            if gap >= 5:
-                break_point = (last_row_of_line1 + first_row_of_line2) // 2
-                break_points.append(y1 + break_point)  # Convert to image coordinates
-
-        return break_points
-
-    except Exception as e:
-        print(f" ⚠️ Error detecting line breaks: {e}")
-        return []
-
-
-def split_text_regions_into_lines(
-    image: Image.Image,
-    layout_data: List[Dict[str, Any]],
-    min_line_height: Optional[int] = None,
-    max_line_height: Optional[int] = None
-) -> List[Dict[str, Any]]:
-    """
-    Intelligently split text regions into individual lines.
-
-    ADAPTIVE APPROACH:
-    - Analyzes image to determine optimal parameters
-    - Detects actual line breaks using image analysis
-    - Works for any image type (sparse, dense, tables, forms)
-    - No hardcoded thresholds
-
-    Args:
-        image: Original image
-        layout_data: Layout detection results
-        min_line_height: Optional override (auto-detected if None)
-        max_line_height: Optional override (auto-detected if None)
-
-    Returns:
-        Updated layout data with lines split
-    """
-    # Analyze image to get adaptive parameters
-    img_chars = analyze_image_line_characteristics(image)
-    adaptive_min = min_line_height if min_line_height else int(img_chars['min_line_height'])
-    adaptive_max = max_line_height if max_line_height else int(img_chars['max_line_height'])
-    avg_line_height = img_chars['avg_line_height']
-
-    print(f"\n📊 Image analysis: avg_line_height={avg_line_height:.1f}px, "
-          f"min={adaptive_min}px, max={adaptive_max}px")
-    if 'num_lines_detected' in img_chars:
-        print(f" Detected ~{img_chars['num_lines_detected']} lines in image")
-
-    result = []
-    split_count = 0
-
-    for item in layout_data:
-        bbox = item.get('bbox', [])
-        category = item.get('category', '')
-        text_content = item.get('text', '')
-
-        # Only split Text regions (not titles, headers, tables, etc.)
-        if len(bbox) != 4 or category not in ['Text', 'List-item']:
-            result.append(item)
-            continue
-
-        x1, y1, x2, y2 = bbox
-        height = y2 - y1
-        width = x2 - x1
-
-        # FIRST: Validate region actually contains text before trying to split
-        if not validate_region_contains_text(image, bbox, min_text_density=0.03):
-            print(f" Region: {category} (h={height}px) - Empty/noise region, skipping split")
-            # Don't add empty regions - they'll be filtered out later
-            continue
-
-        # ALWAYS check if region contains multiple lines, regardless of height
-        # Use image analysis to detect actual line breaks
-        line_breaks = detect_actual_line_breaks_in_region(image, bbox)
-
-        if len(line_breaks) > 0:
-            # We detected actual line breaks - split at those positions
-            print(f" Region: {category} (h={height}px) - Detected {len(line_breaks)+1} lines via image analysis")
-
-            # Create lines based on detected breaks
-            current_y = y1
-            for i, break_y in enumerate(line_breaks):
-                # Create line from current_y to break_y
-                new_bbox = [x1, int(current_y), x2, int(break_y)]
-
-                # Validate split region contains text before adding
-                if validate_region_contains_text(image, new_bbox, min_text_density=0.03):
-                    new_item = item.copy()
-                    new_item['bbox'] = new_bbox
-                    new_item['text'] = ""  # Will be re-OCR'd
-                    new_item['split_from_parent'] = True
-                    new_item['needs_reocr'] = True
-                    new_item['line_number'] = i + 1
-                    result.append(new_item)
-
-                current_y = break_y
-
-            # Add last line
-            final_bbox = [x1, int(current_y), x2, y2]
-            if validate_region_contains_text(image, final_bbox, min_text_density=0.03):
-                new_item = item.copy()
-                new_item['bbox'] = final_bbox
-                new_item['text'] = ""
-                new_item['split_from_parent'] = True
-                new_item['needs_reocr'] = True
-                new_item['line_number'] = len(line_breaks) + 1
-                result.append(new_item)
-
-            split_count += 1
-
-        elif height > adaptive_max:
-            # No line breaks detected but region is tall - use spacing-based split
-            print(f" Region: {category} (h={height}px) - Tall region, using spacing-based split")
-
-            # Try to detect spacing in this specific region
-            detected_spacing = detect_line_spacing(image, bbox)
-
-            if detected_spacing and detected_spacing > adaptive_min:
-                line_height = detected_spacing
-                estimated_lines = max(2, round(height / line_height))
-            else:
-                line_height = avg_line_height
-                estimated_lines = max(2, round(height / line_height))
-
-            estimated_lines = min(estimated_lines, 15)  # Cap at 15 lines
-
-            # Calculate padding (adaptive: 8% of line height, min 2px)
-            padding = max(2, int(line_height * 0.08))
-
-            # Split geometrically
-            for i in range(estimated_lines):
-                if i == 0:
-                    new_y1 = y1
-                    new_y2 = y1 + line_height + padding
-                elif i == estimated_lines - 1:
-                    new_y1 = y1 + (i * line_height) - padding
-                    new_y2 = y2
-                else:
-                    new_y1 = y1 + (i * line_height) - padding
-                    new_y2 = y1 + ((i + 1) * line_height) + padding
-
-                new_y1 = max(y1, int(new_y1))
-                new_y2 = min(y2, int(new_y2))
-
-                if new_y2 > new_y1:
-                    new_bbox = [x1, new_y1, x2, new_y2]
-
-                    # Validate split region contains text before adding
-                    if validate_region_contains_text(image, new_bbox, min_text_density=0.03):
-                        new_item = item.copy()
-                        new_item['bbox'] = new_bbox
-                        new_item['text'] = ""
-                        new_item['split_from_parent'] = True
-                        new_item['needs_reocr'] = True
-                        new_item['line_number'] = i + 1
-                        result.append(new_item)
-
-            split_count += 1
-
-        else:
-            # Region is reasonably sized - keep as is
-            print(f" Region: {category} (h={height}px) - Keeping as single line")
-            result.append(item)
-
-    if split_count > 0:
-        print(f"📏 Split {split_count} regions into {len(result)} total lines")
-
-    return result
-
-
 def should_chunk_image(image: Image.Image) -> Tuple[bool, str]:
     """
     Intelligently determine if image should be chunked for better accuracy.
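
All of the deleted helpers above share one primitive: a horizontal projection (dark-pixel count per row) that is thresholded and grouped into runs of consecutive rows to locate line bands. Here is a self-contained toy example of that primitive, with the thresholding simplified from the deleted code:

```python
import numpy as np

# Toy demonstration of the horizontal-projection primitive used by the
# deleted line-detection helpers (simplified thresholding).
img = np.full((12, 40), 255, dtype=np.uint8)  # white canvas
img[2:4, 5:35] = 0    # first "text line"
img[8:10, 5:35] = 0   # second "text line"

row_sums = np.sum(img < 128, axis=1)    # dark pixels per row
text_rows = np.where(row_sums > 0)[0]   # rows containing ink

# Group consecutive rows into line bands, as the deleted code did.
bands, current = [], [text_rows[0]]
for r in text_rows[1:]:
    if r - current[-1] <= 3:
        current.append(r)
    else:
        bands.append(current)
        current = [r]
bands.append(current)

print([(b[0], b[-1]) for b in bands])  # [(2, 3), (8, 9)] -> two lines
```

detect_line_spacing then took the median gap between band centers, while detect_actual_line_breaks_in_region placed break points at the midpoints between bands.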
@@ -1382,126 +813,6 @@ def process_image(
         # Try to parse JSON output
         layout_data = json.loads(raw_output)
 
-        # 🗑️ FIRST FILTER: Remove empty regions and false positives from initial detection
-        print(f"\n📋 Initial layout: {len(layout_data)} regions detected")
-        layout_data = filter_empty_regions(image, layout_data)
-        print(f"✅ After initial filtering: {len(layout_data)} regions remaining")
-
-        for idx, item in enumerate(layout_data):
-            bbox = item.get('bbox', [])
-            text = item.get('text', '')[:50]
-            cat = item.get('category', '')
-            print(f" Region {idx+1}: {cat} - '{text}...' bbox={bbox}")
-
-        try:
-            layout_data_before = len(layout_data)
-            layout_data = split_text_regions_into_lines(image, layout_data)
-            print(f"📐 After splitting: {layout_data_before} → {len(layout_data)} regions")
-
-            # 🗑️ SECOND FILTER: Remove any empty regions created during splitting
-            layout_data = filter_empty_regions(image, layout_data)
-            print(f"✅ After post-split filtering: {len(layout_data)} regions remaining")
-
-        except Exception as e:
-            print(f"⚠️ Warning: Could not split text regions: {e}")
-            traceback.print_exc()
-            # Continue with original layout data
-
-        # 🔄 RE-OCR SPLIT LINES: For split regions, perform per-line OCR
-        regions_needing_reocr = [item for item in layout_data if item.get('needs_reocr')]
-        if regions_needing_reocr:
-            print(f"🔄 Re-OCRing {len(regions_needing_reocr)} split line regions for accurate per-line text...")
-            valid_regions = []
-            for idx, item in enumerate(regions_needing_reocr):
-                try:
-                    bbox = item.get('bbox', [])
-                    if not bbox or len(bbox) != 4:
-                        continue
-                    x1, y1, x2, y2 = bbox
-                    x1, y1 = max(0, int(x1)), max(0, int(y1))
-                    x2, y2 = min(image.width, int(x2)), min(image.height, int(y2))
-                    if x2 <= x1 or y2 <= y1:
-                        continue
-
-                    # 🚫 VALIDATE BEFORE RE-OCR: Skip empty regions
-                    if not validate_region_contains_text(image, bbox, min_text_density=0.03):
-                        print(f" ⚠️ Skipping line {idx+1}: empty region (bbox={bbox})")
-                        # Mark for removal
-                        item['_should_remove'] = True
-                        continue
-
-                    # Add small safety margin to ensure we capture full text
-                    margin = 2  # Small margin to avoid edge clipping
-                    crop_x1 = max(0, x1 - margin)
-                    crop_y1 = max(0, y1 - margin)
-                    crop_x2 = min(image.width, x2 + margin)
-                    crop_y2 = min(image.height, y2 + margin)
-
-                    # Crop and preprocess the line region
-                    crop_img = image.crop((crop_x1, crop_y1, crop_x2, crop_y2))
-
-                    # Validate crop is reasonable size
-                    if crop_img.size[0] < 10 or crop_img.size[1] < 10:
-                        print(f" ⚠️ Skipping line {idx+1}: crop too small ({crop_img.size})")
-                        item['_should_remove'] = True
-                        continue
-
-                    # Apply preprocessing to enhance handwriting quality
-                    crop_img = preprocess_for_handwriting_ocr(crop_img)
-
-                    # Re-OCR this specific line
-                    line_text, line_conf = _generate_text_and_confidence_for_crop(crop_img)
-
-                    # AGGRESSIVE FILTERING: Remove any English words/hallucinations
-                    line_text = line_text.strip()
-
-                    # Remove common English hallucinations
-                    english_hallucinations = [
-                        'Commission', 'commission', 'COMMISSION',
-                        'The', 'the', 'and', 'or', 'of', 'in', 'to', 'a', 'is',
-                        'Text', 'text', 'Title', 'title', 'Caption', 'caption',
-                        'Page', 'page', 'Document', 'document', 'Image', 'image'
-                    ]
-
-                    for hallucination in english_hallucinations:
-                        line_text = line_text.replace(hallucination, '').strip()
-
-                    # Remove any remaining Latin alphabet (keep only Arabic, numbers, punctuation)
-                    import re
-                    # Keep: Arabic letters, Arabic numbers, spaces, basic punctuation
-                    line_text = re.sub(r'[a-zA-Z]+', '', line_text).strip()
-
-                    # If text is empty or too short after filtering, mark for removal
-                    if not line_text or len(line_text.strip()) < 2:
-                        print(f" ⚠️ Skipping line {idx+1}: no meaningful text after filtering")
-                        item['_should_remove'] = True
-                        continue
-
-                    item['text'] = line_text
-                    item['confidence'] = line_conf
-                    item['reocr_completed'] = True
-                    valid_regions.append(item)
-
-                    print(f" ✓ Line {idx+1}/{len(regions_needing_reocr)}: '{line_text[:50]}...' (conf: {line_conf:.1f}%)")
-                except Exception as e:
-                    print(f" ✗ Error re-OCRing line {idx}: {e}")
-                    traceback.print_exc()
-                    item['_should_remove'] = True
-
-            # Remove regions marked for removal
-            layout_data = [item for item in layout_data if not item.get('_should_remove', False)]
-
-            print(f"\n✅ Re-OCR complete. Final layout has {len(layout_data)} regions:")
-            for idx, item in enumerate(layout_data):
-                text = item.get('text', '')[:50]
-                conf = item.get('confidence', 0)
-                reocr = item.get('reocr_completed', False)
-                print(f" Region {idx+1}: '{text}...' (conf={conf:.1f}%, re-OCR={reocr})")
-
-            # 🗑️ FINAL FILTER: Remove any remaining empty/invalid regions
-            layout_data = filter_empty_regions(image, layout_data)
-            print(f"✅ After final filtering: {len(layout_data)} regions remaining")
-
         # 🎯 INTELLIGENT CONFIDENCE SCORING
         # Count text regions to determine if per-region scoring is feasible
         num_text_regions = sum(1 for item in layout_data
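
Two details of the deleted re-OCR filter are worth recording. str.replace is substring-based, so short entries such as 'a' or 'the' also cut into longer words, and the closing regex removes every remaining Latin run regardless, which makes the word list largely redundant. An illustration on a made-up model output:

```python
import re

# Illustration of the deleted post-OCR filtering on a made-up output.
line_text = "Commission النص المكتوب text"

for hallucination in ['Commission', 'Text', 'text']:
    line_text = line_text.replace(hallucination, '').strip()

# Any remaining Latin letters are stripped wholesale anyway.
line_text = re.sub(r'[a-zA-Z]+', '', line_text).strip()
print(line_text)  # -> "النص المكتوب"
```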
@@ -1514,10 +825,6 @@ def process_image(
         # Compute per-region confidence using the model on each cropped region
         for idx, item in enumerate(layout_data):
             try:
-                # Skip if already processed during re-OCR
-                if item.get('reocr_completed'):
-                    continue
-
                 bbox = item.get('bbox', [])
                 text_content = item.get('text', '')
                 category = item.get('category', '')
@@ -1563,10 +870,9 @@ def process_image(
 
         # ✨ ARABIC TEXT CORRECTION: Apply intelligent correction to each text region
         try:
-            print(f"\n🔧 Applying Arabic text correction to {len(layout_data)} regions...")
+            print("🔧 Applying Arabic text correction...")
             corrector = get_corrector()
 
-            corrections_applied = 0
             for idx, item in enumerate(layout_data):
                 text_content = item.get('text', '')
                 category = item.get('category', '')
@@ -1575,8 +881,6 @@
                 if not text_content or category in ['Picture', 'Formula', 'Table']:
                     continue
 
-                print(f" Correcting region {idx+1}: '{text_content[:40]}...'")
-
                 # Apply correction
                 correction_result = corrector.correct_text(text_content)
 
@@ -1589,17 +893,13 @@
 
                 # Update the text field to use corrected version
                 item['text'] = correction_result['corrected']
-
-                if correction_result['corrections_made'] > 0:
-                    corrections_applied += correction_result['corrections_made']
-                    print(f" → Made {correction_result['corrections_made']} corrections")
 
             # Regenerate markdown with corrected text
             corrected_markdown = layoutjson2md(image, layout_data, text_key='text')
             result['markdown_content_corrected'] = corrected_markdown
             result['markdown_content_original'] = markdown_content
 
-            print(f"✅ Correction complete: {corrections_applied} total corrections made across {len(layout_data)} regions")
+            print(f"✅ Correction complete")
 
         except Exception as e:
             print(f"⚠️ Error during Arabic correction: {e}")
 