VanguardAI committed
Commit 4a32a6e · verified · 1 parent: 0fe4722

Update app.py

Files changed (1):
  1. app.py +7 -707
app.py CHANGED
@@ -34,16 +34,11 @@ torch.backends.cudnn.benchmark = False
 
 # Constants
 MIN_PIXELS = 3136
-MAX_PIXELS = 16000000 # Increased for better line detection (was 11289600)
+MAX_PIXELS = 11289600
 IMAGE_FACTOR = 28
 
 # Prompts
-prompt = """Please output the layout information from the document image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
-
-CRITICAL REQUIREMENTS:
-- Detect EACH LINE of text as a SEPARATE element. Do NOT group multiple lines together into a single bbox.
-- This document may contain ARABIC HANDWRITTEN or TYPED text. Extract it accurately character-by-character.
-- Output ONLY the text you see - NO translation, NO English words, NO explanations.
+prompt = """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
 
 1. Bbox format: [x1, y1, x2, y2]
 
@@ -54,14 +49,10 @@ CRITICAL REQUIREMENTS:
 - Formula: Format its text as LaTeX.
 - Table: Format its text as HTML.
 - All Others (Text, Title, etc.): Format their text as Markdown.
-- For Arabic text (handwritten or typed): Extract exactly as written, character-by-character.
 
 4. Constraints:
-- The output text must be the EXACT original text from the image, with NO translation whatsoever.
-- IMPORTANT: Detect every individual line of text separately - do not merge multiple lines into one element.
-- Each text line should have its own bbox and text content.
-- All layout elements must be sorted according to human reading order (right-to-left for Arabic).
-- Focus on ACCURACY over speed - take time to recognize each character correctly.
+- The output text must be the original text from the image, with no translation.
+- All layout elements must be sorted according to human reading order.
 
 5. Final Output: The entire output must be a single JSON object.
 """
@@ -500,18 +491,7 @@ def _generate_text_and_confidence_for_crop(
             {"type": "image", "image": image},
             {
                 "type": "text",
-                "text": """Extract ONLY the Arabic text from this image line.
-
-STRICT RULES:
-- Output ONLY Arabic characters you see in the image
-- NO English words whatsoever (no 'Commission', 'Text', etc.)
-- NO translations
-- NO explanations
-- NO additional text
-- If you see handwriting, transcribe it exactly
-- If there is no Arabic text, output nothing
-
-Extract the Arabic text now:""",
+                "text": "Extract the exact text content from this image region. Output text only without translation or additional words.",
             },
         ],
     }
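
The replaced string is the text half of a single user turn. For orientation, the surrounding payload in Qwen-VL-style chat processors generally has the shape sketched below; only the two content entries actually appear in this diff, so the role key and outer wrapper are assumptions.

```python
def build_crop_messages(image):
    """Hypothetical reconstruction of the message list around the new prompt."""
    return [{
        "role": "user",  # assumed; not visible in this diff
        "content": [
            {"type": "image", "image": image},
            {
                "type": "text",
                "text": ("Extract the exact text content from this image region. "
                         "Output text only without translation or additional words."),
            },
        ],
    }]
```

The new wording drops the Arabic-only rules, consistent with the removal of the language-specific post-filtering later in this commit.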
@@ -580,43 +560,6 @@ Extract the Arabic text now:""",
         return "", 0.0
 
 
-def preprocess_for_handwriting_ocr(image: Image.Image) -> Image.Image:
-    """
-    Enhance image quality for better handwriting OCR.
-
-    Applies:
-    - Contrast enhancement
-    - Sharpening
-    - Noise reduction (if needed)
-    """
-    try:
-        from PIL import ImageEnhance, ImageFilter
-
-        # Convert to RGB if needed
-        if image.mode != 'RGB':
-            image = image.convert('RGB')
-
-        # 1. Increase contrast to make text more distinct from background
-        enhancer = ImageEnhance.Contrast(image)
-        image = enhancer.enhance(1.5)  # Boost contrast by 50%
-
-        # 2. Increase sharpness to make character edges clearer
-        enhancer = ImageEnhance.Sharpness(image)
-        image = enhancer.enhance(1.8)  # Significant sharpening
-
-        # 3. Slight brightness adjustment if image is too dark
-        enhancer = ImageEnhance.Brightness(image)
-        image = enhancer.enhance(1.1)
-
-        # 4. Apply unsharp mask for better edge definition
-        image = image.filter(ImageFilter.UnsharpMask(radius=1, percent=150, threshold=3))
-
-        return image
-    except Exception as e:
-        print(f"⚠️ Warning: Image preprocessing failed: {e}")
-        return image  # Return original if preprocessing fails
-
-
 def estimate_text_density(image: Image.Image) -> float:
     """
     Estimate text density in image using pixel analysis.
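
Before its removal, this helper was applied to each split line crop ahead of re-OCR (see the process_image hunk further down). A minimal usage sketch, with a hypothetical input path:

```python
from PIL import Image

# Hedged usage sketch of the deleted helper; the file path is illustrative.
crop = Image.open("line_crop.png")
enhanced = preprocess_for_handwriting_ocr(crop)  # contrast, sharpen, brighten
text, conf = _generate_text_and_confidence_for_crop(enhanced)
```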
@@ -644,518 +587,6 @@ def estimate_text_density(image: Image.Image) -> float:
         return 0.1  # Default to low density
 
 
-def analyze_image_line_characteristics(image: Image.Image) -> Dict[str, float]:
-    """
-    Analyze image to determine optimal line detection parameters.
-    Works adaptively for any image type (sparse, dense, tables, forms).
-
-    Returns dict with: avg_line_height, min_line_height, max_line_height, line_spacing
-    """
-    try:
-        width, height = image.size
-        gray = image.convert('L')
-        img_array = np.array(gray)
-
-        # Horizontal projection: sum of dark pixels per row
-        row_sums = np.sum(img_array < 128, axis=1)
-
-        if len(row_sums) < 10:
-            # Fallback for very small images
-            return {
-                'avg_line_height': height / 10,  # Assume ~10 lines
-                'min_line_height': max(15, height / 20),
-                'max_line_height': height / 3,  # Split if > 1/3 of image height
-                'line_spacing': height / 15
-            }
-
-        # Find text rows (peaks in projection)
-        mean_val = np.mean(row_sums)
-        std_val = np.std(row_sums)
-        threshold = max(mean_val * 0.2, mean_val - std_val * 0.3)
-        text_rows = np.where(row_sums > threshold)[0]
-
-        if len(text_rows) < 2:
-            # No clear text lines detected, use conservative estimates
-            estimated_lines = max(5, height // 50)
-            return {
-                'avg_line_height': height / estimated_lines,
-                'min_line_height': max(15, height / (estimated_lines * 2)),
-                'max_line_height': height / 2,  # Split if > half image
-                'line_spacing': height / estimated_lines
-            }
-
-        # Group consecutive text rows into lines
-        line_centers = []
-        current_group = [text_rows[0]]
-
-        for i in range(1, len(text_rows)):
-            if text_rows[i] - text_rows[i-1] <= 5:  # Consecutive rows
-                current_group.append(text_rows[i])
-            else:
-                line_centers.append(int(np.mean(current_group)))
-                current_group = [text_rows[i]]
-
-        if current_group:
-            line_centers.append(int(np.mean(current_group)))
-
-        if len(line_centers) < 2:
-            # Can't determine spacing
-            estimated_lines = max(3, height // 60)
-            return {
-                'avg_line_height': height / estimated_lines,
-                'min_line_height': max(20, height / (estimated_lines * 2)),
-                'max_line_height': height / 2,
-                'line_spacing': height / estimated_lines
-            }
-
-        # Calculate spacing between lines
-        spacings = []
-        for i in range(len(line_centers) - 1):
-            spacing = line_centers[i+1] - line_centers[i]
-            if spacing > 8:  # Minimum reasonable spacing
-                spacings.append(spacing)
-
-        if spacings:
-            avg_spacing = np.median(spacings)
-            min_spacing = np.percentile(spacings, 25)
-            max_spacing = np.percentile(spacings, 75)
-
-            return {
-                'avg_line_height': float(avg_spacing),
-                'min_line_height': float(max(15, min_spacing * 0.6)),  # 60% of min spacing
-                'max_line_height': float(max_spacing * 1.5),  # 1.5x max spacing = likely multi-line
-                'line_spacing': float(avg_spacing),
-                'num_lines_detected': len(line_centers)
-            }
-
-        # Fallback
-        estimated_lines = max(3, height // 50)
-        return {
-            'avg_line_height': height / estimated_lines,
-            'min_line_height': max(20, height / (estimated_lines * 2)),
-            'max_line_height': height / 2,
-            'line_spacing': height / estimated_lines
-        }
-
-    except Exception as e:
-        print(f" ⚠️ Error analyzing image: {e}")
-        # Ultra-conservative fallback
-        width, height = image.size
-        return {
-            'avg_line_height': 50,
-            'min_line_height': 25,
-            'max_line_height': 100,
-            'line_spacing': 50
-        }
-
-
-def validate_region_contains_text(image: Image.Image, bbox: List[int], min_text_density: float = 0.05) -> bool:
-    """
-    Validate that a bounding box region actually contains text (not empty space).
-
-    Args:
-        image: Original image
-        bbox: Bounding box [x1, y1, x2, y2]
-        min_text_density: Minimum fraction of pixels that should be text (dark pixels)
-
-    Returns:
-        True if region contains sufficient text, False otherwise
-    """
-    try:
-        x1, y1, x2, y2 = bbox
-        x1, y1 = max(0, int(x1)), max(0, int(y1))
-        x2, y2 = min(image.width, int(x2)), min(image.height, int(y2))
-
-        if x2 <= x1 or y2 <= y1:
-            return False
-
-        # Crop region
-        crop = image.crop((x1, y1, x2, y2))
-        gray = crop.convert('L')
-        img_array = np.array(gray)
-
-        if img_array.size == 0:
-            return False
-
-        # Calculate text density (fraction of dark pixels)
-        # For handwriting/text, we expect at least some dark pixels
-        dark_pixels = np.sum(img_array < 128)  # Pixels darker than middle gray
-        total_pixels = img_array.size
-        text_density = dark_pixels / total_pixels if total_pixels > 0 else 0
-
-        # Also check for minimum height/width (avoid tiny regions)
-        height = y2 - y1
-        width = x2 - x1
-        min_dimension = min(height, width)
-
-        # Reject if:
-        # 1. Text density too low (mostly empty space)
-        # 2. Region too small (likely noise)
-        if text_density < min_text_density:
-            return False
-
-        if min_dimension < 15:  # Too small to be a real text line
-            return False
-
-        return True
-
-    except Exception as e:
-        print(f" ⚠️ Error validating region: {e}")
-        return False
-
-
-def filter_empty_regions(image: Image.Image, layout_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-    """
-    Filter out regions that are empty spaces, noise, or false positives.
-
-    This removes:
-    - Regions with very low text density (empty margins/spaces)
-    - Regions that are too small
-    - Regions that are likely noise artifacts
-    """
-    filtered = []
-    removed_count = 0
-
-    for item in layout_data:
-        bbox = item.get('bbox', [])
-        category = item.get('category', '')
-        text = item.get('text', '').strip()
-
-        # Skip if no bbox
-        if not bbox or len(bbox) != 4:
-            continue
-
-        # For Text/List-item regions, validate they contain actual text
-        if category in ['Text', 'List-item']:
-            if not validate_region_contains_text(image, bbox):
-                print(f" 🗑️ Removing empty region: {category} bbox={bbox}")
-                removed_count += 1
-                continue
-
-        # Even if region passes validation, check if text is meaningful
-        # Remove regions with very short or meaningless text
-        if category in ['Text', 'List-item']:
-            # Remove if text is empty or too short (likely noise)
-            if not text or len(text.strip()) < 2:
-                # But only if it also failed validation
-                if not validate_region_contains_text(image, bbox, min_text_density=0.03):
-                    print(f" 🗑️ Removing empty/noise region: {category} bbox={bbox}")
-                    removed_count += 1
-                    continue
-
-        filtered.append(item)
-
-    if removed_count > 0:
-        print(f"🗑️ Filtered out {removed_count} empty/noise regions")
-
-    return filtered
-
-
-def detect_line_spacing(image: Image.Image, bbox: List[int]) -> Optional[float]:
-    """
-    Detect average line spacing in a text region using horizontal projection analysis.
-
-    Returns estimated line height in pixels, or None if detection fails.
-    """
-    try:
-        x1, y1, x2, y2 = bbox
-        crop = image.crop((x1, y1, x2, y2))
-
-        # Convert to grayscale
-        gray = crop.convert('L')
-        img_array = np.array(gray)
-
-        if img_array.size == 0:
-            return None
-
-        # Horizontal projection: sum of dark pixels per row
-        row_sums = np.sum(img_array < 128, axis=1)
-
-        if len(row_sums) < 10:
-            return None
-
-        # Find peaks (text lines) and valleys (spacing between lines)
-        mean_val = np.mean(row_sums)
-        std_val = np.std(row_sums)
-        threshold = max(mean_val * 0.25, mean_val - std_val * 0.4)
-
-        text_rows = np.where(row_sums > threshold)[0]
-
-        if len(text_rows) < 2:
-            return None
-
-        # Group consecutive rows to find line centers
-        line_centers = []
-        current_group = [text_rows[0]]
-
-        for i in range(1, len(text_rows)):
-            if text_rows[i] - text_rows[i-1] <= 3:
-                current_group.append(text_rows[i])
-            else:
-                line_centers.append(int(np.mean(current_group)))
-                current_group = [text_rows[i]]
-
-        if current_group:
-            line_centers.append(int(np.mean(current_group)))
-
-        if len(line_centers) < 2:
-            return None
-
-        # Calculate spacing between line centers
-        spacings = []
-        for i in range(len(line_centers) - 1):
-            spacing = line_centers[i+1] - line_centers[i]
-            if spacing > 10:
-                spacings.append(spacing)
-
-        if spacings:
-            return float(np.median(spacings))
-
-        return None
-    except Exception as e:
-        return None
-
-
-def detect_actual_line_breaks_in_region(image: Image.Image, bbox: List[int]) -> List[int]:
-    """
-    Detect actual line break positions within a text region using horizontal projection.
-    Only detects breaks if region has sufficient text density (not empty space).
-
-    Returns list of y-coordinates where lines break.
-    """
-    try:
-        x1, y1, x2, y2 = bbox
-        crop = image.crop((x1, y1, x2, y2))
-        gray = crop.convert('L')
-        img_array = np.array(gray)
-
-        if img_array.size == 0:
-            return []
-
-        # FIRST: Validate region has actual text (not empty space)
-        total_pixels = img_array.size
-        dark_pixels = np.sum(img_array < 128)
-        text_density = dark_pixels / total_pixels if total_pixels > 0 else 0
-
-        # Require minimum text density to avoid false positives on empty regions
-        if text_density < 0.03:  # Less than 3% dark pixels = likely empty
-            return []
-
-        # Horizontal projection
-        row_sums = np.sum(img_array < 128, axis=1)
-
-        if len(row_sums) < 10:  # Need enough rows
-            return []
-
-        # Find valleys (spaces between lines) and peaks (text lines)
-        mean_val = np.mean(row_sums)
-        std_val = np.std(row_sums)
-
-        # More aggressive thresholds to avoid false positives
-        text_threshold = max(mean_val * 0.3, mean_val - std_val * 0.5)
-        space_threshold = mean_val * 0.1  # Very low for actual spaces
-
-        # Find text rows and space rows
-        text_rows = np.where(row_sums > text_threshold)[0]
-
-        if len(text_rows) < 5:  # Need substantial text rows
-            return []
-
-        # Group text rows into lines
-        line_groups = []
-        current_group = [text_rows[0]]
-
-        for i in range(1, len(text_rows)):
-            if text_rows[i] - text_rows[i-1] <= 3:
-                current_group.append(text_rows[i])
-            else:
-                if len(current_group) >= 3:  # Require minimum group size
-                    line_groups.append(current_group)
-                current_group = [text_rows[i]]
-
-        if len(current_group) >= 3:
-            line_groups.append(current_group)
-
-        if len(line_groups) < 2:
-            return []  # Single line or can't detect
-
-        # Find break points (midpoints between line groups)
-        # Require minimum gap between lines to avoid false splits
-        break_points = []
-        for i in range(len(line_groups) - 1):
-            last_row_of_line1 = max(line_groups[i])
-            first_row_of_line2 = min(line_groups[i+1])
-            gap = first_row_of_line2 - last_row_of_line1
-
-            # Only split if gap is substantial (at least 5 pixels)
-            if gap >= 5:
-                break_point = (last_row_of_line1 + first_row_of_line2) // 2
-                break_points.append(y1 + break_point)  # Convert to image coordinates
-
-        return break_points
-
-    except Exception as e:
-        print(f" ⚠️ Error detecting line breaks: {e}")
-        return []
-
-
-def split_text_regions_into_lines(
-    image: Image.Image,
-    layout_data: List[Dict[str, Any]],
-    min_line_height: Optional[int] = None,
-    max_line_height: Optional[int] = None
-) -> List[Dict[str, Any]]:
-    """
-    Intelligently split text regions into individual lines.
-
-    ADAPTIVE APPROACH:
-    - Analyzes image to determine optimal parameters
-    - Detects actual line breaks using image analysis
-    - Works for any image type (sparse, dense, tables, forms)
-    - No hardcoded thresholds
-
-    Args:
-        image: Original image
-        layout_data: Layout detection results
-        min_line_height: Optional override (auto-detected if None)
-        max_line_height: Optional override (auto-detected if None)
-
-    Returns:
-        Updated layout data with lines split
-    """
-    # Analyze image to get adaptive parameters
-    img_chars = analyze_image_line_characteristics(image)
-    adaptive_min = min_line_height if min_line_height else int(img_chars['min_line_height'])
-    adaptive_max = max_line_height if max_line_height else int(img_chars['max_line_height'])
-    avg_line_height = img_chars['avg_line_height']
-
-    print(f"\n📊 Image analysis: avg_line_height={avg_line_height:.1f}px, "
-          f"min={adaptive_min}px, max={adaptive_max}px")
-    if 'num_lines_detected' in img_chars:
-        print(f" Detected ~{img_chars['num_lines_detected']} lines in image")
-
-    result = []
-    split_count = 0
-
-    for item in layout_data:
-        bbox = item.get('bbox', [])
-        category = item.get('category', '')
-        text_content = item.get('text', '')
-
-        # Only split Text regions (not titles, headers, tables, etc.)
-        if len(bbox) != 4 or category not in ['Text', 'List-item']:
-            result.append(item)
-            continue
-
-        x1, y1, x2, y2 = bbox
-        height = y2 - y1
-        width = x2 - x1
-
-        # FIRST: Validate region actually contains text before trying to split
-        if not validate_region_contains_text(image, bbox, min_text_density=0.03):
-            print(f" Region: {category} (h={height}px) - Empty/noise region, skipping split")
-            # Don't add empty regions - they'll be filtered out later
-            continue
-
-        # ALWAYS check if region contains multiple lines, regardless of height
-        # Use image analysis to detect actual line breaks
-        line_breaks = detect_actual_line_breaks_in_region(image, bbox)
-
-        if len(line_breaks) > 0:
-            # We detected actual line breaks - split at those positions
-            print(f" Region: {category} (h={height}px) - Detected {len(line_breaks)+1} lines via image analysis")
-
-            # Create lines based on detected breaks
-            current_y = y1
-            for i, break_y in enumerate(line_breaks):
-                # Create line from current_y to break_y
-                new_bbox = [x1, int(current_y), x2, int(break_y)]
-
-                # Validate split region contains text before adding
-                if validate_region_contains_text(image, new_bbox, min_text_density=0.03):
-                    new_item = item.copy()
-                    new_item['bbox'] = new_bbox
-                    new_item['text'] = ""  # Will be re-OCR'd
-                    new_item['split_from_parent'] = True
-                    new_item['needs_reocr'] = True
-                    new_item['line_number'] = i + 1
-                    result.append(new_item)
-
-                current_y = break_y
-
-            # Add last line
-            final_bbox = [x1, int(current_y), x2, y2]
-            if validate_region_contains_text(image, final_bbox, min_text_density=0.03):
-                new_item = item.copy()
-                new_item['bbox'] = final_bbox
-                new_item['text'] = ""
-                new_item['split_from_parent'] = True
-                new_item['needs_reocr'] = True
-                new_item['line_number'] = len(line_breaks) + 1
-                result.append(new_item)
-
-            split_count += 1
-
-        elif height > adaptive_max:
-            # No line breaks detected but region is tall - use spacing-based split
-            print(f" Region: {category} (h={height}px) - Tall region, using spacing-based split")
-
-            # Try to detect spacing in this specific region
-            detected_spacing = detect_line_spacing(image, bbox)
-
-            if detected_spacing and detected_spacing > adaptive_min:
-                line_height = detected_spacing
-                estimated_lines = max(2, round(height / line_height))
-            else:
-                line_height = avg_line_height
-                estimated_lines = max(2, round(height / line_height))
-
-            estimated_lines = min(estimated_lines, 15)  # Cap at 15 lines
-
-            # Calculate padding (adaptive: 8% of line height, min 2px)
-            padding = max(2, int(line_height * 0.08))
-
-            # Split geometrically
-            for i in range(estimated_lines):
-                if i == 0:
-                    new_y1 = y1
-                    new_y2 = y1 + line_height + padding
-                elif i == estimated_lines - 1:
-                    new_y1 = y1 + (i * line_height) - padding
-                    new_y2 = y2
-                else:
-                    new_y1 = y1 + (i * line_height) - padding
-                    new_y2 = y1 + ((i + 1) * line_height) + padding
-
-                new_y1 = max(y1, int(new_y1))
-                new_y2 = min(y2, int(new_y2))
-
-                if new_y2 > new_y1:
-                    new_bbox = [x1, new_y1, x2, new_y2]
-
-                    # Validate split region contains text before adding
-                    if validate_region_contains_text(image, new_bbox, min_text_density=0.03):
-                        new_item = item.copy()
-                        new_item['bbox'] = new_bbox
-                        new_item['text'] = ""
-                        new_item['split_from_parent'] = True
-                        new_item['needs_reocr'] = True
-                        new_item['line_number'] = i + 1
-                        result.append(new_item)
-
-            split_count += 1
-
-        else:
-            # Region is reasonably sized - keep as is
-            print(f" Region: {category} (h={height}px) - Keeping as single line")
-            result.append(item)
-
-    if split_count > 0:
-        print(f"📏 Split {split_count} regions into {len(result)} total lines")
-
-    return result
-
-
 def should_chunk_image(image: Image.Image) -> Tuple[bool, str]:
     """
     Intelligently determine if image should be chunked for better accuracy.
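
All of the deleted helpers above share one primitive: a horizontal projection (dark-pixel count per row) that is thresholded and grouped into runs of consecutive rows to locate line bands. Here is a self-contained toy example of that primitive, with the thresholding simplified from the deleted code:

```python
import numpy as np

# Toy demonstration of the horizontal-projection primitive used by the
# deleted line-detection helpers (simplified thresholding).
img = np.full((12, 40), 255, dtype=np.uint8)  # white canvas
img[2:4, 5:35] = 0    # first "text line"
img[8:10, 5:35] = 0   # second "text line"

row_sums = np.sum(img < 128, axis=1)    # dark pixels per row
text_rows = np.where(row_sums > 0)[0]   # rows containing ink

# Group consecutive rows into line bands, as the deleted code did.
bands, current = [], [text_rows[0]]
for r in text_rows[1:]:
    if r - current[-1] <= 3:
        current.append(r)
    else:
        bands.append(current)
        current = [r]
bands.append(current)

print([(b[0], b[-1]) for b in bands])  # [(2, 3), (8, 9)] -> two lines
```

detect_line_spacing then took the median gap between band centers, while detect_actual_line_breaks_in_region placed break points at the midpoints between bands.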
@@ -1382,126 +813,6 @@ def process_image(
         # Try to parse JSON output
         layout_data = json.loads(raw_output)
 
-        # 🗑️ FIRST FILTER: Remove empty regions and false positives from initial detection
-        print(f"\n📋 Initial layout: {len(layout_data)} regions detected")
-        layout_data = filter_empty_regions(image, layout_data)
-        print(f"✅ After initial filtering: {len(layout_data)} regions remaining")
-
-        for idx, item in enumerate(layout_data):
-            bbox = item.get('bbox', [])
-            text = item.get('text', '')[:50]
-            cat = item.get('category', '')
-            print(f" Region {idx+1}: {cat} - '{text}...' bbox={bbox}")
-
-        try:
-            layout_data_before = len(layout_data)
-            layout_data = split_text_regions_into_lines(image, layout_data)
-            print(f"📐 After splitting: {layout_data_before} → {len(layout_data)} regions")
-
-            # 🗑️ SECOND FILTER: Remove any empty regions created during splitting
-            layout_data = filter_empty_regions(image, layout_data)
-            print(f"✅ After post-split filtering: {len(layout_data)} regions remaining")
-
-        except Exception as e:
-            print(f"⚠️ Warning: Could not split text regions: {e}")
-            traceback.print_exc()
-            # Continue with original layout data
-
-        # 🔄 RE-OCR SPLIT LINES: For split regions, perform per-line OCR
-        regions_needing_reocr = [item for item in layout_data if item.get('needs_reocr')]
-        if regions_needing_reocr:
-            print(f"🔄 Re-OCRing {len(regions_needing_reocr)} split line regions for accurate per-line text...")
-            valid_regions = []
-            for idx, item in enumerate(regions_needing_reocr):
-                try:
-                    bbox = item.get('bbox', [])
-                    if not bbox or len(bbox) != 4:
-                        continue
-                    x1, y1, x2, y2 = bbox
-                    x1, y1 = max(0, int(x1)), max(0, int(y1))
-                    x2, y2 = min(image.width, int(x2)), min(image.height, int(y2))
-                    if x2 <= x1 or y2 <= y1:
-                        continue
-
-                    # 🚫 VALIDATE BEFORE RE-OCR: Skip empty regions
-                    if not validate_region_contains_text(image, bbox, min_text_density=0.03):
-                        print(f" ⚠️ Skipping line {idx+1}: empty region (bbox={bbox})")
-                        # Mark for removal
-                        item['_should_remove'] = True
-                        continue
-
-                    # Add small safety margin to ensure we capture full text
-                    margin = 2  # Small margin to avoid edge clipping
-                    crop_x1 = max(0, x1 - margin)
-                    crop_y1 = max(0, y1 - margin)
-                    crop_x2 = min(image.width, x2 + margin)
-                    crop_y2 = min(image.height, y2 + margin)
-
-                    # Crop and preprocess the line region
-                    crop_img = image.crop((crop_x1, crop_y1, crop_x2, crop_y2))
-
-                    # Validate crop is reasonable size
-                    if crop_img.size[0] < 10 or crop_img.size[1] < 10:
-                        print(f" ⚠️ Skipping line {idx+1}: crop too small ({crop_img.size})")
-                        item['_should_remove'] = True
-                        continue
-
-                    # Apply preprocessing to enhance handwriting quality
-                    crop_img = preprocess_for_handwriting_ocr(crop_img)
-
-                    # Re-OCR this specific line
-                    line_text, line_conf = _generate_text_and_confidence_for_crop(crop_img)
-
-                    # AGGRESSIVE FILTERING: Remove any English words/hallucinations
-                    line_text = line_text.strip()
-
-                    # Remove common English hallucinations
-                    english_hallucinations = [
-                        'Commission', 'commission', 'COMMISSION',
-                        'The', 'the', 'and', 'or', 'of', 'in', 'to', 'a', 'is',
-                        'Text', 'text', 'Title', 'title', 'Caption', 'caption',
-                        'Page', 'page', 'Document', 'document', 'Image', 'image'
-                    ]
-
-                    for hallucination in english_hallucinations:
-                        line_text = line_text.replace(hallucination, '').strip()
-
-                    # Remove any remaining Latin alphabet (keep only Arabic, numbers, punctuation)
-                    import re
-                    # Keep: Arabic letters, Arabic numbers, spaces, basic punctuation
-                    line_text = re.sub(r'[a-zA-Z]+', '', line_text).strip()
-
-                    # If text is empty or too short after filtering, mark for removal
-                    if not line_text or len(line_text.strip()) < 2:
-                        print(f" ⚠️ Skipping line {idx+1}: no meaningful text after filtering")
-                        item['_should_remove'] = True
-                        continue
-
-                    item['text'] = line_text
-                    item['confidence'] = line_conf
-                    item['reocr_completed'] = True
-                    valid_regions.append(item)
-
-                    print(f" ✓ Line {idx+1}/{len(regions_needing_reocr)}: '{line_text[:50]}...' (conf: {line_conf:.1f}%)")
-                except Exception as e:
-                    print(f" ✗ Error re-OCRing line {idx}: {e}")
-                    traceback.print_exc()
-                    item['_should_remove'] = True
-
-            # Remove regions marked for removal
-            layout_data = [item for item in layout_data if not item.get('_should_remove', False)]
-
-            print(f"\n✅ Re-OCR complete. Final layout has {len(layout_data)} regions:")
-            for idx, item in enumerate(layout_data):
-                text = item.get('text', '')[:50]
-                conf = item.get('confidence', 0)
-                reocr = item.get('reocr_completed', False)
-                print(f" Region {idx+1}: '{text}...' (conf={conf:.1f}%, re-OCR={reocr})")
-
-            # 🗑️ FINAL FILTER: Remove any remaining empty/invalid regions
-            layout_data = filter_empty_regions(image, layout_data)
-            print(f"✅ After final filtering: {len(layout_data)} regions remaining")
-
         # 🎯 INTELLIGENT CONFIDENCE SCORING
         # Count text regions to determine if per-region scoring is feasible
         num_text_regions = sum(1 for item in layout_data
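
Two details of the deleted re-OCR filter are worth recording. str.replace is substring-based, so short entries such as 'a' or 'the' also cut into longer words, and the closing regex removes every remaining Latin run regardless, which makes the word list largely redundant. An illustration on a made-up model output:

```python
import re

# Illustration of the deleted post-OCR filtering on a made-up output.
line_text = "Commission النص المكتوب text"

for hallucination in ['Commission', 'Text', 'text']:
    line_text = line_text.replace(hallucination, '').strip()

# Any remaining Latin letters are stripped wholesale anyway.
line_text = re.sub(r'[a-zA-Z]+', '', line_text).strip()
print(line_text)  # -> "النص المكتوب"
```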
@@ -1514,10 +825,6 @@ def process_image(
         # Compute per-region confidence using the model on each cropped region
         for idx, item in enumerate(layout_data):
             try:
-                # Skip if already processed during re-OCR
-                if item.get('reocr_completed'):
-                    continue
-
                 bbox = item.get('bbox', [])
                 text_content = item.get('text', '')
                 category = item.get('category', '')
@@ -1563,10 +870,9 @@ def process_image(
 
         # ✨ ARABIC TEXT CORRECTION: Apply intelligent correction to each text region
         try:
-            print(f"\n🔧 Applying Arabic text correction to {len(layout_data)} regions...")
+            print("🔧 Applying Arabic text correction...")
             corrector = get_corrector()
 
-            corrections_applied = 0
             for idx, item in enumerate(layout_data):
                 text_content = item.get('text', '')
                 category = item.get('category', '')
@@ -1575,8 +881,6 @@
                 if not text_content or category in ['Picture', 'Formula', 'Table']:
                     continue
 
-                print(f" Correcting region {idx+1}: '{text_content[:40]}...'")
-
                 # Apply correction
                 correction_result = corrector.correct_text(text_content)
 
@@ -1589,17 +893,13 @@
 
                 # Update the text field to use corrected version
                 item['text'] = correction_result['corrected']
-
-                if correction_result['corrections_made'] > 0:
-                    corrections_applied += correction_result['corrections_made']
-                    print(f" → Made {correction_result['corrections_made']} corrections")
 
             # Regenerate markdown with corrected text
             corrected_markdown = layoutjson2md(image, layout_data, text_key='text')
             result['markdown_content_corrected'] = corrected_markdown
             result['markdown_content_original'] = markdown_content
 
-            print(f"✅ Correction complete: {corrections_applied} total corrections made across {len(layout_data)} regions")
+            print(f"✅ Correction complete")
 
         except Exception as e:
             print(f"⚠️ Error during Arabic correction: {e}")
 