Update app.py

app.py CHANGED
@@ -34,12 +34,14 @@ torch.backends.cudnn.benchmark = False
 
 # Constants
 MIN_PIXELS = 3136
-MAX_PIXELS = 11289600
+MAX_PIXELS = 16000000  # Increased for better line detection (was 11289600)
 IMAGE_FACTOR = 28
 
 # Prompts
 prompt = """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
 
+CRITICAL REQUIREMENT: Detect EACH LINE of text as a SEPARATE element. Do NOT group multiple lines together into a single bbox.
+
 1. Bbox format: [x1, y1, x2, y2]
 
 2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].
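Note (illustration, not part of the commit): MIN_PIXELS, MAX_PIXELS, and IMAGE_FACTOR look like a Qwen-VL-style pixel budget, where the page image is rescaled so its total pixel count stays within [MIN_PIXELS, MAX_PIXELS]. That reading is an assumption, since the resize helper itself is not shown in this diff. Under that assumption, a rough sketch of why raising the cap helps line-level detection:

import math

def downscale_factor(width: int, height: int, max_pixels: int) -> float:
    # Assumed behaviour: the page is scaled down just enough to fit under max_pixels.
    return min(1.0, math.sqrt(max_pixels / (width * height)))

# A ~400 DPI A4 scan is roughly 3307 x 4677 px (~15.5 megapixels).
print(downscale_factor(3307, 4677, 11289600))  # ~0.85 -> thin text lines lose detail under the old cap
print(downscale_factor(3307, 4677, 16000000))  # 1.0  -> the scan keeps full resolution under the new cap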
@@ -52,6 +54,8 @@ prompt = """Please output the layout information from the PDF image, including e
 
 4. Constraints:
 - The output text must be the original text from the image, with no translation.
+- IMPORTANT: Detect every individual line of text separately - do not merge multiple lines into one element.
+- Each text line should have its own bbox and text content.
 - All layout elements must be sorted according to human reading order.
 
 5. Final Output: The entire output must be a single JSON object.
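Illustration (not part of the commit): with these constraints, the prompt asks for one element per text line. The sample below is an invented guess at the concrete output, but the keys match what the post-processing added later in this commit reads ('bbox', 'category', 'text'), and the top-level value is a list, as the List[Dict[str, Any]] signature of split_text_regions_into_lines expects:

[
  {"bbox": [120, 80, 980, 130], "category": "Section-header", "text": "1. Introduction"},
  {"bbox": [120, 150, 980, 195], "category": "Text", "text": "First line of the paragraph, as its own element"},
  {"bbox": [120, 200, 980, 245], "category": "Text", "text": "Second line, also as its own element"}
]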
@@ -587,6 +591,94 @@ def estimate_text_density(image: Image.Image) -> float:
         return 0.1  # Default to low density
 
 
+def split_text_regions_into_lines(
+    image: Image.Image,
+    layout_data: List[Dict[str, Any]],
+    min_line_height: int = 30,
+    max_line_height: int = 120
+) -> List[Dict[str, Any]]:
+    """
+    Post-process layout data to split large text regions into individual lines.
+
+    This ensures each line gets its own bounding box for easier verification.
+
+    Args:
+        image: Original image
+        layout_data: Layout detection results
+        min_line_height: Minimum height for a text line (pixels)
+        max_line_height: Maximum height for a single line before splitting
+
+    Returns:
+        Updated layout data with lines split
+    """
+    result = []
+    split_count = 0
+
+    for item in layout_data:
+        bbox = item.get('bbox', [])
+        category = item.get('category', '')
+        text_content = item.get('text', '')
+
+        # Only split Text regions (not titles, headers, tables, etc.)
+        if len(bbox) != 4 or category not in ['Text', 'List-item']:
+            result.append(item)
+            continue
+
+        x1, y1, x2, y2 = bbox
+        height = y2 - y1
+
+        # If region is tall enough to contain multiple lines, split it
+        if height > max_line_height:
+            # Estimate number of lines based on typical line height
+            # Arabic handwritten text: ~40-60px per line
+            # Arabic typed text: ~30-50px per line
+            avg_line_height = 45  # Middle ground
+            estimated_lines = max(1, round(height / avg_line_height))
+
+            # Don't split into too many lines (might be a paragraph)
+            estimated_lines = min(estimated_lines, 10)
+
+            line_height = height / estimated_lines
+
+            # Split text content by newlines if available
+            text_lines = text_content.split('\n') if text_content else []
+
+            # If we have the same number of text lines as estimated, use them
+            if len(text_lines) == estimated_lines and len(text_lines) > 1:
+                for i, line_text in enumerate(text_lines):
+                    if not line_text.strip():
+                        continue
+                    new_item = item.copy()
+                    new_y1 = y1 + (i * line_height)
+                    new_y2 = y1 + ((i + 1) * line_height)
+                    new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
+                    new_item['text'] = line_text.strip()
+                    new_item['split_from_parent'] = True
+                    result.append(new_item)
+                    split_count += 1
+            else:
+                # Split geometrically but keep full text in each (user can verify)
+                for i in range(estimated_lines):
+                    new_item = item.copy()
+                    new_y1 = y1 + (i * line_height)
+                    new_y2 = y1 + ((i + 1) * line_height)
+                    new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
+                    # Keep original text (cropping will happen visually)
+                    new_item['text'] = text_content  # User can edit in table
+                    new_item['split_from_parent'] = True
+                    new_item['line_number'] = i + 1
+                    result.append(new_item)
+                    split_count += 1
+        else:
+            # Region is already line-sized, keep as is
+            result.append(item)
+
+    if split_count > 0:
+        print(f"📏 Split {split_count} large regions into individual lines ({len(layout_data)} → {len(result)} regions)")
+
+    return result
+
+
 def should_chunk_image(image: Image.Image) -> Tuple[bool, str]:
     """
     Intelligently determine if image should be chunked for better accuracy.
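Illustration (not part of the commit): a minimal usage sketch of the new function on a single tall 'Text' region, assuming the function above is in scope and using invented sample values:

from PIL import Image

# One hypothetical region from the layout model: 135 px tall with three newline-separated lines.
layout_data = [
    {"bbox": [100, 200, 900, 335], "category": "Text", "text": "line one\nline two\nline three"}
]

page = Image.new("RGB", (1000, 1400), "white")  # stand-in for the real page image
for item in split_text_regions_into_lines(page, layout_data):
    print(item["bbox"], item["text"])

# height = 135 > max_line_height (120); estimated_lines = round(135 / 45) = 3, which matches the
# three newline-separated text lines, so each line gets its own 45 px bbox:
# [100, 200, 900, 245] line one
# [100, 245, 900, 290] line two
# [100, 290, 900, 335] line three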
@@ -813,6 +905,14 @@ def process_image(
         # Try to parse JSON output
         layout_data = json.loads(raw_output)
 
+        # 📏 LINE-LEVEL SPLITTING: Split large text regions into individual lines
+        # This ensures each line gets its own bounding box for easier verification
+        try:
+            layout_data = split_text_regions_into_lines(image, layout_data)
+        except Exception as e:
+            print(f"⚠️ Warning: Could not split text regions: {e}")
+            # Continue with original layout data
+
         # 🎯 INTELLIGENT CONFIDENCE SCORING
         # Count text regions to determine if per-region scoring is feasible
         num_text_regions = sum(1 for item in layout_data