VanguardAI committed on
Commit
e14005e
·
verified ·
1 Parent(s): 928fb2c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +101 -1
app.py CHANGED
@@ -34,12 +34,14 @@ torch.backends.cudnn.benchmark = False
34
 
35
  # Constants
36
  MIN_PIXELS = 3136
37
- MAX_PIXELS = 11289600
38
  IMAGE_FACTOR = 28
39
 
40
  # Prompts
41
  prompt = """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
42
 
 
 
43
  1. Bbox format: [x1, y1, x2, y2]
44
 
45
  2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].
@@ -52,6 +54,8 @@ prompt = """Please output the layout information from the PDF image, including e
52
 
53
  4. Constraints:
54
  - The output text must be the original text from the image, with no translation.
 
 
55
  - All layout elements must be sorted according to human reading order.
56
 
57
  5. Final Output: The entire output must be a single JSON object.
@@ -587,6 +591,94 @@ def estimate_text_density(image: Image.Image) -> float:
587
  return 0.1 # Default to low density
588
 
589
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
590
  def should_chunk_image(image: Image.Image) -> Tuple[bool, str]:
591
  """
592
  Intelligently determine if image should be chunked for better accuracy.
@@ -813,6 +905,14 @@ def process_image(
813
  # Try to parse JSON output
814
  layout_data = json.loads(raw_output)
815
 
 
 
 
 
 
 
 
 
816
  # 🎯 INTELLIGENT CONFIDENCE SCORING
817
  # Count text regions to determine if per-region scoring is feasible
818
  num_text_regions = sum(1 for item in layout_data
 
34
 
35
  # Constants
36
  MIN_PIXELS = 3136
37
+ MAX_PIXELS = 16000000 # Increased for better line detection (was 11289600)
38
  IMAGE_FACTOR = 28
39
 
40
  # Prompts
41
  prompt = """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
42
 
43
+ CRITICAL REQUIREMENT: Detect EACH LINE of text as a SEPARATE element. Do NOT group multiple lines together into a single bbox.
44
+
45
  1. Bbox format: [x1, y1, x2, y2]
46
 
47
  2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].
 
54
 
55
  4. Constraints:
56
  - The output text must be the original text from the image, with no translation.
57
+ - IMPORTANT: Detect every individual line of text separately - do not merge multiple lines into one element.
58
+ - Each text line should have its own bbox and text content.
59
  - All layout elements must be sorted according to human reading order.
60
 
61
  5. Final Output: The entire output must be a single JSON object.
 
591
  return 0.1 # Default to low density
592
 
593
 
594
def split_text_regions_into_lines(
    image: Image.Image,
    layout_data: List[Dict[str, Any]],
    min_line_height: int = 30,
    max_line_height: int = 120
) -> List[Dict[str, Any]]:
    """
    Post-process layout data to split large text regions into individual lines.

    This ensures each line gets its own bounding box for easier verification.

    Args:
        image: Original image. Currently unused — kept so the call signature
            stays stable for callers; reserved for pixel-based line detection.
        layout_data: Layout detection results. Each item is a dict that may
            carry 'bbox' ([x1, y1, x2, y2]), 'category', and 'text' keys.
        min_line_height: Minimum height for a text line (pixels). Currently
            unused — reserved for filtering out degenerate splits.
        max_line_height: Maximum height a single region may have before it is
            treated as multi-line and split.

    Returns:
        Updated layout data. Tall 'Text'/'List-item' regions are replaced by
        per-line entries marked with 'split_from_parent': True; everything
        else passes through unchanged.
    """
    result = []
    # Counts REGIONS that were split (not the number of lines produced),
    # matching the wording of the summary log message below.
    split_count = 0

    for item in layout_data:
        bbox = item.get('bbox', [])
        category = item.get('category', '')
        text_content = item.get('text', '')

        # Only split Text regions (not titles, headers, tables, etc.)
        if len(bbox) != 4 or category not in ['Text', 'List-item']:
            result.append(item)
            continue

        x1, y1, x2, y2 = bbox
        height = y2 - y1

        # If region is tall enough to contain multiple lines, split it
        if height > max_line_height:
            # Estimate number of lines based on typical line height.
            # Arabic handwritten text: ~40-60px per line
            # Arabic typed text: ~30-50px per line
            avg_line_height = 45  # Middle ground
            estimated_lines = max(1, round(height / avg_line_height))

            # Don't split into too many lines (might be a paragraph)
            estimated_lines = min(estimated_lines, 10)

            line_height = height / estimated_lines

            # Split text content by newlines if available
            text_lines = text_content.split('\n') if text_content else []

            # If the text already has exactly one line per estimated band,
            # assign each text line to its own horizontal slice.
            if len(text_lines) == estimated_lines and len(text_lines) > 1:
                for i, line_text in enumerate(text_lines):
                    if not line_text.strip():
                        continue
                    new_item = item.copy()
                    new_y1 = y1 + (i * line_height)
                    new_y2 = y1 + ((i + 1) * line_height)
                    new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
                    new_item['text'] = line_text.strip()
                    new_item['split_from_parent'] = True
                    result.append(new_item)
            else:
                # Split geometrically but keep full text in each slice
                # (the user can verify/edit the text per line in the table).
                for i in range(estimated_lines):
                    new_item = item.copy()
                    new_y1 = y1 + (i * line_height)
                    new_y2 = y1 + ((i + 1) * line_height)
                    new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
                    new_item['text'] = text_content  # User can edit in table
                    new_item['split_from_parent'] = True
                    new_item['line_number'] = i + 1
                    result.append(new_item)
            # One source region was split, regardless of how many lines
            # it produced (the old code counted lines here, making the
            # "Split N large regions" message overstate the region count).
            split_count += 1
        else:
            # Region is already line-sized, keep as is
            result.append(item)

    if split_count > 0:
        # NOTE(review): emoji/arrow were mojibake in the original source
        # ("πŸ“", "β†’" — a Greek-codepage mis-decoding); restored to the
        # intended "📏" / "→".
        print(f"📏 Split {split_count} large regions into individual lines ({len(layout_data)} → {len(result)} regions)")

    return result
680
+
681
+
682
  def should_chunk_image(image: Image.Image) -> Tuple[bool, str]:
683
  """
684
  Intelligently determine if image should be chunked for better accuracy.
 
905
  # Try to parse JSON output
906
  layout_data = json.loads(raw_output)
907
 
908
+ # πŸ“ LINE-LEVEL SPLITTING: Split large text regions into individual lines
909
+ # This ensures each line gets its own bounding box for easier verification
910
+ try:
911
+ layout_data = split_text_regions_into_lines(image, layout_data)
912
+ except Exception as e:
913
+ print(f"⚠️ Warning: Could not split text regions: {e}")
914
+ # Continue with original layout data
915
+
916
  # 🎯 INTELLIGENT CONFIDENCE SCORING
917
  # Count text regions to determine if per-region scoring is feasible
918
  num_text_regions = sum(1 for item in layout_data