Update app.py

app.py CHANGED
@@ -34,12 +34,14 @@ torch.backends.cudnn.benchmark = False
 
 # Constants
 MIN_PIXELS = 3136
-MAX_PIXELS = 11289600
+MAX_PIXELS = 16000000  # Increased for better line detection (was 11289600)
 IMAGE_FACTOR = 28
 
 # Prompts
 prompt = """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
 
+CRITICAL REQUIREMENT: Detect EACH LINE of text as a SEPARATE element. Do NOT group multiple lines together into a single bbox.
+
 1. Bbox format: [x1, y1, x2, y2]
 
 2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].
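Note (illustration, not part of the commit): MIN_PIXELS, MAX_PIXELS, and IMAGE_FACTOR look like a Qwen-VL-style pixel budget, where the page image is rescaled so its total pixel count stays within [MIN_PIXELS, MAX_PIXELS]. That reading is an assumption, since the resize helper itself is not shown in this diff. Under that assumption, a rough sketch of why raising the cap helps line-level detection:

import math

def downscale_factor(width: int, height: int, max_pixels: int) -> float:
    # Assumed behaviour: the page is scaled down just enough to fit under max_pixels.
    return min(1.0, math.sqrt(max_pixels / (width * height)))

# A ~400 DPI A4 scan is roughly 3307 x 4677 px (~15.5 megapixels).
print(downscale_factor(3307, 4677, 11289600))  # ~0.85 -> thin text lines lose detail under the old cap
print(downscale_factor(3307, 4677, 16000000))  # 1.0  -> the scan keeps full resolution under the new cap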
@@ -52,6 +54,8 @@ prompt = """Please output the layout information from the PDF image, including e
 
 4. Constraints:
 - The output text must be the original text from the image, with no translation.
+- IMPORTANT: Detect every individual line of text separately - do not merge multiple lines into one element.
+- Each text line should have its own bbox and text content.
 - All layout elements must be sorted according to human reading order.
 
 5. Final Output: The entire output must be a single JSON object.
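Illustration (not part of the commit): with these constraints, the prompt asks for one element per text line. The sample below is an invented guess at the concrete output, but the keys match what the post-processing added later in this commit reads ('bbox', 'category', 'text'), and the top-level value is a list, as the List[Dict[str, Any]] signature of split_text_regions_into_lines expects:

[
  {"bbox": [120, 80, 980, 130], "category": "Section-header", "text": "1. Introduction"},
  {"bbox": [120, 150, 980, 195], "category": "Text", "text": "First line of the paragraph, as its own element"},
  {"bbox": [120, 200, 980, 245], "category": "Text", "text": "Second line, also as its own element"}
]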
@@ -587,6 +591,94 @@ def estimate_text_density(image: Image.Image) -> float:
         return 0.1  # Default to low density
 
 
+def split_text_regions_into_lines(
+    image: Image.Image,
+    layout_data: List[Dict[str, Any]],
+    min_line_height: int = 30,
+    max_line_height: int = 120
+) -> List[Dict[str, Any]]:
+    """
+    Post-process layout data to split large text regions into individual lines.
+
+    This ensures each line gets its own bounding box for easier verification.
+
+    Args:
+        image: Original image
+        layout_data: Layout detection results
+        min_line_height: Minimum height for a text line (pixels)
+        max_line_height: Maximum height for a single line before splitting
+
+    Returns:
+        Updated layout data with lines split
+    """
+    result = []
+    split_count = 0
+
+    for item in layout_data:
+        bbox = item.get('bbox', [])
+        category = item.get('category', '')
+        text_content = item.get('text', '')
+
+        # Only split Text regions (not titles, headers, tables, etc.)
+        if len(bbox) != 4 or category not in ['Text', 'List-item']:
+            result.append(item)
+            continue
+
+        x1, y1, x2, y2 = bbox
+        height = y2 - y1
+
+        # If region is tall enough to contain multiple lines, split it
+        if height > max_line_height:
+            # Estimate number of lines based on typical line height
+            # Arabic handwritten text: ~40-60px per line
+            # Arabic typed text: ~30-50px per line
+            avg_line_height = 45  # Middle ground
+            estimated_lines = max(1, round(height / avg_line_height))
+
+            # Don't split into too many lines (might be a paragraph)
+            estimated_lines = min(estimated_lines, 10)
+
+            line_height = height / estimated_lines
+
+            # Split text content by newlines if available
+            text_lines = text_content.split('\n') if text_content else []
+
+            # If we have the same number of text lines as estimated, use them
+            if len(text_lines) == estimated_lines and len(text_lines) > 1:
+                for i, line_text in enumerate(text_lines):
+                    if not line_text.strip():
+                        continue
+                    new_item = item.copy()
+                    new_y1 = y1 + (i * line_height)
+                    new_y2 = y1 + ((i + 1) * line_height)
+                    new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
+                    new_item['text'] = line_text.strip()
+                    new_item['split_from_parent'] = True
+                    result.append(new_item)
+                    split_count += 1
+            else:
+                # Split geometrically but keep full text in each (user can verify)
+                for i in range(estimated_lines):
+                    new_item = item.copy()
+                    new_y1 = y1 + (i * line_height)
+                    new_y2 = y1 + ((i + 1) * line_height)
+                    new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
+                    # Keep original text (cropping will happen visually)
+                    new_item['text'] = text_content  # User can edit in table
+                    new_item['split_from_parent'] = True
+                    new_item['line_number'] = i + 1
+                    result.append(new_item)
+                    split_count += 1
+        else:
+            # Region is already line-sized, keep as is
+            result.append(item)
+
+    if split_count > 0:
+        print(f"📏 Split {split_count} large regions into individual lines ({len(layout_data)} → {len(result)} regions)")
+
+    return result
+
+
 def should_chunk_image(image: Image.Image) -> Tuple[bool, str]:
     """
     Intelligently determine if image should be chunked for better accuracy.
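Illustration (not part of the commit): a minimal usage sketch of the new function on a single tall 'Text' region, assuming the function above is in scope and using invented sample values:

from PIL import Image

# One hypothetical region from the layout model: 135 px tall with three newline-separated lines.
layout_data = [
    {"bbox": [100, 200, 900, 335], "category": "Text", "text": "line one\nline two\nline three"}
]

page = Image.new("RGB", (1000, 1400), "white")  # stand-in for the real page image
for item in split_text_regions_into_lines(page, layout_data):
    print(item["bbox"], item["text"])

# height = 135 > max_line_height (120); estimated_lines = round(135 / 45) = 3, which matches the
# three newline-separated text lines, so each line gets its own 45 px bbox:
# [100, 200, 900, 245] line one
# [100, 245, 900, 290] line two
# [100, 290, 900, 335] line three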
@@ -813,6 +905,14 @@ def process_image(
         # Try to parse JSON output
         layout_data = json.loads(raw_output)
 
+        # 📏 LINE-LEVEL SPLITTING: Split large text regions into individual lines
+        # This ensures each line gets its own bounding box for easier verification
+        try:
+            layout_data = split_text_regions_into_lines(image, layout_data)
+        except Exception as e:
+            print(f"⚠️ Warning: Could not split text regions: {e}")
+            # Continue with original layout data
+
         # 🎯 INTELLIGENT CONFIDENCE SCORING
         # Count text regions to determine if per-region scoring is feasible
         num_text_regions = sum(1 for item in layout_data