Spaces:
Runtime error
Runtime error
Commit
·
5f82e6a
1
Parent(s):
77be916
Add Arabic text detection functionality and update UI for multilingual support
Browse files
app.py
CHANGED
|
@@ -185,6 +185,29 @@ def draw_layout_on_image(image: Image.Image, layout_data: List[Dict]) -> Image.I
|
|
| 185 |
return img_copy
|
| 186 |
|
| 187 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
def layoutjson2md(image: Image.Image, layout_data: List[Dict], text_key: str = 'text') -> str:
|
| 189 |
"""Convert layout JSON to markdown format"""
|
| 190 |
markdown_lines = []
|
|
@@ -494,7 +517,13 @@ def turn_page(direction: str) -> Tuple[Optional[Image.Image], str, str]:
|
|
| 494 |
else:
|
| 495 |
current_result = "Page not processed yet"
|
| 496 |
|
| 497 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 498 |
|
| 499 |
|
| 500 |
def create_gradio_interface():
|
|
@@ -573,9 +602,22 @@ def create_gradio_interface():
|
|
| 573 |
|
| 574 |
# Header
|
| 575 |
gr.HTML("""
|
| 576 |
-
<div class="
|
| 577 |
-
<h1>π
|
| 578 |
-
<p
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 579 |
</div>
|
| 580 |
""")
|
| 581 |
|
|
@@ -735,9 +777,15 @@ def create_gradio_interface():
|
|
| 735 |
first_result = all_results[0]
|
| 736 |
combined_markdown = "\n\n---\n\n".join(all_markdown)
|
| 737 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 738 |
return (
|
| 739 |
first_result['processed_image'],
|
| 740 |
-
|
| 741 |
first_result['raw_output'],
|
| 742 |
first_result['layout_result'],
|
| 743 |
'<div class="model-status status-ready">✅ Processing completed!</div>'
|
|
@@ -753,9 +801,16 @@ def create_gradio_interface():
|
|
| 753 |
pdf_cache["results"] = [result]
|
| 754 |
pdf_cache["is_parsed"] = True
|
| 755 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 756 |
return (
|
| 757 |
result['processed_image'],
|
| 758 |
-
|
| 759 |
result['raw_output'],
|
| 760 |
result['layout_result'],
|
| 761 |
'<div class="model-status status-ready">✅ Processing completed!</div>'
|
|
|
|
| 185 |
return img_copy
|
| 186 |
|
| 187 |
|
| 188 |
+
def is_arabic_text(text: str) -> bool:
    """Return True when most alphabetic characters in *text* are Arabic script.

    Only characters for which ``str.isalpha()`` is true are counted, so
    digits, punctuation, whitespace and markdown symbols do not influence
    the result. The text is considered Arabic when strictly more than 50%
    of its alphabetic characters fall inside an Arabic Unicode block.

    Args:
        text: The text to classify. Empty (or otherwise falsy) input is
            treated as non-Arabic.

    Returns:
        True if more than half of the alphabetic characters are Arabic,
        False otherwise (including when there are no alphabetic characters).
    """
    if not text:
        return False

    # Arabic script blocks. The presentation-form blocks matter for OCR/PDF
    # text, which often carries pre-shaped glyphs (isolated/initial/medial/
    # final forms) instead of the base letters in U+0600-U+06FF.
    arabic_ranges = (
        ('\u0600', '\u06FF'),  # Arabic
        ('\u0750', '\u077F'),  # Arabic Supplement
        ('\u0870', '\u08FF'),  # Arabic Extended-B and Extended-A
        ('\uFB50', '\uFDFF'),  # Arabic Presentation Forms-A
        ('\uFE70', '\uFEFF'),  # Arabic Presentation Forms-B
    )

    arabic_chars = 0
    total_chars = 0

    for char in text:
        if not char.isalpha():
            continue
        total_chars += 1
        if any(low <= char <= high for low, high in arabic_ranges):
            arabic_chars += 1

    # No letters at all (e.g. only digits/punctuation): nothing to classify.
    if total_chars == 0:
        return False

    # Consider text as Arabic if more than 50% of alphabetic characters are Arabic
    return (arabic_chars / total_chars) > 0.5
|
| 209 |
+
|
| 210 |
+
|
| 211 |
def layoutjson2md(image: Image.Image, layout_data: List[Dict], text_key: str = 'text') -> str:
|
| 212 |
"""Convert layout JSON to markdown format"""
|
| 213 |
markdown_lines = []
|
|
|
|
| 517 |
else:
|
| 518 |
current_result = "Page not processed yet"
|
| 519 |
|
| 520 |
+
# Check if the result contains mostly Arabic text and return appropriate update
|
| 521 |
+
if is_arabic_text(current_result):
|
| 522 |
+
result_update = gr.update(value=current_result, rtl=True)
|
| 523 |
+
else:
|
| 524 |
+
result_update = current_result
|
| 525 |
+
|
| 526 |
+
return current_image, page_info, result_update
|
| 527 |
|
| 528 |
|
| 529 |
def create_gradio_interface():
|
|
|
|
| 602 |
|
| 603 |
# Header
|
| 604 |
gr.HTML("""
|
| 605 |
+
<div class="title" style="text-align: center">
|
| 606 |
+
<h1>π Dot-OCR - Multilingual Document Text Extraction</h1>
|
| 607 |
+
<p style="font-size: 1.1em; color: #6b7280; margin-bottom: 0.6em;">
|
| 608 |
+
A state-of-the-art image/pdf-to-markdown vision language model for intelligent document processing
|
| 609 |
+
</p>
|
| 610 |
+
<div style="display: flex; justify-content: center; gap: 20px; margin: 15px 0;">
|
| 611 |
+
<a href="https://huggingface.co/rednote-hilab/dots.ocr" target="_blank" style="text-decoration: none; color: #2563eb; font-weight: 500;">
|
| 612 |
+
π Hugging Face Model
|
| 613 |
+
</a>
|
| 614 |
+
<a href="https://github.com/rednote-hilab/dots.ocr/blob/master/assets/blog.md" target="_blank" style="text-decoration: none; color: #2563eb; font-weight: 500;">
|
| 615 |
+
π Release Blog
|
| 616 |
+
</a>
|
| 617 |
+
<a href="https://github.com/rednote-hilab/dots.ocr" target="_blank" style="text-decoration: none; color: #2563eb; font-weight: 500;">
|
| 618 |
+
💻 GitHub Repository
|
| 619 |
+
</a>
|
| 620 |
+
</div>
|
| 621 |
</div>
|
| 622 |
""")
|
| 623 |
|
|
|
|
| 777 |
first_result = all_results[0]
|
| 778 |
combined_markdown = "\n\n---\n\n".join(all_markdown)
|
| 779 |
|
| 780 |
+
# Check if the combined markdown contains mostly Arabic text
|
| 781 |
+
if is_arabic_text(combined_markdown):
|
| 782 |
+
markdown_update = gr.update(value=combined_markdown, rtl=True)
|
| 783 |
+
else:
|
| 784 |
+
markdown_update = combined_markdown
|
| 785 |
+
|
| 786 |
return (
|
| 787 |
first_result['processed_image'],
|
| 788 |
+
markdown_update,
|
| 789 |
first_result['raw_output'],
|
| 790 |
first_result['layout_result'],
|
| 791 |
'<div class="model-status status-ready">✅ Processing completed!</div>'
|
|
|
|
| 801 |
pdf_cache["results"] = [result]
|
| 802 |
pdf_cache["is_parsed"] = True
|
| 803 |
|
| 804 |
+
# Check if the content contains mostly Arabic text
|
| 805 |
+
content = result['markdown_content'] or "No content extracted"
|
| 806 |
+
if is_arabic_text(content):
|
| 807 |
+
markdown_update = gr.update(value=content, rtl=True)
|
| 808 |
+
else:
|
| 809 |
+
markdown_update = content
|
| 810 |
+
|
| 811 |
return (
|
| 812 |
result['processed_image'],
|
| 813 |
+
markdown_update,
|
| 814 |
result['raw_output'],
|
| 815 |
result['layout_result'],
|
| 816 |
'<div class="model-status status-ready">✅ Processing completed!</div>'
|