Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -16,6 +16,9 @@ from PIL import Image, ImageDraw, ImageFont
|
|
| 16 |
from qwen_vl_utils import process_vision_info
|
| 17 |
from transformers import AutoModelForCausalLM, AutoProcessor
|
| 18 |
|
|
|
|
|
|
|
|
|
|
| 19 |
# Constants
|
| 20 |
MIN_PIXELS = 3136
|
| 21 |
MAX_PIXELS = 11289600
|
|
@@ -378,7 +381,7 @@ pdf_cache = {
|
|
| 378 |
"is_parsed": False,
|
| 379 |
"results": []
|
| 380 |
}
|
| 381 |
-
@spaces.GPU(
|
| 382 |
def inference(image: Image.Image, prompt: str, max_new_tokens: int = 24000) -> str:
|
| 383 |
"""Run inference on an image with the given prompt"""
|
| 384 |
try:
|
|
@@ -451,7 +454,7 @@ def inference(image: Image.Image, prompt: str, max_new_tokens: int = 24000) -> s
|
|
| 451 |
return f"Error during inference: {str(e)}"
|
| 452 |
|
| 453 |
|
| 454 |
-
@spaces.GPU(
|
| 455 |
def _generate_text_and_confidence_for_crop(
|
| 456 |
image: Image.Image,
|
| 457 |
max_new_tokens: int = 128,
|
|
@@ -604,9 +607,51 @@ def process_image(
|
|
| 604 |
print(f"Error generating markdown: {e}")
|
| 605 |
result['markdown_content'] = raw_output
|
| 606 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 607 |
except json.JSONDecodeError:
|
| 608 |
print("Failed to parse JSON output, using raw output")
|
| 609 |
result['markdown_content'] = raw_output
|
|
|
|
|
|
|
| 610 |
|
| 611 |
return result
|
| 612 |
|
|
@@ -813,6 +858,43 @@ def create_gradio_interface():
|
|
| 813 |
color: #0c5460;
|
| 814 |
border: 1px solid #b8daff;
|
| 815 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 816 |
"""
|
| 817 |
|
| 818 |
with gr.Blocks(theme=gr.themes.Soft(), css=css, title="Arabic OCR - Document Text Extraction") as demo:
|
|
@@ -900,6 +982,30 @@ def create_gradio_interface():
|
|
| 900 |
interactive=False,
|
| 901 |
height=500
|
| 902 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 903 |
# Editable OCR Results Table
|
| 904 |
with gr.Tab("π OCR Results Table"):
|
| 905 |
gr.Markdown("### Editable OCR Results\nReview and edit the extracted text for each detected region")
|
|
@@ -981,7 +1087,7 @@ def create_gradio_interface():
|
|
| 981 |
return table_data
|
| 982 |
|
| 983 |
# Event handlers
|
| 984 |
-
@spaces.GPU(
|
| 985 |
def process_document(file_path, max_tokens, min_pix, max_pix):
|
| 986 |
"""Process the uploaded document"""
|
| 987 |
global pdf_cache
|
|
@@ -1038,8 +1144,23 @@ def create_gradio_interface():
|
|
| 1038 |
first_result['layout_result']
|
| 1039 |
)
|
| 1040 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1041 |
return (
|
| 1042 |
first_result['processed_image'],
|
|
|
|
|
|
|
|
|
|
| 1043 |
ocr_table_data,
|
| 1044 |
markdown_update,
|
| 1045 |
first_result['layout_result']
|
|
@@ -1071,8 +1192,23 @@ def create_gradio_interface():
|
|
| 1071 |
result['layout_result']
|
| 1072 |
)
|
| 1073 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1074 |
return (
|
| 1075 |
result['processed_image'],
|
|
|
|
|
|
|
|
|
|
| 1076 |
ocr_table_data,
|
| 1077 |
markdown_update,
|
| 1078 |
result['layout_result']
|
|
@@ -1082,7 +1218,7 @@ def create_gradio_interface():
|
|
| 1082 |
error_msg = f"Error processing document: {str(e)}"
|
| 1083 |
print(error_msg)
|
| 1084 |
traceback.print_exc()
|
| 1085 |
-
return None, [], error_msg, None
|
| 1086 |
|
| 1087 |
def handle_file_upload(file_path):
|
| 1088 |
"""Handle file upload and show preview"""
|
|
@@ -1111,6 +1247,9 @@ def create_gradio_interface():
|
|
| 1111 |
None, # image_preview
|
| 1112 |
'<div class="page-info">No file loaded</div>', # page_info
|
| 1113 |
None, # processed_image
|
|
|
|
|
|
|
|
|
|
| 1114 |
[], # ocr_table
|
| 1115 |
"Click 'Process Document' to see extracted content...", # markdown_output
|
| 1116 |
None, # json_output
|
|
@@ -1137,7 +1276,7 @@ def create_gradio_interface():
|
|
| 1137 |
process_btn.click(
|
| 1138 |
process_document,
|
| 1139 |
inputs=[file_input, max_new_tokens, min_pixels, max_pixels],
|
| 1140 |
-
outputs=[processed_image, ocr_table, markdown_output, json_output]
|
| 1141 |
)
|
| 1142 |
|
| 1143 |
# The outputs list for the clear button is now correct
|
|
@@ -1145,6 +1284,7 @@ def create_gradio_interface():
|
|
| 1145 |
clear_all,
|
| 1146 |
outputs=[
|
| 1147 |
file_input, image_preview, page_info, processed_image,
|
|
|
|
| 1148 |
ocr_table, markdown_output, json_output
|
| 1149 |
]
|
| 1150 |
)
|
|
|
|
| 16 |
from qwen_vl_utils import process_vision_info
|
| 17 |
from transformers import AutoModelForCausalLM, AutoProcessor
|
| 18 |
|
| 19 |
+
# Import Arabic text correction module
|
| 20 |
+
from arabic_corrector import get_corrector
|
| 21 |
+
|
| 22 |
# Constants
|
| 23 |
MIN_PIXELS = 3136
|
| 24 |
MAX_PIXELS = 11289600
|
|
|
|
| 381 |
"is_parsed": False,
|
| 382 |
"results": []
|
| 383 |
}
|
| 384 |
+
@spaces.GPU()
|
| 385 |
def inference(image: Image.Image, prompt: str, max_new_tokens: int = 24000) -> str:
|
| 386 |
"""Run inference on an image with the given prompt"""
|
| 387 |
try:
|
|
|
|
| 454 |
return f"Error during inference: {str(e)}"
|
| 455 |
|
| 456 |
|
| 457 |
+
@spaces.GPU()
|
| 458 |
def _generate_text_and_confidence_for_crop(
|
| 459 |
image: Image.Image,
|
| 460 |
max_new_tokens: int = 128,
|
|
|
|
| 607 |
print(f"Error generating markdown: {e}")
|
| 608 |
result['markdown_content'] = raw_output
|
| 609 |
|
| 610 |
+
# ✨ ARABIC TEXT CORRECTION: Apply intelligent correction to each text region
|
| 611 |
+
try:
|
| 612 |
+
print("🔧 Applying Arabic text correction...")
|
| 613 |
+
corrector = get_corrector()
|
| 614 |
+
|
| 615 |
+
for idx, item in enumerate(layout_data):
|
| 616 |
+
text_content = item.get('text', '')
|
| 617 |
+
category = item.get('category', '')
|
| 618 |
+
|
| 619 |
+
# Only correct text regions (skip pictures, formulas, etc.)
|
| 620 |
+
if not text_content or category in ['Picture', 'Formula', 'Table']:
|
| 621 |
+
continue
|
| 622 |
+
|
| 623 |
+
# Apply correction
|
| 624 |
+
correction_result = corrector.correct_text(text_content)
|
| 625 |
+
|
| 626 |
+
# Store both original and corrected versions
|
| 627 |
+
item['text_original'] = text_content
|
| 628 |
+
item['text_corrected'] = correction_result['corrected']
|
| 629 |
+
item['correction_confidence'] = correction_result['overall_confidence']
|
| 630 |
+
item['corrections_made'] = correction_result['corrections_made']
|
| 631 |
+
item['word_corrections'] = correction_result['words']
|
| 632 |
+
|
| 633 |
+
# Update the text field to use corrected version
|
| 634 |
+
item['text'] = correction_result['corrected']
|
| 635 |
+
|
| 636 |
+
# Regenerate markdown with corrected text
|
| 637 |
+
corrected_markdown = layoutjson2md(image, layout_data, text_key='text')
|
| 638 |
+
result['markdown_content_corrected'] = corrected_markdown
|
| 639 |
+
result['markdown_content_original'] = markdown_content
|
| 640 |
+
|
| 641 |
+
print(f"✅ Correction complete")
|
| 642 |
+
|
| 643 |
+
except Exception as e:
|
| 644 |
+
print(f"⚠️ Error during Arabic correction: {e}")
|
| 645 |
+
traceback.print_exc()
|
| 646 |
+
# Fallback: keep original text
|
| 647 |
+
result['markdown_content_corrected'] = markdown_content
|
| 648 |
+
result['markdown_content_original'] = markdown_content
|
| 649 |
+
|
| 650 |
except json.JSONDecodeError:
|
| 651 |
print("Failed to parse JSON output, using raw output")
|
| 652 |
result['markdown_content'] = raw_output
|
| 653 |
+
result['markdown_content_original'] = raw_output
|
| 654 |
+
result['markdown_content_corrected'] = raw_output
|
| 655 |
|
| 656 |
return result
|
| 657 |
|
|
|
|
| 858 |
color: #0c5460;
|
| 859 |
border: 1px solid #b8daff;
|
| 860 |
}
|
| 861 |
+
|
| 862 |
+
/* Arabic Correction Styling */
|
| 863 |
+
.original-text-box {
|
| 864 |
+
background: #fff5f5 !important;
|
| 865 |
+
border: 2px solid #fc8181 !important;
|
| 866 |
+
border-radius: 8px;
|
| 867 |
+
padding: 15px;
|
| 868 |
+
min-height: 300px;
|
| 869 |
+
direction: rtl;
|
| 870 |
+
}
|
| 871 |
+
|
| 872 |
+
.corrected-text-box {
|
| 873 |
+
background: #f0fff4 !important;
|
| 874 |
+
border: 2px solid #68d391 !important;
|
| 875 |
+
border-radius: 8px;
|
| 876 |
+
padding: 15px;
|
| 877 |
+
min-height: 300px;
|
| 878 |
+
direction: rtl;
|
| 879 |
+
}
|
| 880 |
+
|
| 881 |
+
.correction-high {
|
| 882 |
+
background: #c6f6d5;
|
| 883 |
+
padding: 2px 4px;
|
| 884 |
+
border-radius: 3px;
|
| 885 |
+
}
|
| 886 |
+
|
| 887 |
+
.correction-medium {
|
| 888 |
+
background: #fef5e7;
|
| 889 |
+
padding: 2px 4px;
|
| 890 |
+
border-radius: 3px;
|
| 891 |
+
}
|
| 892 |
+
|
| 893 |
+
.correction-low {
|
| 894 |
+
background: #ffe0e0;
|
| 895 |
+
padding: 2px 4px;
|
| 896 |
+
border-radius: 3px;
|
| 897 |
+
}
|
| 898 |
"""
|
| 899 |
|
| 900 |
with gr.Blocks(theme=gr.themes.Soft(), css=css, title="Arabic OCR - Document Text Extraction") as demo:
|
|
|
|
| 982 |
interactive=False,
|
| 983 |
height=500
|
| 984 |
)
|
| 985 |
+
# ✨ NEW: Arabic Text Correction Comparison Tab
|
| 986 |
+
with gr.Tab("✨ Corrected Text (AI)"):
|
| 987 |
+
gr.Markdown("""
|
| 988 |
+
### 🔧 AI-Powered Arabic Text Correction
|
| 989 |
+
This tab shows **Original OCR** vs **AI-Corrected** text side-by-side.
|
| 990 |
+
Corrections use dictionary matching, context analysis, and linguistic intelligence.
|
| 991 |
+
""")
|
| 992 |
+
|
| 993 |
+
with gr.Row():
|
| 994 |
+
with gr.Column():
|
| 995 |
+
gr.Markdown("#### π Original OCR Output")
|
| 996 |
+
original_text_output = gr.Markdown(
|
| 997 |
+
value="Original text will appear here...",
|
| 998 |
+
elem_classes=["original-text-box"]
|
| 999 |
+
)
|
| 1000 |
+
with gr.Column():
|
| 1001 |
+
gr.Markdown("#### ✅ Corrected Text")
|
| 1002 |
+
corrected_text_output = gr.Markdown(
|
| 1003 |
+
value="Corrected text will appear here...",
|
| 1004 |
+
elem_classes=["corrected-text-box"]
|
| 1005 |
+
)
|
| 1006 |
+
|
| 1007 |
+
correction_stats = gr.Markdown(value="")
|
| 1008 |
+
|
| 1009 |
# Editable OCR Results Table
|
| 1010 |
with gr.Tab("π OCR Results Table"):
|
| 1011 |
gr.Markdown("### Editable OCR Results\nReview and edit the extracted text for each detected region")
|
|
|
|
| 1087 |
return table_data
|
| 1088 |
|
| 1089 |
# Event handlers
|
| 1090 |
+
@spaces.GPU()
|
| 1091 |
def process_document(file_path, max_tokens, min_pix, max_pix):
|
| 1092 |
"""Process the uploaded document"""
|
| 1093 |
global pdf_cache
|
|
|
|
| 1144 |
first_result['layout_result']
|
| 1145 |
)
|
| 1146 |
|
| 1147 |
+
# Prepare correction comparison
|
| 1148 |
+
original_text = first_result.get('markdown_content_original', first_result.get('markdown_content', ''))
|
| 1149 |
+
corrected_text = first_result.get('markdown_content_corrected', first_result.get('markdown_content', ''))
|
| 1150 |
+
|
| 1151 |
+
# Calculate correction statistics
|
| 1152 |
+
total_corrections = 0
|
| 1153 |
+
if first_result.get('layout_result'):
|
| 1154 |
+
for item in first_result['layout_result']:
|
| 1155 |
+
total_corrections += item.get('corrections_made', 0)
|
| 1156 |
+
|
| 1157 |
+
stats_text = f"### π Correction Statistics\n- **Corrections Made**: {total_corrections}\n- **Method**: Dictionary + Context Analysis"
|
| 1158 |
+
|
| 1159 |
return (
|
| 1160 |
first_result['processed_image'],
|
| 1161 |
+
original_text if is_arabic_text(original_text) else gr.update(value=original_text, rtl=False),
|
| 1162 |
+
corrected_text if is_arabic_text(corrected_text) else gr.update(value=corrected_text, rtl=False),
|
| 1163 |
+
stats_text,
|
| 1164 |
ocr_table_data,
|
| 1165 |
markdown_update,
|
| 1166 |
first_result['layout_result']
|
|
|
|
| 1192 |
result['layout_result']
|
| 1193 |
)
|
| 1194 |
|
| 1195 |
+
# Prepare correction comparison
|
| 1196 |
+
original_text = result.get('markdown_content_original', result.get('markdown_content', ''))
|
| 1197 |
+
corrected_text = result.get('markdown_content_corrected', result.get('markdown_content', ''))
|
| 1198 |
+
|
| 1199 |
+
# Calculate correction statistics
|
| 1200 |
+
total_corrections = 0
|
| 1201 |
+
if result.get('layout_result'):
|
| 1202 |
+
for item in result['layout_result']:
|
| 1203 |
+
total_corrections += item.get('corrections_made', 0)
|
| 1204 |
+
|
| 1205 |
+
stats_text = f"### π Correction Statistics\n- **Corrections Made**: {total_corrections}\n- **Method**: Dictionary + Context Analysis"
|
| 1206 |
+
|
| 1207 |
return (
|
| 1208 |
result['processed_image'],
|
| 1209 |
+
original_text if is_arabic_text(original_text) else gr.update(value=original_text, rtl=False),
|
| 1210 |
+
corrected_text if is_arabic_text(corrected_text) else gr.update(value=corrected_text, rtl=False),
|
| 1211 |
+
stats_text,
|
| 1212 |
ocr_table_data,
|
| 1213 |
markdown_update,
|
| 1214 |
result['layout_result']
|
|
|
|
| 1218 |
error_msg = f"Error processing document: {str(e)}"
|
| 1219 |
print(error_msg)
|
| 1220 |
traceback.print_exc()
|
| 1221 |
+
return None, "Error", "Error", "Error occurred", [], error_msg, None
|
| 1222 |
|
| 1223 |
def handle_file_upload(file_path):
|
| 1224 |
"""Handle file upload and show preview"""
|
|
|
|
| 1247 |
None, # image_preview
|
| 1248 |
'<div class="page-info">No file loaded</div>', # page_info
|
| 1249 |
None, # processed_image
|
| 1250 |
+
"Original text will appear here...", # original_text_output
|
| 1251 |
+
"Corrected text will appear here...", # corrected_text_output
|
| 1252 |
+
"", # correction_stats
|
| 1253 |
[], # ocr_table
|
| 1254 |
"Click 'Process Document' to see extracted content...", # markdown_output
|
| 1255 |
None, # json_output
|
|
|
|
| 1276 |
process_btn.click(
|
| 1277 |
process_document,
|
| 1278 |
inputs=[file_input, max_new_tokens, min_pixels, max_pixels],
|
| 1279 |
+
outputs=[processed_image, original_text_output, corrected_text_output, correction_stats, ocr_table, markdown_output, json_output]
|
| 1280 |
)
|
| 1281 |
|
| 1282 |
# The outputs list for the clear button is now correct
|
|
|
|
| 1284 |
clear_all,
|
| 1285 |
outputs=[
|
| 1286 |
file_input, image_preview, page_info, processed_image,
|
| 1287 |
+
original_text_output, corrected_text_output, correction_stats,
|
| 1288 |
ocr_table, markdown_output, json_output
|
| 1289 |
]
|
| 1290 |
)
|