Spaces:
Runtime error
Runtime error
Commit
·
008db80
1
Parent(s):
c4e5804
Refactor inference and process_image functions: streamline parameters and enhance layout processing
Browse files
app.py
CHANGED
|
@@ -284,6 +284,7 @@ processing_results = {
|
|
| 284 |
'markdown_content': None,
|
| 285 |
'raw_output': None,
|
| 286 |
}
|
|
|
|
| 287 |
def inference(image: Image.Image, prompt: str, max_new_tokens: int = 24000) -> str:
|
| 288 |
"""Run inference on an image with the given prompt"""
|
| 289 |
try:
|
|
@@ -356,7 +357,6 @@ def inference(image: Image.Image, prompt: str, max_new_tokens: int = 24000) -> s
|
|
| 356 |
|
| 357 |
def process_image(
|
| 358 |
image: Image.Image,
|
| 359 |
-
prompt_mode: str,
|
| 360 |
min_pixels: Optional[int] = None,
|
| 361 |
max_pixels: Optional[int] = None
|
| 362 |
) -> Dict[str, Any]:
|
|
@@ -366,48 +366,42 @@ def process_image(
|
|
| 366 |
if min_pixels is not None or max_pixels is not None:
|
| 367 |
image = fetch_image(image, min_pixels=min_pixels, max_pixels=max_pixels)
|
| 368 |
|
| 369 |
-
# Run inference
|
| 370 |
raw_output = inference(image, prompt)
|
| 371 |
|
| 372 |
# Process results based on prompt mode
|
| 373 |
result = {
|
| 374 |
'original_image': image,
|
| 375 |
'raw_output': raw_output,
|
| 376 |
-
'prompt_mode': prompt_mode,
|
| 377 |
'processed_image': image,
|
| 378 |
'layout_result': None,
|
| 379 |
'markdown_content': None
|
| 380 |
}
|
| 381 |
|
| 382 |
-
#
|
| 383 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 384 |
try:
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
# Generate markdown if text is available
|
| 398 |
-
if prompt_mode == 'prompt_layout_all_en':
|
| 399 |
-
try:
|
| 400 |
-
markdown_content = layoutjson2md(image, layout_data, text_key='text')
|
| 401 |
-
result['markdown_content'] = markdown_content
|
| 402 |
-
except Exception as e:
|
| 403 |
-
print(f"Error generating markdown: {e}")
|
| 404 |
-
result['markdown_content'] = raw_output
|
| 405 |
-
|
| 406 |
-
except json.JSONDecodeError:
|
| 407 |
-
print("Failed to parse JSON output, using raw output")
|
| 408 |
result['markdown_content'] = raw_output
|
| 409 |
-
|
| 410 |
-
|
|
|
|
| 411 |
result['markdown_content'] = raw_output
|
| 412 |
|
| 413 |
return result
|
|
@@ -418,7 +412,6 @@ def process_image(
|
|
| 418 |
return {
|
| 419 |
'original_image': image,
|
| 420 |
'raw_output': f"Error processing image: {str(e)}",
|
| 421 |
-
'prompt_mode': prompt_mode,
|
| 422 |
'processed_image': image,
|
| 423 |
'layout_result': None,
|
| 424 |
'markdown_content': f"Error processing image: {str(e)}"
|
|
@@ -707,7 +700,7 @@ def create_gradio_interface():
|
|
| 707 |
except Exception as e:
|
| 708 |
return f'<div class="model-status status-error">❌ Error: {str(e)}</div>'
|
| 709 |
|
| 710 |
-
def process_document(file_path,
|
| 711 |
"""Process the uploaded document"""
|
| 712 |
global pdf_cache
|
| 713 |
|
|
@@ -750,7 +743,6 @@ def create_gradio_interface():
|
|
| 750 |
for i, img in enumerate(pdf_cache["images"]):
|
| 751 |
result = process_image(
|
| 752 |
img,
|
| 753 |
-
prompt_mode_val,
|
| 754 |
min_pixels=int(min_pix) if min_pix else None,
|
| 755 |
max_pixels=int(max_pix) if max_pix else None
|
| 756 |
)
|
|
@@ -776,7 +768,6 @@ def create_gradio_interface():
|
|
| 776 |
# Process single image
|
| 777 |
result = process_image(
|
| 778 |
image,
|
| 779 |
-
prompt_mode_val,
|
| 780 |
min_pixels=int(min_pix) if min_pix else None,
|
| 781 |
max_pixels=int(max_pix) if max_pix else None
|
| 782 |
)
|
|
@@ -804,10 +795,6 @@ def create_gradio_interface():
|
|
| 804 |
f'<div class="model-status status-error">❌ {error_msg}</div>'
|
| 805 |
)
|
| 806 |
|
| 807 |
-
def update_prompt_display(mode):
|
| 808 |
-
"""Update the prompt display when mode changes"""
|
| 809 |
-
return prompt
|
| 810 |
-
|
| 811 |
def handle_file_upload(file_path):
|
| 812 |
"""Handle file upload and show preview"""
|
| 813 |
if not file_path:
|
|
@@ -871,15 +858,9 @@ def create_gradio_interface():
|
|
| 871 |
outputs=[image_preview, page_info, markdown_output]
|
| 872 |
)
|
| 873 |
|
| 874 |
-
prompt_mode.change(
|
| 875 |
-
update_prompt_display,
|
| 876 |
-
inputs=[prompt_mode],
|
| 877 |
-
outputs=[prompt_display]
|
| 878 |
-
)
|
| 879 |
-
|
| 880 |
process_btn.click(
|
| 881 |
process_document,
|
| 882 |
-
inputs=[file_input,
|
| 883 |
outputs=[processed_image, markdown_output, raw_output, json_output, model_status]
|
| 884 |
)
|
| 885 |
|
|
|
|
| 284 |
'markdown_content': None,
|
| 285 |
'raw_output': None,
|
| 286 |
}
|
| 287 |
+
@spaces.gpu
|
| 288 |
def inference(image: Image.Image, prompt: str, max_new_tokens: int = 24000) -> str:
|
| 289 |
"""Run inference on an image with the given prompt"""
|
| 290 |
try:
|
|
|
|
| 357 |
|
| 358 |
def process_image(
|
| 359 |
image: Image.Image,
|
|
|
|
| 360 |
min_pixels: Optional[int] = None,
|
| 361 |
max_pixels: Optional[int] = None
|
| 362 |
) -> Dict[str, Any]:
|
|
|
|
| 366 |
if min_pixels is not None or max_pixels is not None:
|
| 367 |
image = fetch_image(image, min_pixels=min_pixels, max_pixels=max_pixels)
|
| 368 |
|
| 369 |
+
# Run inference with the default prompt
|
| 370 |
raw_output = inference(image, prompt)
|
| 371 |
|
| 372 |
# Process results based on prompt mode
|
| 373 |
result = {
|
| 374 |
'original_image': image,
|
| 375 |
'raw_output': raw_output,
|
|
|
|
| 376 |
'processed_image': image,
|
| 377 |
'layout_result': None,
|
| 378 |
'markdown_content': None
|
| 379 |
}
|
| 380 |
|
| 381 |
+
# Try to parse JSON and create visualizations (since we're doing layout analysis)
|
| 382 |
+
try:
|
| 383 |
+
# Try to parse JSON output
|
| 384 |
+
layout_data = json.loads(raw_output)
|
| 385 |
+
result['layout_result'] = layout_data
|
| 386 |
+
|
| 387 |
+
# Create visualization with bounding boxes
|
| 388 |
try:
|
| 389 |
+
processed_image = draw_layout_on_image(image, layout_data)
|
| 390 |
+
result['processed_image'] = processed_image
|
| 391 |
+
except Exception as e:
|
| 392 |
+
print(f"Error drawing layout: {e}")
|
| 393 |
+
result['processed_image'] = image
|
| 394 |
+
|
| 395 |
+
# Generate markdown from layout data
|
| 396 |
+
try:
|
| 397 |
+
markdown_content = layoutjson2md(image, layout_data, text_key='text')
|
| 398 |
+
result['markdown_content'] = markdown_content
|
| 399 |
+
except Exception as e:
|
| 400 |
+
print(f"Error generating markdown: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 401 |
result['markdown_content'] = raw_output
|
| 402 |
+
|
| 403 |
+
except json.JSONDecodeError:
|
| 404 |
+
print("Failed to parse JSON output, using raw output")
|
| 405 |
result['markdown_content'] = raw_output
|
| 406 |
|
| 407 |
return result
|
|
|
|
| 412 |
return {
|
| 413 |
'original_image': image,
|
| 414 |
'raw_output': f"Error processing image: {str(e)}",
|
|
|
|
| 415 |
'processed_image': image,
|
| 416 |
'layout_result': None,
|
| 417 |
'markdown_content': f"Error processing image: {str(e)}"
|
|
|
|
| 700 |
except Exception as e:
|
| 701 |
return f'<div class="model-status status-error">❌ Error: {str(e)}</div>'
|
| 702 |
|
| 703 |
+
def process_document(file_path, max_tokens, min_pix, max_pix):
|
| 704 |
"""Process the uploaded document"""
|
| 705 |
global pdf_cache
|
| 706 |
|
|
|
|
| 743 |
for i, img in enumerate(pdf_cache["images"]):
|
| 744 |
result = process_image(
|
| 745 |
img,
|
|
|
|
| 746 |
min_pixels=int(min_pix) if min_pix else None,
|
| 747 |
max_pixels=int(max_pix) if max_pix else None
|
| 748 |
)
|
|
|
|
| 768 |
# Process single image
|
| 769 |
result = process_image(
|
| 770 |
image,
|
|
|
|
| 771 |
min_pixels=int(min_pix) if min_pix else None,
|
| 772 |
max_pixels=int(max_pix) if max_pix else None
|
| 773 |
)
|
|
|
|
| 795 |
f'<div class="model-status status-error">❌ {error_msg}</div>'
|
| 796 |
)
|
| 797 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 798 |
def handle_file_upload(file_path):
|
| 799 |
"""Handle file upload and show preview"""
|
| 800 |
if not file_path:
|
|
|
|
| 858 |
outputs=[image_preview, page_info, markdown_output]
|
| 859 |
)
|
| 860 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 861 |
process_btn.click(
|
| 862 |
process_document,
|
| 863 |
+
inputs=[file_input, max_new_tokens, min_pixels, max_pixels],
|
| 864 |
outputs=[processed_image, markdown_output, raw_output, json_output, model_status]
|
| 865 |
)
|
| 866 |
|