import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
import spaces
import os
import tempfile
from PIL import Image
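
# `spaces` provides the @spaces.GPU decorator, which on Hugging Face ZeroGPU
# Spaces allocates a GPU only for the duration of the decorated call.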
# Load model and tokenizer
model_name = "deepseek-ai/DeepSeek-OCR"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(
model_name,
_attn_implementation="flash_attention_2",
trust_remote_code=True,
use_safetensors=True,
)
model = model.eval()
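
# Note: flash_attention_2 requires the flash-attn package and a compatible GPU.
# If it is unavailable, a plausible fallback (an assumption, not part of the
# original demo) is to load the model without the attention override:
#
#   model = AutoModel.from_pretrained(
#       model_name, trust_remote_code=True, use_safetensors=True
#   )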
@spaces.GPU
def process_image(image, model_size, task_type, is_eval_mode):
"""
Process image with DeepSeek-OCR and return multiple output formats.
Args:
image: PIL Image or file path
model_size: Model size configuration
task_type: OCR task type
Returns:
A tuple containing:
- Path to the image with bounding boxes.
- The content of the markdown result file.
- The plain text OCR result.
"""
if image is None:
return None, "Please upload an image first.", "Please upload an image first."
model_gpu = model.cuda().to(torch.bfloat16)
# Create temporary directory for output
with tempfile.TemporaryDirectory() as output_path:
# Set prompt based on task type
        if task_type == "Convert to Markdown":
            prompt = "<image>\n<|grounding|>Convert the document to markdown. "
        else:
            # "Free OCR" and any unrecognized task type fall back to plain OCR.
            prompt = "<image>\nFree OCR. "
        # Save the uploaded image temporarily; convert to RGB first so images
        # with an alpha channel (e.g. RGBA PNGs) can be saved as JPEG.
        temp_image_path = os.path.join(output_path, "temp_image.jpg")
        image.convert("RGB").save(temp_image_path)
# Configure model size parameters
size_configs = {
"Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
"Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
"Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
"Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
"Gundam (Recommended)": {
"base_size": 1024,
"image_size": 640,
"crop_mode": True,
},
}
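        # "Gundam" pairs a 1024 global base size with 640px crops (crop_mode=True);
        # per the notes in the UI below, it is the recommended setting for documents.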
config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])
# Run inference
plain_text_result = model_gpu.infer(
tokenizer,
prompt=prompt,
image_file=temp_image_path,
output_path=output_path,
base_size=config["base_size"],
image_size=config["image_size"],
crop_mode=config["crop_mode"],
save_results=True, # Ensure results are saved to disk
test_compress=True,
eval_mode=is_eval_mode,
)
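
        # Per the Evaluation Mode checkbox below, eval_mode=True makes infer()
        # return only plain text; otherwise the annotated image and markdown
        # files are written to output_path and read back below.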
# Define paths for the generated files
image_result_path = os.path.join(output_path, "result_with_boxes.jpg")
markdown_result_path = os.path.join(output_path, "result.mmd")
# Read the markdown file content if it exists
markdown_content = ""
if os.path.exists(markdown_result_path):
with open(markdown_result_path, "r", encoding="utf-8") as f:
markdown_content = f.read()
        else:
            markdown_content = "Markdown result was not generated. This is expected for the 'Free OCR' task."
result_image = None
# Check if the annotated image exists
if os.path.exists(image_result_path):
result_image = Image.open(image_result_path)
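            # Force the lazy PIL loader to read the pixel data now, before the
            # temporary directory (and this file) is cleaned up.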
result_image.load()
# Return all three results. Gradio will handle the temporary file path for the image.
text_result = plain_text_result if plain_text_result else markdown_content
return result_image, markdown_content, text_result
# Create Gradio interface
with gr.Blocks(title="DeepSeek-OCR", theme=gr.themes.Soft()) as demo:
gr.Markdown(
"""
# DeepSeek-OCR Demo
    Upload an image to extract text using the DeepSeek-OCR model.
Supports various document types and handwriting recognition.
**Model Sizes:**
- **Tiny**: Fastest, lower accuracy (512x512)
- **Small**: Fast, good accuracy (640x640)
- **Base**: Balanced performance (1024x1024)
- **Large**: Best accuracy, slower (1280x1280)
- **Gundam (Recommended)**: Optimized for documents (1024 base, 640 image, crop mode)
"""
)
with gr.Row():
with gr.Column(scale=1):
image_input = gr.Image(
type="pil", label="Upload Image", sources=["upload", "clipboard"]
)
model_size = gr.Dropdown(
choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
value="Gundam (Recommended)",
label="Model Size",
)
task_type = gr.Dropdown(
choices=["Free OCR", "Convert to Markdown"],
value="Convert to Markdown",
label="Task Type",
)
eval_mode_checkbox = gr.Checkbox(
value=False,
label="Enable Evaluation Mode",
info="Returns only plain text, but might be faster. Uncheck to get annotated image and markdown.",
)
submit_btn = gr.Button("Process Image", variant="primary")
with gr.Column(scale=2):
with gr.Tabs():
with gr.TabItem("Annotated Image"):
output_image = gr.Image(
interactive=False
)
with gr.TabItem("Markdown Preview"):
output_markdown = gr.Markdown()
with gr.TabItem("Markdown Source(or Eval Output)"):
output_text = gr.Textbox(
lines=20,
show_copy_button=True,
interactive=False,
)
# Examples
gr.Examples(
examples=[
["examples/math.png", "Gundam (Recommended)", "Convert to Markdown"],
["examples/receipt.jpg", "Base", "Convert to Markdown"],
["examples/receipt-2.png", "Base", "Convert to Markdown"],
],
inputs=[image_input, model_size, task_type, eval_mode_checkbox],
outputs=[output_image, output_markdown, output_text],
fn=process_image,
cache_examples=True,
)
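    # cache_examples=True precomputes outputs for the examples at startup so
    # selecting an example returns cached results instead of re-running inference.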
submit_btn.click(
fn=process_image,
inputs=[image_input, model_size, task_type, eval_mode_checkbox],
outputs=[output_image, output_markdown, output_text],
)
# Launch the app
if __name__ == "__main__":
demo.queue(max_size=20)
demo.launch()