DeepSeek-OCR-DEMO

Running on Zero

App Files Files Community

khang119966 committed 15 days ago

Commit

d03ac84

verified ·

1 Parent(s): 9d5127a

Update app.py

Browse files

Files changed (1) hide show

app.py +122 -64

app.py CHANGED Viewed

@@ -4,10 +4,14 @@ from transformers import AutoModel, AutoTokenizer
 import spaces
 import os
 import tempfile
-# Load model and tokenizer
 model_name = "deepseek-ai/DeepSeek-OCR"
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 model = AutoModel.from_pretrained(
     model_name,
     _attn_implementation="flash_attention_2",
@@ -15,52 +19,63 @@ model = AutoModel.from_pretrained(
     use_safetensors=True,
 )
 model = model.eval()
 @spaces.GPU
-def process_image(image, model_size, task_type):
     """
-    Process image with DeepSeek-OCR
     Args:
-        image: PIL Image or file path
-        model_size: Model size configuration
-        task_type: OCR task type
     """
-    # 在 GPU 函数内部移动模型到 GPU
     model_gpu = model.cuda().to(torch.bfloat16)
-    # Create temporary directory for output
     with tempfile.TemporaryDirectory() as output_path:
-        # Set prompt based on task type
         if task_type == "Free OCR":
-            prompt = "<image>\nFree OCR. "
         elif task_type == "Convert to Markdown":
-            prompt = "<image>\n<|grounding|>Convert the document to markdown. "
         else:
-            prompt = "<image>\nFree OCR. "
-        # Save uploaded image temporarily
-        temp_image_path = os.path.join(output_path, "temp_image.jpg")
         image.save(temp_image_path)
-        # Configure model size parameters
         size_configs = {
             "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
             "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
             "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
             "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
-            "Gundam (Recommended)": {
-                "base_size": 1024,
-                "image_size": 640,
-                "crop_mode": True,
-            },
         }
         config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])
-        # Run inference
-        result = model_gpu.infer(
             tokenizer,
             prompt=prompt,
             image_file=temp_image_path,
@@ -68,38 +83,50 @@ def process_image(image, model_size, task_type):
             base_size=config["base_size"],
             image_size=config["image_size"],
             crop_mode=config["crop_mode"],
-            save_results=True,
             test_compress=True,
             eval_mode=True,
         )
-        print(f"====\nresult: {result}\n====\n")
-        return result
-# Create Gradio interface
-with gr.Blocks(title="DeepSeek-OCR") as demo:
     gr.Markdown(
         """
-        # DeepSeek-OCR Document Recognition
-        Upload an image to extract text using DeepSeek-OCR model.
-        Supports various document types and handwriting recognition.
-        **Model Sizes:**
-        - **Tiny**: Fastest, lower accuracy (512x512)
-        - **Small**: Fast, good accuracy (640x640)
-        - **Base**: Balanced performance (1024x1024)
-        - **Large**: Best accuracy, slower (1280x1280)
-        - **Gundam (Recommended)**: Optimized for documents (1024 base, 640 image, crop mode)
         """
     )
     with gr.Row():
-        with gr.Column():
-            image_input = gr.Image(
-                type="pil", label="Upload Image", sources=["upload", "clipboard"]
-            )
             model_size = gr.Dropdown(
                 choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
@@ -108,37 +135,68 @@ with gr.Blocks(title="DeepSeek-OCR") as demo:
             )
             task_type = gr.Dropdown(
-                choices=["Free OCR", "Convert to Markdown"],
                 value="Convert to Markdown",
                 label="Task Type",
             )
-            submit_btn = gr.Button("Process Image", variant="primary")
-        with gr.Column():
-            output_text = gr.Textbox(
-                label="OCR Result", lines=20, show_copy_button=True
-            )
-    # Examples
     gr.Examples(
         examples=[
-            ["examples/math.png", "Gundam (Recommended)", "Convert to Markdown"],
-            ["examples/receipt.jpg", "Base", "Free OCR"],
         ],
-        inputs=[image_input, model_size, task_type],
-        outputs=output_text,
-        fn=process_image,
-        cache_examples=False,
-    )
-    submit_btn.click(
-        fn=process_image,
-        inputs=[image_input, model_size, task_type],
-        outputs=output_text,
     )
-# Launch the app
 if __name__ == "__main__":
     demo.queue(max_size=20)
-    demo.launch()

 import spaces
 import os
 import tempfile
+from PIL import Image
+# --- Load the model and tokenizer (once, at startup) ---
+# Loading is done at module level so the model is not reloaded on every function call
+print("Loading model and tokenizer...")
 model_name = "deepseek-ai/DeepSeek-OCR"
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+# Load the model on the CPU first; it is moved to the GPU inside the processing function
 model = AutoModel.from_pretrained(
     model_name,
     _attn_implementation="flash_attention_2",
     use_safetensors=True,
 )
 model = model.eval()
+print("Model loaded successfully.")
+# --- Main processing function ---
 @spaces.GPU
+def process_ocr_task(image, model_size, task_type, ref_text):
     """
+    Process an image with DeepSeek-OCR for all supported tasks.
     Args:
+        image: PIL Image object
+        model_size: Model size configuration
+        task_type: OCR task type
+        ref_text: Reference text for the 'Locate' task
+    Returns:
+        A (text, image) pair: the value returned by model.infer and the
+        result image found in the output directory (or None). If no image
+        was uploaded, a message string and None.
+    Raises:
+        gr.Error: if the 'Locate' task is selected with an empty ref_text.
     """
+    if image is None:
+        return "Please upload an image first.", None
+    # Move the model to the GPU and cast it to bfloat16 for performance
+    print("Moving model to GPU...")
     model_gpu = model.cuda().to(torch.bfloat16)
+    print("Model on GPU.")
+    # Create a temporary directory to hold the output
     with tempfile.TemporaryDirectory() as output_path:
+        # --- Build the prompt based on the task type ---
         if task_type == "Free OCR":
+            prompt = "<image>\nFree OCR."
         elif task_type == "Convert to Markdown":
+            prompt = "<image>\n<|grounding|>Convert the document to markdown."
+        elif task_type == "Parse Figure":
+            prompt = "<image>\nParse the figure."
+        elif task_type == "Locate Object by Reference":
+            if not ref_text or ref_text.strip() == "":
+                raise gr.Error("For 'Locate' task, please provide the reference text to find.")
+            # Use an f-string to insert the reference text into the prompt
+            prompt = f"<image>\nLocate <|ref|>{ref_text.strip()}<|/ref|> in the image."
         else:
+            # Default to Free OCR
+            prompt = "<image>\nFree OCR."
+        # Save the uploaded image into the temporary directory
+        temp_image_path = os.path.join(output_path, "temp_image.png")
         image.save(temp_image_path)
+        # Configure the model size parameters
         size_configs = {
             "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
             "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
             "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
             "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
+            "Gundam (Recommended)": {"base_size": 1024, "image_size": 640, "crop_mode": True},
         }
         config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])
+        print(f"Running inference with prompt: {prompt}")
+        # --- Run inference ---
+        text_result = model_gpu.infer(
             tokenizer,
             prompt=prompt,
             image_file=temp_image_path,
             base_size=config["base_size"],
             image_size=config["image_size"],
             crop_mode=config["crop_mode"],
+            save_results=True,  # Important: results must be saved to obtain the output image
             test_compress=True,
             eval_mode=True,
         )
+        print(f"====\nText Result: {text_result}\n====")
+        # --- Handle the output (text and image) ---
+        image_result_path = None
+        # The 'Locate' and 'Markdown' tasks usually produce a result image whose name contains 'grounding'
+        if task_type in ["Locate Object by Reference", "Convert to Markdown", "Parse Figure"]:
+            # Look for the result image file in the output directory
+            # NOTE(review): the match is on filename substring only, so any file type whose
+            # name contains "grounding" or "result" is picked up — confirm what infer() writes
+            for filename in os.listdir(output_path):
+                if "grounding" in filename or "result" in filename:
+                    image_result_path = os.path.join(output_path, filename)
+                    break
+        # If an image was found, load it; otherwise return None
+        result_image_pil = Image.open(image_result_path) if image_result_path else None
+        return text_result, result_image_pil
+# --- Build the Gradio interface ---
+with gr.Blocks(title="DeepSeek-OCR", theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         """
+        # Demo toàn diện DeepSeek-OCR
+        Tải lên một hình ảnh để thử nghiệm các khả năng nhận dạng và hiểu tài liệu của DeepSeek-OCR.
+        **Hướng dẫn:**
+        1. Tải lên một hình ảnh.
+        2. Chọn **Model Size** phù hợp (Gundam được khuyến nghị cho tài liệu).
+        3. Chọn **Task Type**:
+            - **Free OCR**: Trích xuất văn bản thô.
+            - **Convert to Markdown**: Chuyển đổi tài liệu (giữ cấu trúc) sang định dạng Markdown.
+            - **Parse Figure**: Phân tích và trích xuất dữ liệu từ biểu đồ, hình vẽ.
+            - **Locate Object by Reference**: Tìm một đối tượng hoặc văn bản cụ thể trong ảnh. **Bạn cần nhập nội dung cần tìm vào ô "Reference Text" bên dưới.**
         """
     )
     with gr.Row():
+        with gr.Column(scale=1):
+            image_input = gr.Image(type="pil", label="Tải ảnh lên", sources=["upload", "clipboard"])
             model_size = gr.Dropdown(
                 choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
             )
             task_type = gr.Dropdown(
+                choices=["Free OCR", "Convert to Markdown", "Parse Figure", "Locate Object by Reference"],
                 value="Convert to Markdown",
                 label="Task Type",
             )
+            # Reference-text input box, hidden initially
+            ref_text_input = gr.Textbox(
+                label="Reference Text (cho tác vụ Locate)",
+                placeholder="Ví dụ: the teacher, 11-2=, a red car...",
+                visible=False, # Hidden initially
+            )
+            submit_btn = gr.Button("Xử lý", variant="primary")
+        with gr.Column(scale=2):
+            output_text = gr.Textbox(label="Kết quả văn bản", lines=15, show_copy_button=True)
+            output_image = gr.Image(label="Kết quả hình ảnh (nếu có)", type="pil")
+    # --- Interaction logic for the interface ---
+    def toggle_ref_text_visibility(task):
+        # If the user selects 'Locate', show the reference-text input box
+        if task == "Locate Object by Reference":
+            return gr.Textbox(visible=True)
+        else:
+            return gr.Textbox(visible=False)
+    # When the 'task_type' dropdown changes, call the function to update the visibility of ref_text_input
+    task_type.change(
+        fn=toggle_ref_text_visibility,
+        inputs=task_type,
+        outputs=ref_text_input,
+    )
+    # When the submit button is clicked
+    submit_btn.click(
+        fn=process_ocr_task,
+        inputs=[image_input, model_size, task_type, ref_text_input],
+        outputs=[output_text, output_image],
+    )
+    # --- Illustrative examples ---
     gr.Examples(
         examples=[
+            ["./examples/doc_markdown.png", "Gundam (Recommended)", "Convert to Markdown", ""],
+            ["./examples/chart.png", "Gundam (Recommended)", "Parse Figure", ""],
+            ["./examples/teacher.png", "Base", "Locate Object by Reference", "the teacher"],
+            ["./examples/math_locate.png", "Small", "Locate Object by Reference", "11-2="],
+            ["./examples/receipt.jpg", "Base", "Free OCR", ""],
         ],
+        inputs=[image_input, model_size, task_type, ref_text_input],
+        outputs=[output_text, output_image],
+        fn=process_ocr_task,
+        cache_examples=False, # Disable caching so the function re-runs on every click
     )
+# --- Launch the application ---
 if __name__ == "__main__":
+    # Create the examples directory if it does not exist yet (no images are downloaded here)
+    if not os.path.exists("examples"):
+        os.makedirs("examples")
+    # You must place the example image files in the "examples" directory yourself
+    # e.g. doc_markdown.png, chart.png, teacher.png, math_locate.png, receipt.jpg
     demo.queue(max_size=20)
+    demo.launch(share=True) # share=True to create a public link