Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -4,7 +4,8 @@ from transformers import AutoModel, AutoTokenizer
 import spaces
 import os
 import tempfile
-from PIL import Image
+from PIL import Image, ImageDraw
+import re  # Import the regular expression library

 # --- 1. Load Model and Tokenizer (Done only once at startup) ---
 print("Loading model and tokenizer...")
@@ -20,29 +21,32 @@ model = AutoModel.from_pretrained(
 model = model.eval()
 print("✅ Model loaded successfully.")

-
-
+# --- Helper function to find pre-generated result images ---
+def find_result_image(path):
+    for filename in os.listdir(path):
+        if "grounding" in filename or "result" in filename:
+            try:
+                image_path = os.path.join(path, filename)
+                return Image.open(image_path)
+            except Exception as e:
+                print(f"Error opening result image {filename}: {e}")
+    return None
+
+# --- 2. Main Processing Function (UPDATED) ---
 @spaces.GPU
 def process_ocr_task(image, model_size, task_type, ref_text):
     """
     Processes an image with DeepSeek-OCR for all supported tasks.
-    Args:
-        image (PIL.Image): The input image.
-        model_size (str): The model size configuration.
-        task_type (str): The type of OCR task to perform.
-        ref_text (str): The reference text for the 'Locate' task.
     """
     if image is None:
         return "Please upload an image first.", None

-    # Move the model to GPU and use bfloat16 for better performance
     print("🚀 Moving model to GPU...")
     model_gpu = model.cuda().to(torch.bfloat16)
     print("✅ Model is on GPU.")

-    # Create a temporary directory to store files
     with tempfile.TemporaryDirectory() as output_path:
-        #
+        # Build the prompt... (same as before)
         if task_type == "📝 Free OCR":
             prompt = "<image>\nFree OCR."
         elif task_type == "📄 Convert to Markdown":
@@ -52,16 +56,14 @@ def process_ocr_task(image, model_size, task_type, ref_text):
         elif task_type == "🔍 Locate Object by Reference":
             if not ref_text or ref_text.strip() == "":
                 raise gr.Error("For the 'Locate' task, you must provide the reference text to find!")
-            # Use an f-string to embed the user's reference text into the prompt
             prompt = f"<image>\nLocate <|ref|>{ref_text.strip()}<|/ref|> in the image."
         else:
-            prompt = "<image>\nFree OCR."
+            prompt = "<image>\nFree OCR."

-        # Save the uploaded image to the temporary path
         temp_image_path = os.path.join(output_path, "temp_image.png")
         image.save(temp_image_path)

-        # Configure model size
+        # Configure model size... (same as before)
         size_configs = {
             "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
             "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
@@ -72,7 +74,6 @@ def process_ocr_task(image, model_size, task_type, ref_text):
         config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])

         print(f"🏃 Running inference with prompt: {prompt}")
-        # --- Run the model's inference method ---
         text_result = model_gpu.infer(
             tokenizer,
             prompt=prompt,
@@ -81,26 +82,51 @@ def process_ocr_task(image, model_size, task_type, ref_text):
             base_size=config["base_size"],
             image_size=config["image_size"],
             crop_mode=config["crop_mode"],
-            save_results=True,
+            save_results=True,
             test_compress=True,
             eval_mode=True,
         )

         print(f"====\n📄 Text Result: {text_result}\n====")

-        # --- Handle the output
-
-        # Tasks that generate a visual output usually create a 'grounding' or 'result' image
-        if task_type in ["🔍 Locate Object by Reference", "📄 Convert to Markdown", "📊 Parse Figure"]:
-            # Find the result image in the output directory
-            for filename in os.listdir(output_path):
-                if "grounding" in filename or "result" in filename:
-                    image_result_path = os.path.join(output_path, filename)
-                    break
-
-            # If an image was found, open it with PIL; otherwise, return None
-            result_image_pil = Image.open(image_result_path) if image_result_path else None
+        # --- NEW: Handle the output with custom bounding box drawing ---
+        result_image_pil = None

+        if task_type == "🔍 Locate Object by Reference":
+            # Define the pattern to find coordinates like [[280, 15, 696, 997]]
+            pattern = re.compile(r"<\|det\|>\[\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]\]<\|/det\|>")
+            match = pattern.search(text_result)
+
+            if match:
+                print("✅ Found bounding box coordinates. Drawing on the original image.")
+                # Extract coordinates as integers
+                coords_norm = [int(c) for c in match.groups()]
+                x1_norm, y1_norm, x2_norm, y2_norm = coords_norm
+
+                # Get the original image's dimensions
+                w, h = image.size
+
+                # Scale the normalized coordinates (from 1000x1000 space) to the image's actual size
+                x1 = int(x1_norm / 1000 * w)
+                y1 = int(y1_norm / 1000 * h)
+                x2 = int(x2_norm / 1000 * w)
+                y2 = int(y2_norm / 1000 * h)
+
+                # Create a copy of the original image to draw on
+                image_with_bbox = image.copy()
+                draw = ImageDraw.Draw(image_with_bbox)
+
+                # Draw the rectangle with a red outline, 3 pixels wide
+                draw.rectangle([x1, y1, x2, y2], outline="red", width=3)
+
+                result_image_pil = image_with_bbox
+            else:
+                print("⚠️ Could not parse bbox from text. Falling back to searching for a result image.")
+                result_image_pil = find_result_image(output_path)
+        else:
+            # For other tasks, use the old method of finding the generated image
+            result_image_pil = find_result_image(output_path)
+
         return text_result, result_image_pil
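As a sanity check on the new output handling, the standalone sketch below exercises the same regex and 1000-grid coordinate scaling outside the app. The sample response string and the 800x600 image size are invented for illustration; the [[280, 15, 696, 997]] box reuses the example from the code comment in the diff.

import re
from PIL import Image, ImageDraw

# Hypothetical model response in the format the app parses; only the
# <|det|>...<|/det|> block matters to the regex.
sample = "<|ref|>the title<|/ref|><|det|>[[280, 15, 696, 997]]<|/det|>"

pattern = re.compile(r"<\|det\|>\[\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]\]<\|/det\|>")
match = pattern.search(sample)
assert match is not None

# Coordinates are normalized to a 1000x1000 grid (per the comment in app.py).
x1n, y1n, x2n, y2n = (int(c) for c in match.groups())

# Assume an 800x600 input image and scale the box to pixel space.
w, h = 800, 600
box = (int(x1n / 1000 * w), int(y1n / 1000 * h),
       int(x2n / 1000 * w), int(y2n / 1000 * h))
print(box)  # (224, 9, 556, 598)

# Drawing mirrors app.py (no copy needed here, the image is synthetic).
img = Image.new("RGB", (w, h), "white")
ImageDraw.Draw(img).rectangle(box, outline="red", width=3)

Run against a real model response, the parsed box lands on the located object in the uploaded image; when no <|det|> block is present, app.py now falls back to find_result_image() instead of failing.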