Update app.py

app.py CHANGED
@@ -34,7 +34,7 @@ def pick_dtype(device: str) -> torch.dtype:
         return torch.bfloat16 if major >= 8 else torch.float16  # Ampere+ -> bf16
     if device == "mps":
         return torch.float16
-    return torch.
+    return torch.float32  # CPU: FP32 is usually fastest & most stable
 
 def move_to_device(batch, device: str):
     if isinstance(batch, dict):
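For context, the hunk above only touches the last line of pick_dtype. A minimal sketch of how the full helper plausibly reads, assuming a cuda branch and the `major` lookup that the visible lines imply (neither appears in the diff):

import torch

def pick_dtype(device: str) -> torch.dtype:
    if device == "cuda":
        major, _minor = torch.cuda.get_device_capability()  # assumed source of 'major'
        return torch.bfloat16 if major >= 8 else torch.float16  # Ampere+ -> bf16
    if device == "mps":
        return torch.float16
    return torch.float32  # CPU: FP32 is usually fastest & most stable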
@@ -82,6 +82,52 @@ def trim_generated(generated_ids, inputs):
         return [out_ids for out_ids in generated_ids]
     return [out_ids[len(in_seq):] for in_seq, out_ids in zip(in_ids, generated_ids)]
 
+# --- Parsing helper: normalize various UI-TARS click formats to (x, y) ---
+def parse_click_coordinates(text: str, img_w: int, img_h: int):
+    """
+    Returns (x, y) in image coordinates, clamped to bounds, or None.
+    Handles:
+      - Click(start_box='(x,y)') / Click(end_box='(x,y)')
+      - Click(box='(x1,y1,x2,y2)') -> center
+      - Click(x, y)
+      - Click({'x':..., 'y':...}) / Click({"x":...,"y":...})
+    Preference: start_box > end_box when both exist.
+    """
+    s = str(text)
+
+    # 1) start_box / end_box
+    pairs = re.findall(r"(start_box|end_box)\s*=\s*['\"]\(\s*(\d+)\s*,\s*(\d+)\s*\)['\"]", s)
+    if pairs:
+        start = next(((int(x), int(y)) for k, x, y in pairs if k == "start_box"), None)
+        if start:
+            x, y = start
+            return max(0, min(x, img_w - 1)), max(0, min(y, img_h - 1))
+        end = next(((int(x), int(y)) for k, x, y in pairs if k == "end_box"), None)
+        if end:
+            x, y = end
+            return max(0, min(x, img_w - 1)), max(0, min(y, img_h - 1))
+
+    # 2) box='(x1,y1,x2,y2)' -> center
+    m = re.search(r"box\s*=\s*['\"]\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\)['\"]", s)
+    if m:
+        x1, y1, x2, y2 = map(int, m.groups())
+        cx, cy = (x1 + x2) // 2, (y1 + y2) // 2
+        return max(0, min(cx, img_w - 1)), max(0, min(cy, img_h - 1))
+
+    # 3) Direct Click(x, y)
+    m = re.search(r"Click\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)", s)
+    if m:
+        x, y = int(m.group(1)), int(m.group(2))
+        return max(0, min(x, img_w - 1)), max(0, min(y, img_h - 1))
+
+    # 4) JSON-ish dicts (trailing [^)]* lets the match consume the closing brace)
+    m = re.search(r"Click\s*\(\s*[{[][^)}]*['\"]?x['\"]?\s*:\s*(\d+)\s*,\s*['\"]?y['\"]?\s*:\s*(\d+)[^)]*\)", s)
+    if m:
+        x, y = int(m.group(1)), int(m.group(2))
+        return max(0, min(x, img_w - 1)), max(0, min(y, img_h - 1))
+
+    return None
+
 # --- Load model/processor ON CPU at import time (required for ZeroGPU) ---
 print(f"Loading model and processor for {MODEL_ID} on CPU startup (ZeroGPU safe)...")
 model = None
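A few illustrative calls against the new parser; the 1000x800 image size is arbitrary, and the expected results follow directly from the regexes above:

samples = [
    "Click(start_box='(312, 415)')",      # -> (312, 415)
    "Click(box='(100, 200, 300, 400)')",  # -> box center (200, 300)
    "Click(640, 960)",                    # -> y clamped: (640, 799)
    "Click({'x': 50, 'y': 60})",          # -> (50, 60)
    "no click found",                     # -> None
]
for s in samples:
    print(s, "->", parse_click_coordinates(s, 1000, 800))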
@@ -168,8 +214,7 @@ def run_inference_localization(
     return decoded_output[0] if decoded_output else ""
 
 # --- Gradio processing function (ZeroGPU-visible) ---
-#
-@spaces.GPU(duration=120)  # keep GPU attached briefly between calls (seconds)
+@spaces.GPU(duration=120)  # keep GPU attached briefly between calls (seconds)
 def predict_click_location(input_pil_image: Image.Image, instruction: str):
     if not model_loaded or not processor or not model:
         return f"Model not loaded. Error: {load_error_message}", None, "device: n/a | dtype: n/a"
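For readers new to ZeroGPU: a GPU is attached only while a `@spaces.GPU`-decorated function runs, which is why the model is loaded on CPU at import time. A minimal sketch of the pattern; `gpu_step` and its body are illustrative, not the app's actual run_inference_localization:

import spaces
import torch

@spaces.GPU(duration=120)  # GPU stays attached for up to 120 s per call
def gpu_step(model, batch: dict):
    # CUDA is only available inside the decorated call, so weights and
    # inputs move to the GPU here, not at import time.
    model = model.to("cuda")
    batch = {k: v.to("cuda") if torch.is_tensor(v) else v for k, v in batch.items()}
    with torch.inference_mode():
        return model.generate(**batch)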
@@ -229,21 +274,17 @@ def predict_click_location(input_pil_image: Image.Image, instruction: str):
 
     # 4) Parse coordinates and draw marker
     output_image_with_click = resized_image.copy().convert("RGB")
-
-
-
-
-
-
-
-
-
-            print(f"Predicted and drawn click at: ({x}, {y}) on resized image ({resized_width}x{resized_height})")
-        except Exception as e:
-            print(f"Error drawing on image: {e}")
-            traceback.print_exc()
+    coords = parse_click_coordinates(coordinates_str, resized_width, resized_height)
+
+    if coords is not None:
+        x, y = coords
+        draw = ImageDraw.Draw(output_image_with_click)
+        radius = max(5, min(resized_width // 100, resized_height // 100, 15))
+        bbox = (x - radius, y - radius, x + radius, y + radius)
+        draw.ellipse(bbox, outline="red", width=max(2, radius // 4))
+        print(f"Predicted and drawn click at: ({x}, {y}) on resized image ({resized_width}x{resized_height})")
     else:
-        print(f"Could not parse
+        print(f"Could not parse a click from model output: {coordinates_str}")
 
     return coordinates_str, output_image_with_click, f"device: {device} | dtype: {str(dtype).replace('torch.', '')}"
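The marker geometry in isolation, under assumed dimensions; the radius scales with image size but is pinned to the 5-15 px range:

from PIL import Image, ImageDraw

w, h = 1000, 800                              # assumed image size
img = Image.new("RGB", (w, h), "white")
x, y = 640, 320                               # assumed click point
radius = max(5, min(w // 100, h // 100, 15))  # here max(5, min(10, 8, 15)) = 8
draw = ImageDraw.Draw(img)
draw.ellipse((x - radius, y - radius, x + radius, y + radius),
             outline="red", width=max(2, radius // 4))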
@@ -293,7 +334,7 @@ else:
 
         with gr.Column(scale=1):
             output_coords_component = gr.Textbox(
-                label="Predicted Coordinates (
+                label="Predicted Coordinates (Normalized)",
                 interactive=False
             )
             output_image_component = gr.Image(
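For completeness, how the relabeled Textbox sits in its output column; every kwarg not visible in the hunk is an assumption:

import gradio as gr

with gr.Blocks() as demo:
    with gr.Column(scale=1):
        output_coords_component = gr.Textbox(
            label="Predicted Coordinates (Normalized)",
            interactive=False,
        )
        output_image_component = gr.Image(
            label="Image with Predicted Click",  # assumed label
            type="pil",                          # assumed; the app returns a PIL image
        )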
|