Spaces: Mountchicken / Rex-Omni

Running on Zero

App Files Community

Mountchicken committed 23 days ago

Commit

60f587b

verified ·

1 Parent(s): a8932c9

Update rex_omni/wrapper.py

Browse files

Files changed (1) hide show

rex_omni/wrapper.py +4 -175

rex_omni/wrapper.py CHANGED Viewed

@@ -4,194 +4,24 @@
 """
 Main wrapper class for Rex Omni
 """
-import spaces
-import base64
 import json
-import math
 import time
-from io import BytesIO
 from typing import Any, Dict, List, Optional, Tuple, Union
-import requests
 import torch
 from PIL import Image
 from .parser import convert_boxes_to_normalized_bins, parse_prediction
 from .tasks import TASK_CONFIGS, TaskType, get_keypoint_config, get_task_config
-IMAGE_FACTOR = 28
-MIN_PIXELS = 4 * 28 * 28
-MAX_PIXELS = 16384 * 28 * 28
-MAX_RATIO = 200
-VIDEO_MIN_PIXELS = 128 * 28 * 28
-VIDEO_MAX_PIXELS = 768 * 28 * 28
-FRAME_FACTOR = 2
-FPS = 2.0
-FPS_MIN_FRAMES = 4
-FPS_MAX_FRAMES = 768
-def round_by_factor(number: int, factor: int) -> int:
-    """Returns the closest integer to 'number' that is divisible by 'factor'."""
-    return round(number / factor) * factor
-def ceil_by_factor(number: int, factor: int) -> int:
-    """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
-    return math.ceil(number / factor) * factor
-def floor_by_factor(number: int, factor: int) -> int:
-    """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
-    return math.floor(number / factor) * factor
-def extract_vision_info(conversations: list[dict] | list[list[dict]]) -> list[dict]:
-    vision_infos = []
-    if isinstance(conversations[0], dict):
-        conversations = [conversations]
-    for conversation in conversations:
-        for message in conversation:
-            if isinstance(message["content"], list):
-                for ele in message["content"]:
-                    if (
-                        "image" in ele
-                        or "image_url" in ele
-                        or "video" in ele
-                        or ele["type"] in ("image", "image_url", "video")
-                    ):
-                        vision_infos.append(ele)
-    return vision_infos
-def to_rgb(pil_image: Image.Image) -> Image.Image:
-    if pil_image.mode == "RGBA":
-        white_background = Image.new("RGB", pil_image.size, (255, 255, 255))
-        white_background.paste(
-            pil_image, mask=pil_image.split()[3]
-        )  # Use alpha channel as mask
-        return white_background
-    else:
-        return pil_image.convert("RGB")
-def fetch_image(
-    ele: dict[str, str | Image.Image], size_factor: int = IMAGE_FACTOR
-) -> Image.Image:
-    if "image" in ele:
-        image = ele["image"]
-    else:
-        image = ele["image_url"]
-    image_obj = None
-    if isinstance(image, Image.Image):
-        image_obj = image
-    elif image.startswith("http://") or image.startswith("https://"):
-        response = requests.get(image, stream=True)
-        image_obj = Image.open(BytesIO(response.content))
-    elif image.startswith("file://"):
-        image_obj = Image.open(image[7:])
-    elif image.startswith("data:image"):
-        if "base64," in image:
-            _, base64_data = image.split("base64,", 1)
-            data = base64.b64decode(base64_data)
-            image_obj = Image.open(BytesIO(data))
-    else:
-        image_obj = Image.open(image)
-    if image_obj is None:
-        raise ValueError(
-            f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}"
-        )
-    image = to_rgb(image_obj)
-    ## resize
-    if "resized_height" in ele and "resized_width" in ele:
-        resized_height, resized_width = smart_resize(
-            ele["resized_height"],
-            ele["resized_width"],
-            factor=size_factor,
-        )
-    else:
-        width, height = image.size
-        min_pixels = ele.get("min_pixels", MIN_PIXELS)
-        max_pixels = ele.get("max_pixels", MAX_PIXELS)
-        resized_height, resized_width = smart_resize(
-            height,
-            width,
-            factor=size_factor,
-            min_pixels=min_pixels,
-            max_pixels=max_pixels,
-        )
-    image = image.resize((resized_width, resized_height))
-    return image
-def process_vision_info(
-    conversations: list[dict] | list[list[dict]],
-    return_video_kwargs: bool = False,
-) -> tuple[
-    list[Image.Image] | None,
-    list[torch.Tensor | list[Image.Image]] | None,
-    Optional[dict],
-]:
-    vision_infos = extract_vision_info(conversations)
-    ## Read images or videos
-    image_inputs = []
-    video_inputs = []
-    video_sample_fps_list = []
-    for vision_info in vision_infos:
-        if "image" in vision_info or "image_url" in vision_info:
-            image_inputs.append(fetch_image(vision_info))
-        else:
-            raise ValueError("image, image_url or video should in content.")
-    if len(image_inputs) == 0:
-        image_inputs = None
-    if len(video_inputs) == 0:
-        video_inputs = None
-    if return_video_kwargs:
-        return image_inputs, video_inputs, {"fps": video_sample_fps_list}
-    return image_inputs, video_inputs
-def smart_resize(
-    height: int,
-    width: int,
-    factor: int = IMAGE_FACTOR,
-    min_pixels: int = MIN_PIXELS,
-    max_pixels: int = MAX_PIXELS,
-) -> tuple[int, int]:
-    """
-    Rescales the image so that the following conditions are met:
-    1. Both dimensions (height and width) are divisible by 'factor'.
-    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
-    3. The aspect ratio of the image is maintained as closely as possible.
-    """
-    if max(height, width) / min(height, width) > MAX_RATIO:
-        raise ValueError(
-            f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
-        )
-    h_bar = max(factor, round_by_factor(height, factor))
-    w_bar = max(factor, round_by_factor(width, factor))
-    if h_bar * w_bar > max_pixels:
-        beta = math.sqrt((height * width) / max_pixels)
-        h_bar = floor_by_factor(height / beta, factor)
-        w_bar = floor_by_factor(width / beta, factor)
-    elif h_bar * w_bar < min_pixels:
-        beta = math.sqrt(min_pixels / (height * width))
-        h_bar = ceil_by_factor(height * beta, factor)
-        w_bar = ceil_by_factor(width * beta, factor)
-    return h_bar, w_bar
 class RexOmniWrapper:
     """
     High-level wrapper for Rex-Omni
     """
-    @spaces.GPU
     def __init__(
         self,
         model_path: str,
@@ -304,8 +134,7 @@ class RexOmniWrapper:
         elif self.backend == "transformers":
             import torch
-            from transformers import (AutoProcessor,
-                                      Qwen2_5_VLForConditionalGeneration)
             # Initialize transformers model
             self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(

 """
 Main wrapper class for Rex Omni
 """
 import json
 import time
 from typing import Any, Dict, List, Optional, Tuple, Union
 import torch
 from PIL import Image
+from qwen_vl_utils import process_vision_info, smart_resize
 from .parser import convert_boxes_to_normalized_bins, parse_prediction
 from .tasks import TASK_CONFIGS, TaskType, get_keypoint_config, get_task_config
 class RexOmniWrapper:
     """
     High-level wrapper for Rex-Omni
     """
     def __init__(
         self,
         model_path: str,
         elif self.backend == "transformers":
             import torch
+            from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
             # Initialize transformers model
             self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(