Spaces:

reachomk
/

gen2seg

Running on Zero

App Files Files Community

reachomk committed on May 20

Commit

353e8fc

verified ·

1 Parent(s): 87fc13a

Upload 4 files

Browse files

Files changed (4) hide show

app.py +280 -0
gen2seg_mae_pipeline.py +132 -0
gen2seg_sd_pipeline.py +454 -0
requirements.txt +10 -0

app.py ADDED Viewed

	@@ -0,0 +1,280 @@

+import gradio as gr
+import torch
+from PIL import Image
+import numpy as np
+import time
+import os
+# --- Import Custom Pipelines ---
+# Ensure these files are in the same directory or accessible in PYTHONPATH
+try:
+    from gen2seg_sd_pipeline import gen2segSDPipeline
+    from gen2seg_mae_pipeline import gen2segMAEInstancePipeline
+except ImportError as e:
+    print(f"Error importing pipeline modules: {e}")
+    print("Please ensure gen2seg_sd_pipeline.py and gen2seg_mae_pipeline.py are in the same directory.")
+    # Optionally, raise an error or exit if pipelines are critical at startup
+    # raise ImportError("Could not import custom pipeline modules. Check file paths.") from e
+from transformers import ViTMAEForPreTraining, AutoImageProcessor
+# --- Configuration ---
+MODEL_IDS = {
+    "SD": "reachomk/gen2seg-sd",
+    "MAE-H": "reachomk/gen2seg-mae-h"
+}
+# Check if a GPU is available and set the device accordingly
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using device: {DEVICE}")
+# --- Global Variables for Caching Pipelines ---
+sd_pipe_global = None
+mae_pipe_global = None
+# --- Model Loading Functions ---
+def get_sd_pipeline():
+    """Loads and caches the gen2seg Stable Diffusion pipeline."""
+    global sd_pipe_global
+    if sd_pipe_global is None:
+        model_id_sd = MODEL_IDS["SD"]
+        print(f"Attempting to load SD pipeline from Hugging Face Hub: {model_id_sd}")
+        try:
+            sd_pipe_global = gen2segSDPipeline.from_pretrained(
+                model_id_sd,
+                use_safetensors=True,
+                # torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32, # Optional: use float16 on GPU
+            ).to(DEVICE)
+            print(f"SD Pipeline loaded successfully from {model_id_sd} on {DEVICE}.")
+        except Exception as e:
+            print(f"Error loading SD pipeline from Hugging Face Hub ({model_id_sd}): {e}")
+            sd_pipe_global = None # Ensure it remains None on failure
+            # Do not raise gr.Error here; let the main function handle it.
+    return sd_pipe_global
+def get_mae_pipeline():
+    """Loads and caches the gen2seg MAE-H pipeline."""
+    global mae_pipe_global
+    if mae_pipe_global is None:
+        model_id_mae = MODEL_IDS["MAE-H"]
+        print(f"Loading MAE-H pipeline with model {model_id_mae} on {DEVICE}...")
+        try:
+            model = ViTMAEForPreTraining.from_pretrained(model_id_mae)
+            model.to(DEVICE)
+            model.eval() # Set to evaluation mode
+            # Load the official MAE-H image processor
+            # Using "facebook/vit-mae-huge" as per the original app_mae.py
+            image_processor = AutoImageProcessor.from_pretrained("facebook/vit-mae-huge")
+            mae_pipe_global = gen2segMAEInstancePipeline(model=model, image_processor=image_processor)
+            # The custom MAE pipeline's model is already on the DEVICE.
+            print(f"MAE-H Pipeline with model {model_id_mae} loaded successfully on {DEVICE}.")
+        except Exception as e:
+            print(f"Error loading MAE-H model or pipeline from Hugging Face Hub ({model_id_mae}): {e}")
+            mae_pipe_global = None # Ensure it remains None on failure
+            # Do not raise gr.Error here; let the main function handle it.
+    return mae_pipe_global
+# --- Unified Prediction Function ---
+def segment_image(input_image: Image.Image, model_choice: str) -> Image.Image:
+    """
+    Takes a PIL Image and model choice, performs segmentation, and returns the segmented image.
+    """
+    if input_image is None:
+        raise gr.Error("No image provided. Please upload an image.")
+    print(f"Model selected: {model_choice}")
+    # Ensure image is in RGB format
+    image_rgb = input_image.convert("RGB")
+    original_resolution = image_rgb.size # (width, height)
+    seg_array = None
+    try:
+        if model_choice == "SD":
+            pipe_sd = get_sd_pipeline()
+            if pipe_sd is None:
+                raise gr.Error("The SD segmentation pipeline could not be loaded. "
+                               "Please check the Space logs for more details, or try again later.")
+            print(f"Running SD inference with image size: {image_rgb.size}")
+            start_time = time.time()
+            with torch.no_grad():
+                # The gen2segSDPipeline expects a single image or a list
+                # The pipeline's __call__ method handles preprocessing internally
+                seg_output = pipe_sd(image_rgb, match_input_resolution=False).prediction # Output is before resize
+                # seg_output is expected to be a numpy array (N,H,W,1) or (N,1,H,W) or tensor
+                # Based on gen2seg_sd_pipeline.py, if output_type="np" (default), it's [N,H,W,1]
+                # If output_type="pt", it's [N,1,H,W]
+                # The original app_sd.py converted tensor to numpy and squeezed.
+                if isinstance(seg_output, torch.Tensor):
+                    seg_output = seg_output.cpu().numpy()
+                if seg_output.ndim == 4 and seg_output.shape[0] == 1: # Batch size 1
+                    if seg_output.shape[1] == 1: # Grayscale, (1, 1, H, W)
+                        seg_array = seg_output.squeeze(0).squeeze(0).astype(np.uint8)
+                    elif seg_output.shape[-1] == 1: # Grayscale, (1, H, W, 1)
+                         seg_array = seg_output.squeeze(0).squeeze(-1).astype(np.uint8)
+                    elif seg_output.shape[1] == 3: # RGB, (1, 3, H, W) -> (H, W, 3)
+                        seg_array = np.transpose(seg_output.squeeze(0), (1, 2, 0)).astype(np.uint8)
+                    elif seg_output.shape[-1] == 3: # RGB, (1, H, W, 3)
+                         seg_array = seg_output.squeeze(0).astype(np.uint8)
+                    else: # Fallback for unexpected shapes
+                        seg_array = seg_output.squeeze().astype(np.uint8)
+                elif seg_output.ndim == 3: # (H, W, C) or (C, H, W)
+                    seg_array = seg_output.astype(np.uint8)
+                elif seg_output.ndim == 2: # (H,W)
+                    seg_array = seg_output.astype(np.uint8)
+                else:
+                    raise TypeError(f"Unexpected SD segmentation output type/shape: {type(seg_output)}, {seg_output.shape}")
+            end_time = time.time()
+            print(f"SD Inference completed in {end_time - start_time:.2f} seconds.")
+        elif model_choice == "MAE-H":
+            pipe_mae = get_mae_pipeline()
+            if pipe_mae is None:
+                raise gr.Error("The MAE-H segmentation pipeline could not be loaded. "
+                               "Please check the Space logs for more details, or try again later.")
+            print(f"Running MAE-H inference with image size: {image_rgb.size}")
+            start_time = time.time()
+            with torch.no_grad():
+                # The gen2segMAEInstancePipeline expects a list of images
+                # output_type="np" returns a NumPy array
+                pipe_output = pipe_mae([image_rgb], output_type="np")
+                # Prediction is (batch_size, height, width, 3) for MAE
+                prediction_np = pipe_output.prediction[0] # Get the first (and only) image prediction
+            end_time = time.time()
+            print(f"MAE-H Inference completed in {end_time - start_time:.2f} seconds.")
+            if not isinstance(prediction_np, np.ndarray):
+                # This case should ideally not be reached if output_type="np"
+                prediction_np = prediction_np.cpu().numpy()
+            # Ensure it's in the expected (H, W, C) format and uint8
+            if prediction_np.ndim == 3 and prediction_np.shape[-1] == 3: # Expected (H, W, 3)
+                seg_array = prediction_np.astype(np.uint8)
+            else:
+                # Attempt to handle other shapes if necessary, or raise error
+                raise gr.Error(f"Unexpected MAE-H prediction shape: {prediction_np.shape}. Expected (H, W, 3).")
+            # The MAE pipeline already does gamma correction and scaling to 0-255.
+            # It also ensures 3 channels.
+        else:
+            raise gr.Error(f"Invalid model choice: {model_choice}. Please select a valid model.")
+        if seg_array is None:
+             raise gr.Error("Segmentation array was not generated. An unknown error occurred.")
+        print(f"Segmentation array generated with shape: {seg_array.shape}, dtype: {seg_array.dtype}")
+        # Convert numpy array to PIL Image
+        # Handle grayscale or RGB based on seg_array channels
+        if seg_array.ndim == 2: # Grayscale
+            segmented_image_pil = Image.fromarray(seg_array, mode='L')
+        elif seg_array.ndim == 3 and seg_array.shape[-1] == 3: # RGB
+            segmented_image_pil = Image.fromarray(seg_array, mode='RGB')
+        elif seg_array.ndim == 3 and seg_array.shape[-1] == 1: # Grayscale with channel dim
+            segmented_image_pil = Image.fromarray(seg_array.squeeze(-1), mode='L')
+        else:
+            raise gr.Error(f"Cannot convert seg_array with shape {seg_array.shape} to PIL Image.")
+        # Resize back to original image resolution using LANCZOS for high quality
+        segmented_image_pil = segmented_image_pil.resize(original_resolution, Image.Resampling.LANCZOS)
+        print(f"Segmented image processed. Output size: {segmented_image_pil.size}, mode: {segmented_image_pil.mode}")
+        return segmented_image_pil
+    except Exception as e:
+        print(f"Error during segmentation with {model_choice}: {e}")
+        # Re-raise as gr.Error for Gradio to display, if not already one
+        if not isinstance(e, gr.Error):
+            # It's often helpful to include the type of the original exception
+            error_type = type(e).__name__
+            raise gr.Error(f"An error occurred during segmentation: {error_type} - {str(e)}")
+        else:
+            raise e # Re-raise if it's already a gr.Error
+# --- Gradio Interface ---
+title = "gen2seg: Generative Models Enable Generalizable Instance Segmentation Demo (SD & MAE-H)"
+description = f"""
+<div style="text-align: center; font-family: 'Arial', sans-serif;">
+    <p>Upload an image and choose a model architecture to see the instance segmentation result generated by the respective model. </p>
+    <p>
+        Currently, inference is running on CPU.
+        Performance will be significantly better on GPU.
+    </p>
+    <ul>
+        <li><strong>SD</strong>: Based on Stable Diffusion 2.
+            <a href="https://huggingface.co/{MODEL_IDS['SD']}" target="_blank">Model Link</a>.
+            <em>Approx. CPU inference time: ~1-2 minutes per image.</em>
+        </li>
+        <li><strong>MAE-H</strong>: Based on Masked Autoencoder (Huge).
+            <a href="https://huggingface.co/{MODEL_IDS['MAE-H']}" target="_blank">Model Link</a>.
+            <em>Approx. CPU inference time: ~15-45 seconds per image.</em>
+            If you experience tokenizer artifacts or very dark images, you can use gamma correction to handle this.
+        </li>
+    </ul>
+    <p>
+        For faster inference, please check out our GitHub to run the models locally on a GPU:
+        <a href="https://github.com/UCDvision/gen2seg" target="_blank">https://github.com/UCDvision/gen2seg</a>
+    </p>
+    <p>If the demo experiences issues, please open an issue on our GitHub.</p>
+    <p> If you have not already, please see our webpage at <a href="https://reachomk.github.io/gen2seg" target="_blank">https://reachomk.github.io/gen2seg</a>
+</div>
+"""
+article = """
+"""
+# Define Gradio inputs
+input_image_component = gr.Image(type="pil", label="Input Image")
+model_choice_component = gr.Dropdown(
+    choices=list(MODEL_IDS.keys()),
+    value="SD",  # Default model
+    label="Choose Segmentation Model Architecture"
+)
+# Define Gradio output
+output_image_component = gr.Image(type="pil", label="Segmented Image")
+# Example images (ensure these paths are correct if you upload examples to your Space)
+# For example, if you create an "examples" folder in your Space repo:
+# example_paths = [
+#     os.path.join("examples", "example1.jpg"),
+#     os.path.join("examples", "example2.png")
+# ]
+# Filter out non-existent example files to prevent errors
+# example_paths = [ex for ex in example_paths if os.path.exists(ex)]
+example_paths = [] # Add paths to example images here if you have them
+iface = gr.Interface(
+    fn=segment_image,
+    inputs=[input_image_component, model_choice_component],
+    outputs=output_image_component,
+    title=title,
+    description=description,
+    article=article,
+    examples=example_paths if example_paths else None, # Pass None if no examples
+    allow_flagging="never",
+    theme=gr.themes.Soft() # Using a soft theme for a slightly modern look
+)
+if __name__ == "__main__":
+    # Optional: Pre-load a default model on startup if desired.
+    # This can make the first inference faster but increases startup time.
+    # print("Attempting to pre-load default SD model on startup...")
+    # try:
+    #    get_sd_pipeline() # Pre-load the default SD model
+    #    print("Default SD model pre-loaded successfully or was already cached.")
+    # except Exception as e:
+    #    print(f"Could not pre-load default SD model: {e}")
+    print("Launching Gradio interface...")
+    iface.launch()

gen2seg_mae_pipeline.py ADDED Viewed

	@@ -0,0 +1,132 @@

+# gen2seg official inference pipeline code for the MAE (ViTMAE) model
+#
+# Please see our project website at https://reachomk.github.io/gen2seg
+#
+# Additionally, if you use our code please cite our paper, along with the two works above.
+from dataclasses import dataclass
+from typing import Union, List, Optional
+import torch
+import numpy as np
+from PIL import Image
+from einops import rearrange
+from diffusers import DiffusionPipeline
+from diffusers.utils import BaseOutput, logging
+from transformers import AutoImageProcessor
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+@dataclass
+class gen2segMAEInstanceOutput(BaseOutput):
+    """
+    Output class for the ViTMAE Instance Segmentation Pipeline.
+    Args:
+        prediction (`np.ndarray` or `torch.Tensor`):
+            Predicted instance segmentation maps. The pipeline permutes its result to
+            channel-last, so the output has shape `(batch_size, height, width, 3)` with
+            pixel values scaled to [0, 255].
+    """
+    prediction: Union[np.ndarray, torch.Tensor]
+class gen2segMAEInstancePipeline(DiffusionPipeline):
+    r"""
+    Pipeline for Instance Segmentation using a fine-tuned ViTMAEForPreTraining model.
+    This pipeline takes one or more input images and returns an instance segmentation
+    prediction for each image. The model is assumed to have been fine-tuned using an instance
+    segmentation loss, and the reconstruction is performed by rearranging the model’s
+    patch logits into an image.
+    Args:
+        model (`ViTMAEForPreTraining`):
+            The fine-tuned ViTMAE model.
+        image_processor (`AutoImageProcessor`):
+            The image processor responsible for preprocessing input images.
+    """
+    def __init__(self, model, image_processor):
+        super().__init__()
+        self.register_modules(model=model, image_processor=image_processor)
+        self.model = model
+        self.image_processor = image_processor
+    def check_inputs(
+        self,
+        image: Union[Image.Image, np.ndarray, torch.Tensor, List[Union[Image.Image, np.ndarray, torch.Tensor]]]
+    ) -> List:
+        if not isinstance(image, list):
+            image = [image]
+        # Additional input validations can be added here if desired.
+        return image
+    @torch.no_grad()
+    def __call__(
+        self,
+        image: Union[Image.Image, np.ndarray, torch.Tensor, List[Union[Image.Image, np.ndarray, torch.Tensor]]],
+        output_type: str = "np",
+        **kwargs
+    ) -> gen2segMAEInstanceOutput:
+        r"""
+        The call method of the pipeline.
+        Args:
+            image (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, or a list of these):
+                The input image(s) for instance segmentation. For arrays/tensors, expected values are in [0, 1].
+            output_type (`str`, optional, defaults to `"np"`):
+                The format of the output prediction. Choose `"np"` for a NumPy array or `"pt"` for a PyTorch tensor.
+            **kwargs:
+                Additional keyword arguments passed to the image processor.
+        Returns:
+            [`gen2segMAEInstanceOutput`]:
+                An output object containing the predicted instance segmentation maps.
+        """
+        # 1. Check and prepare input images.
+        images = self.check_inputs(image)
+        inputs = self.image_processor(images=images, return_tensors="pt", **kwargs)
+        pixel_values = inputs["pixel_values"].to(self.device)
+        # 2. Forward pass through the model.
+        outputs = self.model(pixel_values=pixel_values)
+        logits = outputs.logits  # Expected shape: (B, num_patches, patch_dim)
+        # 3. Retrieve patch size and image size from the model configuration.
+        patch_size = self.model.config.patch_size  # e.g., 16
+        image_size = self.model.config.image_size    # e.g., 224
+        grid_size = image_size // patch_size
+        # 4. Rearrange logits into the reconstructed image.
+        #    The logits are reshaped from (B, num_patches, patch_dim) to (B, 3, H, W).
+        reconstructed = rearrange(
+            logits,
+            "b (h w) (p1 p2 c) -> b c (h p1) (w p2)",
+            h=grid_size,
+            p1=patch_size,
+            p2=patch_size,
+            c=3,
+        )
+        # 5. Post-process the reconstructed output.
+        #    For each sample, shift and scale the prediction to [0, 255].
+        predictions = []
+        for i in range(reconstructed.shape[0]):
+            sample = reconstructed[i]
+            min_val = torch.abs(sample.min())
+            max_val = torch.abs(sample.max())
+            sample = (sample + min_val) / (max_val + min_val + 1e-5)
+            # sometimes the image is very dark so we perform gamma correction to "brighten" it
+            # in practice we can set this value to whatever we want or disable it entirely.
+            sample = sample**0.6
+            sample = sample * 255.0
+            predictions.append(sample)
+        prediction_tensor = torch.stack(predictions, dim=0).permute(0, 2, 3, 1)
+        # 6. Format the output.
+        if output_type == "np":
+            prediction = prediction_tensor.cpu().numpy()
+        else:
+            prediction = prediction_tensor
+        return gen2segMAEInstanceOutput(prediction=prediction)

gen2seg_sd_pipeline.py ADDED Viewed

	@@ -0,0 +1,454 @@

+# gen2seg official inference pipeline code for Stable Diffusion model
+#
+# This code was adapted from Marigold and Diffusion E2E Finetuning.
+#
+# Please see our project website at https://reachomk.github.io/gen2seg
+#
+# Additionally, if you use our code please cite our paper, along with the two works above.
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+import numpy as np
+import torch
+from PIL import Image
+from tqdm.auto import tqdm
+from transformers import CLIPTextModel, CLIPTokenizer
+from diffusers.image_processor import PipelineImageInput
+from diffusers.models import (
+    AutoencoderKL,
+    UNet2DConditionModel,
+)
+from diffusers.schedulers import (
+    DDIMScheduler,
+)
+from diffusers.utils import (
+    BaseOutput,
+    logging,
+)
+from diffusers import DiffusionPipeline
+from diffusers.pipelines.marigold.marigold_image_processing import MarigoldImageProcessor
+# add
+def zeros_tensor(
+    shape: Union[Tuple, List],
+    device: Optional["torch.device"] = None,
+    dtype: Optional["torch.dtype"] = None,
+    layout: Optional["torch.layout"] = None,
+):
+    """
+    A helper function to create tensors of zeros on the desired `device`.
+    Mirrors randn_tensor from diffusers.utils.torch_utils.
+    """
+    layout = layout or torch.strided
+    device = device or torch.device("cpu")
+    latents = torch.zeros(list(shape), dtype=dtype, layout=layout).to(device)
+    return latents
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+@dataclass
+class Gen2SegSDSegOutput(BaseOutput):
+    """
+    Output class for gen2seg Instance Segmentation prediction pipeline.
+    Args:
+        prediction (`np.ndarray`, `torch.Tensor`):
+            Predicted instance segmentation with values in the range [0, 255]. The shape is always $numimages \times 1 \times height
+            \times width$, regardless of whether the images were passed as a 4D array or a list.
+        latent (`None`, `torch.Tensor`):
+            Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
+            The shape is $numimages * numensemble \times 4 \times latentheight \times latentwidth$.
+    """
+    # NOTE(review): the channel-first shape documented above could not be confirmed from
+    # the visible code; comments in app.py describe the default "np" output as
+    # [N, H, W, 1] — confirm which layout is actually returned.
+    prediction: Union[np.ndarray, torch.Tensor]
+    # None or a tensor, per the annotation below.
+    latent: Union[None, torch.Tensor]
+class Gen2SegSDPipeline(DiffusionPipeline):
+    """
+    # add
+    Pipeline for Instance Segmentation prediction using our Stable Diffusion model.
+    Implementation is built upon Marigold: https://marigoldmonodepth.github.io and E2E FT: https://gonzalomartingarcia.github.io/diffusion-e2e-ft/
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+    Args:
+        unet (`UNet2DConditionModel`):
+            Conditional U-Net to denoise the segmentation latent, synthesized from image latent.
+        vae (`AutoencoderKL`):
+            Variational Auto-Encoder (VAE) Model to encode and decode images and predictions to and from latent
+            representations.
+        scheduler (`DDIMScheduler`):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latent.
+        text_encoder (`CLIPTextModel`):
+            Text-encoder, for empty text embedding.
+        tokenizer (`CLIPTokenizer`):
+            CLIP tokenizer.
+        default_processing_resolution (`int`, *optional*):
+            The recommended value of the `processing_resolution` parameter of the pipeline. This value must be set in
+            the model config. When the pipeline is called without explicitly setting `processing_resolution`, the
+            default value is used. This is required to ensure reasonable results with various model flavors trained
+            with varying optimal processing resolution values.
+    """
+    model_cpu_offload_seq = "text_encoder->unet->vae"
+    def __init__(
+        self,
+        unet: UNet2DConditionModel,
+        vae: AutoencoderKL,
+        scheduler: Union[DDIMScheduler],
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        default_processing_resolution: Optional[int] = 768, # add
+    ):
+        super().__init__()
+        self.register_modules(
+            unet=unet,
+            vae=vae,
+            scheduler=scheduler,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+        )
+        self.register_to_config(
+            default_processing_resolution=default_processing_resolution,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.default_processing_resolution = default_processing_resolution
+        self.empty_text_embedding = None
+        self.image_processor = MarigoldImageProcessor(vae_scale_factor=self.vae_scale_factor)
+    def check_inputs(
+        self,
+        image: PipelineImageInput,
+        processing_resolution: int,
+        resample_method_input: str,
+        resample_method_output: str,
+        batch_size: int,
+        output_type: str,
+    ) -> int:
+        if processing_resolution is None:
+            raise ValueError(
+                "`processing_resolution` is not specified and could not be resolved from the model config."
+            )
+        if processing_resolution < 0:
+            raise ValueError(
+                "`processing_resolution` must be non-negative: 0 for native resolution, or any positive value for "
+                "downsampled processing."
+            )
+        if processing_resolution % self.vae_scale_factor != 0:
+            raise ValueError(f"`processing_resolution` must be a multiple of {self.vae_scale_factor}.")
+        if resample_method_input not in ("nearest", "nearest-exact", "bilinear", "bicubic", "area"):
+            raise ValueError(
+                "`resample_method_input` takes string values compatible with PIL library: "
+                "nearest, nearest-exact, bilinear, bicubic, area."
+            )
+        if resample_method_output not in ("nearest", "nearest-exact", "bilinear", "bicubic", "area"):
+            raise ValueError(
+                "`resample_method_output` takes string values compatible with PIL library: "
+                "nearest, nearest-exact, bilinear, bicubic, area."
+            )
+        if batch_size < 1:
+            raise ValueError("`batch_size` must be positive.")
+        if output_type not in ["pt", "np"]:
+            raise ValueError("`output_type` must be one of `pt` or `np`.")
+        # image checks
+        num_images = 0
+        W, H = None, None
+        if not isinstance(image, list):
+            image = [image]
+        for i, img in enumerate(image):
+            if isinstance(img, np.ndarray) or torch.is_tensor(img):
+                if img.ndim not in (2, 3, 4):
+                    raise ValueError(f"`image[{i}]` has unsupported dimensions or shape: {img.shape}.")
+                H_i, W_i = img.shape[-2:]
+                N_i = 1
+                if img.ndim == 4:
+                    N_i = img.shape[0]
+            elif isinstance(img, Image.Image):
+                W_i, H_i = img.size
+                N_i = 1
+            else:
+                raise ValueError(f"Unsupported `image[{i}]` type: {type(img)}.")
+            if W is None:
+                W, H = W_i, H_i
+            elif (W, H) != (W_i, H_i):
+                raise ValueError(
+                    f"Input `image[{i}]` has incompatible dimensions {(W_i, H_i)} with the previous images {(W, H)}"
+                )
+            num_images += N_i
+        return num_images
+    def progress_bar(self, iterable=None, total=None, desc=None, leave=True):
+        if not hasattr(self, "_progress_bar_config"):
+            self._progress_bar_config = {}
+        elif not isinstance(self._progress_bar_config, dict):
+            raise ValueError(
+                f"`self._progress_bar_config` should be of type `dict`, but is {type(self._progress_bar_config)}."
+            )
+        progress_bar_config = dict(**self._progress_bar_config)
+        progress_bar_config["desc"] = progress_bar_config.get("desc", desc)
+        progress_bar_config["leave"] = progress_bar_config.get("leave", leave)
+        if iterable is not None:
+            return tqdm(iterable, **progress_bar_config)
+        elif total is not None:
+            return tqdm(total=total, **progress_bar_config)
+        else:
+            raise ValueError("Either `total` or `iterable` has to be defined.")
+    @torch.no_grad()
+    def __call__(
+        self,
+        image: PipelineImageInput,
+        processing_resolution: Optional[int] = None,
+        match_input_resolution: bool = False,
+        resample_method_input: str = "bilinear",
+        resample_method_output: str = "bilinear",
+        batch_size: int = 1,
+        output_type: str = "np",
+        output_latent: bool = False,
+        return_dict: bool = True,
+    ):
+        """
+        Function invoked when calling the pipeline.
+        Args:
+            image (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`),
+                `List[torch.Tensor]`: An input image or images used as an input for the instance segmentation task. For
+                arrays and tensors, the expected value range is between `[0, 1]`. Passing a batch of images is possible
+                by providing a four-dimensional array or a tensor. Additionally, a list of images of two- or
+                three-dimensional arrays or tensors can be passed. In the latter case, all list elements must have the
+                same width and height.
+            processing_resolution (`int`, *optional*, defaults to `None`):
+                Effective processing resolution. When set to `0`, matches the larger input image dimension. This
+                produces crisper predictions, but may also lead to the overall loss of global context. The default
+                value `None` resolves to the optimal value from the model config.
+            match_input_resolution (`bool`, *optional*, defaults to `False`):
+                When enabled, the output prediction is resized to match the input dimensions. When disabled, the longer
+                side of the output will equal to `processing_resolution`.
+            resample_method_input (`str`, *optional*, defaults to `"bilinear"`):
+                Resampling method used to resize input images to `processing_resolution`. The accepted values are:
+                `"nearest"`, `"nearest-exact"`, `"bilinear"`, `"bicubic"`, or `"area"`.
+            resample_method_output (`str`, *optional*, defaults to `"bilinear"`):
+                Resampling method used to resize output predictions to match the input resolution. The accepted values
+                are `"nearest"`, `"nearest-exact"`, `"bilinear"`, `"bicubic"`, or `"area"`.
+            batch_size (`int`, *optional*, defaults to `1`):
+                Batch size; only matters passing a tensor of images.
+            output_type (`str`, *optional*, defaults to `"np"`):
+                Preferred format of the output's `prediction`. The accepted values are: `"np"` (numpy array) or `"pt"` (torch tensor).
+            output_latent (`bool`, *optional*, defaults to `False`):
+                When enabled, the output's `latent` field contains the latent codes corresponding to the predictions
+                within the ensemble. These codes can be saved, modified, and used for subsequent calls with the
+                `latents` argument.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`Gen2SegSDSegOutput`] instead of a plain tuple.
+        # add
+        E2E FT models are deterministic single step models involving no ensembling, i.e. E=1.
+        """
+        # 0. Resolving variables.
+        device = self._execution_device
+        dtype = self.dtype
+        # Model-specific optimal default values leading to fast and reasonable results.
+        if processing_resolution is None:
+            processing_resolution = self.default_processing_resolution
+        #print(image[0].size)
+        #processing_resolution = 8 * round(max(image[0].size) / 8)
+        # 1. Check inputs.
+        num_images = self.check_inputs(
+            image,
+            processing_resolution,
+            resample_method_input,
+            resample_method_output,
+            batch_size,
+            output_type,
+        )
+        # 2. Prepare empty text conditioning.
+        # Model invocation: self.tokenizer, self.text_encoder.
+        prompt = ""
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="do_not_pad",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids.to(device)
+        self.empty_text_embedding = self.text_encoder(text_input_ids)[0]  # [1,2,1024]
+        # 3. Preprocess input images. This function loads input image or images of compatible dimensions `(H, W)`,
+        # optionally downsamples them to the `processing_resolution` `(PH, PW)`, where
+        # `max(PH, PW) == processing_resolution`, and pads the dimensions to `(PPH, PPW)` such that these values are
+        # divisible by the latent space downscaling factor (typically 8 in Stable Diffusion). The default value `None`
+        # of `processing_resolution` resolves to the optimal value from the model config. It is a recommended mode of
+        # operation and leads to the most reasonable results. Using the native image resolution or any other processing
+        # resolution can lead to loss of either fine details or global context in the output predictions.
+        image, padding, original_resolution = self.image_processor.preprocess(
+            image, processing_resolution, resample_method_input, device, dtype
+        )  # [N,3,PPH,PPW]
+    #     image =(image+torch.abs(image.min()))
+    #     image = image/(torch.abs(image.max())+torch.abs(image.min()))
+    #    # prediction = prediction**0.5
+    #     #prediction = torch.clip(prediction, min=-1, max=1)+1
+    #     image = (image) * 2
+    #     image = image - 1
+        # 4. Encode input image into latent space. At this step, each of the `N` input images is represented with `E`
+        # ensemble members. Each ensemble member is an independent diffused prediction, just initialized independently.
+        # Latents of each such predictions across all input images and all ensemble members are represented in the
+        # `pred_latent` variable. The variable `image_latent` is of the same shape: it contains each input image encoded
+        # into latent space and replicated `E` times. Encoding into latent space happens in batches of size `batch_size`.
+        # Model invocation: self.vae.encoder.
+        image_latent, pred_latent = self.prepare_latents(
+            image, batch_size
+        )  # [N*E,4,h,w], [N*E,4,h,w]
+        del image
+        batch_empty_text_embedding = self.empty_text_embedding.to(device=device, dtype=dtype).repeat(
+            batch_size, 1, 1
+        )  # [B,1024,2]
+        # 5. Process the denoising loop. All `N * E` latents are processed sequentially in batches of size `batch_size`.
+        # The unet model takes concatenated latent spaces of the input image and the predicted modality as an input, and
+        # outputs noise for the predicted modality's latent space.
+        # Model invocation: self.unet.
+        pred_latents = []
+        for i in range(0, num_images, batch_size):
+            batch_image_latent = image_latent[i : i + batch_size]  # [B,4,h,w]
+            batch_pred_latent = batch_image_latent[i : i + batch_size]    # [B,4,h,w]
+            effective_batch_size = batch_image_latent.shape[0]
+            text = batch_empty_text_embedding[:effective_batch_size]  # [B,2,1024]
+            # add
+            # Single step inference for E2E FT models
+            self.scheduler.set_timesteps(1, device=device)
+            for t in self.scheduler.timesteps:
+                batch_latent = batch_image_latent # torch.cat([batch_image_latent, batch_pred_latent], dim=1)  # [B,8,h,w]
+                noise = self.unet(batch_latent, t, encoder_hidden_states=text, return_dict=False)[0]  # [B,4,h,w]
+                batch_pred_latent = self.scheduler.step(
+                    noise, t, batch_image_latent
+                ).pred_original_sample  # [B,4,h,w], # add
+                                                     # directly take pred_original_sample rather than prev_sample
+            pred_latents.append(batch_pred_latent)
+        pred_latent = torch.cat(pred_latents, dim=0)  # [N*E,4,h,w]
+        del (
+            pred_latents,
+            image_latent,
+            batch_empty_text_embedding,
+            batch_image_latent,
+          #  batch_pred_latent,
+            text,
+            batch_latent,
+            noise,
+        )
+        # 6. Decode predictions from latent into pixel space. The resulting `N * E` predictions have shape `(PPH, PPW)`,
+        # which requires slight postprocessing. Decoding into pixel space happens in batches of size `batch_size`.
+        # Model invocation: self.vae.decoder.
+        prediction = torch.cat(
+            [
+                self.decode_prediction(pred_latent[i : i + batch_size])
+                for i in range(0, pred_latent.shape[0], batch_size)
+            ],
+            dim=0,
+        )  # [N*E,1,PPH,PPW]
+        if not output_latent:
+            pred_latent = None
+        # 7. Remove padding. The output shape is (PH, PW).
+        prediction = self.image_processor.unpad_image(prediction, padding)  # [N*E,1,PH,PW]
+        # 9. If `match_input_resolution` is set, the output predictions are upsampled to match the
+        # input resolution `(H, W)`. This step may introduce upsampling artifacts, and therefore can be disabled.
+        # Depending on the downstream use-case, upsampling can be also chosen based on the tolerated artifacts by
+        # setting the `resample_method_output` parameter (e.g., to `"nearest"`).
+        if match_input_resolution:
+            prediction = self.image_processor.resize_antialias(
+                prediction, original_resolution, resample_method_output, is_aa=False
+            )  # [N,1,H,W]
+        # 10. Prepare the final outputs.
+        if output_type == "np":
+            prediction = self.image_processor.pt_to_numpy(prediction)  # [N,H,W,1]
+        # 11. Offload all models
+        self.maybe_free_model_hooks()
+        if not return_dict:
+            return (prediction, pred_latent)
+        return Gen2SegSDSegOutput(
+            prediction=prediction,
+            latent=pred_latent,
+        )
+    def prepare_latents(
+        self,
+        image: torch.Tensor,
+        batch_size: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        def retrieve_latents(encoder_output):
+            if hasattr(encoder_output, "latent_dist"):
+                return encoder_output.latent_dist.mode()
+            elif hasattr(encoder_output, "latents"):
+                return encoder_output.latents
+            else:
+                raise AttributeError("Could not access latents of provided encoder_output")
+        image_latent = torch.cat(
+            [
+                retrieve_latents(self.vae.encode(image[i : i + batch_size]))
+                for i in range(0, image.shape[0], batch_size)
+            ],
+            dim=0,
+        )  # [N,4,h,w]
+        image_latent = image_latent * self.vae.config.scaling_factor # [N*E,4,h,w]
+        # add
+        # provide zeros as noised latent
+        pred_latent = zeros_tensor(
+            image_latent.shape,
+            device=image_latent.device,
+            dtype=image_latent.dtype,
+        )  # [N*E,4,h,w]
+        return image_latent, pred_latent
+    def decode_prediction(self, pred_latent: torch.Tensor) -> torch.Tensor:
+        if pred_latent.dim() != 4 or pred_latent.shape[1] != self.vae.config.latent_channels:
+            raise ValueError(
+                f"Expecting 4D tensor of shape [B,{self.vae.config.latent_channels},H,W]; got {pred_latent.shape}."
+            )
+        prediction = self.vae.decode(pred_latent / self.vae.config.scaling_factor, return_dict=False)[0]  # [B,3,H,W]
+        #print(prediction.max())
+        #print(prediction.min())
+        prediction =(prediction+torch.abs(prediction.min()))
+        prediction = prediction/(torch.abs(prediction.max())+torch.abs(prediction.min()))
+        #prediction = prediction**0.5
+        #prediction = torch.clip(prediction, min=-1, max=1)+1
+        prediction = (prediction) * 255.0
+        #print(prediction.max())
+        #print(prediction.min())
+        return prediction  # [B,1,H,W]

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+gradio
+torch
+torchvision
+Pillow
+numpy
+diffusers
+transformers
+einops
+tqdm
+safetensors