sitatech committed
Commit 060ca74 · 1 Parent(s): 0651cc5

[vtry] Fix and finalize virtual_try
.gitignore CHANGED

@@ -1,2 +1,3 @@
 __pycache__
-.DS_Store
+.DS_Store
+.env
llm/app.py CHANGED

@@ -59,6 +59,8 @@ def serve_llm():
         str(VLLM_PORT),
         "--api-key",
         os.environ["API_KEY"],
+        "--tensor-parallel-size",
+        str(N_GPU),
     ]

     subprocess.Popen(cmd)
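
The list above is the argv for vLLM's OpenAI-compatible server; the new `--tensor-parallel-size` flag shards the model weights across the node's GPUs. A minimal sketch of how the full command plausibly fits together; the command head and `MODEL_NAME` are assumptions, since the hunk only shows the tail of the list:

    import os
    import subprocess

    MODEL_NAME = "RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic"  # assumed
    VLLM_PORT = 8000  # assumed value; defined elsewhere in the file
    N_GPU = 2         # assumed value; defined elsewhere in the file

    def serve_llm():
        # vLLM's OpenAI-compatible server. --tensor-parallel-size splits each
        # layer across N_GPU GPUs instead of serving from a single device.
        cmd = [
            "vllm", "serve", MODEL_NAME,
            "--port", str(VLLM_PORT),
            "--api-key", os.environ["API_KEY"],
            "--tensor-parallel-size", str(N_GPU),
        ]
        subprocess.Popen(cmd)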
mcp_host/agent.py CHANGED

@@ -44,7 +44,7 @@ If a tool requires an input that you don't have based on your knowledge and the
         self,
         model_name: str = "RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic",
         openai_api_key: str = os.getenv("OPENAI_API_KEY", ""),
-        openai_api_base_url: str = "TODO",
+        openai_api_base_url: str = os.getenv("OPENAI_API_BASE_URL", ""),
         image_uploader: ImageUploader = ImageUploader(),
     ):
         self.agora_client = AgoraMCPClient(unique_name="Agora")
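
This replaces the `"TODO"` placeholder with an environment-driven base URL, letting the agent target the self-hosted vLLM endpoint instead of api.openai.com. A sketch of the wiring these constructor arguments presumably feed into; the `AsyncOpenAI` usage is an assumption, as the diff only shows the parameters:

    import os
    from openai import AsyncOpenAI  # standard OpenAI SDK client

    # E.g. OPENAI_API_BASE_URL=http://<vllm-host>:8000/v1, with OPENAI_API_KEY
    # matching the --api-key that the vLLM server was started with.
    client = AsyncOpenAI(
        api_key=os.getenv("OPENAI_API_KEY", ""),
        base_url=os.getenv("OPENAI_API_BASE_URL", ""),
    )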
mcp_server.py CHANGED

@@ -47,7 +47,7 @@ def try_item_with_masking(
     [IMAGE2] The same skirt is worn by a woman standing in a realistic lifestyle setting, the skirt fits naturally.

     Args:
-        prompt: A prompt for the diffusion model to use for inpainting.
+        prompt: A prompt for the diffusion model to use for inpainting. Be specific, e.g. for a short dress, say short dress, not just dress.
         item_image_url: URL of the item image to try.
         target_image_url: URL of the target image where the item will be tried.
         mask_image_url: Optional URL of a mask image to use.

@@ -85,14 +85,14 @@ def try_item_with_auto_masking(
     [IMAGE2] The same sofa is shown in a living room in a realistic lifestyle setting, the sofa fits in naturally with the room decor.

     For cases where a similar item is present but masking it won't cover enough area for the item to be applied, use a composite mask prompt if you can.
-    For example, if the item is a long-sleeved shirt and the target image is a person wearing a short-sleeved t-shirt, the masking prompt could be "t-shirt, arms".
+    For example, if the item is a long-sleeved shirt and the target image is a person wearing a short-sleeved t-shirt, the masking prompt could be "t-shirt, arms, neck".
     If the item is a dress and the target image is a person wearing a t-shirt and jeans, the masking prompt could be "t-shirt, jeans, arms, legs".
     Make sure the mask prompt includes all the parts where the item will be applied.

     This tool requires a similar item to be present in the target image, so it can generate a mask of the item using the masking_prompt.

     Args:
-        prompt: A prompt for the diffusion model to use for inpainting.
+        prompt: A prompt for the diffusion model to use for inpainting. Be specific, e.g. for a long-sleeved shirt, say long-sleeved shirt, not just shirt.
         item_image_url: URL of the item image to try.
         target_image_url: URL of the target image where the item will be tried.
         masking_prompt: Prompt for generating a mask of the corresponding item in the target image. It needs to be short and descriptive, e.g. "red dress", "blue sofa", "tire", "skirt, legs", etc.
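
To make the composite-masking guidance concrete, here is a hypothetical call to the tool documented above; the direct invocation and the URLs are illustrative only:

    # Hypothetical example: try a long-sleeved shirt on a model who is
    # wearing a short-sleeved t-shirt in the target photo.
    result = try_item_with_auto_masking(
        prompt=(
            "The pair of images highlights a long-sleeved shirt and its styling on a model; "
            "[IMAGE1] Detailed product shot of a long-sleeved shirt. "
            "[IMAGE2] The same shirt is worn by a model in a realistic lifestyle setting."
        ),
        item_image_url="https://example.com/shirt.jpg",    # placeholder
        target_image_url="https://example.com/model.jpg",  # placeholder
        # Composite mask prompt: cover the existing t-shirt plus the bare
        # arms and neck that the new sleeves and collar will occupy.
        masking_prompt="t-shirt, arms, neck",
    )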
virtual_try/.gitignore ADDED

@@ -0,0 +1 @@
+test_data
virtual_try/app.py CHANGED

@@ -14,9 +14,9 @@ with image.imports():
     from nunchaku.utils import get_precision
     from nunchaku.lora.flux.compose import compose_lora

-    from virtual_try.auto_masker import AutoInpaintMaskGenerator
+    from auto_masker import AutoInpaintMaskGenerator

-    TransformType = Callable[[Image.Image | np.ndarray], torch.Tensor]
+    TransformType = Callable[[Image.Image | np.ndarray], torch.Tensor]

 app = modal.App("vibe-shopping")

@@ -120,9 +120,7 @@ class VirtualTryModel:
         mask_tensor = mask_preprocessor(mask)

         # Create concatenated images along the width axis
-        inpaint_image = torch.cat(
-            [item_to_try_tensor, image_tensor], dim=2
-        )
+        inpaint_image = torch.cat([item_to_try_tensor, image_tensor], dim=2)
         extended_mask = torch.cat([torch.zeros_like(mask_tensor), mask_tensor], dim=2)

         prompt = prompt or (

@@ -148,3 +146,50 @@ class VirtualTryModel:
         byte_stream = BytesIO()
         output_image.save(byte_stream, format="WEBP", quality=90)
         return byte_stream.getvalue()
+
+
+###### ------ FOR TESTING PURPOSES ONLY ------ ######
+@app.local_entrypoint()
+def main(twice: bool = True):
+    import time
+    from pathlib import Path
+
+    test_data_dir = Path(__file__).parent / "test_data"
+    with open(test_data_dir / "target_image.jpg", "rb") as f:
+        target_image_bytes = f.read()
+    with open(test_data_dir / "item_to_try.jpg", "rb") as f:
+        item_to_try_bytes = f.read()
+    with open(test_data_dir / "item_to_try2.png", "rb") as f:
+        item_to_try_2_bytes = f.read()
+
+    prompt = (
+        "The pair of images highlights a clothing and its styling on a model, high resolution, 4K, 8K; "
+        "[IMAGE1] Detailed product shot of a clothing "
+        "[IMAGE2] The same cloth is worn by a model in a lifestyle setting."
+    )
+
+    t0 = time.time()
+    image_bytes = VirtualTryModel().try_it.remote(
+        prompt=prompt,
+        image_bytes=target_image_bytes,
+        item_to_try_bytes=item_to_try_bytes,
+        masking_prompt="t-shirt, arms, neck",
+    )
+    output_path = test_data_dir / "output1.jpg"
+    output_path.parent.mkdir(exist_ok=True, parents=True)
+    output_path.write_bytes(image_bytes)
+    print(f"🎨 first inference latency: {time.time() - t0:.2f} seconds")
+
+    if twice:
+        t0 = time.time()
+        image_bytes = VirtualTryModel().try_it.remote(
+            prompt=prompt,
+            image_bytes=target_image_bytes,
+            item_to_try_bytes=item_to_try_2_bytes,
+            masking_prompt="t-shirt, arms",
+        )
+        print(f"🎨 second inference latency: {time.time() - t0:.2f} seconds")
+
+        output_path = test_data_dir / "output2.jpg"
+        output_path.parent.mkdir(exist_ok=True, parents=True)
+        output_path.write_bytes(image_bytes)
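
The inpainting input here is built by placing the product shot and the target photo side by side along the width axis, with the mask zeroed over the product half: the diffusion model repaints only the target half while conditioning on the untouched reference item, which is why the prompts describe an [IMAGE1]/[IMAGE2] pair. A toy shape-level illustration of that layout (dimensions are made up):

    import torch

    # Stand-ins for the preprocessed (C, H, W) tensors in try_it.
    item_to_try_tensor = torch.rand(3, 512, 384)  # reference product shot
    image_tensor = torch.rand(3, 512, 384)        # target photo to edit
    mask_tensor = torch.ones(1, 512, 384)         # 1 = region to repaint

    # dim=2 is the width axis for (C, H, W): a side-by-side canvas.
    inpaint_image = torch.cat([item_to_try_tensor, image_tensor], dim=2)
    # Zero mask over the left (reference) half so only the target half changes.
    extended_mask = torch.cat([torch.zeros_like(mask_tensor), mask_tensor], dim=2)

    assert inpaint_image.shape == (3, 512, 768)
    assert extended_mask.shape == (1, 512, 768)

Since `main` is registered with `@app.local_entrypoint()`, the smoke test runs with `modal run virtual_try/app.py`, provided the `test_data` images exist locally.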
virtual_try/auto_masker.py CHANGED

@@ -1,3 +1,4 @@
+import cv2
 import numpy as np
 from PIL import Image
 from huggingface_hub import hf_hub_download

@@ -46,19 +47,26 @@ class AutoInpaintMaskGenerator:
         )[0]

         masks = result["masks"]  # (N, H, W)
-        scores = result["mask_scores"]  # (N,)
+        scores = np.atleast_1d(result["mask_scores"])  # Ensure it's always at least 1D
+
+        # If only one mask returned, expand dims
+        if masks.ndim == 2:
+            masks = masks[np.newaxis, :, :]  # Make it (1, H, W)

         if len(masks) == 0:
             raise ValueError("No masks found.")

         # Filter masks by score threshold
-        valid_indices = np.where(scores >= threshold)[0]
+        valid_indices = scores >= threshold
         if len(valid_indices) == 0:
             raise ValueError("No masks scored the required threshold.")
-
-        best_idx = valid_indices[np.argmax(scores[valid_indices])]
-        mask = masks[best_idx]
+
+        combined_mask = np.any(masks[valid_indices], axis=0)

         # Convert to uint8 binary mask for inpainting
-        binary_mask = (mask.astype(np.uint8)) * 255  # 0 or 255
-        return binary_mask
+        binary_mask = (combined_mask.astype(np.uint8)) * 255  # 0 or 255
+
+        # Apply dilation
+        kernel = np.ones((10, 10), np.uint8)
+        dilated_mask = cv2.dilate(binary_mask, kernel, iterations=1)
+        return dilated_mask
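
The behavioral change: instead of keeping only the single best-scoring mask, the generator now unions every mask that clears the threshold and dilates the result, so the inpainting region generously covers garment boundaries. A self-contained toy run of that post-processing (array sizes and scores invented); note that once `valid_indices` is a boolean selector, `len()` is always N, so `.any()` is the emptiness check that actually fires:

    import cv2
    import numpy as np

    masks = np.zeros((3, 8, 8), dtype=bool)  # pretend N=3 masks of an 8x8 image
    masks[0, 2:5, 2:5] = True
    masks[1, 4:7, 4:7] = True
    masks[2, 0, 0] = True
    scores = np.array([0.9, 0.8, 0.2])

    valid = scores >= 0.5                     # boolean selector: [True, True, False]
    if not valid.any():                       # strict check; len(valid) is always N
        raise ValueError("No masks scored the required threshold.")
    combined = np.any(masks[valid], axis=0)   # pixel-wise union of the kept masks

    binary = combined.astype(np.uint8) * 255  # 0/255 mask for inpainting
    kernel = np.ones((3, 3), np.uint8)        # the real code uses a 10x10 kernel
    dilated = cv2.dilate(binary, kernel, iterations=1)
    assert (dilated > 0).sum() > (binary > 0).sum()  # dilation grows the region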
virtual_try/configs.py CHANGED

@@ -1,18 +1,23 @@
 import modal
+from pathlib import Path

 image = (
     modal.Image.debian_slim(python_version="3.12")
+    .apt_install("git")
     .pip_install(
         "torch==2.7.0",
         "torchvision",
         "diffusers==0.33.1",
         "transformers==4.52.4",
         "accelerate==1.7.0",
+        "opencv-python-headless",
         "huggingface_hub[hf_transfer]==0.32.4",
-        "git+https://github.com/luca-medeiros/lang-segment-anything.git@e9af744d999d85eb4d0bd59a83342ecdc2bd2461",
-        "https://github.com/mit-han-lab/nunchaku/releases/download/v0.3.0/nunchaku-0.3.0+torch2.7-cp312-cp312-linux_x86_64.whl#sha256=ed28665515075050c8ef1bacd16845b85aa4335f6c760d6fa716d3b090909d8d7",
+        "git+https://github.com/sitatec/lang-segment-anything.git",
+        "https://github.com/mit-han-lab/nunchaku/releases/download/v0.3.1dev20250609/nunchaku-0.3.1.dev20250609+torch2.7-cp312-cp312-linux_x86_64.whl#sha256=1518f6c02358545fd0336a6a74547e2c875603b381d5ce75b1664f981105b141",
     )
     .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
+    .add_local_file(str(Path(__file__).resolve()), "/root/configs.py")
+    .add_local_file(str(Path(__file__).parent / "auto_masker.py"), "/root/auto_masker.py")
 )

 hf_cache_vol = modal.Volume.from_name(

@@ -28,8 +33,6 @@ MINUTE = 60
 modal_class_config = {
     "image": image,
     "gpu": "A100-40GB",
-    "cpu": 4,  # 8vCPUs
-    "memory": 16,  # 16 GB RAM
     "volumes": {
         "/root/.cache/huggingface": hf_cache_vol,
     },
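
The two `add_local_file` calls explain the import change in virtual_try/app.py: both files land in `/root`, the container's working directory, so `configs` and `auto_masker` resolve as flat, top-level modules regardless of the repo's package layout. A stripped-down sketch of the pattern (the module name `helper.py` is hypothetical):

    from pathlib import Path
    import modal

    image = (
        modal.Image.debian_slim(python_version="3.12")
        # Ship a sibling source file into the image; it lands in /root,
        # which is importable inside the container.
        .add_local_file(str(Path(__file__).parent / "helper.py"), "/root/helper.py")
    )

    app = modal.App("add-local-file-demo", image=image)

    @app.function()
    def use_helper():
        import helper  # resolves as a top-level import in the container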