flux-lightning

Runtime error

App Files Files Community

Jordan Legg committed on Aug 8, 2024

Commit

a7d057d

1 Parent(s): d2b0012

check latent shapes before multiplication

Browse files

Files changed (1) hide show

app.py +36 -39

app.py CHANGED Viewed

@@ -5,7 +5,7 @@ import random
 import torch
 from PIL import Image
 from torchvision import transforms
-from diffusers import DiffusionPipeline, AutoencoderKL
 # Constants
 dtype = torch.bfloat16
@@ -13,27 +13,43 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 MAX_SEED = np.iinfo(np.int32).max
 MAX_IMAGE_SIZE = 2048
-# Load models
 pipe = DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=dtype).to(device)
 pipe.enable_model_cpu_offload()
 pipe.vae.enable_slicing()
 pipe.vae.enable_tiling()
-vae = AutoencoderKL.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="vae").to(device)
 def preprocess_image(image, image_size):
     preprocess = transforms.Compose([
         transforms.Resize((image_size, image_size), interpolation=transforms.InterpolationMode.LANCZOS),
         transforms.ToTensor(),
         transforms.Normalize([0.5], [0.5])
     ])
-    image = preprocess(image).unsqueeze(0).to(device, dtype=torch.float32)
     return image
-def encode_image(image):
-    with torch.no_grad():
-        latents = vae.encode(image).latent_dist.sample() * 0.18215
-    return latents
 @spaces.GPU()
 def infer(prompt, init_image=None, seed=42, randomize_seed=False, width=1024, height=1024, num_inference_steps=4, progress=gr.Progress(track_tqdm=True)):
@@ -56,15 +72,21 @@ def infer(prompt, init_image=None, seed=42, randomize_seed=False, width=1024, he
             # img2img case
             init_image = init_image.convert("RGB")
             init_image = preprocess_image(init_image, 1024)  # Using 1024 as FLUX VAE sample size
-            latents = encode_image(init_image)
             latents = torch.nn.functional.interpolate(latents, size=(height // 8, width // 8), mode='bilinear')
-            if latents.shape[1] != pipe.vae.config.latent_channels:
-                conv = torch.nn.Conv2d(latents.shape[1], pipe.vae.config.latent_channels, kernel_size=1).to(device, dtype=dtype)
-                latents = conv(latents.to(dtype))
-            latents = latents.permute(0, 2, 3, 1).contiguous().view(-1, pipe.vae.config.latent_channels)
             image = pipe(
                 prompt=prompt,
@@ -81,30 +103,5 @@ def infer(prompt, init_image=None, seed=42, randomize_seed=False, width=1024, he
         print(f"Error during inference: {e}")
         return Image.new("RGB", (width, height), (255, 0, 0)), seed  # Red fallback image
-# Gradio interface setup
-with gr.Blocks() as demo:
-    with gr.Row():
-        prompt = gr.Textbox(label="Prompt")
-        init_image = gr.Image(label="Initial Image (optional)", type="pil")
-    with gr.Row():
-        generate = gr.Button("Generate")
-    with gr.Row():
-        result = gr.Image(label="Result")
-        seed_output = gr.Number(label="Seed")
-    with gr.Accordion("Advanced Settings", open=False):
-        seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=42)
-        randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
-        width = gr.Slider(label="Width", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=1024)
-        height = gr.Slider(label="Height", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=1024)
-        num_inference_steps = gr.Slider(label="Number of inference steps", minimum=1, maximum=50, step=1, value=4)
-    generate.click(
-        infer,
-        inputs=[prompt, init_image, seed, randomize_seed, width, height, num_inference_steps],
-        outputs=[result, seed_output]
-    )
 demo.launch()

 import torch
 from PIL import Image
 from torchvision import transforms
+from diffusers import DiffusionPipeline
 # Constants
 dtype = torch.bfloat16
 MAX_SEED = np.iinfo(np.int32).max
 MAX_IMAGE_SIZE = 2048
+# Load FLUX model
 pipe = DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=dtype).to(device)
 pipe.enable_model_cpu_offload()
 pipe.vae.enable_slicing()
 pipe.vae.enable_tiling()
 def preprocess_image(image, image_size):
     preprocess = transforms.Compose([
         transforms.Resize((image_size, image_size), interpolation=transforms.InterpolationMode.LANCZOS),
         transforms.ToTensor(),
         transforms.Normalize([0.5], [0.5])
     ])
+    image = preprocess(image).unsqueeze(0).to(device, dtype=dtype)
     return image
+def check_shapes(latents):
+    # Get the shape of the latents
+    latent_shape = latents.shape
+    print(f"Latent shape: {latent_shape}")
+    # Get the expected shape for the transformer input
+    expected_shape = (1, latent_shape[1] * latent_shape[2], latent_shape[3])
+    print(f"Expected transformer input shape: {expected_shape}")
+    # Get the shape of the transformer's weight matrix
+    if hasattr(pipe.transformer, 'text_model'):
+        weight_shape = pipe.transformer.text_model.encoder.layers[0].self_attn.q_proj.weight.shape
+    else:
+        weight_shape = pipe.transformer.encoder.layers[0].self_attn.q_proj.weight.shape
+    print(f"Transformer weight shape: {weight_shape}")
+    # Check if the shapes are compatible for matrix multiplication
+    if expected_shape[1] == weight_shape[1]:
+        print("Shapes are compatible for matrix multiplication.")
+    else:
+        print("Warning: Shapes are not compatible for matrix multiplication.")
+        print(f"Expected: {expected_shape[1]}, Got: {weight_shape[1]}")
 @spaces.GPU()
 def infer(prompt, init_image=None, seed=42, randomize_seed=False, width=1024, height=1024, num_inference_steps=4, progress=gr.Progress(track_tqdm=True)):
             # img2img case
             init_image = init_image.convert("RGB")
             init_image = preprocess_image(init_image, 1024)  # Using 1024 as FLUX VAE sample size
+            # Encode the image using FLUX VAE
+            latents = pipe.vae.encode(init_image).latent_dist.sample() * 0.18215
+            # Ensure latents are the correct shape
             latents = torch.nn.functional.interpolate(latents, size=(height // 8, width // 8), mode='bilinear')
+            # Check shapes before reshaping
+            check_shapes(latents)
+            # Reshape latents to match the expected input shape of the transformer
+            latents = latents.permute(0, 2, 3, 1).contiguous().view(1, -1, pipe.vae.config.latent_channels)
+            # Check shapes after reshaping
+            check_shapes(latents)
             image = pipe(
                 prompt=prompt,
         print(f"Error during inference: {e}")
         return Image.new("RGB", (width, height), (255, 0, 0)), seed  # Red fallback image
 demo.launch()