Spaces:

1inkusFace
/

StableDiffusion-3.5-Large-IP

Paused

App Files Files Community

1inkusFace committed on Jan 16

Commit

cf4d4ba

verified ·

1 Parent(s): d7dfa1e

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -47

app.py CHANGED Viewed

@@ -38,8 +38,6 @@ torch.backends.cudnn.benchmark = False
 hftoken = os.getenv("HF_TOKEN")
-#image_encoder_path = "google/siglip-so400m-patch14-384"
-#image_encoder_path_b = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
 ipadapter_path = hf_hub_download(repo_id="InstantX/SD3.5-Large-IP-Adapter", filename="ip-adapter.bin")
 model_path = 'ford442/stable-diffusion-3.5-large-bf16'
@@ -82,8 +80,6 @@ pipe = StableDiffusion3Pipeline.from_pretrained(
 pipe.to(device)
-#pipe.to(device=device, dtype=torch.bfloat16)
 upscaler_2 = UpscaleWithModel.from_pretrained("Kim2091/ClearRealityV1").to(torch.device("cuda:0"))
 MAX_SEED = np.iinfo(np.int32).max
@@ -99,7 +95,7 @@ def infer(
     height,
     guidance_scale,
     num_inference_steps,
-    latent_file,  # Add latents file input
     ip_scale,
     image_encoder_path,
     progress=gr.Progress(track_tqdm=True),
@@ -110,30 +106,20 @@ def infer(
     generator = torch.Generator(device='cuda').manual_seed(seed)
     enhanced_prompt = prompt
     enhanced_prompt_2 = prompt
-    if latent_file:  # Check if a latent file is provided
-      #  initial_latents = pipe.prepare_latents(
-      #      batch_size=1,
-      #      num_channels_latents=pipe.transformer.in_channels,
-      #      height=pipe.transformer.config.sample_size[0],
-       #     width=pipe.transformer.config.sample_size[1],
-      #      dtype=pipe.transformer.dtype,
-      #      device=pipe.device,
-      #      generator=generator,
-      #  )
         sd_image_a = Image.open(latent_file.name).convert('RGB')
         print("-- using image file and loading ip-adapter --")
         pipe.init_ipadapter(
         ip_adapter_path=ipadapter_path,
         image_encoder_path=image_encoder_path,
         nb_token=64,
         )
         print('-- generating image --')
-        #with torch.no_grad():
         sd_image = pipe(
             width=width,
             height=height,
-            prompt=enhanced_prompt,  # This conversion is fine
             negative_prompt=negative_prompt_1,
             num_inference_steps=num_inference_steps,
             guidance_scale=guidance_scale,
@@ -147,9 +133,8 @@ def infer(
         upload_to_ftp(rv_path)
     else:
         print('-- generating image --')
-        #with torch.no_grad():
         sd_image = pipe(
-            prompt=prompt,  # This conversion is fine
             prompt_2=enhanced_prompt_2,
             prompt_3=enhanced_prompt,
             negative_prompt=negative_prompt_1,
@@ -159,33 +144,14 @@ def infer(
             num_inference_steps=num_inference_steps,
             width=width,
             height=height,
-         #   latents=None,
-          #  output_type='latent',
             generator=generator,
             max_sequence_length=512
         ).images[0]
         print('-- got image --')
         timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
-        #sd35_image = pipe.vae.decode(sd_image / 0.18215).sample
-       # sd35_image = sd35_image.cpu().permute(0, 2, 3, 1).float().detach().numpy()
-       # sd35_image = (sd35_image * 255).round().astype("uint8")
-       # image_pil = Image.fromarray(sd35_image[0])
-      #  sd35_path = f"sd35_{seed}.png"
-       # image_pil.save(sd35_path,optimize=False,compress_level=0)
-       # upload_to_ftp(sd35_path)
         sd35_path = f"sd35l_{timestamp}.png"
         sd_image.save(sd35_path,optimize=False,compress_level=0)
         upload_to_ftp(sd35_path)
-        # Convert the generated image to a tensor
-    #generated_image_tensor = torch.tensor([np.array(sd_image).transpose(2, 0, 1)]).to('cuda') / 255.0
-    # Encode the generated image into latents
-    #with torch.no_grad():
-    #    generated_latents = pipe.vae.encode(generated_image_tensor.to(torch.bfloat16)).latent_dist.sample().mul_(0.18215)
-    #latent_path = f"sd35m_{seed}.pt"
-    # Save the latents to a .pt file
-    #torch.save(generated_latents, latent_path)
-    #upload_to_ftp(latent_path)
-    #  pipe.unet.to('cpu')
     upscaler_2.to(torch.device('cuda'))
     with torch.no_grad():
         upscale2 = upscaler_2(sd_image, tiling=True, tile_width=256, tile_height=256)
@@ -214,8 +180,8 @@ body{
 with gr.Blocks(theme=gr.themes.Origin(),css=css) as demo:
     with gr.Column(elem_id="col-container"):
-        gr.Markdown(" # Text-to-Text-to-Image StableDiffusion 3.5 Large")
-        expanded_prompt_output = gr.Textbox(label="Prompt", lines=5)  # Add this line
         with gr.Row():
             prompt = gr.Text(
                 label="Prompt",
@@ -227,7 +193,7 @@ with gr.Blocks(theme=gr.themes.Origin(),css=css) as demo:
             run_button = gr.Button("Run", scale=0, variant="primary")
         result = gr.Image(label="Result", show_label=False)
         with gr.Accordion("Advanced Settings", open=True):
-            latent_file = gr.File(label="Image File (optional)")  # Add latents file input
             image_encoder_path = gr.Dropdown(
                 ["google/siglip-so400m-patch14-384", "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"],
                 label="CLIP Model",
@@ -266,28 +232,28 @@ with gr.Blocks(theme=gr.themes.Origin(),css=css) as demo:
                     minimum=256,
                     maximum=MAX_IMAGE_SIZE,
                     step=32,
-                    value=768,  # Replace with defaults that work for your model
                 )
                 height = gr.Slider(
                     label="Height",
                     minimum=256,
                     maximum=MAX_IMAGE_SIZE,
                     step=32,
-                    value=768,  # Replace with defaults that work for your model
                 )
                 guidance_scale = gr.Slider(
                     label="Guidance scale",
                     minimum=0.0,
                     maximum=30.0,
                     step=0.1,
-                    value=4.2,  # Replace with defaults that work for your model
                 )
                 num_inference_steps = gr.Slider(
                     label="Number of inference steps",
                     minimum=1,
                     maximum=500,
                     step=1,
-                    value=220,  # Replace with defaults that work for your model
                 )
             gr.Examples(examples=examples, inputs=[prompt])
         gr.on(
@@ -302,7 +268,7 @@ with gr.Blocks(theme=gr.themes.Origin(),css=css) as demo:
             height,
             guidance_scale,
             num_inference_steps,
-            latent_file,  # Add latent_file to the inputs
             ip_scale,
             image_encoder_path,
         ],

 hftoken = os.getenv("HF_TOKEN")
 ipadapter_path = hf_hub_download(repo_id="InstantX/SD3.5-Large-IP-Adapter", filename="ip-adapter.bin")
 model_path = 'ford442/stable-diffusion-3.5-large-bf16'
 pipe.to(device)
 upscaler_2 = UpscaleWithModel.from_pretrained("Kim2091/ClearRealityV1").to(torch.device("cuda:0"))
 MAX_SEED = np.iinfo(np.int32).max
     height,
     guidance_scale,
     num_inference_steps,
+    latent_file,
     ip_scale,
     image_encoder_path,
     progress=gr.Progress(track_tqdm=True),
     generator = torch.Generator(device='cuda').manual_seed(seed)
     enhanced_prompt = prompt
     enhanced_prompt_2 = prompt
+    if latent_file:
         sd_image_a = Image.open(latent_file.name).convert('RGB')
         print("-- using image file and loading ip-adapter --")
+        sd_image_a.resize((height,width), Image.LANCZOS)
         pipe.init_ipadapter(
         ip_adapter_path=ipadapter_path,
         image_encoder_path=image_encoder_path,
         nb_token=64,
         )
         print('-- generating image --')
         sd_image = pipe(
             width=width,
             height=height,
+            prompt=enhanced_prompt,
             negative_prompt=negative_prompt_1,
             num_inference_steps=num_inference_steps,
             guidance_scale=guidance_scale,
         upload_to_ftp(rv_path)
     else:
         print('-- generating image --')
         sd_image = pipe(
+            prompt=prompt,
             prompt_2=enhanced_prompt_2,
             prompt_3=enhanced_prompt,
             negative_prompt=negative_prompt_1,
             num_inference_steps=num_inference_steps,
             width=width,
             height=height,
             generator=generator,
             max_sequence_length=512
         ).images[0]
         print('-- got image --')
         timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
         sd35_path = f"sd35l_{timestamp}.png"
         sd_image.save(sd35_path,optimize=False,compress_level=0)
         upload_to_ftp(sd35_path)
     upscaler_2.to(torch.device('cuda'))
     with torch.no_grad():
         upscale2 = upscaler_2(sd_image, tiling=True, tile_width=256, tile_height=256)
 with gr.Blocks(theme=gr.themes.Origin(),css=css) as demo:
     with gr.Column(elem_id="col-container"):
+        gr.Markdown(" # StableDiffusion 3.5 Large with IP Adapter")
+        expanded_prompt_output = gr.Textbox(label="Prompt", lines=5)
         with gr.Row():
             prompt = gr.Text(
                 label="Prompt",
             run_button = gr.Button("Run", scale=0, variant="primary")
         result = gr.Image(label="Result", show_label=False)
         with gr.Accordion("Advanced Settings", open=True):
+            latent_file = gr.File(label="Image File (optional)")
             image_encoder_path = gr.Dropdown(
                 ["google/siglip-so400m-patch14-384", "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"],
                 label="CLIP Model",
                     minimum=256,
                     maximum=MAX_IMAGE_SIZE,
                     step=32,
+                    value=768,
                 )
                 height = gr.Slider(
                     label="Height",
                     minimum=256,
                     maximum=MAX_IMAGE_SIZE,
                     step=32,
+                    value=768,
                 )
                 guidance_scale = gr.Slider(
                     label="Guidance scale",
                     minimum=0.0,
                     maximum=30.0,
                     step=0.1,
+                    value=4.2,
                 )
                 num_inference_steps = gr.Slider(
                     label="Number of inference steps",
                     minimum=1,
                     maximum=500,
                     step=1,
+                    value=50,
                 )
             gr.Examples(examples=examples, inputs=[prompt])
         gr.on(
             height,
             guidance_scale,
             num_inference_steps,
+            latent_file,
             ip_scale,
             image_encoder_path,
         ],