pixagram-dev

Runtime error

App Files Files Community

primerz commited on Oct 27

Commit

ee4fca1

verified ·

1 Parent(s): 962b8c2

Update app.py

Browse files

Files changed (1) hide show

app.py +246 -160

app.py CHANGED Viewed

@@ -2,21 +2,25 @@ import spaces  # MUST be first, before any CUDA-related imports
 import gradio as gr
 import torch
 from diffusers import (
-    StableDiffusionXLPipeline,
-    StableDiffusionXLControlNetPipeline,
     ControlNetModel,
     AutoencoderKL,
-    LCMScheduler  # CORRECT SCHEDULER FOR LCM
 )
 from diffusers.models.attention_processor import AttnProcessor2_0
 from insightface.app import FaceAnalysis
 from PIL import Image
 import numpy as np
 import cv2
-from transformers import pipeline as transformers_pipeline
 from huggingface_hub import hf_hub_download
 import os
 # Configuration
 MODEL_REPO = "primerz/pixagram"
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -30,9 +34,10 @@ print(f"Loading models from: {MODEL_REPO}")
 print(f"LORA Trigger Word: {TRIGGER_WORD}")
 class RetroArtConverter:
-    def __init__(self):
         self.device = device
         self.dtype = dtype
         self.models_loaded = {
             'custom_checkpoint': False,
             'lora': False,
@@ -40,7 +45,7 @@ class RetroArtConverter:
         }
         # Initialize face analysis for InstantID
-        print("Loading face analysis model...")
         try:
             self.face_app = FaceAnalysis(
                 name='antelopev2',
@@ -55,14 +60,7 @@ class RetroArtConverter:
             self.face_app = None
             self.face_detection_enabled = False
-        # Load ControlNet for depth
-        print("Loading ControlNet depth model...")
-        self.controlnet_depth = ControlNetModel.from_pretrained(
-            "diffusers/controlnet-zoe-depth-sdxl-1.0",
-            torch_dtype=self.dtype
-        ).to(self.device)
-        # Load InstantID ControlNet (optional)
         print("Loading InstantID ControlNet...")
         try:
             self.controlnet_instantid = ControlNetModel.from_pretrained(
@@ -78,50 +76,82 @@ class RetroArtConverter:
             self.controlnet_instantid = None
             self.instantid_enabled = False
-        # Load depth estimator
-        print("Loading depth estimator...")
-        self.depth_estimator = transformers_pipeline(
-            'depth-estimation',
-            model="Intel/dpt-hybrid-midas",
-            device=self.device if self.device == "cuda" else -1
-        )
         # Determine which controlnets to use
         if self.instantid_enabled and self.controlnet_instantid is not None:
-            controlnets = [self.controlnet_depth, self.controlnet_instantid]
-            print(f"Initializing with multiple ControlNets: Depth + InstantID")
         else:
             controlnets = self.controlnet_depth
-            print(f"Initializing with single ControlNet: Depth only")
         # Load SDXL checkpoint from HuggingFace Hub
-        # NOTE: VAE is bundled in the checkpoint, don't load separately!
-        print("Loading SDXL checkpoint (horizon) with bundled VAE from HuggingFace Hub...")
         try:
             model_path = hf_hub_download(
                 repo_id=MODEL_REPO,
                 filename="horizon.safetensors",
                 repo_type="model"
             )
-            self.pipe = StableDiffusionXLControlNetPipeline.from_single_file(
                 model_path,
                 controlnet=controlnets,
                 torch_dtype=self.dtype,
                 use_safetensors=True
             ).to(self.device)
-            print("✓ Custom checkpoint loaded successfully (VAE bundled)")
             self.models_loaded['custom_checkpoint'] = True
         except Exception as e:
             print(f"⚠️ Could not load custom checkpoint: {e}")
             print("Using default SDXL base model")
-            self.pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
                 "stabilityai/stable-diffusion-xl-base-1.0",
                 controlnet=controlnets,
                 torch_dtype=self.dtype,
                 use_safetensors=True
             ).to(self.device)
             self.models_loaded['custom_checkpoint'] = False
         # Load LORA from HuggingFace Hub
         print("Loading LORA (retroart) from HuggingFace Hub...")
         try:
@@ -138,11 +168,18 @@ class RetroArtConverter:
             print(f"⚠️ Could not load LORA: {e}")
             self.models_loaded['lora'] = False
-        # CRITICAL: Use LCM Scheduler for this model!
-        print("Setting up LCM scheduler...")
-        self.pipe.scheduler = LCMScheduler.from_config(
-            self.pipe.scheduler.config
-        )
         # Enable attention optimizations
         self.pipe.unet.set_attn_processor(AttnProcessor2_0())
@@ -155,11 +192,6 @@ class RetroArtConverter:
             except Exception as e:
                 print(f"⚠️ xformers not available: {e}")
-        # Set CLIP skip to 2
-        if hasattr(self.pipe, 'text_encoder'):
-            self.clip_skip = 2
-            print(f"✓ CLIP skip set to {self.clip_skip}")
         # Track controlnet configuration
         self.using_multiple_controlnets = isinstance(controlnets, list)
         print(f"Pipeline initialized with {'multiple' if self.using_multiple_controlnets else 'single'} ControlNet(s)")
@@ -171,40 +203,38 @@ class RetroArtConverter:
         print("===================\n")
         print("✓ Model initialization complete!")
-        print("\n=== LCM CONFIGURATION ===")
-        print("Scheduler: LCM")
-        print("Recommended Steps: 12")
-        print("Recommended CFG: 1.0-1.5")
-        print("Recommended Resolution: 896x1152 or 832x1216")
-        print("CLIP Skip: 2")
         print(f"LORA Trigger: '{TRIGGER_WORD}'")
         print("=========================\n")
     def get_depth_map(self, image):
-        """Generate depth map from input image"""
-        depth = self.depth_estimator(image)
-        depth_image = depth['depth']
-        depth_array = np.array(depth_image)
-        # Normalize with percentile clipping
-        depth_min, depth_max = np.percentile(depth_array, [2, 98])
-        depth_normalized = np.clip((depth_array - depth_min) / (depth_max - depth_min + 1e-8), 0, 1) * 255
-        depth_normalized = depth_normalized.astype(np.uint8)
-        # Slight blur to reduce noise
-        depth_normalized = cv2.GaussianBlur(depth_normalized, (3, 3), 0)
-        # Convert to RGB
-        depth_colored = cv2.cvtColor(depth_normalized, cv2.COLOR_GRAY2RGB)
-        return Image.fromarray(depth_colored)
     def calculate_optimal_size(self, original_width, original_height):
         """Calculate optimal size from recommended resolutions"""
         aspect_ratio = original_width / original_height
-        # Recommended resolutions for this model
         recommended_sizes = [
             (896, 1152),  # Portrait
             (1152, 896),  # Landscape
@@ -242,14 +272,15 @@ class RetroArtConverter:
         input_image,
         prompt="retro game character, vibrant colors, detailed",
         negative_prompt="blurry, low quality, ugly, distorted",
-        num_inference_steps=12,  # LCM recommended: 12 steps
-        guidance_scale=1.0,       # LCM recommended: 1.0-1.5
         controlnet_conditioning_scale=0.8,
         lora_scale=1.0,
-        identity_preservation=0.8,
-        image_scale=0.2
     ):
-        """Generate retro art with correct LCM settings"""
         # Add trigger word to prompt
         prompt = self.add_trigger_word(prompt)
@@ -264,26 +295,41 @@ class RetroArtConverter:
         # Resize with high quality
         resized_image = input_image.resize((target_width, target_height), Image.LANCZOS)
-        # Generate depth map
-        print("Generating depth map...")
         depth_image = self.get_depth_map(resized_image)
-        depth_image = depth_image.resize((target_width, target_height), Image.LANCZOS)
         # Handle face detection for InstantID
         using_multiple_controlnets = self.using_multiple_controlnets
         face_embeddings = None
         has_detected_faces = False
-        if using_multiple_controlnets:
-            print("Checking for faces...")
             img_array = np.array(resized_image)
-            faces = self.face_app.get(img_array) if self.face_app is not None else []
             if len(faces) > 0:
                 has_detected_faces = True
                 print(f"Detected {len(faces)} face(s)")
-                face = sorted(faces, key=lambda x: (x.bbox[2] - x.bbox[0]) * (x.bbox[3] - x.bbox[1]))[-1]
-                face_embeddings = torch.from_numpy(face.normed_embedding).unsqueeze(0).to(self.device, dtype=self.dtype)
         # Set LORA scale
         if hasattr(self.pipe, 'set_adapters') and self.models_loaded['lora']:
@@ -297,51 +343,52 @@ class RetroArtConverter:
         pipe_kwargs = {
             "prompt": prompt,
             "negative_prompt": negative_prompt,
             "num_inference_steps": num_inference_steps,
             "guidance_scale": guidance_scale,
-            "width": target_width,
-            "height": target_height,
             "generator": torch.Generator(device=self.device).manual_seed(42)
         }
-        # Add CLIP skip
-        if hasattr(self.pipe, 'text_encoder'):
-            pipe_kwargs["clip_skip"] = 2
         # Configure ControlNet inputs
-        if using_multiple_controlnets and has_detected_faces:
-            print("Using Depth + InstantID ControlNets")
-            control_images = [depth_image, resized_image]
-            conditioning_scales = [controlnet_conditioning_scale, image_scale]
-            pipe_kwargs["image"] = control_images
             pipe_kwargs["controlnet_conditioning_scale"] = conditioning_scales
-            if face_embeddings is not None:
-                pipe_kwargs["cross_attention_kwargs"] = {"ip_adapter_image_embeds": [face_embeddings]}
-        elif using_multiple_controlnets and not has_detected_faces:
-            print("Multiple ControlNets available but no faces detected")
             control_images = [depth_image, depth_image]
-            conditioning_scales = [controlnet_conditioning_scale, 0.0]
-            pipe_kwargs["image"] = control_images
             pipe_kwargs["controlnet_conditioning_scale"] = conditioning_scales
         else:
-            print("Using Depth ControlNet only")
-            pipe_kwargs["image"] = depth_image
-            pipe_kwargs["controlnet_conditioning_scale"] = controlnet_conditioning_scale
         # Generate
-        print(f"Generating with LCM: Steps={num_inference_steps}, CFG={guidance_scale}")
         result = self.pipe(**pipe_kwargs)
         return result.images[0]
 # Initialize converter
 print("Initializing RetroArt Converter...")
-converter = RetroArtConverter()
 @spaces.GPU
 def process_image(
@@ -350,25 +397,31 @@ def process_image(
     negative_prompt,
     steps,
     guidance_scale,
     controlnet_scale,
     lora_scale,
-    identity_preservation,
-    image_scale
 ):
     if image is None:
         return None
     try:
         result = converter.generate_retro_art(
             input_image=image,
             prompt=prompt,
             negative_prompt=negative_prompt,
             num_inference_steps=int(steps),
             guidance_scale=guidance_scale,
             controlnet_conditioning_scale=controlnet_scale,
             lora_scale=lora_scale,
-            identity_preservation=identity_preservation,
-            image_scale=image_scale
         )
         return result
     except Exception as e:
@@ -378,36 +431,26 @@ def process_image(
         raise gr.Error(f"Generation failed: {str(e)}")
 # Gradio UI
-with gr.Blocks(title="RetroArt Converter - LCM", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
-    # 🎮 RetroArt Converter (LCM Optimized)
-    Convert images into retro pixel art style using LCM (Latent Consistency Model) for fast, high-quality generation!
-    **✨ Features:**
-    - ⚡ Ultra-fast generation (12 steps!)
-    - 🎨 Custom pixel art LORA with trigger word: `p1x3l4rt, pixel art`
-    - 📐 Optimized resolutions: 896x1152 / 832x1216
-    - 🖼️ Bundled VAE for authentic retro look
-    - 🎯 CLIP Skip 2 for better style
     """)
     # Model status
-    if converter.models_loaded:
-        status_text = "**📦 Loaded Models:**\n"
-        status_text += f"- Custom Checkpoint (Horizon): {'✓ Loaded' if converter.models_loaded['custom_checkpoint'] else '✗ Using SDXL base'}\n"
-        status_text += f"- LORA (RetroArt): {'✓ Loaded' if converter.models_loaded['lora'] else '✗ Disabled'}\n"
-        status_text += f"- InstantID: {'✓ Loaded' if converter.models_loaded['instantid'] else '✗ Disabled'}\n"
-        gr.Markdown(status_text)
-    gr.Markdown(f"""
-    **⚙️ LCM Configuration:**
-    - Scheduler: LCM (Latent Consistency Model)
-    - Recommended Steps: **12** (fast!)
-    - Recommended CFG: **1.0-1.5** (lower than normal)
-    - CLIP Skip: **2**
-    - LORA Trigger: `{TRIGGER_WORD}` (auto-added)
-    """)
     with gr.Row():
         with gr.Column():
@@ -426,31 +469,38 @@ with gr.Blocks(title="RetroArt Converter - LCM", theme=gr.themes.Soft()) as demo
                 lines=2
             )
-            with gr.Accordion("⚡ LCM Settings (Optimized)", open=True):
                 steps = gr.Slider(
                     minimum=4,
-                    maximum=20,
                     value=12,
                     step=1,
-                    label="Inference Steps (LCM works great with just 12!)"
                 )
                 guidance_scale = gr.Slider(
                     minimum=0.5,
-                    maximum=3.0,
                     value=1.0,
                     step=0.1,
-                    label="Guidance Scale (CFG) - LCM uses 1.0-1.5"
                 )
-                controlnet_scale = gr.Slider(
                     minimum=0.3,
-                    maximum=1.2,
-                    value=0.8,
                     step=0.05,
-                    label="ControlNet Depth Scale"
                 )
                 lora_scale = gr.Slider(
                     minimum=0.5,
                     maximum=1.5,
@@ -458,22 +508,32 @@ with gr.Blocks(title="RetroArt Converter - LCM", theme=gr.themes.Soft()) as demo
                     step=0.05,
                     label="RetroArt LORA Scale"
                 )
-            with gr.Accordion("🎭 Identity Settings (for portraits)", open=False):
-                identity_preservation = gr.Slider(
                     minimum=0,
-                    maximum=1.5,
-                    value=0.8,
-                    step=0.1,
-                    label="Identity Preservation"
                 )
-                image_scale = gr.Slider(
                     minimum=0,
                     maximum=1.0,
-                    value=0.2,
                     step=0.05,
-                    label="InstantID Image Scale"
                 )
             generate_btn = gr.Button("🎨 Generate Retro Art", variant="primary", size="lg")
@@ -484,27 +544,53 @@ with gr.Blocks(title="RetroArt Converter - LCM", theme=gr.themes.Soft()) as demo
             gr.Markdown("""
             ### 💡 Tips for Best Results:
-            **For LCM Models:**
-            - ✅ Use **12 steps** (already optimized!)
-            - ✅ Keep CFG at **1.0-1.5** (not 7.5!)
-            - ✅ LORA trigger word is **auto-added**
-            - ✅ Resolution auto-optimized to 896x1152 or 832x1216
-            **For Quality:**
-            - Use high-resolution input images
-            - Be specific in prompts: "16-bit game character" vs "character"
-            - Adjust ControlNet scale: lower = more creative, higher = more faithful
-            **For Style:**
-            - Increase LORA scale (1.0-1.5) for stronger pixel art effect
-            - Try prompts like: "SNES style", "16-bit RPG", "Game Boy advance style"
             """)
     generate_btn.click(
         fn=process_image,
         inputs=[
-            input_image, prompt, negative_prompt, steps, guidance_scale,
-            controlnet_scale, lora_scale, identity_preservation, image_scale
         ],
         outputs=[output_image]
     )

 import gradio as gr
 import torch
 from diffusers import (
     ControlNetModel,
     AutoencoderKL,
+    DPMSolverMultistepScheduler,
+    LCMScheduler
 )
 from diffusers.models.attention_processor import AttnProcessor2_0
 from insightface.app import FaceAnalysis
 from PIL import Image
 import numpy as np
 import cv2
 from huggingface_hub import hf_hub_download
 import os
+# Import the custom img2img pipeline with InstantID
+from pipeline_stable_diffusion_xl_instantid_img2img import StableDiffusionXLInstantIDImg2ImgPipeline, draw_kps
+# Import ZoeDetector for better depth maps
+from controlnet_aux import ZoeDetector
 # Configuration
 MODEL_REPO = "primerz/pixagram"
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"LORA Trigger Word: {TRIGGER_WORD}")
 class RetroArtConverter:
+    def __init__(self, use_lcm=False):
         self.device = device
         self.dtype = dtype
+        self.use_lcm = use_lcm
         self.models_loaded = {
             'custom_checkpoint': False,
             'lora': False,
         }
         # Initialize face analysis for InstantID
+        print("Loading face analysis model (antelopev2)...")
         try:
             self.face_app = FaceAnalysis(
                 name='antelopev2',
             self.face_app = None
             self.face_detection_enabled = False
+        # Load ControlNet for InstantID
         print("Loading InstantID ControlNet...")
         try:
             self.controlnet_instantid = ControlNetModel.from_pretrained(
             self.controlnet_instantid = None
             self.instantid_enabled = False
+        # Load ControlNet for Zoe depth
+        print("Loading Zoe Depth ControlNet...")
+        self.controlnet_depth = ControlNetModel.from_pretrained(
+            "diffusers/controlnet-zoe-depth-sdxl-1.0",
+            torch_dtype=self.dtype
+        ).to(self.device)
+        # Load Zoe depth detector (better than DPT)
+        print("Loading Zoe depth detector...")
+        try:
+            self.zoe_detector = ZoeDetector.from_pretrained("lllyasviel/Annotators")
+            self.zoe_detector.to(self.device)
+            print("✓ Zoe detector loaded successfully")
+        except Exception as e:
+            print(f"⚠️ Could not load Zoe detector: {e}")
+            self.zoe_detector = None
         # Determine which controlnets to use
         if self.instantid_enabled and self.controlnet_instantid is not None:
+            controlnets = [self.controlnet_instantid, self.controlnet_depth]
+            print(f"Initializing with multiple ControlNets: InstantID + Zoe Depth")
         else:
             controlnets = self.controlnet_depth
+            print(f"Initializing with single ControlNet: Zoe Depth only")
+        # Load VAE
+        print("Loading VAE...")
+        self.vae = AutoencoderKL.from_pretrained(
+            "madebyollin/sdxl-vae-fp16-fix",
+            torch_dtype=self.dtype
+        ).to(self.device)
         # Load SDXL checkpoint from HuggingFace Hub
+        print("Loading SDXL checkpoint (horizon) from HuggingFace Hub...")
         try:
             model_path = hf_hub_download(
                 repo_id=MODEL_REPO,
                 filename="horizon.safetensors",
                 repo_type="model"
             )
+            # Use the custom img2img pipeline for better results
+            self.pipe = StableDiffusionXLInstantIDImg2ImgPipeline.from_single_file(
                 model_path,
                 controlnet=controlnets,
+                vae=self.vae,
                 torch_dtype=self.dtype,
                 use_safetensors=True
             ).to(self.device)
+            print("✓ Custom checkpoint loaded successfully")
             self.models_loaded['custom_checkpoint'] = True
         except Exception as e:
             print(f"⚠️ Could not load custom checkpoint: {e}")
             print("Using default SDXL base model")
+            self.pipe = StableDiffusionXLInstantIDImg2ImgPipeline.from_pretrained(
                 "stabilityai/stable-diffusion-xl-base-1.0",
                 controlnet=controlnets,
+                vae=self.vae,
                 torch_dtype=self.dtype,
                 use_safetensors=True
             ).to(self.device)
             self.models_loaded['custom_checkpoint'] = False
+        # Load InstantID IP-Adapter
+        if self.instantid_enabled:
+            print("Loading InstantID IP-Adapter...")
+            try:
+                ip_adapter_path = hf_hub_download(
+                    repo_id="InstantX/InstantID",
+                    filename="ip-adapter.bin"
+                )
+                self.pipe.load_ip_adapter_instantid(ip_adapter_path)
+                self.pipe.set_ip_adapter_scale(0.8)
+                print("✓ InstantID IP-Adapter loaded successfully")
+            except Exception as e:
+                print(f"⚠️ Could not load IP-Adapter: {e}")
         # Load LORA from HuggingFace Hub
         print("Loading LORA (retroart) from HuggingFace Hub...")
         try:
             print(f"⚠️ Could not load LORA: {e}")
             self.models_loaded['lora'] = False
+        # Choose scheduler based on mode
+        if use_lcm:
+            print("Setting up LCM scheduler for fast generation...")
+            self.pipe.scheduler = LCMScheduler.from_config(
+                self.pipe.scheduler.config
+            )
+        else:
+            print("Setting up DPMSolverMultistep scheduler with Karras sigmas for quality...")
+            self.pipe.scheduler = DPMSolverMultistepScheduler.from_config(
+                self.pipe.scheduler.config,
+                use_karras_sigmas=True
+            )
         # Enable attention optimizations
         self.pipe.unet.set_attn_processor(AttnProcessor2_0())
             except Exception as e:
                 print(f"⚠️ xformers not available: {e}")
         # Track controlnet configuration
         self.using_multiple_controlnets = isinstance(controlnets, list)
         print(f"Pipeline initialized with {'multiple' if self.using_multiple_controlnets else 'single'} ControlNet(s)")
         print("===================\n")
         print("✓ Model initialization complete!")
+        if use_lcm:
+            print("\n=== LCM CONFIGURATION ===")
+            print("Scheduler: LCM")
+            print("Recommended Steps: 8-12")
+            print("Recommended CFG: 1.0-1.5")
+            print("Recommended Strength: 0.6-0.8")
+        else:
+            print("\n=== QUALITY CONFIGURATION ===")
+            print("Scheduler: DPMSolverMultistep + Karras")
+            print("Recommended Steps: 25-40")
+            print("Recommended CFG: 5.0-7.5")
+            print("Recommended Strength: 0.4-0.7")
         print(f"LORA Trigger: '{TRIGGER_WORD}'")
         print("=========================\n")
     def get_depth_map(self, image):
+        """Generate depth map from input image using Zoe"""
+        if self.zoe_detector is not None:
+            # Use Zoe detector for better depth maps
+            depth_image = self.zoe_detector(image)
+            return depth_image
+        else:
+            # Fallback to basic conversion
+            img_array = np.array(image.convert('L'))
+            depth_colored = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)
+            return Image.fromarray(depth_colored)
     def calculate_optimal_size(self, original_width, original_height):
         """Calculate optimal size from recommended resolutions"""
         aspect_ratio = original_width / original_height
+        # Recommended resolutions for SDXL
         recommended_sizes = [
             (896, 1152),  # Portrait
             (1152, 896),  # Landscape
         input_image,
         prompt="retro game character, vibrant colors, detailed",
         negative_prompt="blurry, low quality, ugly, distorted",
+        num_inference_steps=25,
+        guidance_scale=5.0,
+        strength=0.6,  # img2img strength
         controlnet_conditioning_scale=0.8,
         lora_scale=1.0,
+        face_strength=0.85,  # InstantID face strength
+        depth_control_scale=0.8  # Zoe depth strength
     ):
+        """Generate retro art using img2img pipeline with face keypoints"""
         # Add trigger word to prompt
         prompt = self.add_trigger_word(prompt)
         # Resize with high quality
         resized_image = input_image.resize((target_width, target_height), Image.LANCZOS)
+        # Generate depth map using Zoe
+        print("Generating Zoe depth map...")
         depth_image = self.get_depth_map(resized_image)
+        if depth_image.size != (target_width, target_height):
+            depth_image = depth_image.resize((target_width, target_height), Image.LANCZOS)
         # Handle face detection for InstantID
         using_multiple_controlnets = self.using_multiple_controlnets
+        face_kps = None
         face_embeddings = None
         has_detected_faces = False
+        if using_multiple_controlnets and self.face_app is not None:
+            print("Detecting faces and extracting keypoints...")
             img_array = np.array(resized_image)
+            faces = self.face_app.get(img_array)
             if len(faces) > 0:
                 has_detected_faces = True
                 print(f"Detected {len(faces)} face(s)")
+                # Get the largest face
+                face = sorted(faces,
+                            key=lambda x: (x.bbox[2] - x.bbox[0]) * (x.bbox[3] - x.bbox[1]))[-1]
+                # Extract face embeddings
+                face_embeddings = torch.from_numpy(face.normed_embedding).unsqueeze(0).to(
+                    self.device, dtype=self.dtype
+                )
+                # Draw keypoints (this shows age, gender, expression)
+                face_kps = draw_kps(resized_image, face.kps)
+                print(f"Face keypoints drawn (age/gender/expression preserved)")
+            else:
+                print("No faces detected in image")
         # Set LORA scale
         if hasattr(self.pipe, 'set_adapters') and self.models_loaded['lora']:
         pipe_kwargs = {
             "prompt": prompt,
             "negative_prompt": negative_prompt,
+            "image": resized_image,  # Original image for img2img
             "num_inference_steps": num_inference_steps,
             "guidance_scale": guidance_scale,
+            "strength": strength,  # img2img denoising strength
             "generator": torch.Generator(device=self.device).manual_seed(42)
         }
         # Configure ControlNet inputs
+        if using_multiple_controlnets and has_detected_faces and face_kps is not None:
+            print("Using InstantID + Zoe Depth ControlNets with face keypoints")
+            control_images = [face_kps, depth_image]
+            conditioning_scales = [face_strength, depth_control_scale]
+            pipe_kwargs["control_image"] = control_images
             pipe_kwargs["controlnet_conditioning_scale"] = conditioning_scales
+            # Add face embeddings through IP-Adapter
+            if face_embeddings is not None and hasattr(self.pipe, 'set_ip_adapter_scale'):
+                pipe_kwargs["ip_adapter_image_embeds"] = [face_embeddings]
+        elif using_multiple_controlnets:
+            print("Multiple ControlNets available but no faces detected - using depth only")
+            # Use depth for both to maintain structure
             control_images = [depth_image, depth_image]
+            conditioning_scales = [0.0, depth_control_scale]  # Disable InstantID
+            pipe_kwargs["control_image"] = control_images
             pipe_kwargs["controlnet_conditioning_scale"] = conditioning_scales
         else:
+            print("Using Zoe Depth ControlNet only")
+            pipe_kwargs["control_image"] = depth_image
+            pipe_kwargs["controlnet_conditioning_scale"] = depth_control_scale
         # Generate
+        mode = "LCM" if self.use_lcm else "Quality"
+        print(f"Generating with {mode} mode: Steps={num_inference_steps}, CFG={guidance_scale}, Strength={strength}")
         result = self.pipe(**pipe_kwargs)
         return result.images[0]
 # Initialize converter
 print("Initializing RetroArt Converter...")
+print("Choose mode: LCM (fast) or Quality (better)")
+converter_lcm = RetroArtConverter(use_lcm=True)
+converter_quality = RetroArtConverter(use_lcm=False)
 @spaces.GPU
 def process_image(
     negative_prompt,
     steps,
     guidance_scale,
+    strength,
     controlnet_scale,
     lora_scale,
+    face_strength,
+    depth_control_scale,
+    use_lcm_mode
 ):
     if image is None:
         return None
     try:
+        # Choose the right converter based on mode
+        converter = converter_lcm if use_lcm_mode else converter_quality
         result = converter.generate_retro_art(
             input_image=image,
             prompt=prompt,
             negative_prompt=negative_prompt,
             num_inference_steps=int(steps),
             guidance_scale=guidance_scale,
+            strength=strength,
             controlnet_conditioning_scale=controlnet_scale,
             lora_scale=lora_scale,
+            face_strength=face_strength,
+            depth_control_scale=depth_control_scale
         )
         return result
     except Exception as e:
         raise gr.Error(f"Generation failed: {str(e)}")
 # Gradio UI
+with gr.Blocks(title="RetroArt Converter - Improved", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
+    # 🎮 RetroArt Converter (Improved with True Img2Img)
+    Convert images into retro pixel art style with **proper face detection** and **gender/age preservation**!
+    **✨ Key Improvements:**
+    - 🎯 **True img2img pipeline** for better structure preservation
+    - 👤 **draw_kps**: Detects and preserves age, gender, expression
+    - 🗺️ **Zoe Depth**: Superior depth estimation
+    - ⚡ **Dual Mode**: Fast LCM or Quality DPM++
+    - 🎨 Custom pixel art LORA with trigger: `p1x3l4rt, pixel art`
     """)
     # Model status
+    status_text = "**📦 Loaded Models (LCM Mode):**\n"
+    status_text += f"- Custom Checkpoint: {'✓ Loaded' if converter_lcm.models_loaded['custom_checkpoint'] else '✗ Using SDXL base'}\n"
+    status_text += f"- LORA (RetroArt): {'✓ Loaded' if converter_lcm.models_loaded['lora'] else '✗ Disabled'}\n"
+    status_text += f"- InstantID: {'✓ Loaded' if converter_lcm.models_loaded['instantid'] else '✗ Disabled'}\n"
+    gr.Markdown(status_text)
     with gr.Row():
         with gr.Column():
                 lines=2
             )
+            use_lcm_mode = gr.Checkbox(
+                label="Use LCM Mode (Fast)",
+                value=True,
+                info="Uncheck for Quality mode (slower but better)"
+            )
+            with gr.Accordion("⚙️ Generation Settings", open=True):
                 steps = gr.Slider(
                     minimum=4,
+                    maximum=50,
                     value=12,
                     step=1,
+                    label="Inference Steps (12 for LCM, 25-40 for Quality)"
                 )
                 guidance_scale = gr.Slider(
                     minimum=0.5,
+                    maximum=15.0,
                     value=1.0,
                     step=0.1,
+                    label="Guidance Scale (1.0-1.5 for LCM, 5-7.5 for Quality)"
                 )
+                strength = gr.Slider(
                     minimum=0.3,
+                    maximum=1.0,
+                    value=0.7,
                     step=0.05,
+                    label="Img2Img Strength (how much to change)"
                 )
+            with gr.Accordion("🎨 Style Settings", open=True):
                 lora_scale = gr.Slider(
                     minimum=0.5,
                     maximum=1.5,
                     step=0.05,
                     label="RetroArt LORA Scale"
                 )
+                controlnet_scale = gr.Slider(
+                    minimum=0.3,
+                    maximum=1.2,
+                    value=0.8,
+                    step=0.05,
+                    label="Overall ControlNet Scale"
+                )
+            with gr.Accordion("👤 Face & Depth Settings", open=False):
+                face_strength = gr.Slider(
                     minimum=0,
+                    maximum=2.0,
+                    value=0.85,
+                    step=0.05,
+                    label="Face Preservation (InstantID)",
+                    info="Higher = better face likeness"
                 )
+                depth_control_scale = gr.Slider(
                     minimum=0,
                     maximum=1.0,
+                    value=0.8,
                     step=0.05,
+                    label="Zoe Depth Control Scale",
+                    info="Higher = more structure preservation"
                 )
             generate_btn = gr.Button("🎨 Generate Retro Art", variant="primary", size="lg")
             gr.Markdown("""
             ### 💡 Tips for Best Results:
+            **Mode Selection:**
+            - ✅ **LCM Mode**: 12 steps, CFG 1.0-1.5, Strength 0.6-0.8 (⚡ fast!)
+            - ✅ **Quality Mode**: 25-40 steps, CFG 5-7.5, Strength 0.4-0.7 (🎨 better!)
+            **Face Preservation:**
+            - System automatically detects faces and draws keypoints
+            - Preserves age, gender, and expression characteristics
+            - Adjust "Face Preservation" slider for control
+            **For Best Quality:**
+            - Use high-resolution input images (min 512px)
+            - For portraits: enable Quality mode + high face strength
+            - For scenes: lower img2img strength for more creativity
+            - Adjust depth control for structure vs creativity balance
+            **Style Control:**
+            - LORA trigger word auto-added for pixel art style
+            - Increase LORA scale (1.2-1.5) for stronger retro effect
+            - Try: "SNES style", "16-bit RPG", "Game Boy advance style"
             """)
+    # Update defaults when switching modes
+    def update_mode_defaults(use_lcm):
+        if use_lcm:
+            return (
+                gr.update(value=12),  # steps
+                gr.update(value=1.0),  # guidance_scale
+                gr.update(value=0.7)  # strength
+            )
+        else:
+            return (
+                gr.update(value=30),  # steps
+                gr.update(value=6.0),  # guidance_scale
+                gr.update(value=0.6)  # strength
+            )
+    use_lcm_mode.change(
+        fn=update_mode_defaults,
+        inputs=[use_lcm_mode],
+        outputs=[steps, guidance_scale, strength]
+    )
     generate_btn.click(
         fn=process_image,
         inputs=[
+            input_image, prompt, negative_prompt, steps, guidance_scale, strength,
+            controlnet_scale, lora_scale, face_strength, depth_control_scale, use_lcm_mode
         ],
         outputs=[output_image]
     )