primerz committed · Commit c358674 · verified · 1 Parent(s): 3e6f23b

Update app.py

Files changed (1):
  app.py  +219 -226

app.py CHANGED
@@ -2,25 +2,22 @@ import spaces  # MUST be first, before any CUDA-related imports
 import gradio as gr
 import torch
 from diffusers import (
+    StableDiffusionXLControlNetImg2ImgPipeline,  # Changed to img2img
     ControlNetModel,
     AutoencoderKL,
-    DPMSolverMultistepScheduler,
-    LCMScheduler
+    LCMScheduler,
+    DPMSolverMultistepScheduler
 )
 from diffusers.models.attention_processor import AttnProcessor2_0
 from insightface.app import FaceAnalysis
 from PIL import Image
 import numpy as np
 import cv2
+import math
+from controlnet_aux import ZoeDetector  # Better depth detection
 from huggingface_hub import hf_hub_download
 import os
 
-# Import the custom img2img pipeline with InstantID
-from pipeline_stable_diffusion_xl_instantid_img2img import StableDiffusionXLInstantIDImg2ImgPipeline, draw_kps
-
-# Import ZoeDetector for better depth maps
-from controlnet_aux import ZoeDetector
-
 # Configuration
 MODEL_REPO = "primerz/pixagram"
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -29,23 +26,61 @@ dtype = torch.float16 if device == "cuda" else torch.float32
 # LORA trigger word
 TRIGGER_WORD = "p1x3l4rt, pixel art"
 
+# Use LCM or DPM++ scheduler
+USE_LCM = True  # Set to False to use DPM++ 2M Karras
+
 print(f"Using device: {device}")
 print(f"Loading models from: {MODEL_REPO}")
 print(f"LORA Trigger Word: {TRIGGER_WORD}")
+print(f"Scheduler: {'LCM' if USE_LCM else 'DPM++ 2M Karras'}")
+
+
+def draw_kps(image_pil, kps, color_list=[(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (255, 0, 255)]):
+    """Draw facial keypoints on image for InstantID ControlNet"""
+    stickwidth = 4
+    limbSeq = np.array([[0, 2], [1, 2], [3, 2], [4, 2]])
+    kps = np.array(kps)
+
+    w, h = image_pil.size
+    out_img = np.zeros([h, w, 3])
+
+    for i in range(len(limbSeq)):
+        index = limbSeq[i]
+        color = color_list[index[0]]
+
+        x = kps[index][:, 0]
+        y = kps[index][:, 1]
+        length = ((x[0] - x[1]) ** 2 + (y[0] - y[1]) ** 2) ** 0.5
+        angle = math.degrees(math.atan2(y[0] - y[1], x[0] - x[1]))
+        polygon = cv2.ellipse2Poly(
+            (int(np.mean(x)), int(np.mean(y))), (int(length / 2), stickwidth), int(angle), 0, 360, 1
+        )
+        out_img = cv2.fillConvexPoly(out_img.copy(), polygon, color)
+    out_img = (out_img * 0.6).astype(np.uint8)
+
+    for idx_kp, kp in enumerate(kps):
+        color = color_list[idx_kp]
+        x, y = kp
+        out_img = cv2.circle(out_img.copy(), (int(x), int(y)), 10, color, -1)
+
+    out_img_pil = Image.fromarray(out_img.astype(np.uint8))
+    return out_img_pil
+
 
 class RetroArtConverter:
-    def __init__(self, use_lcm=False):
+    def __init__(self):
         self.device = device
         self.dtype = dtype
-        self.use_lcm = use_lcm
+        self.use_lcm = USE_LCM
         self.models_loaded = {
             'custom_checkpoint': False,
             'lora': False,
-            'instantid': False
+            'instantid': False,
+            'zoe_depth': False
         }
 
         # Initialize face analysis for InstantID
-        print("Loading face analysis model (antelopev2)...")
+        print("Loading face analysis model...")
        try:
            self.face_app = FaceAnalysis(
                name='antelopev2',
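As a quick sanity check of the draw_kps helper added above, a minimal, hypothetical smoke test (canvas size, keypoint coordinates, and output filename are illustrative, not part of the commit). antelopev2 keypoints arrive in (x, y) pixel order: left eye, right eye, nose, left mouth corner, right mouth corner:

    from PIL import Image

    canvas = Image.new("RGB", (512, 512))
    dummy_kps = [
        (180, 200),  # left eye
        (330, 200),  # right eye
        (256, 280),  # nose
        (205, 360),  # left mouth corner
        (310, 360),  # right mouth corner
    ]
    kps_image = draw_kps(canvas, dummy_kps)  # draw_kps as defined in the hunk above
    kps_image.save("kps_debug.png")          # inspect the conditioning image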
@@ -60,7 +95,25 @@ class RetroArtConverter:
             self.face_app = None
             self.face_detection_enabled = False
 
-        # Load ControlNet for InstantID
+        # Load Zoe Depth detector (better than DPT)
+        print("Loading Zoe Depth detector...")
+        try:
+            self.zoe_depth = ZoeDetector.from_pretrained("lllyasviel/Annotators")
+            self.zoe_depth.to(self.device)
+            print("✓ Zoe Depth loaded successfully")
+            self.models_loaded['zoe_depth'] = True
+        except Exception as e:
+            print(f"⚠️ Zoe Depth not available: {e}")
+            self.zoe_depth = None
+
+        # Load ControlNet for depth
+        print("Loading ControlNet Zoe Depth model...")
+        self.controlnet_depth = ControlNetModel.from_pretrained(
+            "diffusers/controlnet-zoe-depth-sdxl-1.0",
+            torch_dtype=self.dtype
+        ).to(self.device)
+
+        # Load InstantID ControlNet
         print("Loading InstantID ControlNet...")
         try:
             self.controlnet_instantid = ControlNetModel.from_pretrained(
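For context, the Zoe annotator on its own — a hedged sketch using the controlnet_aux API as loaded above (the input path is illustrative):

    from controlnet_aux import ZoeDetector
    from PIL import Image

    zoe = ZoeDetector.from_pretrained("lllyasviel/Annotators")
    source = Image.open("input.jpg").convert("RGB")
    # detect_resolution sets the estimator's working size;
    # image_resolution sets the size of the returned depth map
    depth = zoe(source, detect_resolution=512, image_resolution=1024)
    depth.save("depth.png")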
@@ -76,82 +129,42 @@ class RetroArtConverter:
             self.controlnet_instantid = None
             self.instantid_enabled = False
 
-        # Load ControlNet for Zoe depth
-        print("Loading Zoe Depth ControlNet...")
-        self.controlnet_depth = ControlNetModel.from_pretrained(
-            "diffusers/controlnet-zoe-depth-sdxl-1.0",
-            torch_dtype=self.dtype
-        ).to(self.device)
-
-        # Load Zoe depth detector (better than DPT)
-        print("Loading Zoe depth detector...")
-        try:
-            self.zoe_detector = ZoeDetector.from_pretrained("lllyasviel/Annotators")
-            self.zoe_detector.to(self.device)
-            print("✓ Zoe detector loaded successfully")
-        except Exception as e:
-            print(f"⚠️ Could not load Zoe detector: {e}")
-            self.zoe_detector = None
-
         # Determine which controlnets to use
         if self.instantid_enabled and self.controlnet_instantid is not None:
             controlnets = [self.controlnet_instantid, self.controlnet_depth]
-            print(f"Initializing with multiple ControlNets: InstantID + Zoe Depth")
+            print(f"Initializing with multiple ControlNets: InstantID + Depth")
         else:
             controlnets = self.controlnet_depth
-            print(f"Initializing with single ControlNet: Zoe Depth only")
-
-        # Load VAE
-        print("Loading VAE...")
-        self.vae = AutoencoderKL.from_pretrained(
-            "madebyollin/sdxl-vae-fp16-fix",
-            torch_dtype=self.dtype
-        ).to(self.device)
+            print(f"Initializing with single ControlNet: Depth only")
 
         # Load SDXL checkpoint from HuggingFace Hub
-        print("Loading SDXL checkpoint (horizon) from HuggingFace Hub...")
+        print("Loading SDXL checkpoint (horizon) with bundled VAE from HuggingFace Hub...")
         try:
             model_path = hf_hub_download(
                 repo_id=MODEL_REPO,
                 filename="horizon.safetensors",
                 repo_type="model"
             )
-            # Use the custom img2img pipeline for better results
-            self.pipe = StableDiffusionXLInstantIDImg2ImgPipeline.from_single_file(
+            # Use Img2Img pipeline
+            self.pipe = StableDiffusionXLControlNetImg2ImgPipeline.from_single_file(
                 model_path,
                 controlnet=controlnets,
-                vae=self.vae,
                 torch_dtype=self.dtype,
                 use_safetensors=True
             ).to(self.device)
-            print("✓ Custom checkpoint loaded successfully")
+            print("✓ Custom checkpoint loaded successfully (VAE bundled)")
             self.models_loaded['custom_checkpoint'] = True
         except Exception as e:
             print(f"⚠️ Could not load custom checkpoint: {e}")
             print("Using default SDXL base model")
-            self.pipe = StableDiffusionXLInstantIDImg2ImgPipeline.from_pretrained(
+            self.pipe = StableDiffusionXLControlNetImg2ImgPipeline.from_pretrained(
                 "stabilityai/stable-diffusion-xl-base-1.0",
                 controlnet=controlnets,
-                vae=self.vae,
                 torch_dtype=self.dtype,
                 use_safetensors=True
             ).to(self.device)
             self.models_loaded['custom_checkpoint'] = False
 
-        # Load InstantID IP-Adapter
-        if self.instantid_enabled:
-            print("Loading InstantID IP-Adapter...")
-            try:
-                ip_adapter_path = hf_hub_download(
-                    repo_id="InstantX/InstantID",
-                    filename="ip-adapter.bin"
-                )
-                self.pipe.load_ip_adapter_instantid(ip_adapter_path)
-                self.pipe.set_ip_adapter_scale(0.8)
-                print("✓ InstantID IP-Adapter loaded successfully")
-            except Exception as e:
-                print(f"⚠️ Could not load IP-Adapter: {e}")
-
         # Load LORA from HuggingFace Hub
         print("Loading LORA (retroart) from HuggingFace Hub...")
         try:
@@ -168,14 +181,14 @@ class RetroArtConverter:
             print(f"⚠️ Could not load LORA: {e}")
             self.models_loaded['lora'] = False
 
-        # Choose scheduler based on mode
-        if use_lcm:
-            print("Setting up LCM scheduler for fast generation...")
+        # Setup scheduler based on USE_LCM flag
+        if self.use_lcm:
+            print("Setting up LCM scheduler...")
             self.pipe.scheduler = LCMScheduler.from_config(
                 self.pipe.scheduler.config
             )
         else:
-            print("Setting up DPMSolverMultistep scheduler with Karras sigmas for quality...")
+            print("Setting up DPM++ 2M Karras scheduler...")
             self.pipe.scheduler = DPMSolverMultistepScheduler.from_config(
                 self.pipe.scheduler.config,
                 use_karras_sigmas=True
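The scheduler swap in isolation — a sketch under the assumption that `pipe` is any loaded SDXL pipeline; the step/CFG pairings are the ones this commit recommends:

    from diffusers import DPMSolverMultistepScheduler, LCMScheduler

    def configure_sampling(pipe, use_lcm: bool) -> dict:
        if use_lcm:
            # LCM: very few steps, near-unit CFG
            pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
            return {"num_inference_steps": 12, "guidance_scale": 1.0}
        # DPM++ 2M with Karras sigmas: more steps, conventional CFG
        pipe.scheduler = DPMSolverMultistepScheduler.from_config(
            pipe.scheduler.config, use_karras_sigmas=True
        )
        return {"num_inference_steps": 30, "guidance_scale": 7.5}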
@@ -192,6 +205,11 @@ class RetroArtConverter:
         except Exception as e:
             print(f"⚠️ xformers not available: {e}")
 
+        # Set CLIP skip to 2
+        if hasattr(self.pipe, 'text_encoder'):
+            self.clip_skip = 2
+            print(f"✓ CLIP skip set to {self.clip_skip}")
+
         # Track controlnet configuration
         self.using_multiple_controlnets = isinstance(controlnets, list)
         print(f"Pipeline initialized with {'multiple' if self.using_multiple_controlnets else 'single'} ControlNet(s)")
@@ -203,38 +221,36 @@ class RetroArtConverter:
         print("===================\n")
 
         print("✓ Model initialization complete!")
-        if use_lcm:
-            print("\n=== LCM CONFIGURATION ===")
-            print("Scheduler: LCM")
-            print("Recommended Steps: 8-12")
+        print("\n=== CONFIGURATION ===")
+        print(f"Scheduler: {'LCM' if self.use_lcm else 'DPM++ 2M Karras'}")
+        if self.use_lcm:
+            print("Recommended Steps: 12")
             print("Recommended CFG: 1.0-1.5")
-            print("Recommended Strength: 0.6-0.8")
         else:
-            print("\n=== QUALITY CONFIGURATION ===")
-            print("Scheduler: DPMSolverMultistep + Karras")
-            print("Recommended Steps: 25-40")
-            print("Recommended CFG: 5.0-7.5")
-            print("Recommended Strength: 0.4-0.7")
+            print("Recommended Steps: 30-50")
+            print("Recommended CFG: 7.0-8.0")
+        print("Recommended Resolution: 896x1152 or 832x1216")
+        print("CLIP Skip: 2")
         print(f"LORA Trigger: '{TRIGGER_WORD}'")
-        print("=========================\n")
+        print("=====================\n")
 
     def get_depth_map(self, image):
-        """Generate depth map from input image using Zoe"""
-        if self.zoe_detector is not None:
-            # Use Zoe detector for better depth maps
-            depth_image = self.zoe_detector(image)
+        """Generate depth map using Zoe Depth"""
+        if self.zoe_depth is not None:
+            # Use Zoe detector
+            depth_image = self.zoe_depth(image, detect_resolution=512, image_resolution=1024)
             return depth_image
         else:
-            # Fallback to basic conversion
-            img_array = np.array(image.convert('L'))
-            depth_colored = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)
+            # Fallback to simple grayscale
+            gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
+            depth_colored = cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB)
             return Image.fromarray(depth_colored)
 
     def calculate_optimal_size(self, original_width, original_height):
         """Calculate optimal size from recommended resolutions"""
         aspect_ratio = original_width / original_height
 
-        # Recommended resolutions for SDXL
+        # Recommended resolutions for this model
         recommended_sizes = [
             (896, 1152),   # Portrait
             (1152, 896),   # Landscape
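The selection logic itself is straightforward — a hedged sketch of the idea (only the first two buckets are visible in this hunk; the remainder of the list is assumed from the resolutions mentioned elsewhere in the commit):

    RECOMMENDED = [(896, 1152), (1152, 896), (1024, 1024), (832, 1216), (1216, 832)]

    def closest_bucket(width: int, height: int) -> tuple:
        """Pick the recommended SDXL resolution closest in aspect ratio."""
        ratio = width / height
        return min(RECOMMENDED, key=lambda wh: abs(wh[0] / wh[1] - ratio))

    print(closest_bucket(3000, 4000))  # portrait input -> (896, 1152)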
@@ -272,15 +288,14 @@ class RetroArtConverter:
         input_image,
         prompt="retro game character, vibrant colors, detailed",
         negative_prompt="blurry, low quality, ugly, distorted",
-        num_inference_steps=25,
-        guidance_scale=5.0,
-        strength=0.6,  # img2img strength
+        num_inference_steps=12,
+        guidance_scale=1.0,
         controlnet_conditioning_scale=0.8,
         lora_scale=1.0,
-        face_strength=0.85,  # InstantID face strength
-        depth_control_scale=0.8  # Zoe depth strength
+        identity_preservation=0.8,
+        strength=0.75  # img2img strength
     ):
-        """Generate retro art using img2img pipeline with face keypoints"""
+        """Generate retro art with img2img pipeline"""
 
         # Add trigger word to prompt
         prompt = self.add_trigger_word(prompt)
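The add_trigger_word() body sits outside this diff; a plausible, hypothetical reconstruction consistent with the "auto-added" behavior described further down:

    def add_trigger_word(prompt: str) -> str:
        # hypothetical: prepend the LORA trigger unless the user already typed it
        return prompt if TRIGGER_WORD in prompt else f"{TRIGGER_WORD}, {prompt}"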
@@ -291,6 +306,7 @@ class RetroArtConverter:
 
         print(f"Resizing from {original_width}x{original_height} to {target_width}x{target_height}")
         print(f"Prompt: {prompt}")
+        print(f"Img2Img Strength: {strength}")
 
         # Resize with high quality
         resized_image = input_image.resize((target_width, target_height), Image.LANCZOS)
@@ -303,33 +319,30 @@ class RetroArtConverter:
 
         # Handle face detection for InstantID
         using_multiple_controlnets = self.using_multiple_controlnets
-        face_kps = None
+        face_kps_image = None
         face_embeddings = None
         has_detected_faces = False
 
         if using_multiple_controlnets and self.face_app is not None:
             print("Detecting faces and extracting keypoints...")
-            img_array = np.array(resized_image)
+            img_array = cv2.cvtColor(np.array(resized_image), cv2.COLOR_RGB2BGR)
             faces = self.face_app.get(img_array)
 
             if len(faces) > 0:
                 has_detected_faces = True
                 print(f"Detected {len(faces)} face(s)")
 
-                # Get the largest face
-                face = sorted(faces,
-                              key=lambda x: (x.bbox[2] - x.bbox[0]) * (x.bbox[3] - x.bbox[1]))[-1]
+                # Get largest face
+                face = sorted(faces, key=lambda x: (x.bbox[2] - x.bbox[0]) * (x.bbox[3] - x.bbox[1]))[-1]
 
                 # Extract face embeddings
-                face_embeddings = torch.from_numpy(face.normed_embedding).unsqueeze(0).to(
-                    self.device, dtype=self.dtype
-                )
+                face_embeddings = face.normed_embedding
 
-                # Draw keypoints (this shows age, gender, expression)
-                face_kps = draw_kps(resized_image, face.kps)
-                print(f"Face keypoints drawn (age/gender/expression preserved)")
-            else:
-                print("No faces detected in image")
+                # Draw keypoints
+                face_kps = face.kps
+                face_kps_image = draw_kps(resized_image, face_kps)
+
+                print(f"Face info: bbox={face.bbox}, age={face.age if hasattr(face, 'age') else 'N/A'}, gender={('M' if face.gender == 1 else 'F') if hasattr(face, 'gender') else 'N/A'}")
 
         # Set LORA scale
         if hasattr(self.pipe, 'set_adapters') and self.models_loaded['lora']:
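The detection step in isolation — a hedged sketch of the InsightFace API as used above (the antelopev2 model files must already be available locally; `resized_image` is a PIL image and draw_kps comes from the code above):

    import cv2
    import numpy as np
    from insightface.app import FaceAnalysis

    app = FaceAnalysis(name="antelopev2")
    app.prepare(ctx_id=0, det_size=(640, 640))

    bgr = cv2.cvtColor(np.array(resized_image), cv2.COLOR_RGB2BGR)  # insightface expects BGR
    faces = app.get(bgr)
    if faces:
        # equivalent to the sorted(...)[-1] above: keep the largest bounding box
        face = max(faces, key=lambda f: (f.bbox[2] - f.bbox[0]) * (f.bbox[3] - f.bbox[1]))
        kps_image = draw_kps(resized_image, face.kps)   # 5x2 keypoint array
        embedding = face.normed_embedding               # 512-d identity vector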
@@ -343,52 +356,53 @@ class RetroArtConverter:
         pipe_kwargs = {
             "prompt": prompt,
             "negative_prompt": negative_prompt,
-            "image": resized_image,  # Original image for img2img
+            "image": resized_image,  # img2img source
+            "strength": strength,  # how much to transform
             "num_inference_steps": num_inference_steps,
             "guidance_scale": guidance_scale,
-            "strength": strength,  # img2img denoising strength
             "generator": torch.Generator(device=self.device).manual_seed(42)
         }
 
+        # Add CLIP skip
+        if hasattr(self.pipe, 'text_encoder'):
+            pipe_kwargs["clip_skip"] = 2
+
         # Configure ControlNet inputs
-        if using_multiple_controlnets and has_detected_faces and face_kps is not None:
-            print("Using InstantID + Zoe Depth ControlNets with face keypoints")
-            control_images = [face_kps, depth_image]
-            conditioning_scales = [face_strength, depth_control_scale]
+        if using_multiple_controlnets and has_detected_faces and face_kps_image is not None:
+            print("Using InstantID (keypoints) + Depth ControlNets")
+            # Order: [InstantID, Depth]
+            control_images = [face_kps_image, depth_image]
+            conditioning_scales = [identity_preservation, controlnet_conditioning_scale]
 
             pipe_kwargs["control_image"] = control_images
             pipe_kwargs["controlnet_conditioning_scale"] = conditioning_scales
-
-            # Add face embeddings through IP-Adapter
-            if face_embeddings is not None and hasattr(self.pipe, 'set_ip_adapter_scale'):
-                pipe_kwargs["ip_adapter_image_embeds"] = [face_embeddings]
 
-        elif using_multiple_controlnets:
-            print("Multiple ControlNets available but no faces detected - using depth only")
-            # Use depth for both to maintain structure
+        elif using_multiple_controlnets and not has_detected_faces:
+            print("Multiple ControlNets available but no faces detected, using depth only")
+            # Use depth for both to avoid errors
             control_images = [depth_image, depth_image]
-            conditioning_scales = [0.0, depth_control_scale]  # Disable InstantID
+            conditioning_scales = [0.0, controlnet_conditioning_scale]
 
             pipe_kwargs["control_image"] = control_images
             pipe_kwargs["controlnet_conditioning_scale"] = conditioning_scales
 
         else:
-            print("Using Zoe Depth ControlNet only")
+            print("Using Depth ControlNet only")
             pipe_kwargs["control_image"] = depth_image
-            pipe_kwargs["controlnet_conditioning_scale"] = depth_control_scale
+            pipe_kwargs["controlnet_conditioning_scale"] = controlnet_conditioning_scale
 
         # Generate
-        mode = "LCM" if self.use_lcm else "Quality"
-        print(f"Generating with {mode} mode: Steps={num_inference_steps}, CFG={guidance_scale}, Strength={strength}")
+        scheduler_name = "LCM" if self.use_lcm else "DPM++"
+        print(f"Generating with {scheduler_name}: Steps={num_inference_steps}, CFG={guidance_scale}, Strength={strength}")
         result = self.pipe(**pipe_kwargs)
 
         return result.images[0]
 
+
 # Initialize converter
 print("Initializing RetroArt Converter...")
-print("Choose mode: LCM (fast) or Quality (better)")
-converter_lcm = RetroArtConverter(use_lcm=True)
-converter_quality = RetroArtConverter(use_lcm=False)
+converter = RetroArtConverter()
+
 
 @spaces.GPU
 def process_image(
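Put together, the multi-ControlNet calling convention looks like this — a hedged sketch assuming `pipe`, `resized_image`, `kps_image`, and `depth_image` from the code above; with a list of ControlNets, `control_image` and `controlnet_conditioning_scale` are parallel lists in the same order:

    result = pipe(
        prompt="p1x3l4rt, pixel art, retro game character",
        image=resized_image,                        # img2img source
        strength=0.75,                              # how far to move from the source
        control_image=[kps_image, depth_image],     # [InstantID keypoints, Zoe depth]
        controlnet_conditioning_scale=[0.8, 0.8],   # one scale per ControlNet
        num_inference_steps=12,
        guidance_scale=1.0,
    )
    retro = result.images[0]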
@@ -397,31 +411,25 @@ def process_image(
     negative_prompt,
     steps,
     guidance_scale,
-    strength,
     controlnet_scale,
     lora_scale,
-    face_strength,
-    depth_control_scale,
-    use_lcm_mode
+    identity_preservation,
+    strength
 ):
     if image is None:
         return None
 
     try:
-        # Choose the right converter based on mode
-        converter = converter_lcm if use_lcm_mode else converter_quality
-
         result = converter.generate_retro_art(
             input_image=image,
             prompt=prompt,
             negative_prompt=negative_prompt,
             num_inference_steps=int(steps),
             guidance_scale=guidance_scale,
-            strength=strength,
             controlnet_conditioning_scale=controlnet_scale,
             lora_scale=lora_scale,
-            face_strength=face_strength,
-            depth_control_scale=depth_control_scale
+            identity_preservation=identity_preservation,
+            strength=strength
         )
         return result
     except Exception as e:
@@ -430,27 +438,44 @@ def process_image(
         traceback.print_exc()
         raise gr.Error(f"Generation failed: {str(e)}")
 
+
 # Gradio UI
-with gr.Blocks(title="RetroArt Converter - Improved", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("""
-    # 🎮 RetroArt Converter (Improved with True Img2Img)
+with gr.Blocks(title="RetroArt Converter - Img2Img", theme=gr.themes.Soft()) as demo:
+    gr.Markdown(f"""
+    # 🎮 RetroArt Converter (Img2Img + InstantID)
 
-    Convert images into retro pixel art style with **proper face detection** and **gender/age preservation**!
+    Convert images into retro pixel art style using img2img with face preservation!
 
-    **✨ Key Improvements:**
-    - 🎯 **True img2img pipeline** for better structure preservation
-    - 👤 **draw_kps**: Detects and preserves age, gender, expression
-    - 🗺️ **Zoe Depth**: Superior depth estimation
-    - ⚡ **Dual Mode**: Fast LCM or Quality DPM++
-    - 🎨 Custom pixel art LORA with trigger: `p1x3l4rt, pixel art`
+    **✨ Features:**
+    - 🖼️ **True Img2Img**: Transforms your image while preserving structure
+    - 👤 **InstantID**: Facial keypoint detection with age/gender detection
+    - 🎨 Custom pixel art LORA with trigger word: `{TRIGGER_WORD}`
+    - 🏔️ **Zoe Depth**: Better depth map quality
+    - ⚡ **{'LCM' if USE_LCM else 'DPM++ 2M Karras'}** scheduler
+    - 📏 Optimized resolutions: 896x1152 / 832x1216
+    - 🎯 CLIP Skip 2 for better style
     """)
 
     # Model status
-    status_text = "**📦 Loaded Models (LCM Mode):**\n"
-    status_text += f"- Custom Checkpoint: {'✓ Loaded' if converter_lcm.models_loaded['custom_checkpoint'] else '✗ Using SDXL base'}\n"
-    status_text += f"- LORA (RetroArt): {'✓ Loaded' if converter_lcm.models_loaded['lora'] else '✗ Disabled'}\n"
-    status_text += f"- InstantID: {'✓ Loaded' if converter_lcm.models_loaded['instantid'] else '✗ Disabled'}\n"
-    gr.Markdown(status_text)
+    if converter.models_loaded:
+        status_text = "**📦 Loaded Models:**\n"
+        status_text += f"- Custom Checkpoint (Horizon): {'✓ Loaded' if converter.models_loaded['custom_checkpoint'] else '✗ Using SDXL base'}\n"
+        status_text += f"- LORA (RetroArt): {'✓ Loaded' if converter.models_loaded['lora'] else '✗ Disabled'}\n"
+        status_text += f"- InstantID: {'✓ Loaded' if converter.models_loaded['instantid'] else '✗ Disabled'}\n"
+        status_text += f"- Zoe Depth: {'✓ Loaded' if converter.models_loaded['zoe_depth'] else '✗ Fallback'}\n"
+        gr.Markdown(status_text)
+
+    scheduler_info = f"""
+    **⚙️ Configuration:**
+    - Pipeline: **Img2Img** (better structure preservation)
+    - Scheduler: **{'LCM' if USE_LCM else 'DPM++ 2M Karras'}**
+    - Recommended Steps: **{12 if USE_LCM else '30-50'}**
+    - Recommended CFG: **{1.0 if USE_LCM else '7.0-8.0'}**
+    - CLIP Skip: **2**
+    - LORA Trigger: `{TRIGGER_WORD}` (auto-added)
+    - Face Detection: **Age & Gender detection enabled**
+    """
+    gr.Markdown(scheduler_info)
 
     with gr.Row():
         with gr.Column():
@@ -469,44 +494,29 @@ with gr.Blocks(title="RetroArt Converter - Improved", theme=gr.themes.Soft()) as
                 lines=2
             )
 
-            use_lcm_mode = gr.Checkbox(
-                label="Use LCM Mode (Fast)",
-                value=True,
-                info="Uncheck for Quality mode (slower but better)"
-            )
-
-            with gr.Accordion("⚙️ Generation Settings", open=True):
+            with gr.Accordion(f"⚡ {'LCM' if USE_LCM else 'DPM++'} Settings", open=True):
                 steps = gr.Slider(
                     minimum=4,
                     maximum=50,
-                    value=12,
+                    value=12 if USE_LCM else 30,
                     step=1,
-                    label="Inference Steps (12 for LCM, 25-40 for Quality)"
+                    label=f"Inference Steps ({'LCM works with 12' if USE_LCM else 'DPM++ uses 30-50'})"
                 )
 
                 guidance_scale = gr.Slider(
                     minimum=0.5,
                     maximum=15.0,
-                    value=1.0,
+                    value=1.0 if USE_LCM else 7.5,
                     step=0.1,
-                    label="Guidance Scale (1.0-1.5 for LCM, 5-7.5 for Quality)"
+                    label=f"Guidance Scale (CFG) - {'LCM uses 1.0-1.5' if USE_LCM else 'DPM++ uses 7-8'}"
                 )
 
                 strength = gr.Slider(
                     minimum=0.3,
-                    maximum=1.0,
-                    value=0.7,
+                    maximum=0.95,
+                    value=0.75,
                     step=0.05,
-                    label="Img2Img Strength (how much to change)"
-                )
-
-            with gr.Accordion("🎨 Style Settings", open=True):
-                lora_scale = gr.Slider(
-                    minimum=0.5,
-                    maximum=1.5,
-                    value=1.0,
-                    step=0.05,
-                    label="RetroArt LORA Scale"
+                    label="Img2Img Strength (how much to transform)"
                 )
 
                 controlnet_scale = gr.Slider(
@@ -514,26 +524,24 @@ with gr.Blocks(title="RetroArt Converter - Improved", theme=gr.themes.Soft()) as
                     maximum=1.2,
                     value=0.8,
                     step=0.05,
-                    label="Overall ControlNet Scale"
+                    label="Zoe Depth ControlNet Scale"
                 )
-
-            with gr.Accordion("👤 Face & Depth Settings", open=False):
-                face_strength = gr.Slider(
-                    minimum=0,
-                    maximum=2.0,
-                    value=0.85,
+
+                lora_scale = gr.Slider(
+                    minimum=0.5,
+                    maximum=1.5,
+                    value=1.0,
                     step=0.05,
-                    label="Face Preservation (InstantID)",
-                    info="Higher = better face likeness"
+                    label="RetroArt LORA Scale"
                 )
-
-                depth_control_scale = gr.Slider(
+
+            with gr.Accordion("👤 InstantID Settings (for portraits)", open=False):
+                identity_preservation = gr.Slider(
                     minimum=0,
-                    maximum=1.0,
+                    maximum=1.5,
                     value=0.8,
-                    step=0.05,
-                    label="Zoe Depth Control Scale",
-                    info="Higher = more structure preservation"
+                    step=0.1,
+                    label="Identity/Keypoint Preservation"
                 )
 
             generate_btn = gr.Button("🎨 Generate Retro Art", variant="primary", size="lg")
@@ -541,60 +549,45 @@ with gr.Blocks(title="RetroArt Converter - Improved", theme=gr.themes.Soft()) as
         with gr.Column():
            output_image = gr.Image(label="Retro Art Output")
 
-            gr.Markdown("""
+            gr.Markdown(f"""
            ### 💡 Tips for Best Results:
 
-            **Mode Selection:**
-            - ✅ **LCM Mode**: 12 steps, CFG 1.0-1.5, Strength 0.6-0.8 (⚡ fast!)
-            - ✅ **Quality Mode**: 25-40 steps, CFG 5-7.5, Strength 0.4-0.7 (🎨 better!)
+            **For Img2Img:**
+            - ✅ **Strength 0.7-0.8**: Good balance of transformation and structure
+            - ✅ **Strength 0.5-0.6**: More faithful to original
+            - ✅ **Strength 0.8-0.9**: More creative/stylized
+
+            **For {'LCM' if USE_LCM else 'DPM++'}:**
+            - {'✅ Use **12 steps** (optimized for speed)' if USE_LCM else '✅ Use **30-50 steps** (better quality)'}
+            - {'✅ Keep CFG at **1.0-1.5**' if USE_LCM else '✅ Keep CFG at **7.0-8.0**'}
+            - ✅ LORA trigger word is **auto-added**
+            - ✅ Resolution auto-optimized to 896x1152 or 832x1216
 
-            **Face Preservation:**
-            - System automatically detects faces and draws keypoints
-            - Preserves age, gender, and expression characteristics
-            - Adjust "Face Preservation" slider for control
+            **For Portraits:**
+            - The system detects **age and gender** automatically
+            - Facial **keypoints** are used for better face preservation
+            - Adjust Identity Preservation: lower = more stylized, higher = more realistic face
 
-            **For Best Quality:**
-            - Use high-resolution input images (min 512px)
-            - For portraits: enable Quality mode + high face strength
-            - For scenes: lower img2img strength for more creativity
-            - Adjust depth control for structure vs creativity balance
+            **For Quality:**
+            - Use high-resolution input images
+            - Be specific in prompts: "16-bit game character" vs "character"
+            - Adjust Depth scale: lower = more creative, higher = more faithful depth
 
-            **Style Control:**
-            - LORA trigger word auto-added for pixel art style
-            - Increase LORA scale (1.2-1.5) for stronger retro effect
-            - Try: "SNES style", "16-bit RPG", "Game Boy advance style"
+            **For Style:**
+            - Increase LORA scale (1.0-1.5) for stronger pixel art effect
+            - Try prompts like: "SNES style", "16-bit RPG", "Game Boy advance style"
             """)
 
-    # Update defaults when switching modes
-    def update_mode_defaults(use_lcm):
-        if use_lcm:
-            return (
-                gr.update(value=12),   # steps
-                gr.update(value=1.0),  # guidance_scale
-                gr.update(value=0.7)   # strength
-            )
-        else:
-            return (
-                gr.update(value=30),   # steps
-                gr.update(value=6.0),  # guidance_scale
-                gr.update(value=0.6)   # strength
-            )
-
-    use_lcm_mode.change(
-        fn=update_mode_defaults,
-        inputs=[use_lcm_mode],
-        outputs=[steps, guidance_scale, strength]
-    )
-
     generate_btn.click(
         fn=process_image,
         inputs=[
-            input_image, prompt, negative_prompt, steps, guidance_scale, strength,
-            controlnet_scale, lora_scale, face_strength, depth_control_scale, use_lcm_mode
+            input_image, prompt, negative_prompt, steps, guidance_scale,
+            controlnet_scale, lora_scale, identity_preservation, strength
        ],
        outputs=[output_image]
    )
 
+
 if __name__ == "__main__":
    demo.queue(max_size=20)
    demo.launch(