primerz committed
Commit c6815c0 · verified · 1 parent: 912e6dd

Update app.py

Files changed (1)
  1. app.py +175 -222
app.py CHANGED
@@ -6,12 +6,11 @@ from diffusers import (
6
  StableDiffusionXLControlNetPipeline,
7
  ControlNetModel,
8
  AutoencoderKL,
9
- DPMSolverMultistepScheduler,
10
- EulerAncestralDiscreteScheduler
11
  )
12
  from diffusers.models.attention_processor import AttnProcessor2_0
13
  from insightface.app import FaceAnalysis
14
- from PIL import Image, ImageEnhance, ImageFilter
15
  import numpy as np
16
  import cv2
17
  from transformers import pipeline as transformers_pipeline
@@ -23,8 +22,12 @@ MODEL_REPO = "primerz/pixagram"
23
  device = "cuda" if torch.cuda.is_available() else "cpu"
24
  dtype = torch.float16 if device == "cuda" else torch.float32
25

26
  print(f"Using device: {device}")
27
  print(f"Loading models from: {MODEL_REPO}")
 
28
 
29
  class RetroArtConverter:
30
  def __init__(self):
@@ -32,7 +35,6 @@ class RetroArtConverter:
32
  self.dtype = dtype
33
  self.models_loaded = {
34
  'custom_checkpoint': False,
35
- 'custom_vae': False,
36
  'lora': False,
37
  'instantid': False
38
  }
@@ -50,7 +52,6 @@ class RetroArtConverter:
50
  self.face_detection_enabled = True
51
  except Exception as e:
52
  print(f"⚠️ Face detection not available: {e}")
53
- print("Continuing without face detection")
54
  self.face_app = None
55
  self.face_detection_enabled = False
56
 
@@ -61,7 +62,7 @@ class RetroArtConverter:
61
  torch_dtype=self.dtype
62
  ).to(self.device)
63
 
64
- # Load InstantID ControlNet for identity preservation
65
  print("Loading InstantID ControlNet...")
66
  try:
67
  self.controlnet_instantid = ControlNetModel.from_pretrained(
@@ -74,34 +75,10 @@ class RetroArtConverter:
74
  self.models_loaded['instantid'] = True
75
  except Exception as e:
76
  print(f"⚠️ InstantID ControlNet not available: {e}")
77
- print("Running without InstantID")
78
  self.controlnet_instantid = None
79
  self.instantid_enabled = False
80
 
81
- # Load custom VAE from HuggingFace Hub
82
- print("Loading custom VAE (pixelate) from HuggingFace Hub...")
83
- try:
84
- vae_path = hf_hub_download(
85
- repo_id=MODEL_REPO,
86
- filename="pixelate.safetensors",
87
- repo_type="model"
88
- )
89
- self.vae = AutoencoderKL.from_single_file(
90
- vae_path,
91
- torch_dtype=self.dtype
92
- ).to(self.device)
93
- print("✓ Custom VAE loaded successfully")
94
- self.models_loaded['custom_vae'] = True
95
- except Exception as e:
96
- print(f"⚠️ Could not load custom VAE: {e}")
97
- print("Using high-quality SDXL VAE instead")
98
- self.vae = AutoencoderKL.from_pretrained(
99
- "madebyollin/sdxl-vae-fp16-fix",
100
- torch_dtype=self.dtype
101
- ).to(self.device)
102
- self.models_loaded['custom_vae'] = False
103
-
104
- # Load depth estimator for preprocessing
105
  print("Loading depth estimator...")
106
  self.depth_estimator = transformers_pipeline(
107
  'depth-estimation',
@@ -118,7 +95,8 @@ class RetroArtConverter:
118
  print(f"Initializing with single ControlNet: Depth only")
119
 
120
  # Load SDXL checkpoint from HuggingFace Hub
121
- print("Loading SDXL checkpoint (horizon) from HuggingFace Hub...")
 
122
  try:
123
  model_path = hf_hub_download(
124
  repo_id=MODEL_REPO,
@@ -128,11 +106,10 @@ class RetroArtConverter:
128
  self.pipe = StableDiffusionXLControlNetPipeline.from_single_file(
129
  model_path,
130
  controlnet=controlnets,
131
- vae=self.vae,
132
  torch_dtype=self.dtype,
133
  use_safetensors=True
134
  ).to(self.device)
135
- print("✓ Custom checkpoint loaded successfully")
136
  self.models_loaded['custom_checkpoint'] = True
137
  except Exception as e:
138
  print(f"⚠️ Could not load custom checkpoint: {e}")
@@ -140,7 +117,6 @@ class RetroArtConverter:
140
  self.pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
141
  "stabilityai/stable-diffusion-xl-base-1.0",
142
  controlnet=controlnets,
143
- vae=self.vae,
144
  torch_dtype=self.dtype,
145
  use_safetensors=True
146
  ).to(self.device)
@@ -155,25 +131,23 @@ class RetroArtConverter:
155
  repo_type="model"
156
  )
157
  self.pipe.load_lora_weights(lora_path)
158
- print("✓ LORA loaded successfully")
 
159
  self.models_loaded['lora'] = True
160
  except Exception as e:
161
  print(f"⚠️ Could not load LORA: {e}")
162
- print("Running without LORA")
163
  self.models_loaded['lora'] = False
164
 
165
- # Use EulerAncestral scheduler for better quality
166
- self.pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(
 
167
  self.pipe.scheduler.config
168
  )
169
 
170
- # Disable VAE slicing for better quality (use only if you have VRAM issues)
171
- # self.pipe.enable_vae_slicing()
172
-
173
- # Enable attention slicing for memory efficiency
174
  self.pipe.unet.set_attn_processor(AttnProcessor2_0())
175
 
176
- # Try to enable xformers if available
177
  if self.device == "cuda":
178
  try:
179
  self.pipe.enable_xformers_memory_efficient_attention()
@@ -181,7 +155,12 @@ class RetroArtConverter:
181
  except Exception as e:
182
  print(f"⚠️ xformers not available: {e}")
183
 
184
- # Track whether we're using multiple ControlNets
 
185
  self.using_multiple_controlnets = isinstance(controlnets, list)
186
  print(f"Pipeline initialized with {'multiple' if self.using_multiple_controlnets else 'single'} ControlNet(s)")
187
 
@@ -191,150 +170,133 @@ class RetroArtConverter:
191
  print(f"{model}: {status}")
192
  print("===================\n")
193
 
194
- print("Model initialization complete!")
 
195
 
196
- def enhance_image_quality(self, image):
197
- """Enhance input image quality before processing"""
198
- # Sharpen slightly
199
- enhancer = ImageEnhance.Sharpness(image)
200
- image = enhancer.enhance(1.2)
201
-
202
- # Enhance contrast slightly
203
- enhancer = ImageEnhance.Contrast(image)
204
- image = enhancer.enhance(1.1)
205
-
206
- return image
207
-
208
- def get_depth_map(self, image, enhance=True):
209
- """Generate depth map from input image with quality improvements"""
210
- # Enhance image before depth estimation if needed
211
- if enhance:
212
- image = self.enhance_image_quality(image)
213
-
214
  depth = self.depth_estimator(image)
215
  depth_image = depth['depth']
216
 
217
  depth_array = np.array(depth_image)
218
 
219
- # Better normalization with histogram stretching
220
  depth_min, depth_max = np.percentile(depth_array, [2, 98])
221
  depth_normalized = np.clip((depth_array - depth_min) / (depth_max - depth_min + 1e-8), 0, 1) * 255
222
  depth_normalized = depth_normalized.astype(np.uint8)
223
 
224
- # Apply slight gaussian blur to reduce noise
225
  depth_normalized = cv2.GaussianBlur(depth_normalized, (3, 3), 0)
226
 
227
- # Convert to 3-channel image
228
  depth_colored = cv2.cvtColor(depth_normalized, cv2.COLOR_GRAY2RGB)
229
 
230
  return Image.fromarray(depth_colored)
231
 
232
- def extract_face_embeddings(self, image):
233
- """Extract face embeddings using InsightFace"""
234
- if not self.face_detection_enabled or self.face_app is None:
235
- return None
236
-
237
- try:
238
- img_array = np.array(image)
239
- faces = self.face_app.get(img_array)
240
-
241
- if len(faces) == 0:
242
- return None
243
-
244
- # Use the largest face
245
- face = sorted(faces, key=lambda x: (x.bbox[2] - x.bbox[0]) * (x.bbox[3] - x.bbox[1]))[-1]
246
- return torch.from_numpy(face.normed_embedding).unsqueeze(0)
247
- except Exception as e:
248
- print(f"Face embedding extraction error: {e}")
249
- return None
250
-
251
- def calculate_target_size(self, original_width, original_height, max_dimension=1024):
252
- """Calculate target size maintaining aspect ratio"""
253
  aspect_ratio = original_width / original_height
254
 
255
- if original_width > original_height:
256
- new_width = min(original_width, max_dimension)
257
- new_height = int(new_width / aspect_ratio)
258
- else:
259
- new_height = min(original_height, max_dimension)
260
- new_width = int(new_height * aspect_ratio)
261
-
262
- # Round to nearest multiple of 8
263
- new_width = (new_width // 8) * 8
264
- new_height = (new_height // 8) * 8
265
-
266
- return new_width, new_height
 
267
 
268
  def generate_retro_art(
269
  self,
270
  input_image,
271
- prompt="retro pixel art game, 16-bit style, vibrant colors",
272
- negative_prompt="blurry, low quality, modern, photorealistic, 3d render",
273
- num_inference_steps=40, # Increased for better quality
274
- guidance_scale=7.5,
275
- controlnet_conditioning_scale=0.6, # Reduced for less depth influence
276
- lora_scale=0.85,
277
  identity_preservation=0.8,
278
- image_scale=0.2,
279
- enhance_quality=True # New parameter
280
  ):
281
- """Main generation function with quality improvements"""
 
282
 
283
- # Resize image maintaining aspect ratio
284
  original_width, original_height = input_image.size
285
- target_width, target_height = self.calculate_target_size(original_width, original_height)
286
 
287
  print(f"Resizing from {original_width}x{original_height} to {target_width}x{target_height}")
 
288
 
289
- # Use LANCZOS for high-quality resizing
290
  resized_image = input_image.resize((target_width, target_height), Image.LANCZOS)
291
 
292
- # Optionally enhance image quality
293
- if enhance_quality:
294
- resized_image = self.enhance_image_quality(resized_image)
295
-
296
- # Generate depth map with quality enhancements
297
  print("Generating depth map...")
298
- depth_image = self.get_depth_map(resized_image, enhance=enhance_quality)
299
  depth_image = depth_image.resize((target_width, target_height), Image.LANCZOS)
300
 
301
- # Determine if we're using multiple ControlNets
302
  using_multiple_controlnets = self.using_multiple_controlnets
303
-
304
- # Extract face embeddings if InstantID is enabled
305
  face_embeddings = None
306
  has_detected_faces = False
307
 
308
  if using_multiple_controlnets:
309
- print("Extracting face embeddings...")
310
  img_array = np.array(resized_image)
311
  faces = self.face_app.get(img_array) if self.face_app is not None else []
312
 
313
  if len(faces) > 0:
314
  has_detected_faces = True
315
- print(f"Detected {len(faces)} face(s), using for identity preservation")
316
- # Get the largest face
317
  face = sorted(faces, key=lambda x: (x.bbox[2] - x.bbox[0]) * (x.bbox[3] - x.bbox[1]))[-1]
318
  face_embeddings = torch.from_numpy(face.normed_embedding).unsqueeze(0).to(self.device, dtype=self.dtype)
319
-
320
- # Enhance prompt for face preservation
321
- prompt = f"portrait, detailed face, facial features, {prompt}"
322
 
323
  # Set LORA scale
324
  if hasattr(self.pipe, 'set_adapters') and self.models_loaded['lora']:
325
  try:
326
  self.pipe.set_adapters(["retroart"], adapter_weights=[lora_scale])
327
- print(f"LORA scale set to: {lora_scale}")
328
  except Exception as e:
329
- print(f"Could not set LORA adapters: {e}")
330
-
331
- # Enhanced negative prompt for better quality
332
- enhanced_negative_prompt = f"{negative_prompt}, worst quality, low quality, normal quality, lowres, watermark, signature, text, jpeg artifacts, noise, grainy"
333
 
334
- # Prepare pipeline kwargs
335
  pipe_kwargs = {
336
  "prompt": prompt,
337
- "negative_prompt": enhanced_negative_prompt,
338
  "num_inference_steps": num_inference_steps,
339
  "guidance_scale": guidance_scale,
340
  "width": target_width,
@@ -342,21 +304,24 @@ class RetroArtConverter:
342
  "generator": torch.Generator(device=self.device).manual_seed(42)
343
  }
344
 
345
- # Add control images and scales based on ControlNet configuration
 
346
  if using_multiple_controlnets and has_detected_faces:
347
- print("Using multiple ControlNets (Depth + InstantID)")
348
  control_images = [depth_image, resized_image]
349
  conditioning_scales = [controlnet_conditioning_scale, image_scale]
350
 
351
  pipe_kwargs["image"] = control_images
352
  pipe_kwargs["controlnet_conditioning_scale"] = conditioning_scales
353
 
354
- # Add face embeddings for InstantID IP-Adapter
355
  if face_embeddings is not None:
356
  pipe_kwargs["cross_attention_kwargs"] = {"ip_adapter_image_embeds": [face_embeddings]}
357
 
358
  elif using_multiple_controlnets and not has_detected_faces:
359
- print("Multiple ControlNets available but no faces detected, using depth only")
360
  control_images = [depth_image, depth_image]
361
  conditioning_scales = [controlnet_conditioning_scale, 0.0]
362
 
@@ -364,22 +329,20 @@ class RetroArtConverter:
364
  pipe_kwargs["controlnet_conditioning_scale"] = conditioning_scales
365
 
366
  else:
367
- print("Using single ControlNet (Depth only)")
368
  pipe_kwargs["image"] = depth_image
369
  pipe_kwargs["controlnet_conditioning_scale"] = controlnet_conditioning_scale
370
 
371
- # Generate image
372
- print("Generating retro art...")
373
- print(f"Steps: {num_inference_steps}, Guidance: {guidance_scale}")
374
  result = self.pipe(**pipe_kwargs)
375
 
376
  return result.images[0]
377
 
378
- # Initialize the converter
379
  print("Initializing RetroArt Converter...")
380
  converter = RetroArtConverter()
381
 
382
- # Gradio interface with ZeroGPU support
383
  @spaces.GPU
384
  def process_image(
385
  image,
@@ -390,8 +353,7 @@ def process_image(
390
  controlnet_scale,
391
  lora_scale,
392
  identity_preservation,
393
- image_scale,
394
- enhance_quality
395
  ):
396
  if image is None:
397
  return None
@@ -406,8 +368,7 @@ def process_image(
406
  controlnet_conditioning_scale=controlnet_scale,
407
  lora_scale=lora_scale,
408
  identity_preservation=identity_preservation,
409
- image_scale=image_scale,
410
- enhance_quality=enhance_quality
411
  )
412
  return result
413
  except Exception as e:
@@ -416,87 +377,89 @@ def process_image(
416
  traceback.print_exc()
417
  raise gr.Error(f"Generation failed: {str(e)}")
418
 
419
- # Create Gradio interface
420
- with gr.Blocks(title="RetroArt Converter", theme=gr.themes.Soft()) as demo:
421
  gr.Markdown("""
422
- # 🎮 RetroArt Converter - Quality Enhanced
423
 
424
- Convert any image into retro game art style with improved quality!
425
 
426
- **Features:**
427
- - High-quality depth estimation and preprocessing
428
- - Enhanced prompts for better results
429
- - Custom SDXL checkpoint (Horizon)
430
- - Pixelate VAE for authentic retro look
431
- - RetroArt LORA for style enhancement
432
- - Face preservation with InstantID
433
  """)
434
 
435
- # Model status display
436
  if converter.models_loaded:
437
- status_text = "**Loaded Models:**\n"
438
- status_text += f"- Custom Checkpoint: {'✓' if converter.models_loaded['custom_checkpoint'] else '✗ (using SDXL base)'}\n"
439
- status_text += f"- Custom VAE: {'✓' if converter.models_loaded['custom_vae'] else '✗ (using default VAE)'}\n"
440
- status_text += f"- LORA: {'✓' if converter.models_loaded['lora'] else '✗ (disabled)'}\n"
441
- status_text += f"- InstantID: {'✓' if converter.models_loaded['instantid'] else '✗ (disabled)'}\n"
442
  gr.Markdown(status_text)
443
 
444
  with gr.Row():
445
  with gr.Column():
446
  input_image = gr.Image(label="Input Image", type="pil")
447
 
448
  prompt = gr.Textbox(
449
- label="Prompt",
450
- value="masterpiece, best quality, retro pixel art game, 16-bit style, vibrant colors, highly detailed",
451
- lines=3
 
452
  )
453
 
454
  negative_prompt = gr.Textbox(
455
  label="Negative Prompt",
456
- value="blurry, low quality, modern, photorealistic, 3d render, ugly, distorted, deformed",
457
  lines=2
458
  )
459
 
460
- enhance_quality = gr.Checkbox(
461
- label="Enable Quality Enhancement",
462
- value=True,
463
- info="Sharpen and enhance input image before processing"
464
- )
465
-
466
- with gr.Accordion("Quality Settings", open=True):
467
  steps = gr.Slider(
468
- minimum=20,
469
- maximum=70,
470
- value=40,
471
- step=5,
472
- label="Inference Steps (more = better quality but slower)"
473
  )
474
 
475
  guidance_scale = gr.Slider(
476
- minimum=3,
477
- maximum=15,
478
- value=7.5,
479
- step=0.5,
480
- label="Guidance Scale (how closely to follow prompt)"
481
  )
482
 
483
  controlnet_scale = gr.Slider(
484
- minimum=0,
485
- maximum=1.5,
486
- value=0.6,
487
  step=0.05,
488
- label="ControlNet Depth Scale (lower = more creative)"
489
  )
490
 
491
  lora_scale = gr.Slider(
492
- minimum=0,
493
- maximum=2,
494
- value=0.85,
495
  step=0.05,
496
  label="RetroArt LORA Scale"
497
  )
498
 
499
- with gr.Accordion("Identity Settings (for portraits)", open=False):
500
  identity_preservation = gr.Slider(
501
  minimum=0,
502
  maximum=1.5,
@@ -519,43 +482,33 @@ with gr.Blocks(title="RetroArt Converter", theme=gr.themes.Soft()) as demo:
519
  output_image = gr.Image(label="Retro Art Output")
520
 
521
  gr.Markdown("""
522
- ### Tips for Best Quality:
523
- 1. **Use high-resolution input images** (at least 512x512)
524
- 2. **Increase inference steps** to 50-60 for maximum quality
525
- 3. **Lower ControlNet scale** (0.5-0.6) for more stylization
526
- 4. **Adjust guidance scale:** 7-9 for balanced results
527
- 5. **Enable quality enhancement** for sharper inputs
528
- 6. Try different prompts with quality keywords: "masterpiece, best quality, highly detailed"
 
529
  """)
530
 
531
- gr.Examples(
532
- examples=[
533
- [
534
- "example_portrait.jpg",
535
- "masterpiece, best quality, retro pixel art portrait, 16-bit game character, vibrant colors",
536
- "blurry, modern, low quality",
537
- 40, 7.5, 0.6, 0.85, 0.8, 0.2, True
538
- ],
539
- ],
540
- inputs=[
541
- input_image, prompt, negative_prompt, steps, guidance_scale,
542
- controlnet_scale, lora_scale, identity_preservation, image_scale, enhance_quality
543
- ],
544
- outputs=[output_image],
545
- fn=process_image,
546
- cache_examples=False
547
- )
548
-
549
  generate_btn.click(
550
  fn=process_image,
551
  inputs=[
552
- input_image, prompt, negative_prompt, steps, guidance_scale,
553
- controlnet_scale, lora_scale, identity_preservation, image_scale, enhance_quality
554
  ],
555
  outputs=[output_image]
556
  )
557
 
558
- # Launch with API enabled
559
  if __name__ == "__main__":
560
  demo.queue(max_size=20)
561
  demo.launch(
 
6
  StableDiffusionXLControlNetPipeline,
7
  ControlNetModel,
8
  AutoencoderKL,
9
+ LCMScheduler  # LCM checkpoints require the LCM scheduler
 
10
  )
11
  from diffusers.models.attention_processor import AttnProcessor2_0
12
  from insightface.app import FaceAnalysis
13
+ from PIL import Image
14
  import numpy as np
15
  import cv2
16
  from transformers import pipeline as transformers_pipeline
 
22
  device = "cuda" if torch.cuda.is_available() else "cpu"
23
  dtype = torch.float16 if device == "cuda" else torch.float32
24
 
25
+ # LORA trigger word
26
+ TRIGGER_WORD = "p1x3l4rt, pixel art"
27
+
28
  print(f"Using device: {device}")
29
  print(f"Loading models from: {MODEL_REPO}")
30
+ print(f"LORA Trigger Word: {TRIGGER_WORD}")
31
 
32
  class RetroArtConverter:
33
  def __init__(self):
 
35
  self.dtype = dtype
36
  self.models_loaded = {
37
  'custom_checkpoint': False,
 
38
  'lora': False,
39
  'instantid': False
40
  }
 
52
  self.face_detection_enabled = True
53
  except Exception as e:
54
  print(f"⚠️ Face detection not available: {e}")
 
55
  self.face_app = None
56
  self.face_detection_enabled = False
57
 
 
62
  torch_dtype=self.dtype
63
  ).to(self.device)
64
 
65
+ # Load InstantID ControlNet (optional)
66
  print("Loading InstantID ControlNet...")
67
  try:
68
  self.controlnet_instantid = ControlNetModel.from_pretrained(
 
75
  self.models_loaded['instantid'] = True
76
  except Exception as e:
77
  print(f"⚠️ InstantID ControlNet not available: {e}")
 
78
  self.controlnet_instantid = None
79
  self.instantid_enabled = False
80
 
81
+ # Load depth estimator
 
82
  print("Loading depth estimator...")
83
  self.depth_estimator = transformers_pipeline(
84
  'depth-estimation',
 
95
  print(f"Initializing with single ControlNet: Depth only")
96
 
97
  # Load SDXL checkpoint from HuggingFace Hub
98
+ # NOTE: VAE is bundled in the checkpoint, don't load separately!
99
+ print("Loading SDXL checkpoint (horizon) with bundled VAE from HuggingFace Hub...")
100
  try:
101
  model_path = hf_hub_download(
102
  repo_id=MODEL_REPO,
 
106
  self.pipe = StableDiffusionXLControlNetPipeline.from_single_file(
107
  model_path,
108
  controlnet=controlnets,
 
109
  torch_dtype=self.dtype,
110
  use_safetensors=True
111
  ).to(self.device)
112
+ print("✓ Custom checkpoint loaded successfully (VAE bundled)")
113
  self.models_loaded['custom_checkpoint'] = True
114
  except Exception as e:
115
  print(f"⚠️ Could not load custom checkpoint: {e}")
 
117
  self.pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
118
  "stabilityai/stable-diffusion-xl-base-1.0",
119
  controlnet=controlnets,
 
120
  torch_dtype=self.dtype,
121
  use_safetensors=True
122
  ).to(self.device)
 
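The NOTE in this hunk is the heart of the commit: from_single_file already restores the VAE baked into the checkpoint file, so the old code's explicit vae= argument was silently overriding it. A minimal sketch of the two behaviours, with an illustrative file name:

    from diffusers import AutoencoderKL, StableDiffusionXLPipeline

    # Bundled VAE: the weights stored inside the checkpoint file are used as-is.
    pipe = StableDiffusionXLPipeline.from_single_file("checkpoint.safetensors")

    # Explicit override: vae= replaces the bundled VAE; only do this deliberately.
    vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix")
    pipe = StableDiffusionXLPipeline.from_single_file("checkpoint.safetensors", vae=vae)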
131
  repo_type="model"
132
  )
133
  self.pipe.load_lora_weights(lora_path)
134
+ print("✓ LORA loaded successfully")
135
+ print(f" Trigger word: '{TRIGGER_WORD}'")
136
  self.models_loaded['lora'] = True
137
  except Exception as e:
138
  print(f"⚠️ Could not load LORA: {e}")
 
139
  self.models_loaded['lora'] = False
140
 
141
+ # CRITICAL: Use LCM Scheduler for this model!
142
+ print("Setting up LCM scheduler...")
143
+ self.pipe.scheduler = LCMScheduler.from_config(
144
  self.pipe.scheduler.config
145
  )
146
 
147
+ # Enable attention optimizations
 
148
  self.pipe.unet.set_attn_processor(AttnProcessor2_0())
149
 
150
+ # Try to enable xformers
151
  if self.device == "cuda":
152
  try:
153
  self.pipe.enable_xformers_memory_efficient_attention()
 
155
  except Exception as e:
156
  print(f"⚠️ xformers not available: {e}")
157
 
158
+ # Set CLIP skip to 2
159
+ if hasattr(self.pipe, 'text_encoder'):
160
+ self.clip_skip = 2
161
+ print(f"✓ CLIP skip set to {self.clip_skip}")
162
+
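In diffusers, CLIP skip takes effect per call rather than as pipeline state, which is why the attribute stored here is also forwarded as pipe_kwargs["clip_skip"] in generate_retro_art below. A one-line sketch, assuming any SDXL pipeline pipe:

    # clip_skip is applied while computing prompt embeddings (higher = earlier CLIP layer).
    image = pipe(prompt, clip_skip=2, num_inference_steps=12, guidance_scale=1.0).images[0]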
163
+ # Track controlnet configuration
164
  self.using_multiple_controlnets = isinstance(controlnets, list)
165
  print(f"Pipeline initialized with {'multiple' if self.using_multiple_controlnets else 'single'} ControlNet(s)")
166
 
 
170
  print(f"{model}: {status}")
171
  print("===================\n")
172
 
173
+ print("✓ Model initialization complete!")
174
+ print("\n=== LCM CONFIGURATION ===")
175
+ print("Scheduler: LCM")
176
+ print("Recommended Steps: 12")
177
+ print("Recommended CFG: 1.0-1.5")
178
+ print("Recommended Resolution: 896x1152 or 832x1216")
179
+ print("CLIP Skip: 2")
180
+ print(f"LORA Trigger: '{TRIGGER_WORD}'")
181
+ print("=========================\n")
182
 
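For reference, the scheduler swap above plus the matching sampler settings can be reproduced standalone. A minimal sketch, using the public SDXL base model as an illustrative stand-in for the Space's own LCM checkpoint:

    import torch
    from diffusers import StableDiffusionXLPipeline, LCMScheduler

    # Illustrative base model; the Space loads its own checkpoint from MODEL_REPO.
    pipe = StableDiffusionXLPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
    ).to("cuda")

    # Same swap as in __init__ above: rebuild the scheduler from the existing config.
    pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

    # LCM-style settings: few steps, CFG near 1.0 (high CFG over-saturates LCM output).
    image = pipe(
        "p1x3l4rt, pixel art, retro game character",
        num_inference_steps=12,
        guidance_scale=1.0,
    ).images[0]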
183
+ def get_depth_map(self, image):
184
+ """Generate depth map from input image"""
 
185
  depth = self.depth_estimator(image)
186
  depth_image = depth['depth']
187
 
188
  depth_array = np.array(depth_image)
189
 
190
+ # Normalize with percentile clipping
191
  depth_min, depth_max = np.percentile(depth_array, [2, 98])
192
  depth_normalized = np.clip((depth_array - depth_min) / (depth_max - depth_min + 1e-8), 0, 1) * 255
193
  depth_normalized = depth_normalized.astype(np.uint8)
194
 
195
+ # Slight blur to reduce noise
196
  depth_normalized = cv2.GaussianBlur(depth_normalized, (3, 3), 0)
197
 
198
+ # Convert to RGB
199
  depth_colored = cv2.cvtColor(depth_normalized, cv2.COLOR_GRAY2RGB)
200
 
201
  return Image.fromarray(depth_colored)
202
 
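The percentile stretch above is what gives the depth map its contrast: clipping to the 2nd-98th percentile keeps a single outlier from compressing the whole range. A toy run of the same arithmetic:

    import numpy as np

    depth_array = np.array([[0.0, 5.0], [10.0, 100.0]])  # 100.0 is an outlier

    # Same formula as get_depth_map: percentile clip, then stretch to 0-255.
    depth_min, depth_max = np.percentile(depth_array, [2, 98])
    out = np.clip((depth_array - depth_min) / (depth_max - depth_min + 1e-8), 0, 1) * 255
    out = out.astype(np.uint8)  # the outlier saturates at 255 instead of dominating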
203
+ def calculate_optimal_size(self, original_width, original_height):
204
+ """Calculate optimal size from recommended resolutions"""
 
205
  aspect_ratio = original_width / original_height
206
 
207
+ # Recommended resolutions for this model
208
+ recommended_sizes = [
209
+ (896, 1152), # Portrait
210
+ (1152, 896), # Landscape
211
+ (832, 1216), # Tall portrait
212
+ (1216, 832), # Wide landscape
213
+ (1024, 1024) # Square
214
+ ]
215
+
216
+ # Find closest matching aspect ratio
217
+ best_match = None
218
+ best_diff = float('inf')
219
+
220
+ for width, height in recommended_sizes:
221
+ rec_aspect = width / height
222
+ diff = abs(rec_aspect - aspect_ratio)
223
+ if diff < best_diff:
224
+ best_diff = diff
225
+ best_match = (width, height)
226
+
227
+ # Ensure dimensions are multiples of 8
228
+ width, height = best_match
229
+ width = (width // 8) * 8
230
+ height = (height // 8) * 8
231
+
232
+ return width, height
233
+
234
+ def add_trigger_word(self, prompt):
235
+ """Add trigger word to prompt if not present"""
236
+ if TRIGGER_WORD.lower() not in prompt.lower():
237
+ return f"{TRIGGER_WORD}, {prompt}"
238
+ return prompt
239
 
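A quick usage sketch of the two helpers above; the expected values follow from the recommended-size table and the TRIGGER_WORD constant defined at the top of the file:

    # Aspect 4000/3000 = 1.33 sits closest to 1152/896 = 1.29, so a 4000x3000
    # photo snaps to the landscape preset (already a multiple of 8).
    converter.calculate_optimal_size(4000, 3000)  # -> (1152, 896)

    # The trigger word is prepended only when missing (checked case-insensitively).
    converter.add_trigger_word("retro game character")
    # -> "p1x3l4rt, pixel art, retro game character"
    converter.add_trigger_word("P1X3L4RT, pixel art, knight")  # returned unchanged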
240
  def generate_retro_art(
241
  self,
242
  input_image,
243
+ prompt="retro game character, vibrant colors, detailed",
244
+ negative_prompt="blurry, low quality, ugly, distorted",
245
+ num_inference_steps=12, # LCM recommended: 12 steps
246
+ guidance_scale=1.0, # LCM recommended: 1.0-1.5
247
+ controlnet_conditioning_scale=0.8,
248
+ lora_scale=1.0,
249
  identity_preservation=0.8,
250
+ image_scale=0.2
 
251
  ):
252
+ """Generate retro art with correct LCM settings"""
253
+
254
+ # Add trigger word to prompt
255
+ prompt = self.add_trigger_word(prompt)
256
 
257
+ # Calculate optimal size
258
  original_width, original_height = input_image.size
259
+ target_width, target_height = self.calculate_optimal_size(original_width, original_height)
260
 
261
  print(f"Resizing from {original_width}x{original_height} to {target_width}x{target_height}")
262
+ print(f"Prompt: {prompt}")
263
 
264
+ # Resize with high quality
265
  resized_image = input_image.resize((target_width, target_height), Image.LANCZOS)
266
 
267
+ # Generate depth map
 
268
  print("Generating depth map...")
269
+ depth_image = self.get_depth_map(resized_image)
270
  depth_image = depth_image.resize((target_width, target_height), Image.LANCZOS)
271
 
272
+ # Handle face detection for InstantID
273
  using_multiple_controlnets = self.using_multiple_controlnets
 
 
274
  face_embeddings = None
275
  has_detected_faces = False
276
 
277
  if using_multiple_controlnets:
278
+ print("Checking for faces...")
279
  img_array = np.array(resized_image)
280
  faces = self.face_app.get(img_array) if self.face_app is not None else []
281
 
282
  if len(faces) > 0:
283
  has_detected_faces = True
284
+ print(f"Detected {len(faces)} face(s)")
 
285
  face = sorted(faces, key=lambda x: (x.bbox[2] - x.bbox[0]) * (x.bbox[3] - x.bbox[1]))[-1]
286
  face_embeddings = torch.from_numpy(face.normed_embedding).unsqueeze(0).to(self.device, dtype=self.dtype)
 
 
 
287
 
288
  # Set LORA scale
289
  if hasattr(self.pipe, 'set_adapters') and self.models_loaded['lora']:
290
  try:
291
  self.pipe.set_adapters(["retroart"], adapter_weights=[lora_scale])
292
+ print(f"LORA scale: {lora_scale}")
293
  except Exception as e:
294
+ print(f"Could not set LORA scale: {e}")
 
295
 
296
+ # Prepare generation kwargs
297
  pipe_kwargs = {
298
  "prompt": prompt,
299
+ "negative_prompt": negative_prompt,
300
  "num_inference_steps": num_inference_steps,
301
  "guidance_scale": guidance_scale,
302
  "width": target_width,
 
304
  "generator": torch.Generator(device=self.device).manual_seed(42)
305
  }
306
 
307
+ # Add CLIP skip
308
+ if hasattr(self.pipe, 'text_encoder'):
309
+ pipe_kwargs["clip_skip"] = 2
310
+
311
+ # Configure ControlNet inputs
312
  if using_multiple_controlnets and has_detected_faces:
313
+ print("Using Depth + InstantID ControlNets")
314
  control_images = [depth_image, resized_image]
315
  conditioning_scales = [controlnet_conditioning_scale, image_scale]
316
 
317
  pipe_kwargs["image"] = control_images
318
  pipe_kwargs["controlnet_conditioning_scale"] = conditioning_scales
319
 
 
320
  if face_embeddings is not None:
321
  pipe_kwargs["cross_attention_kwargs"] = {"ip_adapter_image_embeds": [face_embeddings]}
322
 
323
  elif using_multiple_controlnets and not has_detected_faces:
324
+ print("Multiple ControlNets available but no faces detected")
325
  control_images = [depth_image, depth_image]
326
  conditioning_scales = [controlnet_conditioning_scale, 0.0]
327
 
 
329
  pipe_kwargs["controlnet_conditioning_scale"] = conditioning_scales
330
 
331
  else:
332
+ print("Using Depth ControlNet only")
333
  pipe_kwargs["image"] = depth_image
334
  pipe_kwargs["controlnet_conditioning_scale"] = controlnet_conditioning_scale
335
 
336
+ # Generate
337
+ print(f"Generating with LCM: Steps={num_inference_steps}, CFG={guidance_scale}")
 
338
  result = self.pipe(**pipe_kwargs)
339
 
340
  return result.images[0]
341
 
342
+ # Initialize converter
343
  print("Initializing RetroArt Converter...")
344
  converter = RetroArtConverter()
345
 
 
346
  @spaces.GPU
347
  def process_image(
348
  image,
 
353
  controlnet_scale,
354
  lora_scale,
355
  identity_preservation,
356
+ image_scale
 
357
  ):
358
  if image is None:
359
  return None
 
368
  controlnet_conditioning_scale=controlnet_scale,
369
  lora_scale=lora_scale,
370
  identity_preservation=identity_preservation,
371
+ image_scale=image_scale
 
372
  )
373
  return result
374
  except Exception as e:
 
377
  traceback.print_exc()
378
  raise gr.Error(f"Generation failed: {str(e)}")
379
 
380
+ # Gradio UI
381
+ with gr.Blocks(title="RetroArt Converter - LCM", theme=gr.themes.Soft()) as demo:
382
  gr.Markdown("""
383
+ # 🎮 RetroArt Converter (LCM Optimized)
384
 
385
+ Convert images into retro pixel art style using LCM (Latent Consistency Model) for fast, high-quality generation!
386
 
387
+ **✨ Features:**
388
+ - ⚡ Ultra-fast generation (12 steps!)
389
+ - 🎨 Custom pixel art LORA with trigger word: `p1x3l4rt, pixel art`
390
+ - 📐 Optimized resolutions: 896x1152 / 832x1216
391
+ - 🖼️ Bundled VAE for authentic retro look
392
+ - 🎯 CLIP Skip 2 for better style
 
393
  """)
394
 
395
+ # Model status
396
  if converter.models_loaded:
397
+ status_text = "**📦 Loaded Models:**\n"
398
+ status_text += f"- Custom Checkpoint (Horizon): {'✓ Loaded' if converter.models_loaded['custom_checkpoint'] else '✗ Using SDXL base'}\n"
399
+ status_text += f"- LORA (RetroArt): {'✓ Loaded' if converter.models_loaded['lora'] else '✗ Disabled'}\n"
400
+ status_text += f"- InstantID: {'✓ Loaded' if converter.models_loaded['instantid'] else '✗ Disabled'}\n"
 
401
  gr.Markdown(status_text)
402
 
403
+ gr.Markdown(f"""
404
+ **⚙️ LCM Configuration:**
405
+ - Scheduler: LCM (Latent Consistency Model)
406
+ - Recommended Steps: **12** (fast!)
407
+ - Recommended CFG: **1.0-1.5** (lower than normal)
408
+ - CLIP Skip: **2**
409
+ - LORA Trigger: `{TRIGGER_WORD}` (auto-added)
410
+ """)
411
+
412
  with gr.Row():
413
  with gr.Column():
414
  input_image = gr.Image(label="Input Image", type="pil")
415
 
416
  prompt = gr.Textbox(
417
+ label="Prompt (trigger word auto-added)",
418
+ value="retro game character, vibrant colors, highly detailed",
419
+ lines=3,
420
+ info=f"'{TRIGGER_WORD}' will be automatically added"
421
  )
422
 
423
  negative_prompt = gr.Textbox(
424
  label="Negative Prompt",
425
+ value="blurry, low quality, ugly, distorted, deformed, bad anatomy",
426
  lines=2
427
  )
428
 
429
+ with gr.Accordion("⚡ LCM Settings (Optimized)", open=True):
 
430
  steps = gr.Slider(
431
+ minimum=4,
432
+ maximum=20,
433
+ value=12,
434
+ step=1,
435
+ label="Inference Steps (LCM works great with just 12!)"
436
  )
437
 
438
  guidance_scale = gr.Slider(
439
+ minimum=0.5,
440
+ maximum=3.0,
441
+ value=1.0,
442
+ step=0.1,
443
+ label="Guidance Scale (CFG) - LCM uses 1.0-1.5"
444
  )
445
 
446
  controlnet_scale = gr.Slider(
447
+ minimum=0.3,
448
+ maximum=1.2,
449
+ value=0.8,
450
  step=0.05,
451
+ label="ControlNet Depth Scale"
452
  )
453
 
454
  lora_scale = gr.Slider(
455
+ minimum=0.5,
456
+ maximum=1.5,
457
+ value=1.0,
458
  step=0.05,
459
  label="RetroArt LORA Scale"
460
  )
461
 
462
+ with gr.Accordion("🎭 Identity Settings (for portraits)", open=False):
463
  identity_preservation = gr.Slider(
464
  minimum=0,
465
  maximum=1.5,
 
482
  output_image = gr.Image(label="Retro Art Output")
483
 
484
  gr.Markdown("""
485
+ ### 💡 Tips for Best Results:
486
+
487
+ **For LCM Models:**
488
+ - ✅ Use **12 steps** (already optimized!)
489
+ - ✅ Keep CFG at **1.0-1.5** (not 7.5!)
490
+ - ✅ LORA trigger word is **auto-added**
491
+ - ✅ Resolution auto-optimized to 896x1152 or 832x1216
492
+
493
+ **For Quality:**
494
+ - Use high-resolution input images
495
+ - Be specific in prompts: "16-bit game character" vs "character"
496
+ - Adjust ControlNet scale: lower = more creative, higher = more faithful
497
+
498
+ **For Style:**
499
+ - Increase LORA scale (1.0-1.5) for stronger pixel art effect
500
+ - Try prompts like: "SNES style", "16-bit RPG", "Game Boy Advance style"
501
  """)
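Putting the tips together, a direct call to the converter with the recommended LCM settings would look like this (file paths are illustrative):

    from PIL import Image

    img = Image.open("photo.jpg").convert("RGB")
    result = converter.generate_retro_art(
        img,
        prompt="16-bit RPG hero, vibrant colors",  # trigger word is auto-added
        num_inference_steps=12,             # LCM sweet spot
        guidance_scale=1.0,                 # keep CFG in the 1.0-1.5 band
        controlnet_conditioning_scale=0.8,
        lora_scale=1.0,
    )
    result.save("retro_art.png")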
502
 
 
503
  generate_btn.click(
504
  fn=process_image,
505
  inputs=[
506
+ input_image, prompt, negative_prompt, steps, guidance_scale,
507
+ controlnet_scale, lora_scale, identity_preservation, image_scale
508
  ],
509
  outputs=[output_image]
510
  )
511
 
 
512
  if __name__ == "__main__":
513
  demo.queue(max_size=20)
514
  demo.launch(