pixagram-dev

Runtime error

File size: 13,239 Bytes

import spaces  # MUST be first, before any CUDA-related imports
import gradio as gr
import torch
from diffusers import (
    StableDiffusionXLPipeline,
    StableDiffusionXLControlNetPipeline,
    ControlNetModel,
    AutoencoderKL,
    DPMSolverMultistepScheduler
)
from diffusers.models.attention_processor import AttnProcessor2_0
from insightface.app import FaceAnalysis
from PIL import Image
import numpy as np
import cv2
from transformers import pipeline as transformers_pipeline
from huggingface_hub import hf_hub_download
import os

# Configuration
# Hub repository that hosts the custom checkpoint, VAE and LoRA files.
MODEL_REPO = "primerz/pixagram"

# Half precision is only used when a CUDA device is reported; CPU stays in
# full precision.
if torch.cuda.is_available():
    device, dtype = "cuda", torch.float16
else:
    device, dtype = "cpu", torch.float32

print(f"Using device: {device}")
print(f"Loading models from: {MODEL_REPO}")

class RetroArtConverter:
    """Convert images to retro game art with SDXL, a depth ControlNet and a LoRA.

    The constructor loads every model once (face analysis, ControlNet, VAE,
    depth estimator, SDXL pipeline, LoRA). Custom weights are fetched from
    ``MODEL_REPO``; when a custom VAE or checkpoint cannot be loaded a public
    default is used instead, and a missing LoRA is simply skipped.
    """

    # Name under which the RetroArt LoRA is registered on the pipeline. The
    # same name must be used when loading the weights and when setting the
    # adapter scale, otherwise ``set_adapters`` rejects it.
    LORA_ADAPTER_NAME = "retroart"

    def __init__(self):
        """Load all models onto the module-level ``device`` using ``dtype``."""
        self.device = device
        self.dtype = dtype

        # Initialize face analysis for InstantID (optional)
        print("Loading face analysis model...")
        try:
            self.face_app = FaceAnalysis(
                name='antelopev2',
                root='./models/insightface',
                providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
            )
            # A negative ctx_id asks insightface to stay on the CPU provider.
            self.face_app.prepare(
                ctx_id=0 if self.device == "cuda" else -1,
                det_size=(640, 640)
            )
            print("✓ Face analysis model loaded successfully")
            self.face_detection_enabled = True
        except Exception as e:
            # Face detection is a best-effort extra; generation works without it.
            print(f"⚠️ Face detection not available: {e}")
            print("Continuing without face detection (will still work fine)")
            self.face_app = None
            self.face_detection_enabled = False

        # Load ControlNet for depth
        print("Loading ControlNet depth model...")
        self.controlnet_depth = ControlNetModel.from_pretrained(
            "diffusers/controlnet-zoe-depth-sdxl-1.0",
            torch_dtype=self.dtype
        ).to(self.device)

        # Load custom VAE from HuggingFace Hub
        print("Loading custom VAE (pixelate) from HuggingFace Hub...")
        try:
            vae_path = hf_hub_download(
                repo_id=MODEL_REPO,
                filename="pixelate.safetensors",
                repo_type="model"
            )
            self.vae = AutoencoderKL.from_single_file(
                vae_path,
                torch_dtype=self.dtype
            ).to(self.device)
            print("✓ Custom VAE loaded successfully")
        except Exception as e:
            print(f"Warning: Could not load custom VAE: {e}")
            print("Using default SDXL VAE")
            self.vae = AutoencoderKL.from_pretrained(
                "madebyollin/sdxl-vae-fp16-fix",
                torch_dtype=self.dtype
            ).to(self.device)

        # Load depth estimator for preprocessing
        print("Loading depth estimator...")
        self.depth_estimator = transformers_pipeline(
            'depth-estimation',
            model="Intel/dpt-hybrid-midas",
            device=self.device if self.device == "cuda" else -1
        )

        # Load SDXL checkpoint from HuggingFace Hub
        print("Loading SDXL checkpoint (horizon) from HuggingFace Hub...")
        try:
            model_path = hf_hub_download(
                repo_id=MODEL_REPO,
                filename="horizon.safetensors",
                repo_type="model"
            )
            self.pipe = StableDiffusionXLControlNetPipeline.from_single_file(
                model_path,
                controlnet=self.controlnet_depth,
                vae=self.vae,
                torch_dtype=self.dtype,
                use_safetensors=True
            ).to(self.device)
            print("✓ Custom checkpoint loaded successfully")
        except Exception as e:
            print(f"Warning: Could not load custom checkpoint: {e}")
            print("Using default SDXL")
            self.pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
                "stabilityai/stable-diffusion-xl-base-1.0",
                controlnet=self.controlnet_depth,
                vae=self.vae,
                torch_dtype=self.dtype,
                use_safetensors=True
            ).to(self.device)

        # Load LORA from HuggingFace Hub (requires PEFT)
        print("Loading LORA (retroart) from HuggingFace Hub...")
        # Remembered so generation can skip the adapter when loading failed.
        self.lora_loaded = False
        try:
            lora_path = hf_hub_download(
                repo_id=MODEL_REPO,
                filename="retroart.safetensors",
                repo_type="model"
            )
            # Register the weights under an explicit adapter name so that
            # set_adapters() in generate_retro_art() can refer to them.
            self.pipe.load_lora_weights(
                lora_path,
                adapter_name=self.LORA_ADAPTER_NAME
            )
            self.lora_loaded = True
            print("✓ LORA loaded successfully")
        except Exception as e:
            print(f"Warning: Could not load LORA: {e}")
            print("Running without LORA")

        # Optimize pipeline
        self.pipe.scheduler = DPMSolverMultistepScheduler.from_config(
            self.pipe.scheduler.config
        )

        # For ZeroGPU, don't use model_cpu_offload
        # self.pipe.enable_model_cpu_offload()

        self.pipe.enable_vae_slicing()

        # Use the PyTorch 2.0 scaled-dot-product attention processor
        self.pipe.unet.set_attn_processor(AttnProcessor2_0())

        # Try to enable xformers if available (only works on GPU)
        if self.device == "cuda":
            try:
                self.pipe.enable_xformers_memory_efficient_attention()
                print("✓ xformers enabled")
            except Exception as e:
                print(f"⚠️ xformers not available: {e}")

        print("Model initialization complete!")

    def get_depth_map(self, image):
        """Generate a 3-channel depth map image from the input image.

        Args:
            image: Image passed to the depth-estimation pipeline.

        Returns:
            A PIL image whose three channels all hold the depth values
            rescaled to 0-255. A completely flat depth map yields an
            all-zero image instead of dividing by zero.
        """
        depth = self.depth_estimator(image)

        # Work in float so the rescaling below is not done in integer space
        depth_array = np.asarray(depth['depth'], dtype=np.float64)

        # Normalize to 0-255, guarding against a constant depth map
        depth_min = depth_array.min()
        depth_range = depth_array.max() - depth_min
        if depth_range > 0:
            depth_normalized = (depth_array - depth_min) / depth_range * 255
        else:
            depth_normalized = np.zeros_like(depth_array)
        depth_normalized = depth_normalized.astype(np.uint8)

        # Convert to 3-channel image
        depth_colored = cv2.cvtColor(depth_normalized, cv2.COLOR_GRAY2RGB)

        return Image.fromarray(depth_colored)

    def detect_faces(self, image):
        """Detect faces in the image using antelopev2.

        Args:
            image: PIL image to analyse.

        Returns:
            The list returned by the face analysis app, or an empty list when
            face detection is disabled or fails.
        """
        if not self.face_detection_enabled or self.face_app is None:
            return []

        try:
            # insightface follows the OpenCV convention of BGR input, so
            # normalise the PIL image to RGB first and then swap channels.
            rgb_array = np.array(image.convert("RGB"))
            bgr_array = cv2.cvtColor(rgb_array, cv2.COLOR_RGB2BGR)
            return self.face_app.get(bgr_array)
        except Exception as e:
            print(f"Face detection error: {e}")
            return []

    def calculate_target_size(self, original_width, original_height, max_dimension=1024):
        """Calculate a target size that keeps the aspect ratio.

        The longer side is capped at ``max_dimension`` (images are never
        upscaled), the other side is scaled proportionally, and both are
        rounded down to a multiple of 8 with a floor of 8 pixels.

        Args:
            original_width: Source width in pixels (must be positive).
            original_height: Source height in pixels (must be positive).
            max_dimension: Upper bound for the longer side.

        Returns:
            Tuple ``(new_width, new_height)``.

        Raises:
            ValueError: If either source dimension is not positive.
        """
        if original_width <= 0 or original_height <= 0:
            raise ValueError(
                f"Image dimensions must be positive, got {original_width}x{original_height}"
            )

        # Multiply before dividing so the proportional side is not subject to
        # floating-point truncation of an intermediate aspect ratio.
        if original_width > original_height:
            new_width = min(original_width, max_dimension)
            new_height = new_width * original_height // original_width
        else:
            new_height = min(original_height, max_dimension)
            new_width = new_height * original_width // original_height

        # Round down to a multiple of 8 (required for diffusion models), but
        # never below 8 so extreme aspect ratios cannot produce a 0 dimension.
        new_width = max(8, (int(new_width) // 8) * 8)
        new_height = max(8, (int(new_height) // 8) * 8)

        return new_width, new_height

    def generate_retro_art(
        self,
        input_image,
        prompt="retro pixel art game, 16-bit style, vibrant colors",
        negative_prompt="blurry, low quality, modern, photorealistic, 3d render",
        num_inference_steps=30,
        guidance_scale=7.5,
        controlnet_conditioning_scale=0.8,
        lora_scale=0.85,
        seed=42
    ):
        """Main generation function.

        Args:
            input_image: PIL image to convert.
            prompt: Positive prompt; prefixed with portrait hints when faces
                are detected.
            negative_prompt: Negative prompt.
            num_inference_steps: Number of denoising steps.
            guidance_scale: Classifier-free guidance scale.
            controlnet_conditioning_scale: Weight of the depth ControlNet.
            lora_scale: Weight of the RetroArt LoRA; ignored when the LoRA
                could not be loaded.
            seed: Seed for the random generator (defaults to the previously
                hard-coded value 42).

        Returns:
            The first generated PIL image.

        Raises:
            ValueError: If ``input_image`` is None.
        """
        if input_image is None:
            raise ValueError("input_image is required")

        # Uploaded images may be RGBA, palette or grayscale; use plain RGB for
        # the depth estimator and face detector.
        if input_image.mode != "RGB":
            input_image = input_image.convert("RGB")

        # Resize image maintaining aspect ratio
        original_width, original_height = input_image.size
        target_width, target_height = self.calculate_target_size(original_width, original_height)

        print(f"Resizing from {original_width}x{original_height} to {target_width}x{target_height}")

        resized_image = input_image.resize((target_width, target_height), Image.LANCZOS)

        # Detect faces
        faces = self.detect_faces(resized_image)
        has_faces = len(faces) > 0

        if has_faces:
            print(f"Detected {len(faces)} face(s)")
            # Enhance prompt for face preservation
            prompt = f"portrait, detailed face, {prompt}"

        # Generate depth map
        print("Generating depth map...")
        depth_image = self.get_depth_map(resized_image)
        # Make sure the conditioning image matches the requested output size
        depth_image = depth_image.resize((target_width, target_height), Image.LANCZOS)

        # Set LORA scale (only when the adapter was actually registered)
        if getattr(self, "lora_loaded", False):
            self.pipe.set_adapters(
                [self.LORA_ADAPTER_NAME],
                adapter_weights=[lora_scale]
            )

        # Generate image
        print("Generating retro art...")
        result = self.pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            image=depth_image,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            controlnet_conditioning_scale=controlnet_conditioning_scale,
            width=target_width,
            height=target_height,
            generator=torch.Generator(device=self.device).manual_seed(int(seed))
        )

        return result.images[0]

# Initialize the converter
# A single module-level instance is created at import time, so all model
# loading in RetroArtConverter.__init__ happens once, before the UI below is
# built; process_image() reuses this instance for every request.
print("Initializing RetroArt Converter...")
converter = RetroArtConverter()

# Gradio interface with ZeroGPU support
@spaces.GPU
def process_image(
    image,
    prompt,
    negative_prompt,
    steps,
    guidance_scale,
    controlnet_scale,
    lora_scale
):
    """Gradio callback: run the shared converter on one uploaded image.

    Returns None when no image was provided. Any failure during generation is
    printed and re-raised as ``gr.Error`` so it is shown in the UI.
    """
    # Nothing uploaded: leave the output component empty.
    if image is None:
        return None

    try:
        # Map the UI control values onto the converter's keyword arguments;
        # the steps slider value is coerced to int here.
        generation_args = {
            "input_image": image,
            "prompt": prompt,
            "negative_prompt": negative_prompt,
            "num_inference_steps": int(steps),
            "guidance_scale": guidance_scale,
            "controlnet_conditioning_scale": controlnet_scale,
            "lora_scale": lora_scale,
        }
        return converter.generate_retro_art(**generation_args)
    except Exception as e:
        print(f"Error: {e}")
        raise gr.Error(f"Generation failed: {str(e)}")

# Create Gradio interface
# Layout: inputs and settings in the left column, the result in the right
# column, with an optional example row underneath.
with gr.Blocks(title="RetroArt Converter") as demo:
    gr.Markdown("""
    # 🎮 RetroArt Converter
    
    Convert any image into retro game art style!
    
    **Features:**
    - Custom SDXL checkpoint (Horizon)
    - Pixelate VAE for authentic retro look
    - RetroArt LORA for style enhancement
    - Face preservation with InstantID
    - Depth-aware generation with ControlNet
    """)
    
    with gr.Row():
        with gr.Column():
            input_image = gr.Image(label="Input Image", type="pil")
            
            prompt = gr.Textbox(
                label="Prompt",
                value="retro pixel art game, 16-bit style, vibrant colors, detailed",
                lines=3
            )
            
            negative_prompt = gr.Textbox(
                label="Negative Prompt",
                value="blurry, low quality, modern, photorealistic, 3d render, ugly, distorted",
                lines=2
            )
            
            with gr.Accordion("Advanced Settings", open=False):
                steps = gr.Slider(
                    minimum=20,
                    maximum=50,
                    value=30,
                    step=1,
                    label="Inference Steps"
                )
                
                guidance_scale = gr.Slider(
                    minimum=1,
                    maximum=15,
                    value=7.5,
                    step=0.5,
                    label="Guidance Scale"
                )
                
                controlnet_scale = gr.Slider(
                    minimum=0,
                    maximum=2,
                    value=0.8,
                    step=0.1,
                    label="ControlNet Depth Scale"
                )
                
                lora_scale = gr.Slider(
                    minimum=0,
                    maximum=2,
                    value=0.85,
                    step=0.05,
                    label="RetroArt LORA Scale"
                )
            
            generate_btn = gr.Button("🎨 Generate Retro Art", variant="primary")
        
        with gr.Column():
            output_image = gr.Image(label="Retro Art Output")
    
    # The same component list feeds both the examples row and the button,
    # in the order process_image() expects its arguments.
    generation_inputs = [input_image, prompt, negative_prompt, steps, guidance_scale, controlnet_scale, lora_scale]
    
    # Only register the example when its image actually ships with the app;
    # a missing example file must not prevent the UI from being built.
    example_image_path = "example_portrait.jpg"
    if os.path.exists(example_image_path):
        gr.Examples(
            examples=[
                [example_image_path, "retro pixel art portrait, 16-bit game character", "blurry, modern", 30, 7.5, 0.8, 0.85],
            ],
            inputs=generation_inputs,
            outputs=[output_image],
            fn=process_image,
            cache_examples=False
        )
    
    generate_btn.click(
        fn=process_image,
        inputs=generation_inputs,
        outputs=[output_image]
    )

# Launch with API enabled
if __name__ == "__main__":
    # Server settings collected in one place: listen on all interfaces on
    # port 7860, no public share link, API page shown.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_api": True,  # Enable API
    }

    # Cap the request queue at 20 pending jobs before starting the server.
    demo.queue(max_size=20)
    demo.launch(**launch_options)