pixagram-dev

Runtime error

App Files Community

primerz committed 8 days ago

Commit

8bbe1e4

verified ·

1 Parent(s): a70cb97

Update models.py

Browse files

Files changed (1) hide show

models.py +105 -177

models.py CHANGED Viewed

@@ -1,5 +1,6 @@
 """
 Model loading and initialization for Pixagram AI Pixel Art Generator
 """
 import torch
 import time
@@ -11,14 +12,15 @@ from diffusers import (
 )
 from diffusers.models.attention_processor import AttnProcessor2_0
 from transformers import CLIPVisionModelWithProjection
-from transformers import BlipProcessor, BlipForConditionalGeneration
 from insightface.app import FaceAnalysis
 from controlnet_aux import ZoeDetector
 from huggingface_hub import hf_hub_download
 from compel import Compel, ReturnedEmbeddingsType
-from ip_attention_processor_compatible import IPAttnProcessorCompatible as IPAttnProcessor2_0
-from resampler_compatible import create_compatible_resampler as create_enhanced_resampler
 from config import (
     device, dtype, MODEL_REPO, MODEL_FILES, HUGGINGFACE_TOKEN,
     FACE_DETECTION_CONFIG, CLIP_SKIP, DOWNLOAD_CONFIG
@@ -26,17 +28,7 @@ from config import (
 def download_model_with_retry(repo_id, filename, max_retries=None):
-    """
-    Download model with retry logic and proper token handling.
-    Args:
-        repo_id: HuggingFace repository ID
-        filename: File to download
-        max_retries: Maximum number of retries (uses config default if None)
-    Returns:
-        Path to downloaded file
-    """
     if max_retries is None:
         max_retries = DOWNLOAD_CONFIG['max_retries']
@@ -44,7 +36,6 @@ def download_model_with_retry(repo_id, filename, max_retries=None):
         try:
             print(f"  Attempting to download {filename} (attempt {attempt + 1}/{max_retries})...")
-            # Use token if available
             kwargs = {"repo_type": "model"}
             if HUGGINGFACE_TOKEN:
                 kwargs["token"] = HUGGINGFACE_TOKEN
@@ -71,12 +62,7 @@ def download_model_with_retry(repo_id, filename, max_retries=None):
 def load_face_analysis():
-    """
-    Load face analysis model with proper error handling.
-    Returns:
-        Tuple of (face_app, success_bool)
-    """
     print("Loading face analysis model...")
     try:
         face_app = FaceAnalysis(
@@ -96,12 +82,7 @@ def load_face_analysis():
 def load_depth_detector():
-    """
-    Load Zoe Depth detector.
-    Returns:
-        Tuple of (zoe_depth, success_bool)
-    """
     print("Loading Zoe Depth detector...")
     try:
         zoe_depth = ZoeDetector.from_pretrained("lllyasviel/Annotators")
@@ -114,13 +95,7 @@ def load_depth_detector():
 def load_controlnets():
-    """
-    Load ControlNet models.
-    Returns:
-        Tuple of (controlnet_depth, controlnet_instantid, instantid_success)
-    """
-    # Load ControlNet for depth
     print("Loading ControlNet Zoe Depth model...")
     controlnet_depth = ControlNetModel.from_pretrained(
         "diffusers/controlnet-zoe-depth-sdxl-1.0",
@@ -128,7 +103,6 @@ def load_controlnets():
     ).to(device)
     print("  [OK] ControlNet Depth loaded")
-    # Load InstantID ControlNet
     print("Loading InstantID ControlNet...")
     try:
         controlnet_instantid = ControlNetModel.from_pretrained(
@@ -144,12 +118,7 @@ def load_controlnets():
 def load_image_encoder():
-    """
-    Load CLIP Image Encoder for IP-Adapter.
-    Returns:
-        Image encoder or None
-    """
     print("Loading CLIP Image Encoder for IP-Adapter...")
     try:
         image_encoder = CLIPVisionModelWithProjection.from_pretrained(
@@ -165,15 +134,7 @@ def load_image_encoder():
 def load_sdxl_pipeline(controlnets):
-    """
-    Load SDXL checkpoint from HuggingFace Hub.
-    Args:
-        controlnets: ControlNet model(s) to use
-    Returns:
-        Tuple of (pipeline, checkpoint_loaded_bool)
-    """
     print("Loading SDXL checkpoint (horizon) with bundled VAE from HuggingFace Hub...")
     try:
         model_path = download_model_with_retry(MODEL_REPO, MODEL_FILES['checkpoint'])
@@ -199,15 +160,7 @@ def load_sdxl_pipeline(controlnets):
 def load_lora(pipe):
-    """
-    Load LORA from HuggingFace Hub.
-    Args:
-        pipe: Pipeline to load LORA into
-    Returns:
-        Boolean indicating success
-    """
     print("Loading LORA (retroart) from HuggingFace Hub...")
     try:
         lora_path = download_model_with_retry(MODEL_REPO, MODEL_FILES['lora'])
@@ -221,66 +174,68 @@ def load_lora(pipe):
 def setup_ip_adapter(pipe, image_encoder):
     """
-    Setup IP-Adapter for InstantID face embeddings.
-    Args:
-        pipe: Pipeline to setup IP-Adapter on
-        image_encoder: CLIP image encoder
-    Returns:
-        Tuple of (image_proj_model, success_bool)
     """
     if image_encoder is None:
         return None, False
-    print("Setting up IP-Adapter for InstantID face embeddings...")
     try:
-        # Download InstantID IP-Adapter weights
         ip_adapter_path = download_model_with_retry(
             "InstantX/InstantID",
             "ip-adapter.bin"
         )
-        # Load IP-Adapter state dict
-        ip_adapter_state_dict = torch.load(ip_adapter_path, map_location="cpu")
-        # Separate image projection and IP-adapter weights
         image_proj_state_dict = {}
-        ip_state_dict = {}
-        for key, value in ip_adapter_state_dict.items():
             if key.startswith("image_proj."):
                 image_proj_state_dict[key.replace("image_proj.", "")] = value
             elif key.startswith("ip_adapter."):
-                ip_state_dict[key.replace("ip_adapter.", "")] = value
-        print("Setting up Enhanced Perceiver Resampler for face embedding refinement...")
-        # Create enhanced resampler
-        image_proj_model = create_enhanced_resampler(
-            quality_mode='quality',
-            num_queries=4,
-            output_dim=pipe.unet.config.cross_attention_dim,
-            device=device,
-            dtype=dtype
         )
-        # Try to load pretrained Resampler weights if available
-        try:
-            if 'latents' in image_proj_state_dict:
                 image_proj_model.load_state_dict(image_proj_state_dict, strict=True)
                 print("  [OK] Resampler loaded with pretrained weights")
-            else:
-                print("  [INFO] No pretrained Resampler weights found")
                 print("  Using randomly initialized Resampler")
-                print("  Expected +8-10% face similarity improvement")
-        except Exception as e:
-            print(f"  [INFO] Resampler initialization: {e}")
-            print("  Using randomly initialized Resampler")
-        # Set up IP-Adapter attention processors
         attn_procs = {}
         for name in pipe.unet.attn_processors.keys():
             cross_attention_dim = None if name.endswith("attn1.processor") else pipe.unet.config.cross_attention_dim
             if name.startswith("mid_block"):
                 hidden_size = pipe.unet.config.block_out_channels[-1]
             elif name.startswith("up_blocks"):
@@ -289,6 +244,8 @@ def setup_ip_adapter(pipe, image_encoder):
             elif name.startswith("down_blocks"):
                 block_id = int(name[len("down_blocks.")])
                 hidden_size = pipe.unet.config.block_out_channels[block_id]
             if cross_attention_dim is None:
                 attn_procs[name] = AttnProcessor2_0()
@@ -297,39 +254,41 @@ def setup_ip_adapter(pipe, image_encoder):
                     hidden_size=hidden_size,
                     cross_attention_dim=cross_attention_dim,
                     scale=1.0,
-                    num_tokens=4
                 ).to(device, dtype=dtype)
         pipe.unet.set_attn_processor(attn_procs)
-        # Load IP-adapter weights into attention processors
-        ip_layers = torch.nn.ModuleList(pipe.unet.attn_processors.values())
-        ip_layers.load_state_dict(ip_state_dict, strict=False)
-        print("  [OK] IP-Adapter attention processors loaded")
-        # Store the image encoder
         pipe.image_encoder = image_encoder
-        print("  [OK] IP-Adapter fully loaded with InstantID weights")
         return image_proj_model, True
     except Exception as e:
-        print(f"  [ERROR] Could not load IP-Adapter: {e}")
-        print("  InstantID will work with keypoints only (no face embeddings)")
         import traceback
         traceback.print_exc()
         return None, False
 def setup_compel(pipe):
-    """
-    Setup Compel for better SDXL prompt handling.
-    Args:
-        pipe: Pipeline to setup Compel on
-    Returns:
-        Tuple of (compel, success_bool)
-    """
     print("Setting up Compel for enhanced prompt processing...")
     try:
         compel = Compel(
@@ -346,27 +305,14 @@ def setup_compel(pipe):
 def setup_scheduler(pipe):
-    """
-    Setup LCM scheduler.
-    Args:
-        pipe: Pipeline to setup scheduler on
-    """
     print("Setting up LCM scheduler...")
     pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
     print("  [OK] LCM scheduler configured")
 def optimize_pipeline(pipe):
-    """
-    Apply optimizations to pipeline.
-    Args:
-        pipe: Pipeline to optimize
-    """
-    # Enable attention optimizations
-    pipe.unet.set_attn_processor(AttnProcessor2_0())
     # Try to enable xformers
     if device == "cuda":
         try:
@@ -378,66 +324,48 @@ def optimize_pipeline(pipe):
 def load_caption_model():
     """
-    Load BLIP-2 model for longer, more detailed caption generation.
-    BLIP-2 produces richer descriptions compared to BLIP base.
-    Returns:
-        Tuple of (processor, model, success_bool)
     """
-    print("Loading BLIP-2 model for detailed caption generation...")
     try:
-        # Try BLIP-2 first (produces longer, more detailed captions)
         try:
-            from transformers import Blip2Processor, Blip2ForConditionalGeneration
-            caption_processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
-            caption_model = Blip2ForConditionalGeneration.from_pretrained(
-                "Salesforce/blip2-opt-2.7b",
                 torch_dtype=dtype
             ).to(device)
-            print("  [OK] BLIP-2 model loaded successfully (produces detailed captions)")
-            return caption_processor, caption_model, True
-        except Exception as e:
-            print(f"  [INFO] BLIP-2 not available ({e}), trying GIT-Large...")
-            # Fallback to GIT-Large (also produces good long captions)
-            try:
-                from transformers import AutoProcessor, AutoModelForCausalLM
-                caption_processor = AutoProcessor.from_pretrained("microsoft/git-large-coco")
-                caption_model = AutoModelForCausalLM.from_pretrained(
-                    "microsoft/git-large-coco",
-                    torch_dtype=dtype
-                ).to(device)
-                print("  [OK] GIT-Large model loaded successfully (produces detailed captions)")
-                return caption_processor, caption_model, True
-            except Exception as e2:
-                print(f"  [INFO] GIT-Large not available ({e2}), falling back to BLIP base...")
-                # Final fallback to BLIP base
-                caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
-                caption_model = BlipForConditionalGeneration.from_pretrained(
-                    "Salesforce/blip-image-captioning-base",
-                    torch_dtype=dtype
-                ).to(device)
-                print("  [OK] BLIP base model loaded (shorter captions)")
-                return caption_processor, caption_model, True
-    except Exception as e:
-        print(f"  [WARNING] Caption model not available: {e}")
-        print("  Caption generation will be disabled")
-        return None, None, False
 def set_clip_skip(pipe):
-    """
-    Set CLIP skip value.
-    Args:
-        pipe: Pipeline to set CLIP skip on
-    """
     if hasattr(pipe, 'text_encoder'):
         print(f"  [OK] CLIP skip set to {CLIP_SKIP}")
-print("[OK] Model loading functions ready")

 """
 Model loading and initialization for Pixagram AI Pixel Art Generator
+FIXED VERSION with proper IP-Adapter support and GIT-Large/BLIP base captioning
 """
 import torch
 import time
 )
 from diffusers.models.attention_processor import AttnProcessor2_0
 from transformers import CLIPVisionModelWithProjection
 from insightface.app import FaceAnalysis
 from controlnet_aux import ZoeDetector
 from huggingface_hub import hf_hub_download
 from compel import Compel, ReturnedEmbeddingsType
+# Use reference implementation's attention processor
+from attention_processor import IPAttnProcessor2_0, AttnProcessor
+from resampler import Resampler
 from config import (
     device, dtype, MODEL_REPO, MODEL_FILES, HUGGINGFACE_TOKEN,
     FACE_DETECTION_CONFIG, CLIP_SKIP, DOWNLOAD_CONFIG
 def download_model_with_retry(repo_id, filename, max_retries=None):
+    """Download model with retry logic and proper token handling."""
     if max_retries is None:
         max_retries = DOWNLOAD_CONFIG['max_retries']
         try:
             print(f"  Attempting to download {filename} (attempt {attempt + 1}/{max_retries})...")
             kwargs = {"repo_type": "model"}
             if HUGGINGFACE_TOKEN:
                 kwargs["token"] = HUGGINGFACE_TOKEN
 def load_face_analysis():
+    """Load face analysis model with proper error handling."""
     print("Loading face analysis model...")
     try:
         face_app = FaceAnalysis(
 def load_depth_detector():
+    """Load Zoe Depth detector."""
     print("Loading Zoe Depth detector...")
     try:
         zoe_depth = ZoeDetector.from_pretrained("lllyasviel/Annotators")
 def load_controlnets():
+    """Load ControlNet models."""
     print("Loading ControlNet Zoe Depth model...")
     controlnet_depth = ControlNetModel.from_pretrained(
         "diffusers/controlnet-zoe-depth-sdxl-1.0",
     ).to(device)
     print("  [OK] ControlNet Depth loaded")
     print("Loading InstantID ControlNet...")
     try:
         controlnet_instantid = ControlNetModel.from_pretrained(
 def load_image_encoder():
+    """Load CLIP Image Encoder for IP-Adapter."""
     print("Loading CLIP Image Encoder for IP-Adapter...")
     try:
         image_encoder = CLIPVisionModelWithProjection.from_pretrained(
 def load_sdxl_pipeline(controlnets):
+    """Load SDXL checkpoint from HuggingFace Hub."""
     print("Loading SDXL checkpoint (horizon) with bundled VAE from HuggingFace Hub...")
     try:
         model_path = download_model_with_retry(MODEL_REPO, MODEL_FILES['checkpoint'])
 def load_lora(pipe):
+    """Load LORA from HuggingFace Hub."""
     print("Loading LORA (retroart) from HuggingFace Hub...")
     try:
         lora_path = download_model_with_retry(MODEL_REPO, MODEL_FILES['lora'])
 def setup_ip_adapter(pipe, image_encoder):
     """
+    Setup IP-Adapter for InstantID face embeddings - PROPER IMPLEMENTATION.
+    Based on the reference InstantID pipeline.
     """
     if image_encoder is None:
         return None, False
+    print("Setting up IP-Adapter for InstantID face embeddings (proper implementation)...")
     try:
+        # Download InstantID weights
         ip_adapter_path = download_model_with_retry(
             "InstantX/InstantID",
             "ip-adapter.bin"
         )
+        # Load full state dict
+        state_dict = torch.load(ip_adapter_path, map_location="cpu")
+        # Extract image_proj and ip_adapter weights
         image_proj_state_dict = {}
+        ip_adapter_state_dict = {}
+        for key, value in state_dict.items():
             if key.startswith("image_proj."):
                 image_proj_state_dict[key.replace("image_proj.", "")] = value
             elif key.startswith("ip_adapter."):
+                ip_adapter_state_dict[key.replace("ip_adapter.", "")] = value
+        # Create Resampler (image projection model) with CORRECT parameters from reference
+        print("Creating Resampler (Perceiver architecture)...")
+        image_proj_model = Resampler(
+            dim=1280,                                       # Hidden dimension
+            depth=4,                                        # IMPORTANT: 4 layers (not 8!)
+            dim_head=64,                                    # Dimension per head
+            heads=20,                                       # Number of heads
+            num_queries=16,                                 # Number of output tokens
+            embedding_dim=512,                              # InsightFace embedding dim
+            output_dim=pipe.unet.config.cross_attention_dim,  # SDXL cross-attention dim (2048)
+            ff_mult=4                                       # Feedforward multiplier
         )
+        image_proj_model.eval()
+        image_proj_model = image_proj_model.to(device, dtype=dtype)
+        # Load image_proj weights
+        if image_proj_state_dict:
+            try:
                 image_proj_model.load_state_dict(image_proj_state_dict, strict=True)
                 print("  [OK] Resampler loaded with pretrained weights")
+            except Exception as e:
+                print(f"  [WARNING] Could not load Resampler weights: {e}")
                 print("  Using randomly initialized Resampler")
+        else:
+            print("  [WARNING] No image_proj weights found, using random initialization")
+        # Setup IP-Adapter attention processors
+        print("Setting up IP-Adapter attention processors...")
         attn_procs = {}
+        num_tokens = 16  # Match Resampler num_queries
         for name in pipe.unet.attn_processors.keys():
             cross_attention_dim = None if name.endswith("attn1.processor") else pipe.unet.config.cross_attention_dim
             if name.startswith("mid_block"):
                 hidden_size = pipe.unet.config.block_out_channels[-1]
             elif name.startswith("up_blocks"):
             elif name.startswith("down_blocks"):
                 block_id = int(name[len("down_blocks.")])
                 hidden_size = pipe.unet.config.block_out_channels[block_id]
+            else:
+                hidden_size = pipe.unet.config.block_out_channels[-1]
             if cross_attention_dim is None:
                 attn_procs[name] = AttnProcessor2_0()
                     hidden_size=hidden_size,
                     cross_attention_dim=cross_attention_dim,
                     scale=1.0,
+                    num_tokens=num_tokens
                 ).to(device, dtype=dtype)
+        # Set attention processors
         pipe.unet.set_attn_processor(attn_procs)
+        # Load IP-Adapter weights into attention processors
+        if ip_adapter_state_dict:
+            try:
+                ip_layers = torch.nn.ModuleList(pipe.unet.attn_processors.values())
+                ip_layers.load_state_dict(ip_adapter_state_dict, strict=False)
+                print("  [OK] IP-Adapter attention weights loaded")
+            except Exception as e:
+                print(f"  [WARNING] Could not load IP-Adapter weights: {e}")
+        else:
+            print("  [WARNING] No ip_adapter weights found")
+        # Store image encoder and projection model
         pipe.image_encoder = image_encoder
+        print("  [OK] IP-Adapter fully loaded with InstantID architecture")
+        print(f"  - Resampler: 4 layers, 20 heads, 16 output tokens")
+        print(f"  - Face embeddings: 512D → 16x2048D")
         return image_proj_model, True
     except Exception as e:
+        print(f"  [ERROR] Could not setup IP-Adapter: {e}")
         import traceback
         traceback.print_exc()
         return None, False
 def setup_compel(pipe):
+    """Setup Compel for better SDXL prompt handling."""
     print("Setting up Compel for enhanced prompt processing...")
     try:
         compel = Compel(
 def setup_scheduler(pipe):
+    """Setup LCM scheduler."""
     print("Setting up LCM scheduler...")
     pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
     print("  [OK] LCM scheduler configured")
 def optimize_pipeline(pipe):
+    """Apply optimizations to pipeline."""
     # Try to enable xformers
     if device == "cuda":
         try:
 def load_caption_model():
     """
+    Load caption model with proper error handling.
+    Tries multiple models in order of quality.
     """
+    print("Loading caption model...")
+    # Try GIT-Large first (good balance of quality and compatibility)
     try:
+        from transformers import AutoProcessor, AutoModelForCausalLM
+        print("  Attempting GIT-Large (recommended)...")
+        caption_processor = AutoProcessor.from_pretrained("microsoft/git-large-coco")
+        caption_model = AutoModelForCausalLM.from_pretrained(
+            "microsoft/git-large-coco",
+            torch_dtype=dtype
+        ).to(device)
+        print("  [OK] GIT-Large model loaded (produces detailed captions)")
+        return caption_processor, caption_model, True, 'git'
+    except Exception as e1:
+        print(f"  [INFO] GIT-Large not available: {e1}")
+        # Try BLIP base as fallback
         try:
+            from transformers import BlipProcessor, BlipForConditionalGeneration
+            print("  Attempting BLIP base (fallback)...")
+            caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+            caption_model = BlipForConditionalGeneration.from_pretrained(
+                "Salesforce/blip-image-captioning-base",
                 torch_dtype=dtype
             ).to(device)
+            print("  [OK] BLIP base model loaded (standard captions)")
+            return caption_processor, caption_model, True, 'blip'
+        except Exception as e2:
+            print(f"  [WARNING] Caption models not available: {e2}")
+            print("  Caption generation will be disabled")
+            return None, None, False, 'none'
 def set_clip_skip(pipe):
+    """Set CLIP skip value."""
     if hasattr(pipe, 'text_encoder'):
         print(f"  [OK] CLIP skip set to {CLIP_SKIP}")
+print("[OK] Model loading functions ready")