# models.py
import time
from typing import Tuple, Union

import imageio
import numpy as np
import spaces
import torch
from diffusers import DiffusionPipeline
from PIL import Image
from scipy.io import wavfile

from config import MODEL_ID_T2V, MAX_DURATION_SECONDS
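
# config.py is expected to define the two names imported above. Illustrative
# values only (assumptions, not taken from this project's actual config):
#   MODEL_ID_T2V = "cerspense/zeroscope_v2_576w"   # a ZeroScope T2V checkpoint
#   MAX_DURATION_SECONDS = 10                      # cap on requested clip length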

# --- Model Loading (ZeroGPU Setup) ---
pipe_t2v = None
MODEL_LOADED = False

try:
    # Prefer bfloat16 on GPUs with compute capability >= 8.0 (Ampere or newer)
    dtype = (
        torch.bfloat16
        if torch.cuda.is_available() and torch.cuda.get_device_properties(0).major >= 8
        else torch.float16
    )
    
    pipe_t2v = DiffusionPipeline.from_pretrained(
        MODEL_ID_T2V, 
        torch_dtype=dtype, 
        variant="fp16"
    )
    # Move the whole pipeline to CUDA (the usual ZeroGPU pattern). If VRAM were
    # tight, pipe_t2v.enable_model_cpu_offload() could be used *instead*; the
    # two should not be combined, since offload manages device placement itself.
    pipe_t2v.to("cuda")
    MODEL_LOADED = True
    print(f"✅ Loaded model {MODEL_ID_T2V} to CUDA.")

except Exception as e:
    print(f"❌ Failed to load ZeroScope model for GPU: {e}")
    MODEL_LOADED = False

# Fallback generator: renders a simple animated gradient on the CPU when the
# diffusion pipeline is unavailable.
def fallback_video_generator(prompt: str, duration: int) -> str:
    print(f"⚠️ Using CPU Fallback Generator for '{prompt}'.")

    # Simulate generation time so the user still waits, mirroring the real process
    time.sleep(duration * 1.5)

    fps = 10
    num_frames = duration * fps
    frames = []

    # Simple gradient animation at the same resolution as the real pipeline
    width, height = 576, 320

    for i in range(num_frames):
        # Derive an RGB colour from the frame index (values stay within 28..228)
        r = int(128 + 100 * np.sin(i * 0.10))
        g = int(128 + 100 * np.sin(i * 0.15))
        b = int(128 + 100 * np.sin(i * 0.20))

        frame = np.zeros((height, width, 3), dtype=np.uint8)
        frame[:, :] = [r, g, b]
        frames.append(frame)

    output_path = "output_fallback.mp4"
    imageio.mimsave(output_path, frames, fps=fps)
    return output_path

def synthesize_audio(prompt: str) -> Tuple[int, np.ndarray]:
    """Synthesizes placeholder audio based on the prompt complexity."""
    try:
        base_freq = 200 + len(prompt.split()) * 15 # Frequency scales with word count
        duration = 4.0 # seconds (fixed length for simplicity)
        sample_rate = 22050
        
        t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
        
        # Complex waveform: multiple sine waves + envelope
        waveform = 0.6 * np.sin(2 * np.pi * base_freq * t)
        waveform += 0.3 * np.sin(2 * np.pi * (base_freq * 1.5) * t)
        
        # Apply a gentle envelope: 0.5 s attack (fade-in) and 0.5 s release (fade-out)
        envelope = np.ones_like(t)
        attack_len = int(sample_rate * 0.5)
        decay_start = int(sample_rate * (duration - 0.5))
        envelope[:attack_len] = np.linspace(0, 1, attack_len)
        envelope[decay_start:] = np.linspace(1, 0, len(t) - decay_start)
        
        waveform *= envelope
        
        # Scale to 16-bit PCM
        audio_data = (waveform * 32767).astype(np.int16)
        
        return sample_rate, audio_data
    except Exception as e:
        print(f"Audio synthesis error: {e}")
        return 22050, np.zeros(22050 * 4, dtype=np.int16)
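
# Illustrative helper (not called anywhere in this module): shows how the
# (sample_rate, int16 samples) tuple from synthesize_audio() could be written
# to disk with scipy.io.wavfile if a WAV file were needed. The path argument
# is the caller's choice.
def save_audio_wav(path: str, audio: Tuple[int, np.ndarray]) -> str:
    """Write a (sample_rate, samples) tuple to a WAV file and return its path."""
    sample_rate, samples = audio
    wavfile.write(path, sample_rate, samples)
    return path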

@spaces.GPU(duration=300) # Generous duration for video generation
def generate_video(
    prompt: str, 
    input_image: Union[Image.Image, None], 
    duration: int, 
    is_image_to_video: bool
) -> Tuple[str, Tuple[int, np.ndarray]]:
    """
    Generates a video (and synthesized audio) based on the input parameters.
    """
    
    # 1. Video generation logic
    if not MODEL_LOADED or pipe_t2v is None:
        video_path = fallback_video_generator(prompt, duration)
    else:
        actual_duration = min(duration, MAX_DURATION_SECONDS)
        fps = 10  # fixed frame rate commonly used with ZeroScope
        num_frames = int(actual_duration * fps)
        
        print(f"Using ZeroScope T2V. Duration: {actual_duration}s, Frames: {num_frames}")

        if is_image_to_video and input_image is not None:
            # With a pure T2V pipeline the image cannot condition generation
            # directly, so we only steer the model through the prompt; true I2V
            # would need image conditioning of the VAE/UNet (or a LoRA/ControlNet).
            prompt = f"video starting from a visual of the following: {prompt}"
            
        try:
            # Generate frames
            video_frames = pipe_t2v(
                prompt, num_frames=num_frames, height=320, width=576
            ).frames
            # Recent diffusers versions add a leading batch dimension; unwrap it
            if len(video_frames) == 1 and num_frames > 1:
                video_frames = video_frames[0]
            # Pipeline frames are typically floats in [0, 1]; convert to uint8
            frames_u8 = [
                f if f.dtype == np.uint8 else (f * 255).clip(0, 255).astype(np.uint8)
                for f in (np.asarray(fr) for fr in video_frames)
            ]

            video_path = "output_video.mp4"
            # H.264 + yuv420p for playback compatibility in web browsers
            imageio.mimsave(video_path, frames_u8, fps=fps, quality=8,
                            codec="libx264", pixelformat="yuv420p")
            
        except Exception as e:
            print(f"Critical Error during ZeroScope generation: {e}")
            video_path = fallback_video_generator(prompt, duration)

    # 2. Synthesize audio
    audio_output = synthesize_audio(prompt)

    return video_path, audio_output
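

# Minimal local smoke test (a sketch, not part of the Gradio app). Assumes the
# `spaces` decorator degrades gracefully when run outside a Hugging Face Space,
# and the prompt/duration below are placeholder values.
if __name__ == "__main__":
    path, (sr, samples) = generate_video(
        prompt="a corgi running on a beach at sunset",
        input_image=None,
        duration=2,
        is_image_to_video=False,
    )
    print(f"Video written to {path}; audio: {len(samples) / sr:.1f}s at {sr} Hz")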