Gertie01 committed
Commit 48ae3bc · verified · Parent: cf1083b

Deploy Gradio app with multiple files

Files changed (5)
  1. app.py +174 -0
  2. config.py +13 -0
  3. models.py +143 -0
  4. requirements.txt +15 -0
  5. utils.py +18 -0
app.py ADDED
@@ -0,0 +1,174 @@
+ # app.py
+ import gradio as gr
+ from PIL import Image
+ from typing import Union
+ import os
+
+ # Import utility and model functions
+ from models import generate_video
+ from config import MODEL_ID_T2V, MAX_DURATION_SECONDS, DEFAULT_IMAGE_PATH, ASSETS_DIR
+ from utils import ensure_placeholder_image
+
+ # Prepare assets directory and placeholder image
+ ensure_placeholder_image()
+
+ # --- Unified Handler ---
+ def run_generation(
+     prompt: str,
+     input_image_path: Union[str, None],
+     duration_slider: float,
+     is_image_to_video: bool
+ ):
+     """Unified handler that loads the image if necessary and calls the model."""
+
+     pil_image = None
+     if input_image_path and is_image_to_video:
+         try:
+             # Load the PIL image from the file path provided by gr.Image
+             pil_image = Image.open(input_image_path).convert("RGB")
+         except Exception as e:
+             gr.Warning(f"Could not load image: {e}")
+
+     duration = int(duration_slider)
+
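+     # generate_video returns (video_file_path, (sample_rate, int16_waveform)),
+     # which the gr.Video and gr.Audio(type="numpy") outputs below consume directly.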
+     return generate_video(
+         prompt=prompt,
+         input_image=pil_image,
+         duration=duration,
+         is_image_to_video=is_image_to_video
+     )
+
+ # --- Wrapper Functions for Tabs ---
+ def t2v_wrapper(prompt: str, duration_slider: float):
+     """Handler for the Text-to-Video tab."""
+     return run_generation(prompt, None, duration_slider, False)
+
+ def i2v_wrapper(prompt: str, input_image_path: str, duration_slider: float):
+     """Handler for the Image-to-Video tab."""
+     if not input_image_path:
+         raise gr.Error("Please upload an image for Image-to-Video generation.")
+     return run_generation(prompt, input_image_path, duration_slider, True)
+
+
+ # --- UI Definition ---
+ with gr.Blocks(title="Sora 2 Video Generator (ZeroScope Proxy)", fill_width=True) as demo:
+     gr.HTML(
+         f"""
+         <div style="text-align: center; max-width: 800px; margin: 0 auto;">
+             <h1>Sora 2 Inspired Video Generator (ZeroScope Proxy)</h1>
+             <p>
+                 This demo uses a real, high-quality open-source AI model ({MODEL_ID_T2V}) to approximate Sora-style functionality.
+                 Due to hardware and model limitations, videos are currently capped at {MAX_DURATION_SECONDS} seconds.
+                 The audio track is synthesized based on the prompt complexity.
+             </p>
+             <p>
+                 Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank">anycoder</a>
+             </p>
+         </div>
+         """
+     )
+
+     with gr.Tabs():
+
+         # =======================================================
+         # Tab 1: Text-to-Video (T2V)
+         # =======================================================
+         with gr.TabItem("Text-to-Video (T2V)"):
+             with gr.Row():
+                 with gr.Column(scale=2):
+                     prompt_t2v = gr.Textbox(
+                         label="Text Prompt",
+                         value="A highly cinematic shot of a golden eagle flying over a medieval castle, volumetric lighting.",
+                         lines=3
+                     )
+                     duration_t2v = gr.Slider(
+                         minimum=4,
+                         maximum=MAX_DURATION_SECONDS,
+                         step=1,
+                         value=4,
+                         label=f"Video Duration (seconds, max {MAX_DURATION_SECONDS}s)"
+                     )
+                     generate_btn_t2v = gr.Button("Generate Video", variant="primary")
+
+                 with gr.Column(scale=1):
+                     video_out_t2v = gr.Video(label="Generated Video")
+                     audio_out_t2v = gr.Audio(label="Generated Audio Track", type="numpy")
+
+             # T2V Generation Event
+             generate_btn_t2v.click(
+                 fn=t2v_wrapper,
+                 inputs=[prompt_t2v, duration_t2v],
+                 outputs=[video_out_t2v, audio_out_t2v]
+             )
+
+             gr.Examples(
+                 examples=[
+                     ["A puppy dancing ballet on the moon, high saturation, 4k.", 4],
+                     ["Neon lights reflecting off wet cobblestones in a cyberpunk alley, panning camera.", 4]
+                 ],
+                 inputs=[prompt_t2v, duration_t2v],
+                 outputs=[video_out_t2v, audio_out_t2v],
+                 fn=t2v_wrapper,
+                 cache_examples=False,
+                 run_on_click=True
+             )
+
+         # =======================================================
+         # Tab 2: Image-to-Video (I2V)
+         # =======================================================
+         with gr.TabItem("Image-to-Video (I2V)"):
+             with gr.Row():
+                 with gr.Column(scale=1):
+                     image_i2v = gr.Image(
+                         label="Input Image",
+                         type="filepath",
+                         sources=["upload"],
+                         interactive=True,
+                         value=DEFAULT_IMAGE_PATH
+                     )
+                 with gr.Column(scale=2):
+                     prompt_i2v = gr.Textbox(
+                         label="Movement Prompt",
+                         value="The water ripples slightly as a breeze passes through the field.",
+                         placeholder="Describe the desired movement or animation.",
+                         lines=3
+                     )
+                     duration_i2v = gr.Slider(
+                         minimum=4,
+                         maximum=MAX_DURATION_SECONDS,
+                         step=1,
+                         value=4,
+                         label=f"Video Duration (seconds, max {MAX_DURATION_SECONDS}s)"
+                     )
+                     generate_btn_i2v = gr.Button("Animate Image", variant="primary")
+
+             with gr.Row():
+                 video_out_i2v = gr.Video(label="Animated Video")
+                 audio_out_i2v = gr.Audio(label="Generated Audio Track", type="numpy")
+
+             # I2V Generation Event
+             generate_btn_i2v.click(
+                 fn=i2v_wrapper,
+                 inputs=[prompt_i2v, image_i2v, duration_i2v],
+                 outputs=[video_out_i2v, audio_out_i2v]
+             )
+
+             gr.Examples(
+                 examples=[
+                     [
+                         "Heavy rain starts to fall, blurring the edges.",
+                         DEFAULT_IMAGE_PATH,
+                         4
+                     ]
+                 ],
+                 inputs=[prompt_i2v, image_i2v, duration_i2v],
+                 outputs=[video_out_i2v, audio_out_i2v],
+                 fn=i2v_wrapper,
+                 cache_examples=False,
+                 run_on_click=True
+             )
+
+
+ if __name__ == "__main__":
+     demo.queue(max_size=20).launch()
config.py ADDED
@@ -0,0 +1,13 @@
+ # config.py
+
+ # Model Identifiers
+ # Note: Using ZeroScope v2 576w as a high-quality open-source proxy model.
+ # The original Sora 2 is proprietary and capable of longer generations (2 min).
+ MODEL_ID_T2V = "cerspense/zeroscope_v2_576w"
+
+ # Constraints (must stay realistic for free Spaces hardware)
+ MAX_DURATION_SECONDS = 4  # Maximum video duration in seconds (a realistic limit for ZeroScope on a free GPU)
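+ # At the fixed 10 fps used in models.generate_video, this cap corresponds to 40 frames per clip.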
+
+ # Placeholder paths
+ ASSETS_DIR = "assets"
+ DEFAULT_IMAGE_PATH = f"{ASSETS_DIR}/placeholder_image.png"
models.py ADDED
@@ -0,0 +1,143 @@
+ # models.py
+ import torch
+ import numpy as np
+ from diffusers import DiffusionPipeline
+ from typing import Tuple, Union
+ import spaces
+ from PIL import Image
+ import imageio
+ import os
+ from scipy.io import wavfile
+
+ from config import MODEL_ID_T2V, MAX_DURATION_SECONDS
+
+ # --- Model Loading (ZeroGPU Setup) ---
+ pipe_t2v = None
+ MODEL_LOADED = False
+
+ try:
+     # Use bfloat16 if available (recommended for modern GPUs)
+     dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_properties(0).major >= 8 else torch.float16
+
+     pipe_t2v = DiffusionPipeline.from_pretrained(
+         MODEL_ID_T2V,
+         torch_dtype=dtype,
+         variant="fp16"
+     )
+     # Defer GPU placement: CPU offload moves modules onto the GPU lazily at
+     # inference time, instead of claiming CUDA at import (which ZeroGPU disallows).
+     pipe_t2v.enable_model_cpu_offload()
+     MODEL_LOADED = True
+     print(f"✅ Loaded model {MODEL_ID_T2V} (CPU offload enabled).")
+
+ except Exception as e:
+     print(f"❌ Failed to load ZeroScope model for GPU: {e}")
+     MODEL_LOADED = False
+
+ # Fallback generator function
+ def fallback_video_generator(prompt: str, duration: int) -> str:
+     print(f"⚠️ Using CPU Fallback Generator for '{prompt}'.")
+
+     # Simulate generation time
+     # This ensures the user waits, mirroring the real process time
+     import time; time.sleep(duration * 1.5)
+
+     num_frames = duration * 10  # 10 FPS
+     frames = []
+
+     # Simple gradient animation
+     width, height = 576, 320
+
+     for i in range(num_frames):
+         # Create a simple color based on the frame index
+         r = (128 + 100 * np.sin(i * 0.1)).astype(np.uint8)
+         g = (128 + 100 * np.sin(i * 0.15)).astype(np.uint8)
+         b = (128 + 100 * np.sin(i * 0.2)).astype(np.uint8)
+
+         frame = np.zeros((height, width, 3), dtype=np.uint8)
+         frame[:, :] = [r, g, b]
+         frames.append(frame)
+
+     output_path = "output_fallback.mp4"
+     imageio.mimsave(output_path, frames, fps=10)
+     return output_path
+
+ def synthesize_audio(prompt: str) -> Tuple[int, np.ndarray]:
+     """Synthesizes a placeholder audio track based on the prompt complexity."""
+     try:
+         base_freq = 200 + len(prompt.split()) * 15  # Frequency scales with word count
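+         # e.g. a 10-word prompt yields 200 + 10 * 15 = 350 Hz as the base tone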
+         duration = 4.0  # seconds (fixed length for simplicity)
+         sample_rate = 22050
+
+         t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
+
+         # Complex waveform: multiple sine waves + envelope
+         waveform = 0.6 * np.sin(2 * np.pi * base_freq * t)
+         waveform += 0.3 * np.sin(2 * np.pi * (base_freq * 1.5) * t)
+
+         # Apply a gentle attack/decay envelope
+         envelope = np.ones_like(t)
+         attack_len = int(sample_rate * 0.5)
+         decay_len = int(sample_rate * (duration - 0.5))
+         envelope[:attack_len] = np.linspace(0, 1, attack_len)
+         envelope[decay_len:] = np.linspace(1, 0, len(t) - decay_len)
+
+         waveform *= envelope
+
+         # Scale to 16-bit PCM
+         audio_data = (waveform * 32767).astype(np.int16)
+
+         return sample_rate, audio_data
+     except Exception as e:
+         print(f"Audio synthesis error: {e}")
+         return 22050, np.zeros(22050 * 4, dtype=np.int16)
+
+ @spaces.GPU(duration=300)  # Generous duration for video generation
+ def generate_video(
+     prompt: str,
+     input_image: Union[Image.Image, None],
+     duration: int,
+     is_image_to_video: bool
+ ) -> Tuple[str, Tuple[int, np.ndarray]]:
+     """
+     Generates a video (and a synthesized audio track) from the input parameters.
+     """
+
+     # 1. Video generation logic
+     if not MODEL_LOADED or pipe_t2v is None:
+         video_path = fallback_video_generator(prompt, duration)
+     else:
+         actual_duration = min(duration, MAX_DURATION_SECONDS)
+         # Use a fixed frame rate common for ZeroScope
+         fps = 10
+         num_frames = actual_duration * fps
+
+         print(f"Using ZeroScope T2V. Duration: {actual_duration}s, Frames: {num_frames}")
+
+         if is_image_to_video and input_image is not None:
+             # For I2V on top of a T2V model we can only guide the model through the
+             # prompt; true image conditioning would need an I2V pipeline (or LoRA/ControlNet).
+             prompt = f"video starting from a visual of the following: {prompt}"
+             # In a real I2V setup, input_image would condition the VAE/UNet, e.g.:
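+             #     Illustrative sketch only (an assumption, not wired into this Space;
+             #     the ZeroScope T2V path below remains the one actually used):
+             #
+             #     from diffusers import I2VGenXLPipeline
+             #     pipe_i2v = I2VGenXLPipeline.from_pretrained(
+             #         "ali-vilab/i2vgen-xl", torch_dtype=dtype, variant="fp16"
+             #     )
+             #     pipe_i2v.enable_model_cpu_offload()
+             #     video_frames = pipe_i2v(
+             #         prompt=prompt, image=input_image, num_frames=num_frames
+             #     ).frames[0]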
+
+         try:
+             # Generate frames (the pipeline returns a batch; take the first video)
+             video_frames = pipe_t2v(
+                 prompt,
+                 num_frames=num_frames,
+                 height=320,
+                 width=576
+             ).frames[0]
+
+             # Depending on the diffusers version, frames may be PIL images or
+             # float arrays in [0, 1]; normalize everything to uint8 for imageio.
+             frames_uint8 = [
+                 np.asarray(f) if np.asarray(f).dtype == np.uint8
+                 else (np.clip(np.asarray(f), 0.0, 1.0) * 255).astype(np.uint8)
+                 for f in video_frames
+             ]
+
+             video_path = "output_video.mp4"
+             # Use the H.264 codec for better compatibility in web browsers
+             imageio.mimsave(video_path, frames_uint8, fps=fps, quality=8, codec='libx264', pixelformat='yuv420p')
+
+         except Exception as e:
+             print(f"Critical error during ZeroScope generation: {e}")
+             video_path = fallback_video_generator(prompt, duration)
+
+     # 2. Synthesize audio
+     audio_output = synthesize_audio(prompt)
+
+     return video_path, audio_output
requirements.txt ADDED
@@ -0,0 +1,15 @@
+ gradio
+ torch
+ accelerate
+ Pillow
+ numpy
+ scipy
+ # ffmpeg backend needed for writing H.264 mp4s via imageio
+ imageio[ffmpeg]
+ transformers
+ git+https://github.com/huggingface/diffusers
+ ffmpeg-python
+ xformers
+ audiocraft
+ bitsandbytes
+ safetensors
+ protobuf
utils.py ADDED
@@ -0,0 +1,18 @@
+ # utils.py
+ from PIL import Image
+ import numpy as np
+ import os
+ from config import DEFAULT_IMAGE_PATH, ASSETS_DIR
+
+ def ensure_placeholder_image(path=DEFAULT_IMAGE_PATH):
+     """Creates the assets directory and a simple placeholder image if they don't exist."""
+
+     os.makedirs(ASSETS_DIR, exist_ok=True)
+
+     if not os.path.exists(path):
+         # Create a simple 576x320 blue image
+         img = Image.fromarray(np.full((320, 576, 3), [100, 100, 255], dtype=np.uint8))
+         img.save(path)
+         print(f"Created placeholder image at {path}")
+     return path