# app.py
import gradio as gr
from PIL import Image
from typing import Union

# Import utility and model functions.
# MODEL_ID_T2V is interpolated into the header HTML below; it is assumed to
# live in config alongside the other constants.
from models import generate_video
from config import MAX_DURATION_SECONDS, DEFAULT_IMAGE_PATH, MODEL_ID_T2V
from utils import ensure_placeholder_image

# Prepare assets directory and placeholder image
ensure_placeholder_image()


# --- Unified Handler ---
def run_generation(
    prompt: str,
    input_image_path: Union[str, None],
    duration_slider: float,
    is_image_to_video: bool
):
    """Unified handler that loads the input image if needed and calls the model."""
    pil_image = None
    if input_image_path and is_image_to_video:
        try:
            # Load the PIL image from the file path provided by gr.Image
            pil_image = Image.open(input_image_path).convert("RGB")
        except Exception as e:
            # Warn in the UI and fall back to generating without the image
            gr.Warning(f"Could not load image: {e}")

    duration = int(duration_slider)
    return generate_video(
        prompt=prompt,
        input_image=pil_image,
        duration=duration,
        is_image_to_video=is_image_to_video
    )


# --- Wrapper Functions for Tabs ---
def t2v_wrapper(prompt: str, duration_slider: float):
    """Handler for the Text-to-Video tab."""
    return run_generation(prompt, None, duration_slider, False)


def i2v_wrapper(prompt: str, input_image_path: str, duration_slider: float):
    """Handler for the Image-to-Video tab."""
    if not input_image_path:
        raise gr.Error("Please upload an image for Image-to-Video generation.")
    return run_generation(prompt, input_image_path, duration_slider, True)


# --- UI Definition ---
with gr.Blocks(title="Sora 2 Video Generator (ZeroScope Proxy)", fill_width=True) as demo:
    gr.HTML(
        f"""

        <div style="text-align: center;">
            <h1>Sora 2 Inspired Video Generator (ZeroScope Proxy)</h1>
            <p>
                This demo uses a real, high-quality open-source AI model
                ({MODEL_ID_T2V}) to approximate Sora's functionality. Due to
                hardware and model limitations, videos are currently capped at
                {MAX_DURATION_SECONDS} seconds. The audio track is synthesized
                based on the complexity of the prompt.
            </p>
            <p>Built with anycoder</p>
        </div>

""" ) with gr.Tabs(): # ======================================================= # Tab 1: Text-to-Video (T2V) # ======================================================= with gr.TabItem("Text-to-Video (T2V)"): with gr.Row(): with gr.Column(scale=2): prompt_t2v = gr.Textbox( label="Text Prompt", value="A highly cinematic shot of a golden eagle flying over a medieval castle, volumetric lighting.", lines=3 ) duration_t2v = gr.Slider( minimum=4, maximum=MAX_DURATION_SECONDS, step=1, value=4, label=f"Video Duration (seconds, max {MAX_DURATION_SECONDS}s)" ) generate_btn_t2v = gr.Button("Generate Video", variant="primary") with gr.Column(scale=1): video_out_t2v = gr.Video(label="Generated Video") audio_out_t2v = gr.Audio(label="Generated Audio Track", type="numpy") # T2V Generation Event generate_btn_t2v.click( fn=t2v_wrapper, inputs=[prompt_t2v, duration_t2v], outputs=[video_out_t2v, audio_out_t2v] ) gr.Examples( examples=[ ["A puppy dancing ballet on the moon, high saturation, 4k.", 4], ["Neon lights reflecting off wet cobblestones in a cyberpunk alley, panning camera.", 4] ], inputs=[prompt_t2v, duration_t2v], outputs=[video_out_t2v, audio_out_t2v], fn=t2v_wrapper, cache_examples=False, run_on_click=True ) # ======================================================= # Tab 2: Image-to-Video (I2V) # ======================================================= with gr.TabItem("Image-to-Video (I2V)"): with gr.Row(): with gr.Column(scale=1): image_i2v = gr.Image( label="Input Image", type="filepath", sources=["upload"], interactive=True, value=DEFAULT_IMAGE_PATH ) with gr.Column(scale=2): prompt_i2v = gr.Textbox( label="Movement Prompt", value="The water ripples slightly as a breeze passes through the field.", placeholder="Describe the desired movement or animation.", lines=3 ) duration_i2v = gr.Slider( minimum=4, maximum=MAX_DURATION_SECONDS, step=1, value=4, label=f"Video Duration (seconds, max {MAX_DURATION_SECONDS}s)" ) generate_btn_i2v = gr.Button("Animate Image", variant="primary") with gr.Row(): video_out_i2v = gr.Video(label="Animated Video") audio_out_i2v = gr.Audio(label="Generated Audio Track", type="numpy") # I2V Generation Event generate_btn_i2v.click( fn=i2v_wrapper, inputs=[prompt_i2v, image_i2v, duration_i2v], outputs=[video_out_i2v, audio_out_i2v] ) gr.Examples( examples=[ [ "Heavy rain starts to fall, blurring the edges.", DEFAULT_IMAGE_PATH, 4 ] ], inputs=[prompt_i2v, image_i2v, duration_i2v], outputs=[video_out_i2v, audio_out_i2v], fn=i2v_wrapper, cache_examples=False, run_on_click=True ) if __name__ == "__main__": demo.queue(max_size=20).launch()