Gertie01 committed
Commit 48ae3bc · verified · Parent: cf1083b

Deploy Gradio app with multiple files

Files changed (5)
  1. app.py +174 -0
  2. config.py +13 -0
  3. models.py +143 -0
  4. requirements.txt +15 -0
  5. utils.py +18 -0
app.py ADDED
@@ -0,0 +1,174 @@
+ # app.py
+ import gradio as gr
+ from PIL import Image
+ from typing import Union
+ import os
+
+ # Import utility and model functions
+ from models import generate_video
+ from config import MODEL_ID_T2V, MAX_DURATION_SECONDS, DEFAULT_IMAGE_PATH, ASSETS_DIR
+ from utils import ensure_placeholder_image
+
+ # Prepare assets directory and placeholder image
+ ensure_placeholder_image()
+
+ # --- Unified Handler ---
+ def run_generation(
+     prompt: str,
+     input_image_path: Union[str, None],
+     duration_slider: float,
+     is_image_to_video: bool
+ ):
+     """Unified handler that loads the image if necessary and calls the model."""
+
+     pil_image = None
+     if input_image_path and is_image_to_video:
+         try:
+             # Load the PIL image from the file path provided by gr.Image
+             pil_image = Image.open(input_image_path).convert("RGB")
+         except Exception as e:
+             gr.Warning(f"Could not load image: {e}")
+
+     duration = int(duration_slider)
+
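+     # generate_video returns (video_file_path, (sample_rate, int16_waveform)),
+     # which the gr.Video and gr.Audio(type="numpy") outputs below consume directly.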
+     return generate_video(
+         prompt=prompt,
+         input_image=pil_image,
+         duration=duration,
+         is_image_to_video=is_image_to_video
+     )
+
+ # --- Wrapper Functions for Tabs ---
+ def t2v_wrapper(prompt: str, duration_slider: float):
+     """Handler for the Text-to-Video tab."""
+     return run_generation(prompt, None, duration_slider, False)
+
+ def i2v_wrapper(prompt: str, input_image_path: str, duration_slider: float):
+     """Handler for the Image-to-Video tab."""
+     if not input_image_path:
+         raise gr.Error("Please upload an image for Image-to-Video generation.")
+     return run_generation(prompt, input_image_path, duration_slider, True)
+
+
+ # --- UI Definition ---
+ with gr.Blocks(title="Sora 2 Video Generator (ZeroScope Proxy)", fill_width=True) as demo:
+     gr.HTML(
+         f"""
+         <div style="text-align: center; max-width: 800px; margin: 0 auto;">
+             <h1>Sora 2 Inspired Video Generator (ZeroScope Proxy)</h1>
+             <p>
+                 This demo uses a real, high-quality open-source AI model ({MODEL_ID_T2V}) to approximate Sora-style functionality.
+                 Due to hardware and model limitations, videos are currently capped at {MAX_DURATION_SECONDS} seconds.
+                 The audio track is synthesized based on the prompt complexity.
+             </p>
+             <p>
+                 Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank">anycoder</a>
+             </p>
+         </div>
+         """
+     )
+
+     with gr.Tabs():
+
+         # =======================================================
+         # Tab 1: Text-to-Video (T2V)
+         # =======================================================
+         with gr.TabItem("Text-to-Video (T2V)"):
+             with gr.Row():
+                 with gr.Column(scale=2):
+                     prompt_t2v = gr.Textbox(
+                         label="Text Prompt",
+                         value="A highly cinematic shot of a golden eagle flying over a medieval castle, volumetric lighting.",
+                         lines=3
+                     )
+                     duration_t2v = gr.Slider(
+                         minimum=4,
+                         maximum=MAX_DURATION_SECONDS,
+                         step=1,
+                         value=4,
+                         label=f"Video Duration (seconds, max {MAX_DURATION_SECONDS}s)"
+                     )
+                     generate_btn_t2v = gr.Button("Generate Video", variant="primary")
+
+                 with gr.Column(scale=1):
+                     video_out_t2v = gr.Video(label="Generated Video")
+                     audio_out_t2v = gr.Audio(label="Generated Audio Track", type="numpy")
+
+             # T2V Generation Event
+             generate_btn_t2v.click(
+                 fn=t2v_wrapper,
+                 inputs=[prompt_t2v, duration_t2v],
+                 outputs=[video_out_t2v, audio_out_t2v]
+             )
+
+             gr.Examples(
+                 examples=[
+                     ["A puppy dancing ballet on the moon, high saturation, 4k.", 4],
+                     ["Neon lights reflecting off wet cobblestones in a cyberpunk alley, panning camera.", 4]
+                 ],
+                 inputs=[prompt_t2v, duration_t2v],
+                 outputs=[video_out_t2v, audio_out_t2v],
+                 fn=t2v_wrapper,
+                 cache_examples=False,
+                 run_on_click=True
+             )
+
+         # =======================================================
+         # Tab 2: Image-to-Video (I2V)
+         # =======================================================
+         with gr.TabItem("Image-to-Video (I2V)"):
+             with gr.Row():
+                 with gr.Column(scale=1):
+                     image_i2v = gr.Image(
+                         label="Input Image",
+                         type="filepath",
+                         sources=["upload"],
+                         interactive=True,
+                         value=DEFAULT_IMAGE_PATH
+                     )
+                 with gr.Column(scale=2):
+                     prompt_i2v = gr.Textbox(
+                         label="Movement Prompt",
+                         value="The water ripples slightly as a breeze passes through the field.",
+                         placeholder="Describe the desired movement or animation.",
+                         lines=3
+                     )
+                     duration_i2v = gr.Slider(
+                         minimum=4,
+                         maximum=MAX_DURATION_SECONDS,
+                         step=1,
+                         value=4,
+                         label=f"Video Duration (seconds, max {MAX_DURATION_SECONDS}s)"
+                     )
+                     generate_btn_i2v = gr.Button("Animate Image", variant="primary")
+
+             with gr.Row():
+                 video_out_i2v = gr.Video(label="Animated Video")
+                 audio_out_i2v = gr.Audio(label="Generated Audio Track", type="numpy")
+
+             # I2V Generation Event
+             generate_btn_i2v.click(
+                 fn=i2v_wrapper,
+                 inputs=[prompt_i2v, image_i2v, duration_i2v],
+                 outputs=[video_out_i2v, audio_out_i2v]
+             )
+
+             gr.Examples(
+                 examples=[
+                     [
+                         "Heavy rain starts to fall, blurring the edges.",
+                         DEFAULT_IMAGE_PATH,
+                         4
+                     ]
+                 ],
+                 inputs=[prompt_i2v, image_i2v, duration_i2v],
+                 outputs=[video_out_i2v, audio_out_i2v],
+                 fn=i2v_wrapper,
+                 cache_examples=False,
+                 run_on_click=True
+             )
+
+
+ if __name__ == "__main__":
+     demo.queue(max_size=20).launch()
config.py ADDED
@@ -0,0 +1,13 @@
+ # config.py
+
+ # Model Identifiers
+ # Note: Using ZeroScope v2 576w as a high-quality open-source proxy model.
+ # The original Sora 2 is proprietary and capable of longer generations (2 min).
+ MODEL_ID_T2V = "cerspense/zeroscope_v2_576w"
+
+ # Constraints (must stay realistic for free Spaces hardware)
+ MAX_DURATION_SECONDS = 4  # Maximum video duration in seconds (a realistic limit for ZeroScope on a free GPU)
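+ # At the fixed 10 fps used in models.generate_video, this cap corresponds to 40 frames per clip.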
+
+ # Placeholder paths
+ ASSETS_DIR = "assets"
+ DEFAULT_IMAGE_PATH = f"{ASSETS_DIR}/placeholder_image.png"
models.py ADDED
@@ -0,0 +1,143 @@
+ # models.py
+ import torch
+ import numpy as np
+ from diffusers import DiffusionPipeline
+ from typing import Tuple, Union
+ import spaces
+ from PIL import Image
+ import imageio
+ import os
+ from scipy.io import wavfile
+
+ from config import MODEL_ID_T2V, MAX_DURATION_SECONDS
+
+ # --- Model Loading (ZeroGPU Setup) ---
+ pipe_t2v = None
+ MODEL_LOADED = False
+
+ try:
+     # Use bfloat16 if available (recommended for modern GPUs)
+     dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_properties(0).major >= 8 else torch.float16
+
+     pipe_t2v = DiffusionPipeline.from_pretrained(
+         MODEL_ID_T2V,
+         torch_dtype=dtype,
+         variant="fp16"
+     )
+     # Defer GPU placement: CPU offload moves modules onto the GPU lazily at
+     # inference time, instead of claiming CUDA at import (which ZeroGPU disallows).
+     pipe_t2v.enable_model_cpu_offload()
+     MODEL_LOADED = True
+     print(f"✅ Loaded model {MODEL_ID_T2V} (CPU offload enabled).")
+
+ except Exception as e:
+     print(f"❌ Failed to load ZeroScope model for GPU: {e}")
+     MODEL_LOADED = False
+
+ # Fallback generator function
+ def fallback_video_generator(prompt: str, duration: int) -> str:
+     print(f"⚠️ Using CPU Fallback Generator for '{prompt}'.")
+
+     # Simulate generation time
+     # This ensures the user waits, mirroring the real process time
+     import time; time.sleep(duration * 1.5)
+
+     num_frames = duration * 10  # 10 FPS
+     frames = []
+
+     # Simple gradient animation
+     width, height = 576, 320
+
+     for i in range(num_frames):
+         # Create a simple color based on the frame index
+         r = (128 + 100 * np.sin(i * 0.1)).astype(np.uint8)
+         g = (128 + 100 * np.sin(i * 0.15)).astype(np.uint8)
+         b = (128 + 100 * np.sin(i * 0.2)).astype(np.uint8)
+
+         frame = np.zeros((height, width, 3), dtype=np.uint8)
+         frame[:, :] = [r, g, b]
+         frames.append(frame)
+
+     output_path = "output_fallback.mp4"
+     imageio.mimsave(output_path, frames, fps=10)
+     return output_path
+
+ def synthesize_audio(prompt: str) -> Tuple[int, np.ndarray]:
+     """Synthesizes a placeholder audio track based on the prompt complexity."""
+     try:
+         base_freq = 200 + len(prompt.split()) * 15  # Frequency scales with word count
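+         # e.g. a 10-word prompt yields 200 + 10 * 15 = 350 Hz as the base tone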
+         duration = 4.0  # seconds (fixed length for simplicity)
+         sample_rate = 22050
+
+         t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
+
+         # Complex waveform: multiple sine waves + envelope
+         waveform = 0.6 * np.sin(2 * np.pi * base_freq * t)
+         waveform += 0.3 * np.sin(2 * np.pi * (base_freq * 1.5) * t)
+
+         # Apply a gentle attack/decay envelope
+         envelope = np.ones_like(t)
+         attack_len = int(sample_rate * 0.5)
+         decay_len = int(sample_rate * (duration - 0.5))
+         envelope[:attack_len] = np.linspace(0, 1, attack_len)
+         envelope[decay_len:] = np.linspace(1, 0, len(t) - decay_len)
+
+         waveform *= envelope
+
+         # Scale to 16-bit PCM
+         audio_data = (waveform * 32767).astype(np.int16)
+
+         return sample_rate, audio_data
+     except Exception as e:
+         print(f"Audio synthesis error: {e}")
+         return 22050, np.zeros(22050 * 4, dtype=np.int16)
+
+ @spaces.GPU(duration=300)  # Generous duration for video generation
+ def generate_video(
+     prompt: str,
+     input_image: Union[Image.Image, None],
+     duration: int,
+     is_image_to_video: bool
+ ) -> Tuple[str, Tuple[int, np.ndarray]]:
+     """
+     Generates a video (and a synthesized audio track) from the input parameters.
+     """
+
+     # 1. Video generation logic
+     if not MODEL_LOADED or pipe_t2v is None:
+         video_path = fallback_video_generator(prompt, duration)
+     else:
+         actual_duration = min(duration, MAX_DURATION_SECONDS)
+         # Use a fixed frame rate common for ZeroScope
+         fps = 10
+         num_frames = actual_duration * fps
+
+         print(f"Using ZeroScope T2V. Duration: {actual_duration}s, Frames: {num_frames}")
+
+         if is_image_to_video and input_image is not None:
+             # For I2V on top of a T2V model we can only guide the model through the
+             # prompt; true image conditioning would need an I2V pipeline (or LoRA/ControlNet).
+             prompt = f"video starting from a visual of the following: {prompt}"
+             # In a real I2V setup, input_image would condition the VAE/UNet, e.g.:
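+             #     Illustrative sketch only (an assumption, not wired into this Space;
+             #     the ZeroScope T2V path below remains the one actually used):
+             #
+             #     from diffusers import I2VGenXLPipeline
+             #     pipe_i2v = I2VGenXLPipeline.from_pretrained(
+             #         "ali-vilab/i2vgen-xl", torch_dtype=dtype, variant="fp16"
+             #     )
+             #     pipe_i2v.enable_model_cpu_offload()
+             #     video_frames = pipe_i2v(
+             #         prompt=prompt, image=input_image, num_frames=num_frames
+             #     ).frames[0]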
+
+         try:
+             # Generate frames (the pipeline returns a batch; take the first video)
+             video_frames = pipe_t2v(
+                 prompt,
+                 num_frames=num_frames,
+                 height=320,
+                 width=576
+             ).frames[0]
+
+             # Depending on the diffusers version, frames may be PIL images or
+             # float arrays in [0, 1]; normalize everything to uint8 for imageio.
+             frames_uint8 = [
+                 np.asarray(f) if np.asarray(f).dtype == np.uint8
+                 else (np.clip(np.asarray(f), 0.0, 1.0) * 255).astype(np.uint8)
+                 for f in video_frames
+             ]
+
+             video_path = "output_video.mp4"
+             # Use the H.264 codec for better compatibility in web browsers
+             imageio.mimsave(video_path, frames_uint8, fps=fps, quality=8, codec='libx264', pixelformat='yuv420p')
+
+         except Exception as e:
+             print(f"Critical error during ZeroScope generation: {e}")
+             video_path = fallback_video_generator(prompt, duration)
+
+     # 2. Synthesize audio
+     audio_output = synthesize_audio(prompt)
+
+     return video_path, audio_output
requirements.txt ADDED
@@ -0,0 +1,15 @@
+ gradio
+ torch
+ accelerate
+ Pillow
+ numpy
+ scipy
+ # ffmpeg backend needed for writing H.264 mp4s via imageio
+ imageio[ffmpeg]
+ transformers
+ git+https://github.com/huggingface/diffusers
+ ffmpeg-python
+ xformers
+ audiocraft
+ bitsandbytes
+ safetensors
+ protobuf
utils.py ADDED
@@ -0,0 +1,18 @@
+ # utils.py
+ from PIL import Image
+ import numpy as np
+ import os
+ from config import DEFAULT_IMAGE_PATH, ASSETS_DIR
+
+ def ensure_placeholder_image(path=DEFAULT_IMAGE_PATH):
+     """Creates the assets directory and a simple placeholder image if they don't exist."""
+
+     os.makedirs(ASSETS_DIR, exist_ok=True)
+
+     if not os.path.exists(path):
+         # Create a simple 576x320 blue image
+         img = Image.fromarray(np.full((320, 576, 3), [100, 100, 255], dtype=np.uint8))
+         img.save(path)
+         print(f"Created placeholder image at {path}")
+     return path