import spaces
import torch
from diffusers.pipelines.wan.pipeline_wan_i2v import WanImageToVideoPipeline
from diffusers.models.transformers.transformer_wan import WanTransformer3DModel
from diffusers.utils.export_utils import export_to_video
import gradio as gr
import tempfile
import numpy as np
from PIL import Image
import random
import gc
import requests
import time
from torchao.quantization import quantize_
from torchao.quantization import Float8DynamicActivationFloat8WeightConfig
from torchao.quantization import Int8WeightOnlyConfig
import aoti

MODEL_ID = "Wan-AI/Wan2.2-I2V-A14B-Diffusers"

# Output resolution constraints: each side of the generated video lies in
# [MIN_DIM, MAX_DIM] and is a multiple of MULTIPLE_OF; square inputs map to 640x640.
MAX_DIM = 832
MIN_DIM = 480
SQUARE_DIM = 640
MULTIPLE_OF = 16

MAX_SEED = np.iinfo(np.int32).max

# The model generates at a fixed 16 fps; clip length is limited to 8-80 frames.
FIXED_FPS = 16
MIN_FRAMES_MODEL = 8
MAX_FRAMES_MODEL = 80

MIN_DURATION = round(MIN_FRAMES_MODEL / FIXED_FPS, 1)
MAX_DURATION = round(MAX_FRAMES_MODEL / FIXED_FPS, 1)


# --- Translation Functions ---
@spaces.GPU
def translate_albanian_to_english(text):
    """Translate from Albanian to English using the sepioo-facebook-translation API."""
    if not text.strip():
        raise gr.Error("Please enter a description.")
    for attempt in range(2):
        try:
            response = requests.post(
                "https://hal1993-mdftranslation1234567890abcdef1234567890-fc073a6.hf.space/v1/translate",
                json={"from_language": "sq", "to_language": "en", "input_text": text},
                headers={"accept": "application/json", "Content-Type": "application/json"},
                timeout=5,
            )
            response.raise_for_status()
            translated = response.json().get("translate", "")
            print(f"Translation response (sq->en): {translated}")
            return translated
        except Exception as e:
            print(f"Translation error (attempt {attempt + 1}): {e}")
            if attempt == 1:
                raise gr.Error("Translation failed. Please try again.")
    raise gr.Error("Translation failed. Please try again.")


# Load the Wan 2.2 image-to-video pipeline with both transformer stages in bf16.
pipe = WanImageToVideoPipeline.from_pretrained(
    MODEL_ID,
    transformer=WanTransformer3DModel.from_pretrained(
        'cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers',
        subfolder='transformer',
        torch_dtype=torch.bfloat16,
        device_map='cuda',
    ),
    transformer_2=WanTransformer3DModel.from_pretrained(
        'cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers',
        subfolder='transformer_2',
        torch_dtype=torch.bfloat16,
        device_map='cuda',
    ),
    torch_dtype=torch.bfloat16,
).to('cuda')

# Fuse the Lightx2v step-distillation LoRA into both transformers so the pipeline
# can run with few inference steps, then drop the adapter bookkeeping.
pipe.load_lora_weights(
    "Kijai/WanVideo_comfy",
    weight_name="Lightx2v/lightx2v_I2V_14B_480p_cfg_step_distill_rank128_bf16.safetensors",
    adapter_name="lightx2v",
)
kwargs_lora = {}
kwargs_lora["load_into_transformer_2"] = True
pipe.load_lora_weights(
    "Kijai/WanVideo_comfy",
    weight_name="Lightx2v/lightx2v_I2V_14B_480p_cfg_step_distill_rank128_bf16.safetensors",
    adapter_name="lightx2v_2",
    **kwargs_lora,
)
pipe.set_adapters(["lightx2v", "lightx2v_2"], adapter_weights=[1., 1.])
pipe.fuse_lora(adapter_names=["lightx2v"], lora_scale=3., components=["transformer"])
pipe.fuse_lora(adapter_names=["lightx2v_2"], lora_scale=1., components=["transformer_2"])
pipe.unload_lora_weights()

# Quantize: int8 weight-only for the text encoder, FP8 dynamic activation/weight
# for both transformers, then load ahead-of-time compiled transformer blocks.
quantize_(pipe.text_encoder, Int8WeightOnlyConfig())
quantize_(pipe.transformer, Float8DynamicActivationFloat8WeightConfig())
quantize_(pipe.transformer_2, Float8DynamicActivationFloat8WeightConfig())

aoti.aoti_blocks_load(pipe.transformer, 'zerogpu-aoti/Wan2', variant='fp8da')
aoti.aoti_blocks_load(pipe.transformer_2, 'zerogpu-aoti/Wan2', variant='fp8da')

default_prompt_i2v = "make this image come alive, cinematic motion, smooth animation"
# Standard Wan negative prompt, kept in Chinese as the model expects. Roughly:
# "garish colors, overexposed, static, blurry details, subtitles, style, artwork,
# painting, still frame, overall gray, worst quality, low quality, JPEG artifacts,
# ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn face, deformed,
# disfigured, malformed limbs, fused fingers, motionless frame, cluttered background,
# three legs, many people in the background, walking backwards".
default_negative_prompt = "色调艳丽, 过曝, 静态, 细节模糊不清, 字幕, 风格, 作品, 画作, 画面, 静止, 整体发灰, 最差质量, 低质量, JPEG压缩残留, 丑陋的, 残缺的, 多余的手指, 画得不好的手部, 画得不好的脸部, 畸形的, 毁容的, 形态畸形的肢体, 手指融合, 静止不动的画面, 杂乱的背景, 三条腿, 背景人很多, 倒着走"
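
# Illustrative sketch (assumption, not part of the original app):
# `translate_albanian_to_english` above is defined but never referenced elsewhere
# in this file. If Albanian prompts should be supported before generation, one
# option is a thin wrapper like the one below; its name and `translate` flag are
# hypothetical, and the function is only defined here, never called.
def translate_prompt_if_needed(text: str, translate: bool = False) -> str:
    # Route the prompt through the sq->en API helper only when requested;
    # otherwise pass it through unchanged.
    return translate_albanian_to_english(text) if translate else text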
def resize_image(image: Image.Image) -> Image.Image:
    """Resize (and center-crop if needed) so both sides are multiples of
    MULTIPLE_OF and lie within [MIN_DIM, MAX_DIM]."""
    width, height = image.size
    if width == height:
        return image.resize((SQUARE_DIM, SQUARE_DIM), Image.LANCZOS)

    aspect_ratio = width / height
    MAX_ASPECT_RATIO = MAX_DIM / MIN_DIM
    MIN_ASPECT_RATIO = MIN_DIM / MAX_DIM

    image_to_resize = image
    if aspect_ratio > MAX_ASPECT_RATIO:
        # Too wide: center-crop horizontally to the maximum allowed aspect ratio.
        target_w, target_h = MAX_DIM, MIN_DIM
        crop_width = int(round(height * MAX_ASPECT_RATIO))
        left = (width - crop_width) // 2
        image_to_resize = image.crop((left, 0, left + crop_width, height))
    elif aspect_ratio < MIN_ASPECT_RATIO:
        # Too tall: center-crop vertically to the minimum allowed aspect ratio.
        target_w, target_h = MIN_DIM, MAX_DIM
        crop_height = int(round(width / MIN_ASPECT_RATIO))
        top = (height - crop_height) // 2
        image_to_resize = image.crop((0, top, width, top + crop_height))
    else:
        # Aspect ratio already in range: scale the longer side to MAX_DIM.
        if width > height:
            target_w = MAX_DIM
            target_h = int(round(target_w / aspect_ratio))
        else:
            target_h = MAX_DIM
            target_w = int(round(target_h * aspect_ratio))

    # Snap to the nearest multiple of 16 and clamp to the allowed range.
    final_w = round(target_w / MULTIPLE_OF) * MULTIPLE_OF
    final_h = round(target_h / MULTIPLE_OF) * MULTIPLE_OF
    final_w = max(MIN_DIM, min(MAX_DIM, final_w))
    final_h = max(MIN_DIM, min(MAX_DIM, final_h))
    return image_to_resize.resize((final_w, final_h), Image.LANCZOS)


def get_num_frames(duration_seconds: float):
    # Frames = duration * fps, clamped to the model's supported range, plus one
    # for the initial frame.
    return 1 + int(np.clip(
        int(round(duration_seconds * FIXED_FPS)),
        MIN_FRAMES_MODEL,
        MAX_FRAMES_MODEL,
    ))


def get_duration(
    input_image,
    prompt,
    steps,
    negative_prompt,
    duration_seconds,
    guidance_scale,
    guidance_scale_2,
    seed,
    randomize_seed,
    progress,
):
    # Estimate the GPU seconds a request needs (used as the ZeroGPU duration hook
    # below): per-step cost scales with frames x pixels relative to a baseline,
    # capped at MAX_ALLOWED seconds.
    BASE_FRAMES_HEIGHT_WIDTH = 81 * 832 * 624
    BASE_STEP_DURATION = 15
    width, height = resize_image(input_image).size
    frames = get_num_frames(duration_seconds)
    factor = frames * width * height / BASE_FRAMES_HEIGHT_WIDTH
    step_duration = BASE_STEP_DURATION * factor ** 1.5
    est = 10 + int(steps) * step_duration
    MAX_ALLOWED = 30
    return min(est, MAX_ALLOWED)


@spaces.GPU(duration=get_duration)
def generate_video(
    input_image,
    prompt,
    steps=6,
    negative_prompt=default_negative_prompt,
    duration_seconds=3.2,
    guidance_scale=1.5,
    guidance_scale_2=1.5,
    seed=42,
    randomize_seed=False,
    progress=gr.Progress(track_tqdm=True),
):
    if input_image is None:
        raise gr.Error("Please upload an input image.")

    num_frames = get_num_frames(duration_seconds)
    current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
    resized_image = resize_image(input_image)

    output_frames_list = pipe(
        image=resized_image,
        prompt=prompt,
        negative_prompt=negative_prompt,
        height=resized_image.height,
        width=resized_image.width,
        num_frames=num_frames,
        guidance_scale=float(guidance_scale),
        guidance_scale_2=float(guidance_scale_2),
        num_inference_steps=int(steps),
        generator=torch.Generator(device="cuda").manual_seed(current_seed),
    ).frames[0]

    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
        video_path = tmpfile.name
    export_to_video(output_frames_list, video_path, fps=FIXED_FPS)
    return video_path, current_seed
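
# Illustrative sketch (assumption, not part of the original app): a small helper
# documenting what the sizing and frame-count logic above does to a request.
# It is only defined, never called, and has no side effects.
def _describe_request(image: Image.Image, duration_seconds: float = 3.2) -> str:
    resized = resize_image(image)
    frames = get_num_frames(duration_seconds)
    # Example: a 1920x1080 upload exceeds the 832/480 aspect cap, so it is
    # center-cropped and resized to 832x480; 3.2 s at 16 fps gives
    # round(3.2 * 16) = 51 frames, within [8, 80], plus 1 -> 52 frames.
    return f"{resized.width}x{resized.height}, {frames} frames at {FIXED_FPS} fps"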
def create_demo():
    with gr.Blocks(css="", title="Fast Image to Video") as demo:
        gr.HTML(""" """)  # empty placeholder for custom HTML/CSS
        with gr.Row(elem_id="general_items"):
            gr.Markdown("# ")
            gr.Markdown(
                "Convert an image into an animated video with prompt description.",
                elem_id="subtitle",
            )
            with gr.Column(elem_id="input_column"):
                input_image = gr.Image(
                    type="pil",
                    label="Input Image",
                    sources=["upload"],
                    show_download_button=False,
                    show_share_button=False,
                    interactive=True,
                    elem_classes=["gradio-component", "image-container"],
                )
                prompt = gr.Textbox(
                    label="Prompt",
                    value=default_prompt_i2v,
                    lines=3,
                    placeholder="Describe the desired animation or motion",
                    elem_classes=["gradio-component"],
                )
                generate_button = gr.Button(
                    "Generate Video",
                    variant="primary",
                    elem_classes=["gradio-component", "gr-button-primary"],
                )
                output_video = gr.Video(
                    label="Generated Video",
                    autoplay=True,
                    interactive=False,
                    show_download_button=True,
                    show_share_button=False,
                    elem_classes=["gradio-component", "image-container"],
                )
                # Non-UI settings are passed as fixed gr.State values; the seed
                # returned by generate_video goes into a throwaway State because
                # it is not displayed in the UI.
                generate_button.click(
                    fn=generate_video,
                    inputs=[
                        input_image,
                        prompt,
                        gr.State(value=6),                        # steps
                        gr.State(value=default_negative_prompt),  # negative_prompt
                        gr.State(value=3.2),                      # duration_seconds
                        gr.State(value=1.5),                      # guidance_scale
                        gr.State(value=1.5),                      # guidance_scale_2
                        gr.State(value=42),                       # seed
                        gr.State(value=True),                     # randomize_seed
                    ],
                    outputs=[output_video, gr.State(value=42)],
                )
    return demo


if __name__ == "__main__":
    demo = create_demo()
    demo.queue().launch(share=True)
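
# --- Optional local smoke test (illustrative sketch, not part of the app) ---
# The image path below is a placeholder. Kept commented out so that running the
# script still just launches the Gradio demo above.
#
#     img = Image.open("example.jpg").convert("RGB")
#     video_path, used_seed = generate_video(
#         input_image=img,
#         prompt=default_prompt_i2v,
#         duration_seconds=2.0,
#         randomize_seed=True,
#     )
#     print(f"Wrote {video_path} (seed={used_seed})")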