Test

Paused

App Files Files Community

eeuuia committed on Oct 15

Commit

fcf054d

verified ·

1 Parent(s): e6712fd

Update app.py

Browse files

Files changed (1) hide show

app.py +155 -207

app.py CHANGED Viewed

@@ -3,124 +3,141 @@ import torch
 import numpy as np
 import tempfile
 import os
 from diffusers import LTXLatentUpsamplePipeline
 from pipeline_ltx_condition_control import LTXConditionPipeline, LTXVideoCondition
-from diffusers.utils import export_to_video, load_video
-from torchvision import transforms
-import random
-import imageio
 from PIL import Image, ImageOps
-import cv2
-import shutil
-import glob
-from pathlib import Path
 import warnings
 import logging
-warnings.filterwarnings("ignore", category=UserWarning)
-warnings.filterwarnings("ignore", category=FutureWarning)
 warnings.filterwarnings("ignore", message=".*")
-from huggingface_hub import logging as ll
-ll.set_verbosity_error()
-ll.set_verbosity_warning()
-ll.set_verbosity_info()
-ll.set_verbosity_debug()
-logger = logging.getLogger("AducDebug")
-logging.basicConfig(level=logging.DEBUG)
-logger.setLevel(logging.DEBUG)
-FPS = 24
-dtype = torch.bfloat16
-device = "cuda" if torch.cuda.is_available() else "cpu"
-single_file_url = "https://huggingface.co/Lightricks/LTX-Video/resolve/main/ltxv-13b-0.9.8-distilled-fp8.safetensors"
-pipeline = LTXConditionPipeline.from_single_file(
-    single_file_url,
-    offload_state_dict=False,
-    dtype=torch.bfloat16, # Use o dtype apropriado. Para FP8, pode ser necessário torch.float8_e4m3fn.
-    cache_dir=os.getenv("HF_HOME_CACHE"),
-    token=os.getenv("HF_TOKEN"),
-)
-# Carregamento das pipelines
-#pipeline = LTXConditionPipeline.from_pretrained(
-#    "Lightricks/LTX-Video-0.9.8-13B-distilled",
-#    offload_state_dict=False,
-#    torch_dtype=torch.bfloat16,
-#    cache_dir=os.getenv("HF_HOME_CACHE"),
-#    token=os.getenv("HF_TOKEN"),
-#)
-pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained(
-    "Lightricks/ltxv-spatial-upscaler-0.9.7",
-    cache_dir=os.getenv("HF_HOME_CACHE"),
-    vae=pipeline.vae, torch_dtype=dtype
-)
-pipeline.to(device)
-pipe_upsample.to(device)
-pipeline.vae.enable_tiling()
-current_dir = Path(__file__).parent
-def cleanup_session_files(request: gr.Request):
-    """Limpa arquivos temporários da sessão quando o usuário se desconecta."""
-    try:
-        session_id = request.session_hash
-        session_dir = os.path.join("/tmp/gradio", session_id)
-        if os.path.exists(session_dir):
-            shutil.rmtree(session_dir)
-            print(f"Limpou o diretório da sessão: {session_dir}")
-    except Exception as e:
-        print(f"Erro durante a limpeza da sessão: {e}")
-def read_video(video) -> torch.Tensor:
-    """Lê um arquivo de vídeo e converte para um tensor torch."""
-    to_tensor_transform = transforms.ToTensor()
-    if isinstance(video, str):
-        video_tensor = torch.stack([to_tensor_transform(img) for img in imageio.get_reader(video)])
-    else:
-        video_tensor = torch.stack([to_tensor_transform(img) for img in video])
-    return video_tensor
 def round_to_nearest_resolution_acceptable_by_vae(height, width, vae_temporal_compression_ratio):
-    """Arredonda a resolução para valores aceitáveis pelo VAE."""
     height = height - (height % vae_temporal_compression_ratio)
     width = width - (width % vae_temporal_compression_ratio)
     return height, width
-# A assinatura da função volta a aceitar argumentos individuais para compatibilidade com o Gradio
-def generate_video(
-    condition_image_1,
-    condition_strength_1,
-    condition_frame_index_1,
-    condition_image_2,
-    condition_strength_2,
-    condition_frame_index_2,
-    prompt,
-    duration=3.0,
-    negative_prompt="worst quality, inconsistent motion, blurry, jittery, distorted",
-    height=768,
-    width=1152,
-    num_inference_steps=7,
-    guidance_scale=1.0,
-    seed=0,
-    randomize_seed=False,
     progress=gr.Progress(track_tqdm=True)
 ):
     try:
-        # Lógica para agrupar as condições *dentro* da função
-        # Cálculo de frames e resolução
         num_frames = int(duration * FPS) + 1
         temporal_compression = pipeline.vae_temporal_compression_ratio
         num_frames = ((num_frames - 1) // temporal_compression) * temporal_compression + 1
         downscale_factor = 2 / 3
         downscaled_height = int(height * downscale_factor)
         downscaled_width = int(width * downscale_factor)
@@ -128,85 +145,45 @@ def generate_video(
             downscaled_height, downscaled_width, pipeline.vae_temporal_compression_ratio
         )
-        conditions = []
-        if condition_image_1 is not None:
-            condition_image_1 = ImageOps.fit(condition_image_1, (downscaled_width, downscaled_height), Image.LANCZOS)
-            conditions.append(LTXVideoCondition(
-                image=condition_image_1,
-                strength=condition_strength_1,
-                frame_index=int(condition_frame_index_1)
-            ))
-        if condition_image_2 is not None:
-            condition_image_2 = ImageOps.fit(condition_image_2, (downscaled_width, downscaled_height), Image.LANCZOS)
-            conditions.append(LTXVideoCondition(
-                image=condition_image_2,
-                strength=condition_strength_2,
-                frame_index=int(condition_frame_index_2)
-            ))
-        pipeline_args = {}
-        if conditions:
-            pipeline_args["conditions"] = conditions
-        # Manipulação da seed
-        if randomize_seed:
-            seed = random.randint(0, 2**32 - 1)
-        # ETAPA 1: Geração do vídeo em baixa resolução
         latents = pipeline(
-            prompt=prompt,
-            negative_prompt=negative_prompt,
-            width=downscaled_width,
-            height=downscaled_height,
-            num_frames=num_frames,
-            timesteps=[1000, 993, 987, 981, 975, 909, 725, 0.03],
-            decode_timestep=0.05,
-            decode_noise_scale=0.025,
-            image_cond_noise_scale=0.0,
-            guidance_scale=guidance_scale,
-            guidance_rescale=0.7,
-            generator=torch.Generator().manual_seed(seed),
-            output_type="latent",
-            **pipeline_args
         ).frames
-        # ETAPA 2: Upscale dos latentes
-        #upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
-        #upscaled_latents = pipe_upsample(
-        #    latents=latents,
-        #    output_type="latent"
-        #).frames
-        print(f"ETAPA 1 latents {latents.shape}")
-        # ETAPA 3: Denoise final em alta resolução
         final_video_frames_np = pipeline(
-            prompt=prompt,
-            negative_prompt=negative_prompt,
-            width=downscaled_width,
-            height=downscaled_height,
-            num_frames=num_frames,
-            denoise_strength=0.999,
-            timesteps=[1000, 909, 725, 421, 0],
-            latents=latents,
-            decode_timestep=0.05,
-            decode_noise_scale=0.025,
-            image_cond_noise_scale=0.0,
-            guidance_scale=guidance_scale,
-            guidance_rescale=0.7,
             generator=torch.Generator(device="cuda").manual_seed(seed),
-            output_type="np",
-            **pipeline_args
         ).frames[0]
-        print(f"ETAPA 3 final_video_frames_np {final_video_frames_np.shape}")
-        # Exportação para arquivo MP4
         video_uint8_frames = [(frame * 255).astype(np.uint8) for frame in final_video_frames_np]
         output_filename = "output.mp4"
         with imageio.get_writer(output_filename, fps=FPS, quality=8, macro_block_size=1) as writer:
@@ -218,53 +195,39 @@ def generate_video(
     except Exception as e:
         print(f"Ocorreu um erro: {e}")
         return None, seed
-# Interface Gráfica com Gradio
 with gr.Blocks(theme=gr.themes.Ocean(font=[gr.themes.GoogleFont("Lexend Deca"), "sans-serif"]), delete_cache=(60, 900)) as demo:
-    gr.Markdown(
-        """
-        # Geração de Vídeo com LTX
-        **Crie vídeos a partir de texto e imagens de condição usando o modelo LTX-Video.**
-        """
-    )
     with gr.Row():
         with gr.Column(scale=1):
-            prompt = gr.Textbox(
-                label="Prompt",
-                placeholder="Descreva o vídeo que você quer gerar...",
-                lines=3,
-                value="O Coringa em seu icônico terno roxo e cabelo verde, dançando sozinho em um quarto escuro e decadente. Seus movimentos são erráticos e imprevisíveis, alternando entre graciosos e caóticos enquanto ele se perde no momento. A câmera captura seus gestos teatrais, sua dança refletindo sua personalidade desequilibrada. Iluminação temperamental com sombras dançando pelas paredes, criando uma atmosfera de bela loucura."
-            )
             with gr.Accordion("Imagem de Condição 1", open=True):
-                condition_image_1 = gr.Image(label="Imagem de Condição 1", type="pil")
                 with gr.Row():
-                    condition_strength_1 = gr.Slider(label="Peso (Strength)", minimum=0.0, maximum=1.0, step=0.05, value=1.0)
                     condition_frame_index_1 = gr.Number(label="Frame", value=0, precision=0)
             with gr.Accordion("Imagem de Condição 2", open=False):
-                condition_image_2 = gr.Image(label="Imagem de Condição 2", type="pil")
                 with gr.Row():
-                    condition_strength_2 = gr.Slider(label="Peso (Strength)", minimum=0.0, maximum=1.0, step=0.05, value=1.0)
                     condition_frame_index_2 = gr.Number(label="Frame", value=0, precision=0)
-            duration = gr.Slider(label="Duração (segundos)", minimum=1.0, maximum=10.0, step=0.5, value=2)
             with gr.Accordion("Configurações Avançadas", open=False):
-                negative_prompt = gr.Textbox(label="Prompt Negativo", placeholder="O que você não quer no vídeo...", lines=2, value="pior qualidade, movimento inconsistente, embaçado, tremido, distorcido")
                 with gr.Row():
                     height = gr.Slider(label="Altura", minimum=256, maximum=1536, step=32, value=768)
                     width = gr.Slider(label="Largura", minimum=256, maximum=1536, step=32, value=1152)
-                num_inference_steps = gr.Slider(label="Passos de Inferência", minimum=5, maximum=10, step=1, value=7, visible=False)
-                with gr.Row():
-                    guidance_scale = gr.Slider(label="Escala de Orientação (Guidance)", minimum=1.0, maximum=5.0, step=0.1, value=1.0)
                 with gr.Row():
                     randomize_seed = gr.Checkbox(label="Seed Aleatória", value=True)
                     seed = gr.Number(label="Seed", value=0, precision=0)
@@ -272,33 +235,18 @@ with gr.Blocks(theme=gr.themes.Ocean(font=[gr.themes.GoogleFont("Lexend Deca"),
         with gr.Column(scale=1):
             output_video = gr.Video(label="Vídeo Gerado", height=400)
-    # CORREÇÃO: A lista de inputs agora é "plana", contendo apenas componentes do Gradio
     generate_btn.click(
-        fn=generate_video,
         inputs=[
-            condition_image_1,
-            condition_strength_1,
-            condition_frame_index_1,
-            condition_image_2,
-            condition_strength_2,
-            condition_frame_index_2,
-            prompt,
-            duration,
-            negative_prompt,
-            height,
-            width,
-            num_inference_steps,
-            guidance_scale,
-            seed,
-            randomize_seed,
         ],
-        outputs=[output_video, seed],
-        show_progress=True
     )
-    demo.unload(cleanup_session_files)
 if __name__ == "__main__":
     demo.queue().launch(server_name="0.0.0.0", server_port=7860, debug=True, show_error=True)

 import numpy as np
 import tempfile
 import os
+import yaml
+import json
+import random
+import threading
+from pathlib import Path
+# Importações de Hugging Face
+from huggingface_hub import snapshot_download, HfFolder
+from transformers import T5EncoderModel, T5TokenizerFast
 from diffusers import LTXLatentUpsamplePipeline
+from diffusers.models import AutoencoderKLLTXVideo, LTXVideoTransformer3DModel
+from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
+# Nossa pipeline customizada e utilitários
 from pipeline_ltx_condition_control import LTXConditionPipeline, LTXVideoCondition
+from diffusers.utils import export_to_video
 from PIL import Image, ImageOps
+import imageio
+# --- Configuração de Logging e Avisos ---
 import warnings
 import logging
+warnings.filterwarnings("ignore", category=UserWarning)  # category must be the Warning subclass itself, not its name as a string
+warnings.filterwarnings("ignore", category=FutureWarning)  # same fix: a string here makes filterwarnings raise at import time
 warnings.filterwarnings("ignore", message=".*")
+from huggingface_hub import logging as hf_logging
+hf_logging.set_verbosity_error()
+# --- Classe de Serviço para Carregamento e Gerenciamento dos Modelos ---
+class VideoGenerationService:
+    """
+    Encapsulates the loading and configuration of the AI pipelines.
+    Loads each component explicitly and modularly, driven by a YAML configuration file.
+    """
+    def __init__(self, config_path: Path):
+        print("=== [Serviço de Geração de Vídeo] Inicializando... ===")
+        if not torch.cuda.is_available():  # fail fast: the device is hard-coded to CUDA below
+            raise RuntimeError("CUDA é necessário para rodar este serviço.")
+        self.device = "cuda"
+        self.torch_dtype = torch.bfloat16  # dtype passed to every model loaded in _initialize
+        print(f"[Init] Dispositivo: {self.device}, DType: {self.torch_dtype}")
+        with open(config_path, "r") as f:
+            self.cfg = yaml.safe_load(f)
+        print(f"[Init] Configuração carregada de: {config_path}")
+        print(json.dumps(self.cfg, indent=2))  # echoes the parsed config to stdout
+        # Parameters read from the YAML config
+        self.base_repo = self.cfg.get("base_repo")  # presumably cfg is a mapping; a missing key yields None here
+        self.checkpoint_path = self.cfg.get("checkpoint_path")
+        self.upscaler_repo = self.cfg.get("spatial_upscaler_model_path")
+        self._initialize()  # downloads the base repo and builds self.pipeline / self.upsampler
+        print("=== [Serviço de Geração de Vídeo] Inicialização concluída. ===")
+    def _initialize(self):
+        print(f"=== [Init] Baixando snapshot do repositório base: {self.base_repo} ===")
+        local_repo_path = snapshot_download(
+            repo_id=self.base_repo,
+            token=os.getenv("HF_TOKEN") or HfFolder.get_token(),  # env var wins; otherwise fall back to HfFolder's token
+            resume_download=True
+        )
+        print("[Init] Carregando componentes da pipeline a partir de arquivos locais...")
+        self.vae = AutoencoderKLLTXVideo.from_pretrained(local_repo_path, subfolder="vae", torch_dtype=self.torch_dtype)
+        self.text_encoder = T5EncoderModel.from_pretrained(local_repo_path, subfolder="text_encoder", torch_dtype=self.torch_dtype)
+        self.tokenizer = T5TokenizerFast.from_pretrained(local_repo_path, subfolder="tokenizer")
+        self.scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(local_repo_path, subfolder="scheduler")
+        # Cause of the earlier error: explicitly disable dynamic shifting for compatibility
+        if hasattr(self.scheduler.config, 'use_dynamic_shifting') and self.scheduler.config.use_dynamic_shifting:
+            print("[Init] Desativando 'use_dynamic_shifting' no scheduler.")
+            self.scheduler.config.use_dynamic_shifting = False  # NOTE(review): mutates the loaded config in place; confirm the scheduler honors this
+        print(f"[Init] Carregando pesos do Transformer de: {self.checkpoint_path}")
+        self.transformer = LTXVideoTransformer3DModel.from_pretrained(
+            local_repo_path, subfolder="transformer", weight_name=self.checkpoint_path, torch_dtype=self.torch_dtype  # NOTE(review): confirm from_pretrained accepts weight_name
+        )
+        print("[Init] Montando a LTXConditionPipeline...")
+        self.pipeline = LTXConditionPipeline(
+            vae=self.vae, text_encoder=self.text_encoder, tokenizer=self.tokenizer,
+            scheduler=self.scheduler, transformer=self.transformer
+        )
+        self.pipeline.to(self.device)
+        self.pipeline.vae.enable_tiling()
+        print(f"[Init] Carregando o upsampler espacial de: {self.upscaler_repo}")
+        self.upsampler = LTXLatentUpsamplePipeline.from_pretrained(
+            self.upscaler_repo, vae=self.vae, torch_dtype=self.torch_dtype  # the upsampler is given the same VAE instance loaded above
+        )
+        self.upsampler.to(self.device)
+# --- Application initialization ---
+CONFIG_PATH = Path("ltx_config.yaml")  # relative path, so it is resolved against the current working directory
+if not CONFIG_PATH.exists():
+    raise FileNotFoundError(f"Arquivo de configuração '{CONFIG_PATH}' não encontrado. Crie-o antes de executar a aplicação.")
+# Instantiate the service that loads and holds the models
+service = VideoGenerationService(config_path=CONFIG_PATH)
+pipeline = service.pipeline  # module-level alias used by the generation function
+pipe_upsample = service.upsampler  # module-level alias used by the generation function
+FPS = 24  # frames per second; drives both the frame count and the MP4 writer
+# --- Main video generation logic ---
 def round_to_nearest_resolution_acceptable_by_vae(height, width, vae_temporal_compression_ratio):
     """Snap ``height`` and ``width`` down to multiples of the given ratio.
 
     Despite the function name, each dimension is floored (the remainder
     of the division by ``vae_temporal_compression_ratio`` is dropped),
     never rounded up. Returns the adjusted ``(height, width)`` tuple.
     """
     ratio = vae_temporal_compression_ratio
     # Subtract each side's remainder so the result is an exact multiple.
     return tuple(side - side % ratio for side in (height, width))
+def prepare_and_generate_video(
+    condition_image_1, condition_strength_1, condition_frame_index_1,
+    condition_image_2, condition_strength_2, condition_frame_index_2,
+    prompt, duration, negative_prompt,
+    height, width, guidance_scale, seed, randomize_seed,
     progress=gr.Progress(track_tqdm=True)
 ):
     try:
+        conditions_data = [
+            (condition_image_1, condition_strength_1, condition_frame_index_1),
+            (condition_image_2, condition_strength_2, condition_frame_index_2)
+        ]
+        if randomize_seed:
+            seed = random.randint(0, 2**32 - 1)
         num_frames = int(duration * FPS) + 1
         temporal_compression = pipeline.vae_temporal_compression_ratio
         num_frames = ((num_frames - 1) // temporal_compression) * temporal_compression + 1
+        # Etapa 1: Preparar condições para baixa resolução
         downscale_factor = 2 / 3
         downscaled_height = int(height * downscale_factor)
         downscaled_width = int(width * downscale_factor)
             downscaled_height, downscaled_width, pipeline.vae_temporal_compression_ratio
         )
+        conditions_low_res = []
+        for image, strength, frame_index in conditions_data:
+            if image is not None:
+                processed_image = ImageOps.fit(image, (downscaled_width, downscaled_height), Image.LANCZOS)
+                conditions_low_res.append(LTXVideoCondition(
+                    image=processed_image, strength=strength, frame_index=int(frame_index)
+                ))
+        pipeline_args_low_res = {"conditions": conditions_low_res} if conditions_low_res else {}
         latents = pipeline(
+            prompt=prompt, negative_prompt=negative_prompt, width=downscaled_width, height=downscaled_height,
+            num_frames=num_frames, generator=torch.Generator().manual_seed(seed),
+            output_type="latent", **pipeline_args_low_res
         ).frames
+        # Etapa 2: Upscale
+        upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
+        upscaled_latents = pipe_upsample(latents=latents, output_type="latent").frames
+        # Etapa 3: Preparar condições para alta resolução (para manter frames imutáveis)
+        conditions_high_res = []
+        for image, strength, frame_index in conditions_data:
+            if image is not None:
+                processed_image_high_res = ImageOps.fit(image, (upscaled_width, upscaled_height), Image.LANCZOS)
+                conditions_high_res.append(LTXVideoCondition(
+                    image=processed_image_high_res, strength=strength, frame_index=int(frame_index)
+                ))
+        pipeline_args_high_res = {"conditions": conditions_high_res} if conditions_high_res else {}
         final_video_frames_np = pipeline(
+            prompt=prompt, negative_prompt=negative_prompt, width=upscaled_width, height=upscaled_height,
+            num_frames=num_frames, denoise_strength=0.999, latents=upscaled_latents,
             generator=torch.Generator(device="cuda").manual_seed(seed),
+            output_type="np", **pipeline_args_high_res
         ).frames[0]
+        # Etapa 4: Exportação
         video_uint8_frames = [(frame * 255).astype(np.uint8) for frame in final_video_frames_np]
         output_filename = "output.mp4"
         with imageio.get_writer(output_filename, fps=FPS, quality=8, macro_block_size=1) as writer:
     except Exception as e:
         print(f"Ocorreu um erro: {e}")
+        import traceback
+        traceback.print_exc()
         return None, seed
+# --- Interface Gráfica com Gradio ---
 with gr.Blocks(theme=gr.themes.Ocean(font=[gr.themes.GoogleFont("Lexend Deca"), "sans-serif"]), delete_cache=(60, 900)) as demo:
+    gr.Markdown("# Geração de Vídeo com LTX\n**Crie vídeos a partir de texto e imagens de condição.**")
     with gr.Row():
         with gr.Column(scale=1):
+            prompt = gr.Textbox(label="Prompt", placeholder="Descreva o vídeo que você quer gerar...", lines=3, value="O Coringa dançando em um quarto escuro, iluminação dramática.")
             with gr.Accordion("Imagem de Condição 1", open=True):
+                condition_image_1 = gr.Image(label="Imagem 1", type="pil")
                 with gr.Row():
+                    condition_strength_1 = gr.Slider(label="Peso", minimum=0.0, maximum=1.0, step=0.05, value=1.0)
                     condition_frame_index_1 = gr.Number(label="Frame", value=0, precision=0)
             with gr.Accordion("Imagem de Condição 2", open=False):
+                condition_image_2 = gr.Image(label="Imagem 2", type="pil")
                 with gr.Row():
+                    condition_strength_2 = gr.Slider(label="Peso", minimum=0.0, maximum=1.0, step=0.05, value=1.0)
                     condition_frame_index_2 = gr.Number(label="Frame", value=0, precision=0)
+            duration = gr.Slider(label="Duração (s)", minimum=1.0, maximum=10.0, step=0.5, value=2)
             with gr.Accordion("Configurações Avançadas", open=False):
+                negative_prompt = gr.Textbox(label="Prompt Negativo", lines=2, value="pior qualidade, embaçado, tremido, distorcido")
                 with gr.Row():
                     height = gr.Slider(label="Altura", minimum=256, maximum=1536, step=32, value=768)
                     width = gr.Slider(label="Largura", minimum=256, maximum=1536, step=32, value=1152)
                 with gr.Row():
+                    guidance_scale = gr.Slider(label="Guidance", minimum=1.0, maximum=5.0, step=0.1, value=1.0)
                     randomize_seed = gr.Checkbox(label="Seed Aleatória", value=True)
                     seed = gr.Number(label="Seed", value=0, precision=0)
         with gr.Column(scale=1):
             output_video = gr.Video(label="Vídeo Gerado", height=400)
+            generated_seed = gr.Number(label="Seed Utilizada", interactive=False)
     generate_btn.click(
+        fn=prepare_and_generate_video,
         inputs=[
+            condition_image_1, condition_strength_1, condition_frame_index_1,
+            condition_image_2, condition_strength_2, condition_frame_index_2,
+            prompt, duration, negative_prompt,
+            height, width, guidance_scale, seed, randomize_seed,
         ],
+        outputs=[output_video, generated_seed]
     )
 if __name__ == "__main__":
     demo.queue().launch(server_name="0.0.0.0", server_port=7860, debug=True, show_error=True)