Test

Paused

App Files Files Community

eeuuia committed about 1 month ago

Commit

703a6cb

verified ·

1 Parent(s): 19c4121

Update app.py

Browse files

Files changed (1) hide show

app.py +105 -76

app.py CHANGED Viewed

@@ -31,13 +31,13 @@ warnings.filterwarnings("ignore", message=".*")
 print("=== [Inicialização da Aplicação] ===")
 # 1. Carregar Configuração do Arquivo YAML
-CONFIG_PATH = Path("ltxv-13b-0.9.8-distilled.yaml")
 if not CONFIG_PATH.exists():
     raise FileNotFoundError(f"Arquivo de configuração '{CONFIG_PATH}' não encontrado.")
 with open(CONFIG_PATH, "r") as f:
-    config = yaml.safe_load(f)
 print(f"Configuração carregada de: {CONFIG_PATH}")
-print(json.dumps(config, indent=2))
 # Parâmetros Globais
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -49,62 +49,75 @@ upscaler_repo="Lightricks/ltxv-spatial-upscaler-0.9.7"
 FPS = 24
-# 2. Baixar os arquivos do modelo base
-print(f"=== Baixando snapshot do repositório base: {base_repo} ===")
-local_repo_path = snapshot_download(
-    repo_id=base_repo,
-    token=os.getenv("HF_TOKEN") or HfFolder.get_token(),
-    resume_download=True
-)
-# 3. Carregar cada componente da pipeline explicitamente
-print("=== Carregando componentes da pipeline... ===")
-vae = AutoModel.from_pretrained(
-    "Lightricks/LTX-Video",
-    subfolder="vae",
-    torch_dtype=torch_dtype
-)
-text_encoder = AutoModel.from_pretrained(
-    "Lightricks/LTX-Video",
-    subfolder="text_encoder",
-    torch_dtype=torch_dtype
-)
-scheduler = AutoModel.from_pretrained(
-    "Lightricks/LTX-Video",
-    subfolder="scheduler",
-    torch_dtype=torch_dtype
-)
-tokenizer = AutoModel.from_pretrained(
-    "Lightricks/LTX-Video",
-    subfolder="tokenizer",
-    torch_dtype=torch_dtype
-)
-if hasattr(scheduler.config, 'use_dynamic_shifting') and scheduler.config.use_dynamic_shifting:
-    print("[Config] Desativando 'use_dynamic_shifting' no scheduler.")
-    scheduler.config.use_dynamic_shifting = False
-transformer = AutoModel.from_pretrained(
-    "Lightricks/LTX-Video",
-    subfolder="transformer",
-    torch_dtype=torch.bfloat16
-)
-transformer.enable_layerwise_casting(
-    storage_dtype=torch.float8_e4m3fn, compute_dtype=torch.bfloat16
-)
-pipeline = LTXConditionPipeline.from_pretrained(
-    "Lightricks/LTX-Video-0.9.8-13B-distilled",
-    offload_state_dict=False,
-    vae=vae, text_encoder=text_encoder, tokenizer=tokenizer,
-    scheduler=scheduler, transformer=transformer,
-    torch_dtype=torch.bfloat16,
-    cache_dir=os.getenv("HF_HOME_CACHE"),
-    token=os.getenv("HF_TOKEN"),
-)
 # 4. Montar a pipeline principal
@@ -151,7 +164,6 @@ def prepare_and_generate_video(
         )
         conditions = []
         if condition_image_1 is not None:
             condition_image_1 = ImageOps.fit(condition_image_1, (downscaled_width, downscaled_height), Image.LANCZOS)
@@ -170,31 +182,48 @@ def prepare_and_generate_video(
         pipeline_args = {}
         if conditions:
-            pipeline_args["conditions"] = conditions
         # Manipulação da seed
         if randomize_seed:
             seed = random.randint(0, 2**32 - 1)
         # ETAPA 1: Geração do vídeo em baixa resolução
-        latents = pipeline(
-            prompt=prompt,
-            negative_prompt=negative_prompt,
-            width=downscaled_width,
-            height=downscaled_height,
-            num_frames=num_frames,
-            timesteps=[1000, 993, 987, 981, 975, 909, 725, 0.03],
-            decode_timestep=0.05,
-            decode_noise_scale=0.025,
-            image_cond_noise_scale=0.0,
-            guidance_scale=guidance_scale,
-            guidance_rescale=0.7,
-            generator=torch.Generator().manual_seed(seed),
-            #output_type="latent",
-            output_type="np",
-            **pipeline_args
-        ).frames[0]
         # ETAPA 2: Upscale dos latentes
         #upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2

 print("=== [Inicialização da Aplicação] ===")
 # 1. Carregar Configuração do Arquivo YAML
+CONFIG_PATH = Path("ltxv-13b-0.9.8-dev-fp8.yaml")
 if not CONFIG_PATH.exists():
     raise FileNotFoundError(f"Arquivo de configuração '{CONFIG_PATH}' não encontrado.")
 with open(CONFIG_PATH, "r") as f:
+    CONFIG = yaml.safe_load(f)
 print(f"Configuração carregada de: {CONFIG_PATH}")
+print(json.dumps(CONFIG, indent=2))
 # Parâmetros Globais
 device = "cuda" if torch.cuda.is_available() else "cpu"
 FPS = 24
+CACHE_DIR = os.environ.get("HF_HOME")
+DEPS_DIR = Path("/data")
+LTX_VIDEO_REPO_DIR = DEPS_DIR / "LTX-Video"
+BASE_CONFIG_PATH = LTX_VIDEO_REPO_DIR / "configs"
+DEFAULT_CONFIG_FILE = BASE_CONFIG_PATH / "ltxv-13b-0.9.8-dev-fp8.yaml"
+LTX_REPO_ID = "Lightricks/LTX-Video"
+RESULTS_DIR = Path("/app/output")
+DEFAULT_FPS = 24.0
+FRAMES_ALIGNMENT = 8
+# 2. Baixar os arquivos do modelo base
+print(f"=== Baixando snapshot do repositório base: {base_repo} ===")
+if True:
+    if True:
+        ckpt_path_str = hf_hub_download(repo_id=LTX_REPO_ID, filename=checkpoint_path, cache_dir=CACHE_DIR)
+        ckpt_path = Path(ckpt_path_str)
+        if not ckpt_path.is_file():
+            raise FileNotFoundError(f"Main checkpoint file not found: {ckpt_path}")
+        # 1. Carrega Metadados do Checkpoint
+        with safe_open(ckpt_path, framework="pt") as f:
+            metadata = f.metadata() or {}
+            config_str = metadata.get("config", "{}")
+            configs = json.loads(config_str)
+            allowed_inference_steps = configs.get("allowed_inference_steps")
+        # 2. Carrega os Componentes Individuais (todos na CPU)
+        #    O `.from_pretrained(ckpt_path)` é inteligente e carrega os pesos corretos do arquivo .safetensors.
+        logging.info("Carregando VAE...")
+        vae = CausalVideoAutoencoder.from_pretrained(ckpt_path).to("cpu")
+        logging.info("Carregando Transformer...")
+        transformer = Transformer3DModel.from_pretrained(ckpt_path).to("cpu")
+        logging.info("Carregando Scheduler...")
+        scheduler = RectifiedFlowScheduler.from_pretrained(ckpt_path)
+        logging.info("Carregando Text Encoder e Tokenizer...")
+        text_encoder_path = self.config["text_encoder_model_name_or_path"]
+        text_encoder = T5EncoderModel.from_pretrained(text_encoder_path, subfolder="text_encoder").to("cpu")
+        tokenizer = T5Tokenizer.from_pretrained(text_encoder_path, subfolder="tokenizer")
+        patchifier = SymmetricPatchifier(patch_size=1)
+        # 3. Define a precisão dos modelos (ainda na CPU, será aplicado na GPU depois)
+        precision = self.config.get("precision", "bfloat16")
+        if precision == "bfloat16":
+            vae.to(torch.bfloat16)
+            transformer.to(torch.bfloat16)
+            text_encoder.to(torch.bfloat16)
+        # 4. Monta o objeto do Pipeline com os componentes carregados
+        logging.info("Montando o objeto LTXVideoPipeline...")
+        submodel_dict = {
+            "transformer": transformer,
+            "patchifier": patchifier,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "scheduler": scheduler,
+            "vae": vae,
+            "allowed_inference_steps": allowed_inference_steps,
+            # Os prompt enhancers são opcionais e não são carregados por padrão para economizar memória
+            "prompt_enhancer_image_caption_model": None,
+            "prompt_enhancer_image_caption_processor": None,
+            "prompt_enhancer_llm_model": None,
+            "prompt_enhancer_llm_tokenizer": None,
+        }
+        pipeline = LTXConditionPipeline(**submodel_dict)
 # 4. Montar a pipeline principal
         )
         conditions = []
         if condition_image_1 is not None:
             condition_image_1 = ImageOps.fit(condition_image_1, (downscaled_width, downscaled_height), Image.LANCZOS)
         pipeline_args = {}
         if conditions:
+            call_kwargs["conditions"] = conditions
         # Manipulação da seed
         if randomize_seed:
             seed = random.randint(0, 2**32 - 1)
+        if True:
+            call_kwargs = {
+                "prompt":prompt,
+                "height": downscaled_height,
+                "width": downscaled_width,
+                "skip_initial_inference_steps": 3,
+                "skip_final_inference_steps": 0,
+                "num_inference_steps": 30,
+                "negative_prompt": negative_prompt,
+                "guidance_scale": CONFIG.get("guidance_scale", [1, 1, 6, 8, 6, 1, 1]),
+                "stg_scale": CONFIG.get("stg_scale", [0, 0, 4, 4, 4, 2, 1]),
+                "rescaling_scale": CONFIG.get("rescaling_scale", [1, 1, 0.5, 0.5, 1, 1, 1]),
+                "skip_block_list": CONFIG.get("skip_block_list", [[], [11, 25, 35, 39], [22, 35, 39], [28], [28], [28], [28]]),
+                "frame_rate": int(DEFAULT_FPS),
+                "generator": torch.Generator().manual_seed(seed),
+                "output_type": "np",
+                "media_items": None,
+                "decode_timestep": CONFIG.get("decode_timestep", 0.05),
+                "decode_noise_scale": CONFIG.get("decode_noise_scale", 0.025),
+                "is_video": True,
+                "vae_per_channel_normalize": True,
+                "offload_to_cpu": False,
+                "enhance_prompt": False,
+                "num_frames": num_frames,
+                "downscale_factor": CONFIG.get("downscale_factor", 0.6666666),
+                "rescaling_scale": CONFIG.get("rescaling_scale",  [1, 1, 0.5, 0.5, 1, 1, 1]),
+                "guidance_timesteps": CONFIG.get("guidance_timesteps", [1.0, 0.996,  0.9933, 0.9850, 0.9767, 0.9008, 0.6180]),
+                "skip_block_list": CONFIG.get("skip_block_list",  [[], [11, 25, 35, 39], [22, 35, 39], [28], [28], [28], [28]]),
+                "sampler": CONFIG.get("sampler", "from_checkpoint"),
+                "precision": CONFIG.get("precision", "float8_e4m3fn"),
+                "stochastic_sampling": CONFIG.get("stochastic_sampling", False),
+                "cfg_star_rescale": CONFIG.get("cfg_star_rescale", True),
+            }
         # ETAPA 1: Geração do vídeo em baixa resolução
+        latents = pipeline(**call_kwargs).frames[0]
         # ETAPA 2: Upscale dos latentes
         #upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2