eeuuia committed on
Commit
703a6cb
·
verified ·
1 Parent(s): 19c4121

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +105 -76
app.py CHANGED
@@ -31,13 +31,13 @@ warnings.filterwarnings("ignore", message=".*")
31
  print("=== [Inicialização da Aplicação] ===")
32
 
33
  # 1. Carregar Configuração do Arquivo YAML
34
- CONFIG_PATH = Path("ltxv-13b-0.9.8-distilled.yaml")
35
  if not CONFIG_PATH.exists():
36
  raise FileNotFoundError(f"Arquivo de configuração '{CONFIG_PATH}' não encontrado.")
37
  with open(CONFIG_PATH, "r") as f:
38
- config = yaml.safe_load(f)
39
  print(f"Configuração carregada de: {CONFIG_PATH}")
40
- print(json.dumps(config, indent=2))
41
 
42
  # Parâmetros Globais
43
  device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -49,62 +49,75 @@ upscaler_repo="Lightricks/ltxv-spatial-upscaler-0.9.7"
49
 
50
  FPS = 24
51
 
52
- # 2. Baixar os arquivos do modelo base
53
- print(f"=== Baixando snapshot do repositório base: {base_repo} ===")
54
- local_repo_path = snapshot_download(
55
- repo_id=base_repo,
56
- token=os.getenv("HF_TOKEN") or HfFolder.get_token(),
57
- resume_download=True
58
- )
59
-
60
- # 3. Carregar cada componente da pipeline explicitamente
61
- print("=== Carregando componentes da pipeline... ===")
62
-
63
- vae = AutoModel.from_pretrained(
64
- "Lightricks/LTX-Video",
65
- subfolder="vae",
66
- torch_dtype=torch_dtype
67
- )
68
- text_encoder = AutoModel.from_pretrained(
69
- "Lightricks/LTX-Video",
70
- subfolder="text_encoder",
71
- torch_dtype=torch_dtype
72
- )
73
- scheduler = AutoModel.from_pretrained(
74
- "Lightricks/LTX-Video",
75
- subfolder="scheduler",
76
- torch_dtype=torch_dtype
77
- )
78
- tokenizer = AutoModel.from_pretrained(
79
- "Lightricks/LTX-Video",
80
- subfolder="tokenizer",
81
- torch_dtype=torch_dtype
82
- )
83
 
84
- if hasattr(scheduler.config, 'use_dynamic_shifting') and scheduler.config.use_dynamic_shifting:
85
- print("[Config] Desativando 'use_dynamic_shifting' no scheduler.")
86
- scheduler.config.use_dynamic_shifting = False
87
-
88
 
89
- transformer = AutoModel.from_pretrained(
90
- "Lightricks/LTX-Video",
91
- subfolder="transformer",
92
- torch_dtype=torch.bfloat16
93
- )
94
- transformer.enable_layerwise_casting(
95
- storage_dtype=torch.float8_e4m3fn, compute_dtype=torch.bfloat16
96
- )
97
-
98
-
99
- pipeline = LTXConditionPipeline.from_pretrained(
100
- "Lightricks/LTX-Video-0.9.8-13B-distilled",
101
- offload_state_dict=False,
102
- vae=vae, text_encoder=text_encoder, tokenizer=tokenizer,
103
- scheduler=scheduler, transformer=transformer,
104
- torch_dtype=torch.bfloat16,
105
- cache_dir=os.getenv("HF_HOME_CACHE"),
106
- token=os.getenv("HF_TOKEN"),
107
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
 
110
  # 4. Montar a pipeline principal
@@ -151,7 +164,6 @@ def prepare_and_generate_video(
151
  )
152
 
153
 
154
-
155
  conditions = []
156
  if condition_image_1 is not None:
157
  condition_image_1 = ImageOps.fit(condition_image_1, (downscaled_width, downscaled_height), Image.LANCZOS)
@@ -170,31 +182,48 @@ def prepare_and_generate_video(
170
 
171
  pipeline_args = {}
172
  if conditions:
173
- pipeline_args["conditions"] = conditions
174
 
175
  # Manipulação da seed
176
  if randomize_seed:
177
  seed = random.randint(0, 2**32 - 1)
178
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
 
180
  # ETAPA 1: Geração do vídeo em baixa resolução
181
- latents = pipeline(
182
- prompt=prompt,
183
- negative_prompt=negative_prompt,
184
- width=downscaled_width,
185
- height=downscaled_height,
186
- num_frames=num_frames,
187
- timesteps=[1000, 993, 987, 981, 975, 909, 725, 0.03],
188
- decode_timestep=0.05,
189
- decode_noise_scale=0.025,
190
- image_cond_noise_scale=0.0,
191
- guidance_scale=guidance_scale,
192
- guidance_rescale=0.7,
193
- generator=torch.Generator().manual_seed(seed),
194
- #output_type="latent",
195
- output_type="np",
196
- **pipeline_args
197
- ).frames[0]
198
 
199
  # ETAPA 2: Upscale dos latentes
200
  #upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
 
31
  print("=== [Inicialização da Aplicação] ===")
32
 
33
  # 1. Carregar Configuração do Arquivo YAML
34
+ CONFIG_PATH = Path("ltxv-13b-0.9.8-dev-fp8.yaml")
35
  if not CONFIG_PATH.exists():
36
  raise FileNotFoundError(f"Arquivo de configuração '{CONFIG_PATH}' não encontrado.")
37
  with open(CONFIG_PATH, "r") as f:
38
+ CONFIG = yaml.safe_load(f)
39
  print(f"Configuração carregada de: {CONFIG_PATH}")
40
+ print(json.dumps(CONFIG, indent=2))
41
 
42
  # Parâmetros Globais
43
  device = "cuda" if torch.cuda.is_available() else "cpu"
 
49
 
50
  FPS = 24
51
 
52
+ CACHE_DIR = os.environ.get("HF_HOME")
53
+ DEPS_DIR = Path("/data")
54
+ LTX_VIDEO_REPO_DIR = DEPS_DIR / "LTX-Video"
55
+ BASE_CONFIG_PATH = LTX_VIDEO_REPO_DIR / "configs"
56
+ DEFAULT_CONFIG_FILE = BASE_CONFIG_PATH / "ltxv-13b-0.9.8-dev-fp8.yaml"
57
+ LTX_REPO_ID = "Lightricks/LTX-Video"
58
+ RESULTS_DIR = Path("/app/output")
59
+ DEFAULT_FPS = 24.0
60
+ FRAMES_ALIGNMENT = 8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
 
 
 
 
62
 
63
+ # 2. Baixar os arquivos do modelo base
64
+ print(f"=== Baixando snapshot do repositório base: {base_repo} ===")
65
+ if True:
66
+ if True:
67
+ ckpt_path_str = hf_hub_download(repo_id=LTX_REPO_ID, filename=checkpoint_path, cache_dir=CACHE_DIR)
68
+ ckpt_path = Path(ckpt_path_str)
69
+ if not ckpt_path.is_file():
70
+ raise FileNotFoundError(f"Main checkpoint file not found: {ckpt_path}")
71
+
72
+ # 1. Carrega Metadados do Checkpoint
73
+ with safe_open(ckpt_path, framework="pt") as f:
74
+ metadata = f.metadata() or {}
75
+ config_str = metadata.get("config", "{}")
76
+ configs = json.loads(config_str)
77
+ allowed_inference_steps = configs.get("allowed_inference_steps")
78
+
79
+ # 2. Carrega os Componentes Individuais (todos na CPU)
80
+ # O `.from_pretrained(ckpt_path)` é inteligente e carrega os pesos corretos do arquivo .safetensors.
81
+ logging.info("Carregando VAE...")
82
+ vae = CausalVideoAutoencoder.from_pretrained(ckpt_path).to("cpu")
83
+
84
+ logging.info("Carregando Transformer...")
85
+ transformer = Transformer3DModel.from_pretrained(ckpt_path).to("cpu")
86
+
87
+ logging.info("Carregando Scheduler...")
88
+ scheduler = RectifiedFlowScheduler.from_pretrained(ckpt_path)
89
+
90
+ logging.info("Carregando Text Encoder e Tokenizer...")
91
+ text_encoder_path = self.config["text_encoder_model_name_or_path"]
92
+ text_encoder = T5EncoderModel.from_pretrained(text_encoder_path, subfolder="text_encoder").to("cpu")
93
+ tokenizer = T5Tokenizer.from_pretrained(text_encoder_path, subfolder="tokenizer")
94
+
95
+ patchifier = SymmetricPatchifier(patch_size=1)
96
+
97
+ # 3. Define a precisão dos modelos (ainda na CPU, será aplicado na GPU depois)
98
+ precision = self.config.get("precision", "bfloat16")
99
+ if precision == "bfloat16":
100
+ vae.to(torch.bfloat16)
101
+ transformer.to(torch.bfloat16)
102
+ text_encoder.to(torch.bfloat16)
103
+
104
+ # 4. Monta o objeto do Pipeline com os componentes carregados
105
+ logging.info("Montando o objeto LTXVideoPipeline...")
106
+ submodel_dict = {
107
+ "transformer": transformer,
108
+ "patchifier": patchifier,
109
+ "text_encoder": text_encoder,
110
+ "tokenizer": tokenizer,
111
+ "scheduler": scheduler,
112
+ "vae": vae,
113
+ "allowed_inference_steps": allowed_inference_steps,
114
+ # Os prompt enhancers são opcionais e não são carregados por padrão para economizar memória
115
+ "prompt_enhancer_image_caption_model": None,
116
+ "prompt_enhancer_image_caption_processor": None,
117
+ "prompt_enhancer_llm_model": None,
118
+ "prompt_enhancer_llm_tokenizer": None,
119
+ }
120
+ pipeline = LTXConditionPipeline(**submodel_dict)
121
 
122
 
123
  # 4. Montar a pipeline principal
 
164
  )
165
 
166
 
 
167
  conditions = []
168
  if condition_image_1 is not None:
169
  condition_image_1 = ImageOps.fit(condition_image_1, (downscaled_width, downscaled_height), Image.LANCZOS)
 
182
 
183
  pipeline_args = {}
184
  if conditions:
185
+ call_kwargs["conditions"] = conditions
186
 
187
  # Manipulação da seed
188
  if randomize_seed:
189
  seed = random.randint(0, 2**32 - 1)
190
 
191
+ if True:
192
+ call_kwargs = {
193
+ "prompt":prompt,
194
+ "height": downscaled_height,
195
+ "width": downscaled_width,
196
+ "skip_initial_inference_steps": 3,
197
+ "skip_final_inference_steps": 0,
198
+ "num_inference_steps": 30,
199
+ "negative_prompt": negative_prompt,
200
+ "guidance_scale": CONFIG.get("guidance_scale", [1, 1, 6, 8, 6, 1, 1]),
201
+ "stg_scale": CONFIG.get("stg_scale", [0, 0, 4, 4, 4, 2, 1]),
202
+ "rescaling_scale": CONFIG.get("rescaling_scale", [1, 1, 0.5, 0.5, 1, 1, 1]),
203
+ "skip_block_list": CONFIG.get("skip_block_list", [[], [11, 25, 35, 39], [22, 35, 39], [28], [28], [28], [28]]),
204
+ "frame_rate": int(DEFAULT_FPS),
205
+ "generator": torch.Generator().manual_seed(seed),
206
+ "output_type": "np",
207
+ "media_items": None,
208
+ "decode_timestep": CONFIG.get("decode_timestep", 0.05),
209
+ "decode_noise_scale": CONFIG.get("decode_noise_scale", 0.025),
210
+ "is_video": True,
211
+ "vae_per_channel_normalize": True,
212
+ "offload_to_cpu": False,
213
+ "enhance_prompt": False,
214
+ "num_frames": num_frames,
215
+ "downscale_factor": CONFIG.get("downscale_factor", 0.6666666),
216
+ "rescaling_scale": CONFIG.get("rescaling_scale", [1, 1, 0.5, 0.5, 1, 1, 1]),
217
+ "guidance_timesteps": CONFIG.get("guidance_timesteps", [1.0, 0.996, 0.9933, 0.9850, 0.9767, 0.9008, 0.6180]),
218
+ "skip_block_list": CONFIG.get("skip_block_list", [[], [11, 25, 35, 39], [22, 35, 39], [28], [28], [28], [28]]),
219
+ "sampler": CONFIG.get("sampler", "from_checkpoint"),
220
+ "precision": CONFIG.get("precision", "float8_e4m3fn"),
221
+ "stochastic_sampling": CONFIG.get("stochastic_sampling", False),
222
+ "cfg_star_rescale": CONFIG.get("cfg_star_rescale", True),
223
+ }
224
 
225
  # ETAPA 1: Geração do vídeo em baixa resolução
226
+ latents = pipeline(**call_kwargs).frames[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
 
228
  # ETAPA 2: Upscale dos latentes
229
  #upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2