tencent
/

HunyuanVideo-Foley

hunyuanvideo-foley

text-video-to-audio

Model card Files Files and versions

James Zhou committed on Aug 26

Commit

1062761

·

1 Parent(s): 764de60

[add] model setting

Files changed (1) hide show

config.yaml +49 -0

config.yaml ADDED Viewed

	@@ -0,0 +1,49 @@

+model_config:
+  model_name: HunyuanVideo-Foley-XXL
+  model_type: "1d"  # quoted: digit-leading scalar, must always load as a string
+  model_precision: bf16
+  model_kwargs:
+    depth_triple_blocks: 18
+    depth_single_blocks: 36
+    hidden_size: 1536
+    num_heads: 12
+    mlp_ratio: 4
+    mlp_act_type: "gelu_tanh"
+    qkv_bias: true
+    qk_norm: true
+    qk_norm_type: "rms"
+    attn_mode: "torch"
+    embedder_type: "default"
+    interleaved_audio_visual_rope: true
+    enable_learnable_empty_visual_feat: true
+    sync_modulation: false
+    add_sync_feat_to_audio: true
+    cross_attention: true
+    use_attention_mask: false
+    condition_projection: "linear"
+    sync_feat_dim: 768  # syncformer 768 dim
+    condition_dim: 768  # clap 768 text condition dim (clip-text)
+    clip_dim: 768  # siglip2 visual dim
+    audio_vae_latent_dim: 128
+    audio_frame_rate: 50
+    patch_size: 1
+    rope_dim_list: null
+    rope_theta: 10000
+    text_length: 77
+    clip_length: 64
+    sync_length: 192
+    use_mmaudio_singleblock: true
+    depth_triple_ssl_encoder: null
+    depth_single_ssl_encoder: 8
+    use_repa_with_audiossl: true
+diffusion_config:
+  denoise_type: "flow"
+  flow_path_type: "linear"
+  flow_predict_type: "velocity"
+  flow_reverse: true
+  flow_solver: "euler"
+  sample_flow_shift: 1.0
+  sample_use_flux_shift: false  # NOTE(review): flux_* shifts below presumably apply only when this is true; confirm against the loader
+  flux_base_shift: 0.5
+  flux_max_shift: 1.15