James Zhou committed on
Commit
1062761
·
1 Parent(s): 764de60

[add] model setting

Browse files
Files changed (1) hide show
  1. config.yaml +49 -0
config.yaml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model_config:
2
+ model_name: HunyuanVideo-Foley-XXL
3
+ model_type: 1d
4
+ model_precision: bf16
5
+ model_kwargs:
6
+ depth_triple_blocks: 18
7
+ depth_single_blocks: 36
8
+ hidden_size: 1536
9
+ num_heads: 12
10
+ mlp_ratio: 4
11
+ mlp_act_type: "gelu_tanh"
12
+ qkv_bias: True
13
+ qk_norm: True
14
+ qk_norm_type: "rms"
15
+ attn_mode: "torch"
16
+ embedder_type: "default"
17
+ interleaved_audio_visual_rope: True
18
+ enable_learnable_empty_visual_feat: True
19
+ sync_modulation: False
20
+ add_sync_feat_to_audio: True
21
+ cross_attention: True
22
+ use_attention_mask: False
23
+ condition_projection: "linear"
24
+ sync_feat_dim: 768 # Synchformer 768 dim
25
+ condition_dim: 768 # clap 768 text condition dim (clip-text)
26
+ clip_dim: 768 # siglip2 visual dim
27
+ audio_vae_latent_dim: 128
28
+ audio_frame_rate: 50
29
+ patch_size: 1
30
+ rope_dim_list: null
31
+ rope_theta: 10000
32
+ text_length: 77
33
+ clip_length: 64
34
+ sync_length: 192
35
+ use_mmaudio_singleblock: True
36
+ depth_triple_ssl_encoder: null
37
+ depth_single_ssl_encoder: 8
38
+ use_repa_with_audiossl: True
39
+
40
+ diffusion_config:
41
+ denoise_type: "flow"
42
+ flow_path_type: "linear"
43
+ flow_predict_type: "velocity"
44
+ flow_reverse: True
45
+ flow_solver: "euler"
46
+ sample_flow_shift: 1.0
47
+ sample_use_flux_shift: False
48
+ flux_base_shift: 0.5
49
+ flux_max_shift: 1.15