hqfang committed
Commit dc0553f · 1 Parent(s): f9da642

update to fp32 weights

config.json CHANGED
@@ -186,7 +186,7 @@
  }
  },
  "tie_word_embeddings": false,
- "torch_dtype": "bfloat16",
+ "torch_dtype": "float32",
  "transformers_version": "4.52.3",
  "use_cache": true,
  "vit_config": {
model-00001-of-00004.safetensors → model-00001-of-00007.safetensors RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ba6a7678f86a232a70a801fba317c3dfe8b754003e71861f54173f0bc53831e0
- size 4878581216
+ oid sha256:44d1112e516614a010331bba82eeeac9d6039e2f15b1c3c58f2a01b9e536c3b5
+ size 4978520816
model-00002-of-00004.safetensors → model-00002-of-00007.safetensors RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:b4a4a1c7e20477448b5ac6a2f3b6e502dd127fb43019d6095256f071f3f27a06
- size 4932745864
+ oid sha256:500c53be3ecef1e9e6a02211d2322096affca2dce6ec4e0413f9c1059a18430d
+ size 4778633920
model-00003-of-00004.safetensors → model-00003-of-00007.safetensors RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:7f5f9708d386c65a2e3b3d788ffdc360e5e305fdb9ac10f797b2f54e2c7f8f10
- size 4994552920
+ oid sha256:314ccb5eee07eb898b0f7fc7b0e79e20a8c917acf0caa34a63379ab2e73a4401
+ size 4661160168
model-00004-of-00004.safetensors → model-00004-of-00007.safetensors RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:2b0357c2b2be3790f125c6b2de468689cf857ee61f3d7183842e603eb64529aa
- size 1433042592
+ oid sha256:cc6678709f4720f62d175f755fd92fc3a71528712eb6d7bdf7822c82ea0f7e96
+ size 4661160192
model-00005-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3e5268ef2052de977b771942380ddafd7dd746ba3a1f3353483a1c6a20fa5fe5
+ size 4661160192
model-00006-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7273b2173b8c25346ec9071d2b3b8c489ce977bce9115ec19cf6b1027493a5b1
+ size 4997750712
model-00007-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:32beeed0e17cfaf1fb2f83b39c28ba8044a6ef41a7a865b4d7e3f0751fbfa2c4
+ size 3739371680
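
A quick sanity check on the LFS pointers above: the seven new shards total roughly twice the four old ones, which is what converting bfloat16 (2 bytes per parameter) to float32 (4 bytes per parameter) should give. Summing the sizes from the diff:

old_shards = [4878581216, 4932745864, 4994552920, 1433042592]
new_shards = [4978520816, 4778633920, 4661160168, 4661160192,
              4661160192, 4997750712, 3739371680]

print(sum(old_shards))                    # 16238922592 bytes, ~16.2 GB (bf16)
print(sum(new_shards))                    # 32477757680 bytes, ~32.5 GB (fp32)
print(sum(new_shards) / sum(old_shards))  # ~2.0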
model.safetensors.index.json CHANGED
The diff for this file is too large to render. See raw diff
 
model.yaml ADDED
@@ -0,0 +1,257 @@
+ model_name: molmo
+ llm:
+ d_model: 3584
+ n_heads: 28
+ n_kv_heads: 4
+ head_dim: null
+ qkv_bias: true
+ clip_qkv: null
+ n_layers: 28
+ mlp_ratio: 4
+ mlp_hidden_size: 37888
+ activation_type: swiglu
+ block_type: sequential
+ rope: true
+ rope_full_precision: true
+ rope_theta: 1000000.0
+ rope_type: default
+ rope_factor: null
+ rope_high_freq_factor: null
+ rope_low_freq_factor: null
+ rope_original_max_position_embeddings: null
+ attention_type: sdpa
+ float32_attention: true
+ attention_dropout: 0.0
+ attention_layer_norm: false
+ attention_layer_norm_type: olmo
+ residual_dropout: 0.1
+ response_residual_dropout: 0.0
+ layer_norm_type: rms
+ layer_norm_with_affine: true
+ layer_norm_eps: 1.0e-06
+ attention_layer_norm_with_affine: true
+ max_sequence_length: 4096
+ max_position_embeddings: null
+ include_bias: false
+ bias_for_layer_norm: null
+ norm_after: false
+ moe_num_experts: 8
+ moe_top_k: 2
+ moe_mlp_impl: sparse
+ moe_log_expert_assignment: false
+ moe_shared_expert: false
+ moe_lbl_in_fp32: false
+ moe_interleave: false
+ moe_loss_weight: 0.1
+ moe_zloss_weight: null
+ moe_dropless: true
+ moe_capacity_factor: 1.25
+ embedding_dropout: 0.0
+ scale_logits: false
+ vocab_size: 152064
+ additional_vocab_size: 128
+ weight_tying: false
+ embedding_size: 152064
+ use_position_ids: true
+ tokenizer:
+ identifier: Qwen/Qwen2.5-7B
+ tokenizer_dir: null
+ depth_tokens: true
+ init_path: gs://mm-olmo/pretrained_llms/qwen2.5-7b.pt
+ init_incremental: null
+ new_embedding_init_range: 0.02
+ initializer_range: 0.02
+ normalize_input_embeds: false
+ activation_checkpoint: whole_layer
+ compile: blocks
+ fix_pad_tokenizer: false
+ resize_vocab: false
+ init_std: 0.02
+ init_fn: normal
+ init_cutoff_factor: null
+ vision_backbone:
+ vit:
+ image_model_type: siglip
+ image_default_input_size:
+ - 378
+ - 378
+ image_patch_size: 14
+ image_pos_patch_size: 14
+ image_emb_dim: 1152
+ image_num_heads: 16
+ image_num_key_value_heads: 16
+ image_num_layers: 27
+ image_head_dim: 72
+ image_mlp_dim: 4304
+ image_mlp_activations: gelu_pytorch_tanh
+ image_dropout_rate: 0.0
+ image_num_pos: 729
+ image_norm_eps: 1.0e-06
+ attention_dropout: 0.0
+ residual_dropout: 0.0
+ initializer_range: 0.02
+ float32_attention: true
+ attention_type: sdpa
+ activation_checkpointing: true
+ init_path: gs://mm-olmo/pretrained_image_encoders/siglip2-so400m-14-384.pt
+ resize_mode: siglip
+ pad_value: 0.0
+ normalize: siglip
+ image_pooling_2d: attention_meanq
+ pooling_attention_mask: false
+ image_projector: mlp
+ image_padding_embed: null
+ vit_layers:
+ - -3
+ - -9
+ skip_unused_layers: true
+ image_feature_dropout: 0.0
+ connector_activation_checkpointing: true
+ compile_vit: blocks
+ data_formatter:
+ prompt_templates: uber_model
+ message_format: role
+ system_prompt: demo_or_style
+ always_start_with_space: false
+ default_inference_len: 65
+ select_answer: best
+ debug: false
+ image_last: false
+ format_message_list: null
+ p_one_message: 0.0
+ mm_preprocessor:
+ crop_mode: overlap-and-resize-c2
+ max_crops: 8
+ max_images: 2
+ max_multi_image_crops: 8
+ pooling_w: 2
+ pooling_h: 2
+ overlap_margins:
+ - 4
+ - 4
+ use_col_tokens: true
+ loss_token_weighting: root_subsegments
+ legacy_image_mask: false
+ max_answer_len: null
+ img_aug: true
+ bi_directional_attn: null
+ lora_enable: true
+ lora_rank: 32
+ lora_alpha: 16
+ lora_dropout: 0.0
+ lora_bias: none
+ n_action_bins: 256
+ norm_stats:
+ libero_object_no_noops_modified:
+ action:
+ mean:
+ - 0.07096529006958008
+ - 0.13498851656913757
+ - -0.04601382836699486
+ - 0.00123520044144243
+ - 0.006998839322477579
+ - -0.015027612447738647
+ - 0.46428999304771423
+ std:
+ - 0.2681235373020172
+ - 0.43846824765205383
+ - 0.4474974274635315
+ - 0.024446550756692886
+ - 0.049355510622262955
+ - 0.042107198387384415
+ - 0.49879148602485657
+ max:
+ - 0.9375
+ - 0.8919642567634583
+ - 0.9375
+ - 0.17678570747375488
+ - 0.35035714507102966
+ - 0.1810714304447174
+ - 1.0
+ min:
+ - -0.8839285969734192
+ - -0.9375
+ - -0.9375
+ - -0.15000000596046448
+ - -0.29035714268684387
+ - -0.32892856001853943
+ - 0.0
+ q01:
+ - -0.5383928418159485
+ - -0.8758928775787354
+ - -0.9375
+ - -0.06964285671710968
+ - -0.11678571254014969
+ - -0.15964286029338837
+ - 0.0
+ q99:
+ - 0.8464285731315613
+ - 0.84375
+ - 0.9375
+ - 0.08142857253551483
+ - 0.14892856776714325
+ - 0.0867857113480568
+ - 1.0
+ proprio:
+ mean:
+ - -0.02999030612409115
+ - -0.007947085425257683
+ - 0.20293472707271576
+ - 3.1086409091949463
+ - -0.21404768526554108
+ - -0.11307074874639511
+ - 0.0
+ - 0.029380427673459053
+ - -0.030556727200746536
+ std:
+ - 0.06694897264242172
+ - 0.17608462274074554
+ - 0.07807064801454544
+ - 0.0868484303355217
+ - 0.33540457487106323
+ - 0.20728276669979095
+ - 0.0
+ - 0.00956575945019722
+ - 0.009197483770549297
+ max:
+ - 0.14580604434013367
+ - 0.33216384053230286
+ - 0.3857804834842682
+ - 3.4003844261169434
+ - 0.7954911589622498
+ - 0.6642207503318787
+ - 0.0
+ - 0.04104341194033623
+ - -0.00018117300351150334
+ min:
+ - -0.1765444278717041
+ - -0.29457300901412964
+ - 0.008128180168569088
+ - 2.2890501022338867
+ - -1.883241891860962
+ - -1.0600427389144897
+ - 0.0
+ - 0.0006495157140307128
+ - -0.041782498359680176
+ q01:
+ - -0.14911890715360643
+ - -0.25978428691625594
+ - 0.009925739830359817
+ - 2.7545341420173646
+ - -1.3996034812927245
+ - -0.6867720144987106
+ - 0.0
+ - 0.008197814421728254
+ - -0.04015838988125324
+ q99:
+ - 0.09063626825809479
+ - 0.29066365867853167
+ - 0.3370887073874472
+ - 3.2611824750900267
+ - 0.32092821151018125
+ - 0.4037663781642913
+ - 0.0
+ - 0.039891827926039694
+ - -0.009106044843792932
+ num_transitions: 66984
+ num_trajectories: 454
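
The norm_stats block together with n_action_bins: 256 looks like OpenVLA-style action normalization, where each action dimension is scaled into [-1, 1] by its q01/q99 range and then bucketed into 256 bins. The sketch below shows that common convention only: whether this checkpoint uses exactly this scheme is an assumption, the helper names (normalize, unnormalize, discretize) are illustrative, and the code assumes the nesting implied by the key order above (norm_stats, then libero_object_no_noops_modified, then action).

import numpy as np
import yaml

with open("model.yaml") as f:
    cfg = yaml.safe_load(f)

stats = cfg["norm_stats"]["libero_object_no_noops_modified"]["action"]
q01, q99 = np.array(stats["q01"]), np.array(stats["q99"])
n_bins = cfg["n_action_bins"]  # 256

def normalize(action):
    # Scale a raw 7-dim action into [-1, 1] per dimension using the q01/q99 range.
    return np.clip(2.0 * (action - q01) / (q99 - q01 + 1e-8) - 1.0, -1.0, 1.0)

def unnormalize(norm_action):
    # Inverse of normalize(): map [-1, 1] back to raw action units.
    return 0.5 * (norm_action + 1.0) * (q99 - q01 + 1e-8) + q01

def discretize(action):
    # Bucket each normalized dimension into one of n_bins uniform bins over [-1, 1].
    bin_edges = np.linspace(-1.0, 1.0, n_bins)
    return np.digitize(normalize(action), bin_edges)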