w3en2g committed on
Commit ff5fb66 · verified · 1 Parent(s): 599d6cc

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,81 @@
+---
+library_name: transformers
+license: other
+base_model: Qwen/Qwen2.5-32B
+tags:
+- llama-factory
+- full
+- generated_from_trainer
+model-index:
+- name: Qwen2.5-32B-Base
+  results: []
+---
+
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+
+# Qwen2.5-32B-Base
+
+This model is a fine-tuned version of [Qwen/Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B) on the QA_train_data dataset.
+It achieves the following results on the evaluation set:
+- Loss: 0.9999
+
+## Model description
+
+More information needed
+
+## Intended uses & limitations
+
+More information needed
+
+## Training and evaluation data
+
+More information needed
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 5e-06
+- train_batch_size: 1
+- eval_batch_size: 1
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 16
+- gradient_accumulation_steps: 2
+- total_train_batch_size: 32
+- total_eval_batch_size: 16
+- optimizer: adamw_torch with betas=(0.9, 0.999) and epsilon=1e-08; no additional optimizer arguments
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.1
+- num_epochs: 3.0
+
+### Training results
+
+| Training Loss | Epoch | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 0.9373 | 0.1778 | 100 | 0.9672 |
+| 0.9671 | 0.3556 | 200 | 0.9639 |
+| 0.9584 | 0.5333 | 300 | 0.9629 |
+| 0.957 | 0.7111 | 400 | 0.9597 |
+| 0.9477 | 0.8889 | 500 | 0.9587 |
+| 0.8552 | 1.0667 | 600 | 0.9710 |
+| 0.7944 | 1.2444 | 700 | 0.9722 |
+| 0.7359 | 1.4222 | 800 | 0.9709 |
+| 0.8494 | 1.6 | 900 | 0.9662 |
+| 0.8163 | 1.7778 | 1000 | 0.9663 |
+| 0.8041 | 1.9556 | 1100 | 0.9639 |
+| 0.6291 | 2.1333 | 1200 | 0.9990 |
+| 0.6122 | 2.3111 | 1300 | 1.0004 |
+| 0.6718 | 2.4889 | 1400 | 1.0003 |
+| 0.6712 | 2.6667 | 1500 | 1.0002 |
+| 0.6397 | 2.8444 | 1600 | 0.9996 |
+
+
+### Framework versions
+
+- Transformers 4.46.1
+- Pytorch 2.5.1+cu124
+- Datasets 2.21.0
+- Tokenizers 0.20.3
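
For reference, a minimal inference sketch against the framework versions listed in the card. The repo id below is an assumption inferred from the uploader and model name, not something this commit confirms; a local path to this upload works equally well.

```python
# Minimal sketch: load the fine-tuned checkpoint and generate.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo = "w3en2g/Qwen2.5-32B-Base"  # hypothetical repo id -- substitute the real location
tokenizer = AutoTokenizer.from_pretrained(repo)
model = AutoModelForCausalLM.from_pretrained(
    repo,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype": "bfloat16" in config.json
    device_map="auto",           # ~65 GB of bf16 weights; spread across available GPUs
)

prompt = "Question: What does LoRA stand for?\nAnswer:"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```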
added_tokens.json ADDED
@@ -0,0 +1,24 @@
+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}
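
These ids match the Qwen2.5 tokenizer's special tokens (note `<|endoftext|>` = 151643, the same id used as BOS/EOS in config.json). A quick round-trip check, assuming the tokenizer files from this upload sit in the working directory:

```python
# Sketch: verify the added tokens resolve to the ids listed above.
import json
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(".")  # assumes this checkpoint's directory

with open("added_tokens.json") as f:
    added = json.load(f)

for token, expected_id in added.items():
    assert tokenizer.convert_tokens_to_ids(token) == expected_id, token
print(f"all {len(added)} added tokens map to their expected ids")
```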
all_results.json ADDED
@@ -0,0 +1,12 @@
+{
+  "epoch": 2.997333333333333,
+  "eval_loss": 0.9999473690986633,
+  "eval_runtime": 67.4925,
+  "eval_samples_per_second": 29.633,
+  "eval_steps_per_second": 1.852,
+  "total_flos": 297597215473664.0,
+  "train_loss": 0.7989534891938541,
+  "train_runtime": 8333.3074,
+  "train_samples_per_second": 6.48,
+  "train_steps_per_second": 0.202
+}
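
Two quick consistency checks fall out of these numbers: the eval loss corresponds to a perplexity of about e^0.9999 ≈ 2.72, and the card's effective batch size (1 per device × 16 devices × 2 accumulation steps) reproduces total_train_batch_size = 32. A sketch:

```python
# Sketch: sanity-check the reported metrics.
import math

eval_loss = 0.9999473690986633  # from all_results.json above
print(f"eval perplexity ~= {math.exp(eval_loss):.3f}")  # -> 2.718

# Effective batch size from the hyperparameters in the model card:
per_device, num_devices, grad_accum = 1, 16, 2
assert per_device * num_devices * grad_accum == 32  # total_train_batch_size
```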
config.json ADDED
@@ -0,0 +1,29 @@
+{
+  "_name_or_path": "Qwen/Qwen2.5-32B",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "intermediate_size": 27648,
+  "max_position_embeddings": 131072,
+  "max_window_layers": 64,
+  "model_type": "qwen2",
+  "num_attention_heads": 40,
+  "num_hidden_layers": 64,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.46.1",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 152064
+}
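
A few derived quantities make the attention layout concrete: the per-head dimension is 5120 / 40 = 128, and with 8 key/value heads each KV head is shared by 40 / 8 = 5 query heads (grouped-query attention). A sketch, assuming the config is read from this checkpoint's directory:

```python
# Sketch: derive the attention geometry from config.json.
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained(".")  # assumes this checkpoint's directory
head_dim = cfg.hidden_size // cfg.num_attention_heads           # 5120 // 40 = 128
q_per_kv = cfg.num_attention_heads // cfg.num_key_value_heads   # 40 // 8 = 5 (GQA)
print(f"head_dim={head_dim}, query heads per KV head={q_per_kv}")
```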
eval_results.json ADDED
@@ -0,0 +1,7 @@
+{
+  "epoch": 2.997333333333333,
+  "eval_loss": 0.9999473690986633,
+  "eval_runtime": 67.4925,
+  "eval_samples_per_second": 29.633,
+  "eval_steps_per_second": 1.852
+}
generation_config.json ADDED
@@ -0,0 +1,6 @@
+{
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "max_new_tokens": 2048,
+  "transformers_version": "4.46.1"
+}
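
These are the decoding defaults `model.generate` picks up when nothing overrides them: greedy decoding, up to 2048 new tokens, and token 151643 (`<|endoftext|>`) serving as both BOS and EOS, as in the base model. They can be inspected directly:

```python
# Sketch: read the shipped generation defaults (assumes the checkpoint directory).
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained(".")
print(gen_cfg.max_new_tokens, gen_cfg.bos_token_id, gen_cfg.eos_token_id)
# -> 2048 151643 151643
```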
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6a8f3fc838fa2cc8e4e18bb9e4bff58453db6e7b3a35f9d5456d6da7170636de
+size 4891730992
model-00002-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6ce4b27d2bcc747451e717646b7115e94dedf984aedadcec20dfaed93b4aee98
+size 4876059352
model-00003-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a2a36d34ffcd7bb0619ac02565031abbaf18de66effeb816e8be4993dc701c14
+size 4876059384
model-00004-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2f570c10d7fe1a73d7362f0acb38c7be37639a9c60205e45070f2f06ae66b67c
+size 4876059416
model-00005-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:80b6ca01c24590f188630ab33c9376bff5da4cbc0f481f6c3d2d0bc215802455
+size 4876059416
model-00006-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1151a1548683255a03140de6c0c5d8827f91f7306fe41a0b6b80b9c100f32942
+size 4876059416
model-00007-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:15bf6ba48b9b22f2ae29e06de39732a5be1faa8ab1a9f8b56ab6dd00b1b46e80
+size 4876059416
model-00008-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f5a5b8f22704766b95042ee2fab1f77710250eb1d3890f15a83715d71dfc4c28
+size 4876059416
model-00009-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9870cd80a9175c827ef8a9c96f8353ab7cc18b29d7bfe4454de9d910f51b1146
+size 4876059416
model-00010-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aef2e801263b7f886a4981c37f1cce53b404809122084b20dfc10ebb27d0b9cd
+size 4876059416
model-00011-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:64ba1f1b0973b790e76b53b13aa12ac5eb1b7f2ef8a7be5828b3840447b2ddf8
+size 4876059416
model-00012-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aad45de9f5b9ca3736693f5f05ef00a772b8c13ad2f22c3a8e72fe84aac94638
+size 4876059416
model-00013-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4f807cf5d2c9c986124fb054c48238f32b2ffd74811db915a788b498ae83365b
+size 4876059416
model-00014-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c178f2eaa397911f576aed4e4ca950219caf040842351fad77f2daae1125d4c2
+size 2123397800
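
The ~65 GB of bf16 weights are split across the 14 LFS shards above (the index metadata counts 65,527,752,704 tensor bytes; each shard file is slightly larger because it carries its own safetensors header). The index file that follows maps every tensor name to its shard, so a single tensor can be read without loading the whole model. A sketch, assuming the files sit in the working directory:

```python
# Sketch: resolve a tensor to its shard via the index, then read just that tensor.
import json
from safetensors import safe_open

with open("model.safetensors.index.json") as f:
    index = json.load(f)

name = "model.layers.13.mlp.gate_proj.weight"
shard = index["weight_map"][name]  # -> "model-00003-of-00014.safetensors"

with safe_open(shard, framework="pt") as f:
    tensor = f.get_tensor(name)    # loads only this tensor from the shard
print(shard, tuple(tensor.shape))  # shape (27648, 5120) per config.json
```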
model.safetensors.index.json ADDED
@@ -0,0 +1,778 @@
+{
+  "metadata": {
+    "total_size": 65527752704
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00014-of-00014.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00014.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00014.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00014.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00014.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00014.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00014.safetensors",
+    "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00014.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00014.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00014.safetensors",
+    "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00014.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00014.safetensors",
+    "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00014.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00014.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00014.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00014.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00014.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00014.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00014.safetensors",
+    "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00014.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00014.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00014.safetensors",
+    "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00014.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00014.safetensors",
+    "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00014.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00014.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00003-of-00014.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00003-of-00014.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00014.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00003-of-00014.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00014.safetensors",
+    "model.layers.10.self_attn.k_proj.bias": "model-00003-of-00014.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00014.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00014.safetensors",
+    "model.layers.10.self_attn.q_proj.bias": "model-00003-of-00014.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00014.safetensors",
+    "model.layers.10.self_attn.v_proj.bias": "model-00003-of-00014.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00014.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00003-of-00014.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00003-of-00014.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00014.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00003-of-00014.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00014.safetensors",
+    "model.layers.11.self_attn.k_proj.bias": "model-00003-of-00014.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00014.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00014.safetensors",
+    "model.layers.11.self_attn.q_proj.bias": "model-00003-of-00014.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00014.safetensors",
+    "model.layers.11.self_attn.v_proj.bias": "model-00003-of-00014.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00014.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00003-of-00014.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00003-of-00014.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00014.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00003-of-00014.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00014.safetensors",
+    "model.layers.12.self_attn.k_proj.bias": "model-00003-of-00014.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00014.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00014.safetensors",
+    "model.layers.12.self_attn.q_proj.bias": "model-00003-of-00014.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00014.safetensors",
+    "model.layers.12.self_attn.v_proj.bias": "model-00003-of-00014.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00014.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00004-of-00014.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00014.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00004-of-00014.safetensors",
+    "model.layers.13.self_attn.k_proj.bias": "model-00003-of-00014.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00014.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00014.safetensors",
+    "model.layers.13.self_attn.q_proj.bias": "model-00003-of-00014.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00014.safetensors",
+    "model.layers.13.self_attn.v_proj.bias": "model-00003-of-00014.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00014.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00004-of-00014.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00014.safetensors",
+    "model.layers.14.self_attn.k_proj.bias": "model-00004-of-00014.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.14.self_attn.q_proj.bias": "model-00004-of-00014.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.14.self_attn.v_proj.bias": "model-00004-of-00014.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00004-of-00014.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00014.safetensors",
+    "model.layers.15.self_attn.k_proj.bias": "model-00004-of-00014.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.15.self_attn.q_proj.bias": "model-00004-of-00014.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.15.self_attn.v_proj.bias": "model-00004-of-00014.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00004-of-00014.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00014.safetensors",
+    "model.layers.16.self_attn.k_proj.bias": "model-00004-of-00014.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.16.self_attn.q_proj.bias": "model-00004-of-00014.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.16.self_attn.v_proj.bias": "model-00004-of-00014.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00004-of-00014.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00014.safetensors",
+    "model.layers.17.self_attn.k_proj.bias": "model-00004-of-00014.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.17.self_attn.q_proj.bias": "model-00004-of-00014.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.17.self_attn.v_proj.bias": "model-00004-of-00014.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00005-of-00014.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00005-of-00014.safetensors",
+    "model.layers.18.self_attn.k_proj.bias": "model-00004-of-00014.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.18.self_attn.q_proj.bias": "model-00004-of-00014.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.18.self_attn.v_proj.bias": "model-00004-of-00014.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00014.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00005-of-00014.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00005-of-00014.safetensors",
+    "model.layers.19.self_attn.k_proj.bias": "model-00005-of-00014.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.19.self_attn.q_proj.bias": "model-00005-of-00014.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.19.self_attn.v_proj.bias": "model-00005-of-00014.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00014.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00014.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00014.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00014.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00014.safetensors",
+    "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00014.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00014.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00014.safetensors",
+    "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00014.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00014.safetensors",
+    "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00014.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00014.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00005-of-00014.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00014.safetensors",
+    "model.layers.20.self_attn.k_proj.bias": "model-00005-of-00014.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.20.self_attn.q_proj.bias": "model-00005-of-00014.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.20.self_attn.v_proj.bias": "model-00005-of-00014.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00005-of-00014.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00014.safetensors",
+    "model.layers.21.self_attn.k_proj.bias": "model-00005-of-00014.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.21.self_attn.q_proj.bias": "model-00005-of-00014.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.21.self_attn.v_proj.bias": "model-00005-of-00014.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00005-of-00014.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00014.safetensors",
+    "model.layers.22.self_attn.k_proj.bias": "model-00005-of-00014.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.22.self_attn.q_proj.bias": "model-00005-of-00014.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.22.self_attn.v_proj.bias": "model-00005-of-00014.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00006-of-00014.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00006-of-00014.safetensors",
+    "model.layers.23.self_attn.k_proj.bias": "model-00005-of-00014.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.23.self_attn.q_proj.bias": "model-00005-of-00014.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.23.self_attn.v_proj.bias": "model-00005-of-00014.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00014.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00006-of-00014.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00006-of-00014.safetensors",
+    "model.layers.24.self_attn.k_proj.bias": "model-00006-of-00014.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.24.self_attn.q_proj.bias": "model-00006-of-00014.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.24.self_attn.v_proj.bias": "model-00006-of-00014.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00006-of-00014.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00014.safetensors",
+    "model.layers.25.self_attn.k_proj.bias": "model-00006-of-00014.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.25.self_attn.q_proj.bias": "model-00006-of-00014.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.25.self_attn.v_proj.bias": "model-00006-of-00014.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00006-of-00014.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00014.safetensors",
+    "model.layers.26.self_attn.k_proj.bias": "model-00006-of-00014.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.26.self_attn.q_proj.bias": "model-00006-of-00014.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.26.self_attn.v_proj.bias": "model-00006-of-00014.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00006-of-00014.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00014.safetensors",
+    "model.layers.27.self_attn.k_proj.bias": "model-00006-of-00014.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.27.self_attn.q_proj.bias": "model-00006-of-00014.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.27.self_attn.v_proj.bias": "model-00006-of-00014.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.28.input_layernorm.weight": "model-00007-of-00014.safetensors",
+    "model.layers.28.mlp.down_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.28.mlp.up_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.28.post_attention_layernorm.weight": "model-00007-of-00014.safetensors",
+    "model.layers.28.self_attn.k_proj.bias": "model-00006-of-00014.safetensors",
+    "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.28.self_attn.q_proj.bias": "model-00006-of-00014.safetensors",
+    "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.28.self_attn.v_proj.bias": "model-00006-of-00014.safetensors",
+    "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00014.safetensors",
+    "model.layers.29.input_layernorm.weight": "model-00007-of-00014.safetensors",
+    "model.layers.29.mlp.down_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.29.mlp.gate_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.29.mlp.up_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.29.post_attention_layernorm.weight": "model-00007-of-00014.safetensors",
+    "model.layers.29.self_attn.k_proj.bias": "model-00007-of-00014.safetensors",
+    "model.layers.29.self_attn.k_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.29.self_attn.o_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.29.self_attn.q_proj.bias": "model-00007-of-00014.safetensors",
+    "model.layers.29.self_attn.q_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.29.self_attn.v_proj.bias": "model-00007-of-00014.safetensors",
+    "model.layers.29.self_attn.v_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00002-of-00014.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00002-of-00014.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00014.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00002-of-00014.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00014.safetensors",
+    "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00014.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00014.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00014.safetensors",
+    "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00014.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00014.safetensors",
+    "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00014.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00014.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00007-of-00014.safetensors",
+    "model.layers.30.mlp.down_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.30.mlp.gate_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.30.mlp.up_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.30.post_attention_layernorm.weight": "model-00007-of-00014.safetensors",
+    "model.layers.30.self_attn.k_proj.bias": "model-00007-of-00014.safetensors",
+    "model.layers.30.self_attn.k_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.30.self_attn.o_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.30.self_attn.q_proj.bias": "model-00007-of-00014.safetensors",
+    "model.layers.30.self_attn.q_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.30.self_attn.v_proj.bias": "model-00007-of-00014.safetensors",
+    "model.layers.30.self_attn.v_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00007-of-00014.safetensors",
+    "model.layers.31.mlp.down_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.31.mlp.gate_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.31.mlp.up_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00014.safetensors",
+    "model.layers.31.self_attn.k_proj.bias": "model-00007-of-00014.safetensors",
+    "model.layers.31.self_attn.k_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.31.self_attn.o_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.31.self_attn.q_proj.bias": "model-00007-of-00014.safetensors",
+    "model.layers.31.self_attn.q_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.31.self_attn.v_proj.bias": "model-00007-of-00014.safetensors",
+    "model.layers.31.self_attn.v_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.32.input_layernorm.weight": "model-00007-of-00014.safetensors",
+    "model.layers.32.mlp.down_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.32.mlp.gate_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.32.mlp.up_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.32.post_attention_layernorm.weight": "model-00007-of-00014.safetensors",
+    "model.layers.32.self_attn.k_proj.bias": "model-00007-of-00014.safetensors",
+    "model.layers.32.self_attn.k_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.32.self_attn.o_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.32.self_attn.q_proj.bias": "model-00007-of-00014.safetensors",
+    "model.layers.32.self_attn.q_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.32.self_attn.v_proj.bias": "model-00007-of-00014.safetensors",
+    "model.layers.32.self_attn.v_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.33.input_layernorm.weight": "model-00008-of-00014.safetensors",
+    "model.layers.33.mlp.down_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.33.mlp.gate_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.33.mlp.up_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.33.post_attention_layernorm.weight": "model-00008-of-00014.safetensors",
+    "model.layers.33.self_attn.k_proj.bias": "model-00007-of-00014.safetensors",
+    "model.layers.33.self_attn.k_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.33.self_attn.o_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.33.self_attn.q_proj.bias": "model-00007-of-00014.safetensors",
+    "model.layers.33.self_attn.q_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.33.self_attn.v_proj.bias": "model-00007-of-00014.safetensors",
+    "model.layers.33.self_attn.v_proj.weight": "model-00007-of-00014.safetensors",
+    "model.layers.34.input_layernorm.weight": "model-00008-of-00014.safetensors",
+    "model.layers.34.mlp.down_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.34.mlp.gate_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.34.mlp.up_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.34.post_attention_layernorm.weight": "model-00008-of-00014.safetensors",
+    "model.layers.34.self_attn.k_proj.bias": "model-00008-of-00014.safetensors",
+    "model.layers.34.self_attn.k_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.34.self_attn.o_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.34.self_attn.q_proj.bias": "model-00008-of-00014.safetensors",
+    "model.layers.34.self_attn.q_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.34.self_attn.v_proj.bias": "model-00008-of-00014.safetensors",
+    "model.layers.34.self_attn.v_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.35.input_layernorm.weight": "model-00008-of-00014.safetensors",
+    "model.layers.35.mlp.down_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.35.mlp.gate_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.35.mlp.up_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.35.post_attention_layernorm.weight": "model-00008-of-00014.safetensors",
+    "model.layers.35.self_attn.k_proj.bias": "model-00008-of-00014.safetensors",
+    "model.layers.35.self_attn.k_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.35.self_attn.o_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.35.self_attn.q_proj.bias": "model-00008-of-00014.safetensors",
+    "model.layers.35.self_attn.q_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.35.self_attn.v_proj.bias": "model-00008-of-00014.safetensors",
+    "model.layers.35.self_attn.v_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.36.input_layernorm.weight": "model-00008-of-00014.safetensors",
+    "model.layers.36.mlp.down_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.36.mlp.gate_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.36.mlp.up_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.36.post_attention_layernorm.weight": "model-00008-of-00014.safetensors",
+    "model.layers.36.self_attn.k_proj.bias": "model-00008-of-00014.safetensors",
+    "model.layers.36.self_attn.k_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.36.self_attn.o_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.36.self_attn.q_proj.bias": "model-00008-of-00014.safetensors",
+    "model.layers.36.self_attn.q_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.36.self_attn.v_proj.bias": "model-00008-of-00014.safetensors",
+    "model.layers.36.self_attn.v_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.37.input_layernorm.weight": "model-00008-of-00014.safetensors",
+    "model.layers.37.mlp.down_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.37.mlp.gate_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.37.mlp.up_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.37.post_attention_layernorm.weight": "model-00008-of-00014.safetensors",
+    "model.layers.37.self_attn.k_proj.bias": "model-00008-of-00014.safetensors",
+    "model.layers.37.self_attn.k_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.37.self_attn.o_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.37.self_attn.q_proj.bias": "model-00008-of-00014.safetensors",
+    "model.layers.37.self_attn.q_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.37.self_attn.v_proj.bias": "model-00008-of-00014.safetensors",
+    "model.layers.37.self_attn.v_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.38.input_layernorm.weight": "model-00009-of-00014.safetensors",
+    "model.layers.38.mlp.down_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.38.mlp.gate_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.38.mlp.up_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.38.post_attention_layernorm.weight": "model-00009-of-00014.safetensors",
+    "model.layers.38.self_attn.k_proj.bias": "model-00008-of-00014.safetensors",
+    "model.layers.38.self_attn.k_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.38.self_attn.o_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.38.self_attn.q_proj.bias": "model-00008-of-00014.safetensors",
+    "model.layers.38.self_attn.q_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.38.self_attn.v_proj.bias": "model-00008-of-00014.safetensors",
+    "model.layers.38.self_attn.v_proj.weight": "model-00008-of-00014.safetensors",
+    "model.layers.39.input_layernorm.weight": "model-00009-of-00014.safetensors",
+    "model.layers.39.mlp.down_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.39.mlp.gate_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.39.mlp.up_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.39.post_attention_layernorm.weight": "model-00009-of-00014.safetensors",
+    "model.layers.39.self_attn.k_proj.bias": "model-00009-of-00014.safetensors",
+    "model.layers.39.self_attn.k_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.39.self_attn.o_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.39.self_attn.q_proj.bias": "model-00009-of-00014.safetensors",
+    "model.layers.39.self_attn.q_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.39.self_attn.v_proj.bias": "model-00009-of-00014.safetensors",
+    "model.layers.39.self_attn.v_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00002-of-00014.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00002-of-00014.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00014.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00002-of-00014.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00014.safetensors",
+    "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00014.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00014.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00014.safetensors",
+    "model.layers.4.self_attn.q_proj.bias": "model-00002-of-00014.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00014.safetensors",
+    "model.layers.4.self_attn.v_proj.bias": "model-00002-of-00014.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00014.safetensors",
+    "model.layers.40.input_layernorm.weight": "model-00009-of-00014.safetensors",
+    "model.layers.40.mlp.down_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.40.mlp.gate_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.40.mlp.up_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.40.post_attention_layernorm.weight": "model-00009-of-00014.safetensors",
+    "model.layers.40.self_attn.k_proj.bias": "model-00009-of-00014.safetensors",
+    "model.layers.40.self_attn.k_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.40.self_attn.o_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.40.self_attn.q_proj.bias": "model-00009-of-00014.safetensors",
+    "model.layers.40.self_attn.q_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.40.self_attn.v_proj.bias": "model-00009-of-00014.safetensors",
+    "model.layers.40.self_attn.v_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.41.input_layernorm.weight": "model-00009-of-00014.safetensors",
+    "model.layers.41.mlp.down_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.41.mlp.gate_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.41.mlp.up_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.41.post_attention_layernorm.weight": "model-00009-of-00014.safetensors",
+    "model.layers.41.self_attn.k_proj.bias": "model-00009-of-00014.safetensors",
+    "model.layers.41.self_attn.k_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.41.self_attn.o_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.41.self_attn.q_proj.bias": "model-00009-of-00014.safetensors",
+    "model.layers.41.self_attn.q_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.41.self_attn.v_proj.bias": "model-00009-of-00014.safetensors",
+    "model.layers.41.self_attn.v_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.42.input_layernorm.weight": "model-00009-of-00014.safetensors",
+    "model.layers.42.mlp.down_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.42.mlp.gate_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.42.mlp.up_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.42.post_attention_layernorm.weight": "model-00009-of-00014.safetensors",
+    "model.layers.42.self_attn.k_proj.bias": "model-00009-of-00014.safetensors",
+    "model.layers.42.self_attn.k_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.42.self_attn.o_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.42.self_attn.q_proj.bias": "model-00009-of-00014.safetensors",
+    "model.layers.42.self_attn.q_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.42.self_attn.v_proj.bias": "model-00009-of-00014.safetensors",
+    "model.layers.42.self_attn.v_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.43.input_layernorm.weight": "model-00010-of-00014.safetensors",
+    "model.layers.43.mlp.down_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.43.mlp.gate_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.43.mlp.up_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.43.post_attention_layernorm.weight": "model-00010-of-00014.safetensors",
+    "model.layers.43.self_attn.k_proj.bias": "model-00009-of-00014.safetensors",
+    "model.layers.43.self_attn.k_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.43.self_attn.o_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.43.self_attn.q_proj.bias": "model-00009-of-00014.safetensors",
+    "model.layers.43.self_attn.q_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.43.self_attn.v_proj.bias": "model-00009-of-00014.safetensors",
+    "model.layers.43.self_attn.v_proj.weight": "model-00009-of-00014.safetensors",
+    "model.layers.44.input_layernorm.weight": "model-00010-of-00014.safetensors",
+    "model.layers.44.mlp.down_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.44.mlp.gate_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.44.mlp.up_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.44.post_attention_layernorm.weight": "model-00010-of-00014.safetensors",
+    "model.layers.44.self_attn.k_proj.bias": "model-00010-of-00014.safetensors",
+    "model.layers.44.self_attn.k_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.44.self_attn.o_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.44.self_attn.q_proj.bias": "model-00010-of-00014.safetensors",
+    "model.layers.44.self_attn.q_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.44.self_attn.v_proj.bias": "model-00010-of-00014.safetensors",
+    "model.layers.44.self_attn.v_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.45.input_layernorm.weight": "model-00010-of-00014.safetensors",
+    "model.layers.45.mlp.down_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.45.mlp.gate_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.45.mlp.up_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.45.post_attention_layernorm.weight": "model-00010-of-00014.safetensors",
+    "model.layers.45.self_attn.k_proj.bias": "model-00010-of-00014.safetensors",
+    "model.layers.45.self_attn.k_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.45.self_attn.o_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.45.self_attn.q_proj.bias": "model-00010-of-00014.safetensors",
+    "model.layers.45.self_attn.q_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.45.self_attn.v_proj.bias": "model-00010-of-00014.safetensors",
+    "model.layers.45.self_attn.v_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.46.input_layernorm.weight": "model-00010-of-00014.safetensors",
+    "model.layers.46.mlp.down_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.46.mlp.gate_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.46.mlp.up_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.46.post_attention_layernorm.weight": "model-00010-of-00014.safetensors",
+    "model.layers.46.self_attn.k_proj.bias": "model-00010-of-00014.safetensors",
+    "model.layers.46.self_attn.k_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.46.self_attn.o_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.46.self_attn.q_proj.bias": "model-00010-of-00014.safetensors",
+    "model.layers.46.self_attn.q_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.46.self_attn.v_proj.bias": "model-00010-of-00014.safetensors",
+    "model.layers.46.self_attn.v_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.47.input_layernorm.weight": "model-00010-of-00014.safetensors",
+    "model.layers.47.mlp.down_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.47.mlp.gate_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.47.mlp.up_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.47.post_attention_layernorm.weight": "model-00010-of-00014.safetensors",
+    "model.layers.47.self_attn.k_proj.bias": "model-00010-of-00014.safetensors",
+    "model.layers.47.self_attn.k_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.47.self_attn.o_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.47.self_attn.q_proj.bias": "model-00010-of-00014.safetensors",
+    "model.layers.47.self_attn.q_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.47.self_attn.v_proj.bias": "model-00010-of-00014.safetensors",
+    "model.layers.47.self_attn.v_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.48.input_layernorm.weight": "model-00011-of-00014.safetensors",
+    "model.layers.48.mlp.down_proj.weight": "model-00011-of-00014.safetensors",
+    "model.layers.48.mlp.gate_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.48.mlp.up_proj.weight": "model-00011-of-00014.safetensors",
+    "model.layers.48.post_attention_layernorm.weight": "model-00011-of-00014.safetensors",
+    "model.layers.48.self_attn.k_proj.bias": "model-00010-of-00014.safetensors",
+    "model.layers.48.self_attn.k_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.48.self_attn.o_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.48.self_attn.q_proj.bias": "model-00010-of-00014.safetensors",
+    "model.layers.48.self_attn.q_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.48.self_attn.v_proj.bias": "model-00010-of-00014.safetensors",
+    "model.layers.48.self_attn.v_proj.weight": "model-00010-of-00014.safetensors",
+    "model.layers.49.input_layernorm.weight": "model-00011-of-00014.safetensors",
+    "model.layers.49.mlp.down_proj.weight": "model-00011-of-00014.safetensors",
+    "model.layers.49.mlp.gate_proj.weight": "model-00011-of-00014.safetensors",
+    "model.layers.49.mlp.up_proj.weight": "model-00011-of-00014.safetensors",
+    "model.layers.49.post_attention_layernorm.weight": "model-00011-of-00014.safetensors",
+    "model.layers.49.self_attn.k_proj.bias": "model-00011-of-00014.safetensors",
+    "model.layers.49.self_attn.k_proj.weight": "model-00011-of-00014.safetensors",
+    "model.layers.49.self_attn.o_proj.weight": "model-00011-of-00014.safetensors",
+    "model.layers.49.self_attn.q_proj.bias": "model-00011-of-00014.safetensors",
+    "model.layers.49.self_attn.q_proj.weight": "model-00011-of-00014.safetensors",
+    "model.layers.49.self_attn.v_proj.bias": "model-00011-of-00014.safetensors",
+    "model.layers.49.self_attn.v_proj.weight": "model-00011-of-00014.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00002-of-00014.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00002-of-00014.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00014.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00002-of-00014.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00014.safetensors",
+    "model.layers.5.self_attn.k_proj.bias": "model-00002-of-00014.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00014.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00014.safetensors",
+    "model.layers.5.self_attn.q_proj.bias": "model-00002-of-00014.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00014.safetensors",
+    "model.layers.5.self_attn.v_proj.bias": "model-00002-of-00014.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00014.safetensors",
+    "model.layers.50.input_layernorm.weight": "model-00011-of-00014.safetensors",
+    "model.layers.50.mlp.down_proj.weight": "model-00011-of-00014.safetensors",
+    "model.layers.50.mlp.gate_proj.weight": "model-00011-of-00014.safetensors",
+    "model.layers.50.mlp.up_proj.weight": "model-00011-of-00014.safetensors",
+    "model.layers.50.post_attention_layernorm.weight": "model-00011-of-00014.safetensors",
+    "model.layers.50.self_attn.k_proj.bias": "model-00011-of-00014.safetensors",
+    "model.layers.50.self_attn.k_proj.weight": "model-00011-of-00014.safetensors",
+    "model.layers.50.self_attn.o_proj.weight": "model-00011-of-00014.safetensors",
+    "model.layers.50.self_attn.q_proj.bias": "model-00011-of-00014.safetensors",
+    "model.layers.50.self_attn.q_proj.weight": "model-00011-of-00014.safetensors",
+    "model.layers.50.self_attn.v_proj.bias": "model-00011-of-00014.safetensors",
+    "model.layers.50.self_attn.v_proj.weight": "model-00011-of-00014.safetensors",
+    "model.layers.51.input_layernorm.weight": "model-00011-of-00014.safetensors",
+    "model.layers.51.mlp.down_proj.weight": "model-00011-of-00014.safetensors",
+    "model.layers.51.mlp.gate_proj.weight": "model-00011-of-00014.safetensors",
+    "model.layers.51.mlp.up_proj.weight": "model-00011-of-00014.safetensors",
+    "model.layers.51.post_attention_layernorm.weight": "model-00011-of-00014.safetensors",
+    "model.layers.51.self_attn.k_proj.bias": "model-00011-of-00014.safetensors",
+    "model.layers.51.self_attn.k_proj.weight": "model-00011-of-00014.safetensors",
+    "model.layers.51.self_attn.o_proj.weight": "model-00011-of-00014.safetensors",
+    "model.layers.51.self_attn.q_proj.bias": "model-00011-of-00014.safetensors",
+    "model.layers.51.self_attn.q_proj.weight": "model-00011-of-00014.safetensors",
+    "model.layers.51.self_attn.v_proj.bias": "model-00011-of-00014.safetensors",
+    "model.layers.51.self_attn.v_proj.weight": "model-00011-of-00014.safetensors",
+    "model.layers.52.input_layernorm.weight": "model-00011-of-00014.safetensors",
+    "model.layers.52.mlp.down_proj.weight": "model-00011-of-00014.safetensors",
+    "model.layers.52.mlp.gate_proj.weight": "model-00011-of-00014.safetensors",
+    "model.layers.52.mlp.up_proj.weight": "model-00011-of-00014.safetensors",
+    "model.layers.52.post_attention_layernorm.weight": "model-00011-of-00014.safetensors",
+    "model.layers.52.self_attn.k_proj.bias": "model-00011-of-00014.safetensors",
+    "model.layers.52.self_attn.k_proj.weight": "model-00011-of-00014.safetensors",
+    "model.layers.52.self_attn.o_proj.weight": "model-00011-of-00014.safetensors",
+    "model.layers.52.self_attn.q_proj.bias": "model-00011-of-00014.safetensors",
+    "model.layers.52.self_attn.q_proj.weight": "model-00011-of-00014.safetensors",
+    "model.layers.52.self_attn.v_proj.bias": "model-00011-of-00014.safetensors",
+    "model.layers.52.self_attn.v_proj.weight": "model-00011-of-00014.safetensors",
+    "model.layers.53.input_layernorm.weight": "model-00012-of-00014.safetensors",
+    "model.layers.53.mlp.down_proj.weight": "model-00012-of-00014.safetensors",
+    "model.layers.53.mlp.gate_proj.weight": "model-00011-of-00014.safetensors",
+    "model.layers.53.mlp.up_proj.weight": "model-00012-of-00014.safetensors",
+    "model.layers.53.post_attention_layernorm.weight": "model-00012-of-00014.safetensors",
+    "model.layers.53.self_attn.k_proj.bias": "model-00011-of-00014.safetensors",
+    "model.layers.53.self_attn.k_proj.weight": "model-00011-of-00014.safetensors",
603
+ "model.layers.53.self_attn.o_proj.weight": "model-00011-of-00014.safetensors",
604
+ "model.layers.53.self_attn.q_proj.bias": "model-00011-of-00014.safetensors",
605
+ "model.layers.53.self_attn.q_proj.weight": "model-00011-of-00014.safetensors",
606
+ "model.layers.53.self_attn.v_proj.bias": "model-00011-of-00014.safetensors",
607
+ "model.layers.53.self_attn.v_proj.weight": "model-00011-of-00014.safetensors",
608
+ "model.layers.54.input_layernorm.weight": "model-00012-of-00014.safetensors",
609
+ "model.layers.54.mlp.down_proj.weight": "model-00012-of-00014.safetensors",
610
+ "model.layers.54.mlp.gate_proj.weight": "model-00012-of-00014.safetensors",
611
+ "model.layers.54.mlp.up_proj.weight": "model-00012-of-00014.safetensors",
612
+ "model.layers.54.post_attention_layernorm.weight": "model-00012-of-00014.safetensors",
613
+ "model.layers.54.self_attn.k_proj.bias": "model-00012-of-00014.safetensors",
614
+ "model.layers.54.self_attn.k_proj.weight": "model-00012-of-00014.safetensors",
615
+ "model.layers.54.self_attn.o_proj.weight": "model-00012-of-00014.safetensors",
616
+ "model.layers.54.self_attn.q_proj.bias": "model-00012-of-00014.safetensors",
617
+ "model.layers.54.self_attn.q_proj.weight": "model-00012-of-00014.safetensors",
618
+ "model.layers.54.self_attn.v_proj.bias": "model-00012-of-00014.safetensors",
619
+ "model.layers.54.self_attn.v_proj.weight": "model-00012-of-00014.safetensors",
620
+ "model.layers.55.input_layernorm.weight": "model-00012-of-00014.safetensors",
621
+ "model.layers.55.mlp.down_proj.weight": "model-00012-of-00014.safetensors",
622
+ "model.layers.55.mlp.gate_proj.weight": "model-00012-of-00014.safetensors",
623
+ "model.layers.55.mlp.up_proj.weight": "model-00012-of-00014.safetensors",
624
+ "model.layers.55.post_attention_layernorm.weight": "model-00012-of-00014.safetensors",
625
+ "model.layers.55.self_attn.k_proj.bias": "model-00012-of-00014.safetensors",
626
+ "model.layers.55.self_attn.k_proj.weight": "model-00012-of-00014.safetensors",
627
+ "model.layers.55.self_attn.o_proj.weight": "model-00012-of-00014.safetensors",
628
+ "model.layers.55.self_attn.q_proj.bias": "model-00012-of-00014.safetensors",
629
+ "model.layers.55.self_attn.q_proj.weight": "model-00012-of-00014.safetensors",
630
+ "model.layers.55.self_attn.v_proj.bias": "model-00012-of-00014.safetensors",
631
+ "model.layers.55.self_attn.v_proj.weight": "model-00012-of-00014.safetensors",
632
+ "model.layers.56.input_layernorm.weight": "model-00012-of-00014.safetensors",
633
+ "model.layers.56.mlp.down_proj.weight": "model-00012-of-00014.safetensors",
634
+ "model.layers.56.mlp.gate_proj.weight": "model-00012-of-00014.safetensors",
635
+ "model.layers.56.mlp.up_proj.weight": "model-00012-of-00014.safetensors",
636
+ "model.layers.56.post_attention_layernorm.weight": "model-00012-of-00014.safetensors",
637
+ "model.layers.56.self_attn.k_proj.bias": "model-00012-of-00014.safetensors",
638
+ "model.layers.56.self_attn.k_proj.weight": "model-00012-of-00014.safetensors",
639
+ "model.layers.56.self_attn.o_proj.weight": "model-00012-of-00014.safetensors",
640
+ "model.layers.56.self_attn.q_proj.bias": "model-00012-of-00014.safetensors",
641
+ "model.layers.56.self_attn.q_proj.weight": "model-00012-of-00014.safetensors",
642
+ "model.layers.56.self_attn.v_proj.bias": "model-00012-of-00014.safetensors",
643
+ "model.layers.56.self_attn.v_proj.weight": "model-00012-of-00014.safetensors",
644
+ "model.layers.57.input_layernorm.weight": "model-00012-of-00014.safetensors",
645
+ "model.layers.57.mlp.down_proj.weight": "model-00012-of-00014.safetensors",
646
+ "model.layers.57.mlp.gate_proj.weight": "model-00012-of-00014.safetensors",
647
+ "model.layers.57.mlp.up_proj.weight": "model-00012-of-00014.safetensors",
648
+ "model.layers.57.post_attention_layernorm.weight": "model-00012-of-00014.safetensors",
649
+ "model.layers.57.self_attn.k_proj.bias": "model-00012-of-00014.safetensors",
650
+ "model.layers.57.self_attn.k_proj.weight": "model-00012-of-00014.safetensors",
651
+ "model.layers.57.self_attn.o_proj.weight": "model-00012-of-00014.safetensors",
652
+ "model.layers.57.self_attn.q_proj.bias": "model-00012-of-00014.safetensors",
653
+ "model.layers.57.self_attn.q_proj.weight": "model-00012-of-00014.safetensors",
654
+ "model.layers.57.self_attn.v_proj.bias": "model-00012-of-00014.safetensors",
655
+ "model.layers.57.self_attn.v_proj.weight": "model-00012-of-00014.safetensors",
656
+ "model.layers.58.input_layernorm.weight": "model-00013-of-00014.safetensors",
657
+ "model.layers.58.mlp.down_proj.weight": "model-00013-of-00014.safetensors",
658
+ "model.layers.58.mlp.gate_proj.weight": "model-00012-of-00014.safetensors",
659
+ "model.layers.58.mlp.up_proj.weight": "model-00013-of-00014.safetensors",
660
+ "model.layers.58.post_attention_layernorm.weight": "model-00013-of-00014.safetensors",
661
+ "model.layers.58.self_attn.k_proj.bias": "model-00012-of-00014.safetensors",
662
+ "model.layers.58.self_attn.k_proj.weight": "model-00012-of-00014.safetensors",
663
+ "model.layers.58.self_attn.o_proj.weight": "model-00012-of-00014.safetensors",
664
+ "model.layers.58.self_attn.q_proj.bias": "model-00012-of-00014.safetensors",
665
+ "model.layers.58.self_attn.q_proj.weight": "model-00012-of-00014.safetensors",
666
+ "model.layers.58.self_attn.v_proj.bias": "model-00012-of-00014.safetensors",
667
+ "model.layers.58.self_attn.v_proj.weight": "model-00012-of-00014.safetensors",
668
+ "model.layers.59.input_layernorm.weight": "model-00013-of-00014.safetensors",
669
+ "model.layers.59.mlp.down_proj.weight": "model-00013-of-00014.safetensors",
670
+ "model.layers.59.mlp.gate_proj.weight": "model-00013-of-00014.safetensors",
671
+ "model.layers.59.mlp.up_proj.weight": "model-00013-of-00014.safetensors",
672
+ "model.layers.59.post_attention_layernorm.weight": "model-00013-of-00014.safetensors",
673
+ "model.layers.59.self_attn.k_proj.bias": "model-00013-of-00014.safetensors",
674
+ "model.layers.59.self_attn.k_proj.weight": "model-00013-of-00014.safetensors",
675
+ "model.layers.59.self_attn.o_proj.weight": "model-00013-of-00014.safetensors",
676
+ "model.layers.59.self_attn.q_proj.bias": "model-00013-of-00014.safetensors",
677
+ "model.layers.59.self_attn.q_proj.weight": "model-00013-of-00014.safetensors",
678
+ "model.layers.59.self_attn.v_proj.bias": "model-00013-of-00014.safetensors",
679
+ "model.layers.59.self_attn.v_proj.weight": "model-00013-of-00014.safetensors",
680
+ "model.layers.6.input_layernorm.weight": "model-00002-of-00014.safetensors",
681
+ "model.layers.6.mlp.down_proj.weight": "model-00002-of-00014.safetensors",
682
+ "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00014.safetensors",
683
+ "model.layers.6.mlp.up_proj.weight": "model-00002-of-00014.safetensors",
684
+ "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00014.safetensors",
685
+ "model.layers.6.self_attn.k_proj.bias": "model-00002-of-00014.safetensors",
686
+ "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00014.safetensors",
687
+ "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00014.safetensors",
688
+ "model.layers.6.self_attn.q_proj.bias": "model-00002-of-00014.safetensors",
689
+ "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00014.safetensors",
690
+ "model.layers.6.self_attn.v_proj.bias": "model-00002-of-00014.safetensors",
691
+ "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00014.safetensors",
692
+ "model.layers.60.input_layernorm.weight": "model-00013-of-00014.safetensors",
693
+ "model.layers.60.mlp.down_proj.weight": "model-00013-of-00014.safetensors",
694
+ "model.layers.60.mlp.gate_proj.weight": "model-00013-of-00014.safetensors",
695
+ "model.layers.60.mlp.up_proj.weight": "model-00013-of-00014.safetensors",
696
+ "model.layers.60.post_attention_layernorm.weight": "model-00013-of-00014.safetensors",
697
+ "model.layers.60.self_attn.k_proj.bias": "model-00013-of-00014.safetensors",
698
+ "model.layers.60.self_attn.k_proj.weight": "model-00013-of-00014.safetensors",
699
+ "model.layers.60.self_attn.o_proj.weight": "model-00013-of-00014.safetensors",
700
+ "model.layers.60.self_attn.q_proj.bias": "model-00013-of-00014.safetensors",
701
+ "model.layers.60.self_attn.q_proj.weight": "model-00013-of-00014.safetensors",
702
+ "model.layers.60.self_attn.v_proj.bias": "model-00013-of-00014.safetensors",
703
+ "model.layers.60.self_attn.v_proj.weight": "model-00013-of-00014.safetensors",
704
+ "model.layers.61.input_layernorm.weight": "model-00013-of-00014.safetensors",
705
+ "model.layers.61.mlp.down_proj.weight": "model-00013-of-00014.safetensors",
706
+ "model.layers.61.mlp.gate_proj.weight": "model-00013-of-00014.safetensors",
707
+ "model.layers.61.mlp.up_proj.weight": "model-00013-of-00014.safetensors",
708
+ "model.layers.61.post_attention_layernorm.weight": "model-00013-of-00014.safetensors",
709
+ "model.layers.61.self_attn.k_proj.bias": "model-00013-of-00014.safetensors",
710
+ "model.layers.61.self_attn.k_proj.weight": "model-00013-of-00014.safetensors",
711
+ "model.layers.61.self_attn.o_proj.weight": "model-00013-of-00014.safetensors",
712
+ "model.layers.61.self_attn.q_proj.bias": "model-00013-of-00014.safetensors",
713
+ "model.layers.61.self_attn.q_proj.weight": "model-00013-of-00014.safetensors",
714
+ "model.layers.61.self_attn.v_proj.bias": "model-00013-of-00014.safetensors",
715
+ "model.layers.61.self_attn.v_proj.weight": "model-00013-of-00014.safetensors",
716
+ "model.layers.62.input_layernorm.weight": "model-00013-of-00014.safetensors",
717
+ "model.layers.62.mlp.down_proj.weight": "model-00013-of-00014.safetensors",
718
+ "model.layers.62.mlp.gate_proj.weight": "model-00013-of-00014.safetensors",
719
+ "model.layers.62.mlp.up_proj.weight": "model-00013-of-00014.safetensors",
720
+ "model.layers.62.post_attention_layernorm.weight": "model-00013-of-00014.safetensors",
721
+ "model.layers.62.self_attn.k_proj.bias": "model-00013-of-00014.safetensors",
722
+ "model.layers.62.self_attn.k_proj.weight": "model-00013-of-00014.safetensors",
723
+ "model.layers.62.self_attn.o_proj.weight": "model-00013-of-00014.safetensors",
724
+ "model.layers.62.self_attn.q_proj.bias": "model-00013-of-00014.safetensors",
725
+ "model.layers.62.self_attn.q_proj.weight": "model-00013-of-00014.safetensors",
726
+ "model.layers.62.self_attn.v_proj.bias": "model-00013-of-00014.safetensors",
727
+ "model.layers.62.self_attn.v_proj.weight": "model-00013-of-00014.safetensors",
728
+ "model.layers.63.input_layernorm.weight": "model-00014-of-00014.safetensors",
729
+ "model.layers.63.mlp.down_proj.weight": "model-00014-of-00014.safetensors",
730
+ "model.layers.63.mlp.gate_proj.weight": "model-00013-of-00014.safetensors",
731
+ "model.layers.63.mlp.up_proj.weight": "model-00014-of-00014.safetensors",
732
+ "model.layers.63.post_attention_layernorm.weight": "model-00014-of-00014.safetensors",
733
+ "model.layers.63.self_attn.k_proj.bias": "model-00013-of-00014.safetensors",
734
+ "model.layers.63.self_attn.k_proj.weight": "model-00013-of-00014.safetensors",
735
+ "model.layers.63.self_attn.o_proj.weight": "model-00013-of-00014.safetensors",
736
+ "model.layers.63.self_attn.q_proj.bias": "model-00013-of-00014.safetensors",
737
+ "model.layers.63.self_attn.q_proj.weight": "model-00013-of-00014.safetensors",
738
+ "model.layers.63.self_attn.v_proj.bias": "model-00013-of-00014.safetensors",
739
+ "model.layers.63.self_attn.v_proj.weight": "model-00013-of-00014.safetensors",
740
+ "model.layers.7.input_layernorm.weight": "model-00002-of-00014.safetensors",
741
+ "model.layers.7.mlp.down_proj.weight": "model-00002-of-00014.safetensors",
742
+ "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00014.safetensors",
743
+ "model.layers.7.mlp.up_proj.weight": "model-00002-of-00014.safetensors",
744
+ "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00014.safetensors",
745
+ "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00014.safetensors",
746
+ "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00014.safetensors",
747
+ "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00014.safetensors",
748
+ "model.layers.7.self_attn.q_proj.bias": "model-00002-of-00014.safetensors",
749
+ "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00014.safetensors",
750
+ "model.layers.7.self_attn.v_proj.bias": "model-00002-of-00014.safetensors",
751
+ "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00014.safetensors",
752
+ "model.layers.8.input_layernorm.weight": "model-00003-of-00014.safetensors",
753
+ "model.layers.8.mlp.down_proj.weight": "model-00003-of-00014.safetensors",
754
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00014.safetensors",
755
+ "model.layers.8.mlp.up_proj.weight": "model-00003-of-00014.safetensors",
756
+ "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00014.safetensors",
757
+ "model.layers.8.self_attn.k_proj.bias": "model-00002-of-00014.safetensors",
758
+ "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00014.safetensors",
759
+ "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00014.safetensors",
760
+ "model.layers.8.self_attn.q_proj.bias": "model-00002-of-00014.safetensors",
761
+ "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00014.safetensors",
762
+ "model.layers.8.self_attn.v_proj.bias": "model-00002-of-00014.safetensors",
763
+ "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00014.safetensors",
764
+ "model.layers.9.input_layernorm.weight": "model-00003-of-00014.safetensors",
765
+ "model.layers.9.mlp.down_proj.weight": "model-00003-of-00014.safetensors",
766
+ "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00014.safetensors",
767
+ "model.layers.9.mlp.up_proj.weight": "model-00003-of-00014.safetensors",
768
+ "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00014.safetensors",
769
+ "model.layers.9.self_attn.k_proj.bias": "model-00003-of-00014.safetensors",
770
+ "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00014.safetensors",
771
+ "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00014.safetensors",
772
+ "model.layers.9.self_attn.q_proj.bias": "model-00003-of-00014.safetensors",
773
+ "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00014.safetensors",
774
+ "model.layers.9.self_attn.v_proj.bias": "model-00003-of-00014.safetensors",
775
+ "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00014.safetensors",
776
+ "model.norm.weight": "model-00014-of-00014.safetensors"
777
+ }
778
+ }
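
The `weight_map` entries above close out `model.safetensors.index.json`, which maps every tensor name to the shard file that stores it. A minimal sketch of how such an index is typically consumed, assuming the shard files sit next to the index in a local checkout (hypothetical paths, not part of this commit):

```python
# Sketch: resolve a tensor to its shard via model.safetensors.index.json.
import json

from safetensors import safe_open

with open("model.safetensors.index.json") as f:
    index = json.load(f)

# e.g. "model.norm.weight" -> "model-00014-of-00014.safetensors"
shard = index["weight_map"]["model.norm.weight"]

with safe_open(shard, framework="pt") as tensors:
    norm_weight = tensors.get_tensor("model.norm.weight")
```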
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "additional_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "eos_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+ size 11421896
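
`tokenizer.json` is checked in as a Git LFS pointer: the three lines above are the entire committed file (spec version, SHA-256 `oid`, byte `size`), while the ~11.4 MB tokenizer itself is fetched by LFS on checkout. A small hypothetical check, assuming the real file has already been downloaded, that it matches the pointer:

```python
# Sketch: verify a fetched tokenizer.json against the LFS pointer above.
import hashlib

EXPECTED_OID = "9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa"
EXPECTED_SIZE = 11421896

with open("tokenizer.json", "rb") as f:
    data = f.read()

assert len(data) == EXPECTED_SIZE
assert hashlib.sha256(data).hexdigest() == EXPECTED_OID
```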
tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
+ {
+ "add_bos_token": false,
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "151643": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151644": {
+ "content": "<|im_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151645": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151646": {
+ "content": "<|object_ref_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151647": {
+ "content": "<|object_ref_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151648": {
+ "content": "<|box_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151649": {
+ "content": "<|box_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151650": {
+ "content": "<|quad_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151651": {
+ "content": "<|quad_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151652": {
+ "content": "<|vision_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151653": {
+ "content": "<|vision_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151654": {
+ "content": "<|vision_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151655": {
+ "content": "<|image_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151656": {
+ "content": "<|video_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151657": {
+ "content": "<tool_call>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151658": {
+ "content": "</tool_call>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151659": {
+ "content": "<|fim_prefix|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151660": {
+ "content": "<|fim_middle|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151661": {
+ "content": "<|fim_suffix|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151662": {
+ "content": "<|fim_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151663": {
+ "content": "<|repo_name|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151664": {
+ "content": "<|file_sep|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ }
+ },
+ "additional_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "bos_token": null,
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "padding_side": "right",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+ }
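
The `chat_template` field above is the Jinja template Transformers applies to serialize conversations into the ChatML format seen in the special tokens. A minimal usage sketch; loading from a local checkout of this repo is an assumption, not part of the commit:

```python
# Sketch: render a prompt with the chat_template from tokenizer_config.json.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(".")  # path to this repo's files

messages = [{"role": "user", "content": "Hello!"}]
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
# Per the template, `prompt` contains the default system block, the user
# turn, and an opening "<|im_start|>assistant" tag ready for generation.
```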
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "epoch": 2.997333333333333,
+ "total_flos": 297597215473664.0,
+ "train_loss": 0.7989534891938541,
+ "train_runtime": 8333.3074,
+ "train_samples_per_second": 6.48,
+ "train_steps_per_second": 0.202
+ }
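
The throughput figures in `train_results.json` are mutually consistent; a quick hypothetical cross-check using only the values above:

```python
# Sketch: cross-check the train_results.json figures against each other.
runtime = 8333.3074        # train_runtime, seconds
samples_per_s = 6.48       # train_samples_per_second
steps_per_s = 0.202        # train_steps_per_second

print(samples_per_s / steps_per_s)  # ~32   -> effective samples per optimizer step
print(steps_per_s * runtime)        # ~1683 -> matches the 1686 total steps logged below
print(samples_per_s * runtime)      # ~54000 samples over ~3 epochs (~18000/epoch)
```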
trainer_log.jsonl ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 10, "total_steps": 1686, "loss": 1.0313, "lr": 2.958579881656805e-07, "epoch": 0.017777777777777778, "percentage": 0.59, "elapsed_time": "0:00:46", "remaining_time": "2:11:13"}
2
+ {"current_steps": 20, "total_steps": 1686, "loss": 1.0088, "lr": 5.91715976331361e-07, "epoch": 0.035555555555555556, "percentage": 1.19, "elapsed_time": "0:01:28", "remaining_time": "2:02:10"}
3
+ {"current_steps": 30, "total_steps": 1686, "loss": 1.0074, "lr": 8.875739644970415e-07, "epoch": 0.05333333333333334, "percentage": 1.78, "elapsed_time": "0:02:06", "remaining_time": "1:56:27"}
4
+ {"current_steps": 40, "total_steps": 1686, "loss": 0.9623, "lr": 1.183431952662722e-06, "epoch": 0.07111111111111111, "percentage": 2.37, "elapsed_time": "0:02:43", "remaining_time": "1:52:24"}
5
+ {"current_steps": 50, "total_steps": 1686, "loss": 0.9638, "lr": 1.4792899408284026e-06, "epoch": 0.08888888888888889, "percentage": 2.97, "elapsed_time": "0:03:20", "remaining_time": "1:49:21"}
6
+ {"current_steps": 60, "total_steps": 1686, "loss": 1.0104, "lr": 1.775147928994083e-06, "epoch": 0.10666666666666667, "percentage": 3.56, "elapsed_time": "0:03:56", "remaining_time": "1:47:02"}
7
+ {"current_steps": 70, "total_steps": 1686, "loss": 1.0002, "lr": 2.0710059171597635e-06, "epoch": 0.12444444444444444, "percentage": 4.15, "elapsed_time": "0:04:32", "remaining_time": "1:44:44"}
8
+ {"current_steps": 80, "total_steps": 1686, "loss": 1.0223, "lr": 2.366863905325444e-06, "epoch": 0.14222222222222222, "percentage": 4.74, "elapsed_time": "0:05:09", "remaining_time": "1:43:38"}
9
+ {"current_steps": 90, "total_steps": 1686, "loss": 0.9855, "lr": 2.6627218934911246e-06, "epoch": 0.16, "percentage": 5.34, "elapsed_time": "0:05:44", "remaining_time": "1:41:48"}
10
+ {"current_steps": 100, "total_steps": 1686, "loss": 0.9373, "lr": 2.958579881656805e-06, "epoch": 0.17777777777777778, "percentage": 5.93, "elapsed_time": "0:06:27", "remaining_time": "1:42:22"}
11
+ {"current_steps": 100, "total_steps": 1686, "eval_loss": 0.9671729803085327, "epoch": 0.17777777777777778, "percentage": 5.93, "elapsed_time": "0:07:35", "remaining_time": "2:00:31"}
12
+ {"current_steps": 110, "total_steps": 1686, "loss": 0.9625, "lr": 3.2544378698224853e-06, "epoch": 0.19555555555555557, "percentage": 6.52, "elapsed_time": "0:08:16", "remaining_time": "1:58:28"}
13
+ {"current_steps": 120, "total_steps": 1686, "loss": 0.9634, "lr": 3.550295857988166e-06, "epoch": 0.21333333333333335, "percentage": 7.12, "elapsed_time": "0:08:53", "remaining_time": "1:56:00"}
14
+ {"current_steps": 130, "total_steps": 1686, "loss": 0.933, "lr": 3.846153846153847e-06, "epoch": 0.2311111111111111, "percentage": 7.71, "elapsed_time": "0:09:30", "remaining_time": "1:53:49"}
15
+ {"current_steps": 140, "total_steps": 1686, "loss": 0.951, "lr": 4.142011834319527e-06, "epoch": 0.24888888888888888, "percentage": 8.3, "elapsed_time": "0:10:10", "remaining_time": "1:52:20"}
16
+ {"current_steps": 150, "total_steps": 1686, "loss": 0.9375, "lr": 4.437869822485207e-06, "epoch": 0.26666666666666666, "percentage": 8.9, "elapsed_time": "0:10:49", "remaining_time": "1:50:47"}
17
+ {"current_steps": 160, "total_steps": 1686, "loss": 0.9533, "lr": 4.733727810650888e-06, "epoch": 0.28444444444444444, "percentage": 9.49, "elapsed_time": "0:11:33", "remaining_time": "1:50:09"}
18
+ {"current_steps": 170, "total_steps": 1686, "loss": 0.9641, "lr": 4.999994639090922e-06, "epoch": 0.3022222222222222, "percentage": 10.08, "elapsed_time": "0:12:09", "remaining_time": "1:48:28"}
19
+ {"current_steps": 180, "total_steps": 1686, "loss": 0.958, "lr": 4.999351357820732e-06, "epoch": 0.32, "percentage": 10.68, "elapsed_time": "0:12:51", "remaining_time": "1:47:34"}
20
+ {"current_steps": 190, "total_steps": 1686, "loss": 0.9524, "lr": 4.99763621084537e-06, "epoch": 0.3377777777777778, "percentage": 11.27, "elapsed_time": "0:13:40", "remaining_time": "1:47:38"}
21
+ {"current_steps": 200, "total_steps": 1686, "loss": 0.9671, "lr": 4.994849933718568e-06, "epoch": 0.35555555555555557, "percentage": 11.86, "elapsed_time": "0:14:21", "remaining_time": "1:46:38"}
22
+ {"current_steps": 200, "total_steps": 1686, "eval_loss": 0.9638746380805969, "epoch": 0.35555555555555557, "percentage": 11.86, "elapsed_time": "0:15:28", "remaining_time": "1:55:01"}
23
+ {"current_steps": 210, "total_steps": 1686, "loss": 0.9697, "lr": 4.990993721356317e-06, "epoch": 0.37333333333333335, "percentage": 12.46, "elapsed_time": "0:16:10", "remaining_time": "1:53:41"}
24
+ {"current_steps": 220, "total_steps": 1686, "loss": 1.0093, "lr": 4.986069227524407e-06, "epoch": 0.39111111111111113, "percentage": 13.05, "elapsed_time": "0:16:59", "remaining_time": "1:53:14"}
25
+ {"current_steps": 230, "total_steps": 1686, "loss": 0.9411, "lr": 4.980078564129211e-06, "epoch": 0.4088888888888889, "percentage": 13.64, "elapsed_time": "0:17:41", "remaining_time": "1:52:02"}
26
+ {"current_steps": 240, "total_steps": 1686, "loss": 1.0076, "lr": 4.973024300311968e-06, "epoch": 0.4266666666666667, "percentage": 14.23, "elapsed_time": "0:18:17", "remaining_time": "1:50:10"}
27
+ {"current_steps": 250, "total_steps": 1686, "loss": 0.9674, "lr": 4.9649094613469904e-06, "epoch": 0.4444444444444444, "percentage": 14.83, "elapsed_time": "0:18:56", "remaining_time": "1:48:49"}
28
+ {"current_steps": 260, "total_steps": 1686, "loss": 0.9507, "lr": 4.955737527344251e-06, "epoch": 0.4622222222222222, "percentage": 15.42, "elapsed_time": "0:19:38", "remaining_time": "1:47:45"}
29
+ {"current_steps": 270, "total_steps": 1686, "loss": 0.9783, "lr": 4.945512431756916e-06, "epoch": 0.48, "percentage": 16.01, "elapsed_time": "0:20:22", "remaining_time": "1:46:50"}
30
+ {"current_steps": 280, "total_steps": 1686, "loss": 0.9694, "lr": 4.934238559694448e-06, "epoch": 0.49777777777777776, "percentage": 16.61, "elapsed_time": "0:21:07", "remaining_time": "1:46:06"}
31
+ {"current_steps": 290, "total_steps": 1686, "loss": 0.9724, "lr": 4.921920746042023e-06, "epoch": 0.5155555555555555, "percentage": 17.2, "elapsed_time": "0:21:51", "remaining_time": "1:45:14"}
32
+ {"current_steps": 300, "total_steps": 1686, "loss": 0.9584, "lr": 4.908564273387051e-06, "epoch": 0.5333333333333333, "percentage": 17.79, "elapsed_time": "0:22:30", "remaining_time": "1:43:57"}
33
+ {"current_steps": 300, "total_steps": 1686, "eval_loss": 0.9628667235374451, "epoch": 0.5333333333333333, "percentage": 17.79, "elapsed_time": "0:23:37", "remaining_time": "1:49:10"}
34
+ {"current_steps": 310, "total_steps": 1686, "loss": 0.9468, "lr": 4.894174869753701e-06, "epoch": 0.5511111111111111, "percentage": 18.39, "elapsed_time": "0:24:13", "remaining_time": "1:47:33"}
35
+ {"current_steps": 320, "total_steps": 1686, "loss": 0.9523, "lr": 4.878758706146389e-06, "epoch": 0.5688888888888889, "percentage": 18.98, "elapsed_time": "0:24:55", "remaining_time": "1:46:22"}
36
+ {"current_steps": 330, "total_steps": 1686, "loss": 0.9527, "lr": 4.862322393903305e-06, "epoch": 0.5866666666666667, "percentage": 19.57, "elapsed_time": "0:25:33", "remaining_time": "1:44:59"}
37
+ {"current_steps": 340, "total_steps": 1686, "loss": 0.9544, "lr": 4.84487298186108e-06, "epoch": 0.6044444444444445, "percentage": 20.17, "elapsed_time": "0:26:17", "remaining_time": "1:44:04"}
38
+ {"current_steps": 350, "total_steps": 1686, "loss": 0.903, "lr": 4.8264179533318526e-06, "epoch": 0.6222222222222222, "percentage": 20.76, "elapsed_time": "0:26:55", "remaining_time": "1:42:46"}
39
+ {"current_steps": 360, "total_steps": 1686, "loss": 0.935, "lr": 4.806965222893979e-06, "epoch": 0.64, "percentage": 21.35, "elapsed_time": "0:27:35", "remaining_time": "1:41:39"}
40
+ {"current_steps": 370, "total_steps": 1686, "loss": 0.9859, "lr": 4.786523132997817e-06, "epoch": 0.6577777777777778, "percentage": 21.95, "elapsed_time": "0:28:23", "remaining_time": "1:40:57"}
41
+ {"current_steps": 380, "total_steps": 1686, "loss": 1.0107, "lr": 4.765100450387999e-06, "epoch": 0.6755555555555556, "percentage": 22.54, "elapsed_time": "0:29:14", "remaining_time": "1:40:30"}
42
+ {"current_steps": 390, "total_steps": 1686, "loss": 0.9379, "lr": 4.74270636234375e-06, "epoch": 0.6933333333333334, "percentage": 23.13, "elapsed_time": "0:29:56", "remaining_time": "1:39:30"}
43
+ {"current_steps": 400, "total_steps": 1686, "loss": 0.957, "lr": 4.719350472738849e-06, "epoch": 0.7111111111111111, "percentage": 23.72, "elapsed_time": "0:30:38", "remaining_time": "1:38:29"}
44
+ {"current_steps": 400, "total_steps": 1686, "eval_loss": 0.9597401022911072, "epoch": 0.7111111111111111, "percentage": 23.72, "elapsed_time": "0:31:45", "remaining_time": "1:42:06"}
45
+ {"current_steps": 410, "total_steps": 1686, "loss": 0.9736, "lr": 4.69504279792294e-06, "epoch": 0.7288888888888889, "percentage": 24.32, "elapsed_time": "0:32:26", "remaining_time": "1:40:57"}
46
+ {"current_steps": 420, "total_steps": 1686, "loss": 0.9402, "lr": 4.669793762425942e-06, "epoch": 0.7466666666666667, "percentage": 24.91, "elapsed_time": "0:33:04", "remaining_time": "1:39:43"}
47
+ {"current_steps": 430, "total_steps": 1686, "loss": 1.0085, "lr": 4.643614194487411e-06, "epoch": 0.7644444444444445, "percentage": 25.5, "elapsed_time": "0:33:45", "remaining_time": "1:38:37"}
48
+ {"current_steps": 440, "total_steps": 1686, "loss": 0.9195, "lr": 4.616515321412769e-06, "epoch": 0.7822222222222223, "percentage": 26.1, "elapsed_time": "0:34:27", "remaining_time": "1:37:34"}
49
+ {"current_steps": 450, "total_steps": 1686, "loss": 0.9976, "lr": 4.588508764758386e-06, "epoch": 0.8, "percentage": 26.69, "elapsed_time": "0:35:06", "remaining_time": "1:36:27"}
50
+ {"current_steps": 460, "total_steps": 1686, "loss": 0.9269, "lr": 4.559606535347594e-06, "epoch": 0.8177777777777778, "percentage": 27.28, "elapsed_time": "0:35:42", "remaining_time": "1:35:10"}
51
+ {"current_steps": 470, "total_steps": 1686, "loss": 0.9837, "lr": 4.5298210281197474e-06, "epoch": 0.8355555555555556, "percentage": 27.88, "elapsed_time": "0:36:16", "remaining_time": "1:33:51"}
52
+ {"current_steps": 480, "total_steps": 1686, "loss": 0.9712, "lr": 4.499165016814564e-06, "epoch": 0.8533333333333334, "percentage": 28.47, "elapsed_time": "0:36:57", "remaining_time": "1:32:52"}
53
+ {"current_steps": 490, "total_steps": 1686, "loss": 1.0075, "lr": 4.46765164849401e-06, "epoch": 0.8711111111111111, "percentage": 29.06, "elapsed_time": "0:37:35", "remaining_time": "1:31:44"}
54
+ {"current_steps": 500, "total_steps": 1686, "loss": 0.9477, "lr": 4.435294437904082e-06, "epoch": 0.8888888888888888, "percentage": 29.66, "elapsed_time": "0:38:12", "remaining_time": "1:30:37"}
55
+ {"current_steps": 500, "total_steps": 1686, "eval_loss": 0.9587395787239075, "epoch": 0.8888888888888888, "percentage": 29.66, "elapsed_time": "0:39:19", "remaining_time": "1:33:17"}
56
+ {"current_steps": 510, "total_steps": 1686, "loss": 1.002, "lr": 4.402107261678908e-06, "epoch": 0.9066666666666666, "percentage": 30.25, "elapsed_time": "0:39:58", "remaining_time": "1:32:11"}
57
+ {"current_steps": 520, "total_steps": 1686, "loss": 0.9613, "lr": 4.368104352389641e-06, "epoch": 0.9244444444444444, "percentage": 30.84, "elapsed_time": "0:40:37", "remaining_time": "1:31:06"}
58
+ {"current_steps": 530, "total_steps": 1686, "loss": 0.9345, "lr": 4.333300292440728e-06, "epoch": 0.9422222222222222, "percentage": 31.44, "elapsed_time": "0:41:15", "remaining_time": "1:29:58"}
59
+ {"current_steps": 540, "total_steps": 1686, "loss": 0.9644, "lr": 4.2977100078161196e-06, "epoch": 0.96, "percentage": 32.03, "elapsed_time": "0:41:57", "remaining_time": "1:29:02"}
60
+ {"current_steps": 550, "total_steps": 1686, "loss": 0.9952, "lr": 4.2613487616781595e-06, "epoch": 0.9777777777777777, "percentage": 32.62, "elapsed_time": "0:42:37", "remaining_time": "1:28:01"}
61
+ {"current_steps": 560, "total_steps": 1686, "loss": 1.0213, "lr": 4.224232147821858e-06, "epoch": 0.9955555555555555, "percentage": 33.21, "elapsed_time": "0:43:17", "remaining_time": "1:27:03"}
62
+ {"current_steps": 570, "total_steps": 1686, "loss": 0.8555, "lr": 4.186376083987376e-06, "epoch": 1.0133333333333334, "percentage": 33.81, "elapsed_time": "0:46:35", "remaining_time": "1:31:14"}
63
+ {"current_steps": 580, "total_steps": 1686, "loss": 0.8231, "lr": 4.147796805033583e-06, "epoch": 1.031111111111111, "percentage": 34.4, "elapsed_time": "0:47:11", "remaining_time": "1:30:00"}
64
+ {"current_steps": 590, "total_steps": 1686, "loss": 0.7805, "lr": 4.108510855975611e-06, "epoch": 1.048888888888889, "percentage": 34.99, "elapsed_time": "0:47:47", "remaining_time": "1:28:46"}
65
+ {"current_steps": 600, "total_steps": 1686, "loss": 0.8552, "lr": 4.0685350848894065e-06, "epoch": 1.0666666666666667, "percentage": 35.59, "elapsed_time": "0:48:23", "remaining_time": "1:27:34"}
66
+ {"current_steps": 600, "total_steps": 1686, "eval_loss": 0.9710310101509094, "epoch": 1.0666666666666667, "percentage": 35.59, "elapsed_time": "0:49:30", "remaining_time": "1:29:36"}
67
+ {"current_steps": 610, "total_steps": 1686, "loss": 0.8172, "lr": 4.027886635686301e-06, "epoch": 1.0844444444444445, "percentage": 36.18, "elapsed_time": "0:50:09", "remaining_time": "1:28:27"}
68
+ {"current_steps": 620, "total_steps": 1686, "loss": 0.7653, "lr": 3.986582940760717e-06, "epoch": 1.1022222222222222, "percentage": 36.77, "elapsed_time": "0:50:47", "remaining_time": "1:27:19"}
69
+ {"current_steps": 630, "total_steps": 1686, "loss": 0.8175, "lr": 3.9446417135141536e-06, "epoch": 1.12, "percentage": 37.37, "elapsed_time": "0:51:25", "remaining_time": "1:26:11"}
70
+ {"current_steps": 640, "total_steps": 1686, "loss": 0.7756, "lr": 3.902080940758658e-06, "epoch": 1.1377777777777778, "percentage": 37.96, "elapsed_time": "0:52:01", "remaining_time": "1:25:01"}
71
+ {"current_steps": 650, "total_steps": 1686, "loss": 0.752, "lr": 3.858918875003053e-06, "epoch": 1.1555555555555554, "percentage": 38.55, "elapsed_time": "0:52:41", "remaining_time": "1:23:58"}
72
+ {"current_steps": 660, "total_steps": 1686, "loss": 0.7721, "lr": 3.815174026625202e-06, "epoch": 1.1733333333333333, "percentage": 39.15, "elapsed_time": "0:53:20", "remaining_time": "1:22:56"}
73
+ {"current_steps": 670, "total_steps": 1686, "loss": 0.7816, "lr": 3.770865155933694e-06, "epoch": 1.1911111111111112, "percentage": 39.74, "elapsed_time": "0:54:06", "remaining_time": "1:22:03"}
74
+ {"current_steps": 680, "total_steps": 1686, "loss": 0.7897, "lr": 3.7260112651223553e-06, "epoch": 1.208888888888889, "percentage": 40.33, "elapsed_time": "0:54:48", "remaining_time": "1:21:05"}
75
+ {"current_steps": 690, "total_steps": 1686, "loss": 0.8024, "lr": 3.6806315901209987e-06, "epoch": 1.2266666666666666, "percentage": 40.93, "elapsed_time": "0:55:25", "remaining_time": "1:20:00"}
76
+ {"current_steps": 700, "total_steps": 1686, "loss": 0.7944, "lr": 3.6347455923459616e-06, "epoch": 1.2444444444444445, "percentage": 41.52, "elapsed_time": "0:56:05", "remaining_time": "1:19:00"}
77
+ {"current_steps": 700, "total_steps": 1686, "eval_loss": 0.9722110629081726, "epoch": 1.2444444444444445, "percentage": 41.52, "elapsed_time": "0:57:12", "remaining_time": "1:20:35"}
78
+ {"current_steps": 710, "total_steps": 1686, "loss": 0.8198, "lr": 3.5883729503539265e-06, "epoch": 1.2622222222222224, "percentage": 42.11, "elapsed_time": "0:57:52", "remaining_time": "1:19:33"}
79
+ {"current_steps": 720, "total_steps": 1686, "loss": 0.7903, "lr": 3.541533551402621e-06, "epoch": 1.28, "percentage": 42.7, "elapsed_time": "0:58:32", "remaining_time": "1:18:32"}
80
+ {"current_steps": 730, "total_steps": 1686, "loss": 0.8095, "lr": 3.494247482922024e-06, "epoch": 1.2977777777777777, "percentage": 43.3, "elapsed_time": "0:59:08", "remaining_time": "1:17:26"}
81
+ {"current_steps": 740, "total_steps": 1686, "loss": 0.8326, "lr": 3.4465350238997112e-06, "epoch": 1.3155555555555556, "percentage": 43.89, "elapsed_time": "0:59:45", "remaining_time": "1:16:23"}
82
+ {"current_steps": 750, "total_steps": 1686, "loss": 0.7833, "lr": 3.3984166361840646e-06, "epoch": 1.3333333333333333, "percentage": 44.48, "elapsed_time": "1:00:32", "remaining_time": "1:15:32"}
83
+ {"current_steps": 760, "total_steps": 1686, "loss": 0.8171, "lr": 3.3499129557090517e-06, "epoch": 1.3511111111111112, "percentage": 45.08, "elapsed_time": "1:01:16", "remaining_time": "1:14:39"}
84
+ {"current_steps": 770, "total_steps": 1686, "loss": 0.7968, "lr": 3.301044783644348e-06, "epoch": 1.3688888888888888, "percentage": 45.67, "elapsed_time": "1:01:56", "remaining_time": "1:13:41"}
85
+ {"current_steps": 780, "total_steps": 1686, "loss": 0.7689, "lr": 3.2518330774746014e-06, "epoch": 1.3866666666666667, "percentage": 46.26, "elapsed_time": "1:02:35", "remaining_time": "1:12:41"}
86
+ {"current_steps": 790, "total_steps": 1686, "loss": 0.776, "lr": 3.2022989420116556e-06, "epoch": 1.4044444444444444, "percentage": 46.86, "elapsed_time": "1:03:16", "remaining_time": "1:11:45"}
87
+ {"current_steps": 800, "total_steps": 1686, "loss": 0.7359, "lr": 3.152463620343591e-06, "epoch": 1.4222222222222223, "percentage": 47.45, "elapsed_time": "1:03:52", "remaining_time": "1:10:44"}
88
+ {"current_steps": 800, "total_steps": 1686, "eval_loss": 0.9709274768829346, "epoch": 1.4222222222222223, "percentage": 47.45, "elapsed_time": "1:05:00", "remaining_time": "1:12:00"}
89
+ {"current_steps": 810, "total_steps": 1686, "loss": 0.8143, "lr": 3.102348484724467e-06, "epoch": 1.44, "percentage": 48.04, "elapsed_time": "1:05:38", "remaining_time": "1:10:59"}
90
+ {"current_steps": 820, "total_steps": 1686, "loss": 0.8282, "lr": 3.0519750274086675e-06, "epoch": 1.4577777777777778, "percentage": 48.64, "elapsed_time": "1:06:20", "remaining_time": "1:10:04"}
91
+ {"current_steps": 830, "total_steps": 1686, "loss": 0.7994, "lr": 3.001364851433789e-06, "epoch": 1.4755555555555555, "percentage": 49.23, "elapsed_time": "1:07:04", "remaining_time": "1:09:10"}
92
+ {"current_steps": 840, "total_steps": 1686, "loss": 0.7533, "lr": 2.950539661356011e-06, "epoch": 1.4933333333333334, "percentage": 49.82, "elapsed_time": "1:07:40", "remaining_time": "1:08:09"}
93
+ {"current_steps": 850, "total_steps": 1686, "loss": 0.8492, "lr": 2.899521253941937e-06, "epoch": 1.511111111111111, "percentage": 50.42, "elapsed_time": "1:08:23", "remaining_time": "1:07:15"}
94
+ {"current_steps": 860, "total_steps": 1686, "loss": 0.7805, "lr": 2.8483315088208828e-06, "epoch": 1.528888888888889, "percentage": 51.01, "elapsed_time": "1:09:13", "remaining_time": "1:06:29"}
95
+ {"current_steps": 870, "total_steps": 1686, "loss": 0.8412, "lr": 2.7969923791016334e-06, "epoch": 1.5466666666666666, "percentage": 51.6, "elapsed_time": "1:09:48", "remaining_time": "1:05:28"}
96
+ {"current_steps": 880, "total_steps": 1686, "loss": 0.8048, "lr": 2.7455258819576876e-06, "epoch": 1.5644444444444443, "percentage": 52.19, "elapsed_time": "1:10:28", "remaining_time": "1:04:32"}
97
+ {"current_steps": 890, "total_steps": 1686, "loss": 0.7951, "lr": 2.6939540891850265e-06, "epoch": 1.5822222222222222, "percentage": 52.79, "elapsed_time": "1:11:09", "remaining_time": "1:03:38"}
98
+ {"current_steps": 900, "total_steps": 1686, "loss": 0.8494, "lr": 2.642299117736456e-06, "epoch": 1.6, "percentage": 53.38, "elapsed_time": "1:11:52", "remaining_time": "1:02:46"}
99
+ {"current_steps": 900, "total_steps": 1686, "eval_loss": 0.9661916494369507, "epoch": 1.6, "percentage": 53.38, "elapsed_time": "1:13:00", "remaining_time": "1:03:45"}
100
+ {"current_steps": 910, "total_steps": 1686, "loss": 0.7601, "lr": 2.5905831202365856e-06, "epoch": 1.6177777777777778, "percentage": 53.97, "elapsed_time": "1:13:40", "remaining_time": "1:02:49"}
101
+ {"current_steps": 920, "total_steps": 1686, "loss": 0.7561, "lr": 2.538828275481509e-06, "epoch": 1.6355555555555554, "percentage": 54.57, "elapsed_time": "1:14:19", "remaining_time": "1:01:52"}
102
+ {"current_steps": 930, "total_steps": 1686, "loss": 0.7645, "lr": 2.4870567789272563e-06, "epoch": 1.6533333333333333, "percentage": 55.16, "elapsed_time": "1:14:56", "remaining_time": "1:00:55"}
103
+ {"current_steps": 940, "total_steps": 1686, "loss": 0.7935, "lr": 2.435290833171109e-06, "epoch": 1.6711111111111112, "percentage": 55.75, "elapsed_time": "1:15:38", "remaining_time": "1:00:01"}
104
+ {"current_steps": 950, "total_steps": 1686, "loss": 0.7813, "lr": 2.3835526384298475e-06, "epoch": 1.6888888888888889, "percentage": 56.35, "elapsed_time": "1:16:17", "remaining_time": "0:59:06"}
105
+ {"current_steps": 960, "total_steps": 1686, "loss": 0.752, "lr": 2.3318643830190186e-06, "epoch": 1.7066666666666666, "percentage": 56.94, "elapsed_time": "1:16:53", "remaining_time": "0:58:08"}
106
+ {"current_steps": 970, "total_steps": 1686, "loss": 0.78, "lr": 2.2802482338373096e-06, "epoch": 1.7244444444444444, "percentage": 57.53, "elapsed_time": "1:17:30", "remaining_time": "0:57:12"}
107
+ {"current_steps": 980, "total_steps": 1686, "loss": 0.7736, "lr": 2.228726326860109e-06, "epoch": 1.7422222222222223, "percentage": 58.13, "elapsed_time": "1:18:11", "remaining_time": "0:56:19"}
108
+ {"current_steps": 990, "total_steps": 1686, "loss": 0.799, "lr": 2.1773207576463227e-06, "epoch": 1.76, "percentage": 58.72, "elapsed_time": "1:18:51", "remaining_time": "0:55:26"}
109
+ {"current_steps": 1000, "total_steps": 1686, "loss": 0.8163, "lr": 2.1260535718625274e-06, "epoch": 1.7777777777777777, "percentage": 59.31, "elapsed_time": "1:19:34", "remaining_time": "0:54:35"}
110
+ {"current_steps": 1000, "total_steps": 1686, "eval_loss": 0.9663456082344055, "epoch": 1.7777777777777777, "percentage": 59.31, "elapsed_time": "1:20:41", "remaining_time": "0:55:21"}
111
+ {"current_steps": 1010, "total_steps": 1686, "loss": 0.7882, "lr": 2.0749467558285224e-06, "epoch": 1.7955555555555556, "percentage": 59.91, "elapsed_time": "1:21:24", "remaining_time": "0:54:29"}
112
+ {"current_steps": 1020, "total_steps": 1686, "loss": 0.8198, "lr": 2.024022227088329e-06, "epoch": 1.8133333333333335, "percentage": 60.5, "elapsed_time": "1:22:02", "remaining_time": "0:53:34"}
113
+ {"current_steps": 1030, "total_steps": 1686, "loss": 0.8141, "lr": 1.973301825010685e-06, "epoch": 1.8311111111111111, "percentage": 61.09, "elapsed_time": "1:22:41", "remaining_time": "0:52:40"}
114
+ {"current_steps": 1040, "total_steps": 1686, "loss": 0.7297, "lr": 1.922807301423065e-06, "epoch": 1.8488888888888888, "percentage": 61.68, "elapsed_time": "1:23:18", "remaining_time": "0:51:45"}
115
+ {"current_steps": 1050, "total_steps": 1686, "loss": 0.7754, "lr": 1.8725603112832518e-06, "epoch": 1.8666666666666667, "percentage": 62.28, "elapsed_time": "1:23:59", "remaining_time": "0:50:52"}
116
+ {"current_steps": 1060, "total_steps": 1686, "loss": 0.8001, "lr": 1.8225824033924377e-06, "epoch": 1.8844444444444446, "percentage": 62.87, "elapsed_time": "1:24:40", "remaining_time": "0:50:00"}
117
+ {"current_steps": 1070, "total_steps": 1686, "loss": 0.7828, "lr": 1.7728950111538585e-06, "epoch": 1.9022222222222223, "percentage": 63.46, "elapsed_time": "1:25:25", "remaining_time": "0:49:10"}
118
+ {"current_steps": 1080, "total_steps": 1686, "loss": 0.7754, "lr": 1.7235194433809215e-06, "epoch": 1.92, "percentage": 64.06, "elapsed_time": "1:26:02", "remaining_time": "0:48:16"}
119
+ {"current_steps": 1090, "total_steps": 1686, "loss": 0.7963, "lr": 1.674476875158756e-06, "epoch": 1.9377777777777778, "percentage": 64.65, "elapsed_time": "1:26:45", "remaining_time": "0:47:26"}
120
+ {"current_steps": 1100, "total_steps": 1686, "loss": 0.8041, "lr": 1.6257883387631178e-06, "epoch": 1.9555555555555557, "percentage": 65.24, "elapsed_time": "1:27:23", "remaining_time": "0:46:33"}
121
+ {"current_steps": 1100, "total_steps": 1686, "eval_loss": 0.9638973474502563, "epoch": 1.9555555555555557, "percentage": 65.24, "elapsed_time": "1:28:31", "remaining_time": "0:47:09"}
122
+ {"current_steps": 1110, "total_steps": 1686, "loss": 0.8067, "lr": 1.5774747146405423e-06, "epoch": 1.9733333333333334, "percentage": 65.84, "elapsed_time": "1:29:16", "remaining_time": "0:46:19"}
123
+ {"current_steps": 1120, "total_steps": 1686, "loss": 0.7654, "lr": 1.5295567224536023e-06, "epoch": 1.991111111111111, "percentage": 66.43, "elapsed_time": "1:30:03", "remaining_time": "0:45:30"}
124
+ {"current_steps": 1130, "total_steps": 1686, "loss": 0.7469, "lr": 1.4820549121951321e-06, "epoch": 2.008888888888889, "percentage": 67.02, "elapsed_time": "1:33:23", "remaining_time": "0:45:56"}
125
+ {"current_steps": 1140, "total_steps": 1686, "loss": 0.6237, "lr": 1.434989655375197e-06, "epoch": 2.026666666666667, "percentage": 67.62, "elapsed_time": "1:34:02", "remaining_time": "0:45:02"}
126
+ {"current_steps": 1150, "total_steps": 1686, "loss": 0.6339, "lr": 1.3883811362846275e-06, "epoch": 2.0444444444444443, "percentage": 68.21, "elapsed_time": "1:34:43", "remaining_time": "0:44:09"}
127
+ {"current_steps": 1160, "total_steps": 1686, "loss": 0.6055, "lr": 1.342249343338829e-06, "epoch": 2.062222222222222, "percentage": 68.8, "elapsed_time": "1:35:21", "remaining_time": "0:43:14"}
128
+ {"current_steps": 1170, "total_steps": 1686, "loss": 0.6589, "lr": 1.296614060505596e-06, "epoch": 2.08, "percentage": 69.4, "elapsed_time": "1:35:56", "remaining_time": "0:42:18"}
129
+ {"current_steps": 1180, "total_steps": 1686, "loss": 0.6441, "lr": 1.251494858820615e-06, "epoch": 2.097777777777778, "percentage": 69.99, "elapsed_time": "1:36:41", "remaining_time": "0:41:27"}
130
+ {"current_steps": 1190, "total_steps": 1686, "loss": 0.6571, "lr": 1.2069110879942758e-06, "epoch": 2.1155555555555554, "percentage": 70.58, "elapsed_time": "1:37:20", "remaining_time": "0:40:34"}
131
+ {"current_steps": 1200, "total_steps": 1686, "loss": 0.6291, "lr": 1.1628818681133965e-06, "epoch": 2.1333333333333333, "percentage": 71.17, "elapsed_time": "1:37:57", "remaining_time": "0:39:40"}
132
+ {"current_steps": 1200, "total_steps": 1686, "eval_loss": 0.9990262985229492, "epoch": 2.1333333333333333, "percentage": 71.17, "elapsed_time": "1:39:05", "remaining_time": "0:40:07"}
133
+ {"current_steps": 1210, "total_steps": 1686, "loss": 0.6581, "lr": 1.119426081441437e-06, "epoch": 2.151111111111111, "percentage": 71.77, "elapsed_time": "1:39:43", "remaining_time": "0:39:13"}
134
+ {"current_steps": 1220, "total_steps": 1686, "loss": 0.6067, "lr": 1.0765623643206946e-06, "epoch": 2.168888888888889, "percentage": 72.36, "elapsed_time": "1:40:26", "remaining_time": "0:38:21"}
135
+ {"current_steps": 1230, "total_steps": 1686, "loss": 0.6324, "lr": 1.0343090991799677e-06, "epoch": 2.1866666666666665, "percentage": 72.95, "elapsed_time": "1:41:06", "remaining_time": "0:37:28"}
136
+ {"current_steps": 1240, "total_steps": 1686, "loss": 0.6631, "lr": 9.926844066511132e-07, "epoch": 2.2044444444444444, "percentage": 73.55, "elapsed_time": "1:41:46", "remaining_time": "0:36:36"}
137
+ {"current_steps": 1250, "total_steps": 1686, "loss": 0.6162, "lr": 9.517061377978762e-07, "epoch": 2.2222222222222223, "percentage": 74.14, "elapsed_time": "1:42:24", "remaining_time": "0:35:43"}
138
+ {"current_steps": 1260, "total_steps": 1686, "loss": 0.7062, "lr": 9.113918664603277e-07, "epoch": 2.24, "percentage": 74.73, "elapsed_time": "1:43:09", "remaining_time": "0:34:52"}
139
+ {"current_steps": 1270, "total_steps": 1686, "loss": 0.6165, "lr": 8.717588817181868e-07, "epoch": 2.2577777777777777, "percentage": 75.33, "elapsed_time": "1:44:00", "remaining_time": "0:34:04"}
140
+ {"current_steps": 1280, "total_steps": 1686, "loss": 0.6159, "lr": 8.328241804762737e-07, "epoch": 2.2755555555555556, "percentage": 75.92, "elapsed_time": "1:44:40", "remaining_time": "0:33:12"}
141
+ {"current_steps": 1290, "total_steps": 1686, "loss": 0.6516, "lr": 7.946044601752539e-07, "epoch": 2.2933333333333334, "percentage": 76.51, "elapsed_time": "1:45:22", "remaining_time": "0:32:20"}
142
+ {"current_steps": 1300, "total_steps": 1686, "loss": 0.6122, "lr": 7.571161116308206e-07, "epoch": 2.311111111111111, "percentage": 77.11, "elapsed_time": "1:45:59", "remaining_time": "0:31:28"}
143
+ {"current_steps": 1300, "total_steps": 1686, "eval_loss": 1.0003914833068848, "epoch": 2.311111111111111, "percentage": 77.11, "elapsed_time": "1:47:06", "remaining_time": "0:31:48"}
144
+ {"current_steps": 1310, "total_steps": 1686, "loss": 0.618, "lr": 7.203752120043608e-07, "epoch": 2.328888888888889, "percentage": 77.7, "elapsed_time": "1:47:43", "remaining_time": "0:30:55"}
145
+ {"current_steps": 1320, "total_steps": 1686, "loss": 0.6663, "lr": 6.843975179081513e-07, "epoch": 2.3466666666666667, "percentage": 78.29, "elapsed_time": "1:48:18", "remaining_time": "0:30:01"}
146
+ {"current_steps": 1330, "total_steps": 1686, "loss": 0.6055, "lr": 6.491984586480132e-07, "epoch": 2.3644444444444446, "percentage": 78.88, "elapsed_time": "1:48:54", "remaining_time": "0:29:08"}
147
+ {"current_steps": 1340, "total_steps": 1686, "loss": 0.6199, "lr": 6.147931296063353e-07, "epoch": 2.3822222222222225, "percentage": 79.48, "elapsed_time": "1:49:33", "remaining_time": "0:28:17"}
148
+ {"current_steps": 1350, "total_steps": 1686, "loss": 0.6617, "lr": 5.811962857683124e-07, "epoch": 2.4, "percentage": 80.07, "elapsed_time": "1:50:17", "remaining_time": "0:27:26"}
149
+ {"current_steps": 1360, "total_steps": 1686, "loss": 0.6497, "lr": 5.484223353941506e-07, "epoch": 2.417777777777778, "percentage": 80.66, "elapsed_time": "1:50:57", "remaining_time": "0:26:35"}
150
+ {"current_steps": 1370, "total_steps": 1686, "loss": 0.5923, "lr": 5.164853338399795e-07, "epoch": 2.4355555555555557, "percentage": 81.26, "elapsed_time": "1:51:35", "remaining_time": "0:25:44"}
+ {"current_steps": 1380, "total_steps": 1686, "loss": 0.6634, "lr": 4.853989775301074e-07, "epoch": 2.453333333333333, "percentage": 81.85, "elapsed_time": "1:52:14", "remaining_time": "0:24:53"}
+ {"current_steps": 1390, "total_steps": 1686, "loss": 0.6338, "lr": 4.55176598083206e-07, "epoch": 2.471111111111111, "percentage": 82.44, "elapsed_time": "1:52:51", "remaining_time": "0:24:02"}
+ {"current_steps": 1400, "total_steps": 1686, "loss": 0.6718, "lr": 4.2583115659494356e-07, "epoch": 2.488888888888889, "percentage": 83.04, "elapsed_time": "1:53:29", "remaining_time": "0:23:11"}
+ {"current_steps": 1400, "total_steps": 1686, "eval_loss": 1.0003468990325928, "epoch": 2.488888888888889, "percentage": 83.04, "elapsed_time": "1:54:37", "remaining_time": "0:23:24"}
+ {"current_steps": 1410, "total_steps": 1686, "loss": 0.5828, "lr": 3.9737523807952206e-07, "epoch": 2.506666666666667, "percentage": 83.63, "elapsed_time": "1:55:19", "remaining_time": "0:22:34"}
+ {"current_steps": 1420, "total_steps": 1686, "loss": 0.6212, "lr": 3.698210460724991e-07, "epoch": 2.5244444444444447, "percentage": 84.22, "elapsed_time": "1:56:09", "remaining_time": "0:21:45"}
+ {"current_steps": 1430, "total_steps": 1686, "loss": 0.5807, "lr": 3.4318039739720974e-07, "epoch": 2.542222222222222, "percentage": 84.82, "elapsed_time": "1:56:47", "remaining_time": "0:20:54"}
+ {"current_steps": 1440, "total_steps": 1686, "loss": 0.6545, "lr": 3.1746471709702963e-07, "epoch": 2.56, "percentage": 85.41, "elapsed_time": "1:57:30", "remaining_time": "0:20:04"}
+ {"current_steps": 1450, "total_steps": 1686, "loss": 0.6209, "lr": 2.9268503353566484e-07, "epoch": 2.5777777777777775, "percentage": 86.0, "elapsed_time": "1:58:19", "remaining_time": "0:19:15"}
+ {"current_steps": 1460, "total_steps": 1686, "loss": 0.6429, "lr": 2.6885197366754935e-07, "epoch": 2.5955555555555554, "percentage": 86.6, "elapsed_time": "1:59:00", "remaining_time": "0:18:25"}
+ {"current_steps": 1470, "total_steps": 1686, "loss": 0.6184, "lr": 2.459757584803965e-07, "epoch": 2.6133333333333333, "percentage": 87.19, "elapsed_time": "1:59:35", "remaining_time": "0:17:34"}
+ {"current_steps": 1480, "total_steps": 1686, "loss": 0.6387, "lr": 2.2406619861185363e-07, "epoch": 2.631111111111111, "percentage": 87.78, "elapsed_time": "2:00:15", "remaining_time": "0:16:44"}
+ {"current_steps": 1490, "total_steps": 1686, "loss": 0.6321, "lr": 2.0313269014213378e-07, "epoch": 2.648888888888889, "percentage": 88.37, "elapsed_time": "2:00:51", "remaining_time": "0:15:53"}
+ {"current_steps": 1500, "total_steps": 1686, "loss": 0.6712, "lr": 1.8318421056443576e-07, "epoch": 2.6666666666666665, "percentage": 88.97, "elapsed_time": "2:01:32", "remaining_time": "0:15:04"}
+ {"current_steps": 1500, "total_steps": 1686, "eval_loss": 1.0002143383026123, "epoch": 2.6666666666666665, "percentage": 88.97, "elapsed_time": "2:02:40", "remaining_time": "0:15:12"}
+ {"current_steps": 1510, "total_steps": 1686, "loss": 0.64, "lr": 1.6422931493488004e-07, "epoch": 2.6844444444444444, "percentage": 89.56, "elapsed_time": "2:03:18", "remaining_time": "0:14:22"}
+ {"current_steps": 1520, "total_steps": 1686, "loss": 0.6328, "lr": 1.4627613220360632e-07, "epoch": 2.7022222222222223, "percentage": 90.15, "elapsed_time": "2:03:56", "remaining_time": "0:13:32"}
+ {"current_steps": 1530, "total_steps": 1686, "loss": 0.6195, "lr": 1.2933236172861435e-07, "epoch": 2.7199999999999998, "percentage": 90.75, "elapsed_time": "2:04:32", "remaining_time": "0:12:41"}
+ {"current_steps": 1540, "total_steps": 1686, "loss": 0.6437, "lr": 1.1340526997383194e-07, "epoch": 2.7377777777777776, "percentage": 91.34, "elapsed_time": "2:05:16", "remaining_time": "0:11:52"}
+ {"current_steps": 1550, "total_steps": 1686, "loss": 0.6516, "lr": 9.850168739284183e-08, "epoch": 2.7555555555555555, "percentage": 91.93, "elapsed_time": "2:05:52", "remaining_time": "0:11:02"}
+ {"current_steps": 1560, "total_steps": 1686, "loss": 0.6293, "lr": 8.462800549958793e-08, "epoch": 2.7733333333333334, "percentage": 92.53, "elapsed_time": "2:06:31", "remaining_time": "0:10:13"}
+ {"current_steps": 1570, "total_steps": 1686, "loss": 0.6241, "lr": 7.179017412732581e-08, "epoch": 2.7911111111111113, "percentage": 93.12, "elapsed_time": "2:07:10", "remaining_time": "0:09:23"}
+ {"current_steps": 1580, "total_steps": 1686, "loss": 0.6547, "lr": 5.999369887699113e-08, "epoch": 2.8088888888888888, "percentage": 93.71, "elapsed_time": "2:07:48", "remaining_time": "0:08:34"}
+ {"current_steps": 1590, "total_steps": 1686, "loss": 0.6381, "lr": 4.924363875608379e-08, "epoch": 2.8266666666666667, "percentage": 94.31, "elapsed_time": "2:08:32", "remaining_time": "0:07:45"}
+ {"current_steps": 1600, "total_steps": 1686, "loss": 0.6397, "lr": 3.9544604009072173e-08, "epoch": 2.8444444444444446, "percentage": 94.9, "elapsed_time": "2:09:07", "remaining_time": "0:06:56"}
+ {"current_steps": 1600, "total_steps": 1686, "eval_loss": 0.9996402859687805, "epoch": 2.8444444444444446, "percentage": 94.9, "elapsed_time": "2:10:15", "remaining_time": "0:07:00"}
+ {"current_steps": 1610, "total_steps": 1686, "loss": 0.6399, "lr": 3.090075414025562e-08, "epoch": 2.862222222222222, "percentage": 95.49, "elapsed_time": "2:10:54", "remaining_time": "0:06:10"}
+ {"current_steps": 1620, "total_steps": 1686, "loss": 0.6356, "lr": 2.331579612993018e-08, "epoch": 2.88, "percentage": 96.09, "elapsed_time": "2:11:38", "remaining_time": "0:05:21"}
+ {"current_steps": 1630, "total_steps": 1686, "loss": 0.6473, "lr": 1.679298284462033e-08, "epoch": 2.897777777777778, "percentage": 96.68, "elapsed_time": "2:12:16", "remaining_time": "0:04:32"}
+ {"current_steps": 1640, "total_steps": 1686, "loss": 0.639, "lr": 1.1335111642064855e-08, "epoch": 2.9155555555555557, "percentage": 97.27, "elapsed_time": "2:12:58", "remaining_time": "0:03:43"}
+ {"current_steps": 1650, "total_steps": 1686, "loss": 0.6115, "lr": 6.944523171547313e-09, "epoch": 2.9333333333333336, "percentage": 97.86, "elapsed_time": "2:13:39", "remaining_time": "0:02:54"}
+ {"current_steps": 1660, "total_steps": 1686, "loss": 0.6142, "lr": 3.623100370091226e-09, "epoch": 2.951111111111111, "percentage": 98.46, "elapsed_time": "2:14:17", "remaining_time": "0:02:06"}
+ {"current_steps": 1670, "total_steps": 1686, "loss": 0.6238, "lr": 1.3722676549493553e-09, "epoch": 2.968888888888889, "percentage": 99.05, "elapsed_time": "2:14:55", "remaining_time": "0:01:17"}
+ {"current_steps": 1680, "total_steps": 1686, "loss": 0.6662, "lr": 1.9299031272956846e-10, "epoch": 2.986666666666667, "percentage": 99.64, "elapsed_time": "2:15:39", "remaining_time": "0:00:29"}
+ {"current_steps": 1686, "total_steps": 1686, "epoch": 2.997333333333333, "percentage": 100.0, "elapsed_time": "2:18:51", "remaining_time": "0:00:00"}
trainer_state.json ADDED
@@ -0,0 +1,1346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 2.997333333333333,
+   "eval_steps": 100,
+   "global_step": 1686,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {"epoch": 0.017777777777777778, "grad_norm": 3.064221071757039, "learning_rate": 2.958579881656805e-07, "loss": 1.0313, "step": 10},
+     {"epoch": 0.035555555555555556, "grad_norm": 2.311668702631407, "learning_rate": 5.91715976331361e-07, "loss": 1.0088, "step": 20},
+     {"epoch": 0.05333333333333334, "grad_norm": 1.7042990791798596, "learning_rate": 8.875739644970415e-07, "loss": 1.0074, "step": 30},
+     {"epoch": 0.07111111111111111, "grad_norm": 1.4979922190245858, "learning_rate": 1.183431952662722e-06, "loss": 0.9623, "step": 40},
+     {"epoch": 0.08888888888888889, "grad_norm": 1.62270031557648, "learning_rate": 1.4792899408284026e-06, "loss": 0.9638, "step": 50},
+     {"epoch": 0.10666666666666667, "grad_norm": 2.3961447512268093, "learning_rate": 1.775147928994083e-06, "loss": 1.0104, "step": 60},
+     {"epoch": 0.12444444444444444, "grad_norm": 1.4915238179077648, "learning_rate": 2.0710059171597635e-06, "loss": 1.0002, "step": 70},
+     {"epoch": 0.14222222222222222, "grad_norm": 2.2772628135300255, "learning_rate": 2.366863905325444e-06, "loss": 1.0223, "step": 80},
+     {"epoch": 0.16, "grad_norm": 1.8457773806820135, "learning_rate": 2.6627218934911246e-06, "loss": 0.9855, "step": 90},
+     {"epoch": 0.17777777777777778, "grad_norm": 1.0682523156202766, "learning_rate": 2.958579881656805e-06, "loss": 0.9373, "step": 100},
+     {"epoch": 0.17777777777777778, "eval_loss": 0.9671729803085327, "eval_runtime": 68.6602, "eval_samples_per_second": 29.129, "eval_steps_per_second": 1.821, "step": 100},
+     {"epoch": 0.19555555555555557, "grad_norm": 1.4598489133901087, "learning_rate": 3.2544378698224853e-06, "loss": 0.9625, "step": 110},
+     {"epoch": 0.21333333333333335, "grad_norm": 2.180267593823412, "learning_rate": 3.550295857988166e-06, "loss": 0.9634, "step": 120},
+     {"epoch": 0.2311111111111111, "grad_norm": 1.2931186615642782, "learning_rate": 3.846153846153847e-06, "loss": 0.933, "step": 130},
+     {"epoch": 0.24888888888888888, "grad_norm": 1.512746752342751, "learning_rate": 4.142011834319527e-06, "loss": 0.951, "step": 140},
+     {"epoch": 0.26666666666666666, "grad_norm": 1.8895957215192007, "learning_rate": 4.437869822485207e-06, "loss": 0.9375, "step": 150},
+     {"epoch": 0.28444444444444444, "grad_norm": 2.562051614464793, "learning_rate": 4.733727810650888e-06, "loss": 0.9533, "step": 160},
+     {"epoch": 0.3022222222222222, "grad_norm": 1.7071555116041661, "learning_rate": 4.999994639090922e-06, "loss": 0.9641, "step": 170},
+     {"epoch": 0.32, "grad_norm": 1.4808539506312526, "learning_rate": 4.999351357820732e-06, "loss": 0.958, "step": 180},
+     {"epoch": 0.3377777777777778, "grad_norm": 3.108282359705838, "learning_rate": 4.99763621084537e-06, "loss": 0.9524, "step": 190},
+     {"epoch": 0.35555555555555557, "grad_norm": 1.6209678934544214, "learning_rate": 4.994849933718568e-06, "loss": 0.9671, "step": 200},
+     {"epoch": 0.35555555555555557, "eval_loss": 0.9638746380805969, "eval_runtime": 67.7576, "eval_samples_per_second": 29.517, "eval_steps_per_second": 1.845, "step": 200},
+     {"epoch": 0.37333333333333335, "grad_norm": 2.011525806742045, "learning_rate": 4.990993721356317e-06, "loss": 0.9697, "step": 210},
+     {"epoch": 0.39111111111111113, "grad_norm": 1.3952496927844644, "learning_rate": 4.986069227524407e-06, "loss": 1.0093, "step": 220},
+     {"epoch": 0.4088888888888889, "grad_norm": 1.1889866128952742, "learning_rate": 4.980078564129211e-06, "loss": 0.9411, "step": 230},
+     {"epoch": 0.4266666666666667, "grad_norm": 1.5732743711804986, "learning_rate": 4.973024300311968e-06, "loss": 1.0076, "step": 240},
+     {"epoch": 0.4444444444444444, "grad_norm": 1.6111844423415858, "learning_rate": 4.9649094613469904e-06, "loss": 0.9674, "step": 250},
+     {"epoch": 0.4622222222222222, "grad_norm": 1.3411297288659159, "learning_rate": 4.955737527344251e-06, "loss": 0.9507, "step": 260},
+     {"epoch": 0.48, "grad_norm": 1.8162663201702451, "learning_rate": 4.945512431756916e-06, "loss": 0.9783, "step": 270},
+     {"epoch": 0.49777777777777776, "grad_norm": 1.373822283926042, "learning_rate": 4.934238559694448e-06, "loss": 0.9694, "step": 280},
+     {"epoch": 0.5155555555555555, "grad_norm": 1.4026814878661258, "learning_rate": 4.921920746042023e-06, "loss": 0.9724, "step": 290},
+     {"epoch": 0.5333333333333333, "grad_norm": 1.4388462413249878, "learning_rate": 4.908564273387051e-06, "loss": 0.9584, "step": 300},
+     {"epoch": 0.5333333333333333, "eval_loss": 0.9628667235374451, "eval_runtime": 67.724, "eval_samples_per_second": 29.532, "eval_steps_per_second": 1.846, "step": 300},
+     {"epoch": 0.5511111111111111, "grad_norm": 1.1700333111323251, "learning_rate": 4.894174869753701e-06, "loss": 0.9468, "step": 310},
+     {"epoch": 0.5688888888888889, "grad_norm": 1.4725090311768079, "learning_rate": 4.878758706146389e-06, "loss": 0.9523, "step": 320},
+     {"epoch": 0.5866666666666667, "grad_norm": 1.2704505454904513, "learning_rate": 4.862322393903305e-06, "loss": 0.9527, "step": 330},
+     {"epoch": 0.6044444444444445, "grad_norm": 2.3493531077217336, "learning_rate": 4.84487298186108e-06, "loss": 0.9544, "step": 340},
+     {"epoch": 0.6222222222222222, "grad_norm": 1.3584514135832693, "learning_rate": 4.8264179533318526e-06, "loss": 0.903, "step": 350},
+     {"epoch": 0.64, "grad_norm": 1.3344268399358867, "learning_rate": 4.806965222893979e-06, "loss": 0.935, "step": 360},
+     {"epoch": 0.6577777777777778, "grad_norm": 1.626354968285715, "learning_rate": 4.786523132997817e-06, "loss": 0.9859, "step": 370},
+     {"epoch": 0.6755555555555556, "grad_norm": 1.3370406996157487, "learning_rate": 4.765100450387999e-06, "loss": 1.0107, "step": 380},
+     {"epoch": 0.6933333333333334, "grad_norm": 2.4117986108589893, "learning_rate": 4.74270636234375e-06, "loss": 0.9379, "step": 390},
+     {"epoch": 0.7111111111111111, "grad_norm": 3.6777984705003957, "learning_rate": 4.719350472738849e-06, "loss": 0.957, "step": 400},
+     {"epoch": 0.7111111111111111, "eval_loss": 0.9597401022911072, "eval_runtime": 67.6639, "eval_samples_per_second": 29.558, "eval_steps_per_second": 1.847, "step": 400},
+     {"epoch": 0.7288888888888889, "grad_norm": 2.9005913172464544, "learning_rate": 4.69504279792294e-06, "loss": 0.9736, "step": 410},
+     {"epoch": 0.7466666666666667, "grad_norm": 1.4861556222479484, "learning_rate": 4.669793762425942e-06, "loss": 0.9402, "step": 420},
+     {"epoch": 0.7644444444444445, "grad_norm": 1.404332314308356, "learning_rate": 4.643614194487411e-06, "loss": 1.0085, "step": 430},
+     {"epoch": 0.7822222222222223, "grad_norm": 1.2973467924712199, "learning_rate": 4.616515321412769e-06, "loss": 0.9195, "step": 440},
+     {"epoch": 0.8, "grad_norm": 1.8400721698335023, "learning_rate": 4.588508764758386e-06, "loss": 0.9976, "step": 450},
+     {"epoch": 0.8177777777777778, "grad_norm": 1.34389013645689, "learning_rate": 4.559606535347594e-06, "loss": 0.9269, "step": 460},
+     {"epoch": 0.8355555555555556, "grad_norm": 1.4781191494614154, "learning_rate": 4.5298210281197474e-06, "loss": 0.9837, "step": 470},
+     {"epoch": 0.8533333333333334, "grad_norm": 1.2822503245441141, "learning_rate": 4.499165016814564e-06, "loss": 0.9712, "step": 480},
+     {"epoch": 0.8711111111111111, "grad_norm": 1.7262576201495767, "learning_rate": 4.46765164849401e-06, "loss": 1.0075, "step": 490},
+     {"epoch": 0.8888888888888888, "grad_norm": 1.848736473377725, "learning_rate": 4.435294437904082e-06, "loss": 0.9477, "step": 500},
+     {"epoch": 0.8888888888888888, "eval_loss": 0.9587395787239075, "eval_runtime": 67.6284, "eval_samples_per_second": 29.573, "eval_steps_per_second": 1.848, "step": 500},
+     {"epoch": 0.9066666666666666, "grad_norm": 1.7174287518034348, "learning_rate": 4.402107261678908e-06, "loss": 1.002, "step": 510},
+     {"epoch": 0.9244444444444444, "grad_norm": 1.4683365527516716, "learning_rate": 4.368104352389641e-06, "loss": 0.9613, "step": 520},
+     {"epoch": 0.9422222222222222, "grad_norm": 1.4656973277718064, "learning_rate": 4.333300292440728e-06, "loss": 0.9345, "step": 530},
+     {"epoch": 0.96, "grad_norm": 1.3740078632601003, "learning_rate": 4.2977100078161196e-06, "loss": 0.9644, "step": 540},
+     {"epoch": 0.9777777777777777, "grad_norm": 1.4355083298426001, "learning_rate": 4.2613487616781595e-06, "loss": 0.9952, "step": 550},
+     {"epoch": 0.9955555555555555, "grad_norm": 1.8504347531710936, "learning_rate": 4.224232147821858e-06, "loss": 1.0213, "step": 560},
+     {"epoch": 1.0133333333333334, "grad_norm": 1.1787812976568717, "learning_rate": 4.186376083987376e-06, "loss": 0.8555, "step": 570},
+     {"epoch": 1.031111111111111, "grad_norm": 1.6020289309675289, "learning_rate": 4.147796805033583e-06, "loss": 0.8231, "step": 580},
+     {"epoch": 1.048888888888889, "grad_norm": 1.5772724854646063, "learning_rate": 4.108510855975611e-06, "loss": 0.7805, "step": 590},
+     {"epoch": 1.0666666666666667, "grad_norm": 1.5423541893936756, "learning_rate": 4.0685350848894065e-06, "loss": 0.8552, "step": 600},
+     {"epoch": 1.0666666666666667, "eval_loss": 0.9710310101509094, "eval_runtime": 67.5663, "eval_samples_per_second": 29.601, "eval_steps_per_second": 1.85, "step": 600},
+     {"epoch": 1.0844444444444445, "grad_norm": 2.0459202464567303, "learning_rate": 4.027886635686301e-06, "loss": 0.8172, "step": 610},
+     {"epoch": 1.1022222222222222, "grad_norm": 1.1330519416716263, "learning_rate": 3.986582940760717e-06, "loss": 0.7653, "step": 620},
+     {"epoch": 1.12, "grad_norm": 1.291984175159882, "learning_rate": 3.9446417135141536e-06, "loss": 0.8175, "step": 630},
+     {"epoch": 1.1377777777777778, "grad_norm": 1.9370076066794117, "learning_rate": 3.902080940758658e-06, "loss": 0.7756, "step": 640},
+     {"epoch": 1.1555555555555554, "grad_norm": 1.1866194794808598, "learning_rate": 3.858918875003053e-06, "loss": 0.752, "step": 650},
+     {"epoch": 1.1733333333333333, "grad_norm": 1.2727193771706111, "learning_rate": 3.815174026625202e-06, "loss": 0.7721, "step": 660},
+     {"epoch": 1.1911111111111112, "grad_norm": 1.545919656220833, "learning_rate": 3.770865155933694e-06, "loss": 0.7816, "step": 670},
+     {"epoch": 1.208888888888889, "grad_norm": 1.9266494592917187, "learning_rate": 3.7260112651223553e-06, "loss": 0.7897, "step": 680},
+     {"epoch": 1.2266666666666666, "grad_norm": 2.057825893443874, "learning_rate": 3.6806315901209987e-06, "loss": 0.8024, "step": 690},
+     {"epoch": 1.2444444444444445, "grad_norm": 1.2392576716268238, "learning_rate": 3.6347455923459616e-06, "loss": 0.7944, "step": 700},
+     {"epoch": 1.2444444444444445, "eval_loss": 0.9722110629081726, "eval_runtime": 67.6028, "eval_samples_per_second": 29.585, "eval_steps_per_second": 1.849, "step": 700},
+     {"epoch": 1.2622222222222224, "grad_norm": 1.7969055853329803, "learning_rate": 3.5883729503539265e-06, "loss": 0.8198, "step": 710},
+     {"epoch": 1.28, "grad_norm": 1.3886692866173258, "learning_rate": 3.541533551402621e-06, "loss": 0.7903, "step": 720},
+     {"epoch": 1.2977777777777777, "grad_norm": 1.2294305356221513, "learning_rate": 3.494247482922024e-06, "loss": 0.8095, "step": 730},
+     {"epoch": 1.3155555555555556, "grad_norm": 1.558667712302195, "learning_rate": 3.4465350238997112e-06, "loss": 0.8326, "step": 740},
+     {"epoch": 1.3333333333333333, "grad_norm": 1.1479609335353171, "learning_rate": 3.3984166361840646e-06, "loss": 0.7833, "step": 750},
+     {"epoch": 1.3511111111111112, "grad_norm": 1.2159354644232423, "learning_rate": 3.3499129557090517e-06, "loss": 0.8171, "step": 760},
+     {"epoch": 1.3688888888888888, "grad_norm": 1.4385597949788431, "learning_rate": 3.301044783644348e-06, "loss": 0.7968, "step": 770},
+     {"epoch": 1.3866666666666667, "grad_norm": 1.4756414075732949, "learning_rate": 3.2518330774746014e-06, "loss": 0.7689, "step": 780},
+     {"epoch": 1.4044444444444444, "grad_norm": 1.5209105713654745, "learning_rate": 3.2022989420116556e-06, "loss": 0.776, "step": 790},
+     {"epoch": 1.4222222222222223, "grad_norm": 1.7391061143058137, "learning_rate": 3.152463620343591e-06, "loss": 0.7359, "step": 800},
+     {"epoch": 1.4222222222222223, "eval_loss": 0.9709274768829346, "eval_runtime": 67.9642, "eval_samples_per_second": 29.427, "eval_steps_per_second": 1.839, "step": 800},
+     {"epoch": 1.44, "grad_norm": 1.023232810277016, "learning_rate": 3.102348484724467e-06, "loss": 0.8143, "step": 810},
+     {"epoch": 1.4577777777777778, "grad_norm": 1.4087143597081888, "learning_rate": 3.0519750274086675e-06, "loss": 0.8282, "step": 820},
+     {"epoch": 1.4755555555555555, "grad_norm": 2.379499508964777, "learning_rate": 3.001364851433789e-06, "loss": 0.7994, "step": 830},
+     {"epoch": 1.4933333333333334, "grad_norm": 1.4148971570091637, "learning_rate": 2.950539661356011e-06, "loss": 0.7533, "step": 840},
+     {"epoch": 1.511111111111111, "grad_norm": 1.3823213189741146, "learning_rate": 2.899521253941937e-06, "loss": 0.8492, "step": 850},
+     {"epoch": 1.528888888888889, "grad_norm": 1.1488970638328968, "learning_rate": 2.8483315088208828e-06, "loss": 0.7805, "step": 860},
+     {"epoch": 1.5466666666666666, "grad_norm": 1.5267987965557517, "learning_rate": 2.7969923791016334e-06, "loss": 0.8412, "step": 870},
+     {"epoch": 1.5644444444444443, "grad_norm": 1.393454391992812, "learning_rate": 2.7455258819576876e-06, "loss": 0.8048, "step": 880},
+     {"epoch": 1.5822222222222222, "grad_norm": 1.8181282810366222, "learning_rate": 2.6939540891850265e-06, "loss": 0.7951, "step": 890},
+     {"epoch": 1.6, "grad_norm": 1.2842315283042123, "learning_rate": 2.642299117736456e-06, "loss": 0.8494, "step": 900},
+     {"epoch": 1.6, "eval_loss": 0.9661916494369507, "eval_runtime": 67.5902, "eval_samples_per_second": 29.59, "eval_steps_per_second": 1.849, "step": 900},
+     {"epoch": 1.6177777777777778, "grad_norm": 1.3592948657037425, "learning_rate": 2.5905831202365856e-06, "loss": 0.7601, "step": 910},
+     {"epoch": 1.6355555555555554, "grad_norm": 1.811137037940328, "learning_rate": 2.538828275481509e-06, "loss": 0.7561, "step": 920},
+     {"epoch": 1.6533333333333333, "grad_norm": 1.2508052091761472, "learning_rate": 2.4870567789272563e-06, "loss": 0.7645, "step": 930},
+     {"epoch": 1.6711111111111112, "grad_norm": 1.6434568587878196, "learning_rate": 2.435290833171109e-06, "loss": 0.7935, "step": 940},
+     {"epoch": 1.6888888888888889, "grad_norm": 1.635983339764013, "learning_rate": 2.3835526384298475e-06, "loss": 0.7813, "step": 950},
+     {"epoch": 1.7066666666666666, "grad_norm": 1.3393891297643545, "learning_rate": 2.3318643830190186e-06, "loss": 0.752, "step": 960},
+     {"epoch": 1.7244444444444444, "grad_norm": 1.468531788770485, "learning_rate": 2.2802482338373096e-06, "loss": 0.78, "step": 970},
+     {"epoch": 1.7422222222222223, "grad_norm": 1.4403918441032775, "learning_rate": 2.228726326860109e-06, "loss": 0.7736, "step": 980},
+     {"epoch": 1.76, "grad_norm": 1.5016888280136909, "learning_rate": 2.1773207576463227e-06, "loss": 0.799, "step": 990},
+     {"epoch": 1.7777777777777777, "grad_norm": 1.4232106086043248, "learning_rate": 2.1260535718625274e-06, "loss": 0.8163, "step": 1000},
+     {"epoch": 1.7777777777777777, "eval_loss": 0.9663456082344055, "eval_runtime": 67.7852, "eval_samples_per_second": 29.505, "eval_steps_per_second": 1.844, "step": 1000},
+     {"epoch": 1.7955555555555556, "grad_norm": 2.021142501636729, "learning_rate": 2.0749467558285224e-06, "loss": 0.7882, "step": 1010},
+     {"epoch": 1.8133333333333335, "grad_norm": 1.4389492235517993, "learning_rate": 2.024022227088329e-06, "loss": 0.8198, "step": 1020},
+     {"epoch": 1.8311111111111111, "grad_norm": 1.8381964707129699, "learning_rate": 1.973301825010685e-06, "loss": 0.8141, "step": 1030},
+     {"epoch": 1.8488888888888888, "grad_norm": 1.1150366712372968, "learning_rate": 1.922807301423065e-06, "loss": 0.7297, "step": 1040},
+     {"epoch": 1.8666666666666667, "grad_norm": 1.1816318425909016, "learning_rate": 1.8725603112832518e-06, "loss": 0.7754, "step": 1050},
+     {"epoch": 1.8844444444444446, "grad_norm": 2.3505796457878154, "learning_rate": 1.8225824033924377e-06, "loss": 0.8001, "step": 1060},
+     {"epoch": 1.9022222222222223, "grad_norm": 1.4164977558439569, "learning_rate": 1.7728950111538585e-06, "loss": 0.7828, "step": 1070},
+     {"epoch": 1.92, "grad_norm": 1.0614294585251351, "learning_rate": 1.7235194433809215e-06, "loss": 0.7754, "step": 1080},
+     {"epoch": 1.9377777777777778, "grad_norm": 1.2695304411022084, "learning_rate": 1.674476875158756e-06, "loss": 0.7963, "step": 1090},
+     {"epoch": 1.9555555555555557, "grad_norm": 1.7878889758415848, "learning_rate": 1.6257883387631178e-06, "loss": 0.8041, "step": 1100},
+     {"epoch": 1.9555555555555557, "eval_loss": 0.9638973474502563, "eval_runtime": 67.6938, "eval_samples_per_second": 29.545, "eval_steps_per_second": 1.847, "step": 1100},
+     {"epoch": 1.9733333333333334, "grad_norm": 1.2393550606660526, "learning_rate": 1.5774747146405423e-06, "loss": 0.8067, "step": 1110},
+     {"epoch": 1.991111111111111, "grad_norm": 1.3321869776875785, "learning_rate": 1.5295567224536023e-06, "loss": 0.7654, "step": 1120},
+     {"epoch": 2.008888888888889, "grad_norm": 1.352880244351638, "learning_rate": 1.4820549121951321e-06, "loss": 0.7469, "step": 1130},
+     {"epoch": 2.026666666666667, "grad_norm": 1.4679043094200657, "learning_rate": 1.434989655375197e-06, "loss": 0.6237, "step": 1140},
+     {"epoch": 2.0444444444444443, "grad_norm": 1.308896485866915, "learning_rate": 1.3883811362846275e-06, "loss": 0.6339, "step": 1150},
+     {"epoch": 2.062222222222222, "grad_norm": 1.1573190903084591, "learning_rate": 1.342249343338829e-06, "loss": 0.6055, "step": 1160},
+     {"epoch": 2.08, "grad_norm": 1.4670238225692112, "learning_rate": 1.296614060505596e-06, "loss": 0.6589, "step": 1170},
+     {"epoch": 2.097777777777778, "grad_norm": 1.1016096251472431, "learning_rate": 1.251494858820615e-06, "loss": 0.6441, "step": 1180},
+     {"epoch": 2.1155555555555554, "grad_norm": 1.4722885203046696, "learning_rate": 1.2069110879942758e-06, "loss": 0.6571, "step": 1190},
+     {"epoch": 2.1333333333333333, "grad_norm": 1.8212881071516174, "learning_rate": 1.1628818681133965e-06, "loss": 0.6291, "step": 1200},
+     {"epoch": 2.1333333333333333, "eval_loss": 0.9990262985229492, "eval_runtime": 67.7113, "eval_samples_per_second": 29.537, "eval_steps_per_second": 1.846, "step": 1200},
+     {"epoch": 2.151111111111111, "grad_norm": 1.1599321734840744, "learning_rate": 1.119426081441437e-06, "loss": 0.6581, "step": 1210},
+     {"epoch": 2.168888888888889, "grad_norm": 1.0610363548430464, "learning_rate": 1.0765623643206946e-06, "loss": 0.6067, "step": 1220},
+     {"epoch": 2.1866666666666665, "grad_norm": 1.2202345490291358, "learning_rate": 1.0343090991799677e-06, "loss": 0.6324, "step": 1230},
+     {"epoch": 2.2044444444444444, "grad_norm": 1.3964391722449683, "learning_rate": 9.926844066511132e-07, "loss": 0.6631, "step": 1240},
+     {"epoch": 2.2222222222222223, "grad_norm": 1.5877635166510526, "learning_rate": 9.517061377978762e-07, "loss": 0.6162, "step": 1250},
+     {"epoch": 2.24, "grad_norm": 1.6284099053558183, "learning_rate": 9.113918664603277e-07, "loss": 0.7062, "step": 1260},
+     {"epoch": 2.2577777777777777, "grad_norm": 1.2153325885276287, "learning_rate": 8.717588817181868e-07, "loss": 0.6165, "step": 1270},
+     {"epoch": 2.2755555555555556, "grad_norm": 1.9069853339995495, "learning_rate": 8.328241804762737e-07, "loss": 0.6159, "step": 1280},
+     {"epoch": 2.2933333333333334, "grad_norm": 1.2627676318256413, "learning_rate": 7.946044601752539e-07, "loss": 0.6516, "step": 1290},
+     {"epoch": 2.311111111111111, "grad_norm": 1.5118126208151608, "learning_rate": 7.571161116308206e-07, "loss": 0.6122, "step": 1300},
+     {"epoch": 2.311111111111111, "eval_loss": 1.0003914833068848, "eval_runtime": 67.475, "eval_samples_per_second": 29.641, "eval_steps_per_second": 1.853, "step": 1300},
+     {"epoch": 2.328888888888889, "grad_norm": 1.2227440757674266, "learning_rate": 7.203752120043608e-07, "loss": 0.618, "step": 1310},
+     {"epoch": 2.3466666666666667, "grad_norm": 1.5510858354361334, "learning_rate": 6.843975179081513e-07, "loss": 0.6663, "step": 1320},
+     {"epoch": 2.3644444444444446, "grad_norm": 1.378250665848821, "learning_rate": 6.491984586480132e-07, "loss": 0.6055, "step": 1330},
+     {"epoch": 2.3822222222222225, "grad_norm": 1.1127356981856744, "learning_rate": 6.147931296063353e-07, "loss": 0.6199, "step": 1340},
+     {"epoch": 2.4, "grad_norm": 2.188826901721088, "learning_rate": 5.811962857683124e-07, "loss": 0.6617, "step": 1350},
+     {"epoch": 2.417777777777778, "grad_norm": 1.3828567284735036, "learning_rate": 5.484223353941506e-07, "loss": 0.6497, "step": 1360},
+     {"epoch": 2.4355555555555557, "grad_norm": 1.5823725055737101, "learning_rate": 5.164853338399795e-07, "loss": 0.5923, "step": 1370},
+     {"epoch": 2.453333333333333, "grad_norm": 1.3562682177669632, "learning_rate": 4.853989775301074e-07, "loss": 0.6634, "step": 1380},
+     {"epoch": 2.471111111111111, "grad_norm": 1.2195280460980205, "learning_rate": 4.55176598083206e-07, "loss": 0.6338, "step": 1390},
+     {"epoch": 2.488888888888889, "grad_norm": 1.2628448199756899, "learning_rate": 4.2583115659494356e-07, "loss": 0.6718, "step": 1400},
+     {"epoch": 2.488888888888889, "eval_loss": 1.0003468990325928, "eval_runtime": 67.5434, "eval_samples_per_second": 29.611, "eval_steps_per_second": 1.851, "step": 1400},
+     {"epoch": 2.506666666666667, "grad_norm": 1.2286413880313467, "learning_rate": 3.9737523807952206e-07, "loss": 0.5828, "step": 1410},
+     {"epoch": 2.5244444444444447, "grad_norm": 1.394873151610969, "learning_rate": 3.698210460724991e-07, "loss": 0.6212, "step": 1420},
+     {"epoch": 2.542222222222222, "grad_norm": 1.780676850609808, "learning_rate": 3.4318039739720974e-07, "loss": 0.5807, "step": 1430},
+     {"epoch": 2.56, "grad_norm": 1.1085749116496726, "learning_rate": 3.1746471709702963e-07, "loss": 0.6545, "step": 1440},
+     {"epoch": 2.5777777777777775, "grad_norm": 1.330319629815968, "learning_rate": 2.9268503353566484e-07, "loss": 0.6209, "step": 1450},
+     {"epoch": 2.5955555555555554, "grad_norm": 1.3556403897420495, "learning_rate": 2.6885197366754935e-07, "loss": 0.6429, "step": 1460},
+     {"epoch": 2.6133333333333333, "grad_norm": 1.3098137995965915, "learning_rate": 2.459757584803965e-07, "loss": 0.6184, "step": 1470},
+     {"epoch": 2.631111111111111, "grad_norm": 1.402526099054316, "learning_rate": 2.2406619861185363e-07, "loss": 0.6387, "step": 1480},
+     {"epoch": 2.648888888888889, "grad_norm": 1.230774964223306, "learning_rate": 2.0313269014213378e-07, "loss": 0.6321, "step": 1490},
+     {"epoch": 2.6666666666666665, "grad_norm": 1.3122297500292182, "learning_rate": 1.8318421056443576e-07, "loss": 0.6712, "step": 1500},
+     {"epoch": 2.6666666666666665, "eval_loss": 1.0002143383026123, "eval_runtime": 67.5875, "eval_samples_per_second": 29.591, "eval_steps_per_second": 1.849, "step": 1500},
+     {"epoch": 2.6844444444444444, "grad_norm": 1.551009662996816, "learning_rate": 1.6422931493488004e-07, "loss": 0.64, "step": 1510},
+     {"epoch": 2.7022222222222223, "grad_norm": 1.6211134481898983, "learning_rate": 1.4627613220360632e-07, "loss": 0.6328, "step": 1520},
+     {"epoch": 2.7199999999999998, "grad_norm": 1.4231163217204568, "learning_rate": 1.2933236172861435e-07, "loss": 0.6195, "step": 1530},
+     {"epoch": 2.7377777777777776, "grad_norm": 1.3269268827126044, "learning_rate": 1.1340526997383194e-07, "loss": 0.6437, "step": 1540},
+     {"epoch": 2.7555555555555555, "grad_norm": 1.48084418534789, "learning_rate": 9.850168739284183e-08, "loss": 0.6516, "step": 1550},
+     {"epoch": 2.7733333333333334, "grad_norm": 1.1708528942719758, "learning_rate": 8.462800549958793e-08, "loss": 0.6293, "step": 1560},
+     {"epoch": 2.7911111111111113, "grad_norm": 1.7994599570567928, "learning_rate": 7.179017412732581e-08, "loss": 0.6241, "step": 1570},
+     {"epoch": 2.8088888888888888, "grad_norm": 1.3656892730986343, "learning_rate": 5.999369887699113e-08, "loss": 0.6547, "step": 1580},
+     {"epoch": 2.8266666666666667, "grad_norm": 1.1251691436587674, "learning_rate": 4.924363875608379e-08, "loss": 0.6381, "step": 1590},
+     {"epoch": 2.8444444444444446, "grad_norm": 1.3837083789874627, "learning_rate": 3.9544604009072173e-08, "loss": 0.6397, "step": 1600},
+     {"epoch": 2.8444444444444446, "eval_loss": 0.9996402859687805, "eval_runtime": 67.5391, "eval_samples_per_second": 29.612, "eval_steps_per_second": 1.851, "step": 1600},
+     {"epoch": 2.862222222222222, "grad_norm": 1.2700069292148142, "learning_rate": 3.090075414025562e-08, "loss": 0.6399, "step": 1610},
+     {"epoch": 2.88, "grad_norm": 1.4185347259810261, "learning_rate": 2.331579612993018e-08, "loss": 0.6356, "step": 1620},
+     {"epoch": 2.897777777777778, "grad_norm": 1.72135552899837, "learning_rate": 1.679298284462033e-08, "loss": 0.6473, "step": 1630},
+     {"epoch": 2.9155555555555557, "grad_norm": 1.4178555511488433, "learning_rate": 1.1335111642064855e-08, "loss": 0.639, "step": 1640},
+     {"epoch": 2.9333333333333336, "grad_norm": 1.6652000112393874, "learning_rate": 6.944523171547313e-09, "loss": 0.6115, "step": 1650},
+     {"epoch": 2.951111111111111, "grad_norm": 1.3379355246759241, "learning_rate": 3.623100370091226e-09, "loss": 0.6142, "step": 1660},
+     {"epoch": 2.968888888888889, "grad_norm": 1.1616853765979296, "learning_rate": 1.3722676549493553e-09, "loss": 0.6238, "step": 1670},
+     {"epoch": 2.986666666666667, "grad_norm": 1.2081807930869184, "learning_rate": 1.9299031272956846e-10, "loss": 0.6662, "step": 1680},
+     {"epoch": 2.997333333333333, "step": 1686, "total_flos": 297597215473664.0, "train_loss": 0.7989534891938541, "train_runtime": 8333.3074, "train_samples_per_second": 6.48, "train_steps_per_second": 0.202}
+   ],
+   "logging_steps": 10,
+   "max_steps": 1686,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 3,
+   "save_steps": 562,
+   "stateful_callbacks": {
+     "TrainerControl": {
+       "args": {
+         "should_epoch_stop": false,
+         "should_evaluate": false,
+         "should_log": false,
+         "should_save": true,
+         "should_training_stop": true
+       },
+       "attributes": {}
+     }
+   },
+   "total_flos": 297597215473664.0,
+   "train_batch_size": 1,
+   "trial_name": null,
+   "trial_params": null
+ }
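
The loss curves committed below (training_loss.png, training_eval_loss.png) can be re-derived from the log_history shown above. A minimal sketch, assuming a local copy of trainer_state.json and an installed matplotlib; the output filename is illustrative:

```python
import json
import matplotlib.pyplot as plt

# Load the state file written by the Hugging Face Trainer at the end of the run.
with open("trainer_state.json") as f:
    state = json.load(f)

# log_history mixes training records (key "loss") and evaluation records
# (key "eval_loss"); the final summary entry carries neither key.
train = [e for e in state["log_history"] if "loss" in e]
evals = [e for e in state["log_history"] if "eval_loss" in e]

plt.plot([e["step"] for e in train], [e["loss"] for e in train], label="train loss")
plt.plot([e["step"] for e in evals], [e["eval_loss"] for e in evals], label="eval loss")
plt.xlabel("step")
plt.ylabel("loss")
plt.legend()
plt.savefig("loss_curves.png")  # illustrative output name
```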
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d546eb7dd498845b935fd0668a8645c7c06579214979706fcc1fcdf936843ef2
+ size 7160
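
training_args.bin is a binary pickle of the TrainingArguments used for this run, which is why only an LFS pointer is rendered above. A minimal sketch for inspecting it, assuming torch and transformers matching the framework versions in the README are importable (weights_only=False is needed because the file holds a pickled Python object rather than tensors):

```python
import torch

# Unpickle the transformers.TrainingArguments saved alongside the checkpoint;
# full unpickling must be enabled, not the tensor-only fast path.
args = torch.load("training_args.bin", weights_only=False)
print(args.learning_rate, args.lr_scheduler_type, args.num_train_epochs)
```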
training_eval_loss.png ADDED
training_loss.png ADDED
vocab.json ADDED
The diff for this file is too large to render.