{
  "name": "blt_7b",
  "dump_dir": "/checkpoints/blt_7b",
  "seed": 42,
  "debug_dynamo": false,
  "grad_acc_steps": 1,
  "gc_collect_freq": 1000,
  "probe_freq": null,
  "steps": 240000,
  "max_steps": null,
  "data": {
    "s3_profile": "blt",
    "batch_size": 4,
    "seq_len": 4096,
    "seed": 42,
    "add_bos": true,
    "add_eos": true,
    "load_async": true,
    "async_persist_type": "approximate",
    "prefetch_size": 200,
    "preprocess_dir": "/corpora/entropy_preprocess",
    "dataset_files": null,
    "entropy_model_name": "transformer_100m",
    "arrow_batch_size": 20,
    "buffer_size": 512,
    "file_format": "arrow",
    "pad_to_max_length": true,
    "max_encoder_seq_length": 24576,
    "enable_byte_ngrams": false,
    "add_patches": true,
    "tokenizer_args": {
      "name": "blt",
      "init_kwargs": {
        "bpe_tokenizer_path": "/tokenizers/tokenizer_final_32k.minus_inf_ws.model"
      }
    },
    "patcher_args": {
      "patching_mode": "entropy",
      "patching_device": "cuda",
      "entropy_model_checkpoint_dir": null,
      "realtime_patching": false,
      "threshold": 1.335442066192627,
      "threshold_add": null,
      "max_patch_length": null,
      "patch_size": 4.5,
      "patching_batch_size": 1,
      "device": "cuda",
      "monotonicity": false,
      "log_time": false
    }
  },
  "optim": {
    "lr": 0.0004,
    "weight_decay": 0.1,
    "epsilon": 1e-08,
    "beta1": 0.9,
    "beta2": 0.95,
    "clip": 1.0,
    "scheduler": "cosine",
    "warmup": 2000,
    "lr_min_ratio": 0.01,
    "cycle_length": 1.0,
    "cosine_theta": 1.0,
    "annealing_step": 1000,
    "decay_fraction": 0.1,
    "exp_factor": 0.5
  },
  "model": {
    "dim": 512,
    "n_layers": 8,
    "head_dim": null,
    "n_heads": 8,
    "n_kv_heads": null,
    "ffn_dim_multiplier": 1.0,
    "multiple_of": 256,
    "norm_eps": 1e-05,
    "rope_theta": 500000.0,
    "rope_use_fp32_in_outer_product": true,
    "init_base_std": null,
    "init_std_factor": "current_depth",
    "max_seqlen": 4096,
    "attn_impl": "xformers",
    "attn_bias_type": "block_causal",
    "eos_id": 2,
    "seed": 42,
    "vocab_size": 260,
    "weight_tying": false,
    "patch_in_forward": true,
    "dim_token": null,
    "dim_global": 4096,
    "dim_local_decoder": 1280,
    "dim_local_encoder": 1280,
    "n_layers_global": 32,
    "n_layers_local_decoder": 6,
    "n_layers_local_encoder": 1,
    "patch_size": 4.5,
    "patching_mode": "entropy",
    "patching_threshold": 1.335442066192627,
    "patching_threshold_add": null,
    "monotonicity": false,
    "patching_batch_size": 1,
    "patching_device": "cuda",
    "max_patch_length": null,
    "tie_local_encoder_decoder_logits": false,
    "use_local_encoder_transformer": true,
    "encoder_lm_loss": false,
    "max_encoder_seq_length": 24576,
    "pad_to_max_length": true,
    "encoder_enable_byte_ngrams": false,
    "encoder_enable_byte_group_hash": false,
    "ngram_vocab_sizes": null,
    "cross_attn_encoder": true,
    "cross_attn_decoder": true,
    "cross_attn_window_encoder": null,
    "cross_attn_window_decoder": null,
    "cross_attn_k": 4,
    "cross_attn_nheads": 20,
    "cross_attn_all_layers_decoder": true,
    "cross_attn_all_layers_encoder": false,
    "cross_attn_use_flex_attention": true,
    "cross_attn_init_by_pooling": true,
    "encoder_hash_byte_group_size": [
      3,
      4,
      5,
      6,
      7,
      8
    ],
    "encoder_hash_byte_group_vocab": 500002,
    "encoder_hash_byte_group_nb_functions": 1,
    "log_patch_lengths": false,
    "non_linearity": "swiglu",
    "use_rope": true,
    "recompute_fc1_out": false,
    "recompute_fc3_out": false,
    "recompute_attn": false,
    "custom_bwd": false,
    "layer_ckpt": "none",
    "init_use_gaussian": true,
    "init_use_depth": "current",
    "alpha_depth": "disabled",
    "max_length": 4096,
    "norm_affine": true,
    "pre_norm": true,
    "norm_type": "rmsnorm",
    "dropout": 0.0,
    "output_size": -1,
    "architecture": "vanilla",
    "share_encoder_decoder_emb": true,
    "global_local_decoder_residual_layer": null,
    "tokenize_with_bpe_delimiter": false,
    "patching_thresholds_str": null,
    "tie_local_encoder_decoder": false,
    "encoder_preds_low_entropy_toks": null,
    "encoder_preds_random_toks": null,
    "dim_token_emb": null,
    "dim_patch_emb": null,
    "encoder_ngram_table_dir": null,
    "encoder_ngram_to_size_str": null,
    "entropy_model_checkpoint_dir": null,
    "entropy_model_is_ngram_model": false,
    "downsampling_by_pooling": "max",
    "n_heads_global": 32,
    "n_heads_local_decoder": 20,
    "n_heads_local_encoder": 20,
    "n_kv_heads_global": null,
    "conv_kernel_size": null,
    "local_attention_window_len": 512,
    "sequence_parallel": false,
    "loss_parallel": false,
    "fuse_sequence_parallel": false,
    "use_fsdp": true,
    "attn_to_keep": "all",
    "pm_size": 0,
    "full_logging_n_layers": 4
  },
  "entropy_model": null,
  "train_entropy_model": false,
  "distributed": {
    "dp_shard": 1,
    "dp_replicate": 256,
    "tp_size": 1,
    "selective_activation_checkpointing": true,
    "compile": false,
    "fsdp_type": "full_shard",
    "model_dtype": "bf16",
    "float8_recipe": null,
    "float8_filter": "layers\\.[0-9]+\\.",
    "matmul_allow_tf32": false,
    "allow_bf16_reduced_precision_reduction": true,
    "detect_anomaly": false,
    "compile_cache_size_limit": 8,
    "spawn_method": "forkserver"
  },
  "env": {
    "MKL_SERVICE_FORCE_INTEL": "GNU",
    "OMP_NUM_THREADS": "1",
    "MKL_NUM_THREADS": "1",
    "ENABLE_INTRA_NODE_COMM": "1",
    "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
    "NCCL_IB_TIMEOUT": "22",
    "NCCL_DEBUG": "INFO",
    "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1"
  },
  "checkpoint": {
    "dump": {
      "every": 1000,
      "keep": 1
    },
    "eval": {
      "every": 100000,
      "keep": -1
    },
    "path": "/checkpoints/blt_7b",
    "init_ckpt_path": null,
    "continue_training_from_init": false,
    "s3_profile": null
  },
  "profiling": {
    "run": false,
    "trace_folder": "profiling",
    "mem_warmup": 0,
    "mem_steps": 4,
    "profile_warmup": 100,
    "profile_steps": 4
  },
  "logging": {
    "freq": 10,
    "acc_freq": null,
    "wandb": {
      "job_type": "train",
      "dir": null,
      "project": "blt",
      "entity": "blt",
      "tags": null,
      "group": null,
      "name": "blt_7b",
      "notes": null,
      "config_exclude_keys": null,
      "config_include_keys": null,
      "anonymous": null,
      "mode": null,
      "allow_val_change": null,
      "resume": null,
      "force": null,
      "tensorboard": null,
      "sync_tensorboard": null,
      "monitor_gym": null,
      "save_code": null,
      "id": null,
      "fork_from": null,
      "resume_from": null
    }
  },
  "async_eval_gpus": null,
  "eval": null,
  "eval_on_gpus": 8
}
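As a rough sanity check on the scale implied by this config, the sketch below loads the JSON with the standard library and derives the global batch size and total training bytes from the fields shown above. The filename `blt_7b.json` is hypothetical, and it assumes that `data.batch_size` is a per-data-parallel-replica value (so the global batch multiplies by `dp_replicate`, `dp_shard`, and `grad_acc_steps`); if the field is already global in your setup, drop those factors.

```python
import json

# Hypothetical filename for the config listed above.
with open("blt_7b.json") as f:
    cfg = json.load(f)

data = cfg["data"]
dist = cfg["distributed"]

# Assumption: batch_size is per data-parallel replica.
sequences_per_step = (
    data["batch_size"] * dist["dp_replicate"] * dist["dp_shard"] * cfg["grad_acc_steps"]
)
bytes_per_step = sequences_per_step * data["seq_len"]
total_bytes = bytes_per_step * cfg["steps"]

print(f"sequences per optimizer step: {sequences_per_step}")   # 4 * 256 * 1 * 1 = 1024
print(f"bytes per optimizer step:     {bytes_per_step:,}")      # 1024 * 4096 = 4,194,304
print(f"total training bytes:         {total_bytes:,}")         # ~1.0e12 over 240k steps
```

Under that per-replica assumption, this run processes roughly 4.2 MB of raw bytes per optimizer step and on the order of 1e12 bytes over the full 240,000 steps.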