common:
  # The number of historical images
  img_history_size: 2
  # The number of future actions to predict
  action_chunk_size: 64
  # The number of cameras to be used in the model
  num_cameras: 3
  # Dimension of the state/action vector; we use the same unified space for both states and actions
  # This MUST match the dimension defined in configs/state_vec.py
  state_dim: 128

dataset:
  # The producer extracts data from the raw dataset
  # and stores it in a disk buffer;
  # during training, the consumer reads samples
  # randomly from the buffer.
  # The producer replaces data that has already been
  # read by the consumer with new data.
  # The path to the buffer (requires at least 400GB of disk space)
  buf_path: /home/jellyho/RDTBuffer
  # The number of chunks in the buffer
  buf_num_chunks: 128
  # The number of samples (steps rather than episodes) in each chunk
  buf_chunk_size: 128
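  # Note: total buffer capacity is buf_num_chunks * buf_chunk_size samples
  # (with the values above: 128 * 128 = 16384)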
  # Episodes with fewer than `epsd_len_thresh_low` steps are filtered out
  epsd_len_thresh_low: 32
  # For episodes longer than `epsd_len_thresh_high` steps,
  # we randomly sample `epsd_len_thresh_high` steps each time the episode is loaded,
  # to better balance the training datasets
  epsd_len_thresh_high: 2048
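  # (e.g., a 4000-step episode would contribute 2048 randomly sampled steps per load)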
  # How to fit images to the expected input size
  image_aspect_ratio: pad
  # Maximum number of language tokens
  tokenizer_max_length: 1024
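  # (instructions longer than this are assumed to be truncated by the tokenizer)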

model:
  # Config for the condition adaptors
  lang_adaptor: mlp2x_gelu
  img_adaptor: mlp2x_gelu
  state_adaptor: mlp3x_gelu
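  # (`mlpNx_gelu` is assumed to denote an N-layer MLP with GELU activations, LLaVA-style naming)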
  lang_token_dim: 4096
  img_token_dim: 1152
  # Dim of action or proprioception vector
  # A `state` refers to an action or a proprioception vector
  state_token_dim: 128
  # Config for the RDT structure
  rdt:
    # 1B variant: num_heads 32, hidden_size 2048
    hidden_size: 2048
    depth: 28
    num_heads: 32
    cond_pos_embed_type: multimodal
  # For the noise scheduler
  noise_scheduler:
    type: ddpm
    num_train_timesteps: 1000
    num_inference_timesteps: 5
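    # (the model is trained with 1000 diffusion timesteps; sampling at inference
    # is assumed to use only 5 denoising steps)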
    beta_schedule: squaredcos_cap_v2  # Critical choice
    prediction_type: sample
    clip_sample: False
  # For EMA (parameter averaging)
  # We do not use EMA currently
  ema:
    update_after_step: 0
    inv_gamma: 1.0
    power: 0.75
    min_value: 0.0
    max_value: 0.9999
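    # In diffusers-style EMA implementations, the decay at a given step is typically
    # 1 - (1 + step / inv_gamma) ** (-power), clipped to [min_value, max_value];
    # update_after_step delays the start of EMA updates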