| from transformers.models.llama.configuration_llama import LlamaConfig | |
class NextStepConfig(LlamaConfig):
    """Configuration for ``nextstep`` models, built on top of ``LlamaConfig``.

    Every keyword argument not listed below is forwarded unchanged to
    ``LlamaConfig.__init__``. The arguments listed below are stored on the
    instance under the same name; no validation is performed here.

    Args:
        vae_name_or_path: Stored as-is; presumably identifies the VAE
            checkpoint to use (confirm against the model code).
        latent_size: Stored as-is (default 32).
        latent_patch_size: Stored as-is (default 2).
        latent_channels: Stored as-is (default 16).
        boi: Stored as-is; presumably the begin-of-image token id.
        eoi: Stored as-is; presumably the end-of-image token id.
        image_placeholder_id: Stored as-is; presumably the token id that
            marks image positions in the text sequence.
        pad_token_id_added: Stored as-is.
        lm_loss_weight: Stored as-is (default 0.01); presumably the weight
            of the language-modeling loss term.
        im_loss_weight: Stored as-is (default 1.0); presumably the weight
            of the image loss term.
        fm_head_dim: Stored as-is (default 1536).
        fm_head_layers: Stored as-is (default 12).
        fm_head_batch_mul: Stored as-is (default 4).
        o_attention_bias: When ``None``, the instance's ``attention_bias``
            value (available after the parent initializer has run) is used
            instead.
        **kwargs: Passed through to ``LlamaConfig.__init__``.
    """

    model_type = "nextstep"

    def __init__(
        self,
        vae_name_or_path: str | None = None,
        latent_size: int = 32,
        latent_patch_size: int = 2,
        latent_channels: int = 16,
        boi: int | None = None,
        eoi: int | None = None,
        image_placeholder_id: int | None = None,
        pad_token_id_added: int | None = None,
        lm_loss_weight: float = 0.01,
        im_loss_weight: float = 1.0,
        fm_head_dim: int = 1536,
        fm_head_layers: int = 12,
        fm_head_batch_mul: int = 4,
        o_attention_bias: bool | None = None,
        **kwargs,
    ):
        # The parent initializer must run first: ``self.attention_bias`` is
        # read further down as the fallback for ``o_attention_bias``.
        super().__init__(**kwargs)

        # Model-specific settings, stored verbatim in declaration order.
        own_settings = {
            "vae_name_or_path": vae_name_or_path,
            "latent_size": latent_size,
            "latent_patch_size": latent_patch_size,
            "latent_channels": latent_channels,
            "boi": boi,
            "eoi": eoi,
            "image_placeholder_id": image_placeholder_id,
            "pad_token_id_added": pad_token_id_added,
            "lm_loss_weight": lm_loss_weight,
            "im_loss_weight": im_loss_weight,
            "fm_head_dim": fm_head_dim,
            "fm_head_layers": fm_head_layers,
            "fm_head_batch_mul": fm_head_batch_mul,
        }
        for setting_name, setting_value in own_settings.items():
            setattr(self, setting_name, setting_value)

        # An unspecified output-projection bias inherits the general
        # attention bias setting.
        if o_attention_bias is None:
            o_attention_bias = self.attention_bias
        self.o_attention_bias = o_attention_bias