ThomasTheMaker committed
Commit a1a7208 · verified · 1 Parent(s): 20387e9

Upload folder using huggingface_hub

Files changed (32)
  1. .gitattributes +10 -0
  2. pico-decoder-tiny-dolma250M-v1/checkpoints/step_100000/fabric_state/checkpoint.pt +1 -1
  3. pico-decoder-tiny-dolma250M-v1/checkpoints/step_100000/learning_dynamics/train_activations.pt +0 -0
  4. pico-decoder-tiny-dolma250M-v1/checkpoints/step_100000/learning_dynamics/train_data/data-00000-of-00001.arrow +3 -0
  5. pico-decoder-tiny-dolma250M-v1/checkpoints/step_100000/learning_dynamics/train_data/dataset_info.json +19 -0
  6. pico-decoder-tiny-dolma250M-v1/checkpoints/step_100000/learning_dynamics/train_data/state.json +13 -0
  7. pico-decoder-tiny-dolma250M-v1/checkpoints/step_100000/learning_dynamics/train_gradients.pt +3 -0
  8. pico-decoder-tiny-dolma250M-v1/checkpoints/step_100000/learning_dynamics/train_weights.pt +3 -0
  9. pico-decoder-tiny-dolma250M-v1/checkpoints/step_102000/config.json +22 -0
  10. pico-decoder-tiny-dolma250M-v1/checkpoints/step_102000/fabric_state/checkpoint.pt +3 -0
  11. pico-decoder-tiny-dolma250M-v1/checkpoints/step_102000/generation_config.json +4 -0
  12. pico-decoder-tiny-dolma250M-v1/checkpoints/step_102000/learning_dynamics/train_activations.pt +0 -0
  13. pico-decoder-tiny-dolma250M-v1/checkpoints/step_102000/learning_dynamics/train_data/data-00000-of-00001.arrow +3 -0
  14. pico-decoder-tiny-dolma250M-v1/checkpoints/step_102000/learning_dynamics/train_data/dataset_info.json +19 -0
  15. pico-decoder-tiny-dolma250M-v1/checkpoints/step_102000/learning_dynamics/train_data/state.json +13 -0
  16. pico-decoder-tiny-dolma250M-v1/checkpoints/step_102000/learning_dynamics/train_gradients.pt +3 -0
  17. pico-decoder-tiny-dolma250M-v1/checkpoints/step_102000/learning_dynamics/train_weights.pt +3 -0
  18. pico-decoder-tiny-dolma250M-v1/checkpoints/step_102000/model.safetensors +3 -0
  19. pico-decoder-tiny-dolma250M-v1/checkpoints/step_102000/pico_decoder.py +911 -0
  20. pico-decoder-tiny-dolma250M-v1/checkpoints/step_102000/special_tokens_map.json +16 -0
  21. pico-decoder-tiny-dolma250M-v1/checkpoints/step_102000/tokenizer.json +0 -0
  22. pico-decoder-tiny-dolma250M-v1/checkpoints/step_102000/tokenizer_config.json +239 -0
  23. pico-decoder-tiny-dolma250M-v1/checkpoints/step_104000/config.json +22 -0
  24. pico-decoder-tiny-dolma250M-v1/checkpoints/step_104000/fabric_state/checkpoint.pt +3 -0
  25. pico-decoder-tiny-dolma250M-v1/checkpoints/step_104000/generation_config.json +4 -0
  26. pico-decoder-tiny-dolma250M-v1/checkpoints/step_104000/model.safetensors +3 -0
  27. pico-decoder-tiny-dolma250M-v1/checkpoints/step_104000/pico_decoder.py +911 -0
  28. pico-decoder-tiny-dolma250M-v1/checkpoints/step_104000/special_tokens_map.json +16 -0
  29. pico-decoder-tiny-dolma250M-v1/checkpoints/step_104000/tokenizer.json +0 -0
  30. pico-decoder-tiny-dolma250M-v1/checkpoints/step_104000/tokenizer_config.json +239 -0
  31. pico-decoder-tiny-dolma250M-v1/eval_results/step_102000.json +1 -0
  32. pico-decoder-tiny-dolma250M-v1/logs/log_20250831_162326.log +269 -0
.gitattributes CHANGED
@@ -982,3 +982,13 @@ pico-decoder-tiny-dolma250M-v1/checkpoints/step_98000/learning_dynamics/train_da
  pico-decoder-tiny-dolma250M-v1/checkpoints/step_98000/learning_dynamics/train_gradients.pt filter=lfs diff=lfs merge=lfs -text
  pico-decoder-tiny-dolma250M-v1/checkpoints/step_98000/learning_dynamics/train_weights.pt filter=lfs diff=lfs merge=lfs -text
  pico-decoder-tiny-dolma250M-v1/checkpoints/step_98000/model.safetensors filter=lfs diff=lfs merge=lfs -text
+ pico-decoder-tiny-dolma250M-v1/checkpoints/step_100000/learning_dynamics/train_data/data-00000-of-00001.arrow filter=lfs diff=lfs merge=lfs -text
+ pico-decoder-tiny-dolma250M-v1/checkpoints/step_100000/learning_dynamics/train_gradients.pt filter=lfs diff=lfs merge=lfs -text
+ pico-decoder-tiny-dolma250M-v1/checkpoints/step_100000/learning_dynamics/train_weights.pt filter=lfs diff=lfs merge=lfs -text
+ pico-decoder-tiny-dolma250M-v1/checkpoints/step_102000/fabric_state/checkpoint.pt filter=lfs diff=lfs merge=lfs -text
+ pico-decoder-tiny-dolma250M-v1/checkpoints/step_102000/learning_dynamics/train_data/data-00000-of-00001.arrow filter=lfs diff=lfs merge=lfs -text
+ pico-decoder-tiny-dolma250M-v1/checkpoints/step_102000/learning_dynamics/train_gradients.pt filter=lfs diff=lfs merge=lfs -text
+ pico-decoder-tiny-dolma250M-v1/checkpoints/step_102000/learning_dynamics/train_weights.pt filter=lfs diff=lfs merge=lfs -text
+ pico-decoder-tiny-dolma250M-v1/checkpoints/step_102000/model.safetensors filter=lfs diff=lfs merge=lfs -text
+ pico-decoder-tiny-dolma250M-v1/checkpoints/step_104000/fabric_state/checkpoint.pt filter=lfs diff=lfs merge=lfs -text
+ pico-decoder-tiny-dolma250M-v1/checkpoints/step_104000/model.safetensors filter=lfs diff=lfs merge=lfs -text
pico-decoder-tiny-dolma250M-v1/checkpoints/step_100000/fabric_state/checkpoint.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9a8e38fba8b39bdce89550461657b3d2c12715e0529740b50487e5b382b7e31b
+ oid sha256:7821832b1e1fcd6692940396e6edc50fce443f493f232c38e245da9561676d9a
  size 135543171
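
An LFS pointer like the one above records only the object's SHA-256 digest and byte size; the checkpoint itself lives in LFS storage. A minimal sketch (not part of the commit; the local path is hypothetical) for checking a downloaded file against the new pointer:

import hashlib
from pathlib import Path

# Hypothetical local copy of the LFS object referenced by the pointer above.
local_file = Path("pico-decoder-tiny-dolma250M-v1/checkpoints/step_100000/fabric_state/checkpoint.pt")

expected_oid = "7821832b1e1fcd6692940396e6edc50fce443f493f232c38e245da9561676d9a"  # oid from the new pointer
expected_size = 135543171  # size from the pointer

data = local_file.read_bytes()
assert len(data) == expected_size, "size does not match the LFS pointer"
assert hashlib.sha256(data).hexdigest() == expected_oid, "sha256 does not match the LFS pointer"
print("local file matches the LFS pointer")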
pico-decoder-tiny-dolma250M-v1/checkpoints/step_100000/learning_dynamics/train_activations.pt ADDED
Binary file (98.3 kB).
 
pico-decoder-tiny-dolma250M-v1/checkpoints/step_100000/learning_dynamics/train_data/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c563e1b5b21a23ec6c9e50ea1a3ff547984bf10de5016d87b310deb6c2d7b333
+ size 276480
pico-decoder-tiny-dolma250M-v1/checkpoints/step_100000/learning_dynamics/train_data/dataset_info.json ADDED
@@ -0,0 +1,19 @@
+ {
+   "citation": "",
+   "description": "",
+   "features": {
+     "input_ids": {
+       "feature": {
+         "dtype": "int32",
+         "_type": "Value"
+       },
+       "_type": "Sequence"
+     },
+     "text": {
+       "dtype": "string",
+       "_type": "Value"
+     }
+   },
+   "homepage": "",
+   "license": ""
+ }
pico-decoder-tiny-dolma250M-v1/checkpoints/step_100000/learning_dynamics/train_data/state.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "_data_files": [
+     {
+       "filename": "data-00000-of-00001.arrow"
+     }
+   ],
+   "_fingerprint": "0f66378a2401a0b7",
+   "_format_columns": null,
+   "_format_kwargs": {},
+   "_format_type": null,
+   "_output_all_columns": false,
+   "_split": null
+ }
pico-decoder-tiny-dolma250M-v1/checkpoints/step_100000/learning_dynamics/train_gradients.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:852bbfeb8c97f09cadb3d7da918a6e72dd2810785847b156b295731b5233a900
+ size 2371527
pico-decoder-tiny-dolma250M-v1/checkpoints/step_100000/learning_dynamics/train_weights.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ca1c52d716f2f6090ed01b61b7d81a6b890637239c04ca310e52d23845963cbc
+ size 2371443
pico-decoder-tiny-dolma250M-v1/checkpoints/step_102000/config.json ADDED
@@ -0,0 +1,22 @@
+ {
+   "activation_hidden_dim": 384,
+   "architectures": [
+     "PicoDecoderHF"
+   ],
+   "attention_n_heads": 12,
+   "attention_n_kv_heads": 4,
+   "auto_map": {
+     "AutoConfig": "pico_decoder.PicoDecoderHFConfig",
+     "AutoModelForCausalLM": "pico_decoder.PicoDecoderHF"
+   },
+   "batch_size": 1024,
+   "d_model": 96,
+   "max_seq_len": 2048,
+   "model_type": "pico_decoder",
+   "n_layers": 12,
+   "norm_eps": 1e-06,
+   "position_emb_theta": 10000.0,
+   "torch_dtype": "float32",
+   "transformers_version": "4.48.3",
+   "vocab_size": 50304
+ }
pico-decoder-tiny-dolma250M-v1/checkpoints/step_102000/fabric_state/checkpoint.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e03faf348f9fe044370ed46fe8fb6d144ea32db9df87b30d97884ad379bd07db
+ size 135543171
pico-decoder-tiny-dolma250M-v1/checkpoints/step_102000/generation_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "transformers_version": "4.48.3",
+   "vocab_size": 50304
+ }
pico-decoder-tiny-dolma250M-v1/checkpoints/step_102000/learning_dynamics/train_activations.pt ADDED
Binary file (98.3 kB).
 
pico-decoder-tiny-dolma250M-v1/checkpoints/step_102000/learning_dynamics/train_data/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:257002d56c53426ecb933f4a65f1b66cb0f13a179a31844da13d1792ffab80c9
+ size 278184
pico-decoder-tiny-dolma250M-v1/checkpoints/step_102000/learning_dynamics/train_data/dataset_info.json ADDED
@@ -0,0 +1,19 @@
+ {
+   "citation": "",
+   "description": "",
+   "features": {
+     "input_ids": {
+       "feature": {
+         "dtype": "int32",
+         "_type": "Value"
+       },
+       "_type": "Sequence"
+     },
+     "text": {
+       "dtype": "string",
+       "_type": "Value"
+     }
+   },
+   "homepage": "",
+   "license": ""
+ }
pico-decoder-tiny-dolma250M-v1/checkpoints/step_102000/learning_dynamics/train_data/state.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "_data_files": [
+     {
+       "filename": "data-00000-of-00001.arrow"
+     }
+   ],
+   "_fingerprint": "f1628a2d831f3cda",
+   "_format_columns": null,
+   "_format_kwargs": {},
+   "_format_type": null,
+   "_output_all_columns": false,
+   "_split": null
+ }
pico-decoder-tiny-dolma250M-v1/checkpoints/step_102000/learning_dynamics/train_gradients.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7266fedcfa4f2a3054badf1b4378fbbef9dac915dd694f27b2f6458ea363ac0c
+ size 2371527
pico-decoder-tiny-dolma250M-v1/checkpoints/step_102000/learning_dynamics/train_weights.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e3dab45ffb82880e9473f5b219a8fd7a1fbe79bcce195839f43072a00807e98b
+ size 2371443
pico-decoder-tiny-dolma250M-v1/checkpoints/step_102000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:99bb8d1781b101d5c6271107090b8d643ebf4c7cfc56d2b92e8a7f50902f916a
+ size 45143592
pico-decoder-tiny-dolma250M-v1/checkpoints/step_102000/pico_decoder.py ADDED
@@ -0,0 +1,911 @@
1
+ """
2
+ Pico Decoder: A Lightweight Causal Transformer Language Model
3
+
4
+ Pico Decoder uses a simple LLAMA-style transformer architecture, written for clarity and educational purposes.
5
+
6
+ Everything is written with a modular design for easy modification and experimentation.
7
+
8
+ Key features:
9
+ - RMSNorm for layer normalization
10
+ - Rotary Positional Embeddings (RoPE)
11
+ - Multi-head attention with KV-cache support
12
+ - SwiGLU activation function
13
+ - Residual connections throughout
14
+
15
+ - KV-cache for faster autoregressive generation
16
+
17
+ References:
18
+ - RoPE: https://arxiv.org/abs/2104.09864
19
+ - SwiGLU: https://arxiv.org/abs/2002.05202
20
+ - LLAMA: https://arxiv.org/abs/2302.13971
21
+
22
+ Adapted from:
23
+ - OLMO: https://github.com/allenai/OLMo
24
+ - LLAMA: https://github.com/meta/llama
25
+ """
26
+
27
+ from dataclasses import asdict
28
+ from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
29
+
30
+ import torch
31
+ import torch.nn as nn
32
+ import torch.nn.functional as F
33
+
34
+ # Handle PyTorch version compatibility for attention backend
35
+ try:
36
+ from torch.nn.attention import SDPBackend, sdpa_kernel
37
+
38
+ HAS_TORCH_ATTENTION = True
39
+ except ImportError:
40
+ # Fallback for older PyTorch versions
41
+ HAS_TORCH_ATTENTION = False
42
+ SDPBackend = None
43
+ sdpa_kernel = None
44
+
45
+ from transformers import GenerationMixin, PretrainedConfig, PreTrainedModel
46
+ from transformers.generation import GenerationConfig
47
+ from transformers.modeling_outputs import CausalLMOutput, CausalLMOutputWithPast
48
+
49
+ try:
50
+ if TYPE_CHECKING:
51
+ # We need to do this to avoid importing these when creating the HF-compatible models
52
+ from src.config import ModelConfig
53
+ except ImportError:
54
+ pass
55
+
56
+ ########################################################
57
+ #
58
+ # Layer Normalization
59
+ #
60
+ ########################################################
61
+
62
+
63
+ class RMSNorm(torch.nn.Module):
64
+ """Root Mean Square Layer Normalization.
65
+
66
+ A variant of Layer Normalization that uses RMS statistics instead of mean/variance,
67
+ resulting in improved stability and performance.
68
+
69
+ Args:
70
+ config (Union[ModelConfig, PicoHFConfig]): Configuration object containing normalization parameters
71
+ - config.norm_eps: Small constant for numerical stability
72
+ - config.d_model: Model dimension for the weight parameter
73
+
74
+ References:
75
+ https://arxiv.org/abs/1910.07467
76
+ """
77
+
78
+ def __init__(self, config: Union["ModelConfig", "PicoDecoderHFConfig"]):
79
+ super().__init__()
80
+ self.eps = config.norm_eps
81
+ self.weight = nn.Parameter(torch.ones(config.d_model))
82
+
83
+ def _norm(self, x: torch.Tensor) -> torch.Tensor:
84
+ """
85
+ Normalizes the input tensor by its RMS value.
86
+ """
87
+ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
88
+
89
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
90
+ """
91
+ Applies RMS normalization to the input tensor and scales it by the weight parameter.
92
+ """
93
+ output = self._norm(x.float()).type_as(x)
94
+ return output * self.weight
95
+
96
+
97
+ ########################################################
98
+ #
99
+ # Positional Embedding
100
+ #
101
+ ########################################################
102
+
103
+
104
+ class RoPE(nn.Module):
105
+ """Rotary Positional Embeddings (RoPE).
106
+
107
+ Implements position-dependent rotation of keys and queries in attention mechanism,
108
+ allowing better modeling of relative positions in sequences. Uses complex number
109
+ operations for efficient rotation.
110
+
111
+ Args:
112
+ config (Union[ModelConfig, PicoHFConfig]): Model configuration containing:
113
+ - config.position_emb_theta: Base for frequency computation
114
+ - config.d_model: Model dimension
115
+ - config.attention_n_heads: Number of attention heads
116
+ - config.max_seq_len: Maximum sequence length
117
+
118
+ References:
119
+ https://arxiv.org/abs/2104.09864
120
+ """
121
+
122
+ _freqs_cis_tensor: torch.Tensor | None = None
123
+
124
+ def __init__(self, config: Union["ModelConfig", "PicoDecoderHFConfig"]):
125
+ super().__init__()
126
+
127
+ self.theta = config.position_emb_theta
128
+ self.dim = config.d_model // config.attention_n_heads
129
+
130
+ max_seq_len = config.max_seq_len
131
+
132
+ # only gets set once, and then reused for all RoPE instances
133
+ if RoPE._freqs_cis_tensor is None:
134
+ RoPE._freqs_cis_tensor = self._setup_freqs_cis(
135
+ max_seq_len, self.theta, self.dim
136
+ )
137
+
138
+ # register _freqs_cis buffer
139
+ # can be easily recomputed so persistent=False
140
+ self.register_buffer("_freqs_cis", self._freqs_cis_tensor, persistent=False)
141
+
142
+ @classmethod
143
+ def _setup_freqs_cis(cls, seq_len: int, theta: float, dim: int) -> torch.Tensor:
144
+ """Setup Frequency Tensor for RoPE Embeddings
145
+
146
+ Initializes the complex frequency tensor that is used to compute the RoPE embeddings.
147
+
148
+ Note other implementations will use cos and sin directly, but using the complex
149
+ number representation is (probably) more efficient:
150
+
151
+ e^(theta * i * t) = cos(theta * t) + i * sin(theta * t) [Euler's formula]
152
+ """
153
+ _freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
154
+ positions = torch.arange(seq_len)
155
+ freqs = torch.outer(positions, _freqs)
156
+ return torch.polar(torch.ones_like(freqs), freqs) # complex64
157
+
158
+ def get_freqs_cis(
159
+ self, input_shape: torch.Size, start_pos: int, end_pos: int
160
+ ) -> torch.Tensor:
161
+ """Reshape Frequency Tensor for RoPE Embeddings
162
+
163
+ Makes the frequency tensor broadcastable with the input tensor.
164
+ """
165
+ _freqs_cis = self._freqs_cis[start_pos:end_pos]
166
+ ndim = len(input_shape)
167
+ assert 0 <= 1 < ndim
168
+ assert _freqs_cis.shape == (input_shape[1], input_shape[-1])
169
+
170
+ # TODO: Check whether this is correct (might be able to remove this)
171
+ shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(input_shape)]
172
+ return _freqs_cis.view(*shape)
173
+
174
+ def forward(
175
+ self,
176
+ queries: torch.Tensor,
177
+ keys: torch.Tensor,
178
+ start_pos: int = 0,
179
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
180
+ """Apply RoPE Embeddings to Queries and Keys
181
+
182
+ Applies the rotary positional embeddings to the input tensors via complex num multiplication
183
+
184
+ NOTE: The start_pos is used if we want to use the kv_cache in the attention mechanism.
185
+ """
186
+ queries_ = torch.view_as_complex(
187
+ queries.float().reshape(*queries.shape[:-1], -1, 2)
188
+ )
189
+ keys_ = torch.view_as_complex(keys.float().reshape(*keys.shape[:-1], -1, 2))
190
+
191
+ input_shape = (
192
+ queries_.shape
193
+ ) # same as keys: (batch_size, seq_len, n_heads, head_dim/2)
194
+ freqs_start_pos = start_pos
195
+ freqs_end_pos = freqs_start_pos + queries_.shape[1]
196
+
197
+ freqs_cis = self.get_freqs_cis(input_shape, freqs_start_pos, freqs_end_pos)
198
+
199
+ queries_rotated = torch.view_as_real(queries_ * freqs_cis).flatten(3)
200
+ keys_rotated = torch.view_as_real(keys_ * freqs_cis).flatten(3)
201
+ return queries_rotated.type_as(queries), keys_rotated.type_as(keys)
202
+
203
+
204
+ ########################################################
205
+ #
206
+ # Attention
207
+ #
208
+ ########################################################
209
+
210
+
211
+ class Attention(nn.Module):
212
+ """Multi-head Attention with Group Query Attention support.
213
+
214
+ Implements scaled dot-product attention and supports:
215
+ - Grouped Query Attention (GQA)
216
+ - Key-Value caching for efficient inference
217
+ - RoPE integration
218
+
219
+ Args:
220
+ config (Union[ModelConfig, PretrainedConfig]): Configuration containing:
221
+ - config.attention_n_heads: Number of attention heads
222
+ - config.attention_n_kv_heads: Number of key/value heads
223
+ - config.d_model: Model dimension
224
+ - config.batch_size: Maximum batch size
225
+ - config.max_seq_len: Maximum sequence length
226
+
227
+ Shape:
228
+ - Input: (batch_size, seq_len, d_model)
229
+ - Output: (batch_size, seq_len, d_model)
230
+ """
231
+
232
+ def __init__(
233
+ self,
234
+ config: Union["ModelConfig", "PicoDecoderHFConfig"],
235
+ ):
236
+ super().__init__()
237
+
238
+ self.n_heads = config.attention_n_heads
239
+ self.n_kv_heads = config.attention_n_kv_heads
240
+
241
+ self.batch_size = config.batch_size
242
+ self.max_seq_len = config.max_seq_len
243
+
244
+ d_model = config.d_model
245
+ self.head_dim = d_model // self.n_heads
246
+
247
+ self.n_rep = self.n_heads // self.n_kv_heads
248
+
249
+ self.q_proj = nn.Linear(d_model, self.n_heads * self.head_dim, bias=False)
250
+ self.k_proj = nn.Linear(d_model, self.n_kv_heads * self.head_dim, bias=False)
251
+ self.v_proj = nn.Linear(d_model, self.n_kv_heads * self.head_dim, bias=False)
252
+ self.o_proj = nn.Linear(self.n_heads * self.head_dim, d_model, bias=False)
253
+
254
+ self.rope = RoPE(config)
255
+
256
+ def forward(
257
+ self,
258
+ input: torch.Tensor,
259
+ mask: Optional[torch.Tensor] = None,
260
+ past_key_values: Optional[Tuple[torch.Tensor, ...]] = None,
261
+ use_cache: bool = False,
262
+ ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
263
+ """Forward pass for the attention mechanism.
264
+
265
+ Computes queries, keys, and values for the attention mechanism. Applies rotary positional
266
+ embeddings to the queries and keys, and then computes attention scores and outputs.
267
+
268
+ For an introduction to the attention mechanism, see:
269
+ https://arxiv.org/abs/1706.03762
270
+
271
+ A few things to note:
272
+ - The past_key_values is used to implement the KV cache, which is used to speed up
273
+ generation by caching the KV pairs from previous forward passes. This is useful when doing
274
+ tasks that require generating multiple tokens conditioned on previous tokens (e.g. language
275
+ modeling, text generation, etc.). The way the KV cache is implemented is that each layer has
276
+ its own KV cache - this KV cache is implemented as a tuple.
277
+ """
278
+ bsz, seq_len, _ = input.shape
279
+ _queries, _keys, _values = (
280
+ self.q_proj(input),
281
+ self.k_proj(input),
282
+ self.v_proj(input),
283
+ )
284
+
285
+ # Reshaping for multi-head attention
286
+ queries = _queries.view(bsz, seq_len, self.n_heads, self.head_dim)
287
+ keys = _keys.view(bsz, seq_len, self.n_kv_heads, self.head_dim)
288
+ values = _values.view(bsz, seq_len, self.n_kv_heads, self.head_dim)
289
+
290
+ # The start position is used to apply the RoPE embeddings to only the new tokens
291
+ # when using the kv_cache in the attention mechanism.
292
+ # We want to start from the last position in the cache.
293
+ start_pos = 0
294
+ if past_key_values is not None and past_key_values[0] is not None:
295
+ start_pos = past_key_values[0].shape[1]
296
+
297
+ # apply rotary positional embeddings
298
+ queries, keys = self.rope(queries, keys, start_pos)
299
+
300
+ if (
301
+ past_key_values is not None
302
+ and past_key_values[0] is not None
303
+ and past_key_values[1] is not None
304
+ ):
305
+ keys = torch.cat([past_key_values[0], keys], dim=1)
306
+ values = torch.cat([past_key_values[1], values], dim=1)
307
+
308
+ if use_cache:
309
+ cached_keys = keys
310
+ cached_values = values
311
+ else:
312
+ cached_keys = None
313
+ cached_values = None
314
+
315
+ queries = queries.transpose(1, 2)
316
+ keys = keys.transpose(1, 2)
317
+ values = values.transpose(1, 2)
318
+
319
+ apply_gqa = self.n_rep > 1
320
+ if apply_gqa and queries.device.type == "mps":
321
+ # NOTE: MPS does not support GQA in the SDPA kernel, but we can repeat the keys and values
322
+ # outside of the kernel to get the same effect.
323
+ # See: https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
324
+ keys = keys.repeat_interleave(self.n_rep, dim=-3)
325
+ values = values.repeat_interleave(self.n_rep, dim=-3)
326
+ apply_gqa = False
327
+
328
+ if HAS_TORCH_ATTENTION:
329
+ backends = [SDPBackend.CUDNN_ATTENTION, SDPBackend.MATH]
330
+ with sdpa_kernel(backends=backends):
331
+ attn_output = F.scaled_dot_product_attention(
332
+ queries.contiguous(),
333
+ keys.contiguous(),
334
+ values.contiguous(),
335
+ attn_mask=mask.to(queries.dtype) if mask is not None else None,
336
+ enable_gqa=apply_gqa,
337
+ )
338
+ else:
339
+ # Fallback for older PyTorch versions - use default backend
340
+ attn_output = F.scaled_dot_product_attention(
341
+ queries.contiguous(),
342
+ keys.contiguous(),
343
+ values.contiguous(),
344
+ attn_mask=mask.to(queries.dtype) if mask is not None else None,
345
+ enable_gqa=apply_gqa,
346
+ )
347
+
348
+ attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, seq_len, -1)
349
+ output = self.o_proj(attn_output)
350
+
351
+ return output, (cached_keys, cached_values)
352
+
353
+
354
+ ########################################################
355
+ #
356
+ # SwiGLU (Combines MLP and Activation)
357
+ #
358
+ ########################################################
359
+
360
+
361
+ class SwiGLU(nn.Module):
362
+ """SwiGLU Activation Function with Linear Projections.
363
+
364
+ Implements the SwiGLU activation function combined with linear transformations,
365
+ serving as the feed-forward network in transformer blocks.
366
+
367
+ Args:
368
+ config (Union[ModelConfig, PicoDecoderHFConfig]): Configuration containing:
369
+ - config.d_model: Model dimension
370
+ - config.activation_hidden_dim: Hidden dimension (typically 4 * d_model)
371
+
372
+ References:
373
+ https://arxiv.org/abs/2002.05202
374
+ """
375
+
376
+ def __init__(self, config: Union["ModelConfig", "PicoDecoderHFConfig"]):
377
+ super().__init__()
378
+
379
+ model_dim = config.d_model
380
+ act_hidden_dim = config.activation_hidden_dim # usually 4 * d_model
381
+
382
+ self.w_0 = nn.Linear(model_dim, act_hidden_dim, bias=False)
383
+ self.w_1 = nn.Linear(model_dim, act_hidden_dim, bias=False)
384
+ self.w_2 = nn.Linear(act_hidden_dim, model_dim, bias=False)
385
+
386
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
387
+ return self.w_2(F.silu(self.w_0(x)) * self.w_1(x))
388
+
389
+
390
+ ########################################################
391
+ #
392
+ # PicoDecoderBlock
393
+ #
394
+ ########################################################
395
+
396
+
397
+ class PicoDecoderBlock(nn.Module):
398
+ """Single Transformer Block with Attention and Feed-forward layers.
399
+
400
+ Implements a standard transformer block with:
401
+ - Multi-head attention with normalization and residual connection
402
+ - SwiGLU feed-forward network with normalization and residual connection
403
+
404
+ Args:
405
+ config (Union[ModelConfig, PicoDecoderHFConfig]): Model configuration; either a dataclass or
406
+ a HuggingFace PicoDecoderHFConfig
407
+ """
408
+
409
+ def __init__(
410
+ self,
411
+ config: Union["ModelConfig", "PicoDecoderHFConfig"],
412
+ ):
413
+ super().__init__()
414
+
415
+ self.attention = Attention(config)
416
+ self.swiglu = SwiGLU(config)
417
+ self.attention_norm = RMSNorm(config)
418
+ self.swiglu_norm = RMSNorm(config)
419
+
420
+ def forward(
421
+ self,
422
+ input: torch.Tensor,
423
+ mask: Optional[torch.Tensor] = None,
424
+ past_key_values: Optional[Tuple[torch.Tensor]] = None,
425
+ use_cache: bool = False,
426
+ ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
427
+ attention_output, cached_key_values = self.attention(
428
+ self.attention_norm(input),
429
+ mask=mask,
430
+ past_key_values=past_key_values,
431
+ use_cache=use_cache,
432
+ )
433
+ # NOTE: cached_key_values is None if use_cache is False
434
+
435
+ h = input + attention_output
436
+ out = h + self.swiglu(self.swiglu_norm(h))
437
+ return out, cached_key_values
438
+
439
+
440
+ ########################################################
441
+ #
442
+ # Pico Decoder (Causal Transformer Model)
443
+ #
444
+ ########################################################
445
+
446
+
447
+ class PicoDecoder(nn.Module):
448
+ """
449
+ Pico Decoder: combines the embedding, causal decoder blocks, and output projection into a
450
+ single autoregressive model.
451
+
452
+ For more information on the model, see the classes for the modules that make up the model.
453
+ """
454
+
455
+ def __init__(
456
+ self,
457
+ model_config: Union["ModelConfig", "PicoDecoderHFConfig"],
458
+ ):
459
+ super().__init__()
460
+ self.config = model_config
461
+
462
+ self.embedding_proj = nn.Embedding(self.config.vocab_size, self.config.d_model)
463
+ self.layers = nn.ModuleList(
464
+ [PicoDecoderBlock(self.config) for _ in range(self.config.n_layers)]
465
+ )
466
+ self.output_norm = RMSNorm(self.config)
467
+ self.de_embedding_proj = nn.Linear(
468
+ self.config.d_model, self.config.vocab_size, bias=False
469
+ )
470
+
471
+ def convert_to_hf_model(self) -> "PicoDecoderHF":
472
+ """Convert the Lightning model to a HuggingFace model."""
473
+ # Create HF config without fabric-specific settings
474
+ hf_config = PicoDecoderHFConfig.from_dataclass(self.config)
475
+
476
+ # Create new HF model
477
+ hf_model = PicoDecoderHF(hf_config)
478
+
479
+ # Copy state dict, excluding fabric-specific keys
480
+ hf_model.load_state_dict(self.state_dict(prefix="pico_decoder."))
481
+
482
+ return hf_model
483
+
484
+ def forward(
485
+ self,
486
+ input_ids: torch.Tensor,
487
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
488
+ use_cache: bool = False,
489
+ ) -> Tuple[torch.Tensor, Optional[Tuple[Tuple[torch.Tensor, torch.Tensor]]]]:
490
+ """
491
+ This is the forward pass for the entire Pico model. It boils down to:
492
+ - Embedding the input ids
493
+ - Creating a causal mask
494
+ - Processing through the pico layers
495
+ - Projecting the output to logits
496
+
497
+ NOTE: One feature that might be confusing is the KV cache. The KV cache is used to speed up
498
+ generation by caching the KV pairs from previous forward passes. This is useful when doing
499
+ tasks that require generating multiple tokens conditioned on previous tokens (e.g. language
500
+ modeling, text generation, etc.). The way the KV cache is implemented is that each layer has
501
+ its own KV cache which is stored as a tuple. The whole model then stores a tuple of these
502
+ KV caches (so a tuple of tuples).
503
+ """
504
+
505
+ seq_len = input_ids.shape[-1]
506
+ h = self.embedding_proj(input_ids)
507
+
508
+ # Calculate start position from past cached KV pairs. Remember that each layer has its
509
+ # own KV Cache. So when we index past_key_values, we need to index into the KV pairs for the
510
+ # correct layer and then for either the keys or values.
511
+ start_pos = 0
512
+ if (
513
+ past_key_values is not None
514
+ and past_key_values[0] is not None
515
+ and past_key_values[0][0] is not None
516
+ ):
517
+ start_pos = past_key_values[0][0].shape[1]
518
+
519
+ # Create causal mask for current sequence
520
+ mask = None
521
+ if seq_len > 1:
522
+ mask = torch.full((seq_len, seq_len), float("-inf"))
523
+ mask = torch.triu(mask, diagonal=1)
524
+
525
+ # If using KV cache, extend mask to cover cached sequence length
526
+ if past_key_values is not None:
527
+ # Add zeros for cached tokens (we can attend to all of them)
528
+ mask = torch.hstack([torch.zeros((seq_len, start_pos)), mask])
529
+
530
+ mask = mask.to(h.device)
531
+
532
+ # NOTE: If we are using the cache, we need to store the cached KV pairs for each layer
533
+ # in a tuple. Each layer will have its own cached KV pair which we aggregate in a tuple.
534
+ cached_key_values = () if use_cache else None
535
+
536
+ # Process through transformer blocks
537
+ for idx, layer in enumerate(self.layers):
538
+ layer_past_key_values = None
539
+ if past_key_values is not None:
540
+ try:
541
+ # Handle both tuple-based cache and HuggingFace cache objects
542
+ if hasattr(past_key_values, "__getitem__") and idx < len(
543
+ past_key_values
544
+ ):
545
+ layer_past_key_values = past_key_values[idx]
546
+ except (KeyError, IndexError, TypeError):
547
+ # If we can't access the cache properly, just skip it
548
+ layer_past_key_values = None
549
+
550
+ h, layer_cached_key_values = layer(
551
+ h, mask=mask, past_key_values=layer_past_key_values, use_cache=use_cache
552
+ )
553
+
554
+ if use_cache:
555
+ cached_key_values += (layer_cached_key_values,)
556
+
557
+ # Final norm and projection
558
+ h = self.output_norm(h)
559
+ logits = self.de_embedding_proj(h).float()
560
+
561
+ return logits, cached_key_values
562
+
563
+
564
+ ########################################################
565
+ #
566
+ # HuggingFace Wrapper for the Pico Decoder model.
567
+ #
568
+ ########################################################
569
+
570
+
571
+ class PicoDecoderHFConfig(PretrainedConfig):
572
+ """Config class for the Pico Decoder HuggingFace wrapper."""
573
+
574
+ model_type = "pico_decoder"
575
+
576
+ @classmethod
577
+ def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "PicoDecoderHFConfig":
578
+ """
579
+ Initialize config from a dictionary. Note that no kwargs are passed to the constructor --
580
+ this is because with some kwargs special handling is required and can make this class
581
+ brittle.
582
+ """
583
+ pico_config = cls(**config_dict)
584
+
585
+ return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
586
+ unused_kwargs = {
587
+ key: value for key, value in kwargs.items() if not hasattr(pico_config, key)
588
+ }
589
+
590
+ if return_unused_kwargs:
591
+ return pico_config, unused_kwargs
592
+ return pico_config
593
+
594
+ @classmethod
595
+ def from_dataclass(cls, model_config: "ModelConfig"):
596
+ """Initialise from our custom config dataclass."""
597
+ return cls.from_dict(asdict(model_config))
598
+
599
+
600
+ class PicoDecoderHF(PreTrainedModel, GenerationMixin):
601
+ """
602
+ HuggingFace wrapper for the Pico model with generation support.
603
+
604
+ Many evaluation frameworks require a model be setup as a HuggingFace model, so we provide a simple
605
+ wrapper that does just that. When we save checkpoints of the Pico model, we save both the normal
606
+ Pico model as well as the model wrapped in this HuggingFace class.
607
+
608
+ This also lets you do cool things like:
609
+
610
+ `model = AutoModelForCausalLM.from_pretrained("path/to/checkpoint")`
611
+ """
612
+
613
+ config_class = PicoDecoderHFConfig
614
+ _no_split_modules = ["PicoBlock", "Attention", "SwiGLU", "RMSNorm"]
615
+ main_input_name = "input_ids"
616
+
617
+ def __init__(self, config: PicoDecoderHFConfig):
618
+ super().__init__(config)
619
+ self.pico_decoder = PicoDecoder(config)
620
+ # Initialize generation config with defaults
621
+ self.generation_config = GenerationConfig()
622
+ # Set some reasonable defaults for the model
623
+ if hasattr(config, "max_position_embeddings"):
624
+ self.generation_config.max_length = config.max_position_embeddings
625
+ if hasattr(config, "vocab_size"):
626
+ self.generation_config.vocab_size = config.vocab_size
627
+
628
+ def forward(
629
+ self,
630
+ input_ids: torch.Tensor,
631
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
632
+ use_cache: bool = False,
633
+ **kwargs,
634
+ ) -> Union[CausalLMOutput, CausalLMOutputWithPast]:
635
+ """HuggingFace forward pass wrapper.
636
+
637
+ Forwards pass for the HuggingFace version of the Pico Model. Basic wrapper around the
638
+ Pico model's forward pass, and returns the output as a HuggingFace CausalLMOutput.
639
+ """
640
+ logits, past_key_values = self.pico_decoder(
641
+ input_ids, past_key_values, use_cache
642
+ )
643
+ if use_cache:
644
+ return CausalLMOutputWithPast(
645
+ logits=logits,
646
+ past_key_values=past_key_values,
647
+ )
648
+ else:
649
+ return CausalLMOutput(
650
+ logits=logits,
651
+ )
652
+
653
+ def prepare_inputs_for_generation(
654
+ self,
655
+ input_ids: torch.LongTensor,
656
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
657
+ attention_mask: Optional[torch.LongTensor] = None,
658
+ **kwargs,
659
+ ) -> Dict[str, Any]:
660
+ """
661
+ Prepare inputs for generation.
662
+
663
+ Args:
664
+ input_ids: Input token IDs
665
+ past_key_values: Cached key-value pairs from previous forward passes
666
+ attention_mask: Attention mask for the input
667
+ **kwargs: Additional arguments
668
+
669
+ Returns:
670
+ Dictionary containing prepared inputs
671
+ """
672
+ # If we have past_key_values, we only need the last token
673
+ if past_key_values is not None:
674
+ input_ids = input_ids[:, -1:]
675
+
676
+ return {
677
+ "input_ids": input_ids,
678
+ "past_key_values": past_key_values,
679
+ "use_cache": True,
680
+ }
681
+
682
+ def get_input_embeddings(self):
683
+ """Get the input embeddings layer."""
684
+ return self.pico_decoder.embedding_proj
685
+
686
+ def set_input_embeddings(self, value):
687
+ """Set the input embeddings layer."""
688
+ self.pico_decoder.embedding_proj = value
689
+
690
+ def get_output_embeddings(self):
691
+ """Get the output embeddings layer."""
692
+ return self.pico_decoder.de_embedding_proj
693
+
694
+ def set_output_embeddings(self, value):
695
+ """Set the output embeddings layer."""
696
+ self.pico_decoder.de_embedding_proj = value
697
+
698
+ def get_lm_head(self):
699
+ """Get the language model head."""
700
+ return self.pico_decoder.de_embedding_proj
701
+
702
+ def can_generate(self) -> bool:
703
+ """Check if the model can generate text."""
704
+ return True
705
+
706
+ @property
707
+ def is_encoder_decoder(self) -> bool:
708
+ """Check if the model is an encoder-decoder model."""
709
+ return False
710
+
711
+ @property
712
+ def can_use_cache(self) -> bool:
713
+ """Check if the model can use KV cache."""
714
+ return True
715
+
716
+ def resize_token_embeddings(
717
+ self, new_num_tokens: Optional[int] = None
718
+ ) -> torch.nn.Embedding:
719
+ """Resize token embeddings."""
720
+ old_embeddings = self.get_input_embeddings()
721
+ if new_num_tokens is None:
722
+ new_num_tokens = old_embeddings.num_embeddings
723
+
724
+ new_embeddings = torch.nn.Embedding(
725
+ new_num_tokens, old_embeddings.embedding_dim
726
+ )
727
+ new_embeddings.weight.data[: old_embeddings.num_embeddings] = (
728
+ old_embeddings.weight.data
729
+ )
730
+
731
+ self.pico_decoder.embedding_proj = new_embeddings
732
+ self.pico_decoder.de_embedding_proj = torch.nn.Linear(
733
+ old_embeddings.embedding_dim, new_num_tokens, bias=False
734
+ )
735
+
736
+ return new_embeddings
737
+
738
+
739
+ # Register for auto classes
740
+ PicoDecoderHFConfig.register_for_auto_class()
741
+ PicoDecoderHF.register_for_auto_class("AutoModel")
742
+ PicoDecoderHF.register_for_auto_class("AutoModelForCausalLM")
743
+
744
+
745
+ ########################################################
746
+ #
747
+ # New PicoDecoderForCausalLM class for generation support
748
+ #
749
+ ########################################################
750
+
751
+
752
+ class PicoDecoderForCausalLM(PreTrainedModel, GenerationMixin):
753
+ """
754
+ PicoDecoderForCausalLM: A HuggingFace-compatible model that properly supports generation.
755
+
756
+ This class is designed to work with existing checkpoints and provides full generation support.
757
+ It inherits from the right base classes that HuggingFace expects for text generation.
758
+ """
759
+
760
+ config_class = PicoDecoderHFConfig
761
+ _no_split_modules = ["PicoBlock", "Attention", "SwiGLU", "RMSNorm"]
762
+ main_input_name = "input_ids"
763
+
764
+ def __init__(self, config: PicoDecoderHFConfig):
765
+ super().__init__(config)
766
+ self.pico_decoder = PicoDecoder(config)
767
+ # Initialize generation config with defaults
768
+ self.generation_config = GenerationConfig()
769
+ # Set some reasonable defaults for the model
770
+ if hasattr(config, "max_position_embeddings"):
771
+ self.generation_config.max_length = config.max_position_embeddings
772
+ if hasattr(config, "vocab_size"):
773
+ self.generation_config.vocab_size = config.vocab_size
774
+
775
+ def forward(
776
+ self,
777
+ input_ids: torch.Tensor,
778
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
779
+ use_cache: bool = False,
780
+ **kwargs,
781
+ ) -> Union[CausalLMOutput, CausalLMOutputWithPast]:
782
+ """Forward pass for text generation."""
783
+ logits, past_key_values = self.pico_decoder(
784
+ input_ids, past_key_values, use_cache
785
+ )
786
+ if use_cache:
787
+ return CausalLMOutputWithPast(
788
+ logits=logits,
789
+ past_key_values=past_key_values,
790
+ )
791
+ else:
792
+ return CausalLMOutput(
793
+ logits=logits,
794
+ )
795
+
796
+ def prepare_inputs_for_generation(
797
+ self,
798
+ input_ids: torch.LongTensor,
799
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
800
+ attention_mask: Optional[torch.LongTensor] = None,
801
+ **kwargs,
802
+ ) -> Dict[str, Any]:
803
+ """Prepare inputs for generation."""
804
+ # If we have past_key_values, we only need the last token
805
+ if past_key_values is not None:
806
+ input_ids = input_ids[:, -1:]
807
+
808
+ return {
809
+ "input_ids": input_ids,
810
+ "past_key_values": past_key_values,
811
+ "use_cache": True,
812
+ }
813
+
814
+ def get_input_embeddings(self):
815
+ """Get the input embeddings layer."""
816
+ return self.pico_decoder.embedding_proj
817
+
818
+ def set_input_embeddings(self, value):
819
+ """Set the input embeddings layer."""
820
+ self.pico_decoder.embedding_proj = value
821
+
822
+ def get_output_embeddings(self):
823
+ """Get the output embeddings layer."""
824
+ return self.pico_decoder.de_embedding_proj
825
+
826
+ def set_output_embeddings(self, value):
827
+ """Set the output embeddings layer."""
828
+ self.pico_decoder.de_embedding_proj = value
829
+
830
+ def get_lm_head(self):
831
+ """Get the language model head."""
832
+ return self.pico_decoder.de_embedding_proj
833
+
834
+ def can_generate(self) -> bool:
835
+ """Check if the model can generate text."""
836
+ return True
837
+
838
+ @property
839
+ def is_encoder_decoder(self) -> bool:
840
+ """Check if the model is an encoder-decoder model."""
841
+ return False
842
+
843
+ @property
844
+ def can_use_cache(self) -> bool:
845
+ """Check if the model can use KV cache."""
846
+ return True
847
+
848
+ def resize_token_embeddings(
849
+ self, new_num_tokens: Optional[int] = None
850
+ ) -> torch.nn.Embedding:
851
+ """Resize token embeddings."""
852
+ old_embeddings = self.get_input_embeddings()
853
+ if new_num_tokens is None:
854
+ new_num_tokens = old_embeddings.num_embeddings
855
+
856
+ new_embeddings = torch.nn.Embedding(
857
+ new_num_tokens, old_embeddings.embedding_dim
858
+ )
859
+ new_embeddings.weight.data[: old_embeddings.num_embeddings] = (
860
+ old_embeddings.weight.data
861
+ )
862
+
863
+ self.pico_decoder.embedding_proj = new_embeddings
864
+ self.pico_decoder.de_embedding_proj = torch.nn.Linear(
865
+ old_embeddings.embedding_dim, new_num_tokens, bias=False
866
+ )
867
+
868
+ return new_embeddings
869
+
870
+ @classmethod
871
+ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
872
+ """
873
+ Load a pretrained model from a checkpoint.
874
+
875
+ This method handles loading from both the old PicoDecoderHF format and the new format.
876
+ """
877
+ # First try to load with the new class
878
+ try:
879
+ return super().from_pretrained(
880
+ pretrained_model_name_or_path, *model_args, **kwargs
881
+ )
882
+ except Exception as e:
883
+ print(f"Failed to load with new class: {e}")
884
+ print("Attempting to load with legacy class and convert...")
885
+
886
+ # Try to load with the old class and convert
887
+ try:
888
+ from transformers import AutoModel
889
+
890
+ old_model = AutoModel.from_pretrained(
891
+ pretrained_model_name_or_path,
892
+ trust_remote_code=True,
893
+ *model_args,
894
+ **kwargs,
895
+ )
896
+
897
+ # Create new model instance
898
+ new_model = cls(old_model.config)
899
+
900
+ # Copy state dict
901
+ new_model.load_state_dict(old_model.state_dict(), strict=False)
902
+
903
+ return new_model
904
+
905
+ except Exception as e2:
906
+ print(f"Failed to convert from legacy format: {e2}")
907
+ raise e
908
+
909
+
910
+ # Register the new class
911
+ PicoDecoderForCausalLM.register_for_auto_class("AutoModelForCausalLM")
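
The config.json files in this commit map AutoConfig and AutoModelForCausalLM to the custom classes in pico_decoder.py above, and the file's docstring notes that checkpoints can be loaded through AutoModelForCausalLM. A minimal usage sketch (not part of the commit; the local path is hypothetical, and trust_remote_code is assumed to be acceptable for this repository):

from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint_dir = "pico-decoder-tiny-dolma250M-v1/checkpoints/step_102000"  # hypothetical local path

# trust_remote_code lets transformers import the PicoDecoderHF classes named in auto_map.
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = AutoModelForCausalLM.from_pretrained(checkpoint_dir, trust_remote_code=True)

inputs = tokenizer("The quick brown fox", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0]))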
pico-decoder-tiny-dolma250M-v1/checkpoints/step_102000/special_tokens_map.json ADDED
@@ -0,0 +1,16 @@
+ {
+   "eos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|padding|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
pico-decoder-tiny-dolma250M-v1/checkpoints/step_102000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
pico-decoder-tiny-dolma250M-v1/checkpoints/step_102000/tokenizer_config.json ADDED
@@ -0,0 +1,239 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": false,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "|||IP_ADDRESS|||",
8
+ "lstrip": false,
9
+ "normalized": true,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": false
13
+ },
14
+ "1": {
15
+ "content": "<|padding|>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "50254": {
23
+ "content": " ",
24
+ "lstrip": false,
25
+ "normalized": true,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": false
29
+ },
30
+ "50255": {
31
+ "content": " ",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": false
37
+ },
38
+ "50256": {
39
+ "content": " ",
40
+ "lstrip": false,
41
+ "normalized": true,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": false
45
+ },
46
+ "50257": {
47
+ "content": " ",
48
+ "lstrip": false,
49
+ "normalized": true,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": false
53
+ },
54
+ "50258": {
55
+ "content": " ",
56
+ "lstrip": false,
57
+ "normalized": true,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": false
61
+ },
62
+ "50259": {
63
+ "content": " ",
64
+ "lstrip": false,
65
+ "normalized": true,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": false
69
+ },
70
+ "50260": {
71
+ "content": " ",
72
+ "lstrip": false,
73
+ "normalized": true,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": false
77
+ },
78
+ "50261": {
79
+ "content": " ",
80
+ "lstrip": false,
81
+ "normalized": true,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": false
85
+ },
86
+ "50262": {
87
+ "content": " ",
88
+ "lstrip": false,
89
+ "normalized": true,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": false
93
+ },
94
+ "50263": {
95
+ "content": " ",
96
+ "lstrip": false,
97
+ "normalized": true,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": false
101
+ },
102
+ "50264": {
103
+ "content": " ",
104
+ "lstrip": false,
105
+ "normalized": true,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": false
109
+ },
110
+ "50265": {
111
+ "content": " ",
112
+ "lstrip": false,
113
+ "normalized": true,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": false
117
+ },
118
+ "50266": {
119
+ "content": " ",
120
+ "lstrip": false,
121
+ "normalized": true,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": false
125
+ },
126
+ "50267": {
127
+ "content": " ",
128
+ "lstrip": false,
129
+ "normalized": true,
130
+ "rstrip": false,
131
+ "single_word": false,
132
+ "special": false
133
+ },
134
+ "50268": {
135
+ "content": " ",
136
+ "lstrip": false,
137
+ "normalized": true,
138
+ "rstrip": false,
139
+ "single_word": false,
140
+ "special": false
141
+ },
142
+ "50269": {
143
+ "content": " ",
144
+ "lstrip": false,
145
+ "normalized": true,
146
+ "rstrip": false,
147
+ "single_word": false,
148
+ "special": false
149
+ },
150
+ "50270": {
151
+ "content": " ",
152
+ "lstrip": false,
153
+ "normalized": true,
154
+ "rstrip": false,
155
+ "single_word": false,
156
+ "special": false
157
+ },
158
+ "50271": {
159
+ "content": " ",
160
+ "lstrip": false,
161
+ "normalized": true,
162
+ "rstrip": false,
163
+ "single_word": false,
164
+ "special": false
165
+ },
166
+ "50272": {
167
+ "content": " ",
168
+ "lstrip": false,
169
+ "normalized": true,
170
+ "rstrip": false,
171
+ "single_word": false,
172
+ "special": false
173
+ },
174
+ "50273": {
175
+ "content": " ",
176
+ "lstrip": false,
177
+ "normalized": true,
178
+ "rstrip": false,
179
+ "single_word": false,
180
+ "special": false
181
+ },
182
+ "50274": {
183
+ "content": " ",
184
+ "lstrip": false,
185
+ "normalized": true,
186
+ "rstrip": false,
187
+ "single_word": false,
188
+ "special": false
189
+ },
190
+ "50275": {
191
+ "content": " ",
192
+ "lstrip": false,
193
+ "normalized": true,
194
+ "rstrip": false,
195
+ "single_word": false,
196
+ "special": false
197
+ },
198
+ "50276": {
199
+ "content": " ",
200
+ "lstrip": false,
201
+ "normalized": true,
202
+ "rstrip": false,
203
+ "single_word": false,
204
+ "special": false
205
+ },
206
+ "50277": {
207
+ "content": "|||EMAIL_ADDRESS|||",
208
+ "lstrip": false,
209
+ "normalized": true,
210
+ "rstrip": false,
211
+ "single_word": false,
212
+ "special": false
213
+ },
214
+ "50278": {
215
+ "content": "|||PHONE_NUMBER|||",
216
+ "lstrip": false,
217
+ "normalized": true,
218
+ "rstrip": false,
219
+ "single_word": false,
220
+ "special": false
221
+ },
222
+ "50279": {
223
+ "content": "<|endoftext|>",
224
+ "lstrip": false,
225
+ "normalized": false,
226
+ "rstrip": false,
227
+ "single_word": false,
228
+ "special": true
229
+ }
230
+ },
231
+ "bos_token": null,
232
+ "clean_up_tokenization_spaces": true,
233
+ "eos_token": "<|endoftext|>",
234
+ "extra_special_tokens": {},
235
+ "model_max_length": 1000000000000000019884624838656,
236
+ "pad_token": "<|padding|>",
237
+ "tokenizer_class": "GPTNeoXTokenizer",
238
+ "unk_token": null
239
+ }
pico-decoder-tiny-dolma250M-v1/checkpoints/step_104000/config.json ADDED
@@ -0,0 +1,22 @@
+ {
+   "activation_hidden_dim": 384,
+   "architectures": [
+     "PicoDecoderHF"
+   ],
+   "attention_n_heads": 12,
+   "attention_n_kv_heads": 4,
+   "auto_map": {
+     "AutoConfig": "pico_decoder.PicoDecoderHFConfig",
+     "AutoModelForCausalLM": "pico_decoder.PicoDecoderHF"
+   },
+   "batch_size": 1024,
+   "d_model": 96,
+   "max_seq_len": 2048,
+   "model_type": "pico_decoder",
+   "n_layers": 12,
+   "norm_eps": 1e-06,
+   "position_emb_theta": 10000.0,
+   "torch_dtype": "float32",
+   "transformers_version": "4.48.3",
+   "vocab_size": 50304
+ }
pico-decoder-tiny-dolma250M-v1/checkpoints/step_104000/fabric_state/checkpoint.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f39d8040ef9e3d9b47f2a819749ffc0c39ad0a6aa2c965cd8d6106726b2e55d6
+ size 135543171
pico-decoder-tiny-dolma250M-v1/checkpoints/step_104000/generation_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "transformers_version": "4.48.3",
+   "vocab_size": 50304
+ }
pico-decoder-tiny-dolma250M-v1/checkpoints/step_104000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7d2c1947251972d2ba272b673761ef16d72a8f30e3967f821245685a51c8347c
+ size 45143592
pico-decoder-tiny-dolma250M-v1/checkpoints/step_104000/pico_decoder.py ADDED
@@ -0,0 +1,911 @@
1
+ """
2
+ Pico Decoder: A Lightweight Causal Transformer Language Model
3
+
4
+ Pico Decoder uses a simple LLAMA-style transformer architecture, written for clarity and educational purposes.
5
+
6
+ Everything is written with a modular design for easy modification and experimentation.
7
+
8
+ Key features:
9
+ - RMSNorm for layer normalization
10
+ - Rotary Positional Embeddings (RoPE)
11
+ - Multi-head attention with KV-cache support
12
+ - SwiGLU activation function
13
+ - Residual connections throughout
14
+
15
+ - KV-cache for faster autoregressive generation
16
+
17
+ References:
18
+ - RoPE: https://arxiv.org/abs/2104.09864
19
+ - SwiGLU: https://arxiv.org/abs/2002.05202
20
+ - LLAMA: https://arxiv.org/abs/2302.13971
21
+
22
+ Adapted from:
23
+ - OLMO: https://github.com/allenai/OLMo
24
+ - LLAMA: https://github.com/meta/llama
25
+ """
26
+
27
+ from dataclasses import asdict
28
+ from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
29
+
30
+ import torch
31
+ import torch.nn as nn
32
+ import torch.nn.functional as F
33
+
34
+ # Handle PyTorch version compatibility for attention backend
35
+ try:
36
+ from torch.nn.attention import SDPBackend, sdpa_kernel
37
+
38
+ HAS_TORCH_ATTENTION = True
39
+ except ImportError:
40
+ # Fallback for older PyTorch versions
41
+ HAS_TORCH_ATTENTION = False
42
+ SDPBackend = None
43
+ sdpa_kernel = None
44
+
45
+ from transformers import GenerationMixin, PretrainedConfig, PreTrainedModel
46
+ from transformers.generation import GenerationConfig
47
+ from transformers.modeling_outputs import CausalLMOutput, CausalLMOutputWithPast
48
+
49
+ try:
50
+ if TYPE_CHECKING:
51
+ # We need to do this to avoid importing these when creating the HF-compatible models
52
+ from src.config import ModelConfig
53
+ except ImportError:
54
+ pass
55
+
56
+ ########################################################
57
+ #
58
+ # Layer Normalization
59
+ #
60
+ ########################################################
61
+
62
+
63
+ class RMSNorm(torch.nn.Module):
64
+ """Root Mean Square Layer Normalization.
65
+
66
+ A variant of Layer Normalization that uses RMS statistics instead of mean/variance,
67
+ resulting in improved stability and performance.
68
+
69
+ Args:
70
+ config (Union[ModelConfig, PicoHFConfig]): Configuration object containing normalization parameters
71
+ - config.norm_eps: Small constant for numerical stability
72
+ - config.d_model: Model dimension for the weight parameter
73
+
74
+ References:
75
+ https://arxiv.org/abs/1910.07467
76
+ """
77
+
78
+ def __init__(self, config: Union["ModelConfig", "PicoDecoderHFConfig"]):
79
+ super().__init__()
80
+ self.eps = config.norm_eps
81
+ self.weight = nn.Parameter(torch.ones(config.d_model))
82
+
83
+ def _norm(self, x: torch.Tensor) -> torch.Tensor:
84
+ """
85
+ Normalizes the input tensor by its RMS value.
86
+ """
87
+ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
88
+
89
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
90
+ """
91
+ Applies RMS normalization to the input tensor and scales it by the weight parameter.
92
+ """
93
+ output = self._norm(x.float()).type_as(x)
94
+ return output * self.weight
95
+
96
+
97
+ ########################################################
98
+ #
99
+ # Positional Embedding
100
+ #
101
+ ########################################################
102
+
103
+
104
+ class RoPE(nn.Module):
105
+ """Rotary Positional Embeddings (RoPE).
106
+
107
+ Implements position-dependent rotation of keys and queries in attention mechanism,
108
+ allowing better modeling of relative positions in sequences. Uses complex number
109
+ operations for efficient rotation.
110
+
111
+ Args:
112
+ config (Union[ModelConfig, PicoHFConfig]): Model configuration containing:
113
+ - config.position_emb_theta: Base for frequency computation
114
+ - config.d_model: Model dimension
115
+ - config.attention_n_heads: Number of attention heads
116
+ - config.max_seq_len: Maximum sequence length
117
+
118
+ References:
119
+ https://arxiv.org/abs/2104.09864
120
+ """
121
+
122
+ _freqs_cis_tensor: torch.Tensor | None = None
123
+
124
+ def __init__(self, config: Union["ModelConfig", "PicoDecoderHFConfig"]):
125
+ super().__init__()
126
+
127
+ self.theta = config.position_emb_theta
128
+ self.dim = config.d_model // config.attention_n_heads
129
+
130
+ max_seq_len = config.max_seq_len
131
+
132
+ # only gets set once, and then reused for all RoPE instances
133
+ if RoPE._freqs_cis_tensor is None:
134
+ RoPE._freqs_cis_tensor = self._setup_freqs_cis(
135
+ max_seq_len, self.theta, self.dim
136
+ )
137
+
138
+ # register _freqs_cis buffer
139
+ # can be easily recomputed so persistent=False
140
+ self.register_buffer("_freqs_cis", self._freqs_cis_tensor, persistent=False)
141
+
142
+ @classmethod
143
+ def _setup_freqs_cis(cls, seq_len: int, theta: float, dim: int) -> torch.Tensor:
144
+ """Setup Frequency Tensor for RoPE Embeddings
145
+
146
+ Initializes the complex frequency tensor that is used to compute the RoPE embeddings.
147
+
148
+ Note other implementations will use cos and sin directly, but using the complex
149
+ number representation is (probably) more efficient:
150
+
151
+ e^(theta * i * t) = cos(theta * t) + i * sin(theta * t) [Euler's formula]
152
+ """
153
+ _freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
154
+ positions = torch.arange(seq_len)
155
+ freqs = torch.outer(positions, _freqs)
156
+ return torch.polar(torch.ones_like(freqs), freqs) # complex64
157
+
158
+ def get_freqs_cis(
159
+ self, input_shape: torch.Size, start_pos: int, end_pos: int
160
+ ) -> torch.Tensor:
161
+ """Reshape Frequency Tensor for RoPE Embeddings
162
+
163
+ Makes the frequency tensor broadcastable with the input tensor.
164
+ """
165
+ _freqs_cis = self._freqs_cis[start_pos:end_pos]
166
+ ndim = len(input_shape)
167
+ assert 0 <= 1 < ndim
168
+ assert _freqs_cis.shape == (input_shape[1], input_shape[-1])
169
+
170
+ # TODO: Check whether this is correct (might be able to remove this)
171
+ shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(input_shape)]
172
+ return _freqs_cis.view(*shape)
173
+
174
+ def forward(
175
+ self,
176
+ queries: torch.Tensor,
177
+ keys: torch.Tensor,
178
+ start_pos: int = 0,
179
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
180
+ """Apply RoPE Embeddings to Queries and Keys
181
+
182
+ Applies the rotary positional embeddings to the input tensors via complex num multiplication
183
+
184
+ NOTE: The start_pos is used if we want to use the kv_cache in the attention mechanism.
185
+ """
186
+ queries_ = torch.view_as_complex(
187
+ queries.float().reshape(*queries.shape[:-1], -1, 2)
188
+ )
189
+ keys_ = torch.view_as_complex(keys.float().reshape(*keys.shape[:-1], -1, 2))
190
+
191
+ input_shape = (
192
+ queries_.shape
193
+ ) # same as keys: (batch_size, seq_len, n_heads, head_dim/2)
194
+ freqs_start_pos = start_pos
195
+ freqs_end_pos = freqs_start_pos + queries_.shape[1]
196
+
197
+ freqs_cis = self.get_freqs_cis(input_shape, freqs_start_pos, freqs_end_pos)
198
+
199
+ queries_rotated = torch.view_as_real(queries_ * freqs_cis).flatten(3)
200
+ keys_rotated = torch.view_as_real(keys_ * freqs_cis).flatten(3)
201
+ return queries_rotated.type_as(queries), keys_rotated.type_as(keys)
202
+
203
+
204
+ ########################################################
205
+ #
206
+ # Attention
207
+ #
208
+ ########################################################
209
+
210
+
211
+ class Attention(nn.Module):
212
+ """Multi-head Attention with Group Query Attention support.
213
+
214
+ Implements scaled dot-product attention and supports:
215
+ - Grouped Query Attention (GQA)
216
+ - Key-Value caching for efficient inference
217
+ - RoPE integration
218
+
219
+ Args:
220
+ config (Union[ModelConfig, PretrainedConfig]): Configuration containing:
221
+ - config.attention_n_heads: Number of attention heads
222
+ - config.attention_n_kv_heads: Number of key/value heads
223
+ - config.d_model: Model dimension
224
+ - config.batch_size: Maximum batch size
225
+ - config.max_seq_len: Maximum sequence length
226
+
227
+ Shape:
228
+ - Input: (batch_size, seq_len, d_model)
229
+ - Output: (batch_size, seq_len, d_model)
230
+ """
231
+
232
+ def __init__(
233
+ self,
234
+ config: Union["ModelConfig", "PicoDecoderHFConfig"],
235
+ ):
236
+ super().__init__()
237
+
238
+ self.n_heads = config.attention_n_heads
239
+ self.n_kv_heads = config.attention_n_kv_heads
240
+
241
+ self.batch_size = config.batch_size
242
+ self.max_seq_len = config.max_seq_len
243
+
244
+ d_model = config.d_model
245
+ self.head_dim = d_model // self.n_heads
246
+
247
+ self.n_rep = self.n_heads // self.n_kv_heads
248
+
249
+ self.q_proj = nn.Linear(d_model, self.n_heads * self.head_dim, bias=False)
250
+ self.k_proj = nn.Linear(d_model, self.n_kv_heads * self.head_dim, bias=False)
251
+ self.v_proj = nn.Linear(d_model, self.n_kv_heads * self.head_dim, bias=False)
252
+ self.o_proj = nn.Linear(self.n_heads * self.head_dim, d_model, bias=False)
253
+
254
+ self.rope = RoPE(config)
255
+
256
+ def forward(
257
+ self,
258
+ input: torch.Tensor,
259
+ mask: Optional[torch.Tensor] = None,
260
+ past_key_values: Optional[Tuple[torch.Tensor, ...]] = None,
261
+ use_cache: bool = False,
262
+ ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
263
+ """Forward pass for the attention mechanism.
264
+
265
+ Computes queries, keys, and values for the attention mechanism. Applies rotary positional
266
+ embeddings to the queries and keys, and then computes attention scores and outputs.
267
+
268
+ For an introduction to the attention mechanism, see:
269
+ https://arxiv.org/abs/1706.03762
270
+
271
+ A few things to note:
272
+ - The past_key_values is used to implement the KV cache, which is used to speed up
273
+ generation by caching the KV pairs from previous forward passes. This is useful when doing
274
+ tasks that require generating multiple tokens conditioned on previous tokens (e.g. language
275
+ modeling, text generation, etc.). The way the KV cache is implemented is that each layer has
276
+ its own KV cache - this KV cache is implemented as a tuple.
277
+ """
278
+ bsz, seq_len, _ = input.shape
279
+ _queries, _keys, _values = (
280
+ self.q_proj(input),
281
+ self.k_proj(input),
282
+ self.v_proj(input),
283
+ )
284
+
285
+ # Reshaping for multi-head attention
286
+ queries = _queries.view(bsz, seq_len, self.n_heads, self.head_dim)
287
+ keys = _keys.view(bsz, seq_len, self.n_kv_heads, self.head_dim)
288
+ values = _values.view(bsz, seq_len, self.n_kv_heads, self.head_dim)
289
+
290
+ # The start position is used to apply the RoPE embeddings to only the new tokens
291
+ # when using the kv_cache in the attention mechanism.
292
+ # We want to start from the last position in the cache.
293
+ start_pos = 0
294
+ if past_key_values is not None and past_key_values[0] is not None:
295
+ start_pos = past_key_values[0].shape[1]
296
+
297
+ # apply rotary positional embeddings
298
+ queries, keys = self.rope(queries, keys, start_pos)
299
+
300
+ if (
301
+ past_key_values is not None
302
+ and past_key_values[0] is not None
303
+ and past_key_values[1] is not None
304
+ ):
305
+ keys = torch.cat([past_key_values[0], keys], dim=1)
306
+ values = torch.cat([past_key_values[1], values], dim=1)
307
+
308
+ if use_cache:
309
+ cached_keys = keys
310
+ cached_values = values
311
+ else:
312
+ cached_keys = None
313
+ cached_values = None
314
+
315
+ queries = queries.transpose(1, 2)
316
+ keys = keys.transpose(1, 2)
317
+ values = values.transpose(1, 2)
318
+
319
+ apply_gqa = self.n_rep > 1
320
+ if apply_gqa and queries.device.type == "mps":
321
+ # NOTE: MPS does not support GQA in the SDPA kernel, but we can repeat the keys and values
322
+ # outside of the kernel to get the same effect.
323
+ # See: https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
324
+ keys = keys.repeat_interleave(self.n_rep, dim=-3)
325
+ values = values.repeat_interleave(self.n_rep, dim=-3)
326
+ apply_gqa = False
327
+
328
+ if HAS_TORCH_ATTENTION:
329
+ backends = [SDPBackend.CUDNN_ATTENTION, SDPBackend.MATH]
330
+ with sdpa_kernel(backends=backends):
331
+ attn_output = F.scaled_dot_product_attention(
332
+ queries.contiguous(),
333
+ keys.contiguous(),
334
+ values.contiguous(),
335
+ attn_mask=mask.to(queries.dtype) if mask is not None else None,
336
+ enable_gqa=apply_gqa,
337
+ )
338
+ else:
339
+ # Fallback for older PyTorch versions - use default backend
340
+ attn_output = F.scaled_dot_product_attention(
341
+ queries.contiguous(),
342
+ keys.contiguous(),
343
+ values.contiguous(),
344
+ attn_mask=mask.to(queries.dtype) if mask is not None else None,
345
+ enable_gqa=apply_gqa,
346
+ )
347
+
348
+ attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, seq_len, -1)
349
+ output = self.o_proj(attn_output)
350
+
351
+ return output, (cached_keys, cached_values)
352
+
353
+
354
+ ########################################################
355
+ #
356
+ # SwiGLU (Combines MLP and Activation)
357
+ #
358
+ ########################################################
359
+
360
+
361
+ class SwiGLU(nn.Module):
362
+ """SwiGLU Activation Function with Linear Projections.
363
+
364
+ Implements the SwiGLU activation function combined with linear transformations,
365
+ serving as the feed-forward network in transformer blocks.
366
+
367
+ Args:
368
+ config (Union[ModelConfig, PicoDecoderHFConfig]): Configuration containing:
369
+ - config.d_model: Model dimension
370
+ - config.activation_hidden_dim: Hidden dimension (typically 4 * d_model)
371
+
372
+ References:
373
+ https://arxiv.org/abs/2002.05202
374
+ """
375
+
376
+ def __init__(self, config: Union["ModelConfig", "PicoDecoderHFConfig"]):
377
+ super().__init__()
378
+
379
+ model_dim = config.d_model
380
+ act_hidden_dim = config.activation_hidden_dim # usually 4 * d_model
381
+
382
+ self.w_0 = nn.Linear(model_dim, act_hidden_dim, bias=False)
383
+ self.w_1 = nn.Linear(model_dim, act_hidden_dim, bias=False)
384
+ self.w_2 = nn.Linear(act_hidden_dim, model_dim, bias=False)
385
+
386
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
387
+ return self.w_2(F.silu(self.w_0(x)) * self.w_1(x))
388
+
389
+
390
+ ########################################################
391
+ #
392
+ # PicoDecoderBlock
393
+ #
394
+ ########################################################
395
+
396
+
397
+ class PicoDecoderBlock(nn.Module):
398
+ """Single Transformer Block with Attention and Feed-forward layers.
399
+
400
+ Implements a standard transformer block with:
401
+ - Multi-head attention with normalization and residual connection
402
+ - SwiGLU feed-forward network with normalization and residual connection
403
+
404
+ Args:
405
+ config (Union[ModelConfig, PicoDecoderHFConfig]): Model configuration; either a dataclass or
406
+ a HuggingFace PicoDecoderHFConfig
407
+ """
408
+
409
+ def __init__(
410
+ self,
411
+ config: Union["ModelConfig", "PicoDecoderHFConfig"],
412
+ ):
413
+ super().__init__()
414
+
415
+ self.attention = Attention(config)
416
+ self.swiglu = SwiGLU(config)
417
+ self.attention_norm = RMSNorm(config)
418
+ self.swiglu_norm = RMSNorm(config)
419
+
420
+ def forward(
421
+ self,
422
+ input: torch.Tensor,
423
+ mask: Optional[torch.Tensor] = None,
424
+ past_key_values: Optional[Tuple[torch.Tensor]] = None,
425
+ use_cache: bool = False,
426
+ ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
427
+ attention_output, cached_key_values = self.attention(
428
+ self.attention_norm(input),
429
+ mask=mask,
430
+ past_key_values=past_key_values,
431
+ use_cache=use_cache,
432
+ )
433
+ # NOTE: cached_key_values is None if use_cache is False
434
+
435
+ h = input + attention_output
436
+ out = h + self.swiglu(self.swiglu_norm(h))
437
+ return out, cached_key_values
438
+
439
+
440
+ ########################################################
441
+ #
442
+ # Pico Decoder (Causal Transformer Model)
443
+ #
444
+ ########################################################
445
+
446
+
447
+ class PicoDecoder(nn.Module):
448
+ """
449
+ Pico Decoder: combines the embedding, causal decoder blocks, and output projection into a
450
+ single autoregressive model.
451
+
452
+ For more information on the model, see the classes for the modules that make up the model.
453
+ """
454
+
455
+ def __init__(
456
+ self,
457
+ model_config: Union["ModelConfig", "PicoDecoderHFConfig"],
458
+ ):
459
+ super().__init__()
460
+ self.config = model_config
461
+
462
+ self.embedding_proj = nn.Embedding(self.config.vocab_size, self.config.d_model)
463
+ self.layers = nn.ModuleList(
464
+ [PicoDecoderBlock(self.config) for _ in range(self.config.n_layers)]
465
+ )
466
+ self.output_norm = RMSNorm(self.config)
467
+ self.de_embedding_proj = nn.Linear(
468
+ self.config.d_model, self.config.vocab_size, bias=False
469
+ )
470
+
471
+ def convert_to_hf_model(self) -> "PicoDecoderHF":
472
+ """Convert the Lightning model to a HuggingFace model."""
473
+ # Create HF config without fabric-specific settings
474
+ hf_config = PicoDecoderHFConfig.from_dataclass(self.config)
475
+
476
+ # Create new HF model
477
+ hf_model = PicoDecoderHF(hf_config)
478
+
479
+ # Copy state dict, excluding fabric-specific keys
480
+ hf_model.load_state_dict(self.state_dict(prefix="pico_decoder."))
481
+
482
+ return hf_model
483
+
484
+ def forward(
485
+ self,
486
+ input_ids: torch.Tensor,
487
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
488
+ use_cache: bool = False,
489
+ ) -> Tuple[torch.Tensor, Optional[Tuple[Tuple[torch.Tensor, torch.Tensor]]]]:
490
+ """
491
+ This is the forward pass for the entire Pico model. It boils down to:
492
+ - Embedding the input ids
493
+ - Creating a causal mask
494
+ - Processing through the pico layers
495
+ - Projecting the output to logits
496
+
497
+ NOTE: One feature that might be confusing is the KV cache. The KV cache is used to speed up
498
+ generation by caching the KV pairs from previous forward passes. This is useful when doing
499
+ tasks that require generating multiple tokens conditioned on previous tokens (e.g. language
500
+ modeling, text generation, etc.). The way the KV cache is implemented is that each layer has
501
+ its own KV cache which is stored as a tuple. The whole model then stores a tuple of these
502
+ KV caches (so a tuple of tuples).
503
+ """
504
+
505
+ seq_len = input_ids.shape[-1]
506
+ h = self.embedding_proj(input_ids)
507
+
508
+ # Calculate start position from past cached KV pairs. Remember that each layer has its
509
+ # own KV Cache. So when we index past_key_values, we need to index into the KV pairs for the
510
+ # correct layer and then for either the keys or values.
511
+ start_pos = 0
512
+ if (
513
+ past_key_values is not None
514
+ and past_key_values[0] is not None
515
+ and past_key_values[0][0] is not None
516
+ ):
517
+ start_pos = past_key_values[0][0].shape[1]
518
+
519
+ # Create causal mask for current sequence
520
+ mask = None
521
+ if seq_len > 1:
522
+ mask = torch.full((seq_len, seq_len), float("-inf"))
523
+ mask = torch.triu(mask, diagonal=1)
524
+
525
+ # If using KV cache, extend mask to cover cached sequence length
526
+ if past_key_values is not None:
527
+ # Add zeros for cached tokens (we can attend to all of them)
528
+ mask = torch.hstack([torch.zeros((seq_len, start_pos)), mask])
529
+
530
+ mask = mask.to(h.device)
531
+
532
+ # NOTE: If we are using the cache, we need to store the cached KV pairs for each layer
533
+ # in a tuple. Each layer will have its own cached KV pair which we aggregate in a tuple.
534
+ cached_key_values = () if use_cache else None
535
+
536
+ # Process through transformer blocks
537
+ for idx, layer in enumerate(self.layers):
538
+ layer_past_key_values = None
539
+ if past_key_values is not None:
540
+ try:
541
+ # Handle both tuple-based cache and HuggingFace cache objects
542
+ if hasattr(past_key_values, "__getitem__") and idx < len(
543
+ past_key_values
544
+ ):
545
+ layer_past_key_values = past_key_values[idx]
546
+ except (KeyError, IndexError, TypeError):
547
+ # If we can't access the cache properly, just skip it
548
+ layer_past_key_values = None
549
+
550
+ h, layer_cached_key_values = layer(
551
+ h, mask=mask, past_key_values=layer_past_key_values, use_cache=use_cache
552
+ )
553
+
554
+ if use_cache:
555
+ cached_key_values += (layer_cached_key_values,)
556
+
557
+ # Final norm and projection
558
+ h = self.output_norm(h)
559
+ logits = self.de_embedding_proj(h).float()
560
+
561
+ return logits, cached_key_values
562
+
563
+
564
+ ########################################################
565
+ #
566
+ # HuggingFace Wrapper for the Pico Decoder model.
567
+ #
568
+ ########################################################
569
+
570
+
571
+ class PicoDecoderHFConfig(PretrainedConfig):
572
+ """Config class for the Pico Decoder HuggingFace wrapper."""
573
+
574
+ model_type = "pico_decoder"
575
+
576
+ @classmethod
577
+ def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "PicoDecoderHFConfig":
578
+ """
579
+ Initialize config from a dictionary. Note that no kwargs are passed to the constructor --
580
+ this is because with some kwargs special handling is required and can make this class
581
+ brittle.
582
+ """
583
+ pico_config = cls(**config_dict)
584
+
585
+ return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
586
+ unused_kwargs = {
587
+ key: value for key, value in kwargs.items() if not hasattr(pico_config, key)
588
+ }
589
+
590
+ if return_unused_kwargs:
591
+ return pico_config, unused_kwargs
592
+ return pico_config
593
+
594
+ @classmethod
595
+ def from_dataclass(cls, model_config: "ModelConfig"):
596
+ """Initialise from our custom config dataclass."""
597
+ return cls.from_dict(asdict(model_config))
598
+
599
+
600
+ class PicoDecoderHF(PreTrainedModel, GenerationMixin):
601
+ """
602
+ HuggingFace wrapper for the Pico model with generation support.
603
+
604
+ Many evaluation frameworks require a model be setup as a HuggingFace model, so we provide a simple
605
+ wrapper that does just that. When we save checkpoints of the Pico model, we save both the normal
606
+ Pico model as well as the model wrapped in this HuggingFace class.
607
+
608
+ This also lets you do cool things like:
609
+
610
+ `model = AutoModelForCausalLM.from_pretrained("path/to/checkpoint")`
611
+ """
612
+
613
+ config_class = PicoDecoderHFConfig
614
+ _no_split_modules = ["PicoBlock", "Attention", "SwiGLU", "RMSNorm"]
615
+ main_input_name = "input_ids"
616
+
617
+ def __init__(self, config: PicoDecoderHFConfig):
618
+ super().__init__(config)
619
+ self.pico_decoder = PicoDecoder(config)
620
+ # Initialize generation config with defaults
621
+ self.generation_config = GenerationConfig()
622
+ # Set some reasonable defaults for the model
623
+ if hasattr(config, "max_position_embeddings"):
624
+ self.generation_config.max_length = config.max_position_embeddings
625
+ if hasattr(config, "vocab_size"):
626
+ self.generation_config.vocab_size = config.vocab_size
627
+
628
+ def forward(
629
+ self,
630
+ input_ids: torch.Tensor,
631
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
632
+ use_cache: bool = False,
633
+ **kwargs,
634
+ ) -> Union[CausalLMOutput, CausalLMOutputWithPast]:
635
+ """HuggingFace forward pass wrapper.
636
+
637
+ Forwards pass for the HuggingFace version of the Pico Model. Basic wrapper around the
638
+ Pico model's forward pass, and returns the output as a HuggingFace CausalLMOutput.
639
+ """
640
+ logits, past_key_values = self.pico_decoder(
641
+ input_ids, past_key_values, use_cache
642
+ )
643
+ if use_cache:
644
+ return CausalLMOutputWithPast(
645
+ logits=logits,
646
+ past_key_values=past_key_values,
647
+ )
648
+ else:
649
+ return CausalLMOutput(
650
+ logits=logits,
651
+ )
652
+
653
+ def prepare_inputs_for_generation(
654
+ self,
655
+ input_ids: torch.LongTensor,
656
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
657
+ attention_mask: Optional[torch.LongTensor] = None,
658
+ **kwargs,
659
+ ) -> Dict[str, Any]:
660
+ """
661
+ Prepare inputs for generation.
662
+
663
+ Args:
664
+ input_ids: Input token IDs
665
+ past_key_values: Cached key-value pairs from previous forward passes
666
+ attention_mask: Attention mask for the input
667
+ **kwargs: Additional arguments
668
+
669
+ Returns:
670
+ Dictionary containing prepared inputs
671
+ """
672
+ # If we have past_key_values, we only need the last token
673
+ if past_key_values is not None:
674
+ input_ids = input_ids[:, -1:]
675
+
676
+ return {
677
+ "input_ids": input_ids,
678
+ "past_key_values": past_key_values,
679
+ "use_cache": True,
680
+ }
681
+
682
+ def get_input_embeddings(self):
683
+ """Get the input embeddings layer."""
684
+ return self.pico_decoder.embedding_proj
685
+
686
+ def set_input_embeddings(self, value):
687
+ """Set the input embeddings layer."""
688
+ self.pico_decoder.embedding_proj = value
689
+
690
+ def get_output_embeddings(self):
691
+ """Get the output embeddings layer."""
692
+ return self.pico_decoder.de_embedding_proj
693
+
694
+ def set_output_embeddings(self, value):
695
+ """Set the output embeddings layer."""
696
+ self.pico_decoder.de_embedding_proj = value
697
+
698
+ def get_lm_head(self):
699
+ """Get the language model head."""
700
+ return self.pico_decoder.de_embedding_proj
701
+
702
+ def can_generate(self) -> bool:
703
+ """Check if the model can generate text."""
704
+ return True
705
+
706
+ @property
707
+ def is_encoder_decoder(self) -> bool:
708
+ """Check if the model is an encoder-decoder model."""
709
+ return False
710
+
711
+ @property
712
+ def can_use_cache(self) -> bool:
713
+ """Check if the model can use KV cache."""
714
+ return True
715
+
716
+ def resize_token_embeddings(
717
+ self, new_num_tokens: Optional[int] = None
718
+ ) -> torch.nn.Embedding:
719
+ """Resize token embeddings."""
720
+ old_embeddings = self.get_input_embeddings()
721
+ if new_num_tokens is None:
722
+ new_num_tokens = old_embeddings.num_embeddings
723
+
724
+ new_embeddings = torch.nn.Embedding(
725
+ new_num_tokens, old_embeddings.embedding_dim
726
+ )
727
+ new_embeddings.weight.data[: old_embeddings.num_embeddings] = (
728
+ old_embeddings.weight.data
729
+ )
730
+
731
+ self.pico_decoder.embedding_proj = new_embeddings
732
+ self.pico_decoder.de_embedding_proj = torch.nn.Linear(
733
+ old_embeddings.embedding_dim, new_num_tokens, bias=False
734
+ )
735
+
736
+ return new_embeddings
737
+
738
+
739
+ # Register for auto classes
740
+ PicoDecoderHFConfig.register_for_auto_class()
741
+ PicoDecoderHF.register_for_auto_class("AutoModel")
742
+ PicoDecoderHF.register_for_auto_class("AutoModelForCausalLM")
743
+
744
+
745
+ ########################################################
746
+ #
747
+ # New PicoDecoderForCausalLM class for generation support
748
+ #
749
+ ########################################################
750
+
751
+
752
+ class PicoDecoderForCausalLM(PreTrainedModel, GenerationMixin):
753
+ """
754
+ PicoDecoderForCausalLM: A HuggingFace-compatible model that properly supports generation.
755
+
756
+ This class is designed to work with existing checkpoints and provides full generation support.
757
+ It inherits from the right base classes that HuggingFace expects for text generation.
758
+ """
759
+
760
+ config_class = PicoDecoderHFConfig
761
+ _no_split_modules = ["PicoBlock", "Attention", "SwiGLU", "RMSNorm"]
762
+ main_input_name = "input_ids"
763
+
764
+ def __init__(self, config: PicoDecoderHFConfig):
765
+ super().__init__(config)
766
+ self.pico_decoder = PicoDecoder(config)
767
+ # Initialize generation config with defaults
768
+ self.generation_config = GenerationConfig()
769
+ # Set some reasonable defaults for the model
770
+ if hasattr(config, "max_position_embeddings"):
771
+ self.generation_config.max_length = config.max_position_embeddings
772
+ if hasattr(config, "vocab_size"):
773
+ self.generation_config.vocab_size = config.vocab_size
774
+
775
+ def forward(
776
+ self,
777
+ input_ids: torch.Tensor,
778
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
779
+ use_cache: bool = False,
780
+ **kwargs,
781
+ ) -> Union[CausalLMOutput, CausalLMOutputWithPast]:
782
+ """Forward pass for text generation."""
783
+ logits, past_key_values = self.pico_decoder(
784
+ input_ids, past_key_values, use_cache
785
+ )
786
+ if use_cache:
787
+ return CausalLMOutputWithPast(
788
+ logits=logits,
789
+ past_key_values=past_key_values,
790
+ )
791
+ else:
792
+ return CausalLMOutput(
793
+ logits=logits,
794
+ )
795
+
796
+ def prepare_inputs_for_generation(
797
+ self,
798
+ input_ids: torch.LongTensor,
799
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
800
+ attention_mask: Optional[torch.LongTensor] = None,
801
+ **kwargs,
802
+ ) -> Dict[str, Any]:
803
+ """Prepare inputs for generation."""
804
+ # If we have past_key_values, we only need the last token
805
+ if past_key_values is not None:
806
+ input_ids = input_ids[:, -1:]
807
+
808
+ return {
809
+ "input_ids": input_ids,
810
+ "past_key_values": past_key_values,
811
+ "use_cache": True,
812
+ }
813
+
814
+ def get_input_embeddings(self):
815
+ """Get the input embeddings layer."""
816
+ return self.pico_decoder.embedding_proj
817
+
818
+ def set_input_embeddings(self, value):
819
+ """Set the input embeddings layer."""
820
+ self.pico_decoder.embedding_proj = value
821
+
822
+ def get_output_embeddings(self):
823
+ """Get the output embeddings layer."""
824
+ return self.pico_decoder.de_embedding_proj
825
+
826
+ def set_output_embeddings(self, value):
827
+ """Set the output embeddings layer."""
828
+ self.pico_decoder.de_embedding_proj = value
829
+
830
+ def get_lm_head(self):
831
+ """Get the language model head."""
832
+ return self.pico_decoder.de_embedding_proj
833
+
834
+ def can_generate(self) -> bool:
835
+ """Check if the model can generate text."""
836
+ return True
837
+
838
+ @property
839
+ def is_encoder_decoder(self) -> bool:
840
+ """Check if the model is an encoder-decoder model."""
841
+ return False
842
+
843
+ @property
844
+ def can_use_cache(self) -> bool:
845
+ """Check if the model can use KV cache."""
846
+ return True
847
+
848
+ def resize_token_embeddings(
849
+ self, new_num_tokens: Optional[int] = None
850
+ ) -> torch.nn.Embedding:
851
+ """Resize token embeddings."""
852
+ old_embeddings = self.get_input_embeddings()
853
+ if new_num_tokens is None:
854
+ new_num_tokens = old_embeddings.num_embeddings
855
+
856
+ new_embeddings = torch.nn.Embedding(
857
+ new_num_tokens, old_embeddings.embedding_dim
858
+ )
859
+ new_embeddings.weight.data[: old_embeddings.num_embeddings] = (
860
+ old_embeddings.weight.data
861
+ )
862
+
863
+ self.pico_decoder.embedding_proj = new_embeddings
864
+ self.pico_decoder.de_embedding_proj = torch.nn.Linear(
865
+ old_embeddings.embedding_dim, new_num_tokens, bias=False
866
+ )
867
+
868
+ return new_embeddings
869
+
870
+ @classmethod
871
+ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
872
+ """
873
+ Load a pretrained model from a checkpoint.
874
+
875
+ This method handles loading from both the old PicoDecoderHF format and the new format.
876
+ """
877
+ # First try to load with the new class
878
+ try:
879
+ return super().from_pretrained(
880
+ pretrained_model_name_or_path, *model_args, **kwargs
881
+ )
882
+ except Exception as e:
883
+ print(f"Failed to load with new class: {e}")
884
+ print("Attempting to load with legacy class and convert...")
885
+
886
+ # Try to load with the old class and convert
887
+ try:
888
+ from transformers import AutoModel
889
+
890
+ old_model = AutoModel.from_pretrained(
891
+ pretrained_model_name_or_path,
892
+ trust_remote_code=True,
893
+ *model_args,
894
+ **kwargs,
895
+ )
896
+
897
+ # Create new model instance
898
+ new_model = cls(old_model.config)
899
+
900
+ # Copy state dict
901
+ new_model.load_state_dict(old_model.state_dict(), strict=False)
902
+
903
+ return new_model
904
+
905
+ except Exception as e2:
906
+ print(f"Failed to convert from legacy format: {e2}")
907
+ raise e
908
+
909
+
910
+ # Register the new class
911
+ PicoDecoderForCausalLM.register_for_auto_class("AutoModelForCausalLM")
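Because pico_decoder.py registers its config and model classes for the HuggingFace auto classes, a checkpoint directory from this repo can be loaded directly with AutoModelForCausalLM, as the module docstring suggests. The following is a minimal usage sketch, not part of the uploaded files: the checkpoint path is illustrative, and trust_remote_code=True is assumed to be required because the model code ships inside the checkpoint rather than in transformers.

    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Illustrative path: any step_* checkpoint directory from this repo should work.
    ckpt = "pico-decoder-tiny-dolma250M-v1/checkpoints/step_104000"

    tokenizer = AutoTokenizer.from_pretrained(ckpt)
    model = AutoModelForCausalLM.from_pretrained(ckpt, trust_remote_code=True)

    inputs = tokenizer("The quick brown fox", return_tensors="pt")
    # generate() drives prepare_inputs_for_generation, which enables the
    # per-layer KV cache described in the module docstring above.
    outputs = model.generate(inputs["input_ids"], max_new_tokens=20)
    print(tokenizer.decode(outputs[0]))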
pico-decoder-tiny-dolma250M-v1/checkpoints/step_104000/special_tokens_map.json ADDED
@@ -0,0 +1,16 @@
1
+ {
2
+ "eos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "pad_token": {
10
+ "content": "<|padding|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ }
16
+ }
pico-decoder-tiny-dolma250M-v1/checkpoints/step_104000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
pico-decoder-tiny-dolma250M-v1/checkpoints/step_104000/tokenizer_config.json ADDED
@@ -0,0 +1,239 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": false,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "|||IP_ADDRESS|||",
8
+ "lstrip": false,
9
+ "normalized": true,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": false
13
+ },
14
+ "1": {
15
+ "content": "<|padding|>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "50254": {
23
+ "content": " ",
24
+ "lstrip": false,
25
+ "normalized": true,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": false
29
+ },
30
+ "50255": {
31
+ "content": " ",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": false
37
+ },
38
+ "50256": {
39
+ "content": " ",
40
+ "lstrip": false,
41
+ "normalized": true,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": false
45
+ },
46
+ "50257": {
47
+ "content": " ",
48
+ "lstrip": false,
49
+ "normalized": true,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": false
53
+ },
54
+ "50258": {
55
+ "content": " ",
56
+ "lstrip": false,
57
+ "normalized": true,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": false
61
+ },
62
+ "50259": {
63
+ "content": " ",
64
+ "lstrip": false,
65
+ "normalized": true,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": false
69
+ },
70
+ "50260": {
71
+ "content": " ",
72
+ "lstrip": false,
73
+ "normalized": true,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": false
77
+ },
78
+ "50261": {
79
+ "content": " ",
80
+ "lstrip": false,
81
+ "normalized": true,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": false
85
+ },
86
+ "50262": {
87
+ "content": " ",
88
+ "lstrip": false,
89
+ "normalized": true,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": false
93
+ },
94
+ "50263": {
95
+ "content": " ",
96
+ "lstrip": false,
97
+ "normalized": true,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": false
101
+ },
102
+ "50264": {
103
+ "content": " ",
104
+ "lstrip": false,
105
+ "normalized": true,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": false
109
+ },
110
+ "50265": {
111
+ "content": " ",
112
+ "lstrip": false,
113
+ "normalized": true,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": false
117
+ },
118
+ "50266": {
119
+ "content": " ",
120
+ "lstrip": false,
121
+ "normalized": true,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": false
125
+ },
126
+ "50267": {
127
+ "content": " ",
128
+ "lstrip": false,
129
+ "normalized": true,
130
+ "rstrip": false,
131
+ "single_word": false,
132
+ "special": false
133
+ },
134
+ "50268": {
135
+ "content": " ",
136
+ "lstrip": false,
137
+ "normalized": true,
138
+ "rstrip": false,
139
+ "single_word": false,
140
+ "special": false
141
+ },
142
+ "50269": {
143
+ "content": " ",
144
+ "lstrip": false,
145
+ "normalized": true,
146
+ "rstrip": false,
147
+ "single_word": false,
148
+ "special": false
149
+ },
150
+ "50270": {
151
+ "content": " ",
152
+ "lstrip": false,
153
+ "normalized": true,
154
+ "rstrip": false,
155
+ "single_word": false,
156
+ "special": false
157
+ },
158
+ "50271": {
159
+ "content": " ",
160
+ "lstrip": false,
161
+ "normalized": true,
162
+ "rstrip": false,
163
+ "single_word": false,
164
+ "special": false
165
+ },
166
+ "50272": {
167
+ "content": " ",
168
+ "lstrip": false,
169
+ "normalized": true,
170
+ "rstrip": false,
171
+ "single_word": false,
172
+ "special": false
173
+ },
174
+ "50273": {
175
+ "content": " ",
176
+ "lstrip": false,
177
+ "normalized": true,
178
+ "rstrip": false,
179
+ "single_word": false,
180
+ "special": false
181
+ },
182
+ "50274": {
183
+ "content": " ",
184
+ "lstrip": false,
185
+ "normalized": true,
186
+ "rstrip": false,
187
+ "single_word": false,
188
+ "special": false
189
+ },
190
+ "50275": {
191
+ "content": " ",
192
+ "lstrip": false,
193
+ "normalized": true,
194
+ "rstrip": false,
195
+ "single_word": false,
196
+ "special": false
197
+ },
198
+ "50276": {
199
+ "content": " ",
200
+ "lstrip": false,
201
+ "normalized": true,
202
+ "rstrip": false,
203
+ "single_word": false,
204
+ "special": false
205
+ },
206
+ "50277": {
207
+ "content": "|||EMAIL_ADDRESS|||",
208
+ "lstrip": false,
209
+ "normalized": true,
210
+ "rstrip": false,
211
+ "single_word": false,
212
+ "special": false
213
+ },
214
+ "50278": {
215
+ "content": "|||PHONE_NUMBER|||",
216
+ "lstrip": false,
217
+ "normalized": true,
218
+ "rstrip": false,
219
+ "single_word": false,
220
+ "special": false
221
+ },
222
+ "50279": {
223
+ "content": "<|endoftext|>",
224
+ "lstrip": false,
225
+ "normalized": false,
226
+ "rstrip": false,
227
+ "single_word": false,
228
+ "special": true
229
+ }
230
+ },
231
+ "bos_token": null,
232
+ "clean_up_tokenization_spaces": true,
233
+ "eos_token": "<|endoftext|>",
234
+ "extra_special_tokens": {},
235
+ "model_max_length": 1000000000000000019884624838656,
236
+ "pad_token": "<|padding|>",
237
+ "tokenizer_class": "GPTNeoXTokenizer",
238
+ "unk_token": null
239
+ }
pico-decoder-tiny-dolma250M-v1/eval_results/step_102000.json ADDED
@@ -0,0 +1 @@
 
+ {"paloma": Infinity}
pico-decoder-tiny-dolma250M-v1/logs/log_20250831_162326.log ADDED
@@ -0,0 +1,269 @@
1
+ 2025-08-31 17:03:52 - pico-train - INFO - Step 100000 -- 📊 Evaluation Results
2
+ 2025-08-31 17:03:52 - pico-train - INFO - └── paloma: inf
3
+ 2025-08-31 17:03:52 - pico-train - INFO - ==================================================
4
+ 2025-08-31 17:03:52 - pico-train - INFO - ✨ Training Configuration
5
+ 2025-08-31 17:03:52 - pico-train - INFO - ==================================================
6
+ 2025-08-31 17:03:52 - pico-train - INFO - ╭─────────────────────────────────────────────────────╮
7
+ 2025-08-31 17:03:52 - pico-train - INFO - │ checkpointing: │
8
+ 2025-08-31 17:03:52 - pico-train - INFO - │ checkpoints_dir: checkpoints │
9
+ 2025-08-31 17:03:52 - pico-train - INFO - │ evaluation: │
10
+ 2025-08-31 17:03:52 - pico-train - INFO - │ eval_results_dir: eval_results │
11
+ 2025-08-31 17:03:52 - pico-train - INFO - │ fabric_checkpoint_dir: fabric_state │
12
+ 2025-08-31 17:03:52 - pico-train - INFO - │ fabric_checkpoint_filename: checkpoint.pt │
13
+ 2025-08-31 17:03:52 - pico-train - INFO - │ hf_checkpoint: │
14
+ 2025-08-31 17:03:52 - pico-train - INFO - │ collection_slug: null │
15
+ 2025-08-31 17:03:52 - pico-train - INFO - │ repo_id: ThomasTheMaker/pico-decoder-tiny │
16
+ 2025-08-31 17:03:52 - pico-train - INFO - │ learning_dynamics: │
17
+ 2025-08-31 17:03:52 - pico-train - INFO - │ batch_size: 1 │
18
+ 2025-08-31 17:03:52 - pico-train - INFO - │ eval_data: null │
19
+ 2025-08-31 17:03:52 - pico-train - INFO - │ layer_suffixes: │
20
+ 2025-08-31 17:03:52 - pico-train - INFO - │ - attention.v_proj │
21
+ 2025-08-31 17:03:52 - pico-train - INFO - │ - attention.o_proj │
22
+ 2025-08-31 17:03:52 - pico-train - INFO - │ - swiglu.w_2 │
23
+ 2025-08-31 17:03:52 - pico-train - INFO - │ sequence_idx: -1 │
24
+ 2025-08-31 17:03:52 - pico-train - INFO - │ learning_dynamics_dir: learning_dynamics │
25
+ 2025-08-31 17:03:52 - pico-train - INFO - │ logs_dir: logs │
26
+ 2025-08-31 17:03:52 - pico-train - INFO - │ run_name: pico-decoder-tiny-dolma250M-v1 │
27
+ 2025-08-31 17:03:52 - pico-train - INFO - │ runs_dir: runs │
28
+ 2025-08-31 17:03:52 - pico-train - INFO - │ save_every_n_steps: 2000 │
29
+ 2025-08-31 17:03:52 - pico-train - INFO - │ save_to_hf: false │
30
+ 2025-08-31 17:03:52 - pico-train - INFO - │ training: │
31
+ 2025-08-31 17:03:52 - pico-train - INFO - │ auto_resume: true │
32
+ 2025-08-31 17:03:52 - pico-train - INFO - │ data: │
33
+ 2025-08-31 17:03:52 - pico-train - INFO - │ dataloader: │
34
+ 2025-08-31 17:03:52 - pico-train - INFO - │ batch_size: 16 │
35
+ 2025-08-31 17:03:52 - pico-train - INFO - │ dataset: │
36
+ 2025-08-31 17:03:52 - pico-train - INFO - │ name: pico-lm/pretokenized-dolma │
37
+ 2025-08-31 17:03:52 - pico-train - INFO - │ tokenizer: │
38
+ 2025-08-31 17:03:52 - pico-train - INFO - │ name: allenai/OLMo-7B-0724-hf │
39
+ 2025-08-31 17:03:52 - pico-train - INFO - │ vocab_size: 50304 │
40
+ 2025-08-31 17:03:52 - pico-train - INFO - │ evaluation: │
41
+ 2025-08-31 17:03:52 - pico-train - INFO - │ metrics: │
42
+ 2025-08-31 17:03:52 - pico-train - INFO - │ - paloma │
43
+ 2025-08-31 17:03:52 - pico-train - INFO - │ paloma: │
44
+ 2025-08-31 17:03:52 - pico-train - INFO - │ batch_size: 1 │
45
+ 2025-08-31 17:03:52 - pico-train - INFO - │ dataset_name: pico-lm/pretokenized-paloma-tinsy │
46
+ 2025-08-31 17:03:52 - pico-train - INFO - │ dataset_split: val │
47
+ 2025-08-31 17:03:52 - pico-train - INFO - │ max_length: 2048 │
48
+ 2025-08-31 17:03:52 - pico-train - INFO - │ model: │
49
+ 2025-08-31 17:03:52 - pico-train - INFO - │ activation_hidden_dim: 384 │
50
+ 2025-08-31 17:03:52 - pico-train - INFO - │ attention_n_heads: 12 │
51
+ 2025-08-31 17:03:52 - pico-train - INFO - │ attention_n_kv_heads: 4 │
52
+ 2025-08-31 17:03:52 - pico-train - INFO - │ batch_size: 1024 │
53
+ 2025-08-31 17:03:52 - pico-train - INFO - │ d_model: 96 │
54
+ 2025-08-31 17:03:52 - pico-train - INFO - │ max_seq_len: 2048 │
55
+ 2025-08-31 17:03:52 - pico-train - INFO - │ model_type: pico_decoder │
56
+ 2025-08-31 17:03:52 - pico-train - INFO - │ n_layers: 12 │
57
+ 2025-08-31 17:03:52 - pico-train - INFO - │ norm_eps: 1.0e-06 │
58
+ 2025-08-31 17:03:52 - pico-train - INFO - │ position_emb_theta: 10000.0 │
59
+ 2025-08-31 17:03:52 - pico-train - INFO - │ vocab_size: 50304 │
60
+ 2025-08-31 17:03:52 - pico-train - INFO - │ monitoring: │
61
+ 2025-08-31 17:03:52 - pico-train - INFO - │ logging: │
62
+ 2025-08-31 17:03:52 - pico-train - INFO - │ log_every_n_steps: 100 │
63
+ 2025-08-31 17:03:52 - pico-train - INFO - │ log_level: INFO │
64
+ 2025-08-31 17:03:52 - pico-train - INFO - │ save_to_wandb: false │
65
+ 2025-08-31 17:03:52 - pico-train - INFO - │ wandb: │
66
+ 2025-08-31 17:03:52 - pico-train - INFO - │ entity: boymyc │
67
+ 2025-08-31 17:03:52 - pico-train - INFO - │ project: pico-decoder-tiny │
68
+ 2025-08-31 17:03:52 - pico-train - INFO - │ training: │
69
+ 2025-08-31 17:03:52 - pico-train - INFO - │ fabric: │
70
+ 2025-08-31 17:03:52 - pico-train - INFO - │ accelerator: cuda │
71
+ 2025-08-31 17:03:52 - pico-train - INFO - │ num_devices: 1 │
72
+ 2025-08-31 17:03:52 - pico-train - INFO - │ num_nodes: 1 │
73
+ 2025-08-31 17:03:52 - pico-train - INFO - │ precision: bf16-mixed │
74
+ 2025-08-31 17:03:52 - pico-train - INFO - │ max_steps: 100000 │
75
+ 2025-08-31 17:03:52 - pico-train - INFO - │ optimization: │
76
+ 2025-08-31 17:03:52 - pico-train - INFO - │ gradient_accumulation_steps: 1 │
77
+ 2025-08-31 17:03:52 - pico-train - INFO - │ lr: 0.0002 │
78
+ 2025-08-31 17:03:52 - pico-train - INFO - │ lr_scheduler: cosine │
79
+ 2025-08-31 17:03:52 - pico-train - INFO - │ lr_warmup_steps: 2000 │
80
+ 2025-08-31 17:03:52 - pico-train - INFO - │ optimizer: adamw │
81
+ 2025-08-31 17:03:52 - pico-train - INFO - │ │
82
+ 2025-08-31 17:03:52 - pico-train - INFO - ╰─────────────────────────────────────────────────────╯
83
+ 2025-08-31 17:03:52 - pico-train - INFO - ==================================================
84
+ 2025-08-31 17:03:52 - pico-train - INFO - ⛭ Runtime Summary:
85
+ 2025-08-31 17:03:52 - pico-train - INFO - ==================================================
86
+ 2025-08-31 17:03:52 - pico-train - INFO - Starting from step: 100000
87
+ 2025-08-31 17:03:52 - pico-train - INFO - Model Setup:
88
+ 2025-08-31 17:03:52 - pico-train - INFO - └─ Total Parameters: 11,282,784
89
+ 2025-08-31 17:03:52 - pico-train - INFO - └─ Trainable Parameters: 11,282,784
90
+ 2025-08-31 17:03:52 - pico-train - INFO - Distributed Setup:
91
+ 2025-08-31 17:03:52 - pico-train - INFO - └─ Number of Devices: 1
92
+ 2025-08-31 17:03:52 - pico-train - INFO - └─ Device Type: NVIDIA H100 80GB HBM3
93
+ 2025-08-31 17:03:52 - pico-train - INFO - └─ Available Memory: 85.03 GB
94
+ 2025-08-31 17:03:52 - pico-train - INFO - Software Setup:
95
+ 2025-08-31 17:03:52 - pico-train - INFO - └─ Python Version: 3.12.3
96
+ 2025-08-31 17:03:52 - pico-train - INFO - └─ PyTorch Version: 2.8.0+cu128
97
+ 2025-08-31 17:03:52 - pico-train - INFO - └─ CUDA Version: 12.8
98
+ 2025-08-31 17:03:52 - pico-train - INFO - └─ Operating System: Linux 6.8.0-71-generic
99
+ 2025-08-31 17:03:52 - pico-train - INFO - Batch Size Configuration:
100
+ 2025-08-31 17:03:52 - pico-train - INFO - └─ Global Batch Size: 16
101
+ 2025-08-31 17:03:52 - pico-train - INFO - └─ Per Device Batch Size: 16
102
+ 2025-08-31 17:03:52 - pico-train - INFO - └─ Gradient Accumulation Steps: 1
103
+ 2025-08-31 17:03:52 - pico-train - INFO - ==================================================
104
+ 2025-08-31 17:03:52 - pico-train - INFO - Step 100000 -- 🔄 Training Metrics
105
+ 2025-08-31 17:03:52 - pico-train - INFO - ├── Loss: 4.9432
106
+ 2025-08-31 17:03:52 - pico-train - INFO - ├── Learning Rate: 2.00e-05
107
+ 2025-08-31 17:03:52 - pico-train - INFO - └── Inf/NaN count: 0
108
+ 2025-08-31 17:03:52 - pico-train - INFO - Step 100000 -- 📈 Saving Learning Dynamics
109
+ 2025-08-31 17:04:49 - pico-train - INFO - Step 100100 -- 🔄 Training Metrics
110
+ 2025-08-31 17:04:49 - pico-train - INFO - ├── Loss: 4.7703
111
+ 2025-08-31 17:04:49 - pico-train - INFO - ├── Learning Rate: 1.01e-04
112
+ 2025-08-31 17:04:49 - pico-train - INFO - └── Inf/NaN count: 0
113
+ 2025-08-31 17:05:43 - pico-train - INFO - Step 100200 -- 🔄 Training Metrics
114
+ 2025-08-31 17:05:43 - pico-train - INFO - ├── Loss: 4.8047
115
+ 2025-08-31 17:05:43 - pico-train - INFO - ├── Learning Rate: 1.01e-04
116
+ 2025-08-31 17:05:43 - pico-train - INFO - └── Inf/NaN count: 0
117
+ 2025-08-31 17:06:37 - pico-train - INFO - Step 100300 -- 🔄 Training Metrics
118
+ 2025-08-31 17:06:37 - pico-train - INFO - ├── Loss: 4.8076
119
+ 2025-08-31 17:06:37 - pico-train - INFO - ├── Learning Rate: 1.01e-04
120
+ 2025-08-31 17:06:37 - pico-train - INFO - └── Inf/NaN count: 0
121
+ 2025-08-31 17:07:31 - pico-train - INFO - Step 100400 -- 🔄 Training Metrics
122
+ 2025-08-31 17:07:31 - pico-train - INFO - ├── Loss: 4.7926
123
+ 2025-08-31 17:07:31 - pico-train - INFO - ├── Learning Rate: 1.01e-04
124
+ 2025-08-31 17:07:31 - pico-train - INFO - └── Inf/NaN count: 0
125
+ 2025-08-31 17:08:25 - pico-train - INFO - Step 100500 -- 🔄 Training Metrics
126
+ 2025-08-31 17:08:25 - pico-train - INFO - ├── Loss: 4.8059
127
+ 2025-08-31 17:08:25 - pico-train - INFO - ├── Learning Rate: 1.01e-04
128
+ 2025-08-31 17:08:25 - pico-train - INFO - └── Inf/NaN count: 0
129
+ 2025-08-31 17:09:19 - pico-train - INFO - Step 100600 -- 🔄 Training Metrics
130
+ 2025-08-31 17:09:19 - pico-train - INFO - ├── Loss: 4.7896
131
+ 2025-08-31 17:09:19 - pico-train - INFO - ├── Learning Rate: 1.01e-04
132
+ 2025-08-31 17:09:19 - pico-train - INFO - └── Inf/NaN count: 0
133
+ 2025-08-31 17:10:12 - pico-train - INFO - Step 100700 -- 🔄 Training Metrics
134
+ 2025-08-31 17:10:12 - pico-train - INFO - ├── Loss: 4.8066
135
+ 2025-08-31 17:10:12 - pico-train - INFO - ├── Learning Rate: 1.00e-04
136
+ 2025-08-31 17:10:12 - pico-train - INFO - └── Inf/NaN count: 0
137
+ 2025-08-31 17:11:07 - pico-train - INFO - Step 100800 -- 🔄 Training Metrics
138
+ 2025-08-31 17:11:07 - pico-train - INFO - ├── Loss: 4.7870
139
+ 2025-08-31 17:11:07 - pico-train - INFO - ├── Learning Rate: 1.00e-04
140
+ 2025-08-31 17:11:07 - pico-train - INFO - └── Inf/NaN count: 0
141
+ 2025-08-31 17:12:01 - pico-train - INFO - Step 100900 -- 🔄 Training Metrics
142
+ 2025-08-31 17:12:01 - pico-train - INFO - ├── Loss: 4.7958
143
+ 2025-08-31 17:12:01 - pico-train - INFO - ├── Learning Rate: 1.00e-04
144
+ 2025-08-31 17:12:01 - pico-train - INFO - └── Inf/NaN count: 0
145
+ 2025-08-31 17:12:55 - pico-train - INFO - Step 101000 -- 🔄 Training Metrics
146
+ 2025-08-31 17:12:55 - pico-train - INFO - ├── Loss: 4.8081
147
+ 2025-08-31 17:12:55 - pico-train - INFO - ├── Learning Rate: 1.00e-04
148
+ 2025-08-31 17:12:55 - pico-train - INFO - └── Inf/NaN count: 0
149
+ 2025-08-31 17:13:48 - pico-train - INFO - Step 101100 -- 🔄 Training Metrics
150
+ 2025-08-31 17:13:48 - pico-train - INFO - ├── Loss: 4.8023
151
+ 2025-08-31 17:13:48 - pico-train - INFO - ├── Learning Rate: 9.98e-05
152
+ 2025-08-31 17:13:48 - pico-train - INFO - └── Inf/NaN count: 0
153
+ 2025-08-31 17:14:43 - pico-train - INFO - Step 101200 -- 🔄 Training Metrics
154
+ 2025-08-31 17:14:43 - pico-train - INFO - ├── Loss: 4.7830
155
+ 2025-08-31 17:14:43 - pico-train - INFO - ├── Learning Rate: 9.97e-05
156
+ 2025-08-31 17:14:43 - pico-train - INFO - └── Inf/NaN count: 0
157
+ 2025-08-31 17:15:38 - pico-train - INFO - Step 101300 -- 🔄 Training Metrics
158
+ 2025-08-31 17:15:38 - pico-train - INFO - ├── Loss: 4.8071
159
+ 2025-08-31 17:15:38 - pico-train - INFO - ├── Learning Rate: 9.95e-05
160
+ 2025-08-31 17:15:38 - pico-train - INFO - └── Inf/NaN count: 0
161
+ 2025-08-31 17:16:32 - pico-train - INFO - Step 101400 -- 🔄 Training Metrics
162
+ 2025-08-31 17:16:32 - pico-train - INFO - ├── Loss: 4.8072
163
+ 2025-08-31 17:16:32 - pico-train - INFO - ├── Learning Rate: 9.94e-05
164
+ 2025-08-31 17:16:32 - pico-train - INFO - └── Inf/NaN count: 0
165
+ 2025-08-31 17:17:27 - pico-train - INFO - Step 101500 -- 🔄 Training Metrics
166
+ 2025-08-31 17:17:27 - pico-train - INFO - ├── Loss: 4.8027
167
+ 2025-08-31 17:17:27 - pico-train - INFO - ├── Learning Rate: 9.92e-05
168
+ 2025-08-31 17:17:27 - pico-train - INFO - └── Inf/NaN count: 0
169
+ 2025-08-31 17:18:20 - pico-train - INFO - Step 101600 -- 🔄 Training Metrics
170
+ 2025-08-31 17:18:20 - pico-train - INFO - ├── Loss: 4.7874
171
+ 2025-08-31 17:18:20 - pico-train - INFO - ├── Learning Rate: 9.90e-05
172
+ 2025-08-31 17:18:20 - pico-train - INFO - └── Inf/NaN count: 0
173
+ 2025-08-31 17:19:15 - pico-train - INFO - Step 101700 -- 🔄 Training Metrics
174
+ 2025-08-31 17:19:15 - pico-train - INFO - ├── Loss: 4.7817
175
+ 2025-08-31 17:19:15 - pico-train - INFO - ├── Learning Rate: 9.89e-05
176
+ 2025-08-31 17:19:15 - pico-train - INFO - └── Inf/NaN count: 0
177
+ 2025-08-31 17:20:09 - pico-train - INFO - Step 101800 -- 🔄 Training Metrics
178
+ 2025-08-31 17:20:09 - pico-train - INFO - ├── Loss: 4.8188
179
+ 2025-08-31 17:20:09 - pico-train - INFO - ├── Learning Rate: 9.87e-05
180
+ 2025-08-31 17:20:09 - pico-train - INFO - └── Inf/NaN count: 0
181
+ 2025-08-31 17:21:04 - pico-train - INFO - Step 101900 -- 🔄 Training Metrics
182
+ 2025-08-31 17:21:04 - pico-train - INFO - ├── Loss: 4.7880
183
+ 2025-08-31 17:21:04 - pico-train - INFO - ├── Learning Rate: 9.86e-05
184
+ 2025-08-31 17:21:04 - pico-train - INFO - └── Inf/NaN count: 0
185
+ 2025-08-31 17:21:58 - pico-train - INFO - Step 102000 -- 💾 Saving Checkpoint
186
+ 2025-08-31 18:00:17 - pico-train - INFO - Step 102000 -- 📊 Evaluation Results
187
+ 2025-08-31 18:00:17 - pico-train - INFO - └── paloma: inf
188
+ 2025-08-31 18:00:17 - pico-train - INFO - Step 102000 -- 🔄 Training Metrics
189
+ 2025-08-31 18:00:17 - pico-train - INFO - ├── Loss: 4.8055
190
+ 2025-08-31 18:00:17 - pico-train - INFO - ├── Learning Rate: 9.84e-05
191
+ 2025-08-31 18:00:17 - pico-train - INFO - └── Inf/NaN count: 0
192
+ 2025-08-31 18:00:17 - pico-train - INFO - Step 102000 -- 📈 Saving Learning Dynamics
193
+ 2025-08-31 18:01:13 - pico-train - INFO - Step 102100 -- 🔄 Training Metrics
194
+ 2025-08-31 18:01:13 - pico-train - INFO - ├── Loss: 4.7742
195
+ 2025-08-31 18:01:13 - pico-train - INFO - ├── Learning Rate: 9.83e-05
196
+ 2025-08-31 18:01:13 - pico-train - INFO - └── Inf/NaN count: 0
197
+ 2025-08-31 18:02:07 - pico-train - INFO - Step 102200 -- 🔄 Training Metrics
198
+ 2025-08-31 18:02:07 - pico-train - INFO - ├── Loss: 4.8050
199
+ 2025-08-31 18:02:07 - pico-train - INFO - ├── Learning Rate: 9.81e-05
200
+ 2025-08-31 18:02:07 - pico-train - INFO - └── Inf/NaN count: 0
201
+ 2025-08-31 18:03:01 - pico-train - INFO - Step 102300 -- 🔄 Training Metrics
202
+ 2025-08-31 18:03:01 - pico-train - INFO - ├── Loss: 4.8066
203
+ 2025-08-31 18:03:01 - pico-train - INFO - ├── Learning Rate: 9.79e-05
204
+ 2025-08-31 18:03:01 - pico-train - INFO - └── Inf/NaN count: 0
205
+ 2025-08-31 18:03:57 - pico-train - INFO - Step 102400 -- 🔄 Training Metrics
206
+ 2025-08-31 18:03:57 - pico-train - INFO - ├── Loss: 4.7865
207
+ 2025-08-31 18:03:57 - pico-train - INFO - ├── Learning Rate: 9.78e-05
208
+ 2025-08-31 18:03:57 - pico-train - INFO - └── Inf/NaN count: 0
209
+ 2025-08-31 18:04:50 - pico-train - INFO - Step 102500 -- 🔄 Training Metrics
210
+ 2025-08-31 18:04:50 - pico-train - INFO - ├── Loss: 4.8019
211
+ 2025-08-31 18:04:50 - pico-train - INFO - ├── Learning Rate: 9.76e-05
212
+ 2025-08-31 18:04:50 - pico-train - INFO - └── Inf/NaN count: 0
213
+ 2025-08-31 18:05:45 - pico-train - INFO - Step 102600 -- 🔄 Training Metrics
214
+ 2025-08-31 18:05:45 - pico-train - INFO - ├── Loss: 4.7948
215
+ 2025-08-31 18:05:45 - pico-train - INFO - ├── Learning Rate: 9.75e-05
216
+ 2025-08-31 18:05:45 - pico-train - INFO - └── Inf/NaN count: 0
217
+ 2025-08-31 18:06:39 - pico-train - INFO - Step 102700 -- 🔄 Training Metrics
218
+ 2025-08-31 18:06:39 - pico-train - INFO - ├── Loss: 4.8006
219
+ 2025-08-31 18:06:39 - pico-train - INFO - ├── Learning Rate: 9.73e-05
220
+ 2025-08-31 18:06:39 - pico-train - INFO - └── Inf/NaN count: 0
221
+ 2025-08-31 18:07:33 - pico-train - INFO - Step 102800 -- 🔄 Training Metrics
222
+ 2025-08-31 18:07:33 - pico-train - INFO - ├── Loss: 4.8049
223
+ 2025-08-31 18:07:33 - pico-train - INFO - ├── Learning Rate: 9.71e-05
224
+ 2025-08-31 18:07:33 - pico-train - INFO - └── Inf/NaN count: 0
225
+ 2025-08-31 18:08:27 - pico-train - INFO - Step 102900 -- 🔄 Training Metrics
226
+ 2025-08-31 18:08:27 - pico-train - INFO - ├── Loss: 4.8086
227
+ 2025-08-31 18:08:27 - pico-train - INFO - ├── Learning Rate: 9.70e-05
228
+ 2025-08-31 18:08:27 - pico-train - INFO - └── Inf/NaN count: 0
229
+ 2025-08-31 18:09:21 - pico-train - INFO - Step 103000 -- 🔄 Training Metrics
230
+ 2025-08-31 18:09:21 - pico-train - INFO - ├── Loss: 4.8154
231
+ 2025-08-31 18:09:21 - pico-train - INFO - ├── Learning Rate: 9.68e-05
232
+ 2025-08-31 18:09:21 - pico-train - INFO - └── Inf/NaN count: 0
233
+ 2025-08-31 18:10:15 - pico-train - INFO - Step 103100 -- 🔄 Training Metrics
234
+ 2025-08-31 18:10:15 - pico-train - INFO - ├── Loss: 4.8232
235
+ 2025-08-31 18:10:15 - pico-train - INFO - ├── Learning Rate: 9.67e-05
236
+ 2025-08-31 18:10:15 - pico-train - INFO - └── Inf/NaN count: 0
237
+ 2025-08-31 18:11:10 - pico-train - INFO - Step 103200 -- 🔄 Training Metrics
238
+ 2025-08-31 18:11:10 - pico-train - INFO - ├── Loss: 4.8032
239
+ 2025-08-31 18:11:10 - pico-train - INFO - ├── Learning Rate: 9.65e-05
240
+ 2025-08-31 18:11:10 - pico-train - INFO - └── Inf/NaN count: 0
241
+ 2025-08-31 18:12:05 - pico-train - INFO - Step 103300 -- 🔄 Training Metrics
242
+ 2025-08-31 18:12:05 - pico-train - INFO - ├── Loss: 4.8157
243
+ 2025-08-31 18:12:05 - pico-train - INFO - ├── Learning Rate: 9.64e-05
244
+ 2025-08-31 18:12:05 - pico-train - INFO - └── Inf/NaN count: 0
245
+ 2025-08-31 18:13:00 - pico-train - INFO - Step 103400 -- 🔄 Training Metrics
246
+ 2025-08-31 18:13:00 - pico-train - INFO - ├── Loss: 4.7903
247
+ 2025-08-31 18:13:00 - pico-train - INFO - ├── Learning Rate: 9.62e-05
248
+ 2025-08-31 18:13:00 - pico-train - INFO - └── Inf/NaN count: 0
249
+ 2025-08-31 18:13:54 - pico-train - INFO - Step 103500 -- 🔄 Training Metrics
250
+ 2025-08-31 18:13:54 - pico-train - INFO - ├── Loss: 4.7786
251
+ 2025-08-31 18:13:54 - pico-train - INFO - ├── Learning Rate: 9.60e-05
252
+ 2025-08-31 18:13:54 - pico-train - INFO - └── Inf/NaN count: 0
253
+ 2025-08-31 18:14:48 - pico-train - INFO - Step 103600 -- 🔄 Training Metrics
254
+ 2025-08-31 18:14:48 - pico-train - INFO - ├── Loss: 4.7962
255
+ 2025-08-31 18:14:48 - pico-train - INFO - ├── Learning Rate: 9.59e-05
256
+ 2025-08-31 18:14:48 - pico-train - INFO - └── Inf/NaN count: 0
257
+ 2025-08-31 18:15:43 - pico-train - INFO - Step 103700 -- 🔄 Training Metrics
258
+ 2025-08-31 18:15:43 - pico-train - INFO - ├── Loss: 4.8097
259
+ 2025-08-31 18:15:43 - pico-train - INFO - ├── Learning Rate: 9.57e-05
260
+ 2025-08-31 18:15:43 - pico-train - INFO - └── Inf/NaN count: 0
261
+ 2025-08-31 18:16:37 - pico-train - INFO - Step 103800 -- 🔄 Training Metrics
262
+ 2025-08-31 18:16:37 - pico-train - INFO - ├── Loss: 4.7613
263
+ 2025-08-31 18:16:37 - pico-train - INFO - ├── Learning Rate: 9.56e-05
264
+ 2025-08-31 18:16:37 - pico-train - INFO - └── Inf/NaN count: 0
265
+ 2025-08-31 18:17:31 - pico-train - INFO - Step 103900 -- 🔄 Training Metrics
266
+ 2025-08-31 18:17:31 - pico-train - INFO - ├── Loss: 4.7992
267
+ 2025-08-31 18:17:31 - pico-train - INFO - ├── Learning Rate: 9.54e-05
268
+ 2025-08-31 18:17:31 - pico-train - INFO - └── Inf/NaN count: 0
269
+ 2025-08-31 18:18:25 - pico-train - INFO - Step 104000 -- 💾 Saving Checkpoint