ThomasTheMaker committed on Aug 29

Commit

3b448ed

verified ·

1 Parent(s): f44ef3c

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_0/config.json +22 -0
pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_0/fabric_state/checkpoint.pt +3 -0
pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_0/learning_dynamics/train_activations.pt +3 -0
pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_0/learning_dynamics/train_data/data-00000-of-00001.arrow +3 -0
pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_0/learning_dynamics/train_data/dataset_info.json +19 -0
pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_0/learning_dynamics/train_data/state.json +13 -0
pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_0/learning_dynamics/train_gradients.pt +3 -0
pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_0/learning_dynamics/train_weights.pt +3 -0
pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_0/model.safetensors +3 -0
pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_0/pico_decoder.py +608 -0
pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_0/special_tokens_map.json +16 -0
pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_0/tokenizer.json +0 -0
pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_0/tokenizer_config.json +239 -0
pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_27/config.json +22 -0
pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_27/fabric_state/checkpoint.pt +3 -0
pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_27/model.safetensors +3 -0
pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_27/pico_decoder.py +608 -0
pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_27/special_tokens_map.json +16 -0
pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_27/tokenizer.json +0 -0
pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_27/tokenizer_config.json +239 -0
pico-decoder-tiny-dolma-teensy-v0/eval_results/step_0.json +1 -0
pico-decoder-tiny-dolma-teensy-v0/eval_results/step_27.json +1 -0
pico-decoder-tiny-dolma-teensy-v0/logs/log_20250828_210922.log +113 -0
pico-decoder-tiny-dolma-teensy-v0/training_config.yaml +74 -0
pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_0/config.json +22 -0
pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_0/fabric_state/checkpoint.pt +3 -0
pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_0/generation_config.json +4 -0
pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_0/learning_dynamics/train_activations.pt +3 -0
pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_0/learning_dynamics/train_data/data-00000-of-00001.arrow +3 -0
pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_0/learning_dynamics/train_data/dataset_info.json +19 -0
pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_0/learning_dynamics/train_data/state.json +13 -0
pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_0/learning_dynamics/train_gradients.pt +3 -0
pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_0/learning_dynamics/train_weights.pt +3 -0
pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_0/model.safetensors +3 -0
pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_0/pico_decoder.py +856 -0
pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_0/special_tokens_map.json +16 -0
pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_0/tokenizer.json +0 -0
pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_0/tokenizer_config.json +239 -0
pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_1000/config.json +22 -0
pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_1000/fabric_state/checkpoint.pt +3 -0
pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_1000/generation_config.json +4 -0
pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_1000/learning_dynamics/train_activations.pt +3 -0
pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_1000/learning_dynamics/train_data/data-00000-of-00001.arrow +3 -0
pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_1000/learning_dynamics/train_data/dataset_info.json +19 -0
pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_1000/learning_dynamics/train_data/state.json +13 -0
pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_1000/learning_dynamics/train_gradients.pt +3 -0
pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_1000/learning_dynamics/train_weights.pt +3 -0
pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_1000/model.safetensors +3 -0
pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_1000/pico_decoder.py +871 -0
pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_1000/special_tokens_map.json +16 -0

pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_0/config.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+  "activation_hidden_dim": 384,
+  "architectures": [
+    "PicoDecoderHF"
+  ],
+  "attention_n_heads": 12,
+  "attention_n_kv_heads": 4,
+  "auto_map": {
+    "AutoConfig": "pico_decoder.PicoDecoderHFConfig",
+    "AutoModelForCausalLM": "pico_decoder.PicoDecoderHF"
+  },
+  "batch_size": 1024,
+  "d_model": 96,
+  "max_seq_len": 2048,
+  "model_type": "pico_decoder",
+  "n_layers": 12,
+  "norm_eps": 1e-06,
+  "position_emb_theta": 10000.0,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.3",
+  "vocab_size": 50304
+}

pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_0/fabric_state/checkpoint.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b50a50fd67e7a1dfa214a074549428c03047ccc26357734db80084015a538b90
+size 45187997

pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_0/learning_dynamics/train_activations.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b6e6e181e18c36507d7cb053f37008011d6846e06f9e345baf9d0663fb288d53
+size 1388635

pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_0/learning_dynamics/train_data/data-00000-of-00001.arrow ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8cf7fcdfd88a10fcfc5c173847b1b6f8926953cc585a896149edefcde9308ba8
+size 4121312

pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_0/learning_dynamics/train_data/dataset_info.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "citation": "",
+  "description": "",
+  "features": {
+    "input_ids": {
+      "feature": {
+        "dtype": "int32",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "text": {
+      "dtype": "string",
+      "_type": "Value"
+    }
+  },
+  "homepage": "",
+  "license": ""
+}

pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_0/learning_dynamics/train_data/state.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "6848e6167d9ecc18",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": null
+}

pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_0/learning_dynamics/train_gradients.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:27e73f3bd443e10701a6786ae83543453f0ffe514be04040edb55b9ff158895d
+size 2371527

pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_0/learning_dynamics/train_weights.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c029ef92a6494ae121c847e432e52e6a8ff3bf7d9fef3e61bef871c1e9a9aa02
+size 2371443

pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_0/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1852515eb5c8556533445f22edf523884b9f8cc44812379a6a951668a4ffa3a3
+size 45143592

pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_0/pico_decoder.py ADDED Viewed

	@@ -0,0 +1,608 @@

+"""
+Pico Decoder: A Lightweight Causal Transformer Language Model
+Pico Decoder uses a simple LLAMA-style transformer architecture, written for clarity and educational purposes.
+Everything is written with a modular design for easy modification and experimentation.
+Key features:
+- RMSNorm for layer normalization
+- Rotary Positional Embeddings (RoPE)
+- Multi-head attention with Grouped Query Attention (GQA) support
+- SwiGLU activation function
+- Residual connections throughout
+- KV-cache for faster autoregressive generation
+References:
+    - RoPE: https://arxiv.org/abs/2104.09864
+    - SwiGLU: https://arxiv.org/abs/2002.05202
+    - LLAMA: https://arxiv.org/abs/2302.13971
+Adapted from:
+    - OLMO: https://github.com/allenai/OLMo
+    - LLAMA: https://github.com/meta-llama/llama
+"""
+from dataclasses import asdict
+from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.attention import SDPBackend, sdpa_kernel
+from transformers import PretrainedConfig, PreTrainedModel
+from transformers.modeling_outputs import CausalLMOutput, CausalLMOutputWithPast
+try:
+    if TYPE_CHECKING:
+        # We need to do this to avoid importing these when creating the HF-compatible models
+        from src.config import ModelConfig
+except ImportError:
+    pass
+########################################################
+#
+# Layer Normalization
+#
+########################################################
+class RMSNorm(torch.nn.Module):
+    """Root Mean Square Layer Normalization.
+    A variant of Layer Normalization that uses RMS statistics instead of mean/variance,
+    resulting in improved stability and performance.
+    Args:
+        config (Union[ModelConfig, PicoHFConfig]): Configuration object containing normalization parameters
+            - config.norm_eps: Small constant for numerical stability
+            - config.d_model: Model dimension for the weight parameter
+    References:
+        https://arxiv.org/abs/1910.07467
+    """
+    def __init__(self, config: Union["ModelConfig", "PicoDecoderHFConfig"]):
+        super().__init__()
+        self.eps = config.norm_eps
+        self.weight = nn.Parameter(torch.ones(config.d_model))
+    def _norm(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Normalizes the input tensor by its RMS value.
+        """
+        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies RMS normalization to the input tensor and scales it by the weight parameter.
+        """
+        output = self._norm(x.float()).type_as(x)
+        return output * self.weight
+########################################################
+#
+# Positional Embedding
+#
+########################################################
+class RoPE(nn.Module):
+    """Rotary Positional Embeddings (RoPE).
+    Implements position-dependent rotation of keys and queries in attention mechanism,
+    allowing better modeling of relative positions in sequences. Uses complex number
+    operations for efficient rotation.
+    Args:
+        config (Union[ModelConfig, PicoHFConfig]): Model configuration containing:
+            - config.position_emb_theta: Base for frequency computation
+            - config.d_model: Model dimension
+            - config.attention_n_heads: Number of attention heads
+            - config.max_seq_len: Maximum sequence length
+    References:
+        https://arxiv.org/abs/2104.09864
+    """
+    _freqs_cis_tensor: torch.Tensor | None = None
+    def __init__(self, config: Union["ModelConfig", "PicoDecoderHFConfig"]):
+        super().__init__()
+        self.theta = config.position_emb_theta
+        self.dim = config.d_model // config.attention_n_heads
+        max_seq_len = config.max_seq_len
+        # only gets set once, and then reused for all RoPE instances
+        if RoPE._freqs_cis_tensor is None:
+            RoPE._freqs_cis_tensor = self._setup_freqs_cis(
+                max_seq_len, self.theta, self.dim
+            )
+        # register _freqs_cis buffer
+        # can be easily recomputed so persistent=False
+        self.register_buffer("_freqs_cis", self._freqs_cis_tensor, persistent=False)
+    @classmethod
+    def _setup_freqs_cis(cls, seq_len: int, theta: float, dim: int) -> torch.Tensor:
+        """Setup Frequency Tensor for RoPE Embeddings
+        Initializes the complex frequency tensor that is used to compute the RoPE embeddings.
+        Note other implementations will use cos and sin directly, but using the complex
+        number representation is (probably?) more efficient:
+            e^(theta * i * t) = cos(theta * t) + i * sin(theta * t) [Euler's formula]
+        """
+        _freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
+        positions = torch.arange(seq_len)
+        freqs = torch.outer(positions, _freqs)
+        return torch.polar(torch.ones_like(freqs), freqs)  # complex64
+    def get_freqs_cis(
+        self, input_shape: torch.Size, start_pos: int, end_pos: int
+    ) -> torch.Tensor:
+        """Reshape Frequency Tensor for RoPE Embeddings
+        Makes the frequency tensor broadcastable with the input tensor.
+        """
+        _freqs_cis = self._freqs_cis[start_pos:end_pos]
+        ndim = len(input_shape)
+        assert 0 <= 1 < ndim
+        assert _freqs_cis.shape == (input_shape[1], input_shape[-1])
+        # TODO: Check whether this is correct (might be able to remove this)
+        shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(input_shape)]
+        return _freqs_cis.view(*shape)
+    def forward(
+        self,
+        queries: torch.Tensor,
+        keys: torch.Tensor,
+        start_pos: int = 0,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Apply RoPE Embeddings to Queries and Keys
+        Applies the rotary positional embeddings to the input tensors via complex num multiplication
+        NOTE: The start_pos is used if we want to use the kv_cache in the attention mechanism.
+        """
+        queries_ = torch.view_as_complex(
+            queries.float().reshape(*queries.shape[:-1], -1, 2)
+        )
+        keys_ = torch.view_as_complex(keys.float().reshape(*keys.shape[:-1], -1, 2))
+        input_shape = (
+            queries_.shape
+        )  # same as keys: (batch_size, seq_len, n_heads, head_dim/2)
+        freqs_start_pos = start_pos
+        freqs_end_pos = freqs_start_pos + queries_.shape[1]
+        freqs_cis = self.get_freqs_cis(input_shape, freqs_start_pos, freqs_end_pos)
+        queries_rotated = torch.view_as_real(queries_ * freqs_cis).flatten(3)
+        keys_rotated = torch.view_as_real(keys_ * freqs_cis).flatten(3)
+        return queries_rotated.type_as(queries), keys_rotated.type_as(keys)
+########################################################
+#
+# Attention
+#
+########################################################
+class Attention(nn.Module):
+    """Multi-head Attention with Group Query Attention support.
+    Implements scaled dot-product attention and supports:
+    - Grouped Query Attention (GQA)
+    - Key-Value caching for efficient inference
+    - RoPE integration
+    Args:
+        config (Union[ModelConfig, PretrainedConfig]): Configuration containing:
+            - config.attention_n_heads: Number of attention heads
+            - config.attention_n_kv_heads: Number of key/value heads
+            - config.d_model: Model dimension
+            - config.batch_size: Maximum batch size
+            - config.max_seq_len: Maximum sequence length
+    Shape:
+        - Input: (batch_size, seq_len, d_model)
+        - Output: (batch_size, seq_len, d_model)
+    """
+    def __init__(
+        self,
+        config: Union["ModelConfig", "PicoDecoderHFConfig"],
+    ):
+        super().__init__()
+        self.n_heads = config.attention_n_heads
+        self.n_kv_heads = config.attention_n_kv_heads
+        self.batch_size = config.batch_size
+        self.max_seq_len = config.max_seq_len
+        d_model = config.d_model
+        self.head_dim = d_model // self.n_heads
+        self.n_rep = self.n_heads // self.n_kv_heads
+        self.q_proj = nn.Linear(d_model, self.n_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(d_model, self.n_kv_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(d_model, self.n_kv_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(self.n_heads * self.head_dim, d_model, bias=False)
+        self.rope = RoPE(config)
+    def forward(
+        self,
+        input: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Tuple[torch.Tensor, ...]] = None,
+        use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
+        """Forward pass for the attention mechanism.
+        Computes queries, keys, and values for the attention mechanism. Applies rotary positional
+        embeddings to the queries and keys, and then computes attention scores and outputs.
+        For an introduction to the attention mechanism, see:
+        https://arxiv.org/abs/1706.03762
+        A few things to note:
+        - The past_key_values is used to implement the KV cache, which is used to speed up
+          generation by caching the KV pairs from previous forward passes. This is useful when doing
+          tasks that require generating multiple tokens conditioned on previous tokens (e.g. language
+          modeling, text generation, etc.). The way the KV cache is implemented is that each layer has
+          its own KV cache - this KV cache is implemented as a tuple.
+        """
+        bsz, seq_len, _ = input.shape
+        _queries, _keys, _values = (
+            self.q_proj(input),
+            self.k_proj(input),
+            self.v_proj(input),
+        )
+        # Reshaping for multi-head attention
+        queries = _queries.view(bsz, seq_len, self.n_heads, self.head_dim)
+        keys = _keys.view(bsz, seq_len, self.n_kv_heads, self.head_dim)
+        values = _values.view(bsz, seq_len, self.n_kv_heads, self.head_dim)
+        # The start position is used to apply the RoPE embeddings to only the new tokens
+        # when using the kv_cache in the attention mechanism.
+        # We want to start from the last position in the cache.
+        start_pos = past_key_values[0].shape[1] if past_key_values is not None else 0
+        # apply rotary positional embeddings
+        queries, keys = self.rope(queries, keys, start_pos)
+        if past_key_values is not None:
+            keys = torch.cat([past_key_values[0], keys], dim=1)
+            values = torch.cat([past_key_values[1], values], dim=1)
+        if use_cache:
+            cached_keys = keys
+            cached_values = values
+        else:
+            cached_keys = None
+            cached_values = None
+        queries = queries.transpose(1, 2)
+        keys = keys.transpose(1, 2)
+        values = values.transpose(1, 2)
+        apply_gqa = self.n_rep > 1
+        if apply_gqa and queries.device.type == "mps":
+            # NOTE: MPS does not support GQA in the SDPA kernel, but we can repeat the keys and values
+            # outside of the kernel to get the same effect.
+            # See: https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
+            keys = keys.repeat_interleave(self.n_rep, dim=-3)
+            values = values.repeat_interleave(self.n_rep, dim=-3)
+            apply_gqa = False
+        backends = [SDPBackend.CUDNN_ATTENTION, SDPBackend.MATH]
+        with sdpa_kernel(backends=backends):
+            attn_output = F.scaled_dot_product_attention(
+                queries.contiguous(),
+                keys.contiguous(),
+                values.contiguous(),
+                attn_mask=mask.to(queries.dtype),
+                enable_gqa=apply_gqa,
+            )
+        attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, seq_len, -1)
+        output = self.o_proj(attn_output)
+        return output, (cached_keys, cached_values)
+########################################################
+#
+# SwiGLU (Combines MLP and Activation)
+#
+########################################################
+class SwiGLU(nn.Module):
+    """SwiGLU Activation Function with Linear Projections.
+    Implements the SwiGLU activation function combined with linear transformations,
+    serving as the feed-forward network in transformer blocks.
+    Args:
+        config (Union[ModelConfig, PicoDecoderHFConfig]): Configuration containing:
+            - config.d_model: Model dimension
+            - config.activation_hidden_dim: Hidden dimension (typically 4 * d_model)
+    References:
+        https://arxiv.org/abs/2002.05202
+    """
+    def __init__(self, config: Union["ModelConfig", "PicoDecoderHFConfig"]):
+        super().__init__()
+        model_dim = config.d_model
+        act_hidden_dim = config.activation_hidden_dim  # usually 4 * d_model
+        self.w_0 = nn.Linear(model_dim, act_hidden_dim, bias=False)
+        self.w_1 = nn.Linear(model_dim, act_hidden_dim, bias=False)
+        self.w_2 = nn.Linear(act_hidden_dim, model_dim, bias=False)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.w_2(F.silu(self.w_0(x)) * self.w_1(x))
+########################################################
+#
+# PicoDecoderBlock
+#
+########################################################
+class PicoDecoderBlock(nn.Module):
+    """Single Transformer Block with Attention and Feed-forward layers.
+    Implements a standard transformer block with:
+    - Multi-head attention with normalization and residual connection
+    - SwiGLU feed-forward network with normalization and residual connection
+    Args:
+        config (Union[ModelConfig, PicoDecoderHFConfig]): Model configuration; either a dataclass or
+            a HuggingFace PicoDecoderHFConfig
+    """
+    def __init__(
+        self,
+        config: Union["ModelConfig", "PicoDecoderHFConfig"],
+    ):
+        super().__init__()
+        self.attention = Attention(config)
+        self.swiglu = SwiGLU(config)
+        self.attention_norm = RMSNorm(config)
+        self.swiglu_norm = RMSNorm(config)
+    def forward(
+        self,
+        input: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Tuple[torch.Tensor]] = None,
+        use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
+        attention_output, cached_key_values = self.attention(
+            self.attention_norm(input),
+            mask=mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+        )
+        # NOTE: cached_key_values is None if use_cache is False
+        h = input + attention_output
+        out = h + self.swiglu(self.swiglu_norm(h))
+        return out, cached_key_values
+########################################################
+#
+# Pico Decoder (Causal Transformer Model)
+#
+########################################################
+class PicoDecoder(nn.Module):
+    """
+    Pico Decoder: combines the embedding, causal decoder blocks, and output projection into a
+    single autoregressive model.
+    For more information on the model, see the classes for the modules that make up the model.
+    """
+    def __init__(
+        self,
+        model_config: Union["ModelConfig", "PicoDecoderHFConfig"],
+    ):
+        super().__init__()
+        self.config = model_config
+        self.embedding_proj = nn.Embedding(self.config.vocab_size, self.config.d_model)
+        self.layers = nn.ModuleList(
+            [PicoDecoderBlock(self.config) for _ in range(self.config.n_layers)]
+        )
+        self.output_norm = RMSNorm(self.config)
+        self.de_embedding_proj = nn.Linear(
+            self.config.d_model, self.config.vocab_size, bias=False
+        )
+    def convert_to_hf_model(self) -> "PicoDecoderHF":
+        """Convert the Lightning model to a HuggingFace model."""
+        # Create HF config without fabric-specific settings
+        hf_config = PicoDecoderHFConfig.from_dataclass(self.config)
+        # Create new HF model
+        hf_model = PicoDecoderHF(hf_config)
+        # Copy state dict, excluding fabric-specific keys
+        hf_model.load_state_dict(self.state_dict(prefix="pico_decoder."))
+        return hf_model
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+        use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[Tuple[Tuple[torch.Tensor, torch.Tensor]]]]:
+        """
+        This is the forward pass for the entire Pico model. It boils down to:
+        - Embedding the input ids
+        - Creating a causal mask
+        - Processing through the pico layers
+        - Projecting the output to logits
+        NOTE: One feature that might be confusing is the KV cache. The KV cache is used to speed up
+        generation by caching the KV pairs from previous forward passes. This is useful when doing
+        tasks that require generating multiple tokens conditioned on previous tokens (e.g. language
+        modeling, text generation, etc.). The way the KV cache is implemented is that each layer has
+        its own KV cache which is stored as a tuple. The whole model then stores a tuple of these
+        KV caches (so a tuple of tuples).
+        """
+        seq_len = input_ids.shape[-1]
+        h = self.embedding_proj(input_ids)
+        # Calculate start position from past cached KV pairs. Remember that each layer has its
+        # own KV Cache. So when we index past_key_values, we need to index into the KV pairs for the
+        # correct layer and then for either the keys or values.
+        start_pos = 0 if past_key_values is None else past_key_values[0][0].shape[1]
+        # Create causal mask for current sequence
+        mask = None
+        if seq_len > 1:
+            mask = torch.full((seq_len, seq_len), float("-inf"))
+            mask = torch.triu(mask, diagonal=1)
+            # If using KV cache, extend mask to cover cached sequence length
+            if past_key_values is not None:
+                # Add zeros for cached tokens (we can attend to all of them)
+                mask = torch.hstack([torch.zeros((seq_len, start_pos)), mask])
+            mask = mask.to(h.device)
+        # NOTE: If we are using the cache, we need to store the cached KV pairs for each layer
+        #       in a tuple. Each layer will have its own cached KV pair which we aggregate in a tuple.
+        cached_key_values = () if use_cache else None
+        # Process through transformer blocks
+        for idx, layer in enumerate(self.layers):
+            layer_past_key_values = (
+                past_key_values[idx] if past_key_values is not None else None
+            )
+            h, layer_cached_key_values = layer(
+                h, mask=mask, past_key_values=layer_past_key_values, use_cache=use_cache
+            )
+            if use_cache:
+                cached_key_values += (layer_cached_key_values,)
+        # Final norm and projection
+        h = self.output_norm(h)
+        logits = self.de_embedding_proj(h).float()
+        return logits, cached_key_values
+########################################################
+#
+# HuggingFace Wrapper for the Pico Decoder model.
+#
+########################################################
+class PicoDecoderHFConfig(PretrainedConfig):
+    """Config class for the Pico Decoder HuggingFace wrapper."""
+    model_type = "pico_decoder"
+    @classmethod
+    def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "PicoDecoderHFConfig":
+        """
+        Initialize config from a dictionary. Note that no kwargs are passed to the constructor --
+        this is because with some kwargs special handling is required and can make this class
+        brittle.
+        """
+        pico_config = cls(**config_dict)
+        return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
+        unused_kwargs = {
+            key: value for key, value in kwargs.items() if not hasattr(pico_config, key)
+        }
+        if return_unused_kwargs:
+            return pico_config, unused_kwargs
+        return pico_config
+    @classmethod
+    def from_dataclass(cls, model_config: "ModelConfig"):
+        """Initialise from our custom config dataclass."""
+        return cls.from_dict(asdict(model_config))
+class PicoDecoderHF(PreTrainedModel):
+    """
+    HuggingFace wrapper for the Pico model.
+    Many evaluation frameworks require a model be setup as a HuggingFace model, so we provide a simple
+    wrapper that does just that. When we save checkpoints of the Pico model, we save both the normal
+    Pico model as well as the model wrapped in this HuggingFace class.
+    This also lets you do cool things like:
+    `model = AutoModelForCausalLM.from_pretrained("path/to/checkpoint")`
+    """
+    config_class = PicoDecoderHFConfig
+    _no_split_modules = ["PicoBlock", "Attention", "SwiGLU", "RMSNorm"]
+    def __init__(self, config: PicoDecoderHFConfig):
+        super().__init__(config)
+        self.pico_decoder = PicoDecoder(config)
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+        use_cache: bool = False,
+        **kwargs,
+    ) -> Union[CausalLMOutput, CausalLMOutputWithPast]:
+        """HuggingFace forward pass wrapper.
+        Forwards pass for the HuggingFace version of the Pico Model. Basic wrapper around the
+        Pico model's forward pass, and returns the output as a HuggingFace CausalLMOutput.
+        """
+        logits, past_key_values = self.pico_decoder(
+            input_ids, past_key_values, use_cache
+        )
+        if use_cache:
+            return CausalLMOutputWithPast(
+                logits=logits,
+                past_key_values=past_key_values,
+            )
+        else:
+            return CausalLMOutput(
+                logits=logits,
+            )
+# Register the config and model with the HF auto classes so a saved checkpoint can be loaded through them
+PicoDecoderHFConfig.register_for_auto_class()
+PicoDecoderHF.register_for_auto_class("AutoModel")  # NOTE(review): the saved config.json auto_map lists no "AutoModel" entry -- confirm this call has a lasting effect
+PicoDecoderHF.register_for_auto_class("AutoModelForCausalLM")  # matches the "AutoModelForCausalLM" entry in the saved config.json auto_map

pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_0/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|padding|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_0/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_0/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,239 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "|||IP_ADDRESS|||",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "1": {
+      "content": "<|padding|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50254": {
+      "content": "                        ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50255": {
+      "content": "                       ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50256": {
+      "content": "                      ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50257": {
+      "content": "                     ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50258": {
+      "content": "                    ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50259": {
+      "content": "                   ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50260": {
+      "content": "                  ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50261": {
+      "content": "                 ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50262": {
+      "content": "                ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50263": {
+      "content": "               ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50264": {
+      "content": "              ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50265": {
+      "content": "             ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50266": {
+      "content": "            ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50267": {
+      "content": "           ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50268": {
+      "content": "          ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50269": {
+      "content": "         ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50270": {
+      "content": "        ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50271": {
+      "content": "       ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50272": {
+      "content": "      ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50273": {
+      "content": "     ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50274": {
+      "content": "    ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50275": {
+      "content": "   ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50276": {
+      "content": "  ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50277": {
+      "content": "|||EMAIL_ADDRESS|||",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50278": {
+      "content": "|||PHONE_NUMBER|||",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50279": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": null,
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<|padding|>",
+  "tokenizer_class": "GPTNeoXTokenizer",
+  "unk_token": null
+}

pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_27/config.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+  "activation_hidden_dim": 384,
+  "architectures": [
+    "PicoDecoderHF"
+  ],
+  "attention_n_heads": 12,
+  "attention_n_kv_heads": 4,
+  "auto_map": {
+    "AutoConfig": "pico_decoder.PicoDecoderHFConfig",
+    "AutoModelForCausalLM": "pico_decoder.PicoDecoderHF"
+  },
+  "batch_size": 1024,
+  "d_model": 96,
+  "max_seq_len": 2048,
+  "model_type": "pico_decoder",
+  "n_layers": 12,
+  "norm_eps": 1e-06,
+  "position_emb_theta": 10000.0,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.3",
+  "vocab_size": 50304
+}

pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_27/fabric_state/checkpoint.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e42d749796c6550ffb318da21c493f94df7f0c48120ac9ecbbd0eb6402fc67ff
+size 135543171

pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_27/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3d7be9a1e9b585a92821668324e20d977c23d51c04b2ade7610f764f62efe829
+size 45143592

pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_27/pico_decoder.py ADDED Viewed

	@@ -0,0 +1,608 @@

+"""
+Pico Decoder: A Lightweight Causal Transformer Language Model
+Pico Decoder uses a simple LLAMA-style transformer architecture, written for clarity and educational purposes.
+Everything is written with a modular design for easy modification and experimentation.
+Key features:
+- RMSNorm for layer normalization
+- Rotary Positional Embeddings (RoPE)
+- Multi-head attention with KV-cache support
+- SwiGLU activation function
+- Residual connections throughout
+- KV-cache for faster autoregressive generation
+References:
+    - RoPE: https://arxiv.org/abs/2104.09864
+    - SwiGLU: https://arxiv.org/abs/2002.05202
+    - LLAMA: https://arxiv.org/abs/2302.13971
+Adapted from:
+    - OLMO: https://github.com/allenai/OLMo
+    - LLAMA: https://github.com/meta-llama/llama
+"""
+from dataclasses import asdict
+from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.attention import SDPBackend, sdpa_kernel
+from transformers import PretrainedConfig, PreTrainedModel
+from transformers.modeling_outputs import CausalLMOutput, CausalLMOutputWithPast
+try:
+    if TYPE_CHECKING:
+        # We need to do this to avoid importing these when creating the HF-compatible models
+        from src.config import ModelConfig
+except ImportError:
+    pass
+########################################################
+#
+# Layer Normalization
+#
+########################################################
+class RMSNorm(torch.nn.Module):
+    """Root Mean Square Layer Normalization.
+    A variant of Layer Normalization that uses RMS statistics instead of mean/variance,
+    resulting in improved stability and performance.
+    Args:
+        config (Union[ModelConfig, PicoHFConfig]): Configuration object containing normalization parameters
+            - config.norm_eps: Small constant for numerical stability
+            - config.d_model: Model dimension for the weight parameter
+    References:
+        https://arxiv.org/abs/1910.07467
+    """
+    def __init__(self, config: Union["ModelConfig", "PicoDecoderHFConfig"]):
+        super().__init__()
+        self.eps = config.norm_eps
+        self.weight = nn.Parameter(torch.ones(config.d_model))
+    def _norm(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Normalizes the input tensor by its RMS value.
+        """
+        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies RMS normalization to the input tensor and scales it by the weight parameter.
+        """
+        output = self._norm(x.float()).type_as(x)
+        return output * self.weight
+########################################################
+#
+# Positional Embedding
+#
+########################################################
+class RoPE(nn.Module):
+    """Rotary Positional Embeddings (RoPE).
+    Implements position-dependent rotation of keys and queries in attention mechanism,
+    allowing better modeling of relative positions in sequences. Uses complex number
+    operations for efficient rotation.
+    Args:
+        config (Union[ModelConfig, PicoHFConfig]): Model configuration containing:
+            - config.position_emb_theta: Base for frequency computation
+            - config.d_model: Model dimension
+            - config.attention_n_heads: Number of attention heads
+            - config.max_seq_len: Maximum sequence length
+    References:
+        https://arxiv.org/abs/2104.09864
+    """
+    _freqs_cis_tensor: torch.Tensor | None = None
+    def __init__(self, config: Union["ModelConfig", "PicoDecoderHFConfig"]):
+        super().__init__()
+        self.theta = config.position_emb_theta
+        self.dim = config.d_model // config.attention_n_heads
+        max_seq_len = config.max_seq_len
+        # only gets set once, and then reused for all RoPE instances
+        if RoPE._freqs_cis_tensor is None:
+            RoPE._freqs_cis_tensor = self._setup_freqs_cis(
+                max_seq_len, self.theta, self.dim
+            )
+        # register _freqs_cis buffer
+        # can be easily recomputed so persistent=False
+        self.register_buffer("_freqs_cis", self._freqs_cis_tensor, persistent=False)
+    @classmethod
+    def _setup_freqs_cis(cls, seq_len: int, theta: float, dim: int) -> torch.Tensor:
+        """Setup Frequency Tensor for RoPE Embeddings
+        Initializes the complex frequency tensor that is used to compute the RoPE embeddings.
+        Note other implementations will use cos and sin directly, but using the complex
+        number representation is (probably?) more efficient:
+            e^(theta * i * t) = cos(theta * t) + i * sin(theta * t) [Euler's formula]
+        """
+        _freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
+        positions = torch.arange(seq_len)
+        freqs = torch.outer(positions, _freqs)
+        return torch.polar(torch.ones_like(freqs), freqs)  # complex64
+    def get_freqs_cis(
+        self, input_shape: torch.Size, start_pos: int, end_pos: int
+    ) -> torch.Tensor:
+        """Reshape Frequency Tensor for RoPE Embeddings
+        Makes the frequency tensor broadcastable with the input tensor.
+        """
+        _freqs_cis = self._freqs_cis[start_pos:end_pos]
+        ndim = len(input_shape)
+        assert 0 <= 1 < ndim
+        assert _freqs_cis.shape == (input_shape[1], input_shape[-1])
+        # TODO: Check whether this is correct (might be able to remove this)
+        shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(input_shape)]
+        return _freqs_cis.view(*shape)
+    def forward(
+        self,
+        queries: torch.Tensor,
+        keys: torch.Tensor,
+        start_pos: int = 0,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Apply RoPE Embeddings to Queries and Keys
+        Applies the rotary positional embeddings to the input tensors via complex num multiplication
+        NOTE: The start_pos is used if we want to use the kv_cache in the attention mechanism.
+        """
+        queries_ = torch.view_as_complex(
+            queries.float().reshape(*queries.shape[:-1], -1, 2)
+        )
+        keys_ = torch.view_as_complex(keys.float().reshape(*keys.shape[:-1], -1, 2))
+        input_shape = (
+            queries_.shape
+        )  # same as keys: (batch_size, seq_len, n_heads, head_dim/2)
+        freqs_start_pos = start_pos
+        freqs_end_pos = freqs_start_pos + queries_.shape[1]
+        freqs_cis = self.get_freqs_cis(input_shape, freqs_start_pos, freqs_end_pos)
+        queries_rotated = torch.view_as_real(queries_ * freqs_cis).flatten(3)
+        keys_rotated = torch.view_as_real(keys_ * freqs_cis).flatten(3)
+        return queries_rotated.type_as(queries), keys_rotated.type_as(keys)
+########################################################
+#
+# Attention
+#
+########################################################
+class Attention(nn.Module):
+    """Multi-head Attention with Group Query Attention support.
+    Implements scaled dot-product attention and supports:
+    - Grouped Query Attention (GQA)
+    - Key-Value caching for efficient inference
+    - RoPE integration
+    Args:
+        config (Union[ModelConfig, PretrainedConfig]): Configuration containing:
+            - config.attention_n_heads: Number of attention heads
+            - config.attention_n_kv_heads: Number of key/value heads
+            - config.d_model: Model dimension
+            - config.batch_size: Maximum batch size
+            - config.max_seq_len: Maximum sequence length
+    Shape:
+        - Input: (batch_size, seq_len, d_model)
+        - Output: (batch_size, seq_len, d_model)
+    """
+    def __init__(
+        self,
+        config: Union["ModelConfig", "PicoDecoderHFConfig"],
+    ):
+        super().__init__()
+        self.n_heads = config.attention_n_heads
+        self.n_kv_heads = config.attention_n_kv_heads
+        self.batch_size = config.batch_size
+        self.max_seq_len = config.max_seq_len
+        d_model = config.d_model
+        self.head_dim = d_model // self.n_heads
+        self.n_rep = self.n_heads // self.n_kv_heads
+        self.q_proj = nn.Linear(d_model, self.n_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(d_model, self.n_kv_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(d_model, self.n_kv_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(self.n_heads * self.head_dim, d_model, bias=False)
+        self.rope = RoPE(config)
+    def forward(
+        self,
+        input: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Tuple[torch.Tensor, ...]] = None,
+        use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
+        """Forward pass for the attention mechanism.
+        Computes queries, keys, and values for the attention mechanism. Applies rotary positional
+        embeddings to the queries and keys, and then computes attention scores and outputs.
+        For an introduction to the attention mechanism, see:
+        https://arxiv.org/abs/1706.03762
+        A few things to note:
+        - The past_key_values is used to implement the KV cache, which is used to speed up
+          generation by caching the KV pairs from previous forward passes. This is useful when doing
+          tasks that require generating multiple tokens conditioned on previous tokens (e.g. language
+          modeling, text generation, etc.). The way the KV cache is implemented is that each layer has
+          its own KV cache - this KV cache is implemented as a tuple.
+        """
+        bsz, seq_len, _ = input.shape
+        _queries, _keys, _values = (
+            self.q_proj(input),
+            self.k_proj(input),
+            self.v_proj(input),
+        )
+        # Reshaping for multi-head attention
+        queries = _queries.view(bsz, seq_len, self.n_heads, self.head_dim)
+        keys = _keys.view(bsz, seq_len, self.n_kv_heads, self.head_dim)
+        values = _values.view(bsz, seq_len, self.n_kv_heads, self.head_dim)
+        # The start position is used to apply the RoPE embeddings to only the new tokens
+        # when using the kv_cache in the attention mechanism.
+        # We want to start from the last position in the cache.
+        start_pos = past_key_values[0].shape[1] if past_key_values is not None else 0
+        # apply rotary positional embeddings
+        queries, keys = self.rope(queries, keys, start_pos)
+        if past_key_values is not None:
+            keys = torch.cat([past_key_values[0], keys], dim=1)
+            values = torch.cat([past_key_values[1], values], dim=1)
+        if use_cache:
+            cached_keys = keys
+            cached_values = values
+        else:
+            cached_keys = None
+            cached_values = None
+        queries = queries.transpose(1, 2)
+        keys = keys.transpose(1, 2)
+        values = values.transpose(1, 2)
+        apply_gqa = self.n_rep > 1
+        if apply_gqa and queries.device.type == "mps":
+            # NOTE: MPS does not support GQA in the SDPA kernel, but we can repeat the keys and values
+            # outside of the kernel to get the same effect.
+            # See: https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
+            keys = keys.repeat_interleave(self.n_rep, dim=-3)
+            values = values.repeat_interleave(self.n_rep, dim=-3)
+            apply_gqa = False
+        backends = [SDPBackend.CUDNN_ATTENTION, SDPBackend.MATH]
+        with sdpa_kernel(backends=backends):
+            attn_output = F.scaled_dot_product_attention(
+                queries.contiguous(),
+                keys.contiguous(),
+                values.contiguous(),
+                attn_mask=mask.to(queries.dtype),
+                enable_gqa=apply_gqa,
+            )
+        attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, seq_len, -1)
+        output = self.o_proj(attn_output)
+        return output, (cached_keys, cached_values)
+########################################################
+#
+# SwiGLU (Combines MLP and Activation)
+#
+########################################################
+class SwiGLU(nn.Module):
+    """SwiGLU Activation Function with Linear Projections.
+    Implements the SwiGLU activation function combined with linear transformations,
+    serving as the feed-forward network in transformer blocks.
+    Args:
+        config (Union[ModelConfig, PicoDecoderHFConfig]): Configuration containing:
+            - config.d_model: Model dimension
+            - config.activation_hidden_dim: Hidden dimension (typically 4 * d_model)
+    References:
+        https://arxiv.org/abs/2002.05202
+    """
+    def __init__(self, config: Union["ModelConfig", "PicoDecoderHFConfig"]):
+        super().__init__()
+        model_dim = config.d_model
+        act_hidden_dim = config.activation_hidden_dim  # usually 4 * d_model
+        self.w_0 = nn.Linear(model_dim, act_hidden_dim, bias=False)
+        self.w_1 = nn.Linear(model_dim, act_hidden_dim, bias=False)
+        self.w_2 = nn.Linear(act_hidden_dim, model_dim, bias=False)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.w_2(F.silu(self.w_0(x)) * self.w_1(x))
+########################################################
+#
+# PicoDecoderBlock
+#
+########################################################
+class PicoDecoderBlock(nn.Module):
+    """Single Transformer Block with Attention and Feed-forward layers.
+    Implements a standard transformer block with:
+    - Multi-head attention with normalization and residual connection
+    - SwiGLU feed-forward network with normalization and residual connection
+    Args:
+        config (Union[ModelConfig, PicoDecoderHFConfig]): Model configuration; either a dataclass or
+            a HuggingFace PicoDecoderHFConfig
+    """
+    def __init__(
+        self,
+        config: Union["ModelConfig", "PicoDecoderHFConfig"],
+    ):
+        super().__init__()
+        self.attention = Attention(config)
+        self.swiglu = SwiGLU(config)
+        self.attention_norm = RMSNorm(config)
+        self.swiglu_norm = RMSNorm(config)
+    def forward(
+        self,
+        input: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Tuple[torch.Tensor]] = None,
+        use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
+        attention_output, cached_key_values = self.attention(
+            self.attention_norm(input),
+            mask=mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+        )
+        # NOTE: cached_key_values is None if use_cache is False
+        h = input + attention_output
+        out = h + self.swiglu(self.swiglu_norm(h))
+        return out, cached_key_values
+########################################################
+#
+# Pico Decoder (Causal Transformer Model)
+#
+########################################################
+class PicoDecoder(nn.Module):
+    """
+    Pico Decoder: combines the embedding, causal decoder blocks, and output projection into a
+    single autoregressive model.
+    For more information on the model, see the classes for the modules that make up the model.
+    """
+    def __init__(
+        self,
+        model_config: Union["ModelConfig", "PicoDecoderHFConfig"],
+    ):
+        super().__init__()
+        self.config = model_config
+        self.embedding_proj = nn.Embedding(self.config.vocab_size, self.config.d_model)
+        self.layers = nn.ModuleList(
+            [PicoDecoderBlock(self.config) for _ in range(self.config.n_layers)]
+        )
+        self.output_norm = RMSNorm(self.config)
+        self.de_embedding_proj = nn.Linear(
+            self.config.d_model, self.config.vocab_size, bias=False
+        )
+    def convert_to_hf_model(self) -> "PicoDecoderHF":
+        """Convert the Lightning model to a HuggingFace model."""
+        # Create HF config without fabric-specific settings
+        hf_config = PicoDecoderHFConfig.from_dataclass(self.config)
+        # Create new HF model
+        hf_model = PicoDecoderHF(hf_config)
+        # Copy state dict, excluding fabric-specific keys
+        hf_model.load_state_dict(self.state_dict(prefix="pico_decoder."))
+        return hf_model
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+        use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[Tuple[Tuple[torch.Tensor, torch.Tensor]]]]:
+        """
+        This is the forward pass for the entire Pico model. It boils down to:
+        - Embedding the input ids
+        - Creating a causal mask
+        - Processing through the pico layers
+        - Projecting the output to logits
+        NOTE: One feature that might be confusing is the KV cache. The KV cache is used to speed up
+        generation by caching the KV pairs from previous forward passes. This is useful when doing
+        tasks that require generating multiple tokens conditioned on previous tokens (e.g. language
+        modeling, text generation, etc.). The way the KV cache is implemented is that each layer has
+        its own KV cache which is stored as a tuple. The whole model then stores a tuple of these
+        KV caches (so a tuple of tuples).
+        """
+        seq_len = input_ids.shape[-1]
+        h = self.embedding_proj(input_ids)
+        # Calculate start position from past cached KV pairs. Remember that each layer has its
+        # own KV Cache. So when we index past_key_values, we need to index into the KV pairs for the
+        # correct layer and then for either the keys or values.
+        start_pos = 0 if past_key_values is None else past_key_values[0][0].shape[1]
+        # Create causal mask for current sequence
+        mask = None
+        if seq_len > 1:
+            mask = torch.full((seq_len, seq_len), float("-inf"))
+            mask = torch.triu(mask, diagonal=1)
+            # If using KV cache, extend mask to cover cached sequence length
+            if past_key_values is not None:
+                # Add zeros for cached tokens (we can attend to all of them)
+                mask = torch.hstack([torch.zeros((seq_len, start_pos)), mask])
+            mask = mask.to(h.device)
+        # NOTE: If we are using the cache, we need to store the cached KV pairs for each layer
+        #       in a tuple. Each layer will have its own cached KV pair which we aggregate in a tuple.
+        cached_key_values = () if use_cache else None
+        # Process through transformer blocks
+        for idx, layer in enumerate(self.layers):
+            layer_past_key_values = (
+                past_key_values[idx] if past_key_values is not None else None
+            )
+            h, layer_cached_key_values = layer(
+                h, mask=mask, past_key_values=layer_past_key_values, use_cache=use_cache
+            )
+            if use_cache:
+                cached_key_values += (layer_cached_key_values,)
+        # Final norm and projection
+        h = self.output_norm(h)
+        logits = self.de_embedding_proj(h).float()
+        return logits, cached_key_values
+########################################################
+#
+# HuggingFace Wrapper for the Pico Decoder model.
+#
+########################################################
+class PicoDecoderHFConfig(PretrainedConfig):
+    """Config class for the Pico Decoder HuggingFace wrapper."""
+    model_type = "pico_decoder"
+    @classmethod
+    def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "PicoDecoderHFConfig":
+        """
+        Initialize config from a dictionary. Note that no kwargs are passed to the constructor --
+        this is because with some kwargs special handling is required and can make this class
+        brittle.
+        """
+        pico_config = cls(**config_dict)
+        return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
+        unused_kwargs = {
+            key: value for key, value in kwargs.items() if not hasattr(pico_config, key)
+        }
+        if return_unused_kwargs:
+            return pico_config, unused_kwargs
+        return pico_config
+    @classmethod
+    def from_dataclass(cls, model_config: "ModelConfig"):
+        """Initialise from our custom config dataclass."""
+        return cls.from_dict(asdict(model_config))
+class PicoDecoderHF(PreTrainedModel):
+    """
+    HuggingFace wrapper for the Pico model.
+    Many evaluation frameworks require a model be setup as a HuggingFace model, so we provide a simple
+    wrapper that does just that. When we save checkpoints of the Pico model, we save both the normal
+    Pico model as well as the model wrapped in this HuggingFace class.
+    This also lets you do cool things like:
+    `model = AutoModelForCausalLM.from_pretrained("path/to/checkpoint")`
+    """
+    config_class = PicoDecoderHFConfig
+    _no_split_modules = ["PicoBlock", "Attention", "SwiGLU", "RMSNorm"]
+    def __init__(self, config: PicoDecoderHFConfig):
+        super().__init__(config)
+        self.pico_decoder = PicoDecoder(config)
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+        use_cache: bool = False,
+        **kwargs,
+    ) -> Union[CausalLMOutput, CausalLMOutputWithPast]:
+        """HuggingFace forward pass wrapper.
+        Forwards pass for the HuggingFace version of the Pico Model. Basic wrapper around the
+        Pico model's forward pass, and returns the output as a HuggingFace CausalLMOutput.
+        """
+        logits, past_key_values = self.pico_decoder(
+            input_ids, past_key_values, use_cache
+        )
+        if use_cache:
+            return CausalLMOutputWithPast(
+                logits=logits,
+                past_key_values=past_key_values,
+            )
+        else:
+            return CausalLMOutput(
+                logits=logits,
+            )
+# Register for auto classes
+# Presumably makes the config and model resolvable through the transformers Auto* loaders
+# when the checkpoint ships this file as custom code (see "auto_map" in config.json).
+# NOTE(review): the saved config.json auto_map lists only AutoConfig and
+# AutoModelForCausalLM, so the "AutoModel" registration looks like it is superseded by
+# the later call -- confirm against transformers' register_for_auto_class.
+PicoDecoderHFConfig.register_for_auto_class()
+PicoDecoderHF.register_for_auto_class("AutoModel")
+PicoDecoderHF.register_for_auto_class("AutoModelForCausalLM")

pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_27/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|padding|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_27/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

pico-decoder-tiny-dolma-teensy-v0/checkpoints/step_27/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,239 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "|||IP_ADDRESS|||",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "1": {
+      "content": "<|padding|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50254": {
+      "content": "                        ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50255": {
+      "content": "                       ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50256": {
+      "content": "                      ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50257": {
+      "content": "                     ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50258": {
+      "content": "                    ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50259": {
+      "content": "                   ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50260": {
+      "content": "                  ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50261": {
+      "content": "                 ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50262": {
+      "content": "                ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50263": {
+      "content": "               ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50264": {
+      "content": "              ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50265": {
+      "content": "             ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50266": {
+      "content": "            ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50267": {
+      "content": "           ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50268": {
+      "content": "          ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50269": {
+      "content": "         ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50270": {
+      "content": "        ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50271": {
+      "content": "       ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50272": {
+      "content": "      ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50273": {
+      "content": "     ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50274": {
+      "content": "    ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50275": {
+      "content": "   ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50276": {
+      "content": "  ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50277": {
+      "content": "|||EMAIL_ADDRESS|||",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50278": {
+      "content": "|||PHONE_NUMBER|||",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50279": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": null,
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<|padding|>",
+  "tokenizer_class": "GPTNeoXTokenizer",
+  "unk_token": null
+}

pico-decoder-tiny-dolma-teensy-v0/eval_results/step_0.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"paloma": 59434.76600609756}

pico-decoder-tiny-dolma-teensy-v0/eval_results/step_27.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"paloma": 59120.39268292683}

pico-decoder-tiny-dolma-teensy-v0/logs/log_20250828_210922.log ADDED Viewed

	@@ -0,0 +1,113 @@

+2025-08-28 21:11:16 - pico-train - INFO - Step 0 -- 📊 Evaluation Results
+2025-08-28 21:11:16 - pico-train - INFO - └── paloma: 59434.76600609756
+2025-08-28 21:11:16 - pico-train - INFO - ==================================================
+2025-08-28 21:11:16 - pico-train - INFO - ✨ Training Configuration
+2025-08-28 21:11:16 - pico-train - INFO - ==================================================
+2025-08-28 21:11:16 - pico-train - INFO - ╭─────────────────────────────────────────────────────╮
+2025-08-28 21:11:16 - pico-train - INFO - │ checkpointing:                                      │
+2025-08-28 21:11:16 - pico-train - INFO - │   checkpoints_dir: checkpoints                      │
+2025-08-28 21:11:16 - pico-train - INFO - │   evaluation:                                       │
+2025-08-28 21:11:16 - pico-train - INFO - │     eval_results_dir: eval_results                  │
+2025-08-28 21:11:16 - pico-train - INFO - │   fabric_checkpoint_dir: fabric_state               │
+2025-08-28 21:11:16 - pico-train - INFO - │   fabric_checkpoint_filename: checkpoint.pt         │
+2025-08-28 21:11:16 - pico-train - INFO - │   hf_checkpoint:                                    │
+2025-08-28 21:11:16 - pico-train - INFO - │     collection_slug: null                           │
+2025-08-28 21:11:16 - pico-train - INFO - │     repo_id: ThomasTheMaker/pico-decoder-tiny       │
+2025-08-28 21:11:16 - pico-train - INFO - │   learning_dynamics:                                │
+2025-08-28 21:11:16 - pico-train - INFO - │     batch_size: 8                                   │
+2025-08-28 21:11:16 - pico-train - INFO - │     eval_data: null                                 │
+2025-08-28 21:11:16 - pico-train - INFO - │     layer_suffixes:                                 │
+2025-08-28 21:11:16 - pico-train - INFO - │     - attention.v_proj                              │
+2025-08-28 21:11:16 - pico-train - INFO - │     - attention.o_proj                              │
+2025-08-28 21:11:16 - pico-train - INFO - │     - swiglu.w_2                                    │
+2025-08-28 21:11:16 - pico-train - INFO - │     sequence_idx: -1                                │
+2025-08-28 21:11:16 - pico-train - INFO - │   learning_dynamics_dir: learning_dynamics          │
+2025-08-28 21:11:16 - pico-train - INFO - │   logs_dir: logs                                    │
+2025-08-28 21:11:16 - pico-train - INFO - │   run_name: pico-decoder-tiny-max-vram              │
+2025-08-28 21:11:16 - pico-train - INFO - │   runs_dir: runs                                    │
+2025-08-28 21:11:16 - pico-train - INFO - │   save_every_n_steps: 1000                          │
+2025-08-28 21:11:16 - pico-train - INFO - │   save_to_hf: true                                  │
+2025-08-28 21:11:16 - pico-train - INFO - │   training:                                         │
+2025-08-28 21:11:16 - pico-train - INFO - │     auto_resume: true                               │
+2025-08-28 21:11:16 - pico-train - INFO - │ data:                                               │
+2025-08-28 21:11:16 - pico-train - INFO - │   dataloader:                                       │
+2025-08-28 21:11:16 - pico-train - INFO - │     batch_size: 64                                  │
+2025-08-28 21:11:16 - pico-train - INFO - │   dataset:                                          │
+2025-08-28 21:11:16 - pico-train - INFO - │     name: pico-lm/pretokenized-dolma-tinsy          │
+2025-08-28 21:11:16 - pico-train - INFO - │   tokenizer:                                        │
+2025-08-28 21:11:16 - pico-train - INFO - │     name: allenai/OLMo-7B-0724-hf                   │
+2025-08-28 21:11:16 - pico-train - INFO - │     vocab_size: 50304                               │
+2025-08-28 21:11:16 - pico-train - INFO - │ evaluation:                                         │
+2025-08-28 21:11:16 - pico-train - INFO - │   metrics:                                          │
+2025-08-28 21:11:16 - pico-train - INFO - │   - paloma                                          │
+2025-08-28 21:11:16 - pico-train - INFO - │   paloma:                                           │
+2025-08-28 21:11:16 - pico-train - INFO - │     batch_size: 2                                   │
+2025-08-28 21:11:16 - pico-train - INFO - │     dataset_name: pico-lm/pretokenized-paloma-tinsy │
+2025-08-28 21:11:16 - pico-train - INFO - │     dataset_split: val                              │
+2025-08-28 21:11:16 - pico-train - INFO - │     max_length: 2048                                │
+2025-08-28 21:11:16 - pico-train - INFO - │ model:                                              │
+2025-08-28 21:11:16 - pico-train - INFO - │   activation_hidden_dim: 384                        │
+2025-08-28 21:11:16 - pico-train - INFO - │   attention_n_heads: 12                             │
+2025-08-28 21:11:16 - pico-train - INFO - │   attention_n_kv_heads: 4                           │
+2025-08-28 21:11:16 - pico-train - INFO - │   batch_size: 1024                                  │
+2025-08-28 21:11:16 - pico-train - INFO - │   d_model: 96                                       │
+2025-08-28 21:11:16 - pico-train - INFO - │   max_seq_len: 2048                                 │
+2025-08-28 21:11:16 - pico-train - INFO - │   model_type: pico_decoder                          │
+2025-08-28 21:11:16 - pico-train - INFO - │   n_layers: 12                                      │
+2025-08-28 21:11:16 - pico-train - INFO - │   norm_eps: 1.0e-06                                 │
+2025-08-28 21:11:16 - pico-train - INFO - │   position_emb_theta: 10000.0                       │
+2025-08-28 21:11:16 - pico-train - INFO - │   vocab_size: 50304                                 │
+2025-08-28 21:11:16 - pico-train - INFO - │ monitoring:                                         │
+2025-08-28 21:11:16 - pico-train - INFO - │   logging:                                          │
+2025-08-28 21:11:16 - pico-train - INFO - │     log_every_n_steps: 100                          │
+2025-08-28 21:11:16 - pico-train - INFO - │     log_level: INFO                                 │
+2025-08-28 21:11:16 - pico-train - INFO - │   save_to_wandb: false                              │
+2025-08-28 21:11:16 - pico-train - INFO - │   wandb:                                            │
+2025-08-28 21:11:16 - pico-train - INFO - │     entity: boymyc                                  │
+2025-08-28 21:11:16 - pico-train - INFO - │     project: pico-decoder-tiny                      │
+2025-08-28 21:11:16 - pico-train - INFO - │ training:                                           │
+2025-08-28 21:11:16 - pico-train - INFO - │   fabric:                                           │
+2025-08-28 21:11:16 - pico-train - INFO - │     accelerator: cuda                               │
+2025-08-28 21:11:16 - pico-train - INFO - │     num_devices: 1                                  │
+2025-08-28 21:11:16 - pico-train - INFO - │     num_nodes: 1                                    │
+2025-08-28 21:11:16 - pico-train - INFO - │     precision: 16-mixed                             │
+2025-08-28 21:11:16 - pico-train - INFO - │   max_steps: 200000                                 │
+2025-08-28 21:11:16 - pico-train - INFO - │   optimization:                                     │
+2025-08-28 21:11:16 - pico-train - INFO - │     gradient_accumulation_steps: 64                 │
+2025-08-28 21:11:16 - pico-train - INFO - │     lr: 0.0003                                      │
+2025-08-28 21:11:16 - pico-train - INFO - │     lr_scheduler: linear_with_warmup                │
+2025-08-28 21:11:16 - pico-train - INFO - │     lr_warmup_steps: 2500                           │
+2025-08-28 21:11:16 - pico-train - INFO - │     optimizer: adamw                                │
+2025-08-28 21:11:16 - pico-train - INFO - │                                                     │
+2025-08-28 21:11:16 - pico-train - INFO - ╰─────────────────────────────────────────────────────╯
+2025-08-28 21:11:16 - pico-train - INFO - ==================================================
+2025-08-28 21:11:16 - pico-train - INFO - ⛭ Runtime Summary:
+2025-08-28 21:11:16 - pico-train - INFO - ==================================================
+2025-08-28 21:11:16 - pico-train - INFO - Starting from step: 0
+2025-08-28 21:11:16 - pico-train - INFO - Model Setup:
+2025-08-28 21:11:16 - pico-train - INFO - └─ Total Parameters: 11,282,784
+2025-08-28 21:11:16 - pico-train - INFO - └─ Trainable Parameters: 11,282,784
+2025-08-28 21:11:16 - pico-train - INFO - Distributed Setup:
+2025-08-28 21:11:16 - pico-train - INFO - └─ Number of Devices: 1
+2025-08-28 21:11:16 - pico-train - INFO - └─ Device Type: NVIDIA GeForce RTX 5090
+2025-08-28 21:11:16 - pico-train - INFO - └─ Available Memory: 33.68 GB
+2025-08-28 21:11:16 - pico-train - INFO - Software Setup:
+2025-08-28 21:11:16 - pico-train - INFO - └─ Python Version: 3.10.12
+2025-08-28 21:11:16 - pico-train - INFO - └─ PyTorch Version: 2.8.0+cu128
+2025-08-28 21:11:16 - pico-train - INFO - └─ CUDA Version: 12.8
+2025-08-28 21:11:16 - pico-train - INFO - └─ Operating System: Linux 6.8.0-63-generic
+2025-08-28 21:11:16 - pico-train - INFO - Batch Size Configuration:
+2025-08-28 21:11:16 - pico-train - INFO - └─ Global Batch Size: 256
+2025-08-28 21:11:16 - pico-train - INFO - └─ Per Device Batch Size: 1
+2025-08-28 21:11:16 - pico-train - INFO - └─ Gradient Accumulation Steps: 256
+2025-08-28 21:11:16 - pico-train - INFO - ==================================================
+2025-08-28 21:11:49 - pico-train - INFO - Step 0 -- 🔄 Training Metrics
+2025-08-28 21:11:49 - pico-train - INFO - ├── Loss: 10.9914
+2025-08-28 21:11:49 - pico-train - INFO - ├── Learning Rate: 0.00e+00
+2025-08-28 21:11:49 - pico-train - INFO - └── Inf/NaN count: 0
+2025-08-28 21:11:49 - pico-train - INFO - Step 0 -- 📈 Saving Learning Dynamics
+2025-08-28 21:26:36 - pico-train - INFO - Step 27 -- 💾 Saving Final Checkpoint
+2025-08-28 21:28:36 - pico-train - INFO - Step 27 -- 📊 Evaluation Results
+2025-08-28 21:28:36 - pico-train - INFO - └── paloma: 59120.39268292683
+2025-08-28 21:28:37 - pico-train - INFO - 🎉 Training complete! Final step: 27
+2025-08-28 21:28:37 - pico-train - WARNING - 	 Note: Training stopped before max steps (200000)

pico-decoder-tiny-dolma-teensy-v0/training_config.yaml ADDED Viewed

	@@ -0,0 +1,74 @@

+checkpointing:
+  checkpoints_dir: checkpoints
+  evaluation:
+    eval_results_dir: eval_results
+  fabric_checkpoint_dir: fabric_state
+  fabric_checkpoint_filename: checkpoint.pt
+  hf_checkpoint:
+    collection_slug: null
+    repo_id: ThomasTheMaker/pico-decoder-tiny
+  learning_dynamics:
+    batch_size: 8
+    eval_data: null
+    layer_suffixes:
+    - attention.v_proj
+    - attention.o_proj
+    - swiglu.w_2
+    sequence_idx: -1
+  learning_dynamics_dir: learning_dynamics
+  logs_dir: logs
+  run_name: pico-decoder-tiny-max-vram
+  runs_dir: runs
+  save_every_n_steps: 1000
+  save_to_hf: true
+  training:
+    auto_resume: true
+data:
+  dataloader:
+    batch_size: 64
+  dataset:
+    name: pico-lm/pretokenized-dolma-tinsy
+  tokenizer:
+    name: allenai/OLMo-7B-0724-hf
+    vocab_size: 50304
+evaluation:
+  metrics:
+  - paloma
+  paloma:
+    batch_size: 2
+    dataset_name: pico-lm/pretokenized-paloma-tinsy
+    dataset_split: val
+    max_length: 2048
+model:
+  activation_hidden_dim: 384
+  attention_n_heads: 12
+  attention_n_kv_heads: 4
+  batch_size: 1024
+  d_model: 96
+  max_seq_len: 2048
+  model_type: pico_decoder
+  n_layers: 12
+  norm_eps: 1.0e-06
+  position_emb_theta: 10000.0
+  vocab_size: 50304
+monitoring:
+  logging:
+    log_every_n_steps: 100
+    log_level: INFO
+  save_to_wandb: false
+  wandb:
+    entity: boymyc
+    project: pico-decoder-tiny
+training:
+  fabric:
+    accelerator: cuda
+    num_devices: 1
+    num_nodes: 1
+    precision: 16-mixed
+  max_steps: 200000
+  optimization:
+    gradient_accumulation_steps: 64
+    lr: 0.0003
+    lr_scheduler: linear_with_warmup
+    lr_warmup_steps: 2500
+    optimizer: adamw

pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_0/config.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+  "activation_hidden_dim": 384,
+  "architectures": [
+    "PicoDecoderHF"
+  ],
+  "attention_n_heads": 12,
+  "attention_n_kv_heads": 4,
+  "auto_map": {
+    "AutoConfig": "pico_decoder.PicoDecoderHFConfig",
+    "AutoModelForCausalLM": "pico_decoder.PicoDecoderHF"
+  },
+  "batch_size": 1024,
+  "d_model": 96,
+  "max_seq_len": 2048,
+  "model_type": "pico_decoder",
+  "n_layers": 12,
+  "norm_eps": 1e-06,
+  "position_emb_theta": 10000.0,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.3",
+  "vocab_size": 50304
+}

pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_0/fabric_state/checkpoint.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b50a50fd67e7a1dfa214a074549428c03047ccc26357734db80084015a538b90
+size 45187997

pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_0/generation_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "transformers_version": "4.48.3",
+  "vocab_size": 50304
+}

pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_0/learning_dynamics/train_activations.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:33fda803f83cb9653b125b70cf8386e39812fa3e30e4746b52db22f5a248be93
+size 33819

pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_0/learning_dynamics/train_data/data-00000-of-00001.arrow ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:85fcf259ee523f219f5133a952ded67c5a339f05dc40df33188f33a1838bb3e0
+size 65384

pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_0/learning_dynamics/train_data/dataset_info.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "citation": "",
+  "description": "",
+  "features": {
+    "input_ids": {
+      "feature": {
+        "dtype": "int32",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "text": {
+      "dtype": "string",
+      "_type": "Value"
+    }
+  },
+  "homepage": "",
+  "license": ""
+}

pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_0/learning_dynamics/train_data/state.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "d0a54608fc979d10",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": null
+}

pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_0/learning_dynamics/train_gradients.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b48826ca39adc92f370b9c3aa0ed42dce5dbf1ffe4fcfe1c320df08c344016bb
+size 2371527

pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_0/learning_dynamics/train_weights.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c029ef92a6494ae121c847e432e52e6a8ff3bf7d9fef3e61bef871c1e9a9aa02
+size 2371443

pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_0/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1852515eb5c8556533445f22edf523884b9f8cc44812379a6a951668a4ffa3a3
+size 45143592

pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_0/pico_decoder.py ADDED Viewed

	@@ -0,0 +1,856 @@

+"""
+Pico Decoder: A Lightweight Causal Transformer Language Model
+Pico Decoder uses a simple LLAMA-style transformer architecture, written for clarity and educational purposes.
+Everything is written with a modular design for easy modification and experimentation.
+Key features:
+- RMSNorm for layer normalization
+- Rotary Positional Embeddings (RoPE)
+- Multi-head attention with KV-cache support
+- SwiGLU activation function
+- Residual connections throughout
+- KV-cache for faster autoregressive generation
+References:
+    - RoPE: https://arxiv.org/abs/2104.09864
+    - SwiGLU: https://arxiv.org/abs/2002.05202
+    - LLAMA: https://arxiv.org/abs/2302.13971
+Adapted from:
+    - OLMO: https://github.com/allenai/OLMo
+    - LLAMA: https://github.com/meta-llama/llama
+"""
+from dataclasses import asdict
+from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.attention import SDPBackend, sdpa_kernel
+from transformers import PretrainedConfig, PreTrainedModel, GenerationMixin
+from transformers.modeling_outputs import CausalLMOutput, CausalLMOutputWithPast
+from transformers.generation import GenerationConfig
+try:
+    if TYPE_CHECKING:
+        # We need to do this to avoid importing these when creating the HF-compatible models
+        from src.config import ModelConfig
+except ImportError:
+    pass
+########################################################
+#
+# Layer Normalization
+#
+########################################################
class RMSNorm(torch.nn.Module):
    """Root-mean-square layer normalization.

    Rescales each vector along the last dimension by the reciprocal of its root mean
    square (no mean subtraction) and then multiplies by a learned per-feature gain.

    Args:
        config (Union[ModelConfig, PicoDecoderHFConfig]): Configuration object providing:
            - config.norm_eps: Constant added to the mean square before the reciprocal
              square root is taken
            - config.d_model: Length of the learned gain vector

    References:
        https://arxiv.org/abs/1910.07467
    """

    def __init__(self, config: Union["ModelConfig", "PicoDecoderHFConfig"]):
        super().__init__()
        self.eps = config.norm_eps
        # The gain starts at one, so the layer initially only normalizes.
        self.weight = nn.Parameter(torch.ones(config.d_model))

    def _norm(self, x: torch.Tensor) -> torch.Tensor:
        """
        Divides x by the RMS of its last dimension (eps is added under the square root).
        """
        mean_square = x.pow(2).mean(-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_square + self.eps)
        return x * inv_rms

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Normalizes x in float32, casts the result back to x's dtype, then applies the gain.
        """
        normalized = self._norm(x.float())
        return normalized.type_as(x) * self.weight
+########################################################
+#
+# Positional Embedding
+#
+########################################################
class RoPE(nn.Module):
    """Rotary Positional Embeddings (RoPE).

    Rotates query and key vectors by a position-dependent angle so that attention
    scores depend on relative positions. The rotation is carried out as an
    element-wise multiplication with a precomputed table of unit complex numbers.

    The table is cached at class level and shared by every RoPE instance created
    with the same (max_seq_len, theta, head_dim). It is rebuilt whenever an instance
    is created with different values, so a model with a different configuration never
    picks up a stale table. Instances that already exist keep the table they
    registered.

    Args:
        config (Union[ModelConfig, PicoDecoderHFConfig]): Model configuration containing:
            - config.position_emb_theta: Base for frequency computation
            - config.d_model: Model dimension
            - config.attention_n_heads: Number of attention heads
            - config.max_seq_len: Maximum sequence length

    References:
        https://arxiv.org/abs/2104.09864
    """

    # Most recently built frequency table, shared across instances.
    _freqs_cis_tensor: Optional[torch.Tensor] = None
    # (max_seq_len, theta, head_dim) that `_freqs_cis_tensor` was built from.
    _freqs_cis_key: Optional[Tuple[int, float, int]] = None

    def __init__(self, config: Union["ModelConfig", "PicoDecoderHFConfig"]):
        super().__init__()
        self.theta = config.position_emb_theta
        self.dim = config.d_model // config.attention_n_heads
        max_seq_len = config.max_seq_len

        # Reuse the shared table only if it was built for exactly these parameters;
        # otherwise a second model with another config would get the wrong table.
        cache_key = (max_seq_len, self.theta, self.dim)
        if RoPE._freqs_cis_tensor is None or RoPE._freqs_cis_key != cache_key:
            RoPE._freqs_cis_tensor = self._setup_freqs_cis(
                max_seq_len, self.theta, self.dim
            )
            RoPE._freqs_cis_key = cache_key

        # register _freqs_cis buffer
        # can be easily recomputed so persistent=False
        self.register_buffer("_freqs_cis", RoPE._freqs_cis_tensor, persistent=False)

    @classmethod
    def _setup_freqs_cis(cls, seq_len: int, theta: float, dim: int) -> torch.Tensor:
        """Setup Frequency Tensor for RoPE Embeddings

        Builds the complex table of shape (seq_len, dim // 2) whose entry [t, j] is
        e^(i * t * theta^(-2j / dim)), i.e. cos(angle) + i * sin(angle) by Euler's
        formula. Other implementations store cos and sin separately; the complex form
        lets the rotation be a single multiplication.
        """
        _freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
        positions = torch.arange(seq_len)
        freqs = torch.outer(positions, _freqs)
        return torch.polar(torch.ones_like(freqs), freqs)  # complex64

    def get_freqs_cis(
        self, input_shape: torch.Size, start_pos: int, end_pos: int
    ) -> torch.Tensor:
        """Reshape Frequency Tensor for RoPE Embeddings

        Slices rows [start_pos, end_pos) out of the table and reshapes them so they
        broadcast against a tensor of shape `input_shape`: dimension 1 (sequence) and
        the last dimension are kept, every other dimension becomes 1.

        Raises:
            AssertionError: If the slice does not match the sequence length and last
                dimension of `input_shape` (for example when end_pos runs past the
                table built for max_seq_len).
        """
        _freqs_cis = self._freqs_cis[start_pos:end_pos]
        ndim = len(input_shape)
        assert 0 <= 1 < ndim, "input must have a sequence dimension at index 1"
        assert _freqs_cis.shape == (input_shape[1], input_shape[-1]), (
            f"RoPE table slice {tuple(_freqs_cis.shape)} does not match input "
            f"(seq_len={input_shape[1]}, head_dim/2={input_shape[-1]}); "
            f"positions {start_pos}:{end_pos} may exceed max_seq_len"
        )
        # TODO: Check whether this is correct (might be able to remove this)
        shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(input_shape)]
        return _freqs_cis.view(*shape)

    def forward(
        self,
        queries: torch.Tensor,
        keys: torch.Tensor,
        start_pos: int = 0,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Apply RoPE Embeddings to Queries and Keys

        Pairs up adjacent features of the last dimension as complex numbers, multiplies
        them by the table rows for positions start_pos .. start_pos + seq_len - 1, and
        converts back to real tensors with the inputs' original dtypes.

        NOTE: The start_pos is used if we want to use the kv_cache in the attention
        mechanism, so that new tokens are rotated by their absolute positions.

        Returns:
            Tuple of (rotated queries, rotated keys) with the same shapes as the inputs.
        """
        queries_ = torch.view_as_complex(
            queries.float().reshape(*queries.shape[:-1], -1, 2)
        )
        keys_ = torch.view_as_complex(keys.float().reshape(*keys.shape[:-1], -1, 2))

        input_shape = (
            queries_.shape
        )  # same as keys: (batch_size, seq_len, n_heads, head_dim/2)
        freqs_start_pos = start_pos
        freqs_end_pos = freqs_start_pos + queries_.shape[1]
        freqs_cis = self.get_freqs_cis(input_shape, freqs_start_pos, freqs_end_pos)

        queries_rotated = torch.view_as_real(queries_ * freqs_cis).flatten(3)
        keys_rotated = torch.view_as_real(keys_ * freqs_cis).flatten(3)
        return queries_rotated.type_as(queries), keys_rotated.type_as(keys)
+########################################################
+#
+# Attention
+#
+########################################################
+class Attention(nn.Module):
+    """Multi-head Attention with Group Query Attention support.
+    Implements scaled dot-product attention and supports:
+    - Grouped Query Attention (GQA)
+    - Key-Value caching for efficient inference
+    - RoPE integration
+    Args:
+        config (Union[ModelConfig, PretrainedConfig]): Configuration containing:
+            - config.attention_n_heads: Number of attention heads
+            - config.attention_n_kv_heads: Number of key/value heads
+            - config.d_model: Model dimension
+            - config.batch_size: Maximum batch size
+            - config.max_seq_len: Maximum sequence length
+    Shape:
+        - Input: (batch_size, seq_len, d_model)
+        - Output: (batch_size, seq_len, d_model)
+    """
+    def __init__(
+        self,
+        config: Union["ModelConfig", "PicoDecoderHFConfig"],
+    ):
+        super().__init__()
+        self.n_heads = config.attention_n_heads
+        self.n_kv_heads = config.attention_n_kv_heads
+        self.batch_size = config.batch_size
+        self.max_seq_len = config.max_seq_len
+        d_model = config.d_model
+        self.head_dim = d_model // self.n_heads
+        self.n_rep = self.n_heads // self.n_kv_heads
+        self.q_proj = nn.Linear(d_model, self.n_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(d_model, self.n_kv_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(d_model, self.n_kv_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(self.n_heads * self.head_dim, d_model, bias=False)
+        self.rope = RoPE(config)
+    def forward(
+        self,
+        input: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Tuple[torch.Tensor, ...]] = None,
+        use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
+        """Forward pass for the attention mechanism.
+        Computes queries, keys, and values for the attention mechanism. Applies rotary positional
+        embeddings to the queries and keys, and then computes attention scores and outputs.
+        For an introduction to the attention mechanism, see:
+        https://arxiv.org/abs/1706.03762
+        A few things to note:
+        - The past_key_values is used to implement the KV cache, which is used to speed up
+          generation by caching the KV pairs from previous forward passes. This is useful when doing
+          tasks that require generating multiple tokens conditioned on previous tokens (e.g. language
+          modeling, text generation, etc.). The way the KV cache is implemented is that each layer has
+          its own KV cache - this KV cache is implemented as a tuple.
+        """
+        bsz, seq_len, _ = input.shape
+        _queries, _keys, _values = (
+            self.q_proj(input),
+            self.k_proj(input),
+            self.v_proj(input),
+        )
+        # Reshaping for multi-head attention
+        queries = _queries.view(bsz, seq_len, self.n_heads, self.head_dim)
+        keys = _keys.view(bsz, seq_len, self.n_kv_heads, self.head_dim)
+        values = _values.view(bsz, seq_len, self.n_kv_heads, self.head_dim)
+        # The start position is used to apply the RoPE embeddings to only the new tokens
+        # when using the kv_cache in the attention mechanism.
+        # We want to start from the last position in the cache.
+        start_pos = past_key_values[0].shape[1] if past_key_values is not None else 0
+        # apply rotary positional embeddings
+        queries, keys = self.rope(queries, keys, start_pos)
+        if past_key_values is not None:
+            keys = torch.cat([past_key_values[0], keys], dim=1)
+            values = torch.cat([past_key_values[1], values], dim=1)
+        if use_cache:
+            cached_keys = keys
+            cached_values = values
+        else:
+            cached_keys = None
+            cached_values = None
+        queries = queries.transpose(1, 2)
+        keys = keys.transpose(1, 2)
+        values = values.transpose(1, 2)
+        apply_gqa = self.n_rep > 1
+        if apply_gqa and queries.device.type == "mps":
+            # NOTE: MPS does not support GQA in the SDPA kernel, but we can repeat the keys and values
+            # outside of the kernel to get the same effect.
+            # See: https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
+            keys = keys.repeat_interleave(self.n_rep, dim=-3)
+            values = values.repeat_interleave(self.n_rep, dim=-3)
+            apply_gqa = False
+        backends = [SDPBackend.CUDNN_ATTENTION, SDPBackend.MATH]
+        with sdpa_kernel(backends=backends):
+            attn_output = F.scaled_dot_product_attention(
+                queries.contiguous(),
+                keys.contiguous(),
+                values.contiguous(),
+                attn_mask=mask.to(queries.dtype) if mask is not None else None,
+                enable_gqa=apply_gqa,
+            )
+        attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, seq_len, -1)
+        output = self.o_proj(attn_output)
+        return output, (cached_keys, cached_values)
+########################################################
+#
+# SwiGLU (Combines MLP and Activation)
+#
+########################################################
class SwiGLU(nn.Module):
    """SwiGLU feed-forward network.

    Projects the input twice into a hidden space, gates one projection (after SiLU)
    with the other by element-wise multiplication, and projects the product back to
    the model dimension. None of the projections has a bias.

    Args:
        config (Union[ModelConfig, PicoDecoderHFConfig]): Configuration containing:
            - config.d_model: Model dimension
            - config.activation_hidden_dim: Hidden dimension (typically 4 * d_model)

    References:
        https://arxiv.org/abs/2002.05202
    """

    def __init__(self, config: Union["ModelConfig", "PicoDecoderHFConfig"]):
        super().__init__()
        d_in = config.d_model
        d_hidden = config.activation_hidden_dim  # usually 4 * d_model

        # w_0 feeds the SiLU gate, w_1 is the linear branch, w_2 projects back down.
        self.w_0 = nn.Linear(d_in, d_hidden, bias=False)
        self.w_1 = nn.Linear(d_in, d_hidden, bias=False)
        self.w_2 = nn.Linear(d_hidden, d_in, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return w_2(silu(w_0(x)) * w_1(x))."""
        gate = F.silu(self.w_0(x))
        gated = gate * self.w_1(x)
        return self.w_2(gated)
+########################################################
+#
+# PicoDecoderBlock
+#
+########################################################
+class PicoDecoderBlock(nn.Module):
+    """Single Transformer Block with Attention and Feed-forward layers.
+    Implements a standard transformer block with:
+    - Multi-head attention with normalization and residual connection
+    - SwiGLU feed-forward network with normalization and residual connection
+    Args:
+        config (Union[ModelConfig, PicoDecoderHFConfig]): Model configuration; either a dataclass or
+            a HuggingFace PicoDecoderHFConfig
+    """
+    def __init__(
+        self,
+        config: Union["ModelConfig", "PicoDecoderHFConfig"],
+    ):
+        super().__init__()
+        self.attention = Attention(config)
+        self.swiglu = SwiGLU(config)
+        self.attention_norm = RMSNorm(config)
+        self.swiglu_norm = RMSNorm(config)
+    def forward(
+        self,
+        input: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Tuple[torch.Tensor]] = None,
+        use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
+        attention_output, cached_key_values = self.attention(
+            self.attention_norm(input),
+            mask=mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+        )
+        # NOTE: cached_key_values is None if use_cache is False
+        h = input + attention_output
+        out = h + self.swiglu(self.swiglu_norm(h))
+        return out, cached_key_values
+########################################################
+#
+# Pico Decoder (Causal Transformer Model)
+#
+########################################################
+class PicoDecoder(nn.Module):
+    """
+    Pico Decoder: combines the embedding, causal decoder blocks, and output projection into a
+    single autoregressive model.
+    For more information on the model, see the classes for the modules that make up the model.
+    """
+    def __init__(
+        self,
+        model_config: Union["ModelConfig", "PicoDecoderHFConfig"],
+    ):
+        super().__init__()
+        self.config = model_config
+        self.embedding_proj = nn.Embedding(self.config.vocab_size, self.config.d_model)
+        self.layers = nn.ModuleList(
+            [PicoDecoderBlock(self.config) for _ in range(self.config.n_layers)]
+        )
+        self.output_norm = RMSNorm(self.config)
+        self.de_embedding_proj = nn.Linear(
+            self.config.d_model, self.config.vocab_size, bias=False
+        )
+    def convert_to_hf_model(self) -> "PicoDecoderHF":
+        """Convert the Lightning model to a HuggingFace model."""
+        # Create HF config without fabric-specific settings
+        hf_config = PicoDecoderHFConfig.from_dataclass(self.config)
+        # Create new HF model
+        hf_model = PicoDecoderHF(hf_config)
+        # Copy state dict, excluding fabric-specific keys
+        hf_model.load_state_dict(self.state_dict(prefix="pico_decoder."))
+        return hf_model
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+        use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[Tuple[Tuple[torch.Tensor, torch.Tensor]]]]:
+        """
+        This is the forward pass for the entire Pico model. It boils down to:
+        - Embedding the input ids
+        - Creating a causal mask
+        - Processing through the pico layers
+        - Projecting the output to logits
+        NOTE: One feature that might be confusing is the KV cache. The KV cache is used to speed up
+        generation by caching the KV pairs from previous forward passes. This is useful when doing
+        tasks that require generating multiple tokens conditioned on previous tokens (e.g. language
+        modeling, text generation, etc.). The way the KV cache is implemented is that each layer has
+        its own KV cache which is stored as a tuple. The whole model then stores a tuple of these
+        KV caches (so a tuple of tuples).
+        """
+        seq_len = input_ids.shape[-1]
+        h = self.embedding_proj(input_ids)
+        # Calculate start position from past cached KV pairs. Remember that each layer has its
+        # own KV Cache. So when we index past_key_values, we need to index into the KV pairs for the
+        # correct layer and then for either the keys or values.
+        start_pos = 0 if past_key_values is None else past_key_values[0][0].shape[1]
+        # Create causal mask for current sequence
+        mask = None
+        if seq_len > 1:
+            mask = torch.full((seq_len, seq_len), float("-inf"))
+            mask = torch.triu(mask, diagonal=1)
+            # If using KV cache, extend mask to cover cached sequence length
+            if past_key_values is not None:
+                # Add zeros for cached tokens (we can attend to all of them)
+                mask = torch.hstack([torch.zeros((seq_len, start_pos)), mask])
+            mask = mask.to(h.device)
+        # NOTE: If we are using the cache, we need to store the cached KV pairs for each layer
+        #       in a tuple. Each layer will have its own cached KV pair which we aggregate in a tuple.
+        cached_key_values = () if use_cache else None
+        # Process through transformer blocks
+        for idx, layer in enumerate(self.layers):
+            layer_past_key_values = (
+                past_key_values[idx] if past_key_values is not None else None
+            )
+            h, layer_cached_key_values = layer(
+                h, mask=mask, past_key_values=layer_past_key_values, use_cache=use_cache
+            )
+            if use_cache:
+                cached_key_values += (layer_cached_key_values,)
+        # Final norm and projection
+        h = self.output_norm(h)
+        logits = self.de_embedding_proj(h).float()
+        return logits, cached_key_values
+########################################################
+#
+# HuggingFace Wrapper for the Pico Decoder model.
+#
+########################################################
+class PicoDecoderHFConfig(PretrainedConfig):
+    """Config class for the Pico Decoder HuggingFace wrapper."""
+    model_type = "pico_decoder"
+    @classmethod
+    def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "PicoDecoderHFConfig":
+        """
+        Initialize config from a dictionary. Note that no kwargs are passed to the constructor --
+        this is because with some kwargs special handling is required and can make this class
+        brittle.
+        """
+        pico_config = cls(**config_dict)
+        return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
+        unused_kwargs = {
+            key: value for key, value in kwargs.items() if not hasattr(pico_config, key)
+        }
+        if return_unused_kwargs:
+            return pico_config, unused_kwargs
+        return pico_config
+    @classmethod
+    def from_dataclass(cls, model_config: "ModelConfig"):
+        """Initialise from our custom config dataclass."""
+        return cls.from_dict(asdict(model_config))
+class PicoDecoderHF(PreTrainedModel, GenerationMixin):
+    """
+    HuggingFace wrapper for the Pico model with generation support.
+    Many evaluation frameworks require a model be setup as a HuggingFace model, so we provide a simple
+    wrapper that does just that. When we save checkpoints of the Pico model, we save both the normal
+    Pico model as well as the model wrapped in this HuggingFace class.
+    This also lets you do cool things like:
+    `model = AutoModelForCausalLM.from_pretrained("path/to/checkpoint")`
+    """
+    config_class = PicoDecoderHFConfig
+    _no_split_modules = ["PicoBlock", "Attention", "SwiGLU", "RMSNorm"]
+    main_input_name = "input_ids"
+    def __init__(self, config: PicoDecoderHFConfig):
+        super().__init__(config)
+        self.pico_decoder = PicoDecoder(config)
+        # Initialize generation config with defaults
+        self.generation_config = GenerationConfig()
+        # Set some reasonable defaults for the model
+        if hasattr(config, 'max_position_embeddings'):
+            self.generation_config.max_length = config.max_position_embeddings
+        if hasattr(config, 'vocab_size'):
+            self.generation_config.vocab_size = config.vocab_size
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+        use_cache: bool = False,
+        **kwargs,
+    ) -> Union[CausalLMOutput, CausalLMOutputWithPast]:
+        """HuggingFace forward pass wrapper.
+        Forwards pass for the HuggingFace version of the Pico Model. Basic wrapper around the
+        Pico model's forward pass, and returns the output as a HuggingFace CausalLMOutput.
+        """
+        logits, past_key_values = self.pico_decoder(
+            input_ids, past_key_values, use_cache
+        )
+        if use_cache:
+            return CausalLMOutputWithPast(
+                logits=logits,
+                past_key_values=past_key_values,
+            )
+        else:
+            return CausalLMOutput(
+                logits=logits,
+            )
+    def prepare_inputs_for_generation(
+        self,
+        input_ids: torch.LongTensor,
+        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Prepare inputs for generation.
+        Args:
+            input_ids: Input token IDs
+            past_key_values: Cached key-value pairs from previous forward passes
+            attention_mask: Attention mask for the input
+            **kwargs: Additional arguments
+        Returns:
+            Dictionary containing prepared inputs
+        """
+        # If we have past_key_values, we only need the last token
+        if past_key_values is not None:
+            input_ids = input_ids[:, -1:]
+        return {
+            "input_ids": input_ids,
+            "past_key_values": past_key_values,
+            "use_cache": True,
+        }
+    def get_input_embeddings(self):
+        """Get the input embeddings layer."""
+        return self.pico_decoder.embedding_proj
+    def set_input_embeddings(self, value):
+        """Set the input embeddings layer."""
+        self.pico_decoder.embedding_proj = value
+    def get_output_embeddings(self):
+        """Get the output embeddings layer."""
+        return self.pico_decoder.de_embedding_proj
+    def set_output_embeddings(self, value):
+        """Set the output embeddings layer."""
+        self.pico_decoder.de_embedding_proj = value
+    def get_lm_head(self):
+        """Get the language model head."""
+        return self.pico_decoder.de_embedding_proj
+    def can_generate(self) -> bool:
+        """Check if the model can generate text."""
+        return True
+    @property
+    def is_encoder_decoder(self) -> bool:
+        """Check if the model is an encoder-decoder model."""
+        return False
+    @property
+    def can_use_cache(self) -> bool:
+        """Check if the model can use KV cache."""
+        return True
+    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> torch.nn.Embedding:
+        """Resize token embeddings."""
+        old_embeddings = self.get_input_embeddings()
+        if new_num_tokens is None:
+            new_num_tokens = old_embeddings.num_embeddings
+        new_embeddings = torch.nn.Embedding(new_num_tokens, old_embeddings.embedding_dim)
+        new_embeddings.weight.data[:old_embeddings.num_embeddings] = old_embeddings.weight.data
+        self.pico_decoder.embedding_proj = new_embeddings
+        self.pico_decoder.de_embedding_proj = torch.nn.Linear(
+            old_embeddings.embedding_dim, new_num_tokens, bias=False
+        )
+        return new_embeddings
+# Register the config and the HF wrapper for use with the transformers Auto* classes.
+PicoDecoderHFConfig.register_for_auto_class()
+# NOTE(review): PicoDecoderHF is registered twice. The checkpoint config.json files only
+# list AutoConfig and AutoModelForCausalLM in auto_map, so the "AutoModel" registration
+# appears not to survive the second call -- confirm against register_for_auto_class.
+PicoDecoderHF.register_for_auto_class("AutoModel")
+PicoDecoderHF.register_for_auto_class("AutoModelForCausalLM")
+########################################################
+#
+# New PicoDecoderForCausalLM class for generation support
+#
+########################################################
+class PicoDecoderForCausalLM(PreTrainedModel, GenerationMixin):
+    """
+    PicoDecoderForCausalLM: A HuggingFace-compatible model that properly supports generation.
+    This class is designed to work with existing checkpoints and provides full generation support.
+    It inherits from the right base classes that HuggingFace expects for text generation.
+    """
+    config_class = PicoDecoderHFConfig
+    _no_split_modules = ["PicoBlock", "Attention", "SwiGLU", "RMSNorm"]
+    main_input_name = "input_ids"
+    def __init__(self, config: PicoDecoderHFConfig):
+        super().__init__(config)
+        self.pico_decoder = PicoDecoder(config)
+        # Initialize generation config with defaults
+        self.generation_config = GenerationConfig()
+        # Set some reasonable defaults for the model
+        if hasattr(config, 'max_position_embeddings'):
+            self.generation_config.max_length = config.max_position_embeddings
+        if hasattr(config, 'vocab_size'):
+            self.generation_config.vocab_size = config.vocab_size
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+        use_cache: bool = False,
+        **kwargs,
+    ) -> Union[CausalLMOutput, CausalLMOutputWithPast]:
+        """Forward pass for text generation."""
+        logits, past_key_values = self.pico_decoder(
+            input_ids, past_key_values, use_cache
+        )
+        if use_cache:
+            return CausalLMOutputWithPast(
+                logits=logits,
+                past_key_values=past_key_values,
+            )
+        else:
+            return CausalLMOutput(
+                logits=logits,
+            )
+    def prepare_inputs_for_generation(
+        self,
+        input_ids: torch.LongTensor,
+        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """Prepare inputs for generation."""
+        # If we have past_key_values, we only need the last token
+        if past_key_values is not None:
+            input_ids = input_ids[:, -1:]
+        return {
+            "input_ids": input_ids,
+            "past_key_values": past_key_values,
+            "use_cache": True,
+        }
+    def get_input_embeddings(self):
+        """Get the input embeddings layer."""
+        return self.pico_decoder.embedding_proj
+    def set_input_embeddings(self, value):
+        """Set the input embeddings layer."""
+        self.pico_decoder.embedding_proj = value
+    def get_output_embeddings(self):
+        """Get the output embeddings layer."""
+        return self.pico_decoder.de_embedding_proj
+    def set_output_embeddings(self, value):
+        """Set the output embeddings layer."""
+        self.pico_decoder.de_embedding_proj = value
+    def get_lm_head(self):
+        """Get the language model head."""
+        return self.pico_decoder.de_embedding_proj
+    def can_generate(self) -> bool:
+        """Check if the model can generate text."""
+        return True
+    @property
+    def is_encoder_decoder(self) -> bool:
+        """Check if the model is an encoder-decoder model."""
+        return False
+    @property
+    def can_use_cache(self) -> bool:
+        """Check if the model can use KV cache."""
+        return True
+    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> torch.nn.Embedding:
+        """Resize token embeddings."""
+        old_embeddings = self.get_input_embeddings()
+        if new_num_tokens is None:
+            new_num_tokens = old_embeddings.num_embeddings
+        new_embeddings = torch.nn.Embedding(new_num_tokens, old_embeddings.embedding_dim)
+        new_embeddings.weight.data[:old_embeddings.num_embeddings] = old_embeddings.weight.data
+        self.pico_decoder.embedding_proj = new_embeddings
+        self.pico_decoder.de_embedding_proj = torch.nn.Linear(
+            old_embeddings.embedding_dim, new_num_tokens, bias=False
+        )
+        return new_embeddings
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        """
+        Load a pretrained model from a checkpoint.
+        This method handles loading from both the old PicoDecoderHF format and the new format.
+        """
+        # First try to load with the new class
+        try:
+            return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        except Exception as e:
+            print(f"Failed to load with new class: {e}")
+            print("Attempting to load with legacy class and convert...")
+            # Try to load with the old class and convert
+            try:
+                from transformers import AutoModel
+                old_model = AutoModel.from_pretrained(
+                    pretrained_model_name_or_path,
+                    trust_remote_code=True,
+                    *model_args,
+                    **kwargs
+                )
+                # Create new model instance
+                new_model = cls(old_model.config)
+                # Copy state dict
+                new_model.load_state_dict(old_model.state_dict(), strict=False)
+                return new_model
+            except Exception as e2:
+                print(f"Failed to convert from legacy format: {e2}")
+                raise e
+# Register the new class for use with AutoModelForCausalLM.
+# NOTE(review): the checkpoint config.json files map AutoModelForCausalLM to
+# pico_decoder.PicoDecoderHF, not to this class -- confirm which one is meant to be loaded.
+PicoDecoderForCausalLM.register_for_auto_class("AutoModelForCausalLM")

pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_0/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|padding|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_0/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_0/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,239 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "|||IP_ADDRESS|||",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "1": {
+      "content": "<|padding|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50254": {
+      "content": "                        ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50255": {
+      "content": "                       ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50256": {
+      "content": "                      ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50257": {
+      "content": "                     ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50258": {
+      "content": "                    ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50259": {
+      "content": "                   ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50260": {
+      "content": "                  ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50261": {
+      "content": "                 ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50262": {
+      "content": "                ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50263": {
+      "content": "               ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50264": {
+      "content": "              ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50265": {
+      "content": "             ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50266": {
+      "content": "            ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50267": {
+      "content": "           ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50268": {
+      "content": "          ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50269": {
+      "content": "         ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50270": {
+      "content": "        ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50271": {
+      "content": "       ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50272": {
+      "content": "      ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50273": {
+      "content": "     ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50274": {
+      "content": "    ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50275": {
+      "content": "   ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50276": {
+      "content": "  ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50277": {
+      "content": "|||EMAIL_ADDRESS|||",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50278": {
+      "content": "|||PHONE_NUMBER|||",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50279": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": null,
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<|padding|>",
+  "tokenizer_class": "GPTNeoXTokenizer",
+  "unk_token": null
+}

pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_1000/config.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+  "activation_hidden_dim": 384,
+  "architectures": [
+    "PicoDecoderHF"
+  ],
+  "attention_n_heads": 12,
+  "attention_n_kv_heads": 4,
+  "auto_map": {
+    "AutoConfig": "pico_decoder.PicoDecoderHFConfig",
+    "AutoModelForCausalLM": "pico_decoder.PicoDecoderHF"
+  },
+  "batch_size": 1024,
+  "d_model": 96,
+  "max_seq_len": 2048,
+  "model_type": "pico_decoder",
+  "n_layers": 12,
+  "norm_eps": 1e-06,
+  "position_emb_theta": 10000.0,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.3",
+  "vocab_size": 50304
+}

pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_1000/fabric_state/checkpoint.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a3e80057e3aeee9020555bb47c5510dffa49aa7bb95aa28626c755fd1bcd84c6
+size 135543171

pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_1000/generation_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "transformers_version": "4.48.3",
+  "vocab_size": 50304
+}

pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_1000/learning_dynamics/train_activations.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7e89d5ed24dce96b1c7926d0525d09f6fc80cd7ce982fdf4cb66817dcfbaeba9
+size 33819

pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_1000/learning_dynamics/train_data/data-00000-of-00001.arrow ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2b91f61ac8aae1d61544370a4c754bda11f9501a2a8b2bdae615ead87385d6d0
+size 64520

pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_1000/learning_dynamics/train_data/dataset_info.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "citation": "",
+  "description": "",
+  "features": {
+    "input_ids": {
+      "feature": {
+        "dtype": "int32",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "text": {
+      "dtype": "string",
+      "_type": "Value"
+    }
+  },
+  "homepage": "",
+  "license": ""
+}

pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_1000/learning_dynamics/train_data/state.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "6c5c8added4701f3",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": null
+}

pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_1000/learning_dynamics/train_gradients.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7cfeda57ef54a270df54117da31b7fa317f97f60870cd825f2f063392e85c1ad
+size 2371527

pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_1000/learning_dynamics/train_weights.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e3b41e2ffb9726f463b88554fa3adf500de7cd5e7700cd3c15a0052d19e80ed3
+size 2371443

pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_1000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1584cdf11592f978f9dd63c44fd15eec69dbc665b9b4c7a45d89a8f736931968
+size 45143592

pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_1000/pico_decoder.py ADDED Viewed

	@@ -0,0 +1,871 @@

+"""
+Pico Decoder: A Lightweight Causal Transformer Language Model
+Pico Decoder uses a simple LLAMA-style transformer architecture, written for clarity and educational purposes.
+Everything is written with a modular design for easy modification and experimentation.
+Key features:
+- RMSNorm for layer normalization
+- Rotary Positional Embeddings (RoPE)
+- Multi-head attention with KV-cache support
+- SwiGLU activation function
+- Residual connections throughout
+- KV-cache for faster autoregressive generation
+References:
+    - RoPE: https://arxiv.org/abs/2104.09864
+    - SwiGLU: https://arxiv.org/abs/2002.05202
+    - LLAMA: https://arxiv.org/abs/2302.13971
+Adapted from:
+    - OLMO: https://github.com/allenai/OLMo
+    - LLAMA: https://github.com/meta-llama/llama
+"""
+from dataclasses import asdict
+from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.attention import SDPBackend, sdpa_kernel
+from transformers import GenerationMixin, PretrainedConfig, PreTrainedModel
+from transformers.generation import GenerationConfig
+from transformers.modeling_outputs import CausalLMOutput, CausalLMOutputWithPast
+try:
+    if TYPE_CHECKING:
+        # We need to do this to avoid importing these when creating the HF-compatible models
+        from src.config import ModelConfig
+except ImportError:
+    pass
+########################################################
+#
+# Layer Normalization
+#
+########################################################
+class RMSNorm(torch.nn.Module):
+    """Root Mean Square Layer Normalization.
+    A variant of Layer Normalization that uses RMS statistics instead of mean/variance,
+    resulting in improved stability and performance.
+    Args:
+        config (Union[ModelConfig, PicoHFConfig]): Configuration object containing normalization parameters
+            - config.norm_eps: Small constant for numerical stability
+            - config.d_model: Model dimension for the weight parameter
+    References:
+        https://arxiv.org/abs/1910.07467
+    """
+    def __init__(self, config: Union["ModelConfig", "PicoDecoderHFConfig"]):
+        super().__init__()
+        self.eps = config.norm_eps
+        self.weight = nn.Parameter(torch.ones(config.d_model))
+    def _norm(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Normalizes the input tensor by its RMS value.
+        """
+        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies RMS normalization to the input tensor and scales it by the weight parameter.
+        """
+        output = self._norm(x.float()).type_as(x)
+        return output * self.weight
+########################################################
+#
+# Positional Embedding
+#
+########################################################
+class RoPE(nn.Module):
+    """Rotary Positional Embeddings (RoPE).
+    Implements position-dependent rotation of keys and queries in attention mechanism,
+    allowing better modeling of relative positions in sequences. Uses complex number
+    operations for efficient rotation.
+    Args:
+        config (Union[ModelConfig, PicoDecoderHFConfig]): Model configuration containing:
+            - config.position_emb_theta: Base for frequency computation
+            - config.d_model: Model dimension
+            - config.attention_n_heads: Number of attention heads
+            - config.max_seq_len: Maximum sequence length
+    References:
+        https://arxiv.org/abs/2104.09864
+    """
+    _freqs_cis_tensor: torch.Tensor | None = None
+    def __init__(self, config: Union["ModelConfig", "PicoDecoderHFConfig"]):
+        super().__init__()
+        self.theta = config.position_emb_theta
+        self.dim = config.d_model // config.attention_n_heads
+        max_seq_len = config.max_seq_len
+        # only gets set once, and then reused for all RoPE instances
+        if RoPE._freqs_cis_tensor is None:
+            RoPE._freqs_cis_tensor = self._setup_freqs_cis(
+                max_seq_len, self.theta, self.dim
+            )
+        # register _freqs_cis buffer
+        # can be easily recomputed so persistent=False
+        self.register_buffer("_freqs_cis", self._freqs_cis_tensor, persistent=False)
+    @classmethod
+    def _setup_freqs_cis(cls, seq_len: int, theta: float, dim: int) -> torch.Tensor:
+        """Setup Frequency Tensor for RoPE Embeddings
+        Initializes the complex frequency tensor that is used to compute the RoPE embeddings.
+        Note other implementations will use cos and sin directly, but using the complex
+        number representation is (probably) more efficient:
+            e^(theta * i * t) = cos(theta * t) + i * sin(theta * t) [Euler's formula]
+        """
+        _freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
+        positions = torch.arange(seq_len)
+        freqs = torch.outer(positions, _freqs)
+        return torch.polar(torch.ones_like(freqs), freqs)  # complex64
+    def get_freqs_cis(
+        self, input_shape: torch.Size, start_pos: int, end_pos: int
+    ) -> torch.Tensor:
+        """Reshape Frequency Tensor for RoPE Embeddings
+        Makes the frequency tensor broadcastable with the input tensor.
+        """
+        _freqs_cis = self._freqs_cis[start_pos:end_pos]
+        ndim = len(input_shape)
+        assert 0 <= 1 < ndim
+        assert _freqs_cis.shape == (input_shape[1], input_shape[-1])
+        # TODO: Check whether this is correct (might be able to remove this)
+        shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(input_shape)]
+        return _freqs_cis.view(*shape)
+    def forward(
+        self,
+        queries: torch.Tensor,
+        keys: torch.Tensor,
+        start_pos: int = 0,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Apply RoPE Embeddings to Queries and Keys
+        Applies the rotary positional embeddings to the input tensors via complex num multiplication
+        NOTE: The start_pos is used if we want to use the kv_cache in the attention mechanism.
+        """
+        queries_ = torch.view_as_complex(
+            queries.float().reshape(*queries.shape[:-1], -1, 2)
+        )
+        keys_ = torch.view_as_complex(keys.float().reshape(*keys.shape[:-1], -1, 2))
+        input_shape = (
+            queries_.shape
+        )  # same as keys: (batch_size, seq_len, n_heads, head_dim/2)
+        freqs_start_pos = start_pos
+        freqs_end_pos = freqs_start_pos + queries_.shape[1]
+        freqs_cis = self.get_freqs_cis(input_shape, freqs_start_pos, freqs_end_pos)
+        queries_rotated = torch.view_as_real(queries_ * freqs_cis).flatten(3)
+        keys_rotated = torch.view_as_real(keys_ * freqs_cis).flatten(3)
+        return queries_rotated.type_as(queries), keys_rotated.type_as(keys)
+########################################################
+#
+# Attention
+#
+########################################################
+class Attention(nn.Module):
+    """Multi-head Attention with Group Query Attention support.
+    Scaled dot-product attention with:
+    - Grouped Query Attention (GQA): ``n_kv_heads`` key/value heads shared by ``n_heads``
+      query heads
+    - an optional key/value cache for incremental decoding
+    - rotary positional embeddings applied to queries and keys
+    Args:
+        config (Union[ModelConfig, PretrainedConfig]): Configuration; the constructor reads:
+            - config.attention_n_heads: Number of attention heads
+            - config.attention_n_kv_heads: Number of key/value heads
+            - config.d_model: Model dimension
+            - config.batch_size: Stored on the module as ``batch_size``
+            - config.max_seq_len: Stored on the module as ``max_seq_len``
+    Shape:
+        - Input: (batch_size, seq_len, d_model)
+        - Output: (batch_size, seq_len, d_model)
+    """
+    def __init__(
+        self,
+        config: Union["ModelConfig", "PicoDecoderHFConfig"],
+    ):
+        super().__init__()
+        d_model = config.d_model
+        self.n_heads = config.attention_n_heads
+        self.n_kv_heads = config.attention_n_kv_heads
+        self.batch_size = config.batch_size
+        self.max_seq_len = config.max_seq_len
+        self.head_dim = d_model // self.n_heads
+        # number of query heads served by each key/value head
+        self.n_rep = self.n_heads // self.n_kv_heads
+        query_width = self.n_heads * self.head_dim
+        kv_width = self.n_kv_heads * self.head_dim
+        self.q_proj = nn.Linear(d_model, query_width, bias=False)
+        self.k_proj = nn.Linear(d_model, kv_width, bias=False)
+        self.v_proj = nn.Linear(d_model, kv_width, bias=False)
+        self.o_proj = nn.Linear(query_width, d_model, bias=False)
+        self.rope = RoPE(config)
+    def forward(
+        self,
+        input: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Tuple[torch.Tensor, ...]] = None,
+        use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
+        """Run attention over ``input``.
+        Projects the input to queries, keys and values, rotates queries and keys with RoPE,
+        prepends any cached keys/values and evaluates scaled dot-product attention.
+        For an introduction to the attention mechanism, see:
+        https://arxiv.org/abs/1706.03762
+        Args:
+            input: Tensor of shape (batch_size, seq_len, d_model).
+            mask: Optional attention mask; cast to the query dtype before use.
+            past_key_values: Optional (keys, values) pair from earlier calls, each laid out
+                as (batch_size, cached_len, n_kv_heads, head_dim). The cached length is the
+                position offset used for RoPE on the new tokens.
+            use_cache: When True, the concatenated keys and values are returned for reuse.
+        Returns:
+            ``(output, (cached_keys, cached_values))``. Both cache entries are None when
+            ``use_cache`` is False.
+        """
+        bsz, seq_len, _ = input.shape
+        # Project, then split the feature dimension into heads
+        queries = self.q_proj(input).view(bsz, seq_len, self.n_heads, self.head_dim)
+        keys = self.k_proj(input).view(bsz, seq_len, self.n_kv_heads, self.head_dim)
+        values = self.v_proj(input).view(bsz, seq_len, self.n_kv_heads, self.head_dim)
+        # New tokens continue from the end of the cache, so RoPE starts at the cached length
+        if past_key_values is None:
+            start_pos = 0
+        else:
+            start_pos = past_key_values[0].shape[1]
+        queries, keys = self.rope(queries, keys, start_pos)
+        if past_key_values is not None:
+            past_keys, past_values = past_key_values[0], past_key_values[1]
+            keys = torch.cat([past_keys, keys], dim=1)
+            values = torch.cat([past_values, values], dim=1)
+        kv_cache = (keys, values) if use_cache else (None, None)
+        # (batch, seq, heads, head_dim) -> (batch, heads, seq, head_dim)
+        queries, keys, values = (
+            tensor.transpose(1, 2) for tensor in (queries, keys, values)
+        )
+        kernel_gqa = self.n_rep > 1
+        if kernel_gqa and queries.device.type == "mps":
+            # NOTE: MPS does not support GQA in the SDPA kernel, but we can repeat the keys and values
+            # outside of the kernel to get the same effect.
+            # See: https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
+            keys = keys.repeat_interleave(self.n_rep, dim=-3)
+            values = values.repeat_interleave(self.n_rep, dim=-3)
+            kernel_gqa = False
+        attn_mask = None if mask is None else mask.to(queries.dtype)
+        with sdpa_kernel(backends=[SDPBackend.CUDNN_ATTENTION, SDPBackend.MATH]):
+            attn_output = F.scaled_dot_product_attention(
+                queries.contiguous(),
+                keys.contiguous(),
+                values.contiguous(),
+                attn_mask=attn_mask,
+                enable_gqa=kernel_gqa,
+            )
+        # merge the heads back into a single feature dimension
+        merged = attn_output.transpose(1, 2).contiguous().view(bsz, seq_len, -1)
+        return self.o_proj(merged), kv_cache
+########################################################
+#
+# SwiGLU (Combines MLP and Activation)
+#
+########################################################
+class SwiGLU(nn.Module):
+    """SwiGLU feed-forward layer.
+    Computes ``w_2(silu(w_0(x)) * w_1(x))``: two bias-free projections into the hidden
+    dimension, one of them passed through SiLU and used as a gate for the other, followed by
+    a bias-free projection back to the model dimension.
+    Args:
+        config (Union[ModelConfig, PicoDecoderHFConfig]): Configuration containing:
+            - config.d_model: Model dimension
+            - config.activation_hidden_dim: Hidden dimension (typically 4 * d_model)
+    References:
+        https://arxiv.org/abs/2002.05202
+    """
+    def __init__(self, config: Union["ModelConfig", "PicoDecoderHFConfig"]):
+        super().__init__()
+        d_in = config.d_model
+        d_hidden = config.activation_hidden_dim  # usually 4 * d_model
+        self.w_0 = nn.Linear(d_in, d_hidden, bias=False)
+        self.w_1 = nn.Linear(d_in, d_hidden, bias=False)
+        self.w_2 = nn.Linear(d_hidden, d_in, bias=False)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Apply the gated feed-forward transformation to ``x``."""
+        gate = F.silu(self.w_0(x))
+        value = self.w_1(x)
+        return self.w_2(gate * value)
+########################################################
+#
+# PicoDecoderBlock
+#
+########################################################
+class PicoDecoderBlock(nn.Module):
+    """Single Transformer Block with Attention and Feed-forward layers.
+    The block applies, in order:
+    - RMSNorm -> attention, added back onto the block input
+    - RMSNorm -> SwiGLU feed-forward, added back onto the attention result
+    Args:
+        config (Union[ModelConfig, PicoDecoderHFConfig]): Model configuration; either a dataclass or
+            a HuggingFace PicoDecoderHFConfig
+    """
+    def __init__(
+        self,
+        config: Union["ModelConfig", "PicoDecoderHFConfig"],
+    ):
+        super().__init__()
+        self.attention = Attention(config)
+        self.swiglu = SwiGLU(config)
+        self.attention_norm = RMSNorm(config)
+        self.swiglu_norm = RMSNorm(config)
+    def forward(
+        self,
+        input: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Tuple[torch.Tensor]] = None,
+        use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
+        """Run the block on ``input``.
+        ``mask``, ``past_key_values`` and ``use_cache`` are forwarded unchanged to the
+        attention layer.
+        Returns:
+            The block output together with whatever cache value the attention layer returned.
+        """
+        normed_input = self.attention_norm(input)
+        attention_output, cached_key_values = self.attention(
+            normed_input,
+            mask=mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+        )
+        # first residual connection: around attention
+        residual = input + attention_output
+        # second residual connection: around the feed-forward network
+        feed_forward_output = self.swiglu(self.swiglu_norm(residual))
+        return residual + feed_forward_output, cached_key_values
+########################################################
+#
+# Pico Decoder (Causal Transformer Model)
+#
+########################################################
+class PicoDecoder(nn.Module):
+    """
+    Pico Decoder: combines the embedding, causal decoder blocks, and output projection into a
+    single autoregressive model.
+    For more information on the model, see the classes for the modules that make up the model.
+    Args:
+        model_config (Union[ModelConfig, PicoDecoderHFConfig]): Configuration; this class reads
+            ``vocab_size``, ``d_model`` and ``n_layers`` and passes the whole object on to
+            every sub-module.
+    """
+    def __init__(
+        self,
+        model_config: Union["ModelConfig", "PicoDecoderHFConfig"],
+    ):
+        super().__init__()
+        self.config = model_config
+        self.embedding_proj = nn.Embedding(self.config.vocab_size, self.config.d_model)
+        self.layers = nn.ModuleList(
+            [PicoDecoderBlock(self.config) for _ in range(self.config.n_layers)]
+        )
+        self.output_norm = RMSNorm(self.config)
+        self.de_embedding_proj = nn.Linear(
+            self.config.d_model, self.config.vocab_size, bias=False
+        )
+    def convert_to_hf_model(self) -> "PicoDecoderHF":
+        """Convert the Lightning model to a HuggingFace model."""
+        # Create HF config without fabric-specific settings
+        hf_config = PicoDecoderHFConfig.from_dataclass(self.config)
+        # Create new HF model
+        hf_model = PicoDecoderHF(hf_config)
+        # The wrapper stores this model under the attribute "pico_decoder", so the keys are
+        # exported with that prefix to line up with the wrapper's own state dict.
+        hf_model.load_state_dict(self.state_dict(prefix="pico_decoder."))
+        return hf_model
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+        use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[Tuple[Tuple[torch.Tensor, torch.Tensor]]]]:
+        """
+        This is the forward pass for the entire Pico model. It boils down to:
+        - Embedding the input ids
+        - Creating a causal mask
+        - Processing through the pico layers
+        - Projecting the output to logits
+        NOTE: One feature that might be confusing is the KV cache. The KV cache is used to speed up
+        generation by caching the KV pairs from previous forward passes. This is useful when doing
+        tasks that require generating multiple tokens conditioned on previous tokens (e.g. language
+        modeling, text generation, etc.). The way the KV cache is implemented is that each layer has
+        its own KV cache which is stored as a tuple. The whole model then stores a tuple of these
+        KV caches (so a tuple of tuples).
+        Args:
+            input_ids: Token ids; the last dimension is the sequence dimension.
+            past_key_values: Optional per-layer cache returned by an earlier call.
+            use_cache: Whether to collect and return the per-layer cache.
+        Returns:
+            ``(logits, cached_key_values)`` where ``logits`` is float32 and
+            ``cached_key_values`` is a tuple with one entry per layer, or None when
+            ``use_cache`` is False.
+        """
+        seq_len = input_ids.shape[-1]
+        h = self.embedding_proj(input_ids)
+        # Calculate start position from past cached KV pairs. Remember that each layer has its
+        # own KV Cache. So when we index past_key_values, we need to index into the KV pairs for the
+        # correct layer and then for either the keys or values.
+        start_pos = 0 if past_key_values is None else past_key_values[0][0].shape[1]
+        # Create causal mask for current sequence. A single new token may attend to everything
+        # that precedes it, so no mask is needed in that case.
+        mask = None
+        if seq_len > 1:
+            # Build the mask directly on the activations' device instead of allocating it on
+            # the host and copying it over on every forward pass.
+            mask = torch.full((seq_len, seq_len), float("-inf"), device=h.device)
+            mask = torch.triu(mask, diagonal=1)
+            # If using KV cache, extend mask to cover cached sequence length
+            if past_key_values is not None:
+                # Add zeros for cached tokens (we can attend to all of them)
+                cached_columns = torch.zeros((seq_len, start_pos), device=h.device)
+                mask = torch.hstack([cached_columns, mask])
+        # NOTE: If we are using the cache, we need to store the cached KV pairs for each layer.
+        #       They are gathered in a list and frozen into a tuple once all layers have run.
+        layer_caches = []
+        # Process through transformer blocks
+        for idx, layer in enumerate(self.layers):
+            layer_past_key_values = (
+                past_key_values[idx] if past_key_values is not None else None
+            )
+            h, layer_cached_key_values = layer(
+                h, mask=mask, past_key_values=layer_past_key_values, use_cache=use_cache
+            )
+            if use_cache:
+                layer_caches.append(layer_cached_key_values)
+        cached_key_values = tuple(layer_caches) if use_cache else None
+        # Final norm and projection
+        h = self.output_norm(h)
+        logits = self.de_embedding_proj(h).float()
+        return logits, cached_key_values
+########################################################
+#
+# HuggingFace Wrapper for the Pico Decoder model.
+#
+########################################################
+class PicoDecoderHFConfig(PretrainedConfig):
+    """HuggingFace configuration class for the Pico Decoder wrapper."""
+    model_type = "pico_decoder"
+    @classmethod
+    def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "PicoDecoderHFConfig":
+        """
+        Build a config from ``config_dict``.
+        Only the entries of ``config_dict`` are handed to the constructor. The extra keyword
+        arguments are deliberately not forwarded, because some of them need special handling
+        that would make this class brittle.
+        When ``return_unused_kwargs=True`` is passed, the result is a tuple of the config and
+        a dict holding every remaining keyword argument whose name is not an attribute of the
+        new config; otherwise only the config is returned.
+        """
+        config = cls(**config_dict)
+        wants_unused = kwargs.pop("return_unused_kwargs", False)
+        leftover = {}
+        for name, value in kwargs.items():
+            if not hasattr(config, name):
+                leftover[name] = value
+        return (config, leftover) if wants_unused else config
+    @classmethod
+    def from_dataclass(cls, model_config: "ModelConfig"):
+        """Build a config from our custom config dataclass."""
+        fields = asdict(model_config)
+        return cls.from_dict(fields)
+class PicoDecoderHF(PreTrainedModel, GenerationMixin):
+    """
+    HuggingFace wrapper for the Pico model with generation support.
+    Many evaluation frameworks require a model be setup as a HuggingFace model, so we provide a simple
+    wrapper that does just that. When we save checkpoints of the Pico model, we save both the normal
+    Pico model as well as the model wrapped in this HuggingFace class.
+    This also lets you do cool things like:
+    `model = AutoModelForCausalLM.from_pretrained("path/to/checkpoint")`
+    """
+    config_class = PicoDecoderHFConfig
+    # Must name the module classes defined in this file; the decoder block class is
+    # PicoDecoderBlock (there is no class called "PicoBlock").
+    _no_split_modules = ["PicoDecoderBlock", "Attention", "SwiGLU", "RMSNorm"]
+    main_input_name = "input_ids"
+    def __init__(self, config: PicoDecoderHFConfig):
+        super().__init__(config)
+        self.pico_decoder = PicoDecoder(config)
+        # Initialize generation config with defaults
+        self.generation_config = GenerationConfig()
+        # Set some reasonable defaults for the model
+        if hasattr(config, "max_position_embeddings"):
+            self.generation_config.max_length = config.max_position_embeddings
+        if hasattr(config, "vocab_size"):
+            self.generation_config.vocab_size = config.vocab_size
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+        use_cache: bool = False,
+        **kwargs,
+    ) -> Union[CausalLMOutput, CausalLMOutputWithPast]:
+        """HuggingFace forward pass wrapper.
+        Forwards pass for the HuggingFace version of the Pico Model. Basic wrapper around the
+        Pico model's forward pass, and returns the output as a HuggingFace CausalLMOutput.
+        Any additional keyword arguments are accepted and ignored.
+        Returns:
+            CausalLMOutputWithPast (logits and cache) when ``use_cache`` is True, otherwise a
+            CausalLMOutput carrying only the logits.
+        """
+        logits, past_key_values = self.pico_decoder(
+            input_ids, past_key_values, use_cache
+        )
+        if use_cache:
+            return CausalLMOutputWithPast(
+                logits=logits,
+                past_key_values=past_key_values,
+            )
+        else:
+            return CausalLMOutput(
+                logits=logits,
+            )
+    def prepare_inputs_for_generation(
+        self,
+        input_ids: torch.LongTensor,
+        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        """
+        Prepare inputs for generation.
+        Args:
+            input_ids: Input token IDs
+            past_key_values: Cached key-value pairs from previous forward passes
+            attention_mask: Accepted for interface compatibility; not used
+            **kwargs: Additional arguments; not used
+        Returns:
+            Dictionary with ``input_ids``, ``past_key_values`` and ``use_cache=True``
+        """
+        # If we have past_key_values, we only need the last token
+        if past_key_values is not None:
+            input_ids = input_ids[:, -1:]
+        return {
+            "input_ids": input_ids,
+            "past_key_values": past_key_values,
+            "use_cache": True,
+        }
+    def get_input_embeddings(self):
+        """Get the input embeddings layer."""
+        return self.pico_decoder.embedding_proj
+    def set_input_embeddings(self, value):
+        """Set the input embeddings layer."""
+        self.pico_decoder.embedding_proj = value
+    def get_output_embeddings(self):
+        """Get the output embeddings layer."""
+        return self.pico_decoder.de_embedding_proj
+    def set_output_embeddings(self, value):
+        """Set the output embeddings layer."""
+        self.pico_decoder.de_embedding_proj = value
+    def get_lm_head(self):
+        """Get the language model head."""
+        return self.pico_decoder.de_embedding_proj
+    @classmethod
+    def can_generate(cls) -> bool:
+        """Check if the model can generate text.
+        Declared as a classmethod so that it works both on the class and on instances.
+        """
+        return True
+    @property
+    def is_encoder_decoder(self) -> bool:
+        """Check if the model is an encoder-decoder model."""
+        return False
+    @property
+    def can_use_cache(self) -> bool:
+        """Check if the model can use KV cache."""
+        return True
+    def resize_token_embeddings(
+        self, new_num_tokens: Optional[int] = None
+    ) -> torch.nn.Embedding:
+        """Resize the input embeddings and the output projection to ``new_num_tokens``.
+        The rows shared by the old and the new vocabulary are copied for both layers, so
+        trained weights survive growing as well as shrinking. New layers are created on the
+        device and with the dtype of the layers they replace, and ``config.vocab_size`` is
+        updated to the new size.
+        Args:
+            new_num_tokens: Target vocabulary size. When None, or equal to the current size,
+                the model is left untouched.
+        Returns:
+            The input embedding layer in use after the call.
+        """
+        old_embeddings = self.get_input_embeddings()
+        old_num_tokens = old_embeddings.num_embeddings
+        if new_num_tokens is None or new_num_tokens == old_num_tokens:
+            return old_embeddings
+        old_head = self.get_output_embeddings()
+        # Only the overlap can be carried over; this also makes shrinking safe.
+        n_shared = min(old_num_tokens, new_num_tokens)
+        new_embeddings = torch.nn.Embedding(
+            new_num_tokens,
+            old_embeddings.embedding_dim,
+            device=old_embeddings.weight.device,
+            dtype=old_embeddings.weight.dtype,
+        )
+        new_head = torch.nn.Linear(
+            old_embeddings.embedding_dim,
+            new_num_tokens,
+            bias=False,
+            device=old_head.weight.device,
+            dtype=old_head.weight.dtype,
+        )
+        with torch.no_grad():
+            new_embeddings.weight[:n_shared] = old_embeddings.weight[:n_shared]
+            new_head.weight[:n_shared] = old_head.weight[:n_shared]
+        self.pico_decoder.embedding_proj = new_embeddings
+        self.pico_decoder.de_embedding_proj = new_head
+        self.config.vocab_size = new_num_tokens
+        return new_embeddings
+# Register the config and the wrapper model for the HuggingFace auto classes.
+# NOTE(review): the config is registered without an argument, so the library default applies --
+# presumably "AutoConfig"; confirm against the transformers documentation.
+PicoDecoderHFConfig.register_for_auto_class()
+# NOTE(review): PicoDecoderHF is registered twice in a row; verify whether the second call adds
+# to the first registration or replaces it.
+PicoDecoderHF.register_for_auto_class("AutoModel")
+PicoDecoderHF.register_for_auto_class("AutoModelForCausalLM")
+########################################################
+#
+# New PicoDecoderForCausalLM class for generation support
+#
+########################################################
+class PicoDecoderForCausalLM(PreTrainedModel, GenerationMixin):
+    """
+    PicoDecoderForCausalLM: A HuggingFace-compatible model that properly supports generation.
+    This class is designed to work with existing checkpoints and provides full generation support.
+    It inherits from the right base classes that HuggingFace expects for text generation.
+    """
+    config_class = PicoDecoderHFConfig
+    # Must name the module classes defined in this file; the decoder block class is
+    # PicoDecoderBlock (there is no class called "PicoBlock").
+    _no_split_modules = ["PicoDecoderBlock", "Attention", "SwiGLU", "RMSNorm"]
+    main_input_name = "input_ids"
+    def __init__(self, config: PicoDecoderHFConfig):
+        super().__init__(config)
+        self.pico_decoder = PicoDecoder(config)
+        # Initialize generation config with defaults
+        self.generation_config = GenerationConfig()
+        # Set some reasonable defaults for the model
+        if hasattr(config, "max_position_embeddings"):
+            self.generation_config.max_length = config.max_position_embeddings
+        if hasattr(config, "vocab_size"):
+            self.generation_config.vocab_size = config.vocab_size
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+        use_cache: bool = False,
+        **kwargs,
+    ) -> Union[CausalLMOutput, CausalLMOutputWithPast]:
+        """Forward pass for text generation.
+        Delegates to the wrapped Pico decoder; additional keyword arguments are accepted and
+        ignored.
+        Returns:
+            CausalLMOutputWithPast (logits and cache) when ``use_cache`` is True, otherwise a
+            CausalLMOutput carrying only the logits.
+        """
+        logits, past_key_values = self.pico_decoder(
+            input_ids, past_key_values, use_cache
+        )
+        if use_cache:
+            return CausalLMOutputWithPast(
+                logits=logits,
+                past_key_values=past_key_values,
+            )
+        else:
+            return CausalLMOutput(
+                logits=logits,
+            )
+    def prepare_inputs_for_generation(
+        self,
+        input_ids: torch.LongTensor,
+        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        """Prepare inputs for generation.
+        ``attention_mask`` and any extra keyword arguments are accepted but not used. The
+        returned dictionary holds ``input_ids``, ``past_key_values`` and ``use_cache=True``.
+        """
+        # If we have past_key_values, we only need the last token
+        if past_key_values is not None:
+            input_ids = input_ids[:, -1:]
+        return {
+            "input_ids": input_ids,
+            "past_key_values": past_key_values,
+            "use_cache": True,
+        }
+    def get_input_embeddings(self):
+        """Get the input embeddings layer."""
+        return self.pico_decoder.embedding_proj
+    def set_input_embeddings(self, value):
+        """Set the input embeddings layer."""
+        self.pico_decoder.embedding_proj = value
+    def get_output_embeddings(self):
+        """Get the output embeddings layer."""
+        return self.pico_decoder.de_embedding_proj
+    def set_output_embeddings(self, value):
+        """Set the output embeddings layer."""
+        self.pico_decoder.de_embedding_proj = value
+    def get_lm_head(self):
+        """Get the language model head."""
+        return self.pico_decoder.de_embedding_proj
+    @classmethod
+    def can_generate(cls) -> bool:
+        """Check if the model can generate text.
+        Declared as a classmethod so that it works both on the class and on instances.
+        """
+        return True
+    @property
+    def is_encoder_decoder(self) -> bool:
+        """Check if the model is an encoder-decoder model."""
+        return False
+    @property
+    def can_use_cache(self) -> bool:
+        """Check if the model can use KV cache."""
+        return True
+    def resize_token_embeddings(
+        self, new_num_tokens: Optional[int] = None
+    ) -> torch.nn.Embedding:
+        """Resize the input embeddings and the output projection to ``new_num_tokens``.
+        The rows shared by the old and the new vocabulary are copied for both layers, so
+        trained weights survive growing as well as shrinking. New layers are created on the
+        device and with the dtype of the layers they replace, and ``config.vocab_size`` is
+        updated to the new size.
+        Args:
+            new_num_tokens: Target vocabulary size. When None, or equal to the current size,
+                the model is left untouched.
+        Returns:
+            The input embedding layer in use after the call.
+        """
+        old_embeddings = self.get_input_embeddings()
+        old_num_tokens = old_embeddings.num_embeddings
+        if new_num_tokens is None or new_num_tokens == old_num_tokens:
+            return old_embeddings
+        old_head = self.get_output_embeddings()
+        # Only the overlap can be carried over; this also makes shrinking safe.
+        n_shared = min(old_num_tokens, new_num_tokens)
+        new_embeddings = torch.nn.Embedding(
+            new_num_tokens,
+            old_embeddings.embedding_dim,
+            device=old_embeddings.weight.device,
+            dtype=old_embeddings.weight.dtype,
+        )
+        new_head = torch.nn.Linear(
+            old_embeddings.embedding_dim,
+            new_num_tokens,
+            bias=False,
+            device=old_head.weight.device,
+            dtype=old_head.weight.dtype,
+        )
+        with torch.no_grad():
+            new_embeddings.weight[:n_shared] = old_embeddings.weight[:n_shared]
+            new_head.weight[:n_shared] = old_head.weight[:n_shared]
+        self.pico_decoder.embedding_proj = new_embeddings
+        self.pico_decoder.de_embedding_proj = new_head
+        self.config.vocab_size = new_num_tokens
+        return new_embeddings
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        """
+        Load a pretrained model from a checkpoint.
+        This method handles loading from both the old PicoDecoderHF format and the new format.
+        The regular loader is tried first. If it fails, the checkpoint is loaded through
+        ``AutoModel`` and its weights are copied into a fresh instance of this class
+        (non-strictly, so keys that do not match are skipped).
+        Raises:
+            Exception: the error of the first attempt, chained to the error of the fallback,
+                when both attempts fail.
+        """
+        # First try to load with the new class
+        try:
+            return super().from_pretrained(
+                pretrained_model_name_or_path, *model_args, **kwargs
+            )
+        except Exception as e:
+            print(f"Failed to load with new class: {e}")
+            print("Attempting to load with legacy class and convert...")
+            # Try to load with the old class and convert
+            try:
+                from transformers import AutoModel
+                # Default to trusting remote code as before, but let an explicit value from
+                # the caller win. Passing the flag next to **kwargs raised a TypeError
+                # whenever the caller had supplied it, which made this fallback unreachable.
+                legacy_kwargs = {"trust_remote_code": True, **kwargs}
+                old_model = AutoModel.from_pretrained(
+                    pretrained_model_name_or_path,
+                    *model_args,
+                    **legacy_kwargs,
+                )
+                # Create new model instance
+                new_model = cls(old_model.config)
+                # Copy state dict
+                new_model.load_state_dict(old_model.state_dict(), strict=False)
+                return new_model
+            except Exception as e2:
+                print(f"Failed to convert from legacy format: {e2}")
+                # Surface the original failure and keep the fallback error as its cause
+                raise e from e2
+# Register the new class
+# NOTE(review): PicoDecoderHF is registered for "AutoModelForCausalLM" earlier in this file as
+# well; confirm which of the two classes exported checkpoints are meant to resolve to.
+PicoDecoderForCausalLM.register_for_auto_class("AutoModelForCausalLM")

pico-decoder-tiny-dolma-teensy-v1/checkpoints/step_1000/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|padding|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}