Spaces:

Tinkering
/

Pytorch-day-prez

Running

App Files Files Community

Molbap HF Staff committed on May 7

Commit

bd27c9a

verified ·

1 Parent(s): 5d68161

Update index.html

Browse files

Files changed (1) hide show

index.html +4 -95

index.html CHANGED Viewed

@@ -303,10 +303,6 @@ class GlmAttention(LlamaAttention):
       self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim,
                               config.hidden_size, bias=False)
-def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
-  # Slightly different RoPE
-  …
 class GlmForCausalLM(LlamaForCausalLM):
   pass
       </code></pre>
@@ -318,7 +314,7 @@ class GlmForCausalLM(LlamaForCausalLM):
       <p>All the code becomes runnable and a self-contained model definition</p>
     <pre><code class="language-python" data-trim>
-      class GlmMLP(nn.Module):
       def __init__(self, config):
           super().__init__()
@@ -336,93 +332,6 @@ class GlmForCausalLM(LlamaForCausalLM):
           return self.down_proj(up_states)
-  def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
-      """
-      This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
-      num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
-      """
-      batch, num_key_value_heads, slen, head_dim = hidden_states.shape
-      if n_rep == 1:
-          return hidden_states
-      hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
-      return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
-  def eager_attention_forward(
-      module: nn.Module,
-      query: torch.Tensor,
-      key: torch.Tensor,
-      value: torch.Tensor,
-      attention_mask: Optional[torch.Tensor],
-      scaling: float,
-      dropout: float = 0.0,
-      **kwargs,
-  ):
-      key_states = repeat_kv(key, module.num_key_value_groups)
-      value_states = repeat_kv(value, module.num_key_value_groups)
-      attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
-      if attention_mask is not None:
-          causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
-          attn_weights = attn_weights + causal_mask
-      attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
-      attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
-      attn_output = torch.matmul(attn_weights, value_states)
-      attn_output = attn_output.transpose(1, 2).contiguous()
-      return attn_output, attn_weights
-  def rotate_half(x):
-      """Rotates half the hidden dims of the input."""
-      x1 = x[..., 0::2]
-      x2 = x[..., 1::2]
-      return torch.stack((-x2, x1), dim=-1).flatten(-2)
-  def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
-      """Applies Rotary Position Embedding to the query and key tensors.
-      Args:
-          q (`torch.Tensor`): The query tensor.
-          k (`torch.Tensor`): The key tensor.
-          cos (`torch.Tensor`): The cosine part of the rotary embedding.
-          sin (`torch.Tensor`): The sine part of the rotary embedding.
-          position_ids (`torch.Tensor`, *optional*):
-              Deprecated and unused.
-          unsqueeze_dim (`int`, *optional*, defaults to 1):
-              The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
-              sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
-              that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
-              k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
-              cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
-              the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
-      Returns:
-          `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
-      """
-      cos = cos.unsqueeze(unsqueeze_dim)
-      sin = sin.unsqueeze(unsqueeze_dim)
-      # Interleave them instead of usual shape
-      cos = cos[..., : cos.shape[-1] // 2].repeat_interleave(2, dim=-1)
-      sin = sin[..., : sin.shape[-1] // 2].repeat_interleave(2, dim=-1)
-      # Keep half or full tensor for later concatenation
-      rotary_dim = cos.shape[-1]
-      q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
-      k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]
-      # Apply rotary embeddings on the first half or full tensor
-      q_embed = (q_rot * cos) + (rotate_half(q_rot) * sin)
-      k_embed = (k_rot * cos) + (rotate_half(k_rot) * sin)
-      # Concatenate back to full shape
-      q_embed = torch.cat([q_embed, q_pass], dim=-1)
-      k_embed = torch.cat([k_embed, k_pass], dim=-1)
-      return q_embed, k_embed
   class GlmAttention(nn.Module):
       """Multi-headed attention from 'Attention Is All You Need' paper"""
@@ -647,7 +556,7 @@ y = torch.empty_like(x)
 activation.gelu_fast(y, x)
 print(y)
         </code></pre>
-        <p class="fragment">Same Transformer code — now with a <strong>3× faster</strong> GELU on A100s.</p>
       </section>
       <section>
@@ -722,8 +631,8 @@ model = AutoModelForConditionalGeneration.from_pretrained("Qwen/Qwen3-8B")
               🤝 Symbiotic Growth
             </p>
             <p style="display: flex; align-items: center; gap: 0.4rem; font-size: 1.4rem;">
-              <img src="assets/torchlogo.png" alt="PyTorch" style="height: 1.4rem;" />
-              PyTorch &amp; <code>transformers</code> grow together
               <img src="assets/head_logo.svg" alt="Transformers" style="height: 1.4rem;" />
             </p>
           </div>

       self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim,
                               config.hidden_size, bias=False)
 class GlmForCausalLM(LlamaForCausalLM):
   pass
       </code></pre>
       <p>All the code becomes runnable and a self-contained model definition</p>
     <pre><code class="language-python" data-trim>
+  class GlmMLP(nn.Module):
       def __init__(self, config):
           super().__init__()
           return self.down_proj(up_states)
   class GlmAttention(nn.Module):
       """Multi-headed attention from 'Attention Is All You Need' paper"""
 activation.gelu_fast(y, x)
 print(y)
         </code></pre>
+        <p>Same Transformer code — now with a <strong>3× faster</strong> GELU on A100s.</p>
       </section>
       <section>
               🤝 Symbiotic Growth
             </p>
             <p style="display: flex; align-items: center; gap: 0.4rem; font-size: 1.4rem;">
+              <img src="assets/transparent_PyTorch.png" alt="PyTorch" style="height: 1.4rem;" />
+              <code> PyTorch</code> &amp; <code>transformers</code> grow together
               <img src="assets/head_logo.svg" alt="Transformers" style="height: 1.4rem;" />
             </p>
           </div>