Molbap HF Staff committed on
Commit
fc6c5e6
·
1 Parent(s): 3ff5fdf
Files changed (42)
  1. app/dist/_astro/{index.beJ178IL.css → index.7hgRH84_.css} +0 -0
  2. app/dist/_astro/{index.beJ178IL.css.gz → index.7hgRH84_.css.gz} +2 -2
  3. app/dist/images/transformers/big_picture_zoomout.png +3 -0
  4. app/dist/images/transformers/cluster_wave2vec2.png +3 -0
  5. app/dist/images/transformers/detr_island.png +3 -0
  6. app/dist/images/transformers/llama_center.png +3 -0
  7. app/dist/images/transformers/llama_glm_attn.png +3 -0
  8. app/dist/images/transformers/timeline_llava.png +3 -0
  9. app/dist/index.html +125 -99
  10. app/dist/index.html.gz +2 -2
  11. app/dist/llama_center.png +3 -0
  12. app/dist/llama_glm_attn.png +3 -0
  13. app/public/images/transformers/big_picture_zoomout.png +3 -0
  14. app/public/images/transformers/cluster_wave2vec2.png +3 -0
  15. app/public/images/transformers/detr_island.png +3 -0
  16. app/public/images/transformers/llama_center.png +3 -0
  17. app/public/images/transformers/llama_glm_attn.png +3 -0
  18. app/public/images/transformers/timeline_llava.png +3 -0
  19. app/public/llama_center.png +3 -0
  20. app/public/llama_glm_attn.png +3 -0
  21. app/src/components/Hero.astro +0 -13
  22. app/src/content/article.mdx +121 -59
  23. app/src/content/embeds/banner.html +7 -0
  24. app/src/content/embeds/transformers/tp-plan.html +22 -23
  25. app/src/content/new_article.mdx +1 -0
  26. app/src/styles/_base.css +4 -4
  27. src/distill.js +0 -0
  28. src/fragments/attention-visualizer.html +0 -45
  29. src/fragments/d3-graph.html +0 -12
  30. src/fragments/dependency-graph.html +0 -6
  31. src/fragments/glm-compare.html +0 -149
  32. src/fragments/loc-growth.html +0 -6
  33. src/fragments/memory-profiler.html +0 -16
  34. src/fragments/model-timeline.html +0 -6
  35. src/fragments/model-visualisation.html +0 -0
  36. src/fragments/terminal.html +0 -43
  37. src/fragments/tp-plan.html +0 -24
  38. src/fragments/warmup_demo.html +0 -398
  39. src/index.js +0 -21
  40. src/style.css +0 -741
  41. src/transformers-custom.css +0 -741
  42. webpack.config.js +4 -1
app/dist/_astro/{index.beJ178IL.css → index.7hgRH84_.css} RENAMED
The diff for this file is too large to render. See raw diff
 
app/dist/_astro/{index.beJ178IL.css.gz → index.7hgRH84_.css.gz} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:b295619b7b1df79e3034566f78788f08d65d11da3dd3d3c5ac113a256470d828
- size 18469
+ oid sha256:b4e9905ab57ee091f7ff64f4356323970823bacad6cf723107e41b4696e3e6a9
+ size 18473
app/dist/images/transformers/big_picture_zoomout.png ADDED

Git LFS Details

  • SHA256: 6b48173ad33c50e9b1b7f674bb21948da982db04e4a927cf0cecee45bc749297
  • Pointer size: 131 Bytes
  • Size of remote file: 218 kB
app/dist/images/transformers/cluster_wave2vec2.png ADDED

Git LFS Details

  • SHA256: ad2931607cfd522cbccddc8047ee7fd6ee3945a2d818fe72b6c6b08c58e062b3
  • Pointer size: 130 Bytes
  • Size of remote file: 55.4 kB
app/dist/images/transformers/detr_island.png ADDED

Git LFS Details

  • SHA256: 6f6daf8ce4f8e71a0a9b3c60f2a7a18aacf1812a54337cf345b9005eaa251125
  • Pointer size: 130 Bytes
  • Size of remote file: 18.5 kB
app/dist/images/transformers/llama_center.png ADDED

Git LFS Details

  • SHA256: 3ec2caa493f919717ece1366836e156d8d05a3bf09ef4313ea502d5130a82cb0
  • Pointer size: 131 Bytes
  • Size of remote file: 406 kB
app/dist/images/transformers/llama_glm_attn.png ADDED

Git LFS Details

  • SHA256: 6b2c88d5eb3d461d791e7e280f74e54d05f01babb02ea0536b50386b7b1b1b8a
  • Pointer size: 131 Bytes
  • Size of remote file: 138 kB
app/dist/images/transformers/timeline_llava.png ADDED

Git LFS Details

  • SHA256: 2bd0469fa24737bf309c1225005b62d7f0a9d722df0e56a18c578a1327cf94fc
  • Pointer size: 131 Bytes
  • Size of remote file: 621 kB
app/dist/index.html CHANGED
@@ -1,7 +1,7 @@
1
  <!DOCTYPE html><html lang="en" data-theme="light" data-toc-auto-collapse="1"> <head><meta charset="utf-8"><meta name="viewport" content="width=device-width, initial-scale=1"><title>Maintain the unmaintainable:
2
  1M python loc, 400+ models</title><meta name="description" content="A peek into software engineering for the transformers library"><link rel="canonical" href="http://localhost:4321/"><meta property="og:type" content="article"><meta property="og:title" content="Maintain the unmaintainable:
3
- 1M python loc, 400+ models"><meta property="og:description" content="A peek into software engineering for the transformers library"><meta property="og:url" content="http://localhost:4321/"><meta property="og:image" content="/thumb.auto.jpg"><meta property="article:published_time" content="October 2, 2025"><meta property="article:author" content="Pablo Montalvo"><meta name="twitter:card" content="summary_large_image"><meta name="twitter:title" content="Maintain the unmaintainable:
4
- 1M python loc, 400+ models"><meta name="twitter:description" content="A peek into software engineering for the transformers library"><meta name="twitter:image" content="/thumb.auto.jpg"><script type="application/ld+json">{"@context":"https://schema.org","@type":"Article","headline":"Maintain the unmaintainable:\n1M python loc, 400+ models","description":"A peek into software engineering for the transformers library","datePublished":"October 2, 2025","author":[{"@type":"Person","name":"Pablo Montalvo"}],"keywords":"transformers, engineering, design-philosophy","mainEntityOfPage":"http://localhost:4321/","image":["/thumb.auto.jpg"]}</script><script>
5
  (() => {
6
  try {
7
  const saved = localStorage.getItem("theme");
@@ -12,8 +12,8 @@
12
  document.documentElement.setAttribute("data-theme", theme);
13
  } catch {}
14
  })();
15
- </script><script type="module" src="/scripts/color-palettes.js"></script><!-- TO MANAGE PROPERLY --><script src="https://cdn.plot.ly/plotly-3.0.0.min.js" charset="utf-8"></script><link rel="stylesheet" href="/_astro/index.beJ178IL.css"><script type="module" src="/_astro/hoisted.DK-CdsVg.js"></script>
16
- <script type="module" src="/_astro/page.CH0W_C1Z.js"></script></head> <body> <button id="theme-toggle" aria-label="Toggle color theme" data-astro-cid-x3pjskd3> <svg class="icon light" width="20" height="20" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" data-astro-cid-x3pjskd3> <circle cx="12" cy="12" r="5" data-astro-cid-x3pjskd3></circle> <line x1="12" y1="1" x2="12" y2="4" data-astro-cid-x3pjskd3></line> <line x1="12" y1="20" x2="12" y2="23" data-astro-cid-x3pjskd3></line> <line x1="1" y1="12" x2="4" y2="12" data-astro-cid-x3pjskd3></line> <line x1="20" y1="12" x2="23" y2="12" data-astro-cid-x3pjskd3></line> <line x1="4.22" y1="4.22" x2="6.34" y2="6.34" data-astro-cid-x3pjskd3></line> <line x1="17.66" y1="17.66" x2="19.78" y2="19.78" data-astro-cid-x3pjskd3></line> <line x1="4.22" y1="19.78" x2="6.34" y2="17.66" data-astro-cid-x3pjskd3></line> <line x1="17.66" y1="6.34" x2="19.78" y2="4.22" data-astro-cid-x3pjskd3></line> </svg> <svg class="icon dark" width="20" height="20" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" data-astro-cid-x3pjskd3> <path d="M21 12.79A9 9 0 1 1 11.21 3 7 7 0 0 0 21 12.79z" data-astro-cid-x3pjskd3></path> </svg> </button> <section class="hero" data-astro-cid-bbe6dxrz> <h1 class="hero-title" data-astro-cid-bbe6dxrz>Maintain the unmaintainable:<br/>1M python loc, 400+ models</h1> <div class="hero-banner" data-astro-cid-bbe6dxrz> <figure class="html-embed"><div class="html-embed__card is-frameless"><div id="frag-7ye2najyjjf"><style>
17
  @import url('https://fonts.googleapis.com/css2?family=Inter:wght@500;600&display=swap');
18
 
19
  .banner-container {
@@ -137,6 +137,13 @@ node.append('text')
137
  .attr('dy','-1.1em')
138
  .text(d => shortId(d.id));
139
 
140
  // Forces tuned for wide, short aspect
141
  const sim = d3.forceSimulation(graph.nodes)
142
  .force('link', d3.forceLink(graph.links).id(d => d.id).distance(150).strength(0.4))
@@ -160,14 +167,12 @@ function dragEnd(e,d){ if(!e.active) sim.alphaTarget(0); d.fx=d.fy=null; }
160
  // Fit on first paint (no zoom UI for banner)
161
  window.addEventListener('resize', () => location.reload());
162
  </script>
163
- </div></div></figure> <p class="hero-desc" data-astro-cid-bbe6dxrz>A peek into software engineering for the transformers library</p> </div> </section> <header class="meta" aria-label="Article meta information" data-astro-cid-bbe6dxrz> <div class="meta-container" data-astro-cid-bbe6dxrz> <div class="meta-container-cell" data-astro-cid-bbe6dxrz> <h3 data-astro-cid-bbe6dxrz>Author</h3> <ul class="authors" data-astro-cid-bbe6dxrz> <li data-astro-cid-bbe6dxrz> <a href="https://huggingface.co/Molbap" data-astro-cid-bbe6dxrz>Pablo Montalvo</a> </li> </ul> </div> <div class="meta-container-cell meta-container-cell--affiliations" data-astro-cid-bbe6dxrz> <h3 data-astro-cid-bbe6dxrz>Affiliation</h3> <p data-astro-cid-bbe6dxrz> <a href="https://huggingface.co" target="_blank" rel="noopener noreferrer" data-astro-cid-bbe6dxrz> Hugging Face </a> </p> </div> <div class="meta-container-cell meta-container-cell--published" data-astro-cid-bbe6dxrz> <h3 data-astro-cid-bbe6dxrz>Published</h3> <p data-astro-cid-bbe6dxrz>October 2, 2025</p> </div> <!-- {doi && (
164
  <div class="meta-container-cell">
165
  <h3>DOI</h3>
166
  <p><a href={`https://doi.org/${doi}`} target="_blank" rel="noopener noreferrer">{doi}</a></p>
167
  </div>
168
- )} --> <div class="meta-container-cell meta-container-cell--pdf" data-astro-cid-bbe6dxrz> <h3 data-astro-cid-bbe6dxrz>PDF</h3> <p data-astro-cid-bbe6dxrz> <a class="button" href="/maintain-the-unmaintainable-1m-python-loc-400-models.pdf" download="maintain-the-unmaintainable-1m-python-loc-400-models.pdf" aria-label="Download PDF maintain-the-unmaintainable-1m-python-loc-400-models.pdf" data-astro-cid-bbe6dxrz>
169
- Download PDF
170
- </a> </p> </div> </div> </header> <section class="content-grid"> <nav class="table-of-contents" aria-label="Table of Contents" data-auto-collapse="1"> <div class="title">Table of Contents</div> <div id="article-toc-placeholder"></div> </nav> <details class="table-of-contents-mobile"> <summary>Table of Contents</summary> <div id="article-toc-mobile-placeholder"></div> </details> <script>
171
  // Build TOC from article headings (h2/h3/h4) and render into the sticky aside
172
  const buildTOC = () => {
173
  const holder = document.getElementById('article-toc-placeholder');
@@ -407,39 +412,43 @@ Download PDF
407
  <p>How do you keep such a ship afloat, made of so many moving, unrelated parts, contributed to by a buzzing hivemind? Especially as the pace of ML research accelerates? We receive constant feedback on everything from function signatures with hundreds of arguments to duplicated code and optimization concerns, and we listen to all of it, or try to. The library’s usage keeps on growing, and we are a small team of maintainers and contributors, backed by hundreds of open-source community members.
408
  We continue to support all new models and expect to do so for the foreseeable future.</p>
409
  <p>This post dissects the design philosophy that makes this possible. It’s the result of an evolution from our older principles, detailed on our previous <a href="https://huggingface.co/docs/transformers/en/philosophy">philosophy</a> page, as well as its accompanying <a href="https://huggingface.co/blog/transformers-design-philosophy">blog post from 2022</a>. More recently (and we strongly recommend the read) we published a blog post about <a href="https://huggingface.co/blog/faster-transformers">recent upgrades to transformers</a>, focusing on what makes the library faster today. All of these developments are only made possible thanks to these principles.</p>
410
- <p>We codify the “tenets” that guide our development, demonstrate how they are implemented in code, and show the measurable impact they have on the library’s sustainability and growth.</p>
411
- <p>For any OSS maintainer, power user, or contributor, this is the map to understanding, using, and building upon <code>transformers</code>, but not only: any project of comparable size will require you to make deep choices, not only on design and choice of abstraction, but on the very mindset of the software you are building.</p>
 
412
  <p><a href="#source-of-truth">Tenets exemplified</a> will have their summary available on hover.</p>
413
  <p><a href="https://huggingface.co/blog/welcome-openai-gpt-oss">External links</a> to articles will help you solidify your knowledge.</p>
414
- <p><a href="#generated-modeling">Several interactive visualisations</a> are available as you go - scroll, zoom, drag away.</p>
415
- <div class="crumbs"><p>Throughout this post, you’ll find breadcrumb boxes like this one. They summarize what you just learned, connect it to the tenets, and point to what’s coming <strong>Next</strong>. Think of them as narrative signposts to help you keep track.</p></div>
416
  <h2 id="the-core-tenets-of-transformers"><a href="#the-core-tenets-of-transformers">The core tenets of transformers</a></h2>
417
  <p>We summarize the foundations on which we’ve built everything, and write the “tenets” of the library. They behave like <em>software interfaces</em>, hence it is crucial that they are explicitly written down. However opinionated they are, they have evolved over time.</p>
418
- <p>Note that the library <em>evolved</em> towards these principles, and that they <em>emerged</em> from decisions taken, and once emerged they were recognized as critical.</p>
419
- <div class="tenet-list"><ol><li class="tenet"><a id="source-of-truth"></a><strong>Source of Truth</strong><p>We aim to be a <a href="#https://huggingface.co/blog/transformers-model-definition">source of truth for all model definitions</a>. This is not a tenet, but something that still guides our decisions. Model implementations should be reliable, reproducible, and faithful to the original performances.</p><em>This overarching guideline ensures quality and reproducibility across all models in the library.</em></li><li class="tenet"><a id="one-model-one-file"></a><strong>One Model, One File</strong><p>All inference and training core logic has to be visible, top‑to‑bottom, to maximize each model’s hackability.</p><em>Every model should be completely understandable and hackable by reading a single file from top to bottom.</em></li><li class="tenet"><a id="code-is-product"></a><strong>Code is Product</strong><p>Optimize for reading, diffing, and tweaking, our users are power users. Variables can be explicit, full words, even several words, readability is primordial.</p><em>Code quality matters as much as functionality - optimize for human readers, not just computers.</em></li><li class="tenet"><a id="standardize-dont-abstract"></a><strong>Standardize, Don’t Abstract</strong><p>If it’s model behavior, keep it in the file; abstractions only for generic infra.</p><em>Model-specific logic belongs in the model file, not hidden behind abstractions.</em></li><li class="tenet"><a id="do-repeat-yourself"></a><strong>DRY* (DO Repeat Yourself)</strong><p>Copy when it helps users; keep successors in sync without centralizing behavior.</p><p><strong>Amendment:</strong> With the introduction and global adoption of <a href="#modular">modular</a> transformers, we do not repeat any logic in the modular files, but end user files remain faithful to the original tenet.</p><em>Strategic duplication can improve readability and maintainability when done thoughtfully.</em></li><li class="tenet"><a id="minimal-user-api"></a><strong>Minimal User API</strong><p>Config, model, preprocessing; from_pretrained, save_pretrained, push_to_hub. We want the least amount of codepaths. Reading should be obvious, configurations should be obvious.</p><em>Keep the public interface simple and predictable, users should know what to expect.</em></li><li class="tenet"><a id="backwards-compatibility"></a><strong>Backwards Compatibility</strong><p>Evolve by additive standardization, never break public APIs.</p><p>Any artifact that was once on the hub and loadable with transformers should be usable indefinitely with the same interface. Further, public methods should not change to avoid breaking dependencies.</p><em>Once something is public, it stays public, evolution through addition, not breaking changes.</em></li><li class="tenet"><a id="consistent-public-surface"></a><strong>Consistent Public Surface</strong><p>Same argument names, same outputs, hidden states and attentions exposed, enforced by tests. This is a goal we have as well as a tenet.</p><em>All models should feel familiar - consistent interfaces reduce cognitive load.</em></li></ol></div>
420
- <p>When a PR is merged, it is because the contribution is worthwhile, and that the <code>transformers</code> team finds the design of the contribution to be aligned with what is above.</p>
421
- <p>Does all the code in the library follow strictly these tenets? No. The library is a gigantic house with connected nooks, corridors, crannies everywhere built by thousands of different workers. We <em>try</em> to make it so all the code added is compliant, because if we fail and merge it, we cannot change it lest we break <a href="#backwards-compatibility">backwards compatibility</a>.</p>
422
- <p>For instance, one function essential to the implementation of <a href="https://huggingface.co/papers/2104.09864">Rotary Positional Embeddings</a> is identical in 70 <code>modeling_&lt;file&gt;.py</code> across <code>src/transformers/models/.</code> Why keep it? Because we want all the model logic to be <a href="#one-model-one-file">contained in the modeling file</a>. In order to do that, we <a href="#do-repeat-yourself">do repeat ourselves</a>.</p>
 
423
  <div class="code-card"><button class="code-copy button--ghost" type="button" aria-label="Copy code"><svg viewBox="0 0 24 24" aria-hidden="true" focusable="false"><path d="M16 1H4c-1.1 0-2 .9-2 2v12h2V3h12V1zm3 4H8c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h11c1.1 0 2-.9 2-2V7c0-1.1-.9-2-2-2zm0 16H8V7h11v14z"></path></svg></button><pre class="astro-code astro-code-themes github-light github-dark" style="--shiki-light:#24292e;--shiki-dark:#e1e4e8;--shiki-light-bg:#fff;--shiki-dark-bg:#24292e;overflow-x:auto" tabindex="0" data-language="python"><code><span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">def</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0"> rotate_half</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">(x):</span></span>
424
  <span class="line"><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> &quot;&quot;&quot;Rotates half the hidden dims of the input.&quot;&quot;&quot;</span></span>
425
  <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> x1 </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> x[</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">...</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">, : x.shape[</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">-</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">1</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">] </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">//</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> 2</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">]</span></span>
426
  <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> x2 </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> x[</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">...</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">, x.shape[</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">-</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">1</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">] </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">//</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> 2</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> :]</span></span>
427
  <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> return</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> torch.cat((</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">-</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">x2, x1), </span><span style="--shiki-light:#E36209;--shiki-dark:#FFAB70">dim</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=-</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">1</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">)</span></span>
428
  <span class="line"></span></code></pre></div>
429
- <p>You can use a simple regex to look at all methods of a given name across your codebase and look at their differences and similarities, that’s what I did (+ a hash to avoid quadraticity).</p>
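That scan is easy to sketch. A minimal version, assuming you run it from the repository root (the pattern, paths, and hashing here are illustrative, not the exact script used):

```python
# Illustrative sketch: hash the body of a given method across modeling files
# to group identical copies without comparing every pair (avoids quadratic cost).
import hashlib
import re
from collections import defaultdict
from pathlib import Path

# Matches a top-level `def rotate_half(` and its indented body.
PATTERN = re.compile(r"^def rotate_half\(.*\n(?:[ \t]+.*\n|\n)*", re.MULTILINE)

groups = defaultdict(list)
for path in Path("src/transformers/models").rglob("modeling_*.py"):
    for match in PATTERN.finditer(path.read_text()):
        digest = hashlib.sha256(match.group().encode()).hexdigest()
        groups[digest].append(path.name)

for digest, files in groups.items():
    print(f"{len(files)} file(s) share an identical body, e.g. {files[:3]}")
```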
430
- <p>We want all models to have self-contained modeling code.</p>
431
- <p>Every core functionality <em>must</em> be in the modeling code, every non-core functionality <em>can</em> be outside of it.</p>
432
- <p>This comes as a great cost. Enter the <code>#Copied from...</code> mechanism: for a long time, these comments were indicating that some code was copied from another model, saving time both for the reviewers and for the CI. But the LOC count kept creeping up. Each new model copied over hundreds of lines that we considered largely boilerplate, yet, we could not remove them.</p>
433
- <p>We needed to separate both principles that were so far intertwined, <a href="#do-repeat-yourself">repetition</a> and <a href="#one-model-one-file">hackability</a>.</p>
434
- <p>What was the solution to this?</p>
435
- <div class="crumbs"><p>Read the code in one place (<a href="#one-model-one-file">One Model, One File</a>). Keep semantics local (<a href="#standardize-dont-abstract">Standardize, Don’t Abstract</a>). Allow strategic duplication for end users (<a href="#do-repeat-yourself">DRY*</a>). Keep the public surface minimal and stable (<a href="#minimal-user-api">Minimal API</a>, <a href="#backwards-compatibility">Backwards Compatibility</a>, <a href="#consistent-public-surface">Consistent Surface</a>). <strong>Next:</strong> how modular transformers honor these while removing boilerplate.</p></div>
436
  <h2 id="-modular-transformers"><a href="#-modular-transformers"><a id="modular"></a> Modular transformers</a></h2>
437
- <p>Transformers is an opinionated library. The previous <a href="https://huggingface.co/docs/transformers/en/philosophy">philosophy</a> page, and the <a href="https://huggingface.co/blog/transformers-design-philosophy">blog post</a> were already pointing at the drawbacks mentioned just above, which have been iteratively addressed. <a href="https://huggingface.co/docs/transformers/en/modular_transformers"><code>modular</code> transformers were introduced</a>, allowing a form of inheritance without breaking <a href="#one-model-one-file">One model, One file</a>.</p>
438
- <p>We amended the principle of <a href="#do-repeat-yourself">DRY*</a> by removing progressively all pieces of code that were “copied from” another file.</p>
439
- <p>It works as follows: to contribute a model, you define a <code>modular_</code> file that can inherit from <em>any function across all other modeling, configuration and processor files</em>.
440
- This modular file can use inheritance across models; it is then unravelled into a fully functional modeling file.</p>
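For intuition, a modular file can be as small as this; the model name is hypothetical, and the real mechanics are shown in the GLM example below:

```python
# Hypothetical modular_mymodel.py: declare reuse, override only what differs.
# The codegen expands this into a fully visible modeling_mymodel.py.
from transformers.models.llama.modeling_llama import LlamaAttention, LlamaForCausalLM


class MyModelAttention(LlamaAttention):
    pass  # inherited unchanged; the expanded file will contain the full class


class MyModelForCausalLM(LlamaForCausalLM):
    pass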
441
  <summary id="generated-modeling">Auto-generated modeling code</summary>
442
- <figure class="html-embed"><div class="html-embed__card"><div id="frag-971efks994o"><div class="code-compare" style="display: grid; grid-template-columns: 1fr 1fr; gap: 1rem; margin: 1.5rem 0;">
443
  <div class="code-column" style="border: 1px solid #e2e8f0; border-radius: 8px; overflow: hidden;">
444
  <div class="code-header" style="background: #f8f9fa; padding: 0.75rem 1rem; font-weight: 600; color: #495057; border-bottom: 1px solid #e2e8f0;">
445
  modular_glm.py
@@ -588,10 +597,13 @@ class GlmRMSNorm(nn.Module):
588
  <strong>Left:</strong> Clean modular definition with inheritance.
589
  <strong>Right:</strong> Auto-expanded version with all inherited functionality visible.
590
  </p></div></div></figure>
591
- <p>As you can see, we can now define any model as a <em>modular</em> of another.</p>
592
  <p>You might think “well that’s just how inheritance works”. The crucial difference is that we do <em>visibly</em> what is essentially the <em>compiler</em>’s job: by unrolling the inheritances, we make visible all of the modeling code, keeping it <a href="#one-model-one-file">all in one piece</a>.</p>
 
 
593
  <p>What is the consequence? When adding a model, we do not need to go over the entire modeling file. The modular (left side above) is enough.</p>
594
- <p>When <code>AutoModel.from_pretrained(...)</code> is called, it is indeed the modeling (right side) that is run, and all the tests are run on the modeling code.</p>
 
595
  <p>What does that give us?</p>
596
  <div class="crumbs"><p><strong>TL;DR:</strong> A small <code>modular_*.py</code> declares reuse; the expanded modeling file stays visible (<a href="#one-model-one-file">One Model, One File tenet preserved</a>). Reviewers and contributors maintain the shard, not the repetition.</p><p><strong>Next:</strong> the measurable effect on effective LOC and maintenance cost.</p></div>
597
  <h3 id="a-maintainable-control-surface"><a href="#a-maintainable-control-surface">A maintainable control surface</a></h3>
@@ -599,14 +611,15 @@ class GlmRMSNorm(nn.Module):
599
However, if a model has a <code>modular_*.py</code> and a corresponding automatically generated <code>modeling_*.py</code>, we only count the LOC under the modular file. The modeling code has no maintenance cost as it is strictly dependent on the modular file.</p>
600
<p>That gives an “effective LOC” curve: the <strong>maintenance surface</strong>.</p>
601
  <p>Measured on git history, raw <code>modeling_*.py</code> grew at ~362 LOC/day before modular; counting only modular shards yields ~25 LOC/day after — about <strong>15× lower</strong>. The effective curve (blue line below) represents the <strong>maintenance surface</strong> today: what maintainers actually read and review.</p>
602
- <p>Less code to hand-maintain means fewer places to break. LOC is not a direct measure of complexity, but it correlates with review effort and change risk.</p>
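The counting rule itself is simple. A minimal sketch, assuming the standard repository layout (the helper name is ours, not the measurement script):

```python
# Sketch of the "effective LOC" rule: if a modular shard exists, count it;
# otherwise count the modeling file itself.
from pathlib import Path


def effective_loc(model_dir: Path) -> int:
    files = list(model_dir.glob("modular_*.py")) or list(model_dir.glob("modeling_*.py"))
    return sum(len(f.read_text().splitlines()) for f in files)


models = Path("src/transformers/models")
total = sum(effective_loc(d) for d in models.iterdir() if d.is_dir())
print(total)
```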
603
- <figure class="html-embed"><div class="html-embed__card"><div id="frag-b9s6rzaeqn8"><iframe
604
  src="https://molbap-loc-1.hf.space"
605
  style="width:100%; height:900px; border:0"
606
  allow="clipboard-read; clipboard-write; fullscreen"
607
  referrerpolicy="no-referrer-when-downgrade"
608
  ></iframe></div></div></figure>
609
- <p>If you zoom in, you’ll notice there’s a sharp drop near the end, it’s essentially due to us <a href="https://github.com/huggingface/transformers/commit/4df2529d79d75f44e70396df5888a32ffa02d61e#diff-60849db3e9922197854ef1cac92bf4aba08b5d7fd3fe6f3c16a3511e29e0eacc">removing support for Jax and TensorFlow</a> library-wide.</p>
 
610
  <p>But this was not the only effort that allowed us to reduce maintenance load.</p>
611
  <p>We recently underwent a deep refactor of the attention implementation. You’ve likely heard about <a href="https://huggingface.co/docs/text-generation-inference/en/conceptual/flash_attention">flash attention</a> and its several variants.</p>
612
  <p>The <em>attention computation</em> itself happens at a <em>lower</em> level of abstraction than the model itself.</p>
@@ -632,41 +645,40 @@ referrerpolicy="no-referrer-when-downgrade"
632
  <div class="crumbs"><p>Attention semantics remain in <code>eager_attention_forward</code>; faster backends are opt-in via config. We inform via types/annotations rather than enforce rigid kwargs, preserving integrations.</p><p><strong>Next:</strong> parallel partitioning is declared as a plan, not through model surgery.</p></div>
633
  <h3 id="-configurable-tensor-parallelism"><a href="#-configurable-tensor-parallelism"><a id="simpler-tensor-parallelism"></a> Configurable Tensor Parallelism</a></h3>
634
  <p>If you’re not familiar with the different flavours of parallelism, I recommend checking out <a href="https://huggingface.co/blog/accelerate-nd-parallel">this blog post</a> first, and of course a full <a href="https://huggingface.co/spaces/nanotron/ultrascale-playbook">dive into the ultra-scale playbook</a> is always recommended.</p>
635
- <p>The essential part is that, as <a href="https://huggingface.co/docs/transformers/v4.56.2/perf_train_gpu_many#tensor-parallelism">the documentation states</a>, when tensors get too large to fit on a single GPU, they are sliced along a particular dimension and every slice is sent to a different GPU.</p>
636
  <p>Why does it matter?</p>
637
- <p>Because we want to avoid code modifications that are unrelated to the model.
638
- We choose to place the level of abstraction higher than the device placement: a matrix multiplication - a <code>nn.Linear</code> layer - should always be expressed in the same way, regardless of how it is placed.</p>
639
- <p>Hence, we want to touch the modeling code <a href="#minimal-user-api">minimally</a>, and only modify it when <em>architectural changes</em> are involved. For instance, for tensor parallelism, we instead now specify a simple <code>tp_plan</code>.</p>
640
- <p>The alternative would be to modify parent classes with distribution logic specific to each model, which is exactly the kind of model-unrelated change we avoid.</p>
641
- <p>It is written once in the config and passed to <code>.from_pretrained()</code>. The plan maps module name patterns to partitioning strategies. Strategies are resolved by the internal <code>ParallelInterface</code>, which wires to sharding implementations <code>ColwiseParallel</code>, <code>RowwiseParallel</code>, packed variants, and so on.</p>
642
- <figure class="html-embed"><div class="html-embed__card"><div id="frag-cnu461t3tc"><pre><code class="language-python"># In the model's config (example: ERNIE 4.5-style decoder blocks)
643
- base_model_tp_plan = {
644
- "layers.*.self_attn.q_proj": "colwise",
645
- "layers.*.self_attn.k_proj": "colwise",
646
- "layers.*.self_attn.v_proj": "colwise",
647
- "layers.*.self_attn.o_proj": "rowwise",
648
- "layers.*.mlp.gate_proj": "colwise",
649
- "layers.*.mlp.up_proj": "colwise",
650
- "layers.*.mlp.down_proj": "rowwise",
651
- }
652
-
653
- # Runtime
654
- import torch
655
- from transformers import AutoModelForCausalLM, AutoTokenizer
656
-
657
- model_id = "your/model-or-local-checkpoint"
658
- model = AutoModelForCausalLM.from_pretrained(
659
- model_id,
660
- dtype=torch.bfloat16,
661
- tp_plan=base_model_tp_plan, # <-- plan defined above
662
- )
663
- tok = AutoTokenizer.from_pretrained(model_id)
664
- inputs = tok("Hello", return_tensors="pt").to(model.device)
665
- out = model(**inputs)</code></pre></div></div></figure>
666
- <p>Which allows a user to run with multiple processes per node, e.g. 4 GPUs:</p>
667
  <p><code>torchrun --nproc-per-node 4 demo.py</code></p>
668
- <p>Semantics stay in the model (a Linear stays a Linear); distribution is orthogonal and declared via strings: “colwise” splits columns of weights/bias across ranks; “rowwise” splits rows; packed variants shard fused weights. The mapping keys accept glob patterns like <code>layers.*.mlp.down_proj</code> to target repeated submodules.</p>
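To make the two strategies concrete, here is what the split looks like on a raw weight, assuming the usual <code>[out_features, in_features]</code> layout of an <code>nn.Linear</code> (a sketch, not the internal implementation):

```python
import torch

W = torch.randn(128, 64)  # nn.Linear weight: [out_features, in_features]
world_size = 4

colwise_shards = W.chunk(world_size, dim=0)  # each rank holds a slice of output features
rowwise_shards = W.chunk(world_size, dim=1)  # each rank holds a slice of input features
```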
669
- <div class="crumbs"><p>Sharding is configuration (<code>tp_plan</code>), not edits to <code>Linear</code>s. Glob patterns target repeated blocks; modeling semantics stay intact. <strong>Next:</strong> per-layer attention/caching schedules declared in config, not hardcoded.</p></div>
670
  <h3 id="-layers-attentions-and-caches"><a href="#-layers-attentions-and-caches"><a id="layers-attentions-caches"></a> Layers, attentions and caches</a></h3>
671
  <p>Following the same logic, the <em>nature</em> of attention and per-layer caching should not be hardcoded. We should be able to specify in the configuration how each layer is implemented. Thus, we define a mapping like:</p>
672
  <div class="code-card"><button class="code-copy button--ghost" type="button" aria-label="Copy code"><svg viewBox="0 0 24 24" aria-hidden="true" focusable="false"><path d="M16 1H4c-1.1 0-2 .9-2 2v12h2V3h12V1zm3 4H8c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h11c1.1 0 2-.9 2-2V7c0-1.1-.9-2-2-2zm0 16H8V7h11v14z"></path></svg></button><pre class="astro-code astro-code-themes github-light github-dark" style="--shiki-light:#24292e;--shiki-dark:#e1e4e8;--shiki-light-bg:#fff;--shiki-dark-bg:#24292e;overflow-x:auto" tabindex="0" data-language="python"><code><span class="line"><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">ALLOWED_LAYER_TYPES</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> =</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> (</span></span>
@@ -694,11 +706,11 @@ out = model(**inputs)</code></pre></div></div></figure>
694
  <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">class</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0"> GlmRMSNorm</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">(</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0">nn</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0">Module</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">):</span></span>
695
  <span class="line"><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> ...</span></span>
696
  <span class="line"></span></code></pre></div>
697
- <p>This also opens another contribution path: GPU specialists can contribute optimized kernels to the kernel hub, and have them usable in <code>transformers</code>. You can check on the <a href="https://huggingface.co/blog/hello-hf-kernels">kernel community blog post</a> to learn more about it!</p>
698
<p>Even more resources have been added, like the formidable <a href="https://github.com/huggingface/kernel-builder">kernel builder</a> and its companion guides to <a href="https://github.com/huggingface/kernel-builder/blob/main/docs/writing-kernels.md">help you write kernels</a> and <a href="https://github.com/huggingface/kernel-builder/blob/main/docs/nix.md">build them with nix</a>.</p>
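As a hedged usage sketch following the kernel community blog post (the Hub repo name and op come from that post; availability may vary):

```python
import torch
from kernels import get_kernel

# Pull an optimized op from the kernel hub and use it like a local function.
activation = get_kernel("kernels-community/activation")
x = torch.randn(10, 10, dtype=torch.float16, device="cuda")
y = torch.empty_like(x)
activation.gelu_fast(y, x)
```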
699
- <div class="crumbs"><p>Models define semantics; kernels define how to run them faster. Use annotations to borrow community forwards while keeping a consistent public surface. <strong>Next:</strong> what modularity looks like across the repo.</p></div>
700
- <h2 id="modular-developments"><a href="#modular-developments">Modular developments</a></h2>
701
- <p>Now, we have a form of inheritance in our codebase. Some models become standards, and model contributors are given the opportunity to <em>define standards</em>. Pushing the boundaries of scientific knowledge can translate into the boundaries of engineering if this effort is made, and we’re striving for it.
702
It’s hard to conceptualize very large libraries and how their components interact with each other, regardless of your capacity for abstraction.
703
  So I wanted to take a look at the current <strong>state of modularity</strong> across the repository. How many models are defined using components of others?</p>
704
  <p>To get this graph, I used the heuristic of modular inheritance.</p>
@@ -707,31 +719,45 @@ So I wanted to take a look at the current <strong>state of modularity</strong> a
707
  <li>In this <code>modular</code> file, what models, configurations and processings are imported?</li>
708
  <li>Recurse through the model list that way.</li>
709
  </ol>
710
- <p>So what do we see? Llama is a basis for many models, and it shows.
711
- Radically different architectures such as mamba have spawned their own dependency subgraph.</p>
712
- <figure class="html-embed"><div class="html-embed__card"><div id="frag-3popmj9j337"><iframe
 
713
  src="https://molbap-dependencies-1.hf.space"
714
  style="width:100%; height:680px; border:0"
715
  allow="clipboard-read; clipboard-write; fullscreen"
716
  referrerpolicy="no-referrer-when-downgrade"
717
  ></iframe></div></div></figure>
718
<p>In the case of VLMs, there are far too many vision-based architectures that are not yet defined as modulars of other existing archs. In other words, there is no strong reference point in terms of software for vision models.
719
- As you can see, there is a small DETR island, a little llava pocket, and so on, but it’s not comparable to the centrality observed for llama.</p>
720
- <p>Another problem is, this visualization only shows <code>modular</code> models. Several models still do NOT have a modular file.</p>
721
- <p>How do we spot them, and how do we identify modularisable models?</p>
722
- <div class="crumbs"><p>Graph reading guide: nodes are models; edges are modular imports. Llama-lineage is a hub; several VLMs remain islands — engineering opportunity for shared parents. <strong>Next:</strong> timeline + similarity signals to spot candidates.</p></div>
723
  <h3 id="many-models-but-not-enough-yet-are-alike"><a href="#many-models-but-not-enough-yet-are-alike">Many models, but not enough yet, are alike</a></h3>
724
- <p>I looked into Jaccard similarity, which we use to measure set differences, to find similarities across models. I know that code is more than a set of characters strung together. We also tried code-embedding models that ranked candidates better in practice, but for this post we stick to the deterministic Jaccard index. You can take a look at <a href="https://github.com/huggingface/transformers/pull/41289">the corresponding PR</a> for the embedding method.</p>
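For reference, the index itself is tiny. A sketch over whitespace tokens (the real comparison normalizes code more carefully):

```python
def jaccard(code_a: str, code_b: str) -> float:
    """Set-based similarity: |A intersection B| / |A union B| over code tokens."""
    a, b = set(code_a.split()), set(code_b.split())
    return len(a & b) / len(a | b) if a | b else 0.0
```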
725
- <p>It is interesting, for our comparison, to look at <em>when</em> we deployed this modular logic and what its ripple effect on the library was. You can check the <a href="https://huggingface.co/spaces/Molbap/transformers-modular-refactor">larger space</a> to play around, but the gist is: adding modular allowed us to connect more and more models to solid reference points.</p>
726
  <p>Yet, we still have a lot of gaps to fill.</p>
727
- <p>Zoom out below - it’s full of models. You can click on a node to see its connections better, or use the text box to search for a model.</p>
728
- <figure class="html-embed"><div class="html-embed__card"><div id="frag-n93vsx8na5e"> <iframe
729
  src="https://molbap-timeline-1.hf.space"
730
  style="width:100%; height:680px; border:0"
731
  allow="clipboard-read; clipboard-write; fullscreen"
732
  referrerpolicy="no-referrer-when-downgrade"
733
  ></iframe></div></div></figure>
734
- <p>If you’ve checked out llava, you’ve seen that llava_video is a red node, connected by a red edge to llava: it’s a candidate, something that we can <em>likely</em> remodularize, <a href="#backwards-compatibility">not touching the actual model</a> but being much more readable with <a href="#do-repeat-yourself">DRY*</a>.</p>
735
  <div class="crumbs"><p>Similarity metrics (Jaccard index or embeddings) surfaces likely parents; the timeline shows consolidation after modular landed. Red nodes/edges = candidates (e.g., <code>llava_video</code> → <code>llava</code>) for refactors that preserve behavior. <strong>Next:</strong> concrete VLM choices that avoid leaky abstractions.</p></div>
736
  <h3 id="vlm-improvements-avoiding-abstraction"><a href="#vlm-improvements-avoiding-abstraction">VLM improvements, avoiding abstraction</a></h3>
737
  <p>We don’t yet have a cookbook for common VLM patterns (image token scatter, multi‑tower encoders, cross‑attention bridges). This is one of the main improvement points where we can work.</p>
@@ -787,13 +813,14 @@ That means every decision we make to abstract something else has to be extremely
787
  <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> return</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> special_image_mask, special_video_mask</span></span>
788
  <span class="line"></span></code></pre></div>
789
  <p>But this is <em>within</em> the modeling file, not in the <code>PreTrainedModel</code> base class. It will not move away from it, because it’d break the <a href="#one-model-one-file">self-contained logic</a> of the model.</p>
790
- <div class="crumbs"><p>Keep VLM embedding mix in the modeling file (semantics), standardize safe helpers (e.g., placeholder masking), don’t migrate behavior to <code>PreTrainedModel</code>. <strong>Next:</strong> pipeline-level wins that came from PyTorch-first choices (fast processors).</p></div>
 
791
  <h3 id="on-image-processing-and-processors"><a href="#on-image-processing-and-processors">On image processing and processors</a></h3>
792
- <p>Choosing to be <code>torch</code>-first software meant shedding a tremendous amount of support for <code>jax</code> and <code>TensorFlow</code>, and it also meant that we could be more liberal in the amount of torch-dependent utilities we add. One of these is the <em>fast processing</em> of images. Where inputs were previously assumed to be minimal ndarrays, making stronger assumptions and enforcing <code>torch</code> and <code>torchvision</code> native inputs allowed us to massively speed up the processing time for each model.</p>
793
- <p>The gains in performance are immense, up to 20x speedup for most models when using compiled torchvision ops. Further, it allows the whole pipeline to run solely on GPU.</p>
794
  <p><img src="/images/transformers/fast_image_processors.png" alt="Fast Image Processors Performance"/>
795
  <p class="figure-legend">Thanks <a href="https://huggingface.co/yonigozlan">Yoni Gozlan</a> for the great work!</p></p>
796
- <div class="crumbs"><p>Torch-first lets processors assume torch/torchvision and run the whole pipeline on GPU; big per-model speedups. <strong>Next:</strong> how this lowers friction for contributors and downstream users.</p></div>
797
  <h2 id="reduce-barrier-to-entrycontribution"><a href="#reduce-barrier-to-entrycontribution">Reduce barrier to entry/contribution</a></h2>
798
  <p>This is an overall objective: there’s no <code>transformers</code> without its community.</p>
799
  <p>Having a framework means forcing users into it. It restrains flexibility and creativity, which are the fertile soil for new ideas to grow.</p>
@@ -803,7 +830,7 @@ That means every decision we make to abstract something else has to be extremely
803
  <strong>Next:</strong> power tools enabled by a consistent API.</p></div>
804
  <h3 id="-models-popularity"><a href="#-models-popularity"><a id="encoders-ftw"></a> Models popularity</a></h3>
805
  <p>Talking about dependencies, we can take a look at the number of downloads as a measure of popularity. One thing we see is the prominence of encoders, despite the apparent prevalence of decoder LLMs. The reason is that encoders are used to generate embeddings, which have multiple downstream uses. Just check out <a href="https://huggingface.co/blog/embeddinggemma">EmbeddingGemma</a> for a modern recap. Hence, it is vital to keep the encoders portion of the library viable, usable, fine-tune-able.</p>
806
- <div><figure class="html-embed"><div class="html-embed__card"><div id="frag-0yx46sf56oel"><html>
807
  <head><meta charset="utf-8" /></head>
808
  <body>
809
  <div> <script type="text/javascript">window.PlotlyConfig = {MathJaxConfig: 'local'};</script>
@@ -4700,7 +4727,7 @@ return Plotly;
4700
  <p>All models have the same API for attention computation, thanks to <a href="#external-attention-classes">the externalisation of attention classes</a>.</p>
4701
  <p>This uniformity allows us to build cool tools to visualize the inner workings of the attention mechanism.</p>
4702
  <p>One particular piece of machinery is the <code>attention mask</code>. Here you see the famous bidirectional attention pattern for the whole prefix (text + image) in PaliGemma and all Gemma2+ models, contrasting with the usual “causal-only” models.</p>
4703
- <figure class="html-embed"><div class="html-embed__card"><div id="frag-xi8y9of6l1"><!-- Minimal HTML fragment: terminal-style ASCII attention masks -->
4704
  <div style="max-width: 940px; margin: 16px 0; border:1px solid #2a2f3a; border-radius:8px; background:#0b0f19; font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, 'Liberation Mono', 'Courier New', monospace; color:#e5e7eb;">
4705
  <div style="display:flex; align-items:center; gap:8px; padding:8px 10px; border-bottom:1px solid #1f2430; background:#111827; border-top-left-radius:8px; border-top-right-radius:8px;">
4706
  <span style="width:10px; height:10px; background:#ef4444; border-radius:50%; display:inline-block;"></span>
@@ -4745,16 +4772,15 @@ return Plotly;
4745
  </div>
4746
  </div>
4747
  </div></div></figure>
4748
- <div class="crumbs"><p>Uniform attention APIs enable cross-model diagnostics (e.g., PaliGemma prefix bidirectionality vs causal).
4749
- <strong>Next:</strong> whole-model tracing for ports and regressions.</p></div>
4750
  <h3 id="logging-entire-model-activations"><a href="#logging-entire-model-activations">Logging entire model activations</a></h3>
4751
- <p>Further, because it is all PyTorch (even more so now that we support only PyTorch), we can easily <a href="https://huggingface.co/docs/transformers/internal/model_debugging_utils">debug any model</a> when we want to add it to transformers. We now have a power-user tool for porting or adding models that wraps a forward pass, intercepts every submodule call, and logs shapes, dtypes, and sample statistics of inputs/outputs to nested JSON.</p>
4752
- <p>It just works with PyTorch models and is especially useful when aligning outputs with a reference implementation, aligned with our <a href="#source-of-truth">core guideline</a>.</p>
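The idea fits in a few lines of plain PyTorch. A simplified sketch of what such a tool does (not the actual transformers implementation):

```python
import json
import torch


def trace_forward(model, **inputs):
    """Intercept every submodule call and record output shape/dtype/stats."""
    log, hooks = {}, []
    for name, module in model.named_modules():
        def hook(mod, args, output, name=name):
            out = output[0] if isinstance(output, tuple) else output
            if isinstance(out, torch.Tensor):
                log[name] = {"shape": list(out.shape), "dtype": str(out.dtype),
                             "mean": out.float().mean().item()}
        hooks.append(module.register_forward_hook(hook))
    try:
        model(**inputs)
    finally:
        for h in hooks:
            h.remove()
    return json.dumps(log, indent=2)
```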
4753
  <p><img src="/images/transformers/model_debugger.png" alt="Model debugger interface"/></p>
4754
- <div class="crumbs"><p>Forward interception and nested JSON logging align ports to reference implementations, reinforcing “Source of Truth.” <strong>Next:</strong> CUDA warmup reduces load-time stalls without touching modeling semantics.</p></div>
4755
  <h3 id="cooking-faster-cuda-warmups"><a href="#cooking-faster-cuda-warmups">Cooking faster CUDA warmups</a></h3>
4756
- <p>Having a clean <em>external</em> API allows us to work on the <a href="#code-is-product">true inner workings of transformers</a>. One recent addition was the <em>CUDA warmup</em> via <code>caching_allocator_warmup</code>, which massively improved the loading footprint by pre-allocating GPU memory to avoid malloc bottlenecks during model loading, achieving a 7x speedup for an 8B model and 6x for a 32B one. You can check out <a href="https://github.com/huggingface/transformers/pull/36380">the source</a>!</p>
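The trick, reduced to its essence (a sketch of the idea, not the function in the PR):

```python
import torch


def warmup_caching_allocator(nbytes: int, device: str = "cuda") -> None:
    """One large allocation primes torch's caching allocator, so the many
    per-tensor allocations during weight loading reuse cached blocks
    instead of hitting cudaMalloc (assumption: allocator caching is enabled)."""
    buffer = torch.empty(nbytes, dtype=torch.uint8, device=device)
    del buffer  # the memory stays in the allocator's cache
```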
4757
- <figure class="html-embed"><div class="html-embed__card"><div id="frag-d8fi5kw4x4v"><style>
4758
  /* 1) Scope tokens to the widget */
4759
  .warmup-demo{
4760
  --page-bg:#ffffff;
@@ -5056,7 +5082,7 @@ return Plotly;
5056
  }
5057
  </script></div></div></figure>
5058
  <p>It’s hard to overstate how much of a lifesaver that is when you’re trying to load a model as fast as possible, as it’s the narrowest bottleneck for your iteration speed.</p>
5059
- <div class="crumbs"><p>Pre-allocating GPU memory removes malloc spikes (e.g., 7× for 8B, 6× for 32B in the referenced PR). <strong>Next:</strong> serving benefits directly from consistent interfaces and modularity.</p></div>
5060
  <h3 id="transformers-serve-and-continuous-batching"><a href="#transformers-serve-and-continuous-batching">Transformers-serve and continuous batching</a></h3>
5061
<p>Having all these models readily available and sharing the same interface allowed us to implement transformers-serve, a CLI tool to expose models through a standard OpenAI-compatible HTTP API.</p>
5062
  <div class="code-card"><button class="code-copy button--ghost" type="button" aria-label="Copy code"><svg viewBox="0 0 24 24" aria-hidden="true" focusable="false"><path d="M16 1H4c-1.1 0-2 .9-2 2v12h2V3h12V1zm3 4H8c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h11c1.1 0 2-.9 2-2V7c0-1.1-.9-2-2-2zm0 16H8V7h11v14z"></path></svg></button><pre class="astro-code astro-code-themes github-light github-dark" style="--shiki-light:#24292e;--shiki-dark:#e1e4e8;--shiki-light-bg:#fff;--shiki-dark-bg:#24292e;overflow-x:auto" tabindex="0" data-language="bash"><code><span class="line"><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0">transformers</span><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> serve</span></span>
@@ -5087,9 +5113,9 @@ return Plotly;
5087
  <strong>Next:</strong> what changes in v5 without breaking the promise of visible semantics.</p></div>
5088
  <h2 id="what-is-coming-next"><a href="#what-is-coming-next">What is coming next</a></h2>
5089
  <p>The next major version of <code>transformers</code> is just around the corner (and will have another blog post to its name when it comes out). When v5 is released, we aim to keep <a href="#backwards-compatibility">backwards compatibility</a> as solid as possible. The changes we make now are in service of that goal.</p>
5090
- <p>We will lean further into a modular toolbox, not a framework. You should not be forced to rewrite modeling code. It’s better when a model can inherit from <code>PreTrainedModel</code> and opt into Tensor Parallel, <code>from_pretrained</code>, sharding, <code>push_to_hub</code>, loss plumbing, and external stacks like PEFT/TRL/SGLang/vLLM.</p> </main> </section> <footer class="footer"> <div class="footer-inner"> <section class="citation-block"> <h3>Citation</h3> <p>For attribution, cite this work as</p> <pre class="citation short">Pablo Montalvo (2025). &quot;Maintain the unmaintainable: 1M python loc, 400+ models&quot;.</pre> <p>BibTeX citation</p> <pre class="citation long">@misc{montalvo2025_maintain_the_unmaintaina,
5091
  title={Maintain the unmaintainable: 1M python loc, 400+ models},
5092
- author={Pablo Montalvo},
5093
  year={2025},
5094
 
5095
  }</pre> </section> <section class="references-block"> </section> </div> </footer> <script>
 
1
  <!DOCTYPE html><html lang="en" data-theme="light" data-toc-auto-collapse="1"> <head><meta charset="utf-8"><meta name="viewport" content="width=device-width, initial-scale=1"><title>Maintain the unmaintainable:
2
  1M python loc, 400+ models</title><meta name="description" content="A peek into software engineering for the transformers library"><link rel="canonical" href="http://localhost:4321/"><meta property="og:type" content="article"><meta property="og:title" content="Maintain the unmaintainable:
3
+ 1M python loc, 400+ models"><meta property="og:description" content="A peek into software engineering for the transformers library"><meta property="og:url" content="http://localhost:4321/"><meta property="og:image" content="/thumb.auto.jpg"><meta property="article:published_time" content="October 2, 2025"><meta property="article:author" content="Pablo Montalvo"><meta property="article:author" content="Lysandre Debut"><meta property="article:author" content="Pedro Cuenca"><meta property="article:author" content="Yoni Gozlan"><meta name="twitter:card" content="summary_large_image"><meta name="twitter:title" content="Maintain the unmaintainable:
4
+ 1M python loc, 400+ models"><meta name="twitter:description" content="A peek into software engineering for the transformers library"><meta name="twitter:image" content="/thumb.auto.jpg"><script type="application/ld+json">{"@context":"https://schema.org","@type":"Article","headline":"Maintain the unmaintainable:\n1M python loc, 400+ models","description":"A peek into software engineering for the transformers library","datePublished":"October 2, 2025","author":[{"@type":"Person","name":"Pablo Montalvo"},{"@type":"Person","name":"Lysandre Debut"},{"@type":"Person","name":"Pedro Cuenca"},{"@type":"Person","name":"Yoni Gozlan"}],"keywords":"transformers, engineering, design-philosophy","mainEntityOfPage":"http://localhost:4321/","image":["/thumb.auto.jpg"]}</script><script>
5
  (() => {
6
  try {
7
  const saved = localStorage.getItem("theme");
 
12
  document.documentElement.setAttribute("data-theme", theme);
13
  } catch {}
14
  })();
15
+ </script><script type="module" src="/scripts/color-palettes.js"></script><!-- TO MANAGE PROPERLY --><script src="https://cdn.plot.ly/plotly-3.0.0.min.js" charset="utf-8"></script><link rel="stylesheet" href="/_astro/index.7hgRH84_.css"><script type="module" src="/_astro/hoisted.DK-CdsVg.js"></script>
16
+ <script type="module" src="/_astro/page.CH0W_C1Z.js"></script></head> <body> <button id="theme-toggle" aria-label="Toggle color theme" data-astro-cid-x3pjskd3> <svg class="icon light" width="20" height="20" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" data-astro-cid-x3pjskd3> <circle cx="12" cy="12" r="5" data-astro-cid-x3pjskd3></circle> <line x1="12" y1="1" x2="12" y2="4" data-astro-cid-x3pjskd3></line> <line x1="12" y1="20" x2="12" y2="23" data-astro-cid-x3pjskd3></line> <line x1="1" y1="12" x2="4" y2="12" data-astro-cid-x3pjskd3></line> <line x1="20" y1="12" x2="23" y2="12" data-astro-cid-x3pjskd3></line> <line x1="4.22" y1="4.22" x2="6.34" y2="6.34" data-astro-cid-x3pjskd3></line> <line x1="17.66" y1="17.66" x2="19.78" y2="19.78" data-astro-cid-x3pjskd3></line> <line x1="4.22" y1="19.78" x2="6.34" y2="17.66" data-astro-cid-x3pjskd3></line> <line x1="17.66" y1="6.34" x2="19.78" y2="4.22" data-astro-cid-x3pjskd3></line> </svg> <svg class="icon dark" width="20" height="20" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" data-astro-cid-x3pjskd3> <path d="M21 12.79A9 9 0 1 1 11.21 3 7 7 0 0 0 21 12.79z" data-astro-cid-x3pjskd3></path> </svg> </button> <section class="hero" data-astro-cid-bbe6dxrz> <h1 class="hero-title" data-astro-cid-bbe6dxrz>Maintain the unmaintainable:<br/>1M python loc, 400+ models</h1> <div class="hero-banner" data-astro-cid-bbe6dxrz> <figure class="html-embed"><div class="html-embed__card is-frameless"><div id="frag-xl9xb41cgvg"><style>
17
  @import url('https://fonts.googleapis.com/css2?family=Inter:wght@500;600&display=swap');
18
 
19
  .banner-container {
 
137
  .attr('dy','-1.1em')
138
  .text(d => shortId(d.id));
139
 
140
+ // Pin the llama node at center
141
+ const llamaNode = graph.nodes.find(d => d.id === 'llama');
142
+ if (llamaNode) {
143
+ llamaNode.fx = W / 2;
144
+ llamaNode.fy = H / 2;
145
+ }
146
+
147
  // Forces tuned for wide, short aspect
148
  const sim = d3.forceSimulation(graph.nodes)
149
  .force('link', d3.forceLink(graph.links).id(d => d.id).distance(150).strength(0.4))
 
167
  // Fit on first paint (no zoom UI for banner)
168
  window.addEventListener('resize', () => location.reload());
169
  </script>
170
+ </div></div></figure> <p class="hero-desc" data-astro-cid-bbe6dxrz>A peek into software engineering for the transformers library</p> </div> </section> <header class="meta" aria-label="Article meta information" data-astro-cid-bbe6dxrz> <div class="meta-container" data-astro-cid-bbe6dxrz> <div class="meta-container-cell" data-astro-cid-bbe6dxrz> <h3 data-astro-cid-bbe6dxrz>Authors</h3> <ul class="authors" data-astro-cid-bbe6dxrz> <li data-astro-cid-bbe6dxrz> <a href="https://huggingface.co/Molbap" data-astro-cid-bbe6dxrz>Pablo Montalvo</a> , </li><li data-astro-cid-bbe6dxrz> <a href="https://huggingface.co/Lysandre" data-astro-cid-bbe6dxrz>Lysandre Debut</a> , </li><li data-astro-cid-bbe6dxrz> <a href="https://huggingface.co/pcuenq" data-astro-cid-bbe6dxrz>Pedro Cuenca</a> , </li><li data-astro-cid-bbe6dxrz> <a href="https://huggingface.co/yonigozlan" data-astro-cid-bbe6dxrz>Yoni Gozlan</a> </li> </ul> </div> <div class="meta-container-cell meta-container-cell--affiliations" data-astro-cid-bbe6dxrz> <h3 data-astro-cid-bbe6dxrz>Affiliation</h3> <p data-astro-cid-bbe6dxrz> <a href="https://huggingface.co" target="_blank" rel="noopener noreferrer" data-astro-cid-bbe6dxrz> Hugging Face </a> </p> </div> <div class="meta-container-cell meta-container-cell--published" data-astro-cid-bbe6dxrz> <h3 data-astro-cid-bbe6dxrz>Published</h3> <p data-astro-cid-bbe6dxrz>October 2, 2025</p> </div> <!-- {doi && (
171
  <div class="meta-container-cell">
172
  <h3>DOI</h3>
173
  <p><a href={`https://doi.org/${doi}`} target="_blank" rel="noopener noreferrer">{doi}</a></p>
174
  </div>
175
+ )} --> </div> </header> <section class="content-grid"> <nav class="table-of-contents" aria-label="Table of Contents" data-auto-collapse="1"> <div class="title">Table of Contents</div> <div id="article-toc-placeholder"></div> </nav> <details class="table-of-contents-mobile"> <summary>Table of Contents</summary> <div id="article-toc-mobile-placeholder"></div> </details> <script>
 
 
176
  // Build TOC from article headings (h2/h3/h4) and render into the sticky aside
177
  const buildTOC = () => {
178
  const holder = document.getElementById('article-toc-placeholder');
 
412
  <p>How do you keep such a ship afloat, made of so many moving, unrelated parts, contributed to by a buzzing hivemind? Especially as the pace of ML research accelerates? We receive constant feedback on everything from function signatures with hundreds of arguments to duplicated code and optimization concerns, and we listen to all of it, or try to. The library’s usage keeps on growing, and we are a small team of maintainers and contributors, backed by hundreds of open-source community members.
413
  We continue to support all new models and expect to do so for the foreseeable future.</p>
414
  <p>This post dissects the design philosophy that makes this possible. It’s the result of an evolution from our older principles, detailed on our previous <a href="https://huggingface.co/docs/transformers/en/philosophy">philosophy</a> page, as well as its accompanying <a href="https://huggingface.co/blog/transformers-design-philosophy">blog post from 2022</a>. More recently (and we strongly recommend the read) we published a blog post about <a href="https://huggingface.co/blog/faster-transformers">recent upgrades to transformers</a>, focusing on what makes the library faster today. All of these developments are only made possible thanks to these principles.</p>
415
+ <p>We formalize and articulate the “tenets” that have been guiding our development, demonstrate how they are implemented in code, and show the measurable impact they have on the library’s sustainability and growth.</p>
416
+ <p>For any OSS maintainer, power user, or contributor, this is the map to understanding, using, and building upon <code>transformers</code>, but not only: any project of comparable size will require you to make deep choices, not only on design and choice of abstraction, but on the very mindset of the software you are building. These tenets may or may not be applicable to your project, but they provide a glimpse into how we work that could be helpful or inspirational.</p>
417
+ <p>Conventions used throughout this post:</p>
418
  <p><a href="#source-of-truth">Tenets exemplified</a> will have their summary available on hover.</p>
419
  <p><a href="https://huggingface.co/blog/welcome-openai-gpt-oss">External links</a> to articles will help you solidify your knowledge.</p>
420
+ <p><a href="#generated-modeling">Several interactive visualisations</a> are available as you go - scroll, zoom, drag away to explore them.</p>
421
+ <div class="crumbs"><ul>
422
+ <li>Breadcrumb boxes summarize what you just learned, connect it to the tenets, and point to what’s coming <strong>Next</strong>. Think of them as narrative signposts to help you keep track.</li>
423
+ </ul></div>
424
+ <p>We will get started by enumerating the tenets. Then we’ll look at concrete examples that show how they shape our decision-making. These examples are necessarily detailed, and sometimes complex, because they illustrate the challenges of maintaining and growing a large codebase that caters to multiple communities, has millions of users, hundreds of contributors, and always strives for simplicity and consistency.</p>
425
  <h2 id="the-core-tenets-of-transformers"><a href="#the-core-tenets-of-transformers">The core tenets of transformers</a></h2>
426
  <p>We summarize the foundations on which we’ve built everything, and write the “tenets” of the library. They behave like <em>software interfaces</em>, hence it is crucial that they are explicitly written down. However opinionated they are, they have evolved over time.</p>
427
+ <p>These principles were not decided in a vacuum. The library <em>evolved</em> towards them, and once they <em>emerged</em>, they were recognized as critical.</p>
428
+ <div class="tenet-list"><ol><li class="tenet"><a id="source-of-truth"></a><strong>Source of Truth</strong><p>We aim to be a <a href="https://huggingface.co/blog/transformers-model-definition">source of truth for all model definitions</a>. This is more of a goal than a tenet, but it strongly guides our decisions. Model implementations should be reliable, reproducible, and faithful to the original implementations. If we are successful, they should become reference baselines for the ecosystem, so they’ll be easily adopted by downstream libraries and projects. It’s much easier for a project to <em>always</em> refer to the transformers implementation, than to learn a different research codebase every time a new architecture is released.</p><em>This overarching guideline ensures quality and reproducibility across all models in the library, and aspires to make the community work easier.</em></li><li class="tenet"><a id="one-model-one-file"></a><strong>One Model, One File</strong><p>All inference and training core logic has to be visible, top‑to‑bottom, to maximize each model’s hackability.</p><em>Every model should be completely understandable and hackable by reading a single file from top to bottom.</em></li><li class="tenet"><a id="code-is-product"></a><strong>Code is Product</strong><p>Optimize for reading, diffing, and tweaking, our users are power users. Variables can be explicit, full words, even several words, readability is primordial.</p><em>Code quality matters as much as functionality - optimize for human readers, not just computers.</em></li><li class="tenet"><a id="standardize-dont-abstract"></a><strong>Standardize, Don’t Abstract</strong><p>If it’s model behavior, keep it in the file; abstractions only for generic infra.</p><em>Model-specific logic belongs in the model file, not hidden behind abstractions.</em></li><li class="tenet"><a id="do-repeat-yourself"></a><strong>DRY* (DO Repeat Yourself)</strong><p>Copy when it helps users; keep successors in sync without centralizing behavior.</p><p><strong>Amendment:</strong> With the introduction and global adoption of <a href="#modular">modular</a> transformers, we do not repeat any logic in the modular files, but end user files remain faithful to the original tenet.</p><em>Strategic duplication can improve readability and maintainability when done thoughtfully.</em></li><li class="tenet"><a id="minimal-user-api"></a><strong>Minimal User API</strong><p>Config, model, preprocessing; from_pretrained, save_pretrained, push_to_hub. We want the least amount of codepaths. Reading should be obvious, configurations should be obvious.</p><em>Keep the public interface simple and predictable, users should know what to expect.</em></li><li class="tenet"><a id="backwards-compatibility"></a><strong>Backwards Compatibility</strong><p>Evolve by additive standardization, never break public APIs.</p><p>Any artifact that was once on the hub and loadable with transformers should be usable indefinitely with the same interface. Further, public methods should not change to avoid breaking dependencies. If we do deprecate something, it’s with very long cycles beforehand.</p><em>Once something is public, it stays public, evolution through addition, not breaking changes.</em></li><li class="tenet"><a id="consistent-public-surface"></a><strong>Consistent Public Surface</strong><p>Same argument names, same outputs, hidden states and attentions exposed, enforced by tests. 
This is a goal as well as a tenet.</p><em>All models should feel familiar - consistent interfaces reduce cognitive load.</em></li></ol></div>
429
+ <p>When a PR is merged, it is because the contribution is worthwhile, and because the <code>transformers</code> team finds the design of the contribution to be aligned with the tenets.</p>
430
+ <p>Does all the code in the library strictly follow these tenets? No. The library is a gigantic house with connected nooks, corridors, crannies everywhere, built by thousands of different workers. We <em>try</em> to make it so all the code added is compliant, because if we fail and merge it, we cannot change it lest we break <a href="#backwards-compatibility">backwards compatibility</a>.</p>
431
+ <p>To see what constitutes adherence to the tenets, let’s take the example of code repetition.</p>
432
+ <p>The following function, essential to the implementation of <a href="https://huggingface.co/papers/2104.09864">Rotary Positional Embeddings</a>, can be found in more than 70 <code>modeling_&lt;file&gt;.py</code> files across <code>src/transformers/models/</code>. Why keep it? Because we want all the model logic to be <a href="#one-model-one-file">contained in the modeling file</a>. In order to do that, we <a href="#do-repeat-yourself">do repeat ourselves</a>.</p>
433
  <div class="code-card"><button class="code-copy button--ghost" type="button" aria-label="Copy code"><svg viewBox="0 0 24 24" aria-hidden="true" focusable="false"><path d="M16 1H4c-1.1 0-2 .9-2 2v12h2V3h12V1zm3 4H8c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h11c1.1 0 2-.9 2-2V7c0-1.1-.9-2-2-2zm0 16H8V7h11v14z"></path></svg></button><pre class="astro-code astro-code-themes github-light github-dark" style="--shiki-light:#24292e;--shiki-dark:#e1e4e8;--shiki-light-bg:#fff;--shiki-dark-bg:#24292e;overflow-x:auto" tabindex="0" data-language="python"><code><span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">def</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0"> rotate_half</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">(x):</span></span>
434
  <span class="line"><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> &quot;&quot;&quot;Rotates half the hidden dims of the input.&quot;&quot;&quot;</span></span>
435
  <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> x1 </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> x[</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">...</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">, : x.shape[</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">-</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">1</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">] </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">//</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> 2</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">]</span></span>
436
  <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> x2 </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> x[</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">...</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">, x.shape[</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">-</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">1</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">] </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">//</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> 2</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> :]</span></span>
437
  <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> return</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> torch.cat((</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">-</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">x2, x1), </span><span style="--shiki-light:#E36209;--shiki-dark:#FFAB70">dim</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=-</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">1</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">)</span></span>
438
  <span class="line"></span></code></pre></div>
439
+ <p>We want all models to have self-contained modeling code. Every core functionality <em>must</em> be in the modeling code, every non-core functionality <em>can</em> be outside of it.</p>
440
+ <p>This comes at a great cost. For years, we have used what we call the <code>#Copied from...</code> mechanism: we added comments of a specific format documenting that some code was copied from another model, saving time both for the reviewers and for the CI: we had tooling to ensure that the copied blocks remained in sync.</p>
441
+ <p>But the LOC count kept creeping up. Each new model copied over hundreds of lines that we considered largely boilerplate, yet we could not remove them.</p>
442
+ <p>We needed to separate two principles that had so far been intertwined: <a href="#do-repeat-yourself">repetition</a> and <a href="#one-model-one-file">hackability</a>.</p>
443
+ <p>What was the solution to this? Let’s talk about modular transformers.</p>
444
+ <div class="crumbs"><p><strong>TL;DR:</strong> Read the code in one place (<a href="#one-model-one-file">One Model, One File</a>). Keep semantics local (<a href="#standardize-dont-abstract">Standardize, Don’t Abstract</a>). Allow strategic duplication for end users (<a href="#do-repeat-yourself">DRY*</a>). Keep the public surface minimal and stable (<a href="#minimal-user-api">Minimal API</a>, <a href="#backwards-compatibility">Backwards Compatibility</a>, <a href="#consistent-public-surface">Consistent Surface</a>).</p><p><strong>Next:</strong> how modular transformers honor these while removing boilerplate.</p></div>
 
445
  <h2 id="-modular-transformers"><a href="#-modular-transformers"><a id="modular"></a> Modular transformers</a></h2>
446
+ <p>Transformers is an opinionated library. The previous <a href="https://huggingface.co/docs/transformers/en/philosophy">philosophy</a> page, and the <a href="https://huggingface.co/blog/transformers-design-philosophy">blog post</a> were already pointing at the drawbacks mentioned just above, which have been iteratively addressed. <a href="https://huggingface.co/docs/transformers/en/modular_transformers"><code>modular</code> transformers was introduced</a> to allow a form of inheritance without breaking <a href="#one-model-one-file">One model, One file</a>.</p>
447
+ <p>We amended the principle of <a href="#do-repeat-yourself">DRY*</a> by progressively removing all pieces of code that were “copied from” another file.</p>
448
+ <p>It works as follows. To contribute a model (take GLM, for instance), we define a <code>modular_</code> file that can inherit from <em>any function across all other modeling, configuration and processor files</em> that already exist in the library.
449
+ The modular file can use inheritance across models; it is then unravelled into a fully functional modeling file.</p>
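+ <p>As a condensed, illustrative sketch (the real, full comparison is rendered just below), a modular file reads roughly like this:</p>
+ <figure class="html-embed"><div class="html-embed__card"><div><pre><code class="language-python"># modular_glm.py (abridged sketch, not the complete file)
+ from ..llama.modeling_llama import LlamaAttention
+ from ..phi3.modeling_phi3 import Phi3MLP
+ 
+ class GlmMLP(Phi3MLP):
+     pass  # inherited wholesale; the generated modeling file inlines the body
+ 
+ class GlmAttention(LlamaAttention):
+     pass  # any GLM-specific tweaks would live here, next to what they change
+ </code></pre></div></div></figure>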
450
  <summary id="generated-modeling">Auto-generated modeling code</summary>
451
+ <figure class="html-embed"><div class="html-embed__card"><div id="frag-md3mnd0y02"><div class="code-compare" style="display: grid; grid-template-columns: 1fr 1fr; gap: 1rem; margin: 1.5rem 0;">
452
  <div class="code-column" style="border: 1px solid #e2e8f0; border-radius: 8px; overflow: hidden;">
453
  <div class="code-header" style="background: #f8f9fa; padding: 0.75rem 1rem; font-weight: 600; color: #495057; border-bottom: 1px solid #e2e8f0;">
454
  modular_glm.py
 
597
  <strong>Left:</strong> Clean modular definition with inheritance.
598
  <strong>Right:</strong> Auto-expanded version with all inherited functionality visible.
599
  </p></div></div></figure>
600
+ <p>As you can see, we can define a new model as a <em>modular</em> combination of fragments taken from others.</p>
601
  <p>You might think “well that’s just how inheritance works”. The crucial difference is that we do <em>visibly</em> what is essentially the <em>compiler</em>’s job: by unrolling the inheritances, we make visible all of the modeling code, keeping it <a href="#one-model-one-file">all in one piece</a>.</p>
602
+ <p>You can see below the difference between <code>GlmAttention</code> and <code>LlamaAttention</code>, with the latter having been copied with minimal changes.</p>
603
+ <p><img src="/images/transformers/llama_glm_attn.png" alt="Llama vs GLM"/></p>
604
  <p>What is the consequence? When adding a model, we do not need to go over the entire modeling file. The modular (left side above) is enough.</p>
605
+ <p>When <code>AutoModel.from_pretrained(...)</code> is called, it is indeed the modeling (right side) that is run, and all the tests run on the modeling code.</p>
606
+ <p>More importantly, the auto-generated modeling file is what users <em>read</em> to understand the code, what they step through in their debuggers and what they hack for their needs.</p>
607
  <p>What does that give us?</p>
608
  <div class="crumbs"><p><strong>TL;DR:</strong> A small <code>modular_*.py</code> declares reuse; the expanded modeling file stays visible (<a href="#one-model-one-file">One Model, One File tenet preserved</a>). Reviewers and contributors maintain the shard, not the repetition.</p><p><strong>Next:</strong> the measurable effect on effective LOC and maintenance cost.</p></div>
609
  <h3 id="a-maintainable-control-surface"><a href="#a-maintainable-control-surface">A maintainable control surface</a></h3>
 
611
  However, if a model has a <code>modular_*.py</code> and a corresponding automatically generated <code>modeling_*.py</code>, we only count the LOC under the modular file. The modeling code has no maintenance cost as it is strictly dependent on the modular file.</p>
612
  <p>That gives an “effective LOC” curve: the <strong>maintenance surface</strong>.</p>
613
  <p>Measured on git history, raw <code>modeling_*.py</code> grew at ~362 LOC/day before modular; counting only modular shards yields ~25 LOC/day after — about <strong>15× lower</strong>. The effective curve (blue line below) represents the <strong>maintenance surface</strong> today: what maintainers actually read and review.</p>
614
+ <p>Less code to hand-maintain means fewer places to break. Naturally, LOC is not a direct measure of complexity, but the two correlate with review effort and change risk.</p>
615
+ <figure class="html-embed"><div class="html-embed__card"><div id="frag-xn06vuxqbr"><iframe
616
  src="https://molbap-loc-1.hf.space"
617
  style="width:100%; height:900px; border:0"
618
  allow="clipboard-read; clipboard-write; fullscreen"
619
  referrerpolicy="no-referrer-when-downgrade"
620
  ></iframe></div></div></figure>
621
+ <p>The blue line (effective) is the sum of the red and green ones, whereas the yellow shows what the progression would have been without modular. We can see that the maintenance surface has stayed essentially constant (in LOC) since the introduction of <code>modular</code>.
622
+ If you zoom in, you’ll notice a sharp drop near the end; it’s essentially due to us <a href="https://github.com/huggingface/transformers/commit/4df2529d79d75f44e70396df5888a32ffa02d61e#diff-60849db3e9922197854ef1cac92bf4aba08b5d7fd3fe6f3c16a3511e29e0eacc">removing support for Jax and TensorFlow</a> library-wide.</p>
623
  <p>But this was not the only effort that allowed us to reduce maintenance load.</p>
624
  <p>We recently underwent a deep refactor of the attention implementation. You’ve likely heard about <a href="https://huggingface.co/docs/text-generation-inference/en/conceptual/flash_attention">flash attention</a> and its several variants.</p>
625
  <p>The <em>attention computation</em> itself happens at a <em>lower</em> level of abstraction than the model itself.</p>
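+ <p>In practice, switching backends is a one-argument change at load time; a minimal sketch (checkpoint name illustrative):</p>
+ <figure class="html-embed"><div class="html-embed__card"><div><pre><code class="language-python">from transformers import AutoModelForCausalLM
+ 
+ model = AutoModelForCausalLM.from_pretrained(
+     "meta-llama/Llama-3.2-1B",      # illustrative checkpoint
+     attn_implementation="sdpa",     # or "eager", "flash_attention_2", ...
+ )
+ </code></pre></div></div></figure>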
 
645
  <div class="crumbs"><p>Attention semantics remain in <code>eager_attention_forward</code>; faster backends are opt-in via config. We inform via types/annotations rather than enforce rigid kwargs, preserving integrations.</p><p><strong>Next:</strong> parallel partitioning is declared as a plan, not through model surgery.</p></div>
646
  <h3 id="-configurable-tensor-parallelism"><a href="#-configurable-tensor-parallelism"><a id="simpler-tensor-parallelism"></a> Configurable Tensor Parallelism</a></h3>
647
  <p>If you’re not familiar with the different flavours of parallelism, I recommend checking out <a href="https://huggingface.co/blog/accelerate-nd-parallel">this blog post</a> first, and of course a full <a href="https://huggingface.co/spaces/nanotron/ultrascale-playbook">dive into the ultra-scale playbook</a> is always recommended.</p>
648
+ <p>The essential part is that, as <a href="https://huggingface.co/docs/transformers/v4.56.2/perf_train_gpu_many#tensor-parallelism">the documentation states</a>, when tensors get too large to fit on a single GPU, they are sliced along a particular dimension and every slice is sent to a different GPU.</p>
649
  <p>Why does it matter?</p>
650
+ <p>Because we want to avoid code modifications that are unrelated to the model.</p>
651
+ <p>We choose to place the level of abstraction higher than the device placement: a matrix multiplication - a <code>nn.Linear</code> layer - should always be expressed in the same way, regardless of how it is placed.</p>
652
+ <p>Hence, we want to touch the modeling code <a href="#minimal-user-api">minimally</a>, and only modify it when <em>architectural changes</em> are involved – not depending on how you run it. For tensor parallelism, we simply specify a <code>tp_plan</code>:</p>
653
+ <figure class="html-embed"><div class="html-embed__card"><div id="frag-8bjtx56ll89"><pre><code class="language-python"># In the model's config (example: ERNIE 4.5-style decoder blocks)
654
+ base_model_tp_plan = {
655
+ "layers.*.self_attn.q_proj": "colwise",
656
+ "layers.*.self_attn.k_proj": "colwise",
657
+ "layers.*.self_attn.v_proj": "colwise",
658
+ "layers.*.self_attn.o_proj": "rowwise",
659
+ "layers.*.mlp.gate_proj": "colwise",
660
+ "layers.*.mlp.up_proj": "colwise",
661
+ "layers.*.mlp.down_proj": "rowwise",
662
+ }
663
+
664
+ # Runtime
665
+ import torch
666
+ from transformers import AutoModelForCausalLM, AutoTokenizer
667
+
668
+ model_id = "your/model-or-local-checkpoint"
669
+ model = AutoModelForCausalLM.from_pretrained(
670
+ model_id,
671
+ dtype=torch.bfloat16,
672
+ tp_plan="auto",  # <-- picks up the plan saved in the config above
+ )
673
+ tok = AutoTokenizer.from_pretrained(model_id)
674
+ inputs = tok("Hello", return_tensors="pt").to(model.device)
675
+ out = model(**inputs)</code></pre></div></div></figure>
676
+ <p>The plan is written once, saved as part of the config and passed to <code>.from_pretrained()</code>. It maps module name patterns to partitioning strategies. Strategies are resolved by the internal <code>ParallelInterface</code>, which wires to sharding implementations <code>ColwiseParallel</code>, <code>RowwiseParallel</code>, packed variants, and so on.</p>
677
+ <p>The alternative would be to modify classes depending on supported types of parallelism.</p>
678
+ <p>The <code>tp_plan</code> solution allows users to run the same model on a single GPU, or distribute it using multiple processes per node, e.g. 4 GPUs:</p>
 
679
  <p><code>torchrun --nproc-per-node 4 demo.py</code></p>
680
+ <p>Semantics stay in the model (a Linear stays a Linear), parallelization is orthogonal and declared via strings: “colwise” splits columns of weights/bias across ranks; “rowwise” splits rows; packed variants shard fused weights. The mapping keys accept glob patterns like <code>layers.*.mlp.down_proj</code> to target repeated submodules.</p>
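+ <p>A toy illustration of what those strings mean for a single weight matrix (shapes arbitrary; the real sharding is done by the <code>ParallelInterface</code> implementations):</p>
+ <figure class="html-embed"><div class="html-embed__card"><div><pre><code class="language-python">import torch
+ 
+ W = torch.randn(8, 4)        # an nn.Linear weight: (out_features, in_features)
+ colwise = W.chunk(2, dim=0)  # "colwise": each of 2 ranks holds (4, 4), a slice of outputs
+ rowwise = W.chunk(2, dim=1)  # "rowwise": each of 2 ranks holds (8, 2), a slice of inputs
+ </code></pre></div></div></figure>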
681
+ <div class="crumbs"><p>Parallelization is specified in the configuration (<code>tp_plan</code>), not through edits to <code>Linear</code>s. Glob patterns target repeated blocks; modeling semantics stay intact.</p><p><strong>Next:</strong> per-layer attention/caching schedules declared in config, not hardcoded.</p></div>
682
  <h3 id="-layers-attentions-and-caches"><a href="#-layers-attentions-and-caches"><a id="layers-attentions-caches"></a> Layers, attentions and caches</a></h3>
683
  <p>Following the same logic, the <em>nature</em> of attention and per-layer caching should not be hardcoded. We should be able to specify in the configuration how each layer is implemented. Thus, we define a mapping like:</p>
684
  <div class="code-card"><button class="code-copy button--ghost" type="button" aria-label="Copy code"><svg viewBox="0 0 24 24" aria-hidden="true" focusable="false"><path d="M16 1H4c-1.1 0-2 .9-2 2v12h2V3h12V1zm3 4H8c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h11c1.1 0 2-.9 2-2V7c0-1.1-.9-2-2-2zm0 16H8V7h11v14z"></path></svg></button><pre class="astro-code astro-code-themes github-light github-dark" style="--shiki-light:#24292e;--shiki-dark:#e1e4e8;--shiki-light-bg:#fff;--shiki-dark-bg:#24292e;overflow-x:auto" tabindex="0" data-language="python"><code><span class="line"><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">ALLOWED_LAYER_TYPES</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> =</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> (</span></span>
 
706
  <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">class</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0"> GlmRMSNorm</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">(</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0">nn</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0">Module</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">):</span></span>
707
  <span class="line"><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> ...</span></span>
708
  <span class="line"></span></code></pre></div>
709
+ <p>This also opens another contribution path: GPU specialists can contribute optimized kernels to the <a href="https://huggingface.co/kernels-community">Kernels Hub</a>, and have them immediately available to use in <code>transformers</code> and other libraries. You can check the <a href="https://huggingface.co/blog/hello-hf-kernels">kernel community blog post</a> to learn more about it!</p>
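+ <p>For instance, a norm layer can borrow a community-optimized forward with a single decorator. A sketch, assuming the kernels integration import path (check the kernels docs for your version):</p>
+ <figure class="html-embed"><div class="html-embed__card"><div><pre><code class="language-python">from torch import nn
+ from transformers.integrations import use_kernel_forward_from_hub  # path: assumption
+ 
+ @use_kernel_forward_from_hub("RMSNorm")  # swaps in a hub kernel when one is available
+ class GlmRMSNorm(nn.Module):
+     ...  # the eager PyTorch forward stays in the file as the reference semantics
+ </code></pre></div></div></figure>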
710
  <p>Even more resources have been added, like the formidable <a href="https://github.com/huggingface/kernel-builder">kernel builder</a> with its connected resources to <a href="https://github.com/huggingface/kernel-builder/blob/main/docs/writing-kernels.md">help you build kernels with it</a> and <a href="https://github.com/huggingface/kernel-builder/blob/main/docs/nix.md">with nix</a>.</p>
711
+ <div class="crumbs"><p>Models define semantics; kernels define how to run them faster. Use decorations to borrow community forwards while keeping a consistent public surface.</p><p><strong>Next:</strong> what modularity looks like across the repo.</p></div>
712
+ <h2 id="a-modular-state"><a href="#a-modular-state">A Modular State</a></h2>
713
+ <p>With <code>modular</code> transformers, we have a form of inheritance in our codebase. Some models become standards, and model contributors are given the opportunity to <em>define standards</em>. Pushing the boundaries of scientific knowledge can also push the boundaries of engineering, if the effort is made, and we’re striving for it.
714
  It’s hard to conceptualize very large libraries and how their components interact with each other, whatever your capacity for abstraction.
715
  So I wanted to take a look at the current <strong>state of modularity</strong> across the repository. How many models are defined using components of others?</p>
716
  <p>To get this graph, I used the heuristic of modular inheritance.</p>
 
719
  <li>In this <code>modular</code> file, what models, configurations and processors are imported?</li>
720
  <li>Recurse through the model list that way (a sketch of this walk follows the list below).</li>
721
  </ol>
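+ <p>A minimal sketch of that walk, under stated assumptions (relative imports like <code>from ..llama.modeling_llama import ...</code> encode the dependency):</p>
+ <figure class="html-embed"><div class="html-embed__card"><div><pre><code class="language-python">import ast, pathlib
+ 
+ def modular_deps(model_dir: pathlib.Path) -> set[str]:
+     """Which other models does this model's modular file borrow from? (sketch)"""
+     deps = set()
+     for f in model_dir.glob("modular_*.py"):
+         for node in ast.walk(ast.parse(f.read_text())):
+             # e.g. `from ..llama.modeling_llama import LlamaAttention` -> "llama"
+             if isinstance(node, ast.ImportFrom) and node.level >= 2 and node.module:
+                 deps.add(node.module.split(".")[0])
+     return deps
+ </code></pre></div></div></figure>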
722
+ <p>So what do we see?</p>
723
+ <p>(Graph reading guide: nodes are models; edges are modular imports).</p>
724
+ <p>Check out the <a href="https://huggingface.co/spaces/Molbap/transformers-modular-refactor">full viewer here</a> (tab “dependency graph”, hit “build graph”) for better manipulation and exploration.</p>
725
+ <figure class="html-embed"><div class="html-embed__card"><div id="frag-kf45zromzyi"><iframe
726
  src="https://molbap-dependencies-1.hf.space"
727
  style="width:100%; height:680px; border:0"
728
  allow="clipboard-read; clipboard-write; fullscreen"
729
  referrerpolicy="no-referrer-when-downgrade"
730
  ></iframe></div></div></figure>
731
+ <p>Let’s walk through some sections of this graph together.</p>
732
+ <p>Llama is a basis and an influence for many models, and it shows.</p>
733
+ <p><img src="/images/transformers/llama_center.png" alt="Llama in the center"/></p>
734
+ <p>Radically different architectures such as mamba have spawned their own dependency subgraph.</p>
735
+ <p>Audio models form sparser archipelagos; see for instance wav2vec2, which is a significant basis.</p>
736
+ <p><img src="/images/transformers/cluster_wave2vec2.png" alt="Wav2vec2 influence"/></p>
737
  <p>In the case of VLMs, there are far too many vision-based architectures that are not yet defined as modulars of other existing archs. In other words, there is no strong software reference point for vision models yet.</p>
738
739
+ <p>As you can see, there is a small DETR island:
740
+ <img src="/images/transformers/detr_island.png" alt="DETR archipelago"/></p>
741
+ <p>There is also a little llava pocket, and so on, but it’s not comparable to the centrality observed for llama.</p>
742
+ <p>Another problem: this visualization only shows <code>modular</code> models. Several models still do NOT have a modular file. If we zoom out significantly, we can see them: the red nodes are models that do not have a modular file yet.</p>
743
+ <p><img src="/images/transformers/big_picture_zoomout.png" alt="Red nodes"/></p>
744
+ <p>Hence the next question: how do we identify modularisable models?</p>
745
+ <div class="crumbs"><p>Llama-lineage is a hub; several VLMs remain islands — engineering opportunity for shared parents.
746
+ <strong>Next:</strong> timeline + similarity signals to spot modularisable candidates.</p></div>
747
  <h3 id="many-models-but-not-enough-yet-are-alike"><a href="#many-models-but-not-enough-yet-are-alike">Many models, but not enough yet, are alike</a></h3>
748
+ <p>I looked into Jaccard similarity, which we use to measure set differences, to find similarities across models. I know that code is more than a set of characters strung together. We also tried code-embedding models that ranked candidates better in practice, but for this post we stick to the deterministic Jaccard index.</p>
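+ <p>As a toy version of the deterministic metric (the real analysis normalizes and hashes the sources first):</p>
+ <figure class="html-embed"><div class="html-embed__card"><div><pre><code class="language-python">def jaccard(a: set, b: set) -> float:
+     """Jaccard index: intersection over union of two sets."""
+     return len(a & b) / len(a | b) if (a or b) else 0.0
+ 
+ tokens_a = set("def rotate_half(x): return cat(-x2, x1)".split())
+ tokens_b = set("def rotate_half(y): return cat(-y2, y1)".split())
+ print(f"{jaccard(tokens_a, tokens_b):.2f}")  # crude token-set similarity
+ </code></pre></div></div></figure>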
749
+ <p>It is interesting, for our comparison, to look at <em>when</em> we deployed the modular logic and what its rippling effect on the library was. You can check the <a href="https://huggingface.co/spaces/Molbap/transformers-modular-refactor">larger space</a> to play around, but the gist is: adding modular allowed us to connect more and more models to solid reference points.</p>
750
  <p>Yet, we still have a lot of gaps to fill.</p>
751
+ <p>Zoom out below - it’s full of models. You can click on a node to see its connections better, or use the text box to search for a model. You can use the <a href="https://huggingface.co/spaces/Molbap/transformers-modular-refactor">full viewer</a> (tab “timeline”, hit “build timeline”) for better exploration.</p>
752
+ <figure class="html-embed"><div class="html-embed__card"><div id="frag-oii2s57xyo"> <iframe
753
  src="https://molbap-timeline-1.hf.space"
754
  style="width:100%; height:680px; border:0"
755
  allow="clipboard-read; clipboard-write; fullscreen"
756
  referrerpolicy="no-referrer-when-downgrade"
757
  ></iframe></div></div></figure>
758
+ <p>Let’s look at a few highly connected models, starting with the foundational work of <a href="https://arxiv.org/abs/2304.08485">Llava</a>.</p>
759
+ <p><img src="/images/transformers/timeline_llava.png" alt="Llava timeline"/></p>
760
+ <p>You see that <code>llava_video</code> is a red node, connected by a red edge to <code>llava</code>: it’s a candidate, something that we can <em>likely</em> remodularize, <a href="#backwards-compatibility">without touching the actual model</a> while becoming much more readable thanks to <a href="#do-repeat-yourself">DRY*</a>.</p>
761
  <div class="crumbs"><p>Similarity metrics (Jaccard index or embeddings) surfaces likely parents; the timeline shows consolidation after modular landed. Red nodes/edges = candidates (e.g., <code>llava_video</code> → <code>llava</code>) for refactors that preserve behavior. <strong>Next:</strong> concrete VLM choices that avoid leaky abstractions.</p></div>
762
  <h3 id="vlm-improvements-avoiding-abstraction"><a href="#vlm-improvements-avoiding-abstraction">VLM improvements, avoiding abstraction</a></h3>
763
  <p>We don’t yet have a cookbook for common VLM patterns (image token scatter, multi‑tower encoders, cross‑attention bridges). This is one of the main areas where we can improve.</p>
 
813
  <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> return</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> special_image_mask, special_video_mask</span></span>
814
  <span class="line"></span></code></pre></div>
815
  <p>But this is <em>within</em> the modeling file, not in the <code>PreTrainedModel</code> base class. It will not move away from it, because it’d break the <a href="#one-model-one-file">self-contained logic</a> of the model.</p>
816
+ <div class="crumbs"><p>Keep VLM embedding mix in the modeling file (semantics), standardize safe helpers (e.g., placeholder masking), don’t migrate behavior to <code>PreTrainedModel</code>.
817
+ <strong>Next:</strong> pipeline-level wins that came from PyTorch-first choices (fast processors).</p></div>
818
  <h3 id="on-image-processing-and-processors"><a href="#on-image-processing-and-processors">On image processing and processors</a></h3>
819
+ <p>Deciding to become a <code>torch</code>-first library meant dropping a tremendous amount of support code for <code>jax</code> and <code>TensorFlow</code>, and it also meant that we could be more lenient about the amount of torch-dependent utilities that we accept. One of these is the <em>fast processing</em> of images. Where inputs were once minimally assumed to be ndarrays, enforcing native <code>torch</code> and <code>torchvision</code> inputs allowed us to massively improve processing speed for each model.</p>
820
+ <p>The performance gains are immense, up to a 20x speedup for most models when using compiled torchvision ops. Furthermore, this allows the whole pipeline to run solely on GPU.</p>
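+ <p>Concretely, opting in is a load-time switch; a sketch (checkpoint illustrative, GPU assumed for the <code>device</code> argument):</p>
+ <figure class="html-embed"><div class="html-embed__card"><div><pre><code class="language-python">from PIL import Image
+ from transformers import AutoImageProcessor
+ 
+ image = Image.new("RGB", (640, 480))  # stand-in for a real image
+ processor = AutoImageProcessor.from_pretrained(
+     "facebook/detr-resnet-50",        # illustrative checkpoint
+     use_fast=True,                    # torchvision-backed fast processor
+ )
+ batch = processor(images=image, return_tensors="pt", device="cuda")
+ </code></pre></div></div></figure>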
821
  <p><img src="/images/transformers/fast_image_processors.png" alt="Fast Image Processors Performance"/>
822
  <p class="figure-legend">Thanks <a href="https://huggingface.co/yonigozlan">Yoni Gozlan</a> for the great work!</p></p>
823
+ <div class="crumbs"><p>PyTorch-first lets processors assume torch/torchvision and run the whole pipeline on GPU; big per-model speedups.</p><p><strong>Next:</strong> how this lowers friction for contributors and downstream users.</p></div>
824
  <h2 id="reduce-barrier-to-entrycontribution"><a href="#reduce-barrier-to-entrycontribution">Reduce barrier to entry/contribution</a></h2>
825
  <p>This is an overall objective: there’s no <code>transformers</code> without its community.</p>
826
  <p>Having a framework means forcing users into it. It restrains flexibility and creativity, which are the fertile soil for new ideas to grow.</p>
 
830
  <strong>Next:</strong> power tools enabled by a consistent API.</p></div>
831
  <h3 id="-models-popularity"><a href="#-models-popularity"><a id="encoders-ftw"></a> Models popularity</a></h3>
832
  <p>Talking about dependencies, we can take a look at the number of downloads as a measure of popularity. One thing we see is the prominence of encoders, despite the apparent prevalence of decoder LLMs. The reason is that encoders are used to generate embeddings, which have multiple downstream uses. Just check out <a href="https://huggingface.co/blog/embeddinggemma">EmbeddingGemma</a> for a modern recap. Hence, it is vital to keep the encoders portion of the library viable, usable, fine-tune-able.</p>
833
+ <div><figure class="html-embed"><div class="html-embed__card"><div id="frag-ejk5kk4wtm"><html>
834
  <head><meta charset="utf-8" /></head>
835
  <body>
836
  <div> <script type="text/javascript">window.PlotlyConfig = {MathJaxConfig: 'local'};</script>
 
4727
  <p>All models have the same API for attention computation, thanks to <a href="#external-attention-classes">the externalisation of attention classes</a>.</p>
4728
  <p>This uniformity allows us to build cool tools to visualize the inner workings of the attention mechanism.</p>
4729
  <p>One particular piece of machinery is the <code>attention mask</code>. Here you see the famous bidirectional attention pattern for the whole prefix (text + image) in PaliGemma and all Gemma2+ models, contrasting with the usual “causal-only” models.</p>
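+ <p>A sketch of poking at this yourself with the visualizer utility (module path from the transformers internals; treat the exact API as an assumption):</p>
+ <figure class="html-embed"><div class="html-embed__card"><div><pre><code class="language-python">from transformers.utils.attention_visualizer import AttentionMaskVisualizer
+ 
+ visualizer = AttentionMaskVisualizer("google/paligemma-3b-mix-224")  # illustrative checkpoint
+ visualizer("&lt;img&gt; What is shown in this image?")  # renders the prefix mask pattern
+ </code></pre></div></div></figure>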
4730
+ <figure class="html-embed"><div class="html-embed__card"><div id="frag-1ihz7she8ee"><!-- Minimal HTML fragment: terminal-style ASCII attention masks -->
4731
  <div style="max-width: 940px; margin: 16px 0; border:1px solid #2a2f3a; border-radius:8px; background:#0b0f19; font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, 'Liberation Mono', 'Courier New', monospace; color:#e5e7eb;">
4732
  <div style="display:flex; align-items:center; gap:8px; padding:8px 10px; border-bottom:1px solid #1f2430; background:#111827; border-top-left-radius:8px; border-top-right-radius:8px;">
4733
  <span style="width:10px; height:10px; background:#ef4444; border-radius:50%; display:inline-block;"></span>
 
4772
  </div>
4773
  </div>
4774
  </div></div></figure>
4775
+ <div class="crumbs"><p>Uniform attention APIs enable cross-model diagnostics (e.g., PaliGemma prefix bidirectionality vs causal).</p><p><strong>Next:</strong> whole-model tracing for ports and regressions.</p></div>
 
4776
  <h3 id="logging-entire-model-activations"><a href="#logging-entire-model-activations">Logging entire model activations</a></h3>
4777
+ <p>Because everything is PyTorch, we can easily <a href="https://huggingface.co/docs/transformers/internal/model_debugging_utils">debug any model</a> when we want to add it to transformers. We now have a power-user tool for porting or adding models that wraps a forward pass, intercepts every submodule call, and logs shapes, dtypes, and sample statistics of inputs/outputs to nested JSON.</p>
4778
+ <p>It just works with PyTorch models and is especially useful when aligning outputs with a reference implementation, to match our <a href="#source-of-truth">Source of Truth guideline</a>.</p>
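+ <p>A sketch of using it (names from the debugging-utils docs; checkpoint illustrative):</p>
+ <figure class="html-embed"><div class="html-embed__card"><div><pre><code class="language-python">from transformers import AutoModelForCausalLM, AutoTokenizer
+ from transformers.model_debugging_utils import model_addition_debugger_context
+ 
+ model_id = "your/model-or-local-checkpoint"
+ model = AutoModelForCausalLM.from_pretrained(model_id)
+ tok = AutoTokenizer.from_pretrained(model_id)
+ inputs = tok("Hello", return_tensors="pt")
+ 
+ # Intercepts every submodule call; dumps shapes/dtypes/statistics to nested JSON
+ with model_addition_debugger_context(model, debug_path="debug_dump", do_prune_layers=False):
+     _ = model(**inputs)
+ </code></pre></div></div></figure>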
4779
  <p><img src="/images/transformers/model_debugger.png" alt="Model debugger interface"/></p>
4780
+ <div class="crumbs"><p>Forward interception and nested JSON logging align ports to reference implementations, reinforcing “Source of Truth.” <strong>Next:</strong> CUDA warmup reduces load-time without touching modeling semantics.</p></div>
4781
  <h3 id="cooking-faster-cuda-warmups"><a href="#cooking-faster-cuda-warmups">Cooking faster CUDA warmups</a></h3>
4782
+ <p>Having a clean <em>external</em> API allows us to work on the <a href="#code-is-product">true inner workings of transformers</a>. One of the more recent additions was the <em>CUDA warmup</em> via <code>caching_allocator_warmup</code>, which dramatically improved loading times by pre-allocating GPU memory to avoid malloc bottlenecks during model loading. It can achieve a 7x speedup factor for an 8B model, or 6x for a 32B one, as you can check in <a href="https://github.com/huggingface/transformers/pull/36380">the PR</a>!</p>
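+ <p>The underlying idea, in a deliberately naive sketch (the real <code>caching_allocator_warmup</code> is internal and far more careful about sizing and devices):</p>
+ <figure class="html-embed"><div class="html-embed__card"><div><pre><code class="language-python">import torch
+ 
+ def naive_warmup(total_param_bytes: int, device: str = "cuda") -> None:
+     # One big allocation primes torch's caching allocator, so the many
+     # per-tensor allocations during weight loading reuse cached blocks
+     # instead of hitting cudaMalloc one by one.
+     buf = torch.empty(total_param_bytes, dtype=torch.uint8, device=device)
+     del buf  # returned to torch's cache, not to the driver
+ </code></pre></div></div></figure>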
4783
+ <figure class="html-embed"><div class="html-embed__card"><div id="frag-20u85e0tulx"><style>
4784
  /* 1) Scope tokens to the widget */
4785
  .warmup-demo{
4786
  --page-bg:#ffffff;
 
5082
  }
5083
  </script></div></div></figure>
5084
  <p>It’s hard to overstate how much of a lifesaver that is when you’re trying to load a model as fast as possible, as it’s the narrowest bottleneck for your iteration speed.</p>
5085
+ <div class="crumbs"><p>Pre-allocating GPU memory removes malloc spikes (e.g., 7× for 8B, 6× for 32B in the referenced PR).</p><p><strong>Next:</strong> consistent interfaces allow transformers-serve.</p></div>
5086
  <h3 id="transformers-serve-and-continuous-batching"><a href="#transformers-serve-and-continuous-batching">Transformers-serve and continuous batching</a></h3>
5087
  <p>Having all these models readily available and sharing the same interface allowed us to implement transformers-serve, a CLI tool to expose models through a standard OpenAI-compatible HTTP API.</p>
5088
  <div class="code-card"><button class="code-copy button--ghost" type="button" aria-label="Copy code"><svg viewBox="0 0 24 24" aria-hidden="true" focusable="false"><path d="M16 1H4c-1.1 0-2 .9-2 2v12h2V3h12V1zm3 4H8c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h11c1.1 0 2-.9 2-2V7c0-1.1-.9-2-2-2zm0 16H8V7h11v14z"></path></svg></button><pre class="astro-code astro-code-themes github-light github-dark" style="--shiki-light:#24292e;--shiki-dark:#e1e4e8;--shiki-light-bg:#fff;--shiki-dark-bg:#24292e;overflow-x:auto" tabindex="0" data-language="bash"><code><span class="line"><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0">transformers</span><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> serve</span></span>
 
5113
  <strong>Next:</strong> what changes in v5 without breaking the promise of visible semantics.</p></div>
5114
  <h2 id="what-is-coming-next"><a href="#what-is-coming-next">What is coming next</a></h2>
5115
  <p>The next major version of <code>transformers</code> is just around the corner (and will have another blog post to its name when it comes out). When v5 is released, we aim to keep <a href="#backwards-compatibility">backwards compatibility</a> as solid as possible. The changes we make now are in service of that goal.</p>
5116
+ <p>We will lean further into a modular toolbox, not a framework. You should not be forced to rewrite modeling code. It’s better when a model can inherit from <code>PreTrainedModel</code> and opt into Tensor Parallel, <code>from_pretrained</code>, sharding, <code>push_to_hub</code>, loss plumbing, and external stacks like PEFT/TRL/SGLang/vLLM.</p> </main> </section> <footer class="footer"> <div class="footer-inner"> <section class="citation-block"> <h3>Citation</h3> <p>For attribution, cite this work as</p> <pre class="citation short">Pablo Montalvo, Lysandre Debut, Pedro Cuenca, Yoni Gozlan (2025). &quot;Maintain the unmaintainable: 1M python loc, 400+ models&quot;.</pre> <p>BibTeX citation</p> <pre class="citation long">@misc{montalvo2025_maintain_the_unmaintaina,
5117
  title={Maintain the unmaintainable: 1M python loc, 400+ models},
5118
+ author={Pablo Montalvo and Lysandre Debut and Pedro Cuenca and Yoni Gozlan},
5119
  year={2025},
5120
 
5121
  }</pre> </section> <section class="references-block"> </section> </div> </footer> <script>
app/dist/index.html.gz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ed0105da49dc4bd2501866a3127d0d5d87cece9f044a8d3acf75e7007e611f9f
3
- size 1488729
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6912a23ebe8db8e4eda9fcfcb65eb6cd7fa541f19286b11060733984d7a7f8ff
3
+ size 1489962
app/dist/llama_center.png ADDED

Git LFS Details

  • SHA256: 3ec2caa493f919717ece1366836e156d8d05a3bf09ef4313ea502d5130a82cb0
  • Pointer size: 131 Bytes
  • Size of remote file: 406 kB
app/dist/llama_glm_attn.png ADDED

Git LFS Details

  • SHA256: 6b2c88d5eb3d461d791e7e280f74e54d05f01babb02ea0536b50386b7b1b1b8a
  • Pointer size: 131 Bytes
  • Size of remote file: 138 kB
app/public/images/transformers/big_picture_zoomout.png ADDED

Git LFS Details

  • SHA256: 6b48173ad33c50e9b1b7f674bb21948da982db04e4a927cf0cecee45bc749297
  • Pointer size: 131 Bytes
  • Size of remote file: 218 kB
app/public/images/transformers/cluster_wave2vec2.png ADDED

Git LFS Details

  • SHA256: ad2931607cfd522cbccddc8047ee7fd6ee3945a2d818fe72b6c6b08c58e062b3
  • Pointer size: 130 Bytes
  • Size of remote file: 55.4 kB
app/public/images/transformers/detr_island.png ADDED

Git LFS Details

  • SHA256: 6f6daf8ce4f8e71a0a9b3c60f2a7a18aacf1812a54337cf345b9005eaa251125
  • Pointer size: 130 Bytes
  • Size of remote file: 18.5 kB
app/public/images/transformers/llama_center.png ADDED

Git LFS Details

  • SHA256: 3ec2caa493f919717ece1366836e156d8d05a3bf09ef4313ea502d5130a82cb0
  • Pointer size: 131 Bytes
  • Size of remote file: 406 kB
app/public/images/transformers/llama_glm_attn.png ADDED

Git LFS Details

  • SHA256: 6b2c88d5eb3d461d791e7e280f74e54d05f01babb02ea0536b50386b7b1b1b8a
  • Pointer size: 131 Bytes
  • Size of remote file: 138 kB
app/public/images/transformers/timeline_llava.png ADDED

Git LFS Details

  • SHA256: 2bd0469fa24737bf309c1225005b62d7f0a9d722df0e56a18c578a1327cf94fc
  • Pointer size: 131 Bytes
  • Size of remote file: 621 kB
app/public/llama_center.png ADDED

Git LFS Details

  • SHA256: 3ec2caa493f919717ece1366836e156d8d05a3bf09ef4313ea502d5130a82cb0
  • Pointer size: 131 Bytes
  • Size of remote file: 406 kB
app/public/llama_glm_attn.png ADDED

Git LFS Details

  • SHA256: 6b2c88d5eb3d461d791e7e280f74e54d05f01babb02ea0536b50386b7b1b1b8a
  • Pointer size: 131 Bytes
  • Size of remote file: 138 kB
app/src/components/Hero.astro CHANGED
@@ -183,19 +183,6 @@ const pdfFilename = `${slugify(pdfBase)}.pdf`;
183
  <p><a href={`https://doi.org/${doi}`} target="_blank" rel="noopener noreferrer">{doi}</a></p>
184
  </div>
185
  )} -->
186
- <div class="meta-container-cell meta-container-cell--pdf">
187
- <h3>PDF</h3>
188
- <p>
189
- <a
190
- class="button"
191
- href={`/${pdfFilename}`}
192
- download={pdfFilename}
193
- aria-label={`Download PDF ${pdfFilename}`}
194
- >
195
- Download PDF
196
- </a>
197
- </p>
198
- </div>
199
  </div>
200
  </header>
201
 
 
183
  <p><a href={`https://doi.org/${doi}`} target="_blank" rel="noopener noreferrer">{doi}</a></p>
184
  </div>
185
  )} -->
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  </div>
187
  </header>
188
 
app/src/content/article.mdx CHANGED
@@ -6,6 +6,15 @@ authors:
6
  - name: "Pablo Montalvo"
7
  url: "https://huggingface.co/Molbap"
8
  affiliations: [1]
 
 
 
 
 
 
 
 
 
9
  affiliations:
10
  - name: "Hugging Face"
11
  url: "https://huggingface.co"
@@ -29,34 +38,38 @@ We continue to support all new models and expect to do so for the foreseeable fu
29
 
30
  This post dissects the design philosophy that makes this possible. It's the result of an evolution from our older principles, detailed on our previous [philosophy](https://huggingface.co/docs/transformers/en/philosophy) page, as well as its accompanying [blog post from 2022](https://huggingface.co/blog/transformers-design-philosophy). More recently (and we strongly recommend the read) we published a blog post about [recent upgrades to transformers](https://huggingface.co/blog/faster-transformers), focusing on what makes the library faster today. All of these developments are only made possible thanks to these principles.
31
 
32
- We codify the "tenets" that guide our development, demonstrate how they are implemented in code, and show the measurable impact they have on the library's sustainability and growth.
33
 
34
- For any OSS maintainer, power user, or contributor, this is the map to understanding, using, and building upon `transformers`, but not only: any project of comparable size will require you to make deep choices, not only on design and choice of abstraction, but on the very mindset of the software you are building.
 
 
35
 
36
  [Tenets exemplified](#source-of-truth) will have their summary available on hover.
37
 
38
  [External links](https://huggingface.co/blog/welcome-openai-gpt-oss) to articles will help you solidify your knowledge.
39
 
40
- [Several interactive visualisations](#generated-modeling) are available as you go - scroll, zoom, drag away.
41
 
42
  <div class="crumbs">
43
- Throughout this post, you'll find breadcrumb boxes like this one. They summarize what you just learned, connect it to the tenets, and point to what's coming <strong>Next</strong>. Think of them as narrative signposts to help you keep track.
44
  </div>
45
 
 
 
46
  ## The core tenets of transformers
47
 
48
 
49
  We summarize the foundations on which we've built everything, and write the "tenets" of the library. They behave like _software interfaces_, hence it is crucial that they are explicitly written down. However opinionated they are, they have evolved over time.
50
 
51
- Note that the library _evolved_ towards these principles, and that they _emerged_ from decisions taken, and once emerged they were recognized as critical.
52
 
53
  <div class="tenet-list">
54
  <ol>
55
  <li class="tenet">
56
  <a id="source-of-truth"></a>
57
  <strong>Source of Truth</strong>
58
- <p>We aim to be a [source of truth for all model definitions](#https://huggingface.co/blog/transformers-model-definition). This is not a tenet, but something that still guides our decisions. Model implementations should be reliable, reproducible, and faithful to the original performances.</p>
59
- <em>This overarching guideline ensures quality and reproducibility across all models in the library.</em>
60
  </li>
61
 
62
  <li class="tenet">
@@ -94,24 +107,26 @@ Note that the library _evolved_ towards these principles, and that they _emerged
94
  <a id="backwards-compatibility"></a>
95
  <strong>Backwards Compatibility</strong>
96
  <p>Evolve by additive standardization, never break public APIs.</p>
97
- <p>Any artifact that was once on the hub and loadable with transformers should be usable indefinitely with the same interface. Further, public methods should not change to avoid breaking dependencies.</p>
98
  <em>Once something is public, it stays public, evolution through addition, not breaking changes.</em>
99
  </li>
100
  <li class="tenet">
101
  <a id="consistent-public-surface"></a>
102
  <strong>Consistent Public Surface</strong>
103
- <p>Same argument names, same outputs, hidden states and attentions exposed, enforced by tests. This is a goal we have as well as a tenet.</p>
104
  <em>All models should feel familiar - consistent interfaces reduce cognitive load.</em>
105
  </li>
106
  </ol>
107
  </div>
108
 
109
 
110
- When a PR is merged, it is because the contribution is worthwhile, and that the `transformers` team finds the design of the contribution to be aligned with what is above.
111
 
112
- Does all the code in the library follow strictly these tenets? No. The library is a gigantic house with connected nooks, corridors, crannies everywhere built by thousands of different workers. We _try_ to make it so all the code added is compliant, because if we fail and merge it, we cannot change it lest we break [backwards compatibility](#backwards-compatibility).
113
 
114
- For instance, one function essential to the implementation of [Rotary Positional Embeddings](https://huggingface.co/papers/2104.09864) is identical in 70 `modeling_<file>.py` across `src/transformers/models/.` Why keep it? Because we want all the model logic to be [contained in the modeling file](#one-model-one-file). In order to do that, we [do repeat ourselves](#do-repeat-yourself).
 
 
115
 
116
  ```python
117
  def rotate_half(x):
@@ -121,43 +136,51 @@ def rotate_half(x):
121
  return torch.cat((-x2, x1), dim=-1)
122
  ```
123
 
124
- You can use a simple regex to look at all methods of a given name across your codebase and look at their differences and similarities, that's what I did (+ a hash to avoid quadraticity).
125
 
126
- We want all models to have self-contained modeling code.
127
 
128
- Every core functionality _must_ be in the modeling code, every non-core functionality _can_ be outside of it.
129
 
130
- This comes as a great cost. Enter the `#Copied from...` mechanism: for a long time, these comments were indicating that some code was copied from another model, saving time both for the reviewers and for the CI. But the LOC count kept creeping up. Each new model copied over hundreds of lines that we considered largely boilerplate, yet, we could not remove them.
131
 
132
- We needed to separate both principles that were so far intertwined, [repetition](#do-repeat-yourself) and [hackability](#one-model-one-file).
133
 
134
- What was the solution to this?
135
 
136
  <div class="crumbs">
137
- Read the code in one place (<a href="#one-model-one-file">One Model, One File</a>). Keep semantics local (<a href="#standardize-dont-abstract">Standardize, Don't Abstract</a>). Allow strategic duplication for end users (<a href="#do-repeat-yourself">DRY*</a>). Keep the public surface minimal and stable (<a href="#minimal-user-api">Minimal API</a>, <a href="#backwards-compatibility">Backwards Compatibility</a>, <a href="#consistent-public-surface">Consistent Surface</a>). <strong>Next:</strong> how modular transformers honor these while removing boilerplate.
 
 
138
  </div>
139
 
140
 
141
  ## <a id="modular"></a> Modular transformers
142
 
143
- Transformers is an opinionated library. The previous [philosophy](https://huggingface.co/docs/transformers/en/philosophy) page, and the [blog post](https://huggingface.co/blog/transformers-design-philosophy) were already pointing at the drawbacks mentioned just above, which have been iteratively addressed. [`modular` transformers were introduced](https://huggingface.co/docs/transformers/en/modular_transformers), allowing a form of inheritance without breaking [One model, One file](#one-model-one-file).
144
 
145
- We amended the principle of [DRY*](#do-repeat-yourself) by removing progressively all pieces of code that were "copied from" another file.
146
 
147
- It works as follows. In order to contribute a model, say for instance define a `modular_` file that can inherit from _any function across all other modeling, configuration and processor files_.
148
- This modular file can use inheritance across models: and then, it will be unravelled into a fully functional modeling file.
149
 
150
  <summary id="generated-modeling">Auto-generated modeling code</summary>
151
 
152
  <HtmlEmbed src="transformers/glm-compare.html" />
153
 
154
- As you can see, we can now define any model as a _modular_ of another.
155
 
156
  You might think "well that's just how inheritance works". The crucial difference is that we do _visibly_ what is essentially the _compiler_'s job: by unrolling the inheritances, we make visible all of the modeling code, keeping it [all in one piece](#one-model-one-file).
157
 
 
 
 
 
 
158
  What is the consequence? When adding a model, we do not need to go over the entire modeling file. The modular (left side above) is enough.
159
 
160
- When `AutoModel.from_pretrained(...)` is called, it is indeed the modeling (right side) that is ran, and all the tests are run on the modeling code.
 
 
161
 
162
  What does that give us?
163
 
@@ -177,10 +200,11 @@ That gives an "effective LOC" curve: the 𝗺𝗮𝗶𝗻𝘁𝗲𝗻𝗮𝗻
177
 
178
  Measured on git history, raw `modeling_*.py` grew at ~362 LOC/day before modular; counting only modular shards yields ~25 LOC/day after — about **15× lower**. The effective curve (blue line below) represents the **maintenance surface** today: what maintainers actually read and review.
179
 
180
- Less code to hand-maintain means fewer places to break. LOC is not a direct measure of complexity, but they correlate in review effort and change risk.
181
 
182
  <HtmlEmbed src="transformers/loc-growth.html" />
183
 
 
184
  If you zoom in, you'll notice there's a sharp drop near the end, it's essentially due to us [removing support for Jax and TensorFlow](https://github.com/huggingface/transformers/commit/4df2529d79d75f44e70396df5888a32ffa02d61e#diff-60849db3e9922197854ef1cac92bf4aba08b5d7fd3fe6f3c16a3511e29e0eacc) library-wide.
185
 
186
  But this was not the only effort that allowed us to reduce maintenance load.
@@ -238,30 +262,32 @@ Attention semantics remain in <code>eager_attention_forward</code>; faster backe
238
 
239
  If you're not familiar with the different flavours of parallelism, I recommend checking out [this blog post](https://huggingface.co/blog/accelerate-nd-parallel) first, and of course a full [dive into the ultra-scale playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook) is always recommended.
240
 
241
- The essential part is that, as [the documentation states](https://huggingface.co/docs/transformers/v4.56.2/perf_train_gpu_many#tensor-parallelism) when tensors get too large to fit on a single GPU, they are sliced along a particular dimension and every slice is sent to a different GPU.
242
 
243
  Why does it matter?
244
 
245
  Because we want to avoid code modifications that are unrelated to the model.
246
- We choose to place the level of abstraction higher than the device placement: a matrix multiplication - a `nn.Linear` layer - should be always expressed in the same way, regardless of how it is placed.
247
-
248
- Hence, we want to touch [minimally](#minimal-user-api) to the modeling code, and only modify it when _architectural changes_ are involved. For instance, for tensor parallelism, we instead now specify a simple `tp_plan`.
249
 
250
- The alternative would be to modify parent classes specific to their
251
 
252
- It is written once in the config and passed to `.from_pretrained()`. The plan maps module name patterns to partitioning strategies. Strategies are resolved by the internal `ParallelInterface`, which wires to sharding implementations `ColwiseParallel`, `RowwiseParallel`, packed variants, and so on.
253
 
254
  <HtmlEmbed src="transformers/tp-plan.html" />
255
 
 
 
 
256
 
257
- Which allows a user to run with multiple processes per node, e.g. 4 GPUs:
258
 
259
  `torchrun --nproc-per-node 4 demo.py`
260
 
261
- Semantics stay in the model (a Linear stays a Linear), distribution is orthogonal and declared via strings: "colwise" splits columns of weights/bias across ranks; "rowwise" splits rows; packed variants shard fused weights; The mapping keys accept glob patterns like `layers.*.mlp.down_proj` to target repeated submodules.
262
 
263
  <div class="crumbs">
264
- Sharding is configuration (<code>tp_plan</code>), not edits to <code>Linear</code>s. Glob patterns target repeated blocks; modeling semantics stay intact. <strong>Next:</strong> per-layer attention/caching schedules declared in config, not hardcoded.
 
 
265
  </div>
266
 
267
  ### <a id="layers-attentions-caches"></a> Layers, attentions and caches
@@ -310,18 +336,20 @@ class GlmRMSNorm(nn.Module):
310
  ...
311
  ```
312
 
313
- This also opens another contribution path: GPU specialists can contribute optimized kernels to the kernel hub, and have them usable in `transformers`. You can check on the [kernel community blog post](https://huggingface.co/blog/hello-hf-kernels) to learn more about it!
314
 
315
  Even more resources have been added, like the formidable [kernel builder](https://github.com/huggingface/kernel-builder) with its connected resources to [help you build kernels with it](https://github.com/huggingface/kernel-builder/blob/main/docs/writing-kernels.md) and [with nix](https://github.com/huggingface/kernel-builder/blob/main/docs/nix.md).
316
 
317
-
318
  <div class="crumbs">
319
- Models define semantics; kernels define how to run them faster. Use annotations to borrow community forwards while keeping a consistent public surface. <strong>Next:</strong> what modularity looks like across the repo.
 
 
320
  </div>
321
 
322
- ## Modular developments
323
 
324
- Now, we have a form of inheritance in our codebase. Some models become standards, and model contributors are given the opportunity to _define standards_. Pushing the boundaries of scientific knowledge can translate into the boundaries of engineering if this effort is made, and we're striving for it.
 
 
325
  It's hard to conceptualize very large libraries and how their components interact with each other, regardless of your cognitive abilities for abstractions.
326
  So I wanted to take a look at the current **state of modularity** across the repository. How many models are defined using components of others?
327
 
@@ -330,37 +358,64 @@ To get this graph, I used the heuristic of modular inheritance.
330
  2. In this `modular` file, what models, configurations and processings are imported?
331
  3. Recurse through the model list that way.
332
 
333
- So what do we see? Llama is a basis for many models, and it shows.
334
- Radically different architectures such as mamba have spawned their own dependency subgraph.
335
 
 
336
 
 
337
  <HtmlEmbed src="transformers/dependency-graph.html" />
338

339
  In the case of VLMs, there are far too many vision-based architectures that are not yet defined as modulars of other existing archs. In other words, there is no strong reference point in terms of software for vision models.
340
- As you can see, there is a small DETR island, a little llava pocket, and so on, but it's not comparable to the centrality observed for llama.
 
 
 
 
341
 
342
- Another problem is, this visualization only shows `modular` models. Several models still do NOT have a modular file.
343
 
344
- How do we spot them, and how do we identify modularisable models?
 
 
 
 
345
 
346
  <div class="crumbs">
347
- Graph reading guide: nodes are models; edges are modular imports. Llama-lineage is a hub; several VLMs remain islands — engineering opportunity for shared parents. <strong>Next:</strong> timeline + similarity signals to spot candidates.
 
348
  </div>
349
 
350
 
351
  ### Many models, but not enough yet, are alike
352
 
353
- I looked into Jaccard similarity, which we use to measure set differences, to find similarities across models. I know that code is more than a set of characters stringed together. We also tried code-embedding models that ranked candidates better in practice, but for this post we stick to the deterministic Jaccard index. You can take a look at [the corresponding PR](https://github.com/huggingface/transformers/pull/41289) for the embedding method.
354
 
355
- It is interesting, for our comparison, to look at _when_ we deployed this modular logic and what was its rippling effect on the library. You can check the [larger space](https://huggingface.co/spaces/Molbap/transformers-modular-refactor) to play around, but the gist is: adding modular allowed to connect more and more models to solid reference points.
356
 
357
  Yet, we still have a lot of gaps to fill.
358
 
359
- Zoom out below - it's full of models. You can click on a node to see its connections better, or use the text box to search for a model.
360
 
361
  <HtmlEmbed src="transformers/model-timeline.html" />
362
 
363
- If you've checked out llava, you've seen that llava_video is a red node, connected by a red edge to llava: it's a candidate, something that we can _likely_ remodularize, [not touching the actual model](#backwards-compatibility) but being much more readable with [DRY*](#do-repeat-yourself).

364
 
365
  <div class="crumbs">
366
  Similarity metrics (Jaccard index or embeddings) surface likely parents; the timeline shows consolidation after modular landed. Red nodes/edges = candidates (e.g., <code>llava_video</code> → <code>llava</code>) for refactors that preserve behavior. <strong>Next:</strong> concrete VLM choices that avoid leaky abstractions.
@@ -435,21 +490,24 @@ The following [Pull request to standardize placeholder masking](https://github.c
435
  But this is _within_ the modeling file, not in the `PreTrainedModel` base class. It will not move away from it, because it'd break the [self-contained logic](#one-model-one-file) of the model.
436
 
437
  <div class="crumbs">
438
- Keep VLM embedding mix in the modeling file (semantics), standardize safe helpers (e.g., placeholder masking), don't migrate behavior to <code>PreTrainedModel</code>. <strong>Next:</strong> pipeline-level wins that came from PyTorch-first choices (fast processors).
 
439
  </div>
440
 
441
 
442
  ### On image processing and processors
443
 
444
- Choosing to be a `torch`-first software meant relieving a tremendous amount of support from `jax ` and `TensorFlow` , and it also meant that we could be more lenient into the amount of torch-dependent utilities that we were able to add. One of these is the _fast processing_ of images. Where they were before assumed to be minimal ndarrays, making stronger assumptions and enforcing `torch` and `torchvision` native inputs allowed up to speed up massively the processing time for each model.
445
 
446
- The gains in performance are immense, up to 20x speed for most models when compiled torchvision ops. Further, it allows to have the whole pipeline solely on GPU.
447
 
448
  ![Fast Image Processors Performance](/images/transformers/fast_image_processors.png)
449
  <p class="figure-legend">Thanks <a href="https://huggingface.co/yonigozlan">Yoni Gozlan</a> for the great work!</p>
450
 
451
  <div class="crumbs">
452
- Torch-first lets processors assume torch/torchvision and run the whole pipeline on GPU; big per-model speedups. <strong>Next:</strong> how this lowers friction for contributors and downstream users.
 
 
453
  </div>
454
 
455
 
@@ -506,38 +564,42 @@ One particular piece of machinery is the `attention mask`. Here you see the famo
506
 
507
  <div class="crumbs">
508
  Uniform attention APIs enable cross-model diagnostics (e.g., PaliGemma prefix bidirectionality vs causal).
 
509
  <strong>Next:</strong> whole-model tracing for ports and regressions.
510
  </div>
511
 
512
 
513
  ### Logging entire model activations
514
 
515
- Further, because it is all PyTorch (and it is even more now that we support only PyTorch), we can easily [debug any model](https://huggingface.co/docs/transformers/internal/model_debugging_utils) when we want to add it to transformers. We now have a power-user tool for porting or adding models, that wraps a forward pass, intercepts every submodule call, and logs shapes, dtypes, and sample statistics of inputs/outputs to nested JSON.
516
 
517
- It just works with PyTorch models and is especially useful when aligning outputs with a reference implementation, aligned with our [core guideline](#source-of-truth).
518
 
519
  ![Model debugger interface](/images/transformers/model_debugger.png)
520
 
521
 
522
  <div class="crumbs">
523
- Forward interception and nested JSON logging align ports to reference implementations, reinforcing "Source of Truth." <strong>Next:</strong> CUDA warmup reduces load-time stalls without touching modeling semantics.
524
  </div>
525
 
526
 
527
 
528
  ### Cooking faster CUDA warmups
529
 
530
- Having a clean _external_ API allows us to work on the [true inner workings of transformers](#code-is-product). One of the few recent additions was the _CUDA warmup_ via `caching_allocator_warmup` which improved massively the loading footprint by pre-allocating GPU memory to avoid malloc bottlenecks during model loading, achieving a 7x factor for an 8B model, 6x for a 32B, you can check out [the source](https://github.com/huggingface/transformers/pull/36380)!
531
 
532
  <HtmlEmbed src="transformers/warmup_demo.html" />
533
 
534
  It's hard to overstate how much of a lifesaver that is when you're trying to load a model as fast as possible, as it's the narrowest bottleneck for your iteration speed.
535
 
536
  <div class="crumbs">
537
- Pre-allocating GPU memory removes malloc spikes (e.g., 7× for 8B, 6× for 32B in the referenced PR). <strong>Next:</strong> serving benefits directly from consistent interfaces and modularity.
 
 
538
  </div>
539
 
540
 
 
541
  ### Transformers-serve and continuous batching
542
 
543
  Having all these models readily available and sharing the same interface allowed us to implement transformers-serve, a CLI tool to expose models through a standard OpenAI-compatible HTTP API.
 
6
  - name: "Pablo Montalvo"
7
  url: "https://huggingface.co/Molbap"
8
  affiliations: [1]
9
+ - name: "Lysandre Debut"
10
+ url: "https://huggingface.co/Lysandre"
11
+ affiliations: [1]
12
+ - name: "Pedro Cuenca"
13
+ url: "https://huggingface.co/pcuenq"
14
+ affiliations: [1]
15
+ - name: "Yoni Gozlan"
16
+ url: "https://huggingface.co/yonigozlan"
17
+ affiliations: [1]
18
  affiliations:
19
  - name: "Hugging Face"
20
  url: "https://huggingface.co"
 
38
 
39
  This post dissects the design philosophy that makes this possible. It's the result of an evolution from our older principles, detailed on our previous [philosophy](https://huggingface.co/docs/transformers/en/philosophy) page, as well as its accompanying [blog post from 2022](https://huggingface.co/blog/transformers-design-philosophy). More recently (and we strongly recommend the read) we published a blog post about [recent upgrades to transformers](https://huggingface.co/blog/faster-transformers), focusing on what makes the library faster today. All of these developments are only made possible thanks to these principles.
40
 
41
+ We formalize and articulate the "tenets" that have been guiding our development, demonstrate how they are implemented in code, and show the measurable impact they have on the library's sustainability and growth.
42
 
43
+ For any OSS maintainer, power user, or contributor, this is the map to understanding, using, and building upon `transformers`; but not only that: any project of comparable size will require you to make deep choices, not only on design and choice of abstraction, but on the very mindset of the software you are building. These tenets may or may not be applicable to your project, but they provide a glimpse of how we work that could be helpful or inspirational.
44
+
45
+ Conventions used throughout this post:
46
 
47
  [Tenets exemplified](#source-of-truth) will have their summary available on hover.
48
 
49
  [External links](https://huggingface.co/blog/welcome-openai-gpt-oss) to articles will help you solidify your knowledge.
50
 
51
+ [Several interactive visualisations](#generated-modeling) are available as you go - scroll, zoom, drag away to explore them.
52
 
53
  <div class="crumbs">
54
+ * Breadcrumb boxes summarize what you just learned, connect it to the tenets, and point to what's coming <strong>Next</strong>. Think of them as narrative signposts to help you keep track.
55
  </div>
56
 
57
+ We will get started by enumerating the tenets. Then we'll look at concrete examples that show how they shape our decision-making. These examples are necessarily detailed, and sometimes complex, because they illustrate the challenges to maintain and grow a large codebase that caters to multiple collectives, has millions of users, hundreds of contributors, and always strives for simplicity and consistency.
58
+
59
  ## The core tenets of transformers
60
 
61
 
62
  We summarize the foundations on which we've built everything, and write the "tenets" of the library. They behave like _software interfaces_, hence it is crucial that they are explicitly written down. However opinionated they are, they have evolved over time.
63
 
64
+ These principles were not decided in a vacuum. The library _evolved_ towards them, and once they _emerged_, they were recognized as critical.
65
 
66
  <div class="tenet-list">
67
  <ol>
68
  <li class="tenet">
69
  <a id="source-of-truth"></a>
70
  <strong>Source of Truth</strong>
71
+ <p>We aim to be a [source of truth for all model definitions](https://huggingface.co/blog/transformers-model-definition). This is more of a goal than a tenet, but it strongly guides our decisions. Model implementations should be reliable, reproducible, and faithful to the original implementations. If we are successful, they should become reference baselines for the ecosystem, so they'll be easily adopted by downstream libraries and projects. It's much easier for a project to _always_ refer to the transformers implementation, than to learn a different research codebase every time a new architecture is released.</p>
72
+ <em>This overarching guideline ensures quality and reproducibility across all models in the library, and aspires to make the community work easier.</em>
73
  </li>
74
 
75
  <li class="tenet">
 
107
  <a id="backwards-compatibility"></a>
108
  <strong>Backwards Compatibility</strong>
109
  <p>Evolve by additive standardization, never break public APIs.</p>
110
+ <p>Any artifact that was once on the hub and loadable with transformers should be usable indefinitely with the same interface. Further, public methods should not change to avoid breaking dependencies. If we do deprecate something, it's with very long cycles beforehand.</p>
111
  <em>Once something is public, it stays public, evolution through addition, not breaking changes.</em>
112
  </li>
113
  <li class="tenet">
114
  <a id="consistent-public-surface"></a>
115
  <strong>Consistent Public Surface</strong>
116
+ <p>Same argument names, same outputs, hidden states and attentions exposed, enforced by tests. This is a goal as well as a tenet.</p>
117
  <em>All models should feel familiar - consistent interfaces reduce cognitive load.</em>
118
  </li>
119
  </ol>
120
  </div>
121
 
122
 
123
+ When a PR is merged, it is because the contribution is worthwhile, and because the `transformers` team finds the design of the contribution to be aligned with the tenets.
124
 
125
+ Does all the code in the library strictly follow these tenets? No. The library is a gigantic house with connected nooks, corridors, crannies everywhere, built by thousands of different workers. We _try_ to make it so all the code added is compliant, because if we fail and merge it, we cannot change it lest we break [backwards compatibility](#backwards-compatibility).
126
 
127
+ To see what constitutes adherence to the tenets, let's take the example of code repetition.
128
+
129
+ The following function, essential to the implementation of [Rotary Positional Embeddings](https://huggingface.co/papers/2104.09864), can be found in more than 70 `modeling_<file>.py` files across `src/transformers/models/`. Why keep it? Because we want all the model logic to be [contained in the modeling file](#one-model-one-file). In order to do that, we [do repeat ourselves](#do-repeat-yourself).
130
 
131
  ```python
132
  def rotate_half(x):
 
136
  return torch.cat((-x2, x1), dim=-1)
137
  ```
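For reference, since the diff viewer elides the middle of the block above, here is the helper in full as it appears across those modeling files:

```python
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)
```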
138
 
 
139
 
140
+ We want all models to have self-contained modeling code. Every core functionality _must_ be in the modeling code, every non-core functionality _can_ be outside of it.
141
 
142
+ This comes at a great cost. For years, we have used what we call the `#Copied from...` mechanism: we added comments of a specific format documenting that some code was copied from another model. This saved time both for the reviewers and for the CI, since we had tooling to ensure that the copied blocks remained in sync.
143
 
144
+ But the LOC count kept creeping up. Each new model copied over hundreds of lines that we considered largely boilerplate, yet, we could not remove them.
145
 
146
+ We needed to separate two principles that were so far intertwined, [repetition](#do-repeat-yourself) and [hackability](#one-model-one-file).
147
 
148
+ What was the solution to this? Let's talk about modular transformers.
149
 
150
  <div class="crumbs">
151
+ <strong>TL;DR:</strong> Read the code in one place (<a href="#one-model-one-file">One Model, One File</a>). Keep semantics local (<a href="#standardize-dont-abstract">Standardize, Don't Abstract</a>). Allow strategic duplication for end users (<a href="#do-repeat-yourself">DRY*</a>). Keep the public surface minimal and stable (<a href="#minimal-user-api">Minimal API</a>, <a href="#backwards-compatibility">Backwards Compatibility</a>, <a href="#consistent-public-surface">Consistent Surface</a>).
152
+
153
+ <strong>Next:</strong> how modular transformers honor these while removing boilerplate.
154
  </div>
155
 
156
 
157
  ## <a id="modular"></a> Modular transformers
158
 
159
+ Transformers is an opinionated library. The previous [philosophy](https://huggingface.co/docs/transformers/en/philosophy) page, and the [blog post](https://huggingface.co/blog/transformers-design-philosophy) were already pointing at the drawbacks mentioned just above, which have been iteratively addressed. [`modular` transformers was introduced](https://huggingface.co/docs/transformers/en/modular_transformers) to allow a form of inheritance without breaking [One model, One file](#one-model-one-file).
160
 
161
+ We amended the principle of [DRY*](#do-repeat-yourself) by progressively removing all pieces of code that were "copied from" another file.
162
 
163
+ It works as follows. To contribute a model (let us take GLM, for instance), we define a `modular_` file that can inherit from _any function across all other modeling, configuration and processor files_ already existing in the library.
164
+ The modular file can use inheritance across models, and it is then unravelled into a fully functional modeling file.
165
 
166
  <summary id="generated-modeling">Auto-generated modeling code</summary>
167
 
168
  <HtmlEmbed src="transformers/glm-compare.html" />
169
 
170
+ As you can see, we can define a new model as a _modular_ combination of fragments taken from others.
171
 
172
  You might think "well that's just how inheritance works". The crucial difference is that we do _visibly_ what is essentially the _compiler_'s job: by unrolling the inheritances, we make visible all of the modeling code, keeping it [all in one piece](#one-model-one-file).
173
 
174
+ You can see below the difference between `GlmAttention` and `LlamaAttention`, with the latter having been copied with minimal changes.
175
+
176
+ ![Llama vs GLM](/images/transformers/llama_glm_attn.png)
177
+
178
+
179
  What is the consequence? When adding a model, we do not need to go over the entire modeling file. The modular (left side above) is enough.
180
 
181
+ When `AutoModel.from_pretrained(...)` is called, it is indeed the modeling (right side) that is run, and all the tests run on the modeling code.
182
+
183
+ More importantly, the auto-generated modeling file is what users _read_ to understand the code, what they step through in their debuggers and what they hack for their needs.
184
 
185
  What does that give us?
186
 
 
200
 
201
  Measured on git history, raw `modeling_*.py` grew at ~362 LOC/day before modular; counting only modular shards yields ~25 LOC/day after — about **15× lower**. The effective curve (blue line below) represents the **maintenance surface** today: what maintainers actually read and review.
202
 
203
+ Less code to hand-maintain means fewer places to break. Naturally, LOC is not a direct measure of complexity, but the two correlate in review effort and change risk.
204
 
205
  <HtmlEmbed src="transformers/loc-growth.html" />
206
 
207
+ The blue line (effective) is the sum of the red and green ones, whereas the yellow shows what the progression would have been without modular. We can see that the maintenance surface has stayed essentially constant (in LOC) since the introduction of `modular`.
208
  If you zoom in, you'll notice there's a sharp drop near the end, it's essentially due to us [removing support for Jax and TensorFlow](https://github.com/huggingface/transformers/commit/4df2529d79d75f44e70396df5888a32ffa02d61e#diff-60849db3e9922197854ef1cac92bf4aba08b5d7fd3fe6f3c16a3511e29e0eacc) library-wide.
209
 
210
  But this was not the only effort that allowed us to reduce maintenance load.
 
262
 
263
  If you're not familiar with the different flavours of parallelism, I recommend checking out [this blog post](https://huggingface.co/blog/accelerate-nd-parallel) first, and of course a full [dive into the ultra-scale playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook) is always recommended.
264
 
265
+ The essential part is that, as [the documentation states](https://huggingface.co/docs/transformers/v4.56.2/perf_train_gpu_many#tensor-parallelism), when tensors get too large to fit on a single GPU, they are sliced along a particular dimension and every slice is sent to a different GPU.
266
 
267
  Why does it matter?
268
 
269
  Because we want to avoid code modifications that are unrelated to the model.
 
 
 
270
 
271
+ We choose to place the level of abstraction higher than the device placement: a matrix multiplication - a `nn.Linear` layer - should be always expressed in the same way, regardless of how it is placed.
272
 
273
+ Hence, we want to touch the modeling code [minimally](#minimal-user-api), and only modify it when _architectural changes_ are involved, never for changes in how the model is run. For tensor parallelism, we simply specify a `tp_plan`:
274
 
275
  <HtmlEmbed src="transformers/tp-plan.html" />
276
 
277
+ The plan is written once, saved as part of the config, and passed to `.from_pretrained()`. It maps module name patterns to partitioning strategies. Strategies are resolved by the internal `ParallelInterface`, which wires them to sharding implementations such as `ColwiseParallel`, `RowwiseParallel`, packed variants, and so on.
278
+
279
+ The alternative would be to modify the model classes themselves depending on the supported types of parallelism.
280
 
281
+ The `tp_plan` solution allows users to run the same model on a single GPU, or distribute it using multiple processes per node, e.g. 4 GPUs:
282
 
283
  `torchrun --nproc-per-node 4 demo.py`
284
 
285
+ Semantics stay in the model (a Linear stays a Linear), while parallelization is orthogonal and declared via strings: "colwise" splits columns of weights/bias across ranks; "rowwise" splits rows; packed variants shard fused weights. The mapping keys accept glob patterns like `layers.*.mlp.down_proj` to target repeated submodules.
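To make the strategies concrete, here is a minimal, illustrative sketch (plain `torch`, not the internal `ParallelInterface` machinery) of what "colwise" and "rowwise" mean for an `nn.Linear` weight stored as `(out_features, in_features)`:

```python
import torch

world_size = 4
weight = torch.randn(4096, 4096)  # nn.Linear stores weight as (out_features, in_features)

# "colwise": shard the output features; each rank produces a slice of the
# output, so activations end up column-sharded across ranks.
colwise_shards = torch.chunk(weight, world_size, dim=0)

# "rowwise": shard the input features; each rank consumes a slice of the
# input and produces a partial sum, reduced (all-reduce) across ranks.
rowwise_shards = torch.chunk(weight, world_size, dim=1)

assert colwise_shards[0].shape == (1024, 4096)
assert rowwise_shards[0].shape == (4096, 1024)
```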
286
 
287
  <div class="crumbs">
288
+ Parallelization is specified in the configuration (<code>tp_plan</code>), not through edits to <code>Linear</code>s. Glob patterns target repeated blocks; modeling semantics stay intact.
289
+
290
+ <strong>Next:</strong> per-layer attention/caching schedules declared in config, not hardcoded.
291
  </div>
292
 
293
  ### <a id="layers-attentions-caches"></a> Layers, attentions and caches
 
336
  ...
337
  ```
338
 
339
+ This also opens another contribution path: GPU specialists can contribute optimized kernels to the [Kernels Hub](https://huggingface.co/kernels-community), and have them immediately available to use in `transformers` and other libraries. You can check the [kernel community blog post](https://huggingface.co/blog/hello-hf-kernels) to learn more about it!
340
 
341
  Even more resources have been added, like the formidable [kernel builder](https://github.com/huggingface/kernel-builder) with its connected resources to [help you build kernels with it](https://github.com/huggingface/kernel-builder/blob/main/docs/writing-kernels.md) and [with nix](https://github.com/huggingface/kernel-builder/blob/main/docs/nix.md).
342
 
 
343
  <div class="crumbs">
344
+ Models define semantics; kernels define how to run them faster. Use decorators to borrow community forwards while keeping a consistent public surface.
345
+
346
+ <strong>Next:</strong> what modularity looks like across the repo.
347
  </div>
348
 
 
349
 
350
+ ## A Modular State
351
+
352
+ With `modular` transformers, we have a form of inheritance in our codebase. Some models become standards, and model contributors are given the opportunity to _define standards_. Pushing the boundaries of scientific knowledge can also push the boundaries of engineering if this effort is made, and we're striving for it.
353
  It's hard to conceptualize very large libraries and how their components interact with each other, regardless of your cognitive abilities for abstractions.
354
  So I wanted to take a look at the current **state of modularity** across the repository. How many models are defined using components of others?
355
 
 
358
  2. In this `modular` file, what models, configurations and processings are imported?
359
  3. Recurse through the model list that way.
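A hedged sketch of that heuristic (the real analysis script differs; paths and parsing are simplified here):

```python
import ast
from pathlib import Path

def modular_dependencies(models_root: str) -> dict[str, set[str]]:
    """Map each model that has a modular_*.py file to the models it imports from."""
    deps: dict[str, set[str]] = {}
    for modular in Path(models_root).glob("*/modular_*.py"):
        model = modular.parent.name
        deps[model] = set()
        for node in ast.walk(ast.parse(modular.read_text())):
            # e.g. "from ..llama.modeling_llama import LlamaAttention"
            if isinstance(node, ast.ImportFrom) and node.module:
                parts = node.module.split(".")
                if len(parts) > 1 and parts[-1].startswith(("modeling_", "configuration_", "processing_")):
                    deps[model].add(parts[-2])
    return deps

edges = modular_dependencies("src/transformers/models")  # nodes and edges of the graph
```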
360
 
361
+ So what do we see?
 
362
 
363
+ (Graph reading guide: nodes are models; edges are modular imports.)
364
 
365
+ Check out the [full viewer here](https://huggingface.co/spaces/Molbap/transformers-modular-refactor) (tab "dependency graph", hit "build graph") for better manipulation and exploration.
366
  <HtmlEmbed src="transformers/dependency-graph.html" />
367
 
368
+ Let's walk through some sections of this graph together.
369
+
370
+ Llama is a basis and an influence for many models, and it shows.
371
+
372
+ ![Llama in the center](/images/transformers/llama_center.png)
373
+
374
+ Radically different architectures such as mamba have spawned their own dependency subgraph.
375
+
376
+ Audio models form sparser archipelagos; see for instance wav2vec2, which is a significant basis.
377
+
378
+ ![Wav2vec2 influence](/images/transformers/cluster_wave2vec2.png)
379
+
380
  In the case of VLMs, there are far too many vision-based architectures that are not yet defined as modulars of other existing archs. In other words, there is no strong reference point in terms of software for vision models.
381
+
382
+
383
+
384
+ As you can see, there is a small DETR island:
385
+ ![DETR archipelago](/images/transformers/detr_island.png)
386
 
387
+ There is also a little llava pocket, and so on, but it's not comparable to the centrality observed for llama.
388
 
389
+ Another problem is that this visualization only shows `modular` models; several models still do NOT have a modular file. If we zoom out significantly, we can see them: the red nodes are models that do not have a modular file yet.
390
+
391
+ ![Red nodes](/images/transformers/big_picture_zoomout.png)
392
+
393
+ Hence the next question: how do we identify modularisable models?
394
 
395
  <div class="crumbs">
396
+ Llama-lineage is a hub; several VLMs remain islands — engineering opportunity for shared parents.
397
+ <strong>Next:</strong> timeline + similarity signals to spot modularisable candidates.
398
  </div>
399
 
400
 
401
  ### Many models, but not enough yet, are alike
402
 
403
+ I looked into Jaccard similarity, which we use to measure set differences, to find similarities across models. I know that code is more than a set of characters stringed together. We also tried code-embedding models that ranked candidates better in practice, but for this post we stick to the deterministic Jaccard index.
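As a sketch of the metric (the ranking pipeline behind the linked work is more involved), the Jaccard index over the identifier sets of two modeling files:

```python
import re
from pathlib import Path

def jaccard(path_a: str, path_b: str) -> float:
    """Jaccard index |A & B| / |A | B| over the sets of identifiers in two files."""
    idents = lambda p: set(re.findall(r"[A-Za-z_]\w+", Path(p).read_text()))
    a, b = idents(path_a), idents(path_b)
    return len(a & b) / len(a | b)

# A score close to 1.0 flags a likely parent/child pair to remodularize, e.g.
# jaccard(".../llava/modeling_llava.py", ".../llava_next/modeling_llava_next.py")
```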
404
 
405
+ It is interesting, for our comparison, to look at _when_ we deployed the modular logic and what its ripple effect on the library was. You can check the [larger space](https://huggingface.co/spaces/Molbap/transformers-modular-refactor) to play around, but the gist is: adding modular allowed us to connect more and more models to solid reference points.
406
 
407
  Yet, we still have a lot of gaps to fill.
408
 
409
+ Zoom out below - it's full of models. You can click on a node to see its connections better, or use the text box to search for a model. You can use the [full viewer](https://huggingface.co/spaces/Molbap/transformers-modular-refactor) (tab "timeline", hit "build timeline") for better exploration.
410
 
411
  <HtmlEmbed src="transformers/model-timeline.html" />
412
 
413
+ Let's look at a few highly connected models, starting with the foundational work of [Llava](https://arxiv.org/abs/2304.08485).
414
+
415
+ ![Llava timeline](/images/transformers/timeline_llava.png)
416
+
417
+
418
+ You see that `llava_video` is a red node, connected by a red edge to `llava`: it's a candidate, something that we can _likely_ remodularize, [not touching the actual model](#backwards-compatibility) but being much more readable with [DRY*](#do-repeat-yourself).
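What such a remodularization would boil down to, as a purely hypothetical sketch (class and import names are illustrative; the expanded modeling file would stay behavior-identical):

```python
# Hypothetical modular_llava_video.py: inherit what is identical,
# override only what actually differs.
from ..llava.modeling_llava import LlavaForConditionalGeneration

class LlavaVideoForConditionalGeneration(LlavaForConditionalGeneration):
    # only video-specific behavior (e.g. frame pooling) would be overridden here
    pass
```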
419
 
420
  <div class="crumbs">
421
  Similarity metrics (Jaccard index or embeddings) surface likely parents; the timeline shows consolidation after modular landed. Red nodes/edges = candidates (e.g., <code>llava_video</code> → <code>llava</code>) for refactors that preserve behavior. <strong>Next:</strong> concrete VLM choices that avoid leaky abstractions.
 
490
  But this is _within_ the modeling file, not in the `PreTrainedModel` base class. It will not move away from it, because it'd break the [self-contained logic](#one-model-one-file) of the model.
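Schematically, the standardized placeholder masking looks like this (a hedged sketch of the pattern, not the exact code from the PR):

```python
import torch

def merge_placeholders(inputs_embeds, input_ids, image_features, image_token_id):
    # Scatter image embeddings into the text sequence wherever the placeholder
    # image token sits; all other (text) embeddings stay untouched.
    mask = (input_ids == image_token_id).unsqueeze(-1).expand_as(inputs_embeds)
    return inputs_embeds.masked_scatter(mask, image_features.to(inputs_embeds.dtype))
```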
491
 
492
  <div class="crumbs">
493
+ Keep VLM embedding mix in the modeling file (semantics), standardize safe helpers (e.g., placeholder masking), don't migrate behavior to <code>PreTrainedModel</code>.
494
+ <strong>Next:</strong> pipeline-level wins that came from PyTorch-first choices (fast processors).
495
  </div>
496
 
497
 
498
  ### On image processing and processors
499
 
500
+ Deciding to become a `torch`-first library meant shedding a tremendous amount of support code for `jax` and `TensorFlow`, and it also meant that we could be more lenient about the amount of torch-dependent utilities we accept. One of these is the _fast processing_ of images. Where inputs were once minimally assumed to be ndarrays, enforcing native `torch` and `torchvision` inputs allowed us to massively improve the processing speed for each model.
501
 
502
+ The gains in performance are immense: up to 20x speedup for most models when using compiled torchvision ops. Furthermore, it allows us to run the whole pipeline solely on GPU.
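The gist, as a hedged sketch with `torchvision.transforms.v2` (the real implementations live in each model's fast image processor class):

```python
import torch
from torchvision.transforms import v2

# Inputs are assumed to be torch tensors, possibly already on the GPU, so the
# whole resize/rescale/normalize pipeline stays on-device and can be compiled.
transforms = v2.Compose([
    v2.Resize((224, 224)),
    v2.ToDtype(torch.float32, scale=True),  # rescale uint8 [0, 255] -> float [0, 1]
    v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

images = torch.randint(0, 256, (8, 3, 512, 512), dtype=torch.uint8, device="cuda")
pixel_values = transforms(images)  # never leaves the GPU
```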
503
 
504
  ![Fast Image Processors Performance](/images/transformers/fast_image_processors.png)
505
  <p class="figure-legend">Thanks <a href="https://huggingface.co/yonigozlan">Yoni Gozlan</a> for the great work!</p>
506
 
507
  <div class="crumbs">
508
+ PyTorch-first lets processors assume torch/torchvision and run the whole pipeline on GPU; big per-model speedups.
509
+
510
+ <strong>Next:</strong> how this lowers friction for contributors and downstream users.
511
  </div>
512
 
513
 
 
564
 
565
  <div class="crumbs">
566
  Uniform attention APIs enable cross-model diagnostics (e.g., PaliGemma prefix bidirectionality vs causal).
567
+
568
  <strong>Next:</strong> whole-model tracing for ports and regressions.
569
  </div>
570
 
571
 
572
  ### Logging entire model activations
573
 
574
+ Because everything is PyTorch, we can easily [debug any model](https://huggingface.co/docs/transformers/internal/model_debugging_utils) when we want to add it to transformers. We now have a power-user tool for porting or adding models, that wraps a forward pass, intercepts every submodule call, and logs shapes, dtypes, and sample statistics of inputs/outputs to nested JSON.
575
 
576
+ It just works with PyTorch models and is especially useful when aligning outputs with a reference implementation, to match our [Source of Truth guideline](#source-of-truth).
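A minimal sketch of the interception idea (illustrative only; the real utility linked above logs far more, into nested JSON):

```python
import json
import torch

def trace_forward(model: torch.nn.Module, inputs: dict) -> dict:
    """Hook every submodule and record output shapes, dtypes and sample stats."""
    log, hooks = {}, []
    for name, module in model.named_modules():
        def hook(mod, args, output, name=name):
            out = output[0] if isinstance(output, tuple) else output
            if isinstance(out, torch.Tensor):
                log[name] = {"shape": list(out.shape), "dtype": str(out.dtype),
                             "mean": out.float().mean().item()}
        hooks.append(module.register_forward_hook(hook))
    try:
        with torch.no_grad():
            model(**inputs)
    finally:
        for h in hooks:
            h.remove()
    return log

# print(json.dumps(trace_forward(model, dict(tokenizer("Hello", return_tensors="pt"))), indent=2))
```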
577
 
578
  ![Model debugger interface](/images/transformers/model_debugger.png)
579
 
580
 
581
  <div class="crumbs">
582
+ Forward interception and nested JSON logging align ports to reference implementations, reinforcing "Source of Truth." <strong>Next:</strong> CUDA warmup reduces load-time without touching modeling semantics.
583
  </div>
584
 
585
 
586
 
587
  ### Cooking faster CUDA warmups
588
 
589
+ Having a clean _external_ API allows us to work on the [true inner workings of transformers](#code-is-product). One recent addition was the _CUDA warmup_ via `caching_allocator_warmup`, which dramatically improved loading times by pre-allocating GPU memory, avoiding malloc bottlenecks during model loading. It can achieve a 7x speedup for an 8B model, or 6x for a 32B one, as you can check in [the PR](https://github.com/huggingface/transformers/pull/36380)!
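The trick in miniature (a hedged sketch of the idea, not the actual implementation): one large allocation up front makes PyTorch's caching allocator reserve the memory once, instead of growing it malloc-by-malloc for every weight tensor:

```python
import torch

def naive_allocator_warmup(checkpoint_bytes: int, device: str = "cuda:0") -> None:
    # Allocate one big scratch buffer sized like the checkpoint, then free it.
    # The memory stays in PyTorch's caching allocator, so the many tensor
    # copies performed by from_pretrained() later hit pre-reserved memory.
    scratch = torch.empty(checkpoint_bytes, dtype=torch.uint8, device=device)
    del scratch
```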
590
 
591
  <HtmlEmbed src="transformers/warmup_demo.html" />
592
 
593
  It's hard to overstate how much of a lifesaver that is when you're trying to load a model as fast as possible, as it's the narrowest bottleneck for your iteration speed.
594
 
595
  <div class="crumbs">
596
+ Pre-allocating GPU memory removes malloc spikes (e.g., 7× for 8B, 6× for 32B in the referenced PR).
597
+
598
+ <strong>Next:</strong> consistent interfaces allow transformers-serve.
599
  </div>
600
 
601
 
602
+
603
  ### Transformers-serve and continuous batching
604
 
605
  Having all these models readily available and sharing the same interface allowed us to implement transformers-serve, a CLI tool to expose models through a standard OpenAI-compatible HTTP API.
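Because the surface is the standard OpenAI one, any OpenAI-compatible client can talk to it; a sketch (base URL, port and model name are illustrative placeholders):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="unused")
completion = client.chat.completions.create(
    model="Qwen/Qwen2.5-0.5B-Instruct",  # any model served locally
    messages=[{"role": "user", "content": "Hello!"}],
)
print(completion.choices[0].message.content)
```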
app/src/content/embeds/banner.html CHANGED
@@ -122,6 +122,13 @@ node.append('text')
122
  .attr('dy','-1.1em')
123
  .text(d => shortId(d.id));
124

125
  // Forces tuned for wide, short aspect
126
  const sim = d3.forceSimulation(graph.nodes)
127
  .force('link', d3.forceLink(graph.links).id(d => d.id).distance(150).strength(0.4))
 
122
  .attr('dy','-1.1em')
123
  .text(d => shortId(d.id));
124
 
125
+ // Pin the llama node at center
126
+ const llamaNode = graph.nodes.find(d => d.id === 'llama');
127
+ if (llamaNode) {
128
+ llamaNode.fx = W / 2;
129
+ llamaNode.fy = H / 2;
130
+ }
131
+
132
  // Forces tuned for wide, short aspect
133
  const sim = d3.forceSimulation(graph.nodes)
134
  .force('link', d3.forceLink(graph.links).id(d => d.id).distance(150).strength(0.4))
app/src/content/embeds/transformers/tp-plan.html CHANGED
@@ -1,24 +1,23 @@
1
  <pre><code class="language-python"># In the model's config (example: ERNIE 4.5-style decoder blocks)
2
- base_model_tp_plan = {
3
- "layers.*.self_attn.q_proj": "colwise",
4
- "layers.*.self_attn.k_proj": "colwise",
5
- "layers.*.self_attn.v_proj": "colwise",
6
- "layers.*.self_attn.o_proj": "rowwise",
7
- "layers.*.mlp.gate_proj": "colwise",
8
- "layers.*.mlp.up_proj": "colwise",
9
- "layers.*.mlp.down_proj": "rowwise",
10
- }
11
-
12
- # Runtime
13
- import torch
14
- from transformers import AutoModelForCausalLM, AutoTokenizer
15
-
16
- model_id = "your/model-or-local-checkpoint"
17
- model = AutoModelForCausalLM.from_pretrained(
18
- model_id,
19
- dtype=torch.bfloat16,
20
- tp_plan=base_model_tp_plan, # <-- plan defined above
21
- )
22
- tok = AutoTokenizer.from_pretrained(model_id)
23
- inputs = tok("Hello", return_tensors="pt").to(model.device)
24
- out = model(**inputs)</code></pre>
 
1
  <pre><code class="language-python"># In the model's config (example: ERNIE 4.5-style decoder blocks)
2
+ base_model_tp_plan = {
3
+ "layers.*.self_attn.q_proj": "colwise",
4
+ "layers.*.self_attn.k_proj": "colwise",
5
+ "layers.*.self_attn.v_proj": "colwise",
6
+ "layers.*.self_attn.o_proj": "rowwise",
7
+ "layers.*.mlp.gate_proj": "colwise",
8
+ "layers.*.mlp.up_proj": "colwise",
9
+ "layers.*.mlp.down_proj": "rowwise",
10
+ }
11
+
12
+ # Runtime
13
+ import torch
14
+ from transformers import AutoModelForCausalLM, AutoTokenizer
15
+
16
+ model_id = "your/model-or-local-checkpoint"
17
+ model = AutoModelForCausalLM.from_pretrained( # <-- will automatically map to the plan defined above
18
+ model_id,
19
+ dtype=torch.bfloat16,
20
+ )
21
+ tok = AutoTokenizer.from_pretrained(model_id)
22
+ inputs = tok("Hello", return_tensors="pt").to(model.device)
23
+ out = model(**inputs)</code></pre>
 
app/src/content/new_article.mdx CHANGED
@@ -163,6 +163,7 @@ As you can see, we can define a new model as a _modular_ combination of fragment
163
 
164
  You might think "well that's just how inheritance works". The crucial difference is that we do _visibly_ what is essentially the _compiler_'s job: by unrolling the inheritances, we make visible all of the modeling code, keeping it [all in one piece](#one-model-one-file).
165
 
 
166
  <!-- some ideas for additional hand-holding: link to the implementation of `LlamaAttention` to show it was copied (and modified), or maybe provide a git diff view between the GlmAttention and LlamaAttention implementations -->
167
 
168
  What is the consequence? When adding a model, we do not need to go over the entire modeling file. The modular (left side above) is enough.
 
163
 
164
  You might think "well that's just how inheritance works". The crucial difference is that we do _visibly_ what is essentially the _compiler_'s job: by unrolling the inheritances, we make visible all of the modeling code, keeping it [all in one piece](#one-model-one-file).
165
 
166
+
167
  <!-- some ideas for additional hand-holding: link to the implementation of `LlamaAttention` to show it was copied (and modified), or maybe provide a git diff view between the GlmAttention and LlamaAttention implementations -->
168
 
169
  What is the consequence? When adding a model, we do not need to go over the entire modeling file. The modular (left side above) is enough.
app/src/styles/_base.css CHANGED
@@ -109,11 +109,11 @@ html {
109
  }
110
 
111
  .content-grid main mark {
112
- background-color: color-mix(in srgb, var(--primary-color, #007AFF) 10%, transparent);
113
- border: 1px solid color-mix(in srgb, var(--primary-color) 18%, transparent);
114
  color: inherit;
115
- padding: 4px 6px;
116
- border-radius: 4px;
117
  font-weight: 500;
118
  box-decoration-break: clone;
119
  -webkit-box-decoration-break: clone;
 
109
  }
110
 
111
  .content-grid main mark {
112
+ background-color: color-mix(in srgb, var(--primary-color, #007AFF) 3%, transparent);
113
+ border: 1px solid color-mix(in srgb, var(--primary-color) 5%, transparent);
114
  color: inherit;
115
+ padding: 1px 3px;
116
+ border-radius: 2px;
117
  font-weight: 500;
118
  box-decoration-break: clone;
119
  -webkit-box-decoration-break: clone;
src/distill.js DELETED
The diff for this file is too large to render. See raw diff
 
src/fragments/attention-visualizer.html DELETED
@@ -1,45 +0,0 @@
1
- <!-- Minimal HTML fragment: terminal-style ASCII attention masks -->
2
- <div style="max-width: 940px; margin: 16px 0; border:1px solid #2a2f3a; border-radius:8px; background:#0b0f19; font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, 'Liberation Mono', 'Courier New', monospace; color:#e5e7eb;">
3
- <div style="display:flex; align-items:center; gap:8px; padding:8px 10px; border-bottom:1px solid #1f2430; background:#111827; border-top-left-radius:8px; border-top-right-radius:8px;">
4
- <span style="width:10px; height:10px; background:#ef4444; border-radius:50%; display:inline-block;"></span>
5
- <span style="width:10px; height:10px; background:#f59e0b; border-radius:50%; display:inline-block;"></span>
6
- <span style="width:10px; height:10px; background:#22c55e; border-radius:50%; display:inline-block;"></span>
7
- <span style="margin-left:8px; font-size:12px; color:#9ca3af;">attention-mask-visualizer</span>
8
- </div>
9
- <div style="padding:12px 14px; overflow:auto; font-size:12.5px; line-height:1.4;">
10
- <pre style="margin:0; white-space:pre; tab-size:2;">
11
- ATTN MASK — GPT-2 (causal)
12
- Tokens: [The, cat, sat, on, the, mat]
13
- Legend: x = can attend, . = masked (future)
14
-
15
- The cat sat on the mat
16
- The x
17
- cat x x
18
- sat x x x
19
- on x x x x
20
- the x x x x x
21
- mat x x x x x x
22
-
23
-
24
- ATTN MASK — PaliGemma-style (bidirectional prefix + causal suffix)
25
- Prefix: [&lt;i0&gt; &lt;i1&gt; &lt;i2&gt; &lt;i3&gt; &lt;i4&gt; What is this]
26
- Suffix: [A great duck]
27
- Legend: ✓ = can attend, ✗ = cannot
28
-
29
- &lt;i0&gt;&lt;i1&gt;&lt;i2&gt;&lt;i3&gt;&lt;i4&gt; What is this | A great duck
30
- &lt;i0&gt; ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✗ ✗ ✗
31
- &lt;i1&gt; ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✗ ✗ ✗
32
- &lt;i2&gt; ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✗ ✗ ✗
33
- &lt;i3&gt; ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✗ ✗ ✗
34
- &lt;i4&gt; ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✗ ✗ ✗
35
- What ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✗ ✗ ✗
36
- is ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✗ ✗ ✗
37
- this ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✗ ✗ ✗
38
- --------------------------------------------------------------------
39
- A ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✗ ✗
40
- great ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✗
41
- duck ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓
42
- </pre>
43
- </div>
44
- </div>
45
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/fragments/d3-graph.html DELETED
@@ -1,12 +0,0 @@
1
- <div class="interactive-demo">
2
- <div class="demo-header">
3
- <h3>🔗 Model Dependency Graph</h3>
4
- </div>
5
- <div class="demo-content">
6
- <iframe src="static/d3_dependency_graph.html" width="100%" height="600px" frameborder="0" style="border-radius: 8px; background: white;"></iframe>
7
- </div>
8
- <div class="demo-footer">
9
- Interactive dependency graph showing real relationships between Transformers models. 🟡 Base models (HuggingFace logo), 🔵 Derived modular models. Click and drag to explore!
10
- </div>
11
- </div>
12
-

src/fragments/dependency-graph.html DELETED
@@ -1,6 +0,0 @@
1
- <iframe
2
- src="https://molbap-dependencies-1.hf.space"
3
- style="width:100%; height:680px; border:0"
4
- allow="clipboard-read; clipboard-write; fullscreen"
5
- referrerpolicy="no-referrer-when-downgrade"
6
- ></iframe>

src/fragments/glm-compare.html DELETED
@@ -1,149 +0,0 @@
1
- <div class="code-compare" style="display: grid; grid-template-columns: 1fr 1fr; gap: 1rem; margin: 1.5rem 0;">
2
- <div class="code-column" style="border: 1px solid #e2e8f0; border-radius: 8px; overflow: hidden;">
3
- <div class="code-header" style="background: #f8f9fa; padding: 0.75rem 1rem; font-weight: 600; color: #495057; border-bottom: 1px solid #e2e8f0;">
4
- modular_glm.py
5
- </div>
6
- <pre style="margin: 0; padding: 1rem; background: #ffffff; overflow-x: auto; font-size: 0.9em;"><code class="language-python">class GlmMLP(Phi3MLP):
7
- pass
8
-
9
- class GlmAttention(LlamaAttention):
10
- def __init__(self, config, layer_idx=None):
11
- super().__init__(config, layer_idx)
12
- self.o_proj = nn.Linear(
13
- config.num_attention_heads * self.head_dim,
14
- config.hidden_size,
15
- bias=False
16
- )
17
-
18
- class GlmForCausalLM(LlamaForCausalLM):
19
- pass</code></pre>
20
- </div>
21
-
22
- <div class="code-column" style="border: 1px solid #e2e8f0; border-radius: 8px; overflow: hidden;">
23
- <div class="code-header" style="background: #f8f9fa; padding: 0.75rem 1rem; font-weight: 600; color: #495057; border-bottom: 1px solid #e2e8f0;">
24
- modeling_glm.py (auto-expanded)
25
- </div>
26
- <pre style="margin: 0; padding: 1rem; background: #ffffff; overflow-x: auto; font-size: 0.9em; max-height: 400px;"><code class="language-python">class GlmMLP(nn.Module):
27
- def __init__(self, config):
28
- super().__init__()
29
- self.config = config
30
- self.gate_up_proj = nn.Linear(
31
- config.hidden_size,
32
- 2 * config.intermediate_size,
33
- bias=False
34
- )
35
- self.down_proj = nn.Linear(
36
- config.intermediate_size,
37
- config.hidden_size,
38
- bias=False
39
- )
40
- self.activation_fn = ACT2FN[config.hidden_act]
41
-
42
- def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
43
- up_states = self.gate_up_proj(hidden_states)
44
- gate, up_states = up_states.chunk(2, dim=-1)
45
- up_states = up_states * self.activation_fn(gate)
46
- return self.down_proj(up_states)
47
-
48
-
49
- class GlmAttention(nn.Module):
50
- """Multi-headed attention from 'Attention Is All You Need' paper"""
51
-
52
- def __init__(self, config: GlmConfig, layer_idx: Optional[int] = None):
53
- super().__init__()
54
- self.config = config
55
- self.layer_idx = layer_idx
56
- self.head_dim = getattr(
57
- config, "head_dim",
58
- config.hidden_size // config.num_attention_heads
59
- )
60
- self.num_key_value_groups = (
61
- config.num_attention_heads // config.num_key_value_heads
62
- )
63
- self.scaling = self.head_dim**-0.5
64
- self.attention_dropout = config.attention_dropout
65
- self.is_causal = True
66
-
67
- self.q_proj = nn.Linear(
68
- config.hidden_size,
69
- config.num_attention_heads * self.head_dim,
70
- bias=config.attention_bias
71
- )
72
- self.k_proj = nn.Linear(
73
- config.hidden_size,
74
- config.num_key_value_heads * self.head_dim,
75
- bias=config.attention_bias
76
- )
77
- self.v_proj = nn.Linear(
78
- config.hidden_size,
79
- config.num_key_value_heads * self.head_dim,
80
- bias=config.attention_bias
81
- )
82
- self.o_proj = nn.Linear(
83
- config.num_attention_heads * self.head_dim,
84
- config.hidden_size,
85
- bias=False
86
- )
87
-
88
- def forward(
89
- self,
90
- hidden_states: torch.Tensor,
91
- position_embeddings: Tuple[torch.Tensor, torch.Tensor],
92
- attention_mask: Optional[torch.Tensor],
93
- past_key_value: Optional[Cache] = None,
94
- cache_position: Optional[torch.LongTensor] = None,
95
- **kwargs: Unpack[FlashAttentionKwargs],
96
- ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
97
- input_shape = hidden_states.shape[:-1]
98
- hidden_shape = (*input_shape, -1, self.head_dim)
99
-
100
- query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
101
- key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
102
- value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
103
-
104
- cos, sin = position_embeddings
105
- query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
106
-
107
- if past_key_value is not None:
108
- cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
109
- key_states, value_states = past_key_value.update(
110
- key_states, value_states, self.layer_idx, cache_kwargs
111
- )
112
-
113
- attention_interface: Callable = eager_attention_forward
114
- if self.config._attn_implementation != "eager":
115
- attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
116
-
117
- attn_output, attn_weights = attention_interface(
118
- self, query_states, key_states, value_states,
119
- attention_mask, dropout=0.0 if not self.training else self.attention_dropout,
120
- scaling=self.scaling, **kwargs,
121
- )
122
-
123
- attn_output = attn_output.reshape(*input_shape, -1).contiguous()
124
- attn_output = self.o_proj(attn_output)
125
- return attn_output, attn_weights
126
-
127
-
128
- @use_kernel_forward_from_hub("RMSNorm")
129
- class GlmRMSNorm(nn.Module):
130
- def __init__(self, hidden_size, eps=1e-6):
131
- super().__init__()
132
- self.weight = nn.Parameter(torch.ones(hidden_size))
133
- self.variance_epsilon = eps
134
-
135
- def forward(self, hidden_states):
136
- input_dtype = hidden_states.dtype
137
- hidden_states = hidden_states.to(torch.float32)
138
- variance = hidden_states.pow(2).mean(-1, keepdim=True)
139
- hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
140
- return self.weight * hidden_states.to(input_dtype)
141
-
142
- # ... (many more classes and functions would follow)</code></pre>
143
- </div>
144
- </div>
145
-
146
- <p style="text-align: center; font-style: italic; color: #6c757d; margin-top: 1rem;">
147
- <strong>Left:</strong> Clean modular definition with inheritance.
148
- <strong>Right:</strong> Auto-expanded version with all inherited functionality visible.
149
- </p>

src/fragments/loc-growth.html DELETED
@@ -1,6 +0,0 @@
1
- <iframe
2
- src="https://molbap-loc-1.hf.space"
3
- style="width:100%; height:680px; border:0"
4
- allow="clipboard-read; clipboard-write; fullscreen"
5
- referrerpolicy="no-referrer-when-downgrade"
6
- ></iframe>

src/fragments/memory-profiler.html DELETED
@@ -1,16 +0,0 @@
1
- <div style="border: 1px solid #e2e8f0; border-radius: 8px; background: white; margin: 1.5rem 0;">
2
- <div style="padding: 1rem; border-bottom: 1px solid #e2e8f0; background: #f8f9fa;">
3
- <h4 style="margin: 0 0 0.5rem 0; color: #495057;">🚀 CUDA Warmup Efficiency Benchmark</h4>
4
- <p style="margin: 0; font-size: 0.9em; color: #6c757d;">
5
- Real CUDA warmup benchmarking with actual Transformers models. Measure the performance impact of the caching_allocator_warmup function.
6
- </p>
7
- </div>
8
-
9
- <div style="padding: 1rem;">
10
- <iframe src=https://molbap-cuda-warmup-transformers.hf.space width=100% height=800px frameborder=0 style="border-radius: 8px; background: white;"></iframe>
11
- </div>
12
-
13
- <div style="padding: 1rem; border-top: 1px solid #e2e8f0; background: #f8f9fa; font-size: 0.9em; color: #6c757d;">
14
- Real CUDA warmup benchmarking with actual Transformers models. Measure the performance impact of the <code>caching_allocator_warmup</code> function at <code>transformers/src/transformers/modeling_utils.py:6186</code>. This interactive tool loads models twice - once with warmup disabled and once with warmup enabled - to demonstrate the significant loading time improvements.
15
- </div>
16
- </div>

src/fragments/model-timeline.html DELETED
@@ -1,6 +0,0 @@
1
- <iframe
2
- src="https://molbap-timeline-1.hf.space"
3
- style="width:100%; height:680px; border:0"
4
- allow="clipboard-read; clipboard-write; fullscreen"
5
- referrerpolicy="no-referrer-when-downgrade"
6
- ></iframe>

src/fragments/model-visualisation.html DELETED
The diff for this file is too large to render. See raw diff
 
src/fragments/terminal.html DELETED
@@ -1,43 +0,0 @@
1
- <div style="background: #f8f9fa; border: 1px solid #e9ecef; border-radius: 8px; padding: 1rem; margin: 1.5rem 0;">
2
- <h4 style="margin-top: 0; color: #495057;">Interactive Terminal</h4>
3
- <div style="background: #2d3748; color: #e2e8f0; padding: 1rem; border-radius: 6px; font-family: 'Consolas', 'Monaco', monospace;">
4
- <div style="margin-bottom: 1rem;">
5
- <input type="text"
6
- id="terminal-input"
7
- placeholder="python -c 'import torch; print(torch.__version__)'"
8
- style="width: calc(100% - 80px); padding: 0.5rem; background: #1a202c; border: 1px solid #4a5568; color: #e2e8f0; border-radius: 4px;">
9
- <button id="terminal-run"
10
- style="width: 70px; padding: 0.5rem; margin-left: 8px; background: #3182ce; color: white; border: none; border-radius: 4px; cursor: pointer;">Run</button>
11
- </div>
12
- <pre id="terminal-output" style="background: #1a202c; padding: 1rem; border-radius: 4px; min-height: 100px; margin: 0; overflow-x: auto;">$ Ready to run commands...</pre>
13
- </div>
14
- <p style="font-size: 0.9em; color: #6c757d; margin-top: 0.5rem;">
15
- <em>Note: This is a simulated terminal. In the original Gradio app, this would execute real Python commands with proper security restrictions.</em>
16
- </p>
17
- </div>
18
-
19
- <script>
20
- document.addEventListener('DOMContentLoaded', function() {
21
- const input = document.getElementById('terminal-input');
22
- const button = document.getElementById('terminal-run');
23
- const output = document.getElementById('terminal-output');
24
-
25
- function runCommand() {
26
- const command = input.value.trim();
27
- if (!command) return;
28
-
29
- // Simulate command execution
30
- output.textContent = `$ ${command}\nSimulated output for: ${command}\n\n` +
31
- `This would execute the command in the original app.\n` +
32
- `Example outputs:\n` +
33
- `- torch version: 2.0.1+cu117\n` +
34
- `- import checks: Success\n` +
35
- `- memory info: Available`;
36
- }
37
-
38
- button.addEventListener('click', runCommand);
39
- input.addEventListener('keypress', function(e) {
40
- if (e.key === 'Enter') runCommand();
41
- });
42
- });
43
- </script>

src/fragments/tp-plan.html DELETED
@@ -1,24 +0,0 @@
1
- <pre><code class="language-python"># In the model's config (example: ERNIE 4.5-style decoder blocks)
2
- base_model_tp_plan = {
3
- "layers.*.self_attn.q_proj": "colwise",
4
- "layers.*.self_attn.k_proj": "colwise",
5
- "layers.*.self_attn.v_proj": "colwise",
6
- "layers.*.self_attn.o_proj": "rowwise",
7
- "layers.*.mlp.gate_proj": "colwise",
8
- "layers.*.mlp.up_proj": "colwise",
9
- "layers.*.mlp.down_proj": "rowwise",
10
- }
11
-
12
- # Runtime
13
- import torch
14
- from transformers import AutoModelForCausalLM, AutoTokenizer
15
-
16
- model_id = "your/model-or-local-checkpoint"
17
- model = AutoModelForCausalLM.from_pretrained(
18
- model_id,
19
- dtype=torch.bfloat16,
20
- tp_plan=base_model_tp_plan, # <-- plan defined above
21
- )
22
- tok = AutoTokenizer.from_pretrained(model_id)
23
- inputs = tok("Hello", return_tensors="pt").to(model.device)
24
- out = model(**inputs)</code></pre>

src/fragments/warmup_demo.html DELETED
@@ -1,398 +0,0 @@
1
- <style>
2
- .warmup-demo body {
3
- font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
4
- margin: 0;
5
- padding: 20px;
6
- background-color: #f5f5f5;
7
- }
8
-
9
- .warmup-demo .container {
10
- max-width: 1200px;
11
- margin: 0 auto;
12
- background: white;
13
- border-radius: 12px;
14
- padding: 30px;
15
- box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
16
- }
17
-
18
- .warmup-demo h1 {
19
- text-align: center;
20
- color: #333;
21
- margin-bottom: 10px;
22
- }
23
-
24
- .warmup-demo .subtitle {
25
- text-align: center;
26
- color: #666;
27
- margin-bottom: 30px;
28
- font-size: 16px;
29
- }
30
-
31
- .warmup-demo .demo-container {
32
- display: flex;
33
- gap: 40px;
34
- margin-bottom: 30px;
35
- }
36
-
37
- .warmup-demo .side {
38
- flex: 1;
39
- border: 2px solid #ddd;
40
- border-radius: 8px;
41
- padding: 20px;
42
- background: #fafafa;
43
- }
44
-
45
- .warmup-demo .side h2 {
46
- text-align: center;
47
- margin-top: 0;
48
- color: #333;
49
- }
50
-
51
- .warmup-demo .no-warmup h2 {
52
- color: #d63384;
53
- }
54
-
55
- .warmup-demo .with-warmup h2 {
56
- color: #198754;
57
- }
58
-
59
- .warmup-demo .memory-area {
60
- height: 400px;
61
- border: 2px dashed #ccc;
62
- border-radius: 6px;
63
- padding: 10px;
64
- margin: 20px 0;
65
- background: #fff;
66
- position: relative;
67
- overflow: hidden;
68
- }
69
-
70
- .warmup-demo .layer-box {
71
- width: 80px;
72
- height: 30px;
73
- border: 2px solid #666;
74
- border-radius: 4px;
75
- margin: 3px;
76
- display: inline-block;
77
- position: relative;
78
- background: #fff;
79
- transition: all 0.3s ease;
80
- }
81
-
82
- .warmup-demo .layer-box.allocating {
83
- background: #e9ecef;
84
- border-color: #adb5bd;
85
- }
86
-
87
- .warmup-demo .layer-box.allocating::after {
88
- content: "malloc";
89
- position: absolute;
90
- top: 50%;
91
- left: 50%;
92
- transform: translate(-50%, -50%);
93
- font-size: 10px;
94
- color: #666;
95
- font-weight: bold;
96
- }
97
-
98
- .warmup-demo .layer-box.loaded {
99
- background: #d1e7dd;
100
- border-color: #198754;
101
- }
102
-
103
- .warmup-demo .layer-box.loaded::after {
104
- content: "data";
105
- position: absolute;
106
- top: 50%;
107
- left: 50%;
108
- transform: translate(-50%, -50%);
109
- font-size: 10px;
110
- color: #198754;
111
- font-weight: bold;
112
- }
113
-
114
- .warmup-demo .warmup-container {
115
- width: 100%;
116
- height: 60px;
117
- border: 3px solid #666;
118
- border-radius: 6px;
119
- margin-bottom: 20px;
120
- background: #fff;
121
- position: relative;
122
- overflow: hidden;
123
- }
124
-
125
- .warmup-demo .warmup-container.allocated {
126
- border-color: #0d6efd;
127
- background: #e7f1ff;
128
- }
129
-
130
- .warmup-demo .warmup-container::before {
131
- content: "Pre-allocated Memory Pool";
132
- position: absolute;
133
- top: 50%;
134
- left: 50%;
135
- transform: translate(-50%, -50%);
136
- font-size: 14px;
137
- color: #666;
138
- font-weight: bold;
139
- z-index: 1;
140
- }
141
-
142
- .warmup-demo .warmup-container.allocated::before {
143
- color: #0d6efd;
144
- }
145
-
146
- .warmup-demo .warmup-fill {
147
- height: 100%;
148
- background: linear-gradient(90deg, #198754, #20c997);
149
- width: 0%;
150
- transition: width 0.5s ease;
151
- border-radius: 3px;
152
- position: relative;
153
- z-index: 2;
154
- }
155
-
156
- .warmup-demo .warmup-fill::after {
157
- content: "Layer Data Loading";
158
- position: absolute;
159
- top: 50%;
160
- left: 50%;
161
- transform: translate(-50%, -50%);
162
- font-size: 12px;
163
- color: white;
164
- font-weight: bold;
165
- white-space: nowrap;
166
- }
167
-
168
- .warmup-demo .timing {
169
- text-align: center;
170
- font-size: 24px;
171
- font-weight: bold;
172
- margin: 15px 0;
173
- min-height: 30px;
174
- }
175
-
176
- .warmup-demo .no-warmup .timing {
177
- color: #d63384;
178
- }
179
-
180
- .warmup-demo .with-warmup .timing {
181
- color: #198754;
182
- }
183
-
184
- .warmup-demo .controls {
185
- text-align: center;
186
- margin: 30px 0;
187
- }
188
-
189
- .warmup-demo .btn {
190
- background: #0d6efd;
191
- color: white;
192
- border: none;
193
- padding: 12px 24px;
194
- border-radius: 6px;
195
- font-size: 16px;
196
- cursor: pointer;
197
- margin: 0 10px;
198
- transition: background 0.3s ease;
199
- }
200
-
201
- .warmup-demo .btn:hover {
202
- background: #0b5ed7;
203
- }
204
-
205
- .warmup-demo .btn:disabled {
206
- background: #6c757d;
207
- cursor: not-allowed;
208
- }
209
-
210
- .warmup-demo .description {
211
- background: #f8f9fa;
212
- padding: 15px;
213
- border-radius: 6px;
214
- margin-top: 15px;
215
- font-size: 14px;
216
- line-height: 1.5;
217
- }
218
-
219
- .warmup-demo .phase-indicator {
220
- font-size: 14px;
221
- color: #666;
222
- text-align: center;
223
- margin-top: 10px;
224
- min-height: 20px;
225
- }
226
-
227
- .warmup-demo .layer-counter {
228
- text-align: center;
229
- font-size: 16px;
230
- color: #495057;
231
- margin: 10px 0;
232
- }
233
- </style>
234
-
235
- <div class="warmup-demo">
236
- <div class="container">
237
- <p class="subtitle">Mem allocation patterns during model loading</p>
238
-
239
- <div class="controls">
240
- <button class="btn" id="startBtn" onclick="startDemo()">Start Animation</button>
241
- <button class="btn" id="resetBtn" onclick="resetDemo()">Reset</button>
242
- </div>
243
-
244
- <div class="demo-container">
245
- <div class="side no-warmup">
246
- <h4 data-no-toc>❌ Without Warmup</h4>
247
- <div class="timing" id="noWarmupTime">0.00s</div>
248
- <div class="layer-counter" id="noWarmupCounter">Layers loaded: 0/10</div>
249
- <div class="phase-indicator" id="noWarmupPhase"></div>
250
- <div class="memory-area" id="noWarmupArea"></div>
251
- <div class="description">
252
- <strong>Individual Allocations:</strong><br>
253
- Each model layer triggers a separate cudaMalloc() call, creating memory fragmentation and allocation overhead.
254
- <br><br>
255
- 📦 <strong>Grey "malloc"</strong> = Memory allocation overhead<br>
256
- ✅ <strong>Green "data"</strong> = Actual layer data loading
257
- </div>
258
- </div>
259
-
260
- <div class="side with-warmup">
261
- <h4 data-no-toc>✅ With Warmup</h4>
262
- <div class="timing" id="warmupTime">0.00s</div>
263
- <div class="layer-counter" id="warmupCounter">Layers loaded: 0/10</div>
264
- <div class="phase-indicator" id="warmupPhase"></div>
265
- <div class="memory-area" id="warmupArea">
266
- <div class="warmup-container" id="warmupContainer">
267
- <div class="warmup-fill" id="warmupFill"></div>
268
- </div>
269
- <div id="warmupLayers"></div>
270
- </div>
271
- <div class="description">
272
- <strong>Pre-allocated Pool:</strong><br>
273
- The warmup function calculates total memory needed and makes ONE large allocation. Subsequent layers load directly into this pool, eliminating malloc overhead.
274
- <br><br>
275
- 🔵 <strong>Blue container</strong> = Single large malloc (warmup)<br>
276
- 🟢 <strong>Green progress bar</strong> = Layer data loading (no malloc needed)
277
- </div>
278
- </div>
279
- </div>
280
- </div>
281
- </div>
282
-
283
- <script>
284
- let animationSpeed = 1 / 2.4;
285
- let isRunning = false;
286
- const totalLayers = 10;
287
-
288
- function startDemo() {
289
- if (isRunning) return;
290
- isRunning = true;
291
-
292
- document.getElementById('startBtn').disabled = true;
293
- document.getElementById('resetBtn').disabled = true;
294
-
295
- Promise.all([
296
- animateNoWarmup(),
297
- animateWithWarmup()
298
- ]).then(() => {
299
- isRunning = false;
300
- document.getElementById('startBtn').disabled = false;
301
- document.getElementById('resetBtn').disabled = false;
302
- });
303
- }
304
-
305
- function resetDemo() {
306
- if (isRunning) return;
307
-
308
- document.getElementById('noWarmupArea').innerHTML = '';
309
- document.getElementById('warmupLayers').innerHTML = '';
310
- document.getElementById('warmupFill').style.width = '0%';
311
- document.getElementById('warmupContainer').classList.remove('allocated');
312
-
313
- document.getElementById('noWarmupTime').textContent = '0.00s';
314
- document.getElementById('warmupTime').textContent = '0.00s';
315
-
316
- document.getElementById('noWarmupCounter').textContent = 'Layers loaded: 0/10';
317
- document.getElementById('warmupCounter').textContent = 'Layers loaded: 0/10';
318
-
319
- document.getElementById('noWarmupPhase').textContent = '';
320
- document.getElementById('warmupPhase').textContent = '';
321
- }
322
-
323
- async function animateNoWarmup() {
324
- const container = document.getElementById('noWarmupArea');
325
- const timeEl = document.getElementById('noWarmupTime');
326
- const counterEl = document.getElementById('noWarmupCounter');
327
- const phaseEl = document.getElementById('noWarmupPhase');
328
-
329
- let currentTime = 0;
330
- const baseDelay = 200 / animationSpeed;
331
-
332
- phaseEl.textContent = 'Loading model layers...';
333
-
334
- for (let i = 0; i < totalLayers; i++) {
335
- const layerBox = document.createElement('div');
336
- layerBox.className = 'layer-box';
337
- container.appendChild(layerBox);
338
-
339
- await sleep(baseDelay * 0.3);
340
- layerBox.classList.add('allocating');
341
- currentTime += 0.08;
342
- timeEl.textContent = currentTime.toFixed(2) + 's';
343
-
344
- await sleep(baseDelay * 0.7);
345
- layerBox.classList.remove('allocating');
346
- layerBox.classList.add('loaded');
347
- currentTime += 0.12;
348
- timeEl.textContent = currentTime.toFixed(2) + 's';
349
-
350
- counterEl.textContent = `Layers loaded: ${i + 1}/${totalLayers}`;
351
- }
352
-
353
- phaseEl.textContent = 'Complete!';
354
- }
355
-
356
- async function animateWithWarmup() {
357
- const container = document.getElementById('warmupLayers');
358
- const timeEl = document.getElementById('warmupTime');
359
- const counterEl = document.getElementById('warmupCounter');
360
- const phaseEl = document.getElementById('warmupPhase');
361
- const warmupContainer = document.getElementById('warmupContainer');
362
- const warmupFill = document.getElementById('warmupFill');
363
-
364
- let currentTime = 0;
365
- const baseDelay = 200 / animationSpeed;
366
-
367
- phaseEl.textContent = 'Warming up allocator...';
368
- await sleep(baseDelay * 2);
369
- warmupContainer.classList.add('allocated');
370
- currentTime += 0.3;
371
- timeEl.textContent = currentTime.toFixed(2) + 's';
372
-
373
- phaseEl.textContent = 'Loading model layers...';
374
-
375
- for (let i = 0; i < totalLayers; i++) {
376
- const layerBox = document.createElement('div');
377
- layerBox.className = 'layer-box loaded';
378
- layerBox.style.width = '40px';
379
- layerBox.style.height = '20px';
380
- container.appendChild(layerBox);
381
-
382
- const progress = ((i + 1) / totalLayers) * 100;
383
- warmupFill.style.width = progress + '%';
384
-
385
- await sleep(baseDelay * 0.5);
386
- currentTime += 0.08;
387
- timeEl.textContent = currentTime.toFixed(2) + 's';
388
-
389
- counterEl.textContent = `Layers loaded: ${i + 1}/${totalLayers}`;
390
- }
391
-
392
- phaseEl.textContent = 'Complete!';
393
- }
394
-
395
- function sleep(ms) {
396
- return new Promise(resolve => setTimeout(resolve, ms));
397
- }
398
- </script>
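The description text in this deleted demo carries the core idea: per-layer loading pays a separate allocation cost for every layer, while a warmed-up allocator serves all layers out of one large block. As a rough illustration of that pattern — plain PyTorch with made-up shapes, not the actual transformers loading code:
<pre><code class="language-python"># Minimal sketch of the two allocation patterns the demo animates.
# Shapes and layer count are illustrative, not taken from any real model.
import torch

layer_shapes = [(1024, 1024)] * 10  # pretend checkpoint metadata

# Without warmup: one allocation per layer.
naive_layers = [torch.empty(shape) for shape in layer_shapes]

# With warmup: a single large allocation up front, then views into it.
total_numel = sum(torch.Size(shape).numel() for shape in layer_shapes)
pool = torch.empty(total_numel)  # the one big "malloc"
pooled_layers, offset = [], 0
for shape in layer_shapes:
    n = torch.Size(shape).numel()
    pooled_layers.append(pool.narrow(0, offset, n).view(shape))
    offset += n</code></pre>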
 
src/index.js DELETED
@@ -1,21 +0,0 @@
1
- // Main JavaScript file for Scaling Insanity
2
- import './style.css';
3
-
4
- // Import any additional functionality
5
- console.log('blog loaded');
6
-
7
- // Add any custom JavaScript functionality here
8
- document.addEventListener('DOMContentLoaded', function() {
9
- // Initialize syntax highlighting for code blocks
10
- if (window.hljs) {
11
- hljs.highlightAll();
12
- }
13
-
14
- // Initialize any interactive components
15
- initializeInteractiveComponents();
16
- });
17
-
18
- function initializeInteractiveComponents() {
19
- // This will be expanded as we add interactive components
20
- console.log('Interactive components initialized');
21
- }
 
src/style.css DELETED
@@ -1,741 +0,0 @@
1
- /* style.css - Scaling Insanity */
2
-
3
- /* Import ultrascale-playbook base styles and add transformers-specific styling */
4
- /* Define colors */
5
- :root {
6
- --distill-gray: rgb(107, 114, 128);
7
- --distill-gray-light: rgb(185, 185, 185);
8
- --distill-gray-lighter: rgb(228, 228, 228);
9
- --distill-gray-lightest: rgb(245, 245, 245);
10
- --distill-blue: #007BFF;
11
- }
12
-
13
- /* Container for the controls */
14
- [id^="plot-"] {
15
- display: flex;
16
- flex-direction: column;
17
- align-items: center;
18
- gap: 15px; /* Adjust the gap between controls as needed */
19
- }
20
- [id^="plot-"] figure {
21
- margin-bottom: 0px;
22
- margin-top: 0px;
23
- padding: 0px;
24
- }
25
- .plotly_caption {
26
- font-style: italic;
27
- margin-top: 10px;
28
- }
29
-
30
- .plotly_controls {
31
- display: flex;
32
- flex-wrap: wrap;
33
- flex-direction: row;
34
- justify-content: center;
35
- align-items: flex-start;
36
- gap: 30px;
37
- }
38
-
39
-
40
- .plotly_input_container {
41
- display: flex;
42
- align-items: center;
43
- flex-direction: column;
44
- gap: 10px;
45
- }
46
-
47
- /* Style for the select dropdown */
48
- .plotly_input_container > select {
49
- padding: 2px 4px;
50
- /* border: 1px solid #ccc; */
51
- line-height: 1.5em;
52
- text-align: center;
53
- border-radius: 4px;
54
- font-size: 12px;
55
- background-color: var(--distill-gray-lightest);
56
- outline: none;
57
- }
58
-
59
- /* Style for the range input */
60
-
61
- .plotly_slider {
62
- display: flex;
63
- align-items: center;
64
- gap: 10px;
65
- }
66
-
67
- .plotly_slider > input[type="range"] {
68
- -webkit-appearance: none;
69
- height: 2px;
70
- background: var(--distill-gray-light);
71
- border-radius: 5px;
72
- outline: none;
73
- }
74
-
75
- .plotly_slider > span {
76
- font-size: 14px;
77
- line-height: 1.6em;
78
- min-width: 16px;
79
- }
80
-
81
- .plotly_slider > input[type="range"]::-webkit-slider-thumb {
82
- -webkit-appearance: none;
83
- appearance: none;
84
- width: 18px;
85
- height: 18px;
86
- border-radius: 50%;
87
- background: var(--distill-blue);
88
- cursor: pointer;
89
- }
90
-
91
- .plotly_slider > input[type="range"]::-moz-range-thumb {
92
- width: 18px;
93
- height: 18px;
94
- border-radius: 50%;
95
- background: var(--distill-blue);
96
- cursor: pointer;
97
- }
98
-
99
- /* Style for the labels */
100
- .plotly_input_container > label {
101
- font-size: 14px;
102
- font-weight: bold;
103
- }
104
-
105
- .main-plot-container {
106
- margin-top: 21px;
107
- margin-bottom: 35px;
108
- }
109
-
110
- .main-plot-container > figure {
111
- display: block !important;
112
- /* Let this be handled by graph-container */
113
- margin-bottom: 0px;
114
- margin-top: 0px;
115
- }
116
- .main-plot-container > div {
117
- display: none !important;
118
- }
119
-
120
-
121
- @media (min-width: 768px) {
122
- .main-plot-container > figure {
123
- display: none !important;
124
- }
125
- .main-plot-container > div {
126
- display: flex !important;
127
- }
128
- }
129
-
130
- d-byline .byline {
131
- grid-template-columns: 1fr;
132
- grid-column: text;
133
- font-size: 0.9rem;
134
- line-height: 1.8em;
135
- }
136
-
137
- @media (min-width: 768px) {
138
- d-byline .byline {
139
- grid-template-columns: 5fr 1fr 1fr;
140
- }
141
- }
142
-
143
- #title-plot {
144
- margin-top: 0px;
145
- margin-bottom: 0px;
146
- }
147
-
148
- d-contents > nav a.active {
149
- text-decoration: underline;
150
- }
151
-
152
- @media (max-width: 1199px) {
153
- d-contents {
154
- display: none;
155
- background: white;
156
- justify-self: start;
157
- align-self: start;
158
- padding-bottom: 0.5em;
159
- margin-bottom: 1em;
160
- padding-left: 0.25em;
161
- border-bottom: 1px solid rgba(0, 0, 0, 0.1);
162
- border-bottom-width: 1px;
163
- border-bottom-style: solid;
164
- border-bottom-color: rgba(0, 0, 0, 0.1);
165
- overflow-y: scroll;
166
- height: calc(100vh - 40px);
167
- scrollbar-width: none;
168
- z-index: -100;
169
- }
170
- }
171
-
172
- d-contents a:hover {
173
- border-bottom: none;
174
- }
175
-
176
- toc-title {
177
- font-weight: bold;
178
- font-size: 1.2em;
179
- color: #333;
180
- }
181
-
182
- toggle-icon {
183
- transition: transform 0.3s;
184
- }
185
-
186
- toggle-icon.collapsed {
187
- transform: rotate(90deg);
188
- }
189
-
190
- .toc-content {
191
- margin-top: 15px;
192
- overflow: hidden;
193
- /* max-height: 1000px; */
194
- transition: max-height 0.3s ease-out;
195
- }
196
-
197
- .toc-content.collapsed {
198
- max-height: 0;
199
- margin-top: 0;
200
- }
201
-
202
- @media (min-width: 1200px) {
203
- d-article {
204
- /* Ensure d-article does not prevent sticky positioning */
205
- overflow: visible;
206
- }
207
-
208
- d-contents {
209
- align-self: start;
210
- background: white;
211
- grid-column-start: 1 !important;
212
- grid-column-end: 4 !important;
213
- grid-row: auto / span 6;
214
- justify-self: end;
215
- margin-top: 0em;
216
- padding-right: 3em;
217
- padding-left: 2em;
218
- /* border-right: 1px solid rgba(0, 0, 0, 0.1);
219
- border-right-width: 1px;
220
- border-right-style: solid;
221
- border-right-color: rgba(0, 0, 0, 0.1); */
222
- position: -webkit-sticky; /* For Safari */
223
- position: sticky;
224
- top: 10px; /* Adjust this value if needed */
225
- overflow-y: auto;
226
- height: calc(100vh - 40px);
227
- scrollbar-width: none;
228
- transition: max-height 0.3s ease-out;
229
- z-index: -100;
230
- }
231
- }
232
-
233
- d-contents nav h3 {
234
- margin-top: 0;
235
- margin-bottom: 1em;
236
- }
237
-
238
- d-contents nav div div {
239
- color: rgba(0, 0, 0, 0.8);
240
- font-weight: bold;
241
- }
242
-
243
- d-contents nav a {
244
- color: rgba(0, 0, 0, 0.8);
245
- border-bottom: none;
246
- text-decoration: none;
247
- }
248
-
249
- d-contents li {
250
- list-style-type: none;
251
- }
252
-
253
- d-contents ul, d-article d-contents ul {
254
- padding-left: 1em;
255
- }
256
-
257
- d-contents nav ul li {
258
- margin-bottom: .25em;
259
- }
260
-
261
- d-contents nav a:hover {
262
- text-decoration: underline solid rgba(0, 0, 0, 0.6);
263
- }
264
-
265
- d-contents nav ul {
266
- margin-top: 0;
267
- margin-bottom: 6px;
268
- }
269
-
270
-
271
- d-contents nav > div {
272
- display: block;
273
- outline: none;
274
- margin-bottom: 0.5em;
275
- }
276
-
277
- d-contents nav > div > a {
278
- font-size: 13px;
279
- font-weight: 600;
280
- }
281
-
282
- d-article aside {
283
- margin-bottom: 1em;
284
- }
285
-
286
- d-article img {
287
- max-width: 100%;
288
- }
289
-
290
- @media (min-width: 768px) {
291
- d-article aside {
292
- margin-bottom: 0;
293
- }
294
- }
295
-
296
- d-contents nav > div > a:hover,
297
- d-contents nav > ul > li > a:hover {
298
- text-decoration: none;
299
- }
300
-
301
- .note-box {
302
- background-color: #f6f8fa;
303
- border-left: 4px solid #444444;
304
- padding: 1rem;
305
- margin: 1rem 0; /* Keep this modest margin */
306
- border-radius: 6px;
307
- /* Add this to ensure the box only takes up needed space */
308
- display: inline-block;
309
- }
310
-
311
- .note-box-title {
312
- margin: 0;
313
- color: #444444;
314
- font-weight: 600;
315
- font-size: 1em;
316
- }
317
-
318
- .note-box-content {
319
- margin-top: 0.5rem;
320
- margin-bottom: 0; /* Ensure no bottom margin */
321
- color: #24292f;
322
- font-size: 0.9em;
323
- line-height: 1.5em;
324
- }
325
-
326
- /* For dark mode support */
327
- @media (prefers-color-scheme: dark) {
328
- .note-box {
329
- background-color: #1c1c1c;
330
- border-left-color: #888888;
331
- }
332
- .note-box-title {
333
- color: #888888;
334
- }
335
- .note-box-content {
336
- color: #d4d4d4;
337
- }
338
- }
339
-
340
- d-article {
341
- font-size: 1.0em;
342
- }
343
-
344
- .figure-legend {
345
- font-size: 0.9em;
346
- font-style: italic;
347
- color: var(--distill-gray);
348
- line-height: 1.5em;
349
- }
350
-
351
- d-code {
352
- font-size: 12px;
353
- }
354
-
355
- .large-image-background {
356
- width: 100vw;
357
- padding-top: 10px;
358
- padding-bottom: 10px;
359
- margin-left: calc(-50vw + 50%);
360
- margin-right: calc(-50vw + 50%);
361
- background: white;
362
- height: fit-content; /* This will make it match the image height */
363
- display: flex;
364
- justify-content: center; /* This will center your image */
365
- }
366
-
367
- .large-image-background-transparent {
368
- /* width: 100vw; */
369
- padding-top: 10px;
370
- padding-bottom: 10px;
371
- /* margin-left: calc(-50vw + 50%); */
372
- margin-left:-100px;
373
- margin-right: -100px;
374
- /* margin-right: calc(-50vw + 50%); */
375
- /* background: white; */
376
- height: fit-content; /* This will make it match the image height */
377
- display: flex;
378
- justify-content: center; /* This will center your image */
379
- }
380
-
381
- .boxed-image {
382
- padding: 0.5rem;
383
- background: white;
384
- border-radius: 12px;
385
- border: 1px solid #e5e7eb;
386
- box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
387
- }
388
-
389
- d-article li {
390
- margin-bottom: 0.0em;
391
- }
392
-
393
- d-article ul ul {
394
- margin-bottom: 0.0em;
395
- }
396
-
397
- d-article ol ol {
398
- margin-bottom: 0.0em;
399
- }
400
-
401
- d-article hr {
402
- grid-column: text;
403
- }
404
-
405
- /* Memory visualization */
406
- #graph-all {
407
- min-width: 500px;
408
- margin-right: 10px;
409
- margin-bottom: 2rem;
410
- padding: 0.5rem;
411
- background: #f9fafb;
412
- border-radius: 12px;
413
- border: 1px solid #e5e7eb;
414
- box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
415
- }
416
-
417
-
418
- /* Main container styles */
419
- #controls {
420
- max-width: 1200px;
421
- /* margin: 2rem auto; */
422
- margin-bottom: 2rem;
423
- margin-left: 10px;
424
- padding: 0.6rem;
425
- background: #f9fafb;
426
- border-radius: 12px;
427
- border: 1px solid #e5e7eb;
428
- box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
429
- }
430
-
431
- /* Grid layout */
432
- #controls {
433
- display: grid;
434
- grid-template-columns: 1fr 1fr;
435
- /* gap: 2rem; */
436
- }
437
-
438
- /* Cell styles */
439
- .cell {
440
- margin-bottom: 0.2rem;
441
- }
442
-
443
- /* Label styles */
444
- label {
445
- display: block;
446
- /* margin-bottom: 0.5rem; */
447
- font-size: 0.8rem;
448
- font-weight: 500;
449
- color: #374151;
450
- }
451
-
452
- /* Input container for range + number combination */
453
- .input-container {
454
- display: flex;
455
- gap: 1rem;
456
- align-items: center;
457
- }
458
-
459
- /* Range input styling */
460
- input[type="range"] {
461
- flex: 1;
462
- height: 6px;
463
- background: #e5e7eb;
464
- border-radius: 3px;
465
- appearance: none;
466
- outline: none;
467
- }
468
-
469
- input[type="range"]::-webkit-slider-thumb {
470
- appearance: none;
471
- width: 16px;
472
- height: 16px;
473
- background: #3b82f6;
474
- border-radius: 50%;
475
- cursor: pointer;
476
- transition: background 0.15s ease;
477
- }
478
-
479
- input[type="range"]::-webkit-slider-thumb:hover {
480
- background: #2563eb;
481
- }
482
-
483
- /* Number input styling */
484
- input[type="number"] {
485
- width: 80px;
486
- padding: 0.5rem;
487
- border: 1px solid #e5e7eb;
488
- border-radius: 6px;
489
- font-size: 0.9rem;
490
- color: #374151;
491
- }
492
-
493
- /* Select styling */
494
- select {
495
- width: 100%;
496
- padding: 0.5rem;
497
- border: 1px solid #e5e7eb;
498
- border-radius: 6px;
499
- background: white;
500
- font-size: 0.9rem;
501
- color: #374151;
502
- cursor: pointer;
503
- }
504
-
505
- /* Checkbox styling */
506
- input[type="checkbox"] {
507
- width: 1.2rem;
508
- height: 1.2rem;
509
- margin-right: 0.5rem;
510
- border: 2px solid #e5e7eb;
511
- border-radius: 4px;
512
- cursor: pointer;
513
- }
514
-
515
- /* Column specific styles */
516
- .column-1 {
517
- padding-right: 0.5rem;
518
- }
519
-
520
- .column-2 {
521
- padding-left: 0.5rem;
522
- }
523
-
524
- /* Checkbox container */
525
- .checkbox-container {
526
- display: flex;
527
- align-items: center;
528
- margin-bottom: 1rem;
529
- }
530
-
531
- /* Memory visualization styles */
532
- .memory-block {
533
- background: #fff;
534
- border-radius: 8px;
535
- padding: 1rem;
536
- margin-bottom: 1rem;
537
- box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05);
538
- }
539
-
540
- .memory-title {
541
- font-size: 1.1rem;
542
- font-weight: 500;
543
- color: #374151;
544
- margin-bottom: 0.5rem;
545
- }
546
-
547
- .memory-value {
548
- font-size: 1.5rem;
549
- font-weight: 600;
550
- color: #3b82f6;
551
- }
552
-
553
- /* Responsive adjustments */
554
- @media (max-width: 768px) {
555
- #controls {
556
- grid-template-columns: 1fr;
557
- padding: 1rem;
558
- }
559
-
560
- .column-1, .column-2 {
561
- padding: 0;
562
- }
563
- }
564
-
565
- /* Hover states and transitions */
566
- input:hover, select:hover {
567
- border-color: #3b82f6;
568
- }
569
-
570
- input:focus, select:focus {
571
- border-color: #2563eb;
572
- outline: none;
573
- box-shadow: 0 0 0 2px rgba(59, 130, 246, 0.1);
574
- }
575
-
576
- /* Add smooth transitions */
577
- input, select, button {
578
- transition: all 0.15s ease;
579
- }
580
-
581
- /* Preset dropdown special styling */
582
- select[name="presets"] {
583
- background-color: #f3f4f6;
584
- font-weight: 500;
585
- }
586
-
587
- /* Memory graph enhancements */
588
- .activation-memory {
589
- background: #dbeafe;
590
- padding: 1rem;
591
- border-radius: 8px;
592
- margin-bottom: 1rem;
593
- }
594
-
595
- .gradient-memory {
596
- background: #ede9fe;
597
- padding: 1rem;
598
- border-radius: 8px;
599
- }
600
-
601
- .order-button-second {
602
- background: linear-gradient(135deg, #6DB4C4, #D4A5B8);
603
- color: white;
604
- font-size: 18px;
605
- font-weight: 600;
606
- padding: 20px 20px;
607
- border: none;
608
- border-radius: 12px;
609
- cursor: pointer;
610
- text-transform: uppercase;
611
- letter-spacing: 1px;
612
- box-shadow: 0 4px 15px rgba(0, 0, 0, 0.2);
613
- transition: all 0.3s ease;
614
- position: relative;
615
- overflow: hidden;
616
- }
617
- .order-button-second:hover {
618
- transform: translateY(-2px);
619
- box-shadow: 0 6px 20px rgba(0, 0, 0, 0.25);
620
- }
621
-
622
- .order-button:active {
623
- transform: translateY(0);
624
- box-shadow: 0 2px 10px rgba(0, 0, 0, 0.2);
625
- }
626
-
627
- .order-button-second::before {
628
- content: '';
629
- position: absolute;
630
- top: 0;
631
- left: -100%;
632
- width: 100%;
633
- height: 100%;
634
- background: linear-gradient(135deg, rgba(255, 255, 255, 0.2), rgba(255, 255, 255, 0));
635
- transition: left 0.5s ease;
636
- }
637
-
638
- .order-button-second:hover::before {
639
- left: 100%;
640
- }
641
-
642
- .order-button {
643
- background: linear-gradient(135deg, #6DB4C4, #D4A5B8);
644
- color: white;
645
- font-size: 18px;
646
- font-weight: 600;
647
- padding: 16px 32px;
648
- border: none;
649
- border-radius: 12px;
650
- cursor: pointer;
651
- text-transform: uppercase;
652
- letter-spacing: 1px;
653
- box-shadow: 0 4px 15px rgba(0, 0, 0, 0.2);
654
- transition: all 0.3s ease;
655
- position: relative;
656
- overflow: hidden;
657
- }
658
-
659
- .order-button:hover {
660
- transform: translateY(-2px);
661
- box-shadow: 0 6px 20px rgba(0, 0, 0, 0.25);
662
- }
663
-
664
- .order-button:active {
665
- transform: translateY(0);
666
- box-shadow: 0 2px 10px rgba(0, 0, 0, 0.2);
667
- }
668
-
669
- .order-button::before {
670
- content: '';
671
- position: absolute;
672
- top: 0;
673
- left: -100%;
674
- width: 100%;
675
- height: 100%;
676
- background: linear-gradient(135deg, rgba(255, 255, 255, 0.2), rgba(255, 255, 255, 0));
677
- transition: left 0.5s ease;
678
- }
679
-
680
- .order-button:hover::before {
681
- left: 100%;
682
- }
683
- .order-button-container-second {
684
- /* display: flex; */
685
- justify-content: center;
686
- margin: 0px 0;
687
- }
688
-
689
- .order-button-container {
690
- display: flex;
691
- justify-content: center;
692
- margin: 0px 0 40px 0;
693
- }
694
-
695
- d-article img {
696
- width: 100%!important;
697
- }
698
-
699
-
700
- iframe, .js-plotly-plot {
701
- width: 100%!important;
702
- margin-bottom: 20px;
703
- }
704
-
705
- .modebar-container {
706
- display: none;
707
- }
708
-
709
- #graph-container {
710
- display: grid; grid-template-columns: 1fr 1fr; align-items: center;
711
- }
712
-
713
- @media (max-width: 768px) {
714
- #graph-container {
715
- grid-template-columns: 1fr;
716
- }
717
- }
718
-
719
- @media (max-width: 1024px) {
720
- #graph-container {
721
- grid-template-columns: 1fr;
722
- }
723
- #graph-all {
724
- margin-right: 0px;
725
- }
726
- #controls {
727
- margin-left: 0px;
728
- }
729
- }
730
-
731
- .main-plot-container svg {
732
- background: transparent !important;
733
- }
734
-
735
- .large-image-background-transparent {
736
- margin-left: 0px;
737
- margin-right: 0px;
738
- }
739
-
740
- /* Import transformers-specific styles */
741
- @import url('./transformers-custom.css');
 
src/transformers-custom.css DELETED
@@ -1,741 +0,0 @@
1
- /* Transformers-specific styling additions */
2
-
3
- /* Code comparison layout */
4
- .code-compare {
5
- display: grid;
6
- grid-template-columns: 1fr 1fr;
7
- gap: 1.5rem;
8
- margin: 2rem 0;
9
- align-items: start;
10
- }
11
-
12
- .code-compare .code-column {
13
- background: #ffffff;
14
- border: 1px solid #e2e8f0;
15
- border-radius: 8px;
16
- overflow: hidden;
17
- box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1);
18
- }
19
-
20
- .code-compare .code-header {
21
- background: #f8f9fa;
22
- padding: 0.75rem 1rem;
23
- font-weight: 600;
24
- color: #495057;
25
- border-bottom: 1px solid #e2e8f0;
26
- font-size: 0.9em;
27
- }
28
-
29
- .code-compare pre {
30
- margin: 0;
31
- padding: 1rem;
32
- background: #ffffff;
33
- overflow-x: auto;
34
- font-size: 0.85em;
35
- line-height: 1.4;
36
- }
37
-
38
- .code-compare pre code {
39
- color: #374151;
40
- }
41
-
42
- /* Mobile responsiveness for code comparison */
43
- @media (max-width: 768px) {
44
- .code-compare {
45
- grid-template-columns: 1fr;
46
- gap: 1rem;
47
- }
48
- }
49
-
50
- /* Tenet styling - special highlighting for design principles */
51
- .tenet-list {
52
- margin: 3rem 0;
53
- }
54
-
55
- .tenet-list ol {
56
- counter-reset: tenet-counter -1; /* Start from 0 */
57
- list-style: none;
58
- padding-left: 0;
59
- display: grid;
60
- grid-template-columns: 1fr;
61
- gap: 2.5rem;
62
- max-width: 900px;
63
- margin: 0 auto;
64
- }
65
-
66
- .tenet-list li.tenet {
67
- counter-increment: tenet-counter;
68
- background: linear-gradient(135deg, #ffffff 0%, #f8f9fa 100%);
69
- border: 2px solid #e2e8f0;
70
- border-radius: 16px;
71
- padding: 2rem 2rem 2rem 4rem;
72
- margin: 0;
73
- position: relative;
74
- box-shadow: 0 12px 35px rgba(0, 0, 0, 0.12);
75
- transition: all 0.3s ease;
76
- cursor: pointer;
77
- }
78
-
79
- .tenet-list li.tenet:hover {
80
- transform: translateY(-8px) scale(1.02);
81
- box-shadow: 0 20px 50px rgba(0, 0, 0, 0.25);
82
- border-color: rgba(0, 123, 255, 0.5);
83
- background: linear-gradient(135deg, #ffffff 0%, #f0f8ff 100%);
84
- }
85
-
86
- /* Colorful numbering system */
87
- .tenet-list li.tenet:nth-child(1):before { background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); }
88
- .tenet-list li.tenet:nth-child(2):before { background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%); }
89
- .tenet-list li.tenet:nth-child(3):before { background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%); }
90
- .tenet-list li.tenet:nth-child(4):before { background: linear-gradient(135deg, #43e97b 0%, #38f9d7 100%); }
91
- .tenet-list li.tenet:nth-child(5):before { background: linear-gradient(135deg, #fa709a 0%, #fee140 100%); }
92
- .tenet-list li.tenet:nth-child(6):before { background: linear-gradient(135deg, #a8edea 0%, #fed6e3 100%); }
93
- .tenet-list li.tenet:nth-child(7):before { background: linear-gradient(135deg, #ff9a9e 0%, #fecfef 100%); }
94
- .tenet-list li.tenet:nth-child(8):before { background: linear-gradient(135deg, #a18cd1 0%, #fbc2eb 100%); }
95
- .tenet-list li.tenet:nth-child(9):before { background: linear-gradient(135deg, #ffecd2 0%, #fcb69f 100%); }
96
-
97
- .tenet-list li.tenet:before {
98
- content: counter(tenet-counter);
99
- position: absolute;
100
- top: -12px;
101
- left: -12px;
102
- color: white;
103
- width: 48px;
104
- height: 48px;
105
- border-radius: 50%;
106
- display: flex;
107
- align-items: center;
108
- justify-content: center;
109
- font-size: 1.2em;
110
- font-weight: bold;
111
- box-shadow: 0 4px 12px rgba(0, 0, 0, 0.15);
112
- border: 3px solid white;
113
- }
114
-
115
- .tenet-list li.tenet strong {
116
- color: #1a202c;
117
- font-size: 1.1em;
118
- display: block;
119
- margin-bottom: 0.5rem;
120
- }
121
-
122
- .tenet-list li.tenet em {
123
- color: #4a5568;
124
- font-size: 0.95em;
125
- font-style: italic;
126
- display: block;
127
- margin-top: 0.75rem;
128
- padding: 1rem;
129
- background: rgba(0, 0, 0, 0.03);
130
- border-radius: 8px;
131
- border-left: 3px solid #e2e8f0;
132
- }
133
-
134
- .tenet-list li.tenet p {
135
- color: #2d3748;
136
- line-height: 1.6;
137
- margin: 0.5rem 0;
138
- }
139
-
140
- /* Add a subtle pulse animation for the numbers */
141
- @keyframes pulse-glow {
142
- 0% { box-shadow: 0 4px 12px rgba(0, 0, 0, 0.15); }
143
- 50% { box-shadow: 0 4px 20px rgba(0, 0, 0, 0.25); }
144
- 100% { box-shadow: 0 4px 12px rgba(0, 0, 0, 0.15); }
145
- }
146
-
147
- .tenet-list li.tenet:hover:before {
148
- animation: pulse-glow 2s ease-in-out infinite;
149
- }
150
-
151
- /* Interactive component styling */
152
- .interactive-demo {
153
- border: 1px solid #e2e8f0;
154
- border-radius: 12px;
155
- background: #ffffff;
156
- margin: 2rem 0;
157
- overflow: hidden;
158
- box-shadow: 0 4px 6px rgba(0, 0, 0, 0.07);
159
- }
160
-
161
- /* Model visualization fragment styling */
162
- [id*="plot-model-visualisation"] {
163
- margin: 1rem -2rem !important;
164
- width: calc(100% + 4rem) !important;
165
- }
166
-
167
- .interactive-demo .demo-header {
168
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
169
- color: white;
170
- padding: 1rem 1.5rem;
171
- font-weight: 600;
172
- }
173
-
174
- .interactive-demo .demo-content {
175
- padding: 1.5rem;
176
- }
177
-
178
- .interactive-demo .demo-footer {
179
- background: #f8f9fa;
180
- padding: 1rem 1.5rem;
181
- border-top: 1px solid #e2e8f0;
182
- color: #6c757d;
183
- font-size: 0.9em;
184
- }
185
-
186
- /* Button styling for interactive elements */
187
- .btn-primary {
188
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
189
- border: none;
190
- color: white;
191
- padding: 0.75rem 1.5rem;
192
- border-radius: 6px;
193
- font-weight: 500;
194
- cursor: pointer;
195
- transition: transform 0.2s, box-shadow 0.2s;
196
- }
197
-
198
- .btn-primary:hover {
199
- transform: translateY(-1px);
200
- box-shadow: 0 4px 12px rgba(102, 126, 234, 0.3);
201
- }
202
-
203
- .btn-primary:disabled {
204
- opacity: 0.6;
205
- cursor: not-allowed;
206
- transform: none;
207
- box-shadow: none;
208
- }
209
-
210
- /* Terminal styling */
211
- .terminal-container {
212
- background: #1a202c;
213
- border-radius: 8px;
214
- padding: 1rem;
215
- color: #e2e8f0;
216
- font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
217
- font-size: 0.9em;
218
- }
219
-
220
- .terminal-input {
221
- background: #2d3748;
222
- border: 1px solid #4a5568;
223
- color: #e2e8f0;
224
- padding: 0.5rem;
225
- border-radius: 4px;
226
- width: 100%;
227
- font-family: inherit;
228
- }
229
-
230
- .terminal-output {
231
- background: #0a0e1a;
232
- padding: 1rem;
233
- border-radius: 4px;
234
- white-space: pre-wrap;
235
- word-break: break-all;
236
- min-height: 100px;
237
- max-height: 300px;
238
- overflow-y: auto;
239
- }
240
-
241
- /* Attention visualization styling */
242
- .attention-matrix {
243
- font-family: monospace;
244
- font-size: 0.8em;
245
- border-collapse: collapse;
246
- margin: 1rem 0;
247
- }
248
-
249
- .attention-matrix td {
250
- border: 1px solid #ddd;
251
- padding: 4px 8px;
252
- text-align: center;
253
- min-width: 50px;
254
- }
255
-
256
- /* Memory chart styling */
257
- .memory-chart-container {
258
- background: #f8f9fa;
259
- border: 2px solid #e9ecef;
260
- border-radius: 8px;
261
- padding: 1rem;
262
- margin: 1rem 0;
263
- }
264
-
265
- /* Image styling improvements */
266
- img {
267
- max-width: 100%;
268
- height: auto;
269
- border-radius: 8px;
270
- box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
271
- margin: 1.5rem 0;
272
- }
273
-
274
- /* Table of contents styling - Fixed positioning like ultrascale */
275
- @media (min-width: 1200px) {
276
- d-article {
277
- overflow: visible !important;
278
- }
279
-
280
- d-contents {
281
- align-self: start !important;
282
- background: white !important;
283
- grid-column-start: 1 !important;
284
- grid-column-end: 4 !important;
285
- grid-row: auto / span 6 !important;
286
- justify-self: end !important;
287
- margin-top: 0em !important;
288
- padding-right: 3em !important;
289
- padding-left: 2em !important;
290
- position: -webkit-sticky !important; /* For Safari */
291
- position: sticky !important;
292
- top: 10px !important;
293
- overflow-y: auto !important;
294
- height: calc(100vh - 40px) !important;
295
- scrollbar-width: none !important;
296
- transition: max-height 0.3s ease-out !important;
297
- z-index: -100 !important;
298
- display: block !important;
299
- visibility: visible !important;
300
- }
301
- }
302
-
303
- @media (max-width: 1199px) {
304
- d-contents {
305
- display: none !important;
306
- background: white !important;
307
- justify-self: start !important;
308
- align-self: start !important;
309
- padding-bottom: 0.5em !important;
310
- margin-bottom: 1em !important;
311
- padding-left: 0.25em !important;
312
- border-bottom: 1px solid rgba(0, 0, 0, 0.1) !important;
313
- overflow-y: scroll !important;
314
- height: calc(100vh - 40px) !important;
315
- scrollbar-width: none !important;
316
- z-index: -100 !important;
317
- }
318
- }
319
-
320
- /* Force TOC to be visible and override distill defaults */
321
- d-contents {
322
- display: block !important;
323
- visibility: visible !important;
324
- opacity: 1 !important;
325
- }
326
-
327
- /* TOC Navigation styling */
328
- d-contents .toc-header {
329
- margin-bottom: 1.5rem;
330
- border-bottom: 2px solid #007bff;
331
- padding-bottom: 0.5rem;
332
- }
333
-
334
- d-contents .toc-title {
335
- font-weight: bold;
336
- font-size: 1.2em;
337
- color: #333;
338
- }
339
-
340
- d-contents nav a {
341
- color: rgba(0, 0, 0, 0.7);
342
- text-decoration: none;
343
- border-bottom: none;
344
- display: block;
345
- padding: 0.3rem 0;
346
- font-size: 0.9em;
347
- line-height: 1.4;
348
- transition: color 0.2s ease;
349
- }
350
-
351
- d-contents nav a:hover {
352
- color: #007bff;
353
- text-decoration: none;
354
- }
355
-
356
- d-contents nav a.active {
357
- color: #007bff;
358
- font-weight: 600;
359
- }
360
-
361
- d-contents nav div {
362
- margin-bottom: 0.2rem;
363
- }
364
-
365
- /* Smooth scrollbar */
366
- d-contents {
367
- scrollbar-width: thin;
368
- scrollbar-color: rgba(0, 123, 255, 0.3) transparent;
369
- }
370
-
371
- d-contents::-webkit-scrollbar {
372
- width: 6px;
373
- }
374
-
375
- d-contents::-webkit-scrollbar-track {
376
- background: transparent;
377
- }
378
-
379
- d-contents::-webkit-scrollbar-thumb {
380
- background: rgba(0, 123, 255, 0.3);
381
- border-radius: 3px;
382
- }
383
-
384
- d-contents::-webkit-scrollbar-thumb:hover {
385
- background: rgba(0, 123, 255, 0.5);
386
- }
387
-
388
- /* Custom tooltip styling for tenet links */
389
- d-contents nav a[title] {
390
- position: relative;
391
- cursor: help;
392
- }
393
-
394
- d-contents nav a[title]:hover {
395
- color: #667eea;
396
- }
397
-
398
- /* Enhanced tooltip using CSS (fallback for title attribute) */
399
- d-contents nav a[title]:after {
400
- content: attr(title);
401
- position: absolute;
402
- left: 100%;
403
- top: 50%;
404
- transform: translateY(-50%);
405
- background: #1a202c;
406
- color: white;
407
- padding: 0.75rem 1rem;
408
- border-radius: 8px;
409
- font-size: 0.85em;
410
- white-space: normal;
411
- width: 300px;
412
- line-height: 1.4;
413
- z-index: 1001;
414
- opacity: 0;
415
- visibility: hidden;
416
- transition: opacity 0.3s ease, visibility 0.3s ease;
417
- pointer-events: none;
418
- box-shadow: 0 4px 12px rgba(0, 0, 0, 0.2);
419
- }
420
-
421
- d-contents nav a[title]:before {
422
- content: '';
423
- position: absolute;
424
- left: 100%;
425
- top: 50%;
426
- transform: translate(-8px, -50%);
427
- border: 8px solid transparent;
428
- border-right-color: #1a202c;
429
- z-index: 1002;
430
- opacity: 0;
431
- visibility: hidden;
432
- transition: opacity 0.3s ease, visibility 0.3s ease;
433
- }
434
-
435
- d-contents nav a[title]:hover:after,
436
- d-contents nav a[title]:hover:before {
437
- opacity: 1;
438
- visibility: visible;
439
- }
440
-
441
- /* Adjust for smaller screens */
442
- @media (max-width: 1400px) {
443
- d-contents nav a[title]:after {
444
- left: auto;
445
- right: 100%;
446
- margin-right: 1rem;
447
- width: 250px;
448
- }
449
-
450
- d-contents nav a[title]:before {
451
- left: auto;
452
- right: 100%;
453
- transform: translate(8px, -50%);
454
- border-right-color: transparent;
455
- border-left-color: #1a202c;
456
- }
457
- }
458
-
459
- /* Improve code syntax highlighting with Prism */
460
- pre[class*="language-"] {
461
- background: #f8f9fa !important;
462
- border: 1px solid #e9ecef !important;
463
- border-radius: 8px !important;
464
- padding: 1.5rem !important;
465
- margin: 1.5rem 0 !important;
466
- overflow-x: auto !important;
467
- font-size: 0.9em !important;
468
- line-height: 1.5 !important;
469
- }
470
-
471
- code[class*="language-"] {
472
- background: none !important;
473
- font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', 'Courier New', monospace !important;
474
- color: #383a42 !important;
475
- }
476
-
477
- /* Inline code */
478
- p code, li code {
479
- background: #f1f3f4 !important;
480
- padding: 0.2em 0.4em !important;
481
- border-radius: 3px !important;
482
- font-size: 0.9em !important;
483
- color: #d73a49 !important;
484
- }
485
-
486
- /* Distill article improvements */
487
- d-article {
488
- max-width: none;
489
- font-size: 19px;
490
- line-height: 1.7 !important;
491
- color: #1a1a1a;
492
- padding-top: 1rem !important;
493
- grid-row-gap: 0 !important;
494
- }
495
-
496
- d-article > * {
497
- grid-column: middle !important;
498
- max-width: none;
499
- }
500
-
501
- /* Adjust for TOC on larger screens */
502
- @media (min-width: 1200px) {
503
- d-article > * {
504
- grid-column: text / page-end !important;
505
- max-width: none;
506
- }
507
- }
508
-
509
- /* Improve paragraph readability */
510
- d-article p {
511
- font-size: 19px;
512
- line-height: 1.5;
513
- margin-top: 0 !important;
514
- color: #1a1a1a;
515
- }
516
-
517
- /* Improve heading sizes */
518
- d-article h1 {
519
- font-size: 3rem;
520
- line-height: 1.2;
521
- margin: 3rem 0 2rem 0;
522
- color: #1a202c;
523
- font-weight: 700;
524
- }
525
-
526
- d-article h2 {
527
- font-size: 2.5rem;
528
- line-height: 1.3;
529
- margin: 1.5rem 0 0.75rem 0 !important;
530
- padding-bottom: 0.5rem !important;
531
- color: #1a202c;
532
- font-weight: 650;
533
- }
534
-
535
- d-article h3 {
536
- font-size: 2rem;
537
- line-height: 1.4;
538
- margin: 2rem 0 1rem 0;
539
- color: #1a202c;
540
- font-weight: 600;
541
- }
542
-
543
- d-article h4 {
544
- font-size: 1.5rem;
545
- line-height: 1.4;
546
- margin: 1.5rem 0 1rem 0;
547
- color: #2d3748;
548
- font-weight: 600;
549
- }
550
-
551
- /* Improve list readability */
552
- d-article ul li,
553
- d-article ol li {
554
- font-size: 18px;
555
- line-height: 1.7;
556
- margin-bottom: 0.5rem;
557
- }
558
-
559
- /* Enhanced tenet reference styling with custom tooltips */
560
- a[href^="#source-of-truth"],
561
- a[href^="#one-model-one-file"],
562
- a[href^="#code-is-product"],
563
- a[href^="#standardize-dont-abstract"],
564
- a[href^="#do-repeat-yourself"],
565
- a[href^="#minimal-user-api"],
566
- a[href^="#backwards-compatibility"],
567
- a[href^="#consistent-public-surface"],
568
- a[href^="#modular-toolbox"] {
569
- position: relative;
570
- color: #667eea;
571
- font-weight: 600;
572
- text-decoration: underline;
573
- text-decoration-color: rgba(102, 126, 234, 0.3);
574
- transition: all 0.3s ease;
575
- }
576
-
577
- a[href^="#source-of-truth"]:hover,
578
- a[href^="#one-model-one-file"]:hover,
579
- a[href^="#code-is-product"]:hover,
580
- a[href^="#standardize-dont-abstract"]:hover,
581
- a[href^="#do-repeat-yourself"]:hover,
582
- a[href^="#minimal-user-api"]:hover,
583
- a[href^="#backwards-compatibility"]:hover,
584
- a[href^="#consistent-public-surface"]:hover,
585
- a[href^="#modular-toolbox"]:hover {
586
- color: #4c51bf;
587
- text-decoration-color: #4c51bf;
588
- background: rgba(102, 126, 234, 0.1);
589
- padding: 2px 4px;
590
- border-radius: 4px;
591
- }
592
-
593
- /* Custom tooltip using data-tooltip attribute */
594
- a[data-tooltip]:after {
595
- content: attr(data-tooltip);
596
- position: absolute;
597
- bottom: 100%;
598
- left: 50%;
599
- transform: translateX(-50%);
600
- background: #1a202c;
601
- color: white;
602
- padding: 0.75rem 1rem;
603
- border-radius: 8px;
604
- font-size: 0.85em;
605
- font-weight: 400;
606
- white-space: normal;
607
- width: 320px;
608
- line-height: 1.4;
609
- z-index: 1001;
610
- opacity: 0;
611
- visibility: hidden;
612
- transition: opacity 0.3s ease, visibility 0.3s ease;
613
- pointer-events: none;
614
- box-shadow: 0 4px 12px rgba(0, 0, 0, 0.2);
615
- margin-bottom: 8px;
616
- }
617
-
618
- a[data-tooltip]:before {
619
- content: '';
620
- position: absolute;
621
- bottom: 100%;
622
- left: 50%;
623
- transform: translateX(-50%);
624
- border: 8px solid transparent;
625
- border-top-color: #1a202c;
626
- z-index: 1002;
627
- opacity: 0;
628
- visibility: hidden;
629
- transition: opacity 0.3s ease, visibility 0.3s ease;
630
- }
631
-
632
- a[data-tooltip]:hover:after,
633
- a[data-tooltip]:hover:before {
634
- opacity: 1;
635
- visibility: visible;
636
- }
637
-
638
- /* Breadcrumb navigation styling */
639
- .crumbs {
640
- background: linear-gradient(135deg, #f0f4ff 0%, #e6eeff 100%);
641
- border-left: 5px solid #667eea;
642
- padding: 1.25rem 1.75rem;
643
- margin: 2.5rem 0;
644
- border-radius: 0 8px 8px 0;
645
- box-shadow: 0 2px 8px rgba(102, 126, 234, 0.12);
646
- font-size: 0.95em;
647
- line-height: 1.6;
648
- color: #4a5568;
649
- }
650
-
651
- .crumbs strong {
652
- color: #667eea;
653
- font-weight: 700;
654
- }
655
-
656
- .crumbs code {
657
- background: rgba(102, 126, 234, 0.1);
658
- padding: 0.15em 0.4em;
659
- border-radius: 3px;
660
- font-size: 0.9em;
661
- color: #4c51bf;
662
- }
663
-
664
- .crumbs a {
665
- color: #667eea;
666
- font-weight: 500;
667
- }
668
-
669
- /* Improve blockquote styling */
670
- d-article blockquote {
671
- font-size: 19px;
672
- line-height: 1.8;
673
- padding: 1.5rem 2rem;
674
- margin: 2rem 0;
675
- border-left: 4px solid #667eea;
676
- background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 50%);
677
- border-radius: 0 8px 8px 0;
678
- font-style: italic;
679
- color: #4a5568;
680
- }
681
-
682
- /* Link capsule styling - only for external HTTP(S) links */
683
- d-article a[href^="http://"],
684
- d-article a[href^="https://"] {
685
- background: linear-gradient(135deg, #e3f2fd 0%, #bbdefb 100%);
686
- color: #1565c0;
687
- text-decoration: none;
688
- padding: 0.15em 0.5em;
689
- border-radius: 12px;
690
- border: 1px solid #90caf9;
691
- display: inline-block;
692
- transition: all 0.3s ease;
693
- font-weight: 500;
694
- box-shadow: 0 1px 3px rgba(21, 101, 192, 0.15);
695
- }
696
-
697
- d-article a[href^="http://"]:hover,
698
- d-article a[href^="https://"]:hover {
699
- background: linear-gradient(135deg, #2196f3 0%, #1976d2 100%);
700
- color: white;
701
- border-color: #1565c0;
702
- transform: translateY(-1px);
703
- box-shadow: 0 4px 12px rgba(21, 101, 192, 0.3);
704
- }
705
-
706
- d-article a[href^="http://"]:active,
707
- d-article a[href^="https://"]:active {
708
- transform: translateY(0);
709
- box-shadow: 0 1px 3px rgba(21, 101, 192, 0.2);
710
- }
711
-
712
- /* Full width elements */
713
- d-article .code-compare,
714
- d-article .interactive-demo,
715
- d-article .memory-chart-container {
716
- max-width: none;
717
- width: 100%;
718
- margin-left: 0;
719
- margin-right: 0;
720
- }
721
-
722
- /* Responsive design improvements */
723
- @media (max-width: 1200px) {
724
- d-article .code-compare,
725
- d-article .interactive-demo {
726
- max-width: 95%;
727
- margin-left: auto;
728
- margin-right: auto;
729
- }
730
- }
731
-
732
- @media (max-width: 768px) {
733
- .tenet-list li.tenet {
734
- padding: 1rem;
735
- }
736
-
737
- .interactive-demo .demo-content {
738
- padding: 1rem;
739
- }
740
- }
741
-
 
webpack.config.js CHANGED
@@ -257,7 +257,10 @@ module.exports = {
257
  "title": "${appConfig.fullTitle}",
258
  "description": "${appConfig.description}",
259
  "published": "Aug 21, 2025",
260
- "authors": [{"author": "Pablo Montalvo", "authorURL": "https://huggingface.co/Molbap"}]
 
 
 
261
  }</script>
262
  </d-front-matter>
263
  <d-title>
 
257
  "title": "${appConfig.fullTitle}",
258
  "description": "${appConfig.description}",
259
  "published": "Aug 21, 2025",
260
+ "authors": [{"author": "Pablo Montalvo", "authorURL": "https://huggingface.co/Molbap"},
261
+ {"author": "Lysandre Debut", "authorURL": "https://huggingface.co/lysandre"},
262
+ {"author": "Pedro Cuenca", "authorURL": "https://huggingface.co/pcuenq"},
263
+ {"author": "Yoni Gozlan", "authorURL": "https://huggingface.co/yonigozlan"}]
264
  }</script>
265
  </d-front-matter>
266
  <d-title>