Commit 09e15a7 · drbh committed
1 Parent(s): 3bdb4b8
fix: add quickstart and avoid autotune when no cuda

Files changed:
- README.md +54 -30
- readme_example.py +51 -0
- torch-ext/megablocks/backend/kernels.py +14 -0
README.md CHANGED

@@ -4,39 +4,63 @@ tags:
 - kernel
 ---
 
-
+## Quickstart
 
 ```bash
-
+uv run https://huggingface.co/kernels-community/megablocks/raw/main/readme_example.py
 ```
 
-expected output:
 
+```python
+# /// script
+# requires-python = "==3.10"
+# dependencies = [
+#     "numpy",
+#     "kernels",
+#     "torch"
+# ]
+# ///
+
+import torch
+from collections import namedtuple
+
+from kernels import get_kernel
+
+# Make reproducible
+torch.manual_seed(42)
+torch.cuda.manual_seed(42)
+
+# Download optimized kernels from the Hugging Face hub
+megablocks = get_kernel("kernels-community/megablocks")
+print("MegaBlocks kernel downloaded successfully.")
+
+model = megablocks.layers.MegaBlocksMoeMLP()
+model.experts = namedtuple("Experts", ["gate_up_proj", "gate_down_proj", "down_proj", "hidden_size"])
+print("MegaBlocksMoeMLP instance created successfully.")
+
+# Config
+ne, hs, isz = 128, 1152, 3072
+
+# Router with proper initialization
+model.router = torch.nn.Linear(hs, ne, device="cuda")
+torch.nn.init.kaiming_uniform_(model.router.weight)
+
+# Expert layers with realistic weights
+e = model.experts
+e.gate_up_proj = torch.nn.Parameter(torch.randn(ne, hs, isz, device="cuda") * 0.02)
+e.gate_up_proj_bias = torch.nn.Parameter(torch.zeros(ne, isz, device="cuda"))
+e.down_proj = torch.nn.Parameter(torch.randn(ne, 1536, hs, device="cuda") * 0.02)
+e.down_proj_bias = torch.nn.Parameter(torch.zeros(ne, hs, device="cuda"))
+e.hidden_size = hs
+print("Expert layers initialized successfully.")
+
+# Test with normalized input
+x = torch.randn(1, 1, hs, device="cuda") * 0.1
+output, expert_weights = model(x)
+print("Model forward pass completed successfully.")
+
+print(f"Output shape: {output.shape}")
+print(f"Output range: [{output.min():.3f}, {output.max():.3f}]")
+print(f"Output: {output.flatten()[:10]}")
+print(f"Expert weights sum: {expert_weights.sum():.3f}")
 ```
-============== test session starts ===============
-platform linux -- Python 3.12.10, pytest-8.3.5, pluggy-1.5.0
-rootdir: /home/ubuntu/Projects/megablocks-moe
-plugins: hypothesis-6.130.12
-collecting 43 items world_size=1
-collected 387 items
-
-tests/layers/moe_test.py ...........................................
-tests/ops/binned_gather_test.py .....................
-tests/ops/binned_scatter_test.py .....................
-tests/ops/cumsum_test.py ................................
-tests/ops/histogram_test.py ......................................................
-tests/ops/padded_gather_test.py ......................................
-tests/ops/padded_scatter_test.py ......................................................
-tests/ops/replicate_test.py ..................................................................................
-tests/ops/sort_test.py ..................
-tests/ops/topology_test.py ....................
-tests/test_mb_moe.py megablocks_moe module imported successfully.
-Available functions: ['Arguments', 'MLP', 'MoE', 'ParallelDroplessMLP', 'ParallelMLP', 'SparseGLU', 'SparseMLP', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '_megablocks_a4f6452_dirty', '_ops', 'argsort', 'backend', 'cumsum', 'dMoE', 'exclusive_cumsum', 'get_load_balancing_loss', 'grouped_gemm_util', 'histogram', 'inclusive_cumsum', 'indices', 'layers', 'ops', 'replicate_backward', 'replicate_forward', 'sort', 'torch']
-.cumsum output: tensor([0, 1, 3, 6], device='cuda:0', dtype=torch.int16)
-...
-
-================ warnings summary ================
-...
--- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
-======= 387 passed, 18 warnings in 54.63s ========
-```
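A note on the quickstart command: the `# /// script` header at the top of the new example is inline script metadata (PEP 723). `uv run` reads it to resolve the listed dependencies before executing the file, which is what lets the one-line `uv run <URL>` invocation above work without a pre-built environment.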
readme_example.py ADDED

@@ -0,0 +1,51 @@
+# /// script
+# requires-python = "==3.10"
+# dependencies = [
+#     "numpy",
+#     "kernels",
+#     "torch"
+# ]
+# ///
+
+import torch
+from collections import namedtuple
+
+from kernels import get_kernel
+
+# Make reproducible
+torch.manual_seed(42)
+torch.cuda.manual_seed(42)
+
+# Download optimized kernels from the Hugging Face hub
+megablocks = get_kernel("kernels-community/megablocks")
+print("MegaBlocks kernel downloaded successfully.")
+
+model = megablocks.layers.MegaBlocksMoeMLP()
+model.experts = namedtuple("Experts", ["gate_up_proj", "gate_down_proj", "down_proj", "hidden_size"])
+print("MegaBlocksMoeMLP instance created successfully.")
+
+# Config
+ne, hs, isz = 128, 1152, 3072
+
+# Router with proper initialization
+model.router = torch.nn.Linear(hs, ne, device="cuda")
+torch.nn.init.kaiming_uniform_(model.router.weight)
+
+# Expert layers with realistic weights
+e = model.experts
+e.gate_up_proj = torch.nn.Parameter(torch.randn(ne, hs, isz, device="cuda") * 0.02)
+e.gate_up_proj_bias = torch.nn.Parameter(torch.zeros(ne, isz, device="cuda"))
+e.down_proj = torch.nn.Parameter(torch.randn(ne, 1536, hs, device="cuda") * 0.02)
+e.down_proj_bias = torch.nn.Parameter(torch.zeros(ne, hs, device="cuda"))
+e.hidden_size = hs
+print("Expert layers initialized successfully.")
+
+# Test with normalized input
+x = torch.randn(1, 1, hs, device="cuda") * 0.1
+output, expert_weights = model(x)
+print("Model forward pass completed successfully.")
+
+print(f"Output shape: {output.shape}")
+print(f"Output range: [{output.min():.3f}, {output.max():.3f}]")
+print(f"Output: {output.flatten()[:10]}")
+print(f"Expert weights sum: {expert_weights.sum():.3f}")
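One detail of the example worth flagging: `namedtuple("Experts", [...])` returns a *class*, not an instance, so the script attaches the expert weights as attributes on that generated type. It works as a lightweight attribute container; the sketch below (a hypothetical simplification, not part of the repo) shows the same idea with `types.SimpleNamespace`:

```python
from types import SimpleNamespace

# Hypothetical sketch: SimpleNamespace is a plain attribute container,
# equivalent to how the example uses the generated namedtuple class.
experts = SimpleNamespace()
experts.hidden_size = 1152   # attach fields freely, as the example does
experts.gate_up_proj = None  # placeholder for the real torch.nn.Parameter
print(experts.hidden_size)
```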
torch-ext/megablocks/backend/kernels.py CHANGED

@@ -5,6 +5,20 @@ import torch
 import triton
 import triton.language as tl
 
+# Stub triton autotune when testing in an env that does not have CUDA;
+# this approach preserves the original code but enables testing without a GPU.
+if torch.cuda.is_available() is False:
+    import warnings
+
+    warnings.warn("CUDA is not available. Triton autotuning is disabled.")
+
+    def _no_autotune(*args, **kwargs):
+        def deco(fn):
+            return fn
+        return deco
+
+    triton.autotune = _no_autotune
+
 
 def assert_is_tensor(x, ndim):
     if x.ndim != ndim:
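To see why this stub is safe, note that `triton.autotune` is a decorator factory: it takes configuration arguments and returns a decorator. The replacement keeps that shape but hands the kernel function back unchanged, so `@triton.autotune(...)`-decorated definitions still import cleanly on CPU-only machines. A minimal self-contained sketch of the pattern (independent of Triton, so it runs anywhere; the decorator arguments are illustrative):

```python
# No-op decorator factory: accepts any arguments, returns the function untouched.
def _no_autotune(*args, **kwargs):
    def deco(fn):
        return fn
    return deco

@_no_autotune(configs=[], key=["n"])  # arguments are accepted and ignored
def double(n):
    return n * 2

assert double(21) == 42  # the decorated function's behavior is unchanged
```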