Spaces:

multimodalart
/

diffusion-gpt

Running on Zero

App Files Files Community

multimodalart HF Staff committed 25 days ago

Commit

a835137

verified ·

1 Parent(s): 5d89594

Update app.py

Browse files

Files changed (1) hide show

app.py +209 -182

app.py CHANGED Viewed

@@ -1,96 +1,123 @@
-# app.py (Corrected Version)
-import os
 import math
 import pickle
-import shutil
-import subprocess
-import sys
 import textwrap
-import time
 from dataclasses import dataclass
 from typing import Optional
-import spaces
-import gradio as gr
-import numpy as np
-import torch
-import torch.nn as nn
-from torch.nn import functional as F
-# --- One-Time Setup Function ---
-def setup_data():
     """
-    Checks for dataset metadata and prepares it if missing.
-    This involves cloning a repo, running a script, and cleaning up.
     """
-    data_dir = 'shakespeare_char'
-    meta_path = os.path.join(data_dir, 'meta.pkl')
     if os.path.exists(meta_path):
-        print("Dataset metadata found. Skipping setup.")
         return
-    print("Dataset metadata not found. Starting one-time setup...")
-    print("This may take a minute...")
-    repo_url = "https://github.com/karpathy/nanoGPT"
-    repo_dir = "nanoGPT"
-    try:
-        print(f"Cloning {repo_url}...")
-        subprocess.run(["git", "clone", repo_url], check=True, capture_output=True)
-        source_data_dir = os.path.join(repo_dir, 'data', 'shakespeare_char')
-        print(f"Copying data from {source_data_dir} to {data_dir}...")
-        shutil.copytree(source_data_dir, data_dir)
-        prepare_script_path = os.path.join(data_dir, 'prepare.py')
-        print(f"Running {prepare_script_path} to generate metadata...")
-        subprocess.run([sys.executable, prepare_script_path], check=True, capture_output=True)
-        print("Setup successful. 'meta.pkl' has been created.")
-    except subprocess.CalledProcessError as e:
-        print(f"An error occurred during setup: {e}", file=sys.stderr)
-        print(f"Stdout: {e.stdout.decode()}", file=sys.stderr)
-        print(f"Stderr: {e.stderr.decode()}", file=sys.stderr)
-        sys.exit("Setup failed. Please check your git installation and internet connection.")
-    except Exception as e:
-        print(f"An unexpected error occurred: {e}", file=sys.stderr)
-        sys.exit("Setup failed.")
-    finally:
-        if os.path.exists(repo_dir):
-            print(f"Cleaning up by removing '{repo_dir}' directory...")
-            shutil.rmtree(repo_dir)
-# --- Run Setup and Load Data ---
-setup_data()
-# Load metadata for character mappings
 data_dir = './shakespeare_char/'
 meta_path = os.path.join(data_dir, 'meta.pkl')
 with open(meta_path, 'rb') as f:
     meta = pickle.load(f)
 itos = meta['itos']
 stoi = meta['stoi']
-vocab_size = meta['vocab_size']
-CONTEXT_LENGTH = 256
 def decode(indices_tensor: torch.Tensor):
-    if indices_tensor.dim() == 2:
-        indices_tensor = indices_tensor[0]
     indices = indices_tensor.cpu().numpy()
-    return ''.join([itos[i] for i in indices])
 def wrap_text(long_text, width=80):
     paragraphs = long_text.splitlines()
     wrapped = [textwrap.fill(p, width=width) if p else '' for p in paragraphs]
     return "\n".join(wrapped)
-# --- Model Architecture ---
 class MLP(nn.Module):
     def __init__(self, config):
@@ -100,7 +127,11 @@ class MLP(nn.Module):
         self.c_proj  = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
         self.dropout = nn.Dropout(config.dropout)
     def forward(self, x):
-        return self.dropout(self.c_proj(self.gelu(self.c_fc(x))))
 class SelfAttention(nn.Module):
     def __init__(self, config):
@@ -121,21 +152,27 @@ class SelfAttention(nn.Module):
         q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
         v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
         if self.flash:
-            y = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=False)
         else:
             att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
             att = F.softmax(att, dim=-1)
             att = self.attn_dropout(att)
             y = att @ v
         y = y.transpose(1, 2).contiguous().view(B, T, C)
-        return self.resid_dropout(self.c_proj(y))
 def modulate(x: torch.Tensor, shift: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
     return x * (1 + scale) + shift
 def bias_add_scale(x: torch.Tensor, bias: Optional[torch.Tensor], scale: torch.Tensor, residual: Optional[torch.Tensor]) -> torch.Tensor:
-    out = scale * (x + bias) if bias is not None else scale * x
-    return residual + out if residual is not None else out
 class DDiTBlock(nn.Module):
     def __init__(self, config):
@@ -144,15 +181,14 @@ class DDiTBlock(nn.Module):
         self.attn = SelfAttention(config)
         self.ln_2 = nn.LayerNorm(config.n_embd, bias=config.bias)
         self.mlp = MLP(config)
         self.adaLN_modulation = nn.Linear(config.cond_dim, 6 * config.n_embd)
         self.adaLN_modulation.weight.data.zero_()
         self.adaLN_modulation.bias.data.zero_()
     def forward(self, x, c):
         shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c)[:, None].chunk(6, dim=2)
-        x_skip = x
-        modulated_x = modulate(self.ln_1(x), shift_msa, scale_msa)
         x = bias_add_scale(self.attn(self.ln_1(x)), None, gate_msa, x_skip)
         x = bias_add_scale(self.mlp(modulate(self.ln_2(x), shift_mlp, scale_mlp)), None, gate_mlp, x)
         return x
@@ -170,7 +206,8 @@ class DDitFinalLayer(nn.Module):
     def forward(self, x, c):
         shift, scale = self.adaLN_modulation(c)[:, None].chunk(2, dim=2)
         x = modulate(self.norm_final(x), shift, scale)
-        return self.linear(x)
 class TimestepEmbedder(nn.Module):
     def __init__(self, hidden_size, frequency_embedding_size=256):
@@ -184,7 +221,9 @@ class TimestepEmbedder(nn.Module):
     @staticmethod
     def timestep_embedding(t, dim, max_period=10000):
         half = dim // 2
-        freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(device=t.device)
         args = t[:, None].float() * freqs[None]
         embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
         if dim % 2:
@@ -192,11 +231,14 @@ class TimestepEmbedder(nn.Module):
         return embedding
     def forward(self, t):
         t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
-        return self.mlp(t_freq)
 class GPT(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.config = config
         self.sigma_map = TimestepEmbedder(config.cond_dim)
         self.transformer = nn.ModuleDict(dict(
@@ -204,16 +246,13 @@ class GPT(nn.Module):
             wpe = nn.Embedding(config.block_size, config.n_embd),
             drop = nn.Dropout(config.dropout),
             h = nn.ModuleList([DDiTBlock(config) for _ in range(config.n_layer)]),
-            ln_f = nn.LayerNorm(config.n_embd, bias=config.bias), # <<< FIX 1: ADDED THIS LAYER
         ))
         self.lm_head = DDitFinalLayer(config)
         self.apply(self._init_weights)
-        # Apply special scaled init to the residual projections
         for pn, p in self.named_parameters():
             if pn.endswith('c_proj.weight'):
                 torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))
     def _init_weights(self, module):
         if isinstance(module, nn.Linear):
             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
@@ -221,48 +260,41 @@ class GPT(nn.Module):
                 torch.nn.init.zeros_(module.bias)
         elif isinstance(module, nn.Embedding):
             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
     def forward(self, idx, sigma):
         sigma = sigma.reshape(-1)
         b, t = idx.size()
         c = F.silu(self.sigma_map(sigma))
-        pos = torch.arange(0, t, dtype=torch.long, device=idx.device)
         tok_emb = self.transformer.wte(idx)
         pos_emb = self.transformer.wpe(pos)
         x = self.transformer.drop(tok_emb + pos_emb)
         for block in self.transformer.h:
             x = block(x, c)
-        x = self.transformer.ln_f(x) # <<< FIX 2: CALLED THE LAYER HERE
         x = self.lm_head(x, c)
-        return torch.scatter(x, -1, idx[..., None], torch.zeros_like(x[..., :1]))
-@dataclass
-class GPTConfig:
-    block_size: int = 1024
-    vocab_size: int = 50304
-    n_layer: int = 12
-    n_head: int = 12
-    n_embd: int = 768
-    cond_dim: int = 64
-    dropout: float = 0.0
-    bias: bool = False
-# --- Noise Schedule & Sampling Logic ---
 class GeometricNoise:
     def __init__(self, sigma_min=1e-4, sigma_max=20):
-        self.sigmas = 1.0 * torch.tensor([sigma_min, sigma_max])
     def total_noise(self, t):
         return self.sigmas[0] ** (1 - t) * self.sigmas[1] ** t
     def __call__(self, t):
-        return self.total_noise(t), None # Rate not needed for sampling
 def transition(x_t: torch.Tensor, delta_sigma: torch.Tensor) -> torch.Tensor:
     base_prob = (1 - torch.exp(-delta_sigma[..., None])) / vocab_size
     trans = torch.ones(*x_t.shape, vocab_size, device=x_t.device) * base_prob
     trans = trans.scatter(-1, x_t[..., None], torch.zeros_like(trans))
     diag_fill = 1 - trans.sum(dim=-1, keepdim=True)
-    return trans.scatter(-1, x_t[..., None], diag_fill)
 def staggered_score(score, delta_sigma):
     exp_factor = torch.exp(-delta_sigma)[..., None]
@@ -274,119 +306,114 @@ def sample_categorical(probs: torch.Tensor) -> torch.Tensor:
     gumbel_noise = -torch.log(-torch.log(torch.rand_like(probs) + eps) + eps)
     return torch.argmax(torch.log(probs + eps) + gumbel_noise, dim=-1)
-# --- Global Model Loading ---
-print("Setting up model and device...")
-DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
-print(f"===================================")
-print(f"Using device: {DEVICE}")
-print(f"===================================")
 model_args = dict(n_layer=6, n_head=6, n_embd=384, cond_dim=64,
-                  bias=False, vocab_size=vocab_size, block_size=CONTEXT_LENGTH, dropout=0.2)
 config = GPTConfig(**model_args)
 model = GPT(config)
-print("Loading pre-trained model weights...")
 model.load_state_dict(
     torch.hub.load_state_dict_from_url(
         'https://raw.githubusercontent.com/ash80/diffusion-gpt/master/pretrained_model/model_epoch_25.pth',
-        map_location=DEVICE
     )
 )
-model.to(DEVICE)
 model.eval()
-NOISE = GeometricNoise(sigma_min=1e-4, sigma_max=20)
-print("Model setup complete. Launching Gradio demo...")
-# --- Gradio Generation Function ---
 @spaces.GPU
 def generate_text(steps):
     """
-    Generator function that yields denoised text at each step.
-    This logic is a 1:1 copy of the original Colab notebook's sampling loop.
     """
     steps = int(steps)
     eps = 1e-5
-    timesteps = torch.linspace(1, eps, steps + 1, device=DEVICE)
-    step_size = (1 - eps) / steps
-    # Start with a fresh random sample
-    x = torch.randint(0, vocab_size, (1, CONTEXT_LENGTH), device=DEVICE)
-    # Initial random text
-    initial_text = decode(x)
-    yield f"Step 0/{steps} (Initial Noise):\n\n{wrap_text(initial_text)}"
-    time.sleep(0.5)
     with torch.no_grad():
-        for i in range(steps + 1):
-            t = timesteps[i] * torch.ones(x.shape[0], 1, device=DEVICE)
-            curr_sigma_bar, _ = NOISE(t)
-            if i < steps:
-                # This is an intermediate denoising step
-                next_sigma_bar, _ = NOISE(t - step_size)
-                delta_sigma = curr_sigma_bar - next_sigma_bar
-                log_score = model(x, curr_sigma_bar)
-                score = torch.exp(log_score)
-                stag_score = staggered_score(score, delta_sigma)
-                probs = stag_score * transition(x, delta_sigma)
-                x = sample_categorical(probs)
-            else:
-                # This is the final, full denoising step
-                # The "next sigma" is 0, so delta_sigma is the entire current noise.
-                delta_sigma = curr_sigma_bar
-                log_score = model(x, curr_sigma_bar)
-                score = torch.exp(log_score)
-                stag_score = staggered_score(score, delta_sigma)
-                probs = stag_score * transition(x, delta_sigma)
-                x = sample_categorical(probs)
-            # Yield the decoded text after each step
-            # The last yield will be the final result
-            decoded_text = decode(x)
-            if i < steps:
-                yield f"Step {i+1}/{steps}:\n\n{wrap_text(decoded_text)}"
-            else:
-                yield f"Final Result (Step {steps}/{steps}):\n\n{wrap_text(decoded_text)}"
-# --- Gradio Interface ---
 with gr.Blocks(theme=gr.themes.Citrus()) as demo:
     gr.Markdown(
         """
-        # The Annotated Discrete Diffusion Model: Live Demo
-        This demo visualizes the denoising process of a character-level discrete diffusion model.
-        Start with pure random noise and watch as coherent text, in the style of Shakespeare, emerges over several steps.
         """
     )
-    with gr.Row():
-        steps_slider = gr.Slider(
-            minimum=10,
-            maximum=200,
-            value=128,
-            step=1,
-            label="Number of Denoising Steps",
-            info="More steps can lead to better quality but take longer."
-        )
-        generate_button = gr.Button("Generate", variant="primary")
     output_textbox = gr.Textbox(
-        label="Denoising Process",
-        lines=15,
-        interactive=False,
         show_copy_button=True,
-        placeholder="The denoising process will appear here..."
     )
     generate_button.click(
-        fn=generate_text,
-        inputs=[steps_slider],
         outputs=[output_textbox]
     )

+import gradio as gr
+import spaces
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+import numpy as np
 import math
+import os
 import pickle
+import requests
 import textwrap
+import subprocess
+import shutil
 from dataclasses import dataclass
 from typing import Optional
+def setup_environment():
     """
+    Make sure 'shakespeare_char/meta.pkl' exists, creating it on first run.
+
+    Each step is skipped when its output is already on disk:
+    - Clones nanoGPT if not present.
+    - Copies the shakespeare_char dataset directory.
+    - Runs the dataset's prepare.py, which is expected to write meta.pkl.
+
+    Raises:
+        subprocess.CalledProcessError: if `git clone` or prepare.py exits
+            with a non-zero status (its stderr is printed before re-raising).
     """
+    # Function-scope import so the child process can reuse this interpreter.
+    import sys
+    nano_gpt_repo_path = 'nanoGPT'
+    data_dir_path = 'shakespeare_char'
+    meta_path = os.path.join(data_dir_path, 'meta.pkl')
+    # If the final metadata file already exists, we assume setup is complete.
     if os.path.exists(meta_path):
+        print("Dataset and metadata found. Skipping setup.")
         return
+    print("Required data not found. Starting one-time setup...")
+    # 1. Clone nanoGPT repository if it doesn't exist
+    if not os.path.exists(nano_gpt_repo_path):
+        print(f"Cloning nanoGPT repository...")
+        try:
+            subprocess.run(
+                ['git', 'clone', 'https://github.com/karpathy/nanoGPT.git'],
+                check=True, capture_output=True, text=True
+            )
+            print("Cloned successfully.")
+        except subprocess.CalledProcessError as e:
+            print(f"Error cloning repository: {e.stderr}")
+            raise
+    else:
+        print("nanoGPT repository already exists.")
+    # 2. Copy the dataset directory if it doesn't exist
+    source_data_dir = os.path.join(nano_gpt_repo_path, 'data', 'shakespeare_char')
+    if not os.path.exists(data_dir_path):
+        print(f"Copying '{source_data_dir}' to '{data_dir_path}'...")
+        shutil.copytree(source_data_dir, data_dir_path)
+        print("Copied successfully.")
+    else:
+        print(f"'{data_dir_path}' directory already exists.")
+    # 3. Run the data preparation script
+    prepare_script_path = os.path.join(data_dir_path, 'prepare.py')
+    if not os.path.exists(meta_path):
+        print(f"Running data preparation script: '{prepare_script_path}'...")
+        # We need to run the script from within its directory for it to find input.txt
+        try:
+            # sys.executable (not a bare 'python') guarantees the script runs
+            # under the same interpreter/environment as this app, even when
+            # 'python' is absent from PATH or points somewhere else.
+            subprocess.run(
+                [sys.executable, 'prepare.py'],
+                check=True, cwd=data_dir_path, capture_output=True, text=True
+            )
+            print("Data preparation script finished successfully.")
+        except subprocess.CalledProcessError as e:
+            print(f"Error running prepare.py: {e.stderr}")
+            raise
+    print("Setup complete.")
+# Run the setup process before anything else
+setup_environment()
+# --- 2. Global Setup & Helper Functions ---
+# Load metadata (guaranteed to exist by the setup function)
 data_dir = './shakespeare_char/'
 meta_path = os.path.join(data_dir, 'meta.pkl')
 with open(meta_path, 'rb') as f:
     meta = pickle.load(f)
+vocab_size = meta['vocab_size']
 itos = meta['itos']
 stoi = meta['stoi']
+context_length = 256
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
 def decode(indices_tensor: torch.Tensor):
+    """Decodes a 1D tensor of indices to text"""
+    # squeeze(0) only drops a leading dimension of size 1; any other shape is
+    # left unchanged.
+    # NOTE(review): a batch with more than one row would stay 2D here. Callers
+    # in this file pass x[0], so presumably only 1D input is expected — confirm.
+    if indices_tensor.dim() > 1:
+        indices_tensor = indices_tensor.squeeze(0)
     indices = indices_tensor.cpu().numpy()
+    # Ids missing from `itos` are rendered as '?' instead of raising KeyError.
+    return ''.join([itos.get(i, '?') for i in indices]) # Use .get for safety
 def wrap_text(long_text, width=80):
+    """Wraps text to a maximum line width, preserving paragraph breaks."""
     paragraphs = long_text.splitlines()
     wrapped = [textwrap.fill(p, width=width) if p else '' for p in paragraphs]
     return "\n".join(wrapped)
+# --- 3. Model Architecture (Identical to Notebook) ---
+@dataclass
+class GPTConfig:
+    block_size: int = 1024
+    vocab_size: int = 50304
+    n_layer: int = 12
+    n_head: int = 12
+    n_embd: int = 768
+    cond_dim: int = 64
+    dropout: float = 0.0
+    bias: bool = False
 class MLP(nn.Module):
     def __init__(self, config):
         self.c_proj  = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
         self.dropout = nn.Dropout(config.dropout)
     def forward(self, x):
+        x = self.c_fc(x)
+        x = self.gelu(x)
+        x = self.c_proj(x)
+        x = self.dropout(x)
+        return x
 class SelfAttention(nn.Module):
     def __init__(self, config):
         q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
         v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
         if self.flash:
+            y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=False)
         else:
             att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
             att = F.softmax(att, dim=-1)
             att = self.attn_dropout(att)
             y = att @ v
         y = y.transpose(1, 2).contiguous().view(B, T, C)
+        y = self.resid_dropout(self.c_proj(y))
+        return y
 def modulate(x: torch.Tensor, shift: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
     return x * (1 + scale) + shift
 def bias_add_scale(x: torch.Tensor, bias: Optional[torch.Tensor], scale: torch.Tensor, residual: Optional[torch.Tensor]) -> torch.Tensor:
+    if bias is not None:
+        out = scale * (x + bias)
+    else:
+        out = scale * x
+    if residual is not None:
+        out = residual + out
+    return out
 class DDiTBlock(nn.Module):
     def __init__(self, config):
         self.attn = SelfAttention(config)
         self.ln_2 = nn.LayerNorm(config.n_embd, bias=config.bias)
         self.mlp = MLP(config)
         self.adaLN_modulation = nn.Linear(config.cond_dim, 6 * config.n_embd)
         self.adaLN_modulation.weight.data.zero_()
         self.adaLN_modulation.bias.data.zero_()
     def forward(self, x, c):
+        # Project the conditioning vector `c` into six modulation tensors:
+        # shift/scale/gate for the attention branch and for the MLP branch.
         shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c)[:, None].chunk(6, dim=2)
+        # Keep the block input for the residual connection of the attention branch.
+        x_skip = x
+        x = modulate(self.ln_1(x), shift_msa, scale_msa)
+        x = self.attn(x)
+        # NOTE(review): attention is applied a second time on the next line, to
+        # ln_1 of the already-attended `x`. This looks redundant, but the
+        # pretrained checkpoint was presumably trained with this exact forward
+        # pass, so simplifying it could change the model's outputs — confirm
+        # against the training code before touching it.
         x = bias_add_scale(self.attn(self.ln_1(x)), None, gate_msa, x_skip)
+        # MLP branch: modulated ln_2 output through the MLP, gated, added to x.
         x = bias_add_scale(self.mlp(modulate(self.ln_2(x), shift_mlp, scale_mlp)), None, gate_mlp, x)
         return x
     def forward(self, x, c):
         shift, scale = self.adaLN_modulation(c)[:, None].chunk(2, dim=2)
         x = modulate(self.norm_final(x), shift, scale)
+        x = self.linear(x)
+        return x
 class TimestepEmbedder(nn.Module):
     def __init__(self, hidden_size, frequency_embedding_size=256):
     @staticmethod
     def timestep_embedding(t, dim, max_period=10000):
         half = dim // 2
+        freqs = torch.exp(
+            -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
+        ).to(device=t.device)
         args = t[:, None].float() * freqs[None]
         embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
         if dim % 2:
         return embedding
     def forward(self, t):
         t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
+        t_emb = self.mlp(t_freq)
+        return t_emb
 class GPT(nn.Module):
     def __init__(self, config):
         super().__init__()
+        assert config.vocab_size is not None
+        assert config.block_size is not None
         self.config = config
         self.sigma_map = TimestepEmbedder(config.cond_dim)
         self.transformer = nn.ModuleDict(dict(
             wpe = nn.Embedding(config.block_size, config.n_embd),
             drop = nn.Dropout(config.dropout),
             h = nn.ModuleList([DDiTBlock(config) for _ in range(config.n_layer)]),
+            ln_f = nn.LayerNorm(config.n_embd, bias=config.bias),
         ))
         self.lm_head = DDitFinalLayer(config)
         self.apply(self._init_weights)
         for pn, p in self.named_parameters():
             if pn.endswith('c_proj.weight'):
                 torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))
     def _init_weights(self, module):
         if isinstance(module, nn.Linear):
             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
                 torch.nn.init.zeros_(module.bias)
         elif isinstance(module, nn.Embedding):
             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
     def forward(self, idx, sigma):
+        """
+        Run the model on token ids `idx` conditioned on noise level `sigma`.
+
+        `idx` must be 2D (it is unpacked as batch, sequence length) and the
+        sequence length must not exceed `config.block_size`. `sigma` is
+        flattened to 1D before being embedded.
+
+        Returns the output of `lm_head` in which, for every position, the
+        entry of the last dimension selected by that position's input token
+        has been overwritten with zero.
+        """
         sigma = sigma.reshape(-1)
         b, t = idx.size()
         c = F.silu(self.sigma_map(sigma))
+        assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
+        # Build the positions on the input's own device instead of relying on
+        # a module-level global, so the embedding lookup cannot hit a device
+        # mismatch when the model/input live somewhere else.
+        pos = torch.arange(0, t, dtype=torch.long, device=idx.device)
         tok_emb = self.transformer.wte(idx)
         pos_emb = self.transformer.wpe(pos)
         x = self.transformer.drop(tok_emb + pos_emb)
         for block in self.transformer.h:
             x = block(x, c)
+        x = self.transformer.ln_f(x)
         x = self.lm_head(x, c)
+        # Zero the slot that corresponds to each position's current token.
+        x = torch.scatter(x, -1, idx[..., None], torch.zeros_like(x[..., :1]))
+        return x
 class GeometricNoise:
+    # Noise schedule that interpolates geometrically between sigma_min (at
+    # t = 0) and sigma_max (at t = 1).
     def __init__(self, sigma_min=1e-4, sigma_max=20):
+        # sigmas[0] is sigma_min, sigmas[1] is sigma_max; the tensor is placed
+        # on the module-level `device`.
+        self.sigmas = 1.0 * torch.tensor([sigma_min, sigma_max]).to(device)
+    def rate_noise(self, t):
+        # total_noise(t) multiplied by (log sigma_max - log sigma_min), i.e.
+        # the derivative of total_noise with respect to t.
+        return self.sigmas[0] ** (1 - t) * self.sigmas[1] ** t * (self.sigmas[1].log() - self.sigmas[0].log())
     def total_noise(self, t):
+        # sigma_min ** (1 - t) * sigma_max ** t
         return self.sigmas[0] ** (1 - t) * self.sigmas[1] ** t
     def __call__(self, t):
+        # Returns the pair (total_noise(t), rate_noise(t)).
+        return self.total_noise(t), self.rate_noise(t)
+# --- 4. Inference & Sampling Logic (Identical to Notebook) ---
 def transition(x_t: torch.Tensor, delta_sigma: torch.Tensor) -> torch.Tensor:
     base_prob = (1 - torch.exp(-delta_sigma[..., None])) / vocab_size
     trans = torch.ones(*x_t.shape, vocab_size, device=x_t.device) * base_prob
     trans = trans.scatter(-1, x_t[..., None], torch.zeros_like(trans))
     diag_fill = 1 - trans.sum(dim=-1, keepdim=True)
+    trans = trans.scatter(-1, x_t[..., None], diag_fill)
+    return trans
 def staggered_score(score, delta_sigma):
     exp_factor = torch.exp(-delta_sigma)[..., None]
     gumbel_noise = -torch.log(-torch.log(torch.rand_like(probs) + eps) + eps)
     return torch.argmax(torch.log(probs + eps) + gumbel_noise, dim=-1)
+# --- 5. Model Initialization and Loading ---
+print("Initializing and loading the pretrained model...")
 model_args = dict(n_layer=6, n_head=6, n_embd=384, cond_dim=64,
+                  bias=False, vocab_size=vocab_size, block_size=context_length, dropout=0.2)
 config = GPTConfig(**model_args)
 model = GPT(config)
 model.load_state_dict(
     torch.hub.load_state_dict_from_url(
         'https://raw.githubusercontent.com/ash80/diffusion-gpt/master/pretrained_model/model_epoch_25.pth',
+        map_location=device
     )
 )
+model.to(device)
 model.eval()
+noise = GeometricNoise(sigma_min=1e-4, sigma_max=20)
+print("Model loaded successfully.")
+# --- 6. Gradio Interface ---
 @spaces.GPU
 def generate_text(steps):
     """
+    The main generation function for the Gradio app.
+    This function contains the exact denoising loop from the notebook.
     """
     steps = int(steps)
     eps = 1e-5
+    # Start with a random sample
+    x = torch.randint(0, vocab_size, (1, context_length), device=device)
+    initial_text = f"--- Initial Random Noise ---\n\n{wrap_text(decode(x[0]))}"
+    yield initial_text
+    timesteps = torch.linspace(1, eps, steps + 1, device=device)
+    step_size = (1 - eps) / steps
     with torch.no_grad():
+        for i in range(steps):
+            t = timesteps[i] * torch.ones(x.shape[0], 1, device=device)
+            curr_sigma_bar = noise(t)[0]
+            # This logic block handles all but the last step
+            next_sigma_bar = noise(t - step_size)[0]
+            delta_sigma = curr_sigma_bar - next_sigma_bar
+            log_score = model(x, curr_sigma_bar)
+            score = torch.exp(log_score)
+            stag_score = staggered_score(score, delta_sigma)
+            probs = stag_score * transition(x, delta_sigma)
+            x = sample_categorical(probs)
+            # Yield intermediate result
+            progress_text = f"--- Denoising Step {i + 1}/{steps} ---\n\n{wrap_text(decode(x[0]))}"
+            yield progress_text
+        # Final denoising step
+        t = timesteps[steps] * torch.ones(x.shape[0], 1, device=device)
+        curr_sigma_bar = noise(t)[0]
+        delta_sigma = curr_sigma_bar # delta is curr_sigma - 0
+        log_score = model(x, curr_sigma_bar)
+        score = torch.exp(log_score)
+        stag_score = staggered_score(score, delta_sigma)
+        probs = stag_score * transition(x, delta_sigma)
+        x = sample_categorical(probs)
+    final_text = f"--- Final Denoised Text (Step {steps}) ---\n\n{wrap_text(decode(x[0]))}"
+    yield final_text
+# Define the Gradio UI
 with gr.Blocks(theme=gr.themes.Citrus()) as demo:
     gr.Markdown(
         """
+        # The Annotated Discrete Diffusion Models
+        This Gradio demo provides an interactive implementation of the character-level discrete diffusion model from the notebook.
+        The model starts with random characters (noise) and iteratively denoises them to generate coherent text in the style of Shakespeare.
         """
     )
+    steps_slider = gr.Slider(
+        minimum=10,
+        maximum=256,
+        value=128,
+        step=1,
+        label="Denoising Steps",
+        info="Number of steps in the reverse diffusion process."
+    )
+    generate_button = gr.Button("Generate", variant="primary")
     output_textbox = gr.Textbox(
+        label="Generated Text",
+        lines=15,
+        interactive=False,
         show_copy_button=True,
+        placeholder="Generation will appear here..."
     )
     generate_button.click(
+        fn=generate_text,
+        inputs=[steps_slider],
         outputs=[output_textbox]
     )