Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -1,3 +1,5 @@
+# app.py (Corrected Version)
+
 import os
 import math
 import pickle

@@ -8,8 +10,8 @@ import textwrap
 import time
 from dataclasses import dataclass
 from typing import Optional
-import spaces
 
+import spaces
 import gradio as gr
 import numpy as np
 import torch

@@ -37,19 +39,15 @@ def setup_data():
     repo_dir = "nanoGPT"
 
     try:
-        # 1. Clone the repository
         print(f"Cloning {repo_url}...")
         subprocess.run(["git", "clone", repo_url], check=True, capture_output=True)
 
-        # 2. Copy the data directory
         source_data_dir = os.path.join(repo_dir, 'data', 'shakespeare_char')
         print(f"Copying data from {source_data_dir} to {data_dir}...")
         shutil.copytree(source_data_dir, data_dir)
 
-        # 3. Run the preparation script
         prepare_script_path = os.path.join(data_dir, 'prepare.py')
         print(f"Running {prepare_script_path} to generate metadata...")
-        # Use the same python executable that is running this script
         subprocess.run([sys.executable, prepare_script_path], check=True, capture_output=True)
 
         print("Setup successful. 'meta.pkl' has been created.")

@@ -63,7 +61,6 @@ def setup_data():
         print(f"An unexpected error occurred: {e}", file=sys.stderr)
         sys.exit("Setup failed.")
     finally:
-        # 4. Clean up the cloned repository
         if os.path.exists(repo_dir):
             print(f"Cleaning up by removing '{repo_dir}' directory...")
             shutil.rmtree(repo_dir)

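Note: the setup above reproduces nanoGPT's shakespeare_char pipeline, whose prepare.py writes a meta.pkl holding the character vocabulary that decode() below depends on. A minimal sketch of loading it, assuming nanoGPT's standard stoi/itos/vocab_size keys (illustrative, not a copy of this app's loading code):

import pickle

# meta.pkl is produced by data/shakespeare_char/prepare.py in the cloned repo
with open('data/shakespeare_char/meta.pkl', 'rb') as f:
    meta = pickle.load(f)

stoi = meta['stoi']              # char -> index
itos = meta['itos']              # index -> char
vocab_size = meta['vocab_size']  # 65 for the Shakespeare character set
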
@@ -83,20 +80,17 @@ vocab_size = meta['vocab_size']
 CONTEXT_LENGTH = 256
 
 def decode(indices_tensor: torch.Tensor):
-    '''Decodes a 1D tensor of indices to text'''
     if indices_tensor.dim() == 2:
         indices_tensor = indices_tensor[0]
     indices = indices_tensor.cpu().numpy()
     return ''.join([itos[i] for i in indices])
 
 def wrap_text(long_text, width=80):
-    """Wraps text to a maximum line width, preserving paragraph breaks."""
     paragraphs = long_text.splitlines()
     wrapped = [textwrap.fill(p, width=width) if p else '' for p in paragraphs]
     return "\n".join(wrapped)
 
-
-# --- Model Architecture (Copied from the notebook) ---
+# --- Model Architecture ---
 
 class MLP(nn.Module):
     def __init__(self, config):

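A quick check of wrap_text's behavior (self-contained copy of the helper above): empty lines from splitlines() pass through untouched, so paragraph breaks survive the wrapping.

import textwrap

def wrap_text(long_text, width=80):
    paragraphs = long_text.splitlines()
    wrapped = [textwrap.fill(p, width=width) if p else '' for p in paragraphs]
    return "\n".join(wrapped)

sample = "word " * 40 + "\n\nsecond paragraph"
print(wrap_text(sample))  # two wrapped paragraphs separated by a blank line
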
@@ -209,9 +203,16 @@ class GPT(nn.Module):
             wpe = nn.Embedding(config.block_size, config.n_embd),
             drop = nn.Dropout(config.dropout),
             h = nn.ModuleList([DDiTBlock(config) for _ in range(config.n_layer)]),
+            ln_f = nn.LayerNorm(config.n_embd, bias=config.bias), # <<< FIX 1: ADDED THIS LAYER
         ))
         self.lm_head = DDitFinalLayer(config)
         self.apply(self._init_weights)
+
+        # Apply special scaled init to the residual projections
+        for pn, p in self.named_parameters():
+            if pn.endswith('c_proj.weight'):
+                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))
+
     def _init_weights(self, module):
         if isinstance(module, nn.Linear):
             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

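Both additions mirror nanoGPT's GPT-2 conventions: a final LayerNorm before the output head (FIX 1), and residual-projection weights initialized with std scaled by 1/sqrt(2 * n_layer) so the residual stream's variance stays roughly constant with depth, assuming each block contributes two residual branches as in GPT-2. A standalone sketch of the scaled init, with illustrative module names:

import math
import torch
import torch.nn as nn

n_layer = 6

class Block(nn.Module):
    def __init__(self, n_embd=384):
        super().__init__()
        self.c_proj = nn.Linear(n_embd, n_embd)  # residual projection

blocks = nn.ModuleList([Block() for _ in range(n_layer)])

# Shrink only the residual projections; all other Linears keep std=0.02.
for pn, p in blocks.named_parameters():
    if pn.endswith('c_proj.weight'):
        torch.nn.init.normal_(p, mean=0.0, std=0.02 / math.sqrt(2 * n_layer))
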
@@ -219,6 +220,7 @@ class GPT(nn.Module):
             torch.nn.init.zeros_(module.bias)
         elif isinstance(module, nn.Embedding):
             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+
     def forward(self, idx, sigma):
         sigma = sigma.reshape(-1)
         b, t = idx.size()

@@ -229,6 +231,7 @@ class GPT(nn.Module):
         x = self.transformer.drop(tok_emb + pos_emb)
         for block in self.transformer.h:
             x = block(x, c)
+        x = self.transformer.ln_f(x) # <<< FIX 2: CALLED THE LAYER HERE
         x = self.lm_head(x, c)
         return torch.scatter(x, -1, idx[..., None], torch.zeros_like(x[..., :1]))
 

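The torch.scatter call in forward zeroes the model's output at each position's current token index, so the score for keeping the current token is pinned rather than model-predicted. A small demonstration of that indexing pattern:

import torch

x = torch.randn(1, 3, 5)          # (batch, seq, vocab) toy scores
idx = torch.tensor([[2, 0, 4]])   # current token at each position

# Writes 0 at x[b, t, idx[b, t]] along the vocab dimension.
out = torch.scatter(x, -1, idx[..., None], torch.zeros_like(x[..., :1]))
print(out[0, 0, 2].item(), out[0, 1, 0].item(), out[0, 2, 4].item())  # 0.0 0.0 0.0
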
@@ -248,12 +251,10 @@ class GPTConfig:
 class GeometricNoise:
     def __init__(self, sigma_min=1e-4, sigma_max=20):
         self.sigmas = 1.0 * torch.tensor([sigma_min, sigma_max])
-    def rate_noise(self, t):
-        return self.sigmas[0] ** (1 - t) * self.sigmas[1] ** t * (self.sigmas[1].log() - self.sigmas[0].log())
     def total_noise(self, t):
         return self.sigmas[0] ** (1 - t) * self.sigmas[1] ** t
     def __call__(self, t):
-        return self.total_noise(t),
+        return self.total_noise(t), None # Rate not needed for sampling
 
 def transition(x_t: torch.Tensor, delta_sigma: torch.Tensor) -> torch.Tensor:
     base_prob = (1 - torch.exp(-delta_sigma[..., None])) / vocab_size

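total_noise interpolates geometrically between the two extremes: sigma_bar(t) = sigma_min**(1-t) * sigma_max**t, giving sigma_min at t=0 and sigma_max at t=1. The deleted rate_noise was its time derivative, which (per the new comment) is not needed for sampling, so __call__ now returns an explicit (total, None) pair. A quick numeric check of the schedule:

import torch

sigmas = torch.tensor([1e-4, 20.0])

def total_noise(t):
    return sigmas[0] ** (1 - t) * sigmas[1] ** t

print(total_noise(torch.tensor(0.0)).item())  # 1e-4 (sigma_min)
print(total_noise(torch.tensor(1.0)).item())  # 20.0 (sigma_max)
print(total_noise(torch.tensor(0.5)).item())  # sqrt(1e-4 * 20) ~= 0.0447
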
@@ -276,6 +277,10 @@ def sample_categorical(probs: torch.Tensor) -> torch.Tensor:
 
 print("Setting up model and device...")
 DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+print(f"===================================")
+print(f"Using device: {DEVICE}")
+print(f"===================================")
+
 model_args = dict(n_layer=6, n_head=6, n_embd=384, cond_dim=64,
                   bias=False, vocab_size=vocab_size, block_size=CONTEXT_LENGTH, dropout=0.2)
 config = GPTConfig(**model_args)

@@ -295,19 +300,15 @@ NOISE = GeometricNoise(sigma_min=1e-4, sigma_max=20)
 print("Model setup complete. Launching Gradio demo...")
 
 # --- Gradio Generation Function ---
-
 @spaces.GPU
 def generate_text(steps):
-    """Generator function that yields denoised text at each step."""
     steps = int(steps)
     eps = 1e-5
     timesteps = torch.linspace(1, eps, steps + 1, device=DEVICE)
     step_size = (1 - eps) / steps
 
-    # Start with a fresh random sample
     x = torch.randint(0, vocab_size, (1, CONTEXT_LENGTH), device=DEVICE)
 
-    # Initial random text
     initial_text = decode(x)
     yield f"Step 0/{steps} (Initial Noise):\n\n{wrap_text(initial_text)}"
     time.sleep(0.5)

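Two notes on this hunk: on a ZeroGPU Space (the "Running on Zero" hardware above), the @spaces.GPU decorator attaches a GPU only for the duration of each generate_text call, which is why import spaces now sits with the other third-party imports. And the schedule walks t from 1 (pure noise) down to eps in uniform steps; for example:

import torch

steps = 4
eps = 1e-5
timesteps = torch.linspace(1, eps, steps + 1)
step_size = (1 - eps) / steps

print(timesteps)   # ~[1.0, 0.75, 0.5, 0.25, 1e-5]
print(step_size)   # (1 - eps) / 4 ~= 0.25, the gap between consecutive timesteps
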
@@ -317,9 +318,10 @@ def generate_text(steps):
         progress(i / steps, desc=f"Denoising Step {i+1}/{steps}")
 
         t = timesteps[i] * torch.ones(x.shape[0], 1, device=DEVICE)
-        curr_sigma_bar = NOISE(t)
+        curr_sigma_bar, _ = NOISE(t)
 
-
+        next_t = t - step_size
+        next_sigma_bar, _ = NOISE(next_t)
         delta_sigma = curr_sigma_bar - next_sigma_bar
 
         log_score = model(x, curr_sigma_bar)

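This hunk fixes a subtle bug: the old __call__ ended in a trailing comma, so NOISE(t) returned a 1-tuple that the old code bound directly to curr_sigma_bar, and tensor arithmetic on that tuple fails. The new lines unpack the (total, None) pair and compute next_sigma_bar explicitly from next_t = t - step_size. A minimal reproduction of the trailing-comma pitfall:

def noise(t):
    return t * 2,          # trailing comma: this returns a 1-tuple, not a number

curr = noise(0.5)          # (1.0,) -- what the old code bound to curr_sigma_bar
try:
    curr - 0.1
except TypeError as e:
    print(e)               # unsupported operand type(s) for -: 'tuple' and 'float'

curr, _ = noise(0.5) + (None,)[:0] or noise(0.5) if False else (noise(0.5)[0], None)
curr, _ = (0.5 * 2, None)  # the new unpacking pattern: curr_sigma_bar, _ = NOISE(t)
print(curr - 0.1)          # 0.9
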
@@ -329,11 +331,9 @@ def generate_text(steps):
         probs = stag_score * transition(x, delta_sigma)
         x = sample_categorical(probs)
 
-        # Yield the decoded text and step info
         decoded_text = decode(x)
         yield f"Step {i+1}/{steps}:\n\n{wrap_text(decoded_text)}"
 
-    # Final result
     final_text = decode(x)
     yield f"Final Result (Step {steps}/{steps}):\n\n{wrap_text(final_text)}"
 

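For reference, sample_categorical (defined earlier in the file; its body is not part of this diff) draws one token per position from the transition probabilities. A common implementation is the Gumbel-max trick, shown here only as a plausible sketch of such a function, not as the file's actual code:

import torch

def sample_categorical(probs: torch.Tensor) -> torch.Tensor:
    # Gumbel-max: argmax over log p + Gumbel noise is an exact categorical sample,
    # and it works with unnormalized probabilities as well.
    gumbel = -torch.log(-torch.log(torch.rand_like(probs) + 1e-10) + 1e-10)
    return torch.argmax(torch.log(probs + 1e-10) + gumbel, dim=-1)

probs = torch.tensor([[[0.1, 0.7, 0.2]]])   # (batch, seq, vocab)
print(sample_categorical(probs))            # usually tensor([[1]])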