Spaces: Runtime error

Update app.py

app.py (CHANGED)

@@ -4,519 +4,335 @@ import numpy as np
import gradio as gr
import spaces
import torch.nn.functional as F
-from transformers import AutoTokenizer, AutoModel
-from transformers.generation.configuration_utils import GenerationConfig
import time
-import
-import torch.distributions as dists # Import dists for sampling logic

-#
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

-#
model_path = "Dream-org/Dream-v0-Instruct-7B"
-
-
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-model = AutoModel.from_pretrained(model_path, torch_dtype=torch.bfloat16, trust_remote_code=True)
-model = model.to(device).eval()
-print("Model and Tokenizer loaded.")

-
-
-
-
-

# --- Helper Functions ---

-def parse_constraints(constraints_text):
    """Parse constraints in format: 'position:word, position:word, ...'"""
    constraints = {}
    if not constraints_text:
-        return constraints

    parts = constraints_text.split(',')
    for part in parts:
        if ':' not in part:
            continue
        try:
-            pos_str, word = part.split(':', 1)
            pos = int(pos_str.strip())
-            # Use strip() and lower() for robustness if needed, but preserve case for now
            word = word.strip()
            if word and pos >= 0:
-                #
-
                prefix = " " if pos > 0 else ""
                tokens = tokenizer.encode(prefix + word, add_special_tokens=False)
                for i, token_id in enumerate(tokens):
-                    #
-
-
-                    constraints[pos + i] = token_id
        except ValueError:
            continue
        except Exception as e:
-
-
-
-    return constraints


def format_chat_history(history):
    """
-    Format chat history for the Dream model

    Args:
        history: List of [user_message, assistant_message] pairs

    Returns:
-        Formatted
    """
    messages = []
-    #
-    if history
-
-
-
-
-
-
-            messages.append({"role": "user", "content": user_msg})
-        if assistant_msg is not None: # Skip if None (for the latest user message)
            messages.append({"role": "assistant", "content": assistant_msg})

    return messages

-# --- Core Generation Logic

-
    """
-
-    Returns confidence and chosen token ID.
    """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    # Calculate probabilities
-    probs = torch.softmax(logits, dim=-1)
-
-    # Sample or Argmax
-    if temperature > 0:
-        # Use torch distributions for robust sampling
-        dist = dists.Categorical(probs=probs)
-        x0 = dist.sample()
-        # Gather confidence for the sampled token
-        confidence = torch.gather(probs, -1, x0.unsqueeze(-1)).squeeze(-1)
-    else:
-        # Argmax for deterministic generation
-        confidence, x0 = torch.max(probs, dim=-1)
-
-    # --- Calculate specific confidence metrics if requested ---
-    # Note: These modify the 'confidence' variable *after* sampling x0
-    if margin_confidence:
-        if probs.shape[-1] >= 2:
-            # Ensure logits weren't completely masked, handle edge cases
-            if not torch.isinf(logits).all(dim=-1).any():
-                # Sort probabilities to get top1 and top2
-                sorted_probs, _ = torch.sort(probs, dim=-1, descending=True)
-                top1_probs = sorted_probs[..., 0]
-                top2_probs = sorted_probs[..., 1]
-                confidence = top1_probs - top2_probs
-            else:
-                # Fallback if all logits are -inf (shouldn't normally happen)
-                confidence.fill_(0.0) # Or some other indicator
-        else:
-            # Only one possible token, margin is undefined or 1? Set to top1 prob.
-            confidence, _ = torch.max(probs, dim=-1)
-
-    elif neg_entropy:
-        epsilon = 1e-9 # Slightly smaller epsilon
-        log_probs = torch.log(probs + epsilon)
-        # Negative entropy is sum(p * log(p))
-        confidence = torch.sum(probs * log_probs, dim=-1) # Lower value (more negative) is higher confidence
-
-    return confidence, x0


-
-
-
-
-
-
-
-
-
-
-

-
-
-
-        steps: Number of diffusion steps
-        constraints: Dictionary mapping positions to *token IDs*
-        temperature: Sampling temperature
-        top_p: Nucleus sampling probability
-        top_k: Top-k sampling
-        alg: Remasking strategy ('origin', 'maskgit_plus', 'topk_margin', 'entropy')
-        alg_temp: Temperature for confidence-based remasking randomness
-        yield_intermediate: Whether to yield intermediate states for visualization

-
-
-
-
-    constraints = {} # keys are positions relative to start of response
-
-    # --- Prepare Input ---
-    chat_input_text = tokenizer.apply_chat_template(
-        messages, add_generation_prompt=True, tokenize=False
-    )
-    input_ids = tokenizer(chat_input_text, return_tensors="pt")['input_ids'].to(device)
-    prompt_length = input_ids.shape[1]
-    max_length = prompt_length + gen_length
-
-    # Clamp max_length if it exceeds model capacity (use config value if available)
-    model_max_len = getattr(config, 'max_position_embeddings', 2048) # Default fallback
-    if max_length > model_max_len:
-        print(f"Warning: Requested length ({max_length}) exceeds model max ({model_max_len}). Clamping.")
-        max_length = model_max_len
-        gen_length = max_length - prompt_length
-        if gen_length <= 0:
-            print("Warning: Prompt is already at or exceeding model max length. Cannot generate.")
-            if yield_intermediate:
-                yield [], "Error: Prompt too long."
-                return
-            else:
-                return [], "Error: Prompt too long."


-
-
-

-
-
-
-
-
-
-
-
-                    print(f"Warning: Skipped constraint for special token ID {token_id} at pos {rel_pos}")


-    # --- Visualization Setup ---
-    visualization_states = []
-    revealed_eos_pad = set() # Track positions where EOS/PAD was shown once
-
-    def get_vis_state(current_x, old_x, step_confidences=None):
-        nonlocal revealed_eos_pad
-        state = []
-        newly_revealed_in_step = False # Flag if any token changed from MASK
-        current_revealed_eos_pad = set() # Track EOS/PAD revealed *in this step*
-
-        for i in range(gen_length):
-            abs_pos = prompt_length + i
-            current_token_id = current_x[0, abs_pos].item()
-            old_token_id = old_x[0, abs_pos].item()
-
-            is_eos_or_pad = (current_token_id == EOS_ID or current_token_id == PAD_ID)
-
-            # Handle EOS/PAD hiding: Show once, then hide
-            if is_eos_or_pad and abs_pos in revealed_eos_pad:
-                state.append(("", "#FFFFFF")) # Make it invisible (white on white/transparent)
-                continue # Skip rest of logic for this pos
-
-            token_str = tokenizer.decode([current_token_id], skip_special_tokens=False) # Decode even specials initially
-
-            if current_token_id == MASK_ID:
-                color = "#444444" # Dark Gray for Mask
-                token_str = MASK_TOKEN # Display mask token string
-            elif old_token_id == MASK_ID: # Newly revealed in this step
-                newly_revealed_in_step = True
-                confidence = step_confidences.get(abs_pos, 0.5) # Get confidence if available, default 0.5
-
-                # Color based on confidence (adjust thresholds as needed)
-                # Note: Entropy confidence is negative, more negative = higher confidence
-                if alg == 'entropy':
-                    # Example thresholds for negative entropy (adjust based on observation)
-                    if confidence > -1.0: # Low confidence (high entropy)
-                        color = "#FF6666" # Light Red
-                    elif confidence > -3.0: # Medium confidence
-                        color = "#FFAA33" # Orange
-                    else: # High confidence (low entropy)
-                        color = "#66CC66" # Light Green
-                else: # Standard confidence (probability or margin)
-                    if confidence < 0.3:
-                        color = "#FF6666" # Light Red
-                    elif confidence < 0.7:
-                        color = "#FFAA33" # Orange
-                    else:
-                        color = "#66CC66" # Light Green
-
-                # If it's EOS/PAD revealed now, mark for future hiding
-                if is_eos_or_pad:
-                    current_revealed_eos_pad.add(abs_pos)
-
-            else: # Previously revealed
-                color = "#6699CC" # Light Blue
-
-            # Clean up token string for display (optional)
-            # token_str = token_str.replace(" ", " ") # Keep spaces visible
-
-            state.append((token_str, color))
-
-        # Update the global set of revealed EOS/PAD positions
-        revealed_eos_pad.update(current_revealed_eos_pad)
-
-        return state, newly_revealed_in_step
-
-    # Add initial state (all masked, constraints applied)
-    initial_vis_state, _ = get_vis_state(x, x) # Pass x as old_x initially
-    visualization_states.append(initial_vis_state)
-    if yield_intermediate:
-        yield initial_vis_state # Yield the starting state
-
-    # --- Diffusion Loop ---
-    timesteps = torch.linspace(1.0, 1e-3, steps + 1, device=device) # Use epsilon from Dream's defaults if needed
-
-    # Store the state before the loop starts
-    old_x = x.clone()
-
-    for i in range(steps):
-        # --- Core Dream Step ---
-        mask_index = (x == MASK_ID)
-        if not mask_index.any(): # Stop if no masks left
-            print(f"No masks left at step {i}. Stopping generation.")
-            break
-
-        # Prepare attention mask (full attention for Dream unless specified otherwise)
-        # Dream's modeling code handles standard causal masking internally based on position_ids
-        # For diffusion, we typically allow attending to everything (masked or not)
-        # The `model` forward pass expects a standard causal mask or None
-        # Let's use None, assuming the model handles positions correctly
-        attention_mask = None # Or potentially create a full mask: torch.ones_like(x)
-
-        # Create position_ids (simple range for now)
-        position_ids = torch.arange(0, x.shape[1], device=device).unsqueeze(0)
-
-        # Model forward pass
-        outputs = model(input_ids=x, attention_mask=attention_mask, position_ids=position_ids)
-        logits = outputs.logits
-        # logits = torch.cat([logits[:,:1], logits[:, :-1]], dim=1) # Dream applies shift in utils, replicate if needed
-
-        # Select logits for masked positions ONLY
-        # Need to handle batch dimension (which is 1 here)
-        current_mask_indices_flat = torch.where(mask_index.flatten())[0]
-        if len(current_mask_indices_flat) == 0:
-            print(f"No mask indices found flat at step {i}. Stopping generation.")
-            break
-
-        # Use advanced indexing to get logits for masked positions
-        # Logits shape: [batch_size, seq_len, vocab_size]
-        # Mask_index shape: [batch_size, seq_len]
-        # We need logits corresponding to True values in mask_index
-        # Example: batch_idx = torch.where(mask_index)[0], seq_idx = torch.where(mask_index)[1]
-        # mask_logits = logits[batch_idx, seq_idx]
-        batch_indices, seq_indices = torch.where(mask_index)
-        mask_logits = logits[batch_indices, seq_indices] # Shape: [num_masked_tokens, vocab_size]
-
-        if mask_logits.numel() == 0: # Double check after indexing
-            print(f"No mask logits selected at step {i}. Stopping generation.")
-            break
-
-        t = timesteps[i]
-        s = timesteps[i + 1]
-
-        # --- Remasking Logic (Simplified from Dream's _sample) ---
-        step_confidences = {} # Store confidences for revealed tokens in this step {abs_pos: confidence}
-
-        if alg == 'origin':
-            p_transfer = (1.0 - s / t) if i < steps - 1 else 1.0
-            # Sample for all masked positions
-            confidence, x0_masked = sample_tokens_for_vis(mask_logits, temperature=temperature, top_p=top_p, top_k=top_k)
-            # Decide which ones to transfer based on random probability
-            transfer_mask = torch.rand(x0_masked.shape, device=device) < p_transfer
-            # Create a tensor of MASK_IDs, and fill in the transferred tokens
-            updates_for_masked_pos = torch.full_like(x0_masked, MASK_ID)
-            updates_for_masked_pos[transfer_mask] = x0_masked[transfer_mask]
-            # Update x at the masked positions
-            x[mask_index] = updates_for_masked_pos
-
-            # Store confidences for the *transferred* tokens for visualization
-            transferred_indices_flat = current_mask_indices_flat[transfer_mask]
-            transferred_confidences = confidence[transfer_mask]
-            for flat_idx, conf in zip(transferred_indices_flat, transferred_confidences):
-                abs_pos = flat_idx.item() # Convert flat index back to seq position (assuming batch=1)
-                step_confidences[abs_pos] = conf.item()
-
-
-        else: # Confidence-based algorithms ('maskgit_plus', 'topk_margin', 'entropy')
-            use_margin = (alg == 'topk_margin')
-            use_entropy = (alg == 'entropy')
-            # Sample potential replacements for ALL masked positions first
-            confidence, x0_masked = sample_tokens_for_vis(
-                mask_logits,
-                temperature=temperature,
-                top_p=top_p,
-                top_k=top_k,
-                margin_confidence=use_margin,
-                neg_entropy=use_entropy
-            )

-
-            # Calculate how many tokens to unmask/transfer in this step
-            num_transfer_tokens = int(num_mask_tokens * (1.0 - s / t)) if i < steps - 1 else num_mask_tokens
-
-            if num_transfer_tokens > 0 and confidence.numel() > 0:
-                transfer_indices_relative = None # Indices relative to the masked tokens
-                if alg_temp is None or alg_temp <= 0:
-                    # Deterministic: Select top-k confidence scores among masked tokens
-                    # Ensure k is not larger than the number of masked tokens
-                    k = min(num_transfer_tokens, confidence.shape[0])
-                    if k > 0:
-                        _, transfer_indices_relative = torch.topk(confidence, k)
-                else:
-                    # Stochastic: Sample based on confidence scores
-                    # Ensure probabilities are valid
-                    conf_probs = F.softmax(confidence / alg_temp, dim=-1)
-                    if not torch.isnan(conf_probs).any() and not torch.isinf(conf_probs).any() and conf_probs.sum() > 1e-6:
-                        # Ensure k is not larger than the number of masked tokens
-                        k = min(num_transfer_tokens, confidence.shape[0])
-                        if k > 0:
-                            transfer_indices_relative = torch.multinomial(conf_probs, num_samples=k, replacement=False)
-                    else:
-                        print(f"Warning: Invalid confidence probabilities at step {i}. Falling back to top-k.")
-                        # Fallback to deterministic if sampling fails
-                        k = min(num_transfer_tokens, confidence.shape[0])
-                        if k > 0:
-                            _, transfer_indices_relative = torch.topk(confidence, k)
-
-
-                if transfer_indices_relative is not None and transfer_indices_relative.numel() > 0:
-                    # Create updates, initially all MASK_ID
-                    updates_for_masked_pos = torch.full_like(x0_masked, MASK_ID)
-                    # Place the selected sampled tokens into the updates tensor
-                    updates_for_masked_pos[transfer_indices_relative] = x0_masked[transfer_indices_relative]
-                    # Update x at the original masked positions
-                    x[mask_index] = updates_for_masked_pos
-
-                    # Store confidences for the *transferred* tokens for visualization
-                    selected_confidences = confidence[transfer_indices_relative]
-                    # Get the absolute positions corresponding to these relative indices
-                    original_indices_flat = current_mask_indices_flat[transfer_indices_relative]
-                    for flat_idx, conf in zip(original_indices_flat, selected_confidences):
-                        abs_pos = flat_idx.item()
-                        step_confidences[abs_pos] = conf.item()

-
-                # No tokens were selected to transfer, x remains unchanged for masked parts
-                    pass # x[mask_index] remains MASK_ID essentially

-        else:
-            # If num_transfer_tokens is 0, x remains unchanged for masked parts
-            pass
-
-        # --- Apply Constraints and Finalize Step ---
-        # Ensure constraints are always maintained AFTER updates
-        for rel_pos, token_id in constraints.items():
-            abs_pos = prompt_length + rel_pos
-            if abs_pos < max_length:
-                # Check if the position was masked before applying constraint
-                # if mask_index[0, abs_pos]: # Only apply if it *was* a mask, maybe? Optional.
-                x[:, abs_pos] = token_id
-
-        # --- Visualization Update ---
-        current_vis_state, newly_revealed = get_vis_state(x, old_x, step_confidences)
-
-        # Only add/yield if something actually changed or if it's the last step
-        if newly_revealed or i == steps - 1:
-            visualization_states.append(current_vis_state)
-            if yield_intermediate:
-                yield current_vis_state
-
-        # Update old_x for the next iteration
-        old_x = x.clone()
-
-
-    # --- Final Output ---
-    response_tokens = x[0, prompt_length:]
-    # Decode, cleaning up potential special tokens unless they are intended
-    final_text = tokenizer.decode(response_tokens,
-                                  skip_special_tokens=True, # Skip things like <|mask|> in final output
-                                  clean_up_tokenization_spaces=True)
-
-    # If not yielding intermediates, return the full list now
-    if not yield_intermediate:
-        return visualization_states, final_text
-    else:
-        # If yielding intermediates, we still need a way to signal completion
-        # and return the final text. Gradio's yield typically handles this if
-        # the last yielded value is the final one. We'll return the final text
-        # separately after the loop finishes in the calling function.
-        # The loop yields states, the calling function returns the final text.
-        pass # Final text is handled outside the generator function
-
-
-# --- Gradio UI ---
css = '''
.category-legend{display:none}
button{height: 60px}
-.token-
-
-.token-new-high { background-color: #66CC66; color: black; padding: 1px 2px; margin: 1px; border-radius: 3px; display: inline-block; }
-.token-new-mid { background-color: #FFAA33; color: black; padding: 1px 2px; margin: 1px; border-radius: 3px; display: inline-block; }
-.token-new-low { background-color: #FF6666; color: black; padding: 1px 2px; margin: 1px; border-radius: 3px; display: inline-block; }
-.token-old { background-color: #6699CC; color: white; padding: 1px 2px; margin: 1px; border-radius: 3px; display: inline-block; }
-.token-hidden { display: none; } /* Hide EOS/PAD after first reveal */
'''
-
def create_chatbot_demo():
    with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
        gr.Markdown("# Dream 7B - Diffusion Language Model Demo")
        gr.Markdown(
            "[[Model Card](https://huggingface.co/Dream-org/Dream-v0-Instruct-7B)] "
-            "[[Blog](https://hkunlp.github.io/blog/2025/dream/)] "
-            "
-        )
-        gr.Markdown(
-            "**Note:** This demo visualizes the diffusion process in real-time. "
-            "Tokens start masked (<font color='#444444'>[MASK]</font>) and are revealed step-by-step. "
-            "Colors indicate confidence: <font color='#66CC66'>High</font>, "
-            "<font color='#FFAA33'>Medium</font>, <font color='#FF6666'>Low</font>. "
-            "Previously revealed tokens are <font color='#6699CC'>blue</font>. "
-            f"EOS/PAD tokens ({tokenizer.decode([EOS_ID])}) are hidden after appearing once."
        )

        # STATE MANAGEMENT
        chat_history = gr.State([])
-

        # UI COMPONENTS
        with gr.Row():
            with gr.Column(scale=3):
-                chatbot_ui = gr.Chatbot(

                # Message input
                with gr.Group():

@@ -524,229 +340,219 @@ def create_chatbot_demo():
                    user_input = gr.Textbox(
                        label="Your Message",
                        placeholder="Type your message here...",
-
-
                    )
                    send_btn = gr.Button("Send", scale=1)

                constraints_input = gr.Textbox(
-                    label="Word Constraints (
-                    info="Place words at
-                    placeholder="0:
                    value=""
                )
            with gr.Column(scale=2):
-                # Use HighlightedText with specific classes for better styling control
                output_vis = gr.HighlightedText(
                    label="Denoising Process Visualization",
-
-                    #
-
-                    #
-                    # color_map={ # This might not work directly with dynamic classes, CSS is better
-                    #     "MASK": "#444444", "NEW_H": "#66CC66", "NEW_M": "#FFAA33",
-                    #     "NEW_L": "#FF6666", "OLD": "#6699CC", "HIDDEN": "#FFFFFF"
-                    # }
-                    combine_adjacent=False, # Keep tokens separate
-                    height=550, # Adjust height as needed
                )

-
        # Advanced generation settings
        with gr.Accordion("Generation Settings", open=False):
            with gr.Row():
                gen_length = gr.Slider(
-                    minimum=16, maximum=512, value=
                    label="Max New Tokens"
                )
                steps = gr.Slider(
-                    minimum=8, maximum=512, value=
-                    label="
                )
            with gr.Row():
                temperature = gr.Slider(
-                    minimum=0.0, maximum=1.
                    label="Temperature"
                )
                top_p = gr.Slider(
-                    minimum=0.
-                    label="Top-P
-                )
-                # top_k = gr.Slider(
-                #     minimum=0, maximum=200, value=0, step=5, # Allow Top-K=0 (disabled)
-                #     label="Top-K (0 to disable)"
-                # )
-            with gr.Row():
-                # Dream specific algorithm choice
-                alg_strategy = gr.Radio(
-                    choices=["entropy", "maskgit_plus", "topk_margin", "origin"],
-                    value="entropy",
-                    label="Algorithm (`alg`)"
                )
-
-                    minimum=0
-                    label="
                )
            with gr.Row():
                visualization_delay = gr.Slider(
-                    minimum=0.0, maximum=0.5, value=0.
                    label="Visualization Delay (seconds)"
                )

        # Clear button
        clear_btn = gr.Button("Clear Conversation")

-        # ---
-        def
-            """Add a message pair to the history
            history.append([message, response])
            return history

-        def
-            """
-            if not message
-                return history, history, "", []
-
-            # Add user message
-            history =
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            # Format history for the model (excluding the last None response)
-            messages = format_chat_history(history[:-1])
-            # Add the current user message
-            messages.append({"role": "user", "content": last_user_message})
-
-            # Parse constraints into token IDs
-            parsed_constraints = parse_constraints(constraints_str)
-            print(f"Parsed constraints: {parsed_constraints}")
-
-
-            final_text = "" # Initialize final_text
-
-            # Use the generator function
-            response_generator = generate_response_with_visualization_dream(
-                messages,
-                gen_length=gen_length,
-                steps=steps,
-                constraints=parsed_constraints,
-                temperature=temperature,
-                top_p=top_p if top_p > 0 else None, # Pass None if 0
-                top_k=None, # Pass None if 0 top_k if top_k > 0 else None,
-                alg=alg,
-                alg_temp=alg_temp if alg_temp > 0 else None, # Pass None if 0
-                yield_intermediate=True
-            )

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-


-            # Update the history with the actual final response
-            history[-1][1] = final_text.strip() if final_text else "[No response]"

-
-

-        except Exception as e:
-            import traceback
-            print(f"Error during generation: {e}")
-            traceback.print_exc()
-            error_msg = f"Error: {str(e)}"
-            history[-1][1] = error_msg # Show error in chat
-            # Show error in visualization (red text)
-            error_vis = [(error_msg, "#FF0000")]
-            yield history, error_vis, error_msg


-
-
-            return [], [], "", [] # History, Chatbot UI, Response Text, Visualization


-

-        # 1. User Submits Message (Textbox Enter or Button Click)
-        submit_triggers = [user_input.submit, send_btn.click]
-        for trigger in submit_triggers:
-            trigger.then(
-                fn=user_message_action,
-                inputs=[user_input, chat_history],
-                outputs=[chat_history, chatbot_ui, user_input, output_vis, current_response_text], # Update history state, chatbot UI, clear input, clear vis, clear response state
-                queue=True # Enable queue for handling multiple users
-            ).then(
-                # 2. Trigger Bot Response Generation (Generator Function)
-                fn=bot_response_generator,
-                inputs=[
-                    chat_history, gen_length, steps, constraints_input, visualization_delay,
-                    temperature, top_p, # top_k,
-                    alg_strategy, alg_temp
-                ],
-                outputs=[chatbot_ui, output_vis, current_response_text] # Stream updates to chatbot, visualization, and store final text
-            )

-
        clear_btn.click(
-            fn=
            inputs=[],
-            outputs=[chat_history, chatbot_ui,
-
        )

    return demo

-# --- Launch ---
if __name__ == "__main__":
-    # Make sure the necessary Dream model files (modeling_dream.py, configuration_dream.py etc.)
-    # are in the same directory or accessible in the Python path.
-    # Also ensure 'generation_utils.py' is available if needed by the model loading/config.
-    # Check if 'modeling_dream.py' exists before launching
-    import os
-    if not os.path.exists("modeling_dream.py") or not os.path.exists("configuration_dream.py"):
-        print("\nERROR: Could not find 'modeling_dream.py' and/or 'configuration_dream.py'.")
-        print("Please make sure these files (from the 'dream_model.txt' source) are in the same directory as this script.")
-        print("You might need to extract them from the provided text file.")
-        # exit() # Optional: stop execution if files are missing
-
-    print("Creating Gradio Demo...")
    demo = create_chatbot_demo()
-
-
-    demo.queue().launch(share=True, debug=True) # Enable debug for more detailed logs
import gradio as gr
import spaces
import torch.nn.functional as F
+from transformers import AutoTokenizer, AutoModel, AutoConfig
import time
+import copy

+# Determine device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

+# --- Model and Tokenizer Loading ---
model_path = "Dream-org/Dream-v0-Instruct-7B"
+
+print(f"Loading tokenizer from {model_path}...")
+# Load configuration first to get special token IDs
+config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

+print(f"Loading model from {model_path}...")
+model = AutoModel.from_pretrained(
+    model_path,
+    torch_dtype=torch.bfloat16,
+    trust_remote_code=True
+).to(device).eval()
+print("Model loaded successfully.")
+
+# --- Constants from Dream Model ---
+# Get IDs directly from config or tokenizer if available
+MASK_TOKEN = tokenizer.mask_token
+MASK_ID = config.mask_token_id if hasattr(config, 'mask_token_id') else tokenizer.mask_token_id
+EOS_ID = config.eos_token_id if hasattr(config, 'eos_token_id') else tokenizer.eos_token_id
+PAD_ID = config.pad_token_id if hasattr(config, 'pad_token_id') else tokenizer.pad_token_id # Often same as EOS
+
+print(f"MASK_TOKEN: '{MASK_TOKEN}', MASK_ID: {MASK_ID}")
+print(f"EOS_ID: {EOS_ID}, PAD_ID: {PAD_ID}")
+if MASK_ID is None:
+    raise ValueError("Could not determine MASK_ID from model config or tokenizer.")
+if EOS_ID is None:
+    raise ValueError("Could not determine EOS_ID from model config or tokenizer.")
+if PAD_ID is None:
+    raise ValueError("Could not determine PAD_ID from model config or tokenizer.")
+

# --- Helper Functions ---

+def parse_constraints(constraints_text, tokenizer):
    """Parse constraints in format: 'position:word, position:word, ...'"""
    constraints = {}
+    processed_constraints_tokens = {}
    if not constraints_text:
+        return constraints, processed_constraints_tokens

    parts = constraints_text.split(',')
    for part in parts:
        if ':' not in part:
            continue
+        pos_str, word = part.split(':', 1)
        try:
            pos = int(pos_str.strip())
            word = word.strip()
            if word and pos >= 0:
+                # Store original word constraint for display/debugging if needed
+                constraints[pos] = word
+                # Tokenize the word (add space for consistency if not BOS)
+                # Note: Dream tokenizer might handle spaces differently, adjust if needed
                prefix = " " if pos > 0 else ""
                tokens = tokenizer.encode(prefix + word, add_special_tokens=False)
                for i, token_id in enumerate(tokens):
+                    # Prevent overwriting multi-token constraints partially
+                    if pos + i not in processed_constraints_tokens:
+                        processed_constraints_tokens[pos + i] = token_id
        except ValueError:
            continue
        except Exception as e:
+            print(f"Error tokenizing constraint word '{word}': {e}")
+            continue

+    # Sort by position for consistent application
+    processed_constraints_tokens = dict(sorted(processed_constraints_tokens.items()))
+    print(f"Parsed Constraints (Word): {constraints}")
+    print(f"Parsed Constraints (Tokens): {processed_constraints_tokens}")
+    return constraints, processed_constraints_tokens

def format_chat_history(history):
    """
+    Format chat history for the Dream model using its chat template convention.

    Args:
        history: List of [user_message, assistant_message] pairs

    Returns:
+        Formatted list of message dictionaries for the model
    """
    messages = []
+    # Add system prompt if not present (standard practice)
+    if not history or history[0][0] is None or history[0][0].lower() != "system":
+        messages.append({"role": "system", "content": "You are a helpful assistant."})
+
+    for user_msg, assistant_msg in history:
+        if user_msg is not None: # Handle potential system message case
+            messages.append({"role": "user", "content": user_msg})
+        if assistant_msg: # Skip if None (for the latest user message)
            messages.append({"role": "assistant", "content": assistant_msg})

    return messages

+# --- Core Generation Logic with Visualization Hook ---

+@spaces.GPU
+def generate_response_with_visualization(
+    messages, # List of message dictionaries
+    gen_length=64,
+    steps=64,
+    constraints_text="", # Raw constraint text
+    temperature=0.2,
+    top_p=0.95,
+    top_k=None, # Added for Dream
+    alg="entropy", # Changed from remasking
+    alg_temp=0.0, # Added for Dream
+    visualization_delay=0.05,
+    tokenizer=tokenizer,
+    model=model,
+    device=device,
+    MASK_ID=MASK_ID,
+    EOS_ID=EOS_ID,
+    PAD_ID=PAD_ID
+):
    """
+    Generate text with the Dream model, with real-time visualization via a hook.
    """
+    visualization_states = []
+    final_text = ""
+    # Use a dict to hold previous_x so the nested hook can modify it
+    # Initialize with None; it will be set after the first hook call
+    shared_state = {'previous_x': None}
+
+
+    try:
+        # --- 1. Prepare Inputs ---
+        _, parsed_constraints_tokens = parse_constraints(constraints_text, tokenizer)
+
+        # Apply chat template
+        # Important: Keep tokenize=False initially to get prompt length correctly
+        # The template adds roles and special tokens like <|im_start|> etc.
+        chat_input_text = tokenizer.apply_chat_template(
+            messages,
+            add_generation_prompt=True, # Adds the prompt for the assistant's turn
+            tokenize=False
+        )

+        # Tokenize the full templated chat string
+        inputs = tokenizer(chat_input_text, return_tensors="pt", return_dict=True)
+        input_ids = inputs.input_ids.to(device)
+        attention_mask = inputs.attention_mask.to(device) # Use mask from tokenizer
+
+        prompt_length = input_ids.shape[1]
+        total_length = prompt_length + gen_length
+
+        # --- 2. Initialize Generation Sequence ---
+        # Start with the prompt, pad the rest with MASK_ID
+        x = torch.full((1, total_length), MASK_ID, dtype=torch.long, device=device)
+        x[:, :prompt_length] = input_ids.clone()
+        attention_mask = F.pad(attention_mask, (0, gen_length), value=1) # Extend attention mask
+
+        # Apply initial constraints to the masked sequence `x`
+        for pos, token_id in parsed_constraints_tokens.items():
+            absolute_pos = prompt_length + pos
+            if absolute_pos < total_length:
+                print(f"Applying initial constraint at pos {absolute_pos}: token {token_id}")
+                x[:, absolute_pos] = token_id
+
+        # Store initial state (prompt + all masked) for visualization
+        initial_state_vis = []
+        # Add prompt tokens (optional visualization, could be grayed out or skipped)
+        # for i in range(prompt_length):
+        #     token_str = tokenizer.decode([x[0, i].item()], skip_special_tokens=True)
+        #     initial_state_vis.append((token_str if token_str else " ", "#AAAAAA")) # Gray for prompt
+
+        # Add masked tokens for the generation part
+        for _ in range(gen_length):
+            initial_state_vis.append((MASK_TOKEN, "#444444")) # Dark gray for masks
+        visualization_states.append(initial_state_vis)
+        shared_state['previous_x'] = x.clone() # Initialize previous_x
+
+
+        # --- 3. Define the Visualization Hook ---
+        def generation_tokens_hook_func(step, current_x_hook, logits):
+            # nonlocal previous_x # Allow modification of the outer scope variable
+            current_x_hook = current_x_hook.clone() # Work on a copy
+
+            # --- Apply constraints within the hook ---
+            # This ensures constraints are respected even if the model tries to overwrite them
+            for pos, token_id in parsed_constraints_tokens.items():
+                absolute_pos = prompt_length + pos
+                if absolute_pos < total_length:
+                    current_x_hook[:, absolute_pos] = token_id
+            # --- End Constraint Application ---
+
+            if shared_state['previous_x'] is None: # First call
+                shared_state['previous_x'] = current_x_hook.clone()
+                return current_x_hook # Must return the (potentially modified) sequence
+
+            # Generate visualization state for this step
+            current_state_vis = []
+            prev_x_step = shared_state['previous_x']
+
+            for i in range(gen_length):
+                pos = prompt_length + i # Absolute position in the sequence
+                current_token_id = current_x_hook[0, pos].item()
+                prev_token_id = prev_x_step[0, pos].item()
+
+                # Decode token, handling special tokens we want to hide
+                token_str = ""
+                color = "#444444" # Default: Dark Gray (Mask)
+                token_str_raw = tokenizer.decode([current_token_id], skip_special_tokens=False) # Keep special tokens for ID check
+
+                if current_token_id == MASK_ID:
+                    token_str = MASK_TOKEN
+                    color = "#444444" # Dark gray
+                elif current_token_id == EOS_ID or current_token_id == PAD_ID:
+                    token_str = "" # Hide EOS/PAD visually
+                    color = "#DDDDDD" # Use a light gray or make transparent if possible
+                else:
+                    # Decode without special tokens for display if it's not MASK/EOS/PAD
+                    token_str = tokenizer.decode([current_token_id], skip_special_tokens=True).strip()
+                    if not token_str: token_str = token_str_raw # Fallback if strip removes everything (e.g., space)

+                if prev_token_id == MASK_ID:
+                    # Newly revealed in this step
+                    color = "#66CC66" # Light green (Simplified from confidence levels)
+                else:
+                    # Previously revealed
+                    color = "#6699CC" # Light blue
+
+                current_state_vis.append((token_str if token_str else " ", color)) # Ensure non-empty tuple element
+
+            visualization_states.append(current_state_vis)
+            shared_state['previous_x'] = current_x_hook.clone() # Update previous_x for the next step
+
+            return current_x_hook # Return the sequence (constraints applied)
+
+        # --- 4. Run Diffusion Generation ---
+        print("Starting diffusion generation...")
+        start_time = time.time()
+        output = model.diffusion_generate(
+            input_ids=x[:, :prompt_length], # Pass only the initial prompt to diffusion_generate
+            # as it handles the masking internally based on MASK_ID
+            attention_mask=attention_mask, # Provide the full attention mask
+            max_new_tokens=gen_length,
+            output_history=False, # We capture history via the hook
+            return_dict_in_generate=True,
+            steps=steps,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            alg=alg,
+            alg_temp=alg_temp if alg != 'origin' else None, # alg_temp only for confidence-based
+            # Pass the hook function
+            generation_tokens_hook_func=generation_tokens_hook_func,
+            # Ensure the initial masked sequence `x` is used correctly if needed by internal logic
+            # Depending on the exact implementation of diffusion_generate, passing x directly might be needed
+            # Check Dream's generation_utils if issues arise. For now, assume it uses input_ids + max_new_tokens
+        )
+        end_time = time.time()
+        print(f"Diffusion generation finished in {end_time - start_time:.2f} seconds.")

+        # --- 5. Process Final Output ---
+        # The hook has already built visualization_states
+        final_sequence = output.sequences[0]

+        # Decode the generated part, skipping special tokens for the final text output
+        response_tokens = final_sequence[prompt_length:]
+        # Filter out PAD tokens before final decode, keep EOS if needed conceptually, but skip for clean text
+        response_tokens_cleaned = [tok for tok in response_tokens if tok != PAD_ID] # Keep EOS initially if needed

+        final_text = tokenizer.decode(
+            response_tokens_cleaned,
+            skip_special_tokens=True, # Skip EOS, BOS, etc.
+            clean_up_tokenization_spaces=True # Recommended for cleaner output
+        ).strip()

+        # Ensure the last state in visualization matches the final text (debug check)
+        # print(f"Last Vis State Tokens: {''.join([t[0] for t in visualization_states[-1]]).strip()}")
+        # print(f"Final Decoded Text: {final_text}")

+    except Exception as e:
+        print(f"Error during generation: {e}")
+        import traceback
+        traceback.print_exc()
+        # Add error message to visualization
+        error_msg = f"Error: {str(e)}"
+        visualization_states.append([(error_msg, "red")])
+        final_text = error_msg # Display error in the chatbot too

+    # Make sure at least the initial state is present
+    if not visualization_states:
+        visualization_states.append([("Error: No states generated.", "red")])


+    return visualization_states, final_text

+# --- Gradio UI Definition ---

css = '''
.category-legend{display:none}
button{height: 60px}
+.token-text { white-space: pre; } /* Preserve spaces in tokens */
+footer { display: none !important; visibility: hidden !important; }
'''
def create_chatbot_demo():
    with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
        gr.Markdown("# Dream 7B - Diffusion Language Model Demo")
        gr.Markdown(
            "[[Model Card](https://huggingface.co/Dream-org/Dream-v0-Instruct-7B)] "
+            "[[Blog Post](https://hkunlp.github.io/blog/2025/dream/)] "
+            "(Note: Visualization shows token reveal steps, colors indicate status: Gray=Masked, Green=Newly Revealed, Blue=Previously Revealed)"
        )

        # STATE MANAGEMENT
        chat_history = gr.State([])
+        # Store constraints parsed into token IDs
+        parsed_constraints_state = gr.State({})

        # UI COMPONENTS
        with gr.Row():
            with gr.Column(scale=3):
+                chatbot_ui = gr.Chatbot(
+                    label="Conversation",
+                    height=500,
+                    bubble_full_width=False # Makes bubbles wrap content
+                )

                # Message input
                with gr.Group():
                    user_input = gr.Textbox(
                        label="Your Message",
                        placeholder="Type your message here...",
+                        show_label=False,
+                        scale=7
                    )
                    send_btn = gr.Button("Send", scale=1)

                constraints_input = gr.Textbox(
+                    label="Word Constraints (Experimental)",
+                    info="Place specific words at positions (0-indexed). Format: 'pos:word, pos:word'. Example: '0:Once, 5:upon, 10:time'. Multi-token words supported.",
+                    placeholder="0:The, 10:story",
                    value=""
                )
            with gr.Column(scale=2):
                output_vis = gr.HighlightedText(
                    label="Denoising Process Visualization",
+                    combine_adjacent=False,
+                    show_legend=False, # Legend not very informative here
+                    height=560, # Match chatbot height + input box approx
+                    elem_classes=["token-text"] # Apply custom class for styling if needed
                )

        # Advanced generation settings
        with gr.Accordion("Generation Settings", open=False):
            with gr.Row():
                gen_length = gr.Slider(
+                    minimum=16, maximum=512, value=128, step=8,
                    label="Max New Tokens"
                )
                steps = gr.Slider(
+                    minimum=8, maximum=512, value=128, step=4,
+                    label="Denoising Steps"
                )
            with gr.Row():
                temperature = gr.Slider(
+                    minimum=0.0, maximum=1.0, value=0.2, step=0.05,
                    label="Temperature"
                )
                top_p = gr.Slider(
+                    minimum=0.1, maximum=1.0, value=0.95, step=0.05,
+                    label="Top-P"
                )
+                top_k = gr.Slider(
+                    minimum=0, maximum=200, value=0, step=5,
+                    label="Top-K (0=disabled)"
                )
+            with gr.Row():
+                alg = gr.Radio(
+                    choices=['origin', 'maskgit_plus', 'topk_margin', 'entropy'],
+                    value='entropy',
+                    label="Sampling Algorithm (`alg`)"
+                )
+                alg_temp = gr.Slider(
+                    minimum=0.0, maximum=1.0, value=0.0, step=0.05,
+                    label="Algorithm Temp (`alg_temp`, adds randomness to confidence-based `alg`)"
+                )
+
            with gr.Row():
                visualization_delay = gr.Slider(
+                    minimum=0.0, maximum=0.5, value=0.02, step=0.01,
                    label="Visualization Delay (seconds)"
                )

        # Clear button
        clear_btn = gr.Button("Clear Conversation")

+        # --- Event Handlers ---
+        def add_message(history, message, response):
+            """Add a message pair to the history and return the updated history"""
+            # Ensure history is a list
+            if not isinstance(history, list):
+                history = []
            history.append([message, response])
            return history

+        def user_message_submitted(message, history):
+            """Process a submitted user message"""
+            if not message.strip():
+                return history, history, "", [] # No change if empty
+
+            # Add user message (response is None for now)
+            history = add_message(history, message, None)
+
+            # Return updated history for display, clear input box
+            return history, history, "", [] # history, chatbot_ui, user_input, output_vis
+
+
+        def bot_response_stream(
+            history, # Current chat history (list of lists)
+            gen_length, steps, constraints, # Generation settings
+            temperature, top_p, top_k, alg, alg_temp, # Sampling settings
+            delay # Visualization delay
+        ):
+            """Generate bot response and stream visualization states"""
+            if not history or history[-1][1] is not None: # Check if history is present and last response isn't already set
+                print("Skipping bot response generation: No new user message.")
+                # Yield empty state if needed to prevent errors downstream
+                # Ensure history is returned correctly if nothing happens
+                yield history, [], "Internal Error: No user message found."
+                return

+            # Format messages for the model
+            # (the last entry's user turn is included; its response is still None)
+            messages_for_model = format_chat_history(history) # Already includes system prompt
+
+            print("\n--- Generating Bot Response ---")
+            print(f"History: {history}")
+            print(f"Messages for model: {messages_for_model}")
+            print(f"Constraints text: '{constraints}'")
+            print(f"Gen length: {gen_length}, Steps: {steps}, Temp: {temperature}, Top-P: {top_p}, Top-K: {top_k}, Alg: {alg}, Alg Temp: {alg_temp}")
+
+            # Call the generation function
+            vis_states, response_text = generate_response_with_visualization(
+                messages_for_model,
+                gen_length=gen_length,
+                steps=steps,
+                constraints_text=constraints,
+                temperature=temperature,
+                top_p=top_p if top_p < 1.0 else None, # None disables top-p
+                top_k=top_k if top_k > 0 else None, # None disables top-k
+                alg=alg,
+                alg_temp=alg_temp,
+                visualization_delay=delay,
+                # Pass other necessary args like tokenizer, model if not global
+            )

+            print(f"Generated response text: '{response_text}'")
+            print(f"Number of visualization states: {len(vis_states)}")


+            # Update the history with the final response
+            # Make sure history is mutable if needed or reassign
+            if history:
+                history[-1][1] = response_text
+            else:
+                print("Warning: History was empty when trying to update response.")


+            # Stream the visualization states
+            if not vis_states:
+                print("Warning: No visualization states were generated.")
+                # Yield something to prevent downstream errors
+                yield history, [("Error: No visualization.", "red")], response_text
+                return

+            # Yield initial state immediately if desired, or just start loop
+            # yield history, vis_states[0], response_text

+            for state in vis_states:
+                yield history, state, response_text # Yield updated history, current vis state, final text
+                time.sleep(delay) # Pause between steps

+            # Final yield to ensure the last state is displayed
+            yield history, vis_states[-1], response_text


+        def clear_conversation():
+            """Clear the conversation history and visualization"""
+            return [], [], "", [] # history, chatbot, user_input, output_vis
+
+        # --- Event Wiring ---
+
+        # Clear button
        clear_btn.click(
+            fn=clear_conversation,
            inputs=[],
+            outputs=[chat_history, chatbot_ui, user_input, output_vis]
+        )
+
+        # User message submission flow (2-step using .then)
+        # 1. User submits message -> Update history and chatbot UI immediately
+        submit_action = user_input.submit(
+            fn=user_message_submitted,
+            inputs=[user_input, chat_history],
+            outputs=[chat_history, chatbot_ui, user_input, output_vis] # Update chatbot, clear input
+        )
+
+        # Connect send button to the same function
+        send_action = send_btn.click(
+            fn=user_message_submitted,
+            inputs=[user_input, chat_history],
+            outputs=[chat_history, chatbot_ui, user_input, output_vis]
+        )
+
+        # 2. After UI update -> Trigger bot response generation and streaming
+        # Use the updated chat_history from the first step
+        submit_action.then(
+            fn=bot_response_stream,
+            inputs=[
+                chat_history, gen_length, steps, constraints_input,
+                temperature, top_p, top_k, alg, alg_temp,
+                visualization_delay
+            ],
+            outputs=[chatbot_ui, output_vis, user_input] # Stream updates to chatbot and visualization
        )

+        send_action.then(
+            fn=bot_response_stream,
+            inputs=[
+                chat_history, gen_length, steps, constraints_input,
+                temperature, top_p, top_k, alg, alg_temp,
+                visualization_delay
+            ],
+            outputs=[chatbot_ui, output_vis] # Update chatbot and visualization
+        )
+
+        # Clear input after send/submit (already handled in user_message_submitted)
+        # submit_action.then(lambda: "", outputs=user_input)
+        # send_action.then(lambda: "", outputs=user_input)
+
+
    return demo

+# --- Launch the Gradio App ---
if __name__ == "__main__":
    demo = create_chatbot_demo()
+    # Using queue for streaming and handling multiple users
+    demo.queue().launch(debug=True, share=True)
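
A note on the new constraint parser: `parse_constraints` now returns both a word-level map and a token-level map, with multi-token words filling consecutive positions. A minimal usage sketch (hypothetical session; the token IDs shown are placeholders, since real values depend on the Dream tokenizer):

# Sketch only: assumes the tokenizer loaded above; IDs shown are made up.
words, token_map = parse_constraints("0:Once, 5:upon", tokenizer)
# words     -> {0: 'Once', 5: 'upon'}
# token_map -> {0: 12345, 5: 23456, ...}  # one entry per sub-token; a
#              multi-token word occupies consecutive positions from its index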
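The core change in this commit is replacing the hand-rolled denoising loop with Dream's own `diffusion_generate` plus a `generation_tokens_hook_func`. Below is a minimal sketch of such a hook, reduced to the constraint-pinning part; the `(step, x, logits) -> x` signature mirrors how this app calls the model's remote code, and the exact contract lives in Dream's generation_utils, so treat this as an assumption rather than a documented API:

def make_constraint_hook(token_map, prompt_length):
    # token_map: {relative_position: token_id}, e.g. from parse_constraints.
    # Assumed hook signature: (step, x, logits) -> x, per the call above.
    def hook(step, x, logits):
        x = x.clone()  # avoid mutating the generator's working tensor
        for rel_pos, token_id in token_map.items():
            abs_pos = prompt_length + rel_pos
            if abs_pos < x.shape[1]:
                x[:, abs_pos] = token_id  # re-pin the constrained token
        return x  # the returned sequence is what the next denoising step sees
    return hook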
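For reference, the deleted loop decided which masked positions to reveal at each step by ranking per-token confidences. That selection step is reproduced below as a self-contained sketch (pure PyTorch, independent of the model), matching the removed top-k / multinomial logic:

import torch
import torch.nn.functional as F

def select_transfer_indices(confidence, num_transfer_tokens, alg_temp=None):
    """Pick which masked positions to unmask this step: deterministic top-k
    when alg_temp is None or 0, otherwise sample positions in proportion to
    softmax(confidence / alg_temp)."""
    k = min(num_transfer_tokens, confidence.shape[0])
    if k <= 0:
        return None
    if alg_temp is None or alg_temp <= 0:
        _, indices = torch.topk(confidence, k)
        return indices
    probs = F.softmax(confidence / alg_temp, dim=-1)
    return torch.multinomial(probs, num_samples=k, replacement=False)

# Example: reveal 2 of 6 masked positions
conf = torch.tensor([0.9, 0.1, 0.7, 0.3, 0.8, 0.2])
print(select_transfer_indices(conf, 2))                # tensor([0, 4])
print(select_transfer_indices(conf, 2, alg_temp=0.5))  # stochastic pick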