Spaces:

multimodalart
/

Dream

Runtime error

App Files Community

multimodalart HF Staff committed on Apr 5

Commit

2491cbe

verified ·

1 Parent(s): a375a1f

Update app.py

Browse files

Files changed (1) hide show

app.py +189 -127

app.py CHANGED Viewed

@@ -9,9 +9,11 @@ import time
 import re
 from typing import List, Dict, Tuple, Optional
 import torch.distributions as dists # Added import
 # --- START: Copied Helper functions from generation_utils.py ---
-# [Keep the copied functions: top_p_logits, top_k_logits, sample_tokens]
 def top_p_logits(logits, top_p=None):
     """ Applies top-p filtering to logits. """
     if top_p is None or top_p >= 1.0:
@@ -33,6 +35,8 @@ def top_k_logits(logits, top_k=None):
     if top_k is None or top_k <= 0:
         return logits
     top_k = min(top_k, logits.size(-1))  # Safety check
     # Remove all tokens with a probability less than the last token of the top-k
     indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
     logits = logits.masked_fill(indices_to_remove, torch.finfo(logits.dtype).min)
@@ -44,29 +48,36 @@ def sample_tokens(logits, temperature=0.0, top_p=None, top_k=None, margin_confid
         # Prevent division by zero or negative temperatures
         safe_temp = max(temperature, 1e-6)
         logits = logits / safe_temp
-    if top_p is not None and top_p < 1.0: # Apply top_p if valid
         logits = top_p_logits(logits, top_p)
     if top_k is not None and top_k > 0:    # Apply top_k if valid
         logits = top_k_logits(logits, top_k)
-    # Ensure logits are not all -inf after filtering, if so, sample uniformly? Or handle error.
-    # Add a check here: if all logits are -inf, assign uniform probability.
-    is_all_neg_inf = torch.all(logits == torch.finfo(logits.dtype).min, dim=-1, keepdim=True)
     if torch.any(is_all_neg_inf):
         # print("Warning: All logits became -inf after filtering. Assigning uniform probabilities.")
-        uniform_logits = torch.zeros_like(logits)
         logits = torch.where(is_all_neg_inf, uniform_logits, logits)
     probs = torch.softmax(logits, dim=-1)
     # Clamp probabilities to avoid NaNs in sampling, ensure they sum to 1
     probs = torch.clamp(probs, min=0.0) # Ensure non-negative
-    probs = probs / probs.sum(dim=-1, keepdim=True) # Re-normalize
     probs = torch.nan_to_num(probs, nan=0.0) # Handle any remaining NaNs
     if temperature > 0:
         try:
             x0 = dists.Categorical(probs=probs).sample()
             confidence = torch.gather(probs, -1, x0.unsqueeze(-1)).squeeze(-1)
         except Exception as e: # Catch broader exceptions during sampling
@@ -79,14 +90,14 @@ def sample_tokens(logits, temperature=0.0, top_p=None, top_k=None, margin_confid
         sorted_probs, _ = torch.sort(probs, dim=-1, descending=True)
         # Ensure there are at least 2 probabilities to compare
         top1_probs = sorted_probs[..., 0]
-        top2_probs = sorted_probs[..., 1] if sorted_probs.shape[-1] > 1 else top1_probs # Handle case with only 1 possible token
         confidence = top1_probs - top2_probs
     if neg_entropy:
-        epsilon = 1e-10
         # Ensure probs are > 0 for log
-        log_probs = torch.log(probs + epsilon)
-        confidence = torch.sum(probs * log_probs, dim=-1) # Should be negative entropy
     # Ensure confidence is not NaN
     confidence = torch.nan_to_num(confidence, nan=0.0)
@@ -95,7 +106,7 @@ def sample_tokens(logits, temperature=0.0, top_p=None, top_k=None, margin_confid
 # --- END: Copied Helper functions ---
-# [Keep model loading, constants, helper functions: parse_constraints, format_chat_history, apply_constraints_to_state]
 # Load model configuration to get special token IDs
 config = AutoConfig.from_pretrained("Dream-org/Dream-v0-Instruct-7B", trust_remote_code=True)
 # Use AutoModel for the base model loading, relying on trust_remote_code=True
@@ -139,34 +150,32 @@ SPECIAL_TOKEN_IDS = {PAD_ID, EOS_ID, MASK_ID}
 try:
     IM_START_ID = tokenizer.convert_tokens_to_ids("<|im_start|>")
     IM_END_ID = tokenizer.convert_tokens_to_ids("<|im_end|>")
-    SPECIAL_TOKEN_IDS.add(IM_START_ID)
-    SPECIAL_TOKEN_IDS.add(IM_END_ID)
 except KeyError:
     print("Warning: <|im_start|> or <|im_end|> not found in tokenizer vocab.")
     IM_START_ID = None
     IM_END_ID = None
-# --- Helper Functions ---
 def parse_constraints(constraints_text: str) -> Dict[int, List[int]]:
-    """
-    Parse constraints in format: 'position:word, position:word, ...'
-    Returns a dictionary mapping the starting position (0-indexed from the start
-    of the *generated* sequence) to a list of token IDs for the constraint word.
-    """
     constraints = {}
     if not constraints_text:
         return constraints
     parts = constraints_text.split(',')
     for part in parts:
-        part = part.strip() # Remove leading/trailing whitespace from the part itself
         if ':' not in part:
             continue
         pos_str, word = part.split(':', 1)
         try:
             pos = int(pos_str.strip())
-            word = word.strip() # Strip whitespace from word
             token_ids = []
             if word: # Only encode if word is not empty
                  # Add space prefix automatically if pos > 0 and word doesn't start with space
@@ -192,9 +201,10 @@ def format_chat_history(history: List[List[Optional[str]]]) -> List[Dict[str, st
     """ Formats chat history for the template. """
     messages = []
     for user_msg, assistant_msg in history:
-        if user_msg:
              messages.append({"role": "user", "content": user_msg})
-        if assistant_msg:
             messages.append({"role": "assistant", "content": assistant_msg})
     return messages
@@ -206,15 +216,16 @@ def apply_constraints_to_state(
     current_step: Optional[int] = None # For logging/debugging
 ) -> torch.Tensor:
     """ Applies constraints directly to the state tensor `x`. """
-    modified_x = x # Modify in place maybe okay? Let's stick with clone for safety.
-    modified_x = x.clone()
     for rel_pos, word_token_ids in parsed_constraints.items():
         abs_start_pos = prompt_length + rel_pos
         abs_end_pos = abs_start_pos + len(word_token_ids)
         if abs_start_pos < total_length and abs_end_pos <= total_length:
             try:
                 constraint_tensor = torch.tensor(word_token_ids, dtype=torch.long, device=modified_x.device)
                 modified_x[0, abs_start_pos:abs_end_pos] = constraint_tensor
             except IndexError:
                  print(f"Warning (Step {current_step}): Constraint at {rel_pos} ('{tokenizer.decode(word_token_ids)}') goes out of bounds.")
@@ -228,7 +239,7 @@ def apply_constraints_to_state(
 @spaces.GPU # Decorator for Hugging Face Spaces GPU usage
 @torch.no_grad() # Ensure no gradients are computed during generation
 def generate_dream_response(
-    history: List[List[Optional[str]]],
     gen_length: int,
     steps: int,
     constraints_text: str,
@@ -241,13 +252,13 @@ def generate_dream_response(
     ) -> List[Tuple[str, str]]:
     """ Generates text step-by-step and yields visualization states live. """
-    if not history or not history[-1][0]:
-        yield history, [("No input message found.", "red")], ""
         return
     # --- 1. Preparation ---
-    last_user_message = history[-1][0]
-    messages_for_template = format_chat_history(history) # Includes the latest user message
     parsed_constraints = parse_constraints(constraints_text)
     try:
@@ -255,46 +266,38 @@ def generate_dream_response(
             messages_for_template,
             return_tensors="pt",
             return_dict=True,
-            add_generation_prompt=True
         )
         input_ids = inputs.input_ids.to(device)
-        # Ensure prompt_attention_mask is also on the correct device
         prompt_attention_mask = inputs.attention_mask.to(device) if 'attention_mask' in inputs else torch.ones_like(input_ids)
         prompt_length = input_ids.shape[1]
     except Exception as e:
         print(f"Error applying chat template: {e}")
         yield history, [("Error preparing input.", "red")], ""
         return
     eps = 1e-3
-    top_p_val = top_p if top_p is not None and 0.0 < top_p < 1.0 else None # Make sure top_p is > 0
     top_k_val = top_k if top_k is not None and top_k > 0 else None
-    alg_temp_val = alg_temp if alg in ['maskgit_plus', 'topk_margin', 'entropy'] and alg_temp is not None and alg_temp > 0 else None # Ensure > 0
     # --- 2. Initialize Generation State ---
     total_length = prompt_length + gen_length
     initial_generation_part = torch.full((1, gen_length), MASK_ID, dtype=torch.long, device=device)
     x = torch.cat((input_ids, initial_generation_part), dim=1)
-    # --- Prepare Attention Mask for SDPA ---
-    generation_attention_mask = torch.ones((1, gen_length), dtype=torch.long, device=device)
-    full_attention_mask_long = torch.cat((prompt_attention_mask, generation_attention_mask), dim=1) # Shape [B, N], dtype torch.long
-    # Convert attention mask for SDPA: Needs float matching query dtype.
-    # Where mask is 1 (attend), value should be 0.0. Where mask is 0 (don't attend), value should be -inf.
-    attention_mask_for_model = full_attention_mask_long.to(model.dtype) # Convert to model's dtype (e.g., bfloat16)
-    # Invert the mask logic: (1.0 - mask) gives 0s for attend, 1s for mask
-    # Multiply by large negative number (min value for dtype) for masked positions
     large_neg_val = torch.finfo(model.dtype).min
     attention_mask_for_model = (1.0 - attention_mask_for_model) * large_neg_val
-    # Ensure the shape is broadcastable, SDPA usually handles [B, N] -> [B, H, N, N] if needed.
-    # However, explicitly making it [B, 1, 1, N] or [B, 1, N, N] can be safer.
-    # Let's try passing [B, N] first, if it fails, reshape.
-    # Reshape to [B, 1, 1, N] which is commonly expected for additive masks by HF models
-    attention_mask_for_model = attention_mask_for_model.unsqueeze(1).unsqueeze(2)
-    # Now shape is [B, 1, 1, N]
-    # --- Timesteps ---
     timesteps = torch.linspace(1, eps, steps + 1, device=device)
     # Apply initial constraints
@@ -303,7 +306,8 @@ def generate_dream_response(
     # --- 3. Visualization Setup ---
     previous_tokens_vis = None
     final_response_text = ""
-    history_copy = [list(item) for item in history] # Mutable copy
     # --- 4. Initial Yield (Masked State) ---
     initial_generated_tokens = x[0, prompt_length:].cpu()
@@ -314,6 +318,7 @@ def generate_dream_response(
         vis_data_initial.append((display_token, color))
     previous_tokens_vis = initial_generated_tokens
     yield history_copy, vis_data_initial, ""
     time.sleep(visualization_delay)
@@ -327,18 +332,21 @@ def generate_dream_response(
                  break
             # --- Model Forward Pass ---
-            # Pass the correctly formatted float mask
             outputs = model(
                 input_ids=x,
                 attention_mask=attention_mask_for_model, # Pass the [B, 1, 1, N] float mask
-                position_ids=None,
                 use_cache=False,
                 return_dict=True
             )
             logits = outputs.logits
-            logits = torch.cat([logits[:,:1], logits[:, :-1]], dim=1) # Align logits
-            mask_logits = logits[mask_index]
             if mask_logits.numel() == 0:
                  print(f"No masked tokens found for logit selection at step {i}. Stopping.")
                  break
@@ -346,6 +354,7 @@ def generate_dream_response(
             # --- Sampling / Remasking Logic ---
             t = timesteps[i]
             s = timesteps[i + 1]
             x_new_masked_part = torch.full_like(x[mask_index], MASK_ID, device=device, dtype=torch.long)
             if alg == 'origin':
@@ -356,11 +365,13 @@ def generate_dream_response(
                 if logits_to_sample.numel() > 0:
                     _, sampled_tokens = sample_tokens(logits_to_sample, temperature=temperature, top_p=top_p_val, top_k=top_k_val)
                     x_new_masked_part[transfer_indices_relative] = sampled_tokens
-            else: # Confidence-based algorithms
                 use_margin = (alg == 'topk_margin')
                 use_entropy = (alg == 'entropy')
                 confidence, x0_candidates = sample_tokens(
                     mask_logits,
                     temperature=temperature,
@@ -371,102 +382,95 @@ def generate_dream_response(
                 )
                 num_mask_token = mask_logits.shape[0]
                 target_num_revealed_float = num_mask_token * (1.0 - s / t)
                 number_transfer_tokens = int(target_num_revealed_float) if i < steps - 1 else num_mask_token
                 if number_transfer_tokens > 0:
                     num_samples = min(number_transfer_tokens, num_mask_token) # Ensure k <= num_mask_token
-                    if num_samples > 0: # Proceed only if we need to sample > 0 tokens
-                        if alg_temp_val is None or alg_temp_val <= 0: # Top-k confidence
-                            sort_metric = confidence if alg != 'entropy' else -confidence # Lower entropy = higher confidence
                             # Ensure k is not greater than the number of elements
                             k_topk = min(num_samples, sort_metric.numel())
                             if k_topk > 0:
                                 _, transfer_indices_relative = torch.topk(sort_metric, k=k_topk)
-                            else:
-                                transfer_indices_relative = torch.tensor([], dtype=torch.long, device=device)
                         else: # Sample based on confidence temperature
                             # Ensure confidence has elements before processing
                             if confidence.numel() > 0:
                                 conf_probs = confidence / alg_temp_val
                                 # Handle potential inf/-inf before softmax, ensure non-negative probabilities
-                                conf_probs = torch.nan_to_num(conf_probs, nan=0.0, posinf=1e9, neginf=-1e9) # Use large numbers instead of inf
-                                conf_probs = torch.clamp(conf_probs - conf_probs.max(), min=-30) # Prevent large positive values leading to inf in exp
                                 conf_probs = F.softmax(conf_probs, dim=-1)
                                 conf_probs = torch.clamp(conf_probs, min=0.0) # Ensure non-negative
                                 conf_probs = torch.nan_to_num(conf_probs, nan=0.0) # Handle NaNs
-                                # Normalize probabilities if they don't sum to 1
                                 prob_sum = conf_probs.sum()
-                                # --- START FIX ---
-                                # Ensure the comparison tensor has the same dtype as prob_sum
                                 target_sum_tensor = torch.tensor(1.0, device=device, dtype=prob_sum.dtype)
                                 if not torch.isclose(prob_sum, target_sum_tensor, atol=1e-4) and prob_sum > 0:
-                                # --- END FIX ---
-                                    # print(f"Warning step {i}: Confidence probabilities sum {prob_sum:.4f} != 1. Re-normalizing.")
-                                    # Avoid division by zero if prob_sum is extremely small or zero
                                     safe_prob_sum = torch.max(prob_sum, torch.tensor(1e-12, device=device, dtype=prob_sum.dtype))
-                                    conf_probs = conf_probs / safe_prob_sum # Use safe_prob_sum
-                                # Ensure num_samples is valid and probabilities are okay for multinomial
-                                # --- START FIX ---
-                                # Check sum again after potential normalization
                                 final_prob_sum_check = conf_probs.sum()
                                 if conf_probs.numel() > 0 and num_samples > 0 and torch.all(conf_probs >= 0) and torch.isclose(final_prob_sum_check, target_sum_tensor, atol=1e-4):
-                                # --- END FIX ---
                                     try:
                                         transfer_indices_relative = torch.multinomial(conf_probs, num_samples=num_samples, replacement=False)
                                     except RuntimeError as e:
-                                        # [Fallback logic remains the same]
                                         print(f"Warning step {i}: Multinomial sampling failed ('{e}'). Falling back to top-k.")
                                         sort_metric = confidence if alg != 'entropy' else -confidence
                                         k_multinomial_fallback = min(num_samples, sort_metric.numel())
                                         if k_multinomial_fallback > 0:
                                              _, transfer_indices_relative = torch.topk(sort_metric, k=k_multinomial_fallback)
-                                        else:
-                                             transfer_indices_relative = torch.tensor([], dtype=torch.long, device=device)
                                 else: # Handle cases where multinomial is not possible (e.g., bad probabilities)
-                                    # [Fallback logic remains the same]
                                     # print(f"Warning step {i}: Invalid probabilities for multinomial sampling (sum={final_prob_sum_check:.4f}). Falling back to top-k.")
                                     sort_metric = confidence if alg != 'entropy' else -confidence
                                     k_multinomial_fallback = min(num_samples, sort_metric.numel())
                                     if k_multinomial_fallback > 0:
                                         _, transfer_indices_relative = torch.topk(sort_metric, k=k_multinomial_fallback)
-                                    else:
-                                        transfer_indices_relative = torch.tensor([], dtype=torch.long, device=device)
-                            else: # No confidence values to sample from
-                                 transfer_indices_relative = torch.tensor([], dtype=torch.long, device=device)
-                        # Apply the transfer
                         if transfer_indices_relative.numel() > 0:
-                             # Ensure indices are within bounds of x0_candidates
-                             valid_indices = transfer_indices_relative < x0_candidates.shape[0]
-                             valid_transfer_indices = transfer_indices_relative[valid_indices]
                              if valid_transfer_indices.numel() > 0:
-                                  # Ensure indices are also within bounds of x_new_masked_part
-                                  if valid_transfer_indices.max() < x_new_masked_part.shape[0]:
-                                       x_new_masked_part[valid_transfer_indices] = x0_candidates[valid_transfer_indices].clone()
-                                  else:
-                                       print(f"Warning step {i}: transfer_indices out of bounds for x_new_masked_part.")
             # Update the global state `x` only at the masked positions
             x[mask_index] = x_new_masked_part
             # --- Apply Constraints ---
             x = apply_constraints_to_state(x, prompt_length, total_length, parsed_constraints, current_step=i)
             # --- Yield Visualization ---
-            current_generated_tokens = x[0, prompt_length:].cpu()
             vis_data = []
-            # [Keep visualization formatting logic the same]
             for j in range(gen_length):
                 current_tok_id = current_generated_tokens[j].item()
                 previous_tok_id = previous_tokens_vis[j].item() if previous_tokens_vis is not None and j < len(previous_tokens_vis) else MASK_ID
                 try:
-                    # Use replace to handle potential bytes rendering issues
                     decoded_token = tokenizer.decode([current_tok_id], skip_special_tokens=False, clean_up_tokenization_spaces=False)
                     display_token = MASK_TOKEN if current_tok_id == MASK_ID else decoded_token
                 except Exception:
@@ -482,17 +486,25 @@ def generate_dream_response(
                 else: # Token was already revealed
                     color = "#6699CC" # Light Blue
-                should_hide = (PAD_ID is not None and current_tok_id == PAD_ID) or \
-                              (EOS_ID is not None and current_tok_id == EOS_ID)
                 if should_hide and previous_tok_id == current_tok_id:
                     token_to_display = "" # Hide by making empty
                     color = None # No color for hidden
-                if token_to_display:
                     vis_data.append((token_to_display, color))
-            previous_tokens_vis = current_generated_tokens # Update for next step
             intermediate_response_tokens = x[0, prompt_length:]
             intermediate_response_text = tokenizer.decode(
                 intermediate_response_tokens,
@@ -500,6 +512,11 @@ def generate_dream_response(
                 clean_up_tokenization_spaces=True
             ).strip()
             yield history_copy, vis_data, intermediate_response_text
             time.sleep(visualization_delay)
@@ -514,11 +531,14 @@ def generate_dream_response(
             skip_special_tokens=True,
             clean_up_tokenization_spaces=True
         ).strip()
-        history_copy[-1][1] = final_response_text
         final_generated_tokens = x[0, prompt_length:].cpu()
         vis_data_final = []
-        # [Keep final visualization formatting logic the same]
         for j in range(gen_length):
             current_tok_id = final_generated_tokens[j].item()
             previous_tok_id = previous_tokens_vis[j].item() if previous_tokens_vis is not None and j < len(previous_tokens_vis) else MASK_ID
@@ -532,24 +552,29 @@ def generate_dream_response(
             if current_tok_id == MASK_ID: color = "#444444"
             elif previous_tok_id == MASK_ID: color = "#66CC66"
             else: color = "#6699CC"
-            should_hide = (PAD_ID is not None and current_tok_id == PAD_ID) or \
-                          (EOS_ID is not None and current_tok_id == EOS_ID)
             if should_hide and previous_tok_id == current_tok_id:
                  token_to_display = ""; color = None
             if token_to_display: vis_data_final.append((token_to_display, color))
         yield history_copy, vis_data_final, final_response_text
         print("Visualization streaming complete.")
     except Exception as e:
-        print(f"Error during generation or processing: {e}")
-        import traceback
         traceback.print_exc()
         yield history_copy, [("Error during generation.", "red")], ""
         return
-# --- Gradio UI (No changes needed here) ---
 css = '''
 .category-legend{display:none}
 button{min-height: 60px}
@@ -562,8 +587,10 @@ def create_chatbot_demo():
             "[[Blog](https://hkunlp.github.io/blog/2025/dream/)]" # Note: Link might be hypothetical
         )
         _chat_history_store = gr.State([]) # Hidden state to store actual history list
         with gr.Row():
             with gr.Column(scale=3):
                 chatbot_ui = gr.Chatbot(
@@ -594,15 +621,15 @@ def create_chatbot_demo():
                     label="Denoising Process Visualization",
                     combine_adjacent=False,
                     show_legend=True,
-                    interactive=False
                 )
                 response_text_display = gr.Textbox(
                     label="Generated Response",
                     interactive=False,
-                    lines=5,
-                    visible=False
                 )
         with gr.Accordion("Generation Settings", open=False):
              with gr.Row():
                 gen_length = gr.Slider(minimum=16, maximum=512, value=128, step=8, label="Max New Tokens")
@@ -611,58 +638,92 @@ def create_chatbot_demo():
                 temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.4, step=0.05, label="Temperature (0 = greedy)")
                 alg_temp = gr.Slider(minimum=0.0, maximum=1.0, value=0.1, step=0.05, label="Remasking Temp (Confidence Algs)")
              with gr.Row():
-                top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.95, step=0.05, label="Top-P (0 disables)")
-                top_k = gr.Slider(minimum=0, maximum=200, value=0, step=5, label="Top-K (0 disables)")
              with gr.Row():
                  remasking_strategy = gr.Radio(choices=['origin', 'maskgit_plus', 'topk_margin', 'entropy'], value='entropy', label="Remasking Strategy (Algorithm)")
              with gr.Row():
-                visualization_delay = gr.Slider(minimum=0.0, maximum=0.5, value=0.0, step=0.01, label="Visualization Delay (seconds)")
         clear_btn = gr.Button("Clear Conversation")
         def add_user_message_to_history(message: str, history_store: List[List[Optional[str]]]):
             if not message.strip():
                 gr.Warning("Please enter a message.")
-                return history_store, history_store, "", [], ""
-            history_store.append([message, None])
-            return history_store, history_store, "", [], ""
         def clear_conversation():
-            return [], [], "", [], ""
         generation_inputs = [
             _chat_history_store, gen_length, steps, constraints_input,
             temperature, top_p, top_k, remasking_strategy, alg_temp,
             visualization_delay
         ]
         generation_outputs = [chatbot_ui, output_vis, response_text_display]
         submit_listener = user_input.submit(
             fn=add_user_message_to_history,
             inputs=[user_input, _chat_history_store],
-            outputs=[_chat_history_store, chatbot_ui, user_input, output_vis, response_text_display]
         ).then(
             fn=generate_dream_response,
-            inputs=generation_inputs,
-            outputs=generation_outputs,
-            show_progress="hidden"
         )
         click_listener = send_btn.click(
             fn=add_user_message_to_history,
             inputs=[user_input, _chat_history_store],
-            outputs=[_chat_history_store, chatbot_ui, user_input, output_vis, response_text_display]
         ).then(
             fn=generate_dream_response,
-            inputs=generation_inputs,
-            outputs=generation_outputs,
-            show_progress="hidden"
         )
         clear_btn.click(
             clear_conversation,
             inputs=[],
-            outputs=[_chat_history_store, chatbot_ui, user_input, output_vis, response_text_display]
         )
     return demo
@@ -670,4 +731,5 @@ def create_chatbot_demo():
 # --- Launch ---
 if __name__ == "__main__":
     demo = create_chatbot_demo()
-    demo.queue().launch(debug=True, share=False)

 import re
 from typing import List, Dict, Tuple, Optional
 import torch.distributions as dists # Added import
+import traceback # For printing exceptions
 # --- START: Copied Helper functions from generation_utils.py ---
+# These are needed because we are reimplementing the sampling loop locally.
 def top_p_logits(logits, top_p=None):
     """ Applies top-p filtering to logits. """
     if top_p is None or top_p >= 1.0:
     if top_k is None or top_k <= 0:
         return logits
     top_k = min(top_k, logits.size(-1))  # Safety check
+    if top_k == logits.size(-1): # Avoid unnecessary computation if k is full size
+        return logits
     # Remove all tokens with a probability less than the last token of the top-k
     indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
     logits = logits.masked_fill(indices_to_remove, torch.finfo(logits.dtype).min)
         # Prevent division by zero or negative temperatures
         safe_temp = max(temperature, 1e-6)
         logits = logits / safe_temp
+    if top_p is not None and 0.0 < top_p < 1.0: # Apply top_p if valid (and not disabled)
         logits = top_p_logits(logits, top_p)
     if top_k is not None and top_k > 0:    # Apply top_k if valid
         logits = top_k_logits(logits, top_k)
+    # Ensure logits are not all -inf after filtering, if so, assign uniform probability.
+    is_all_neg_inf = torch.all(logits <= torch.finfo(logits.dtype).min, dim=-1, keepdim=True)
     if torch.any(is_all_neg_inf):
         # print("Warning: All logits became -inf after filtering. Assigning uniform probabilities.")
+        uniform_logits = torch.zeros_like(logits) # Uniform logits (zeros before softmax)
         logits = torch.where(is_all_neg_inf, uniform_logits, logits)
     probs = torch.softmax(logits, dim=-1)
     # Clamp probabilities to avoid NaNs in sampling, ensure they sum to 1
     probs = torch.clamp(probs, min=0.0) # Ensure non-negative
+    prob_sum_for_norm = probs.sum(dim=-1, keepdim=True)
+    # Use a tolerance check for division
+    safe_prob_sum_for_norm = torch.where(prob_sum_for_norm > 1e-12, prob_sum_for_norm, torch.ones_like(prob_sum_for_norm))
+    probs = probs / safe_prob_sum_for_norm # Re-normalize with safe denominator
     probs = torch.nan_to_num(probs, nan=0.0) # Handle any remaining NaNs
     if temperature > 0:
         try:
+            # Ensure probs sum to 1 before sampling
+            probs_sum_check = probs.sum(dim=-1)
+            if not torch.all(torch.isclose(probs_sum_check, torch.ones_like(probs_sum_check))):
+                 # print(f"Warning: Probs do not sum to 1 before sampling ({probs_sum_check}). Re-normalizing.")
+                 probs = probs / probs.sum(dim=-1, keepdim=True) # Final normalization attempt
             x0 = dists.Categorical(probs=probs).sample()
             confidence = torch.gather(probs, -1, x0.unsqueeze(-1)).squeeze(-1)
         except Exception as e: # Catch broader exceptions during sampling
         sorted_probs, _ = torch.sort(probs, dim=-1, descending=True)
         # Ensure there are at least 2 probabilities to compare
         top1_probs = sorted_probs[..., 0]
+        top2_probs = sorted_probs[..., 1] if sorted_probs.shape[-1] > 1 else torch.zeros_like(top1_probs) # Use 0 if only one prob
         confidence = top1_probs - top2_probs
     if neg_entropy:
+        epsilon = torch.finfo(probs.dtype).eps # Use dtype's epsilon
         # Ensure probs are > 0 for log
+        log_probs = torch.log(torch.clamp(probs, min=epsilon)) # Clamp before log
+        confidence = torch.sum(probs * log_probs, dim=-1) # This is negative entropy
     # Ensure confidence is not NaN
     confidence = torch.nan_to_num(confidence, nan=0.0)
 # --- END: Copied Helper functions ---
+# --- Model Loading and Constants ---
 # Load model configuration to get special token IDs
 config = AutoConfig.from_pretrained("Dream-org/Dream-v0-Instruct-7B", trust_remote_code=True)
 # Use AutoModel for the base model loading, relying on trust_remote_code=True
 try:
     IM_START_ID = tokenizer.convert_tokens_to_ids("<|im_start|>")
     IM_END_ID = tokenizer.convert_tokens_to_ids("<|im_end|>")
+    if IM_START_ID is not None: SPECIAL_TOKEN_IDS.add(IM_START_ID)
+    if IM_END_ID is not None: SPECIAL_TOKEN_IDS.add(IM_END_ID)
 except KeyError:
     print("Warning: <|im_start|> or <|im_end|> not found in tokenizer vocab.")
     IM_START_ID = None
     IM_END_ID = None
+# --- App Helper Functions ---
 def parse_constraints(constraints_text: str) -> Dict[int, List[int]]:
+    """ Parses constraints. """
     constraints = {}
     if not constraints_text:
         return constraints
+    # Simple split on comma, assumes format 'pos:word, pos:word'
     parts = constraints_text.split(',')
     for part in parts:
+        part = part.strip()
         if ':' not in part:
             continue
         pos_str, word = part.split(':', 1)
         try:
             pos = int(pos_str.strip())
+            word = word.strip()
             token_ids = []
             if word: # Only encode if word is not empty
                  # Add space prefix automatically if pos > 0 and word doesn't start with space
     """ Formats chat history for the template. """
     messages = []
     for user_msg, assistant_msg in history:
+        if user_msg is not None: # Check for None explicitly
              messages.append({"role": "user", "content": user_msg})
+        # Add assistant message only if it exists (it won't for the last turn before generation)
+        if assistant_msg is not None:
             messages.append({"role": "assistant", "content": assistant_msg})
     return messages
     current_step: Optional[int] = None # For logging/debugging
 ) -> torch.Tensor:
     """ Applies constraints directly to the state tensor `x`. """
+    modified_x = x.clone() # Work on a copy
     for rel_pos, word_token_ids in parsed_constraints.items():
         abs_start_pos = prompt_length + rel_pos
         abs_end_pos = abs_start_pos + len(word_token_ids)
+        # Ensure the constraint fits within the generation length
         if abs_start_pos < total_length and abs_end_pos <= total_length:
             try:
                 constraint_tensor = torch.tensor(word_token_ids, dtype=torch.long, device=modified_x.device)
+                # Force the constraint tokens onto the sequence
                 modified_x[0, abs_start_pos:abs_end_pos] = constraint_tensor
             except IndexError:
                  print(f"Warning (Step {current_step}): Constraint at {rel_pos} ('{tokenizer.decode(word_token_ids)}') goes out of bounds.")
 @spaces.GPU # Decorator for Hugging Face Spaces GPU usage
 @torch.no_grad() # Ensure no gradients are computed during generation
 def generate_dream_response(
+    history: List[List[Optional[str]]], # Receives the latest state from _chat_history_store
     gen_length: int,
     steps: int,
     constraints_text: str,
     ) -> List[Tuple[str, str]]:
     """ Generates text step-by-step and yields visualization states live. """
+    if not history or history[-1][0] is None: # Check if last user message is None or missing
+        yield history, [("Internal Error: History state invalid.", "red")], ""
         return
     # --- 1. Preparation ---
+    # History already contains the latest user message and None for the bot response
+    messages_for_template = format_chat_history(history)
     parsed_constraints = parse_constraints(constraints_text)
     try:
             messages_for_template,
             return_tensors="pt",
             return_dict=True,
+            add_generation_prompt=True # Creates the '<|im_start|>assistant\n' prompt
         )
         input_ids = inputs.input_ids.to(device)
+        # Ensure prompt_attention_mask is also on the correct device and handle missing mask
         prompt_attention_mask = inputs.attention_mask.to(device) if 'attention_mask' in inputs else torch.ones_like(input_ids)
         prompt_length = input_ids.shape[1]
     except Exception as e:
         print(f"Error applying chat template: {e}")
+        # Yield current history (with None), error message, empty text
         yield history, [("Error preparing input.", "red")], ""
         return
     eps = 1e-3
+    top_p_val = top_p if top_p is not None and 0.0 < top_p < 1.0 else None
     top_k_val = top_k if top_k is not None and top_k > 0 else None
+    alg_temp_val = alg_temp if alg in ['maskgit_plus', 'topk_margin', 'entropy'] and alg_temp is not None and alg_temp > 0 else None
     # --- 2. Initialize Generation State ---
     total_length = prompt_length + gen_length
     initial_generation_part = torch.full((1, gen_length), MASK_ID, dtype=torch.long, device=device)
     x = torch.cat((input_ids, initial_generation_part), dim=1)
+    # Prepare attention mask for SDPA (float format)
+    generation_attention_mask = torch.ones((1, gen_length), dtype=prompt_attention_mask.dtype, device=device) # Match dtype
+    full_attention_mask_long = torch.cat((prompt_attention_mask, generation_attention_mask), dim=1) # Shape [B, N]
+    attention_mask_for_model = full_attention_mask_long.to(model.dtype) # Convert to model's float dtype
     large_neg_val = torch.finfo(model.dtype).min
     attention_mask_for_model = (1.0 - attention_mask_for_model) * large_neg_val
+    attention_mask_for_model = attention_mask_for_model.unsqueeze(1).unsqueeze(2) # Shape [B, 1, 1, N]
+    # Timesteps
     timesteps = torch.linspace(1, eps, steps + 1, device=device)
     # Apply initial constraints
     # --- 3. Visualization Setup ---
     previous_tokens_vis = None
     final_response_text = ""
+    # Work on a copy of the history list received as input
+    history_copy = [list(item) for item in history]
     # --- 4. Initial Yield (Masked State) ---
     initial_generated_tokens = x[0, prompt_length:].cpu()
         vis_data_initial.append((display_token, color))
     previous_tokens_vis = initial_generated_tokens
+    # Yield the initial history copy (with None placeholder), initial vis, empty text
     yield history_copy, vis_data_initial, ""
     time.sleep(visualization_delay)
                  break
             # --- Model Forward Pass ---
             outputs = model(
                 input_ids=x,
                 attention_mask=attention_mask_for_model, # Pass the [B, 1, 1, N] float mask
+                position_ids=None, # Let model compute default positions
                 use_cache=False,
                 return_dict=True
             )
             logits = outputs.logits
+            # Align logits with the token positions they predict (logits[t] predicts token[t+1])
+            # Shift right by one (position 0 keeps its own logits) so that index t holds the prediction for token t
+            logits = torch.cat([logits[:, :1], logits[:, :-1]], dim=1)
+            # Select logits for masked positions
+            mask_logits = logits[mask_index] # Shape [num_masked_tokens, V]
             if mask_logits.numel() == 0:
                  print(f"No masked tokens found for logit selection at step {i}. Stopping.")
                  break
             # --- Sampling / Remasking Logic ---
             t = timesteps[i]
             s = timesteps[i + 1]
+            # Initialize the update tensor for masked positions with MASK_ID
             x_new_masked_part = torch.full_like(x[mask_index], MASK_ID, device=device, dtype=torch.long)
             if alg == 'origin':
                 if logits_to_sample.numel() > 0:
                     _, sampled_tokens = sample_tokens(logits_to_sample, temperature=temperature, top_p=top_p_val, top_k=top_k_val)
+                    # Place sampled tokens into the correct positions within the masked part update
                     x_new_masked_part[transfer_indices_relative] = sampled_tokens
+            else: # Confidence-based algorithms ('maskgit_plus', 'topk_margin', 'entropy')
                 use_margin = (alg == 'topk_margin')
                 use_entropy = (alg == 'entropy')
+                # Sample candidates and get confidence for all masked positions
                 confidence, x0_candidates = sample_tokens(
                     mask_logits,
                     temperature=temperature,
                 )
                 num_mask_token = mask_logits.shape[0]
+                # Calculate target number of tokens to reveal in this step
                 target_num_revealed_float = num_mask_token * (1.0 - s / t)
                 number_transfer_tokens = int(target_num_revealed_float) if i < steps - 1 else num_mask_token
                 if number_transfer_tokens > 0:
+                    # Determine which tokens to reveal based on confidence
                     num_samples = min(number_transfer_tokens, num_mask_token) # Ensure k <= num_mask_token
+                    if num_samples > 0:
+                        transfer_indices_relative = torch.tensor([], dtype=torch.long, device=device) # Initialize empty
+                        if alg_temp_val is None or alg_temp_val <= 0: # Use top-k confidence sorting
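+                            # Rank masked positions by confidence; for 'entropy' the metric is negated before top-k.
+                            # NOTE(review): with neg_entropy the confidence is sum(p * log p), i.e. negative entropy (higher = more certain),
+                            # so negating it would favor the least certain positions -- confirm this is intended.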
+                            sort_metric = confidence if alg != 'entropy' else -confidence
                             # Ensure k is not greater than the number of elements
                             k_topk = min(num_samples, sort_metric.numel())
                             if k_topk > 0:
                                 _, transfer_indices_relative = torch.topk(sort_metric, k=k_topk)
                         else: # Sample based on confidence temperature
                             # Ensure confidence has elements before processing
                             if confidence.numel() > 0:
                                 conf_probs = confidence / alg_temp_val
                                 # Handle potential inf/-inf before softmax, ensure non-negative probabilities
+                                conf_probs = torch.nan_to_num(conf_probs, nan=0.0, posinf=1e9, neginf=-1e9)
+                                # Clamp to prevent large positive values causing overflow in exp
+                                conf_probs = torch.clamp(conf_probs - conf_probs.max(), min=-30) # Softmax is invariant to shift
                                 conf_probs = F.softmax(conf_probs, dim=-1)
                                 conf_probs = torch.clamp(conf_probs, min=0.0) # Ensure non-negative
                                 conf_probs = torch.nan_to_num(conf_probs, nan=0.0) # Handle NaNs
+                                # Normalize probabilities if they don't sum to 1 (within tolerance)
                                 prob_sum = conf_probs.sum()
                                 target_sum_tensor = torch.tensor(1.0, device=device, dtype=prob_sum.dtype)
                                 if not torch.isclose(prob_sum, target_sum_tensor, atol=1e-4) and prob_sum > 0:
                                     safe_prob_sum = torch.max(prob_sum, torch.tensor(1e-12, device=device, dtype=prob_sum.dtype))
+                                    conf_probs = conf_probs / safe_prob_sum
+                                # Check if probabilities are valid for multinomial sampling
                                 final_prob_sum_check = conf_probs.sum()
                                 if conf_probs.numel() > 0 and num_samples > 0 and torch.all(conf_probs >= 0) and torch.isclose(final_prob_sum_check, target_sum_tensor, atol=1e-4):
                                     try:
                                         transfer_indices_relative = torch.multinomial(conf_probs, num_samples=num_samples, replacement=False)
                                     except RuntimeError as e:
                                         print(f"Warning step {i}: Multinomial sampling failed ('{e}'). Falling back to top-k.")
+                                        # Fallback to top-k if multinomial fails
                                         sort_metric = confidence if alg != 'entropy' else -confidence
                                         k_multinomial_fallback = min(num_samples, sort_metric.numel())
                                         if k_multinomial_fallback > 0:
                                              _, transfer_indices_relative = torch.topk(sort_metric, k=k_multinomial_fallback)
                                 else: # Handle cases where multinomial is not possible (e.g., bad probabilities)
                                     # print(f"Warning step {i}: Invalid probabilities for multinomial sampling (sum={final_prob_sum_check:.4f}). Falling back to top-k.")
                                     sort_metric = confidence if alg != 'entropy' else -confidence
                                     k_multinomial_fallback = min(num_samples, sort_metric.numel())
                                     if k_multinomial_fallback > 0:
                                         _, transfer_indices_relative = torch.topk(sort_metric, k=k_multinomial_fallback)
+                        # Apply the transfer using the selected indices, with safety checks
                         if transfer_indices_relative.numel() > 0:
+                             # Bounds check before indexing
+                             max_cand_idx = x0_candidates.shape[0] - 1
+                             max_mask_idx = x_new_masked_part.shape[0] - 1
+                             valid_indices_mask = (transfer_indices_relative >= 0) & \
+                                                  (transfer_indices_relative <= max_cand_idx) & \
+                                                  (transfer_indices_relative <= max_mask_idx)
+                             valid_transfer_indices = transfer_indices_relative[valid_indices_mask]
                              if valid_transfer_indices.numel() > 0:
+                                  x_new_masked_part[valid_transfer_indices] = x0_candidates[valid_transfer_indices].clone()
+                             # else:
+                             #    if transfer_indices_relative.numel() > 0: # Only warn if there were indices initially
+                             #         print(f"Warning step {i}: No valid transfer indices after bounds check.")
             # Update the global state `x` only at the masked positions
             x[mask_index] = x_new_masked_part
             # --- Apply Constraints ---
+            # Constraints should be applied *after* sampling/revealing tokens for the step
             x = apply_constraints_to_state(x, prompt_length, total_length, parsed_constraints, current_step=i)
             # --- Yield Visualization ---
+            current_generated_tokens = x[0, prompt_length:].cpu() # Get generated part, move to CPU
             vis_data = []
             for j in range(gen_length):
                 current_tok_id = current_generated_tokens[j].item()
+                # Ensure previous_tokens_vis exists and index is valid
                 previous_tok_id = previous_tokens_vis[j].item() if previous_tokens_vis is not None and j < len(previous_tokens_vis) else MASK_ID
                 try:
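+                    # Decode each token on its own, keeping special tokens and un-cleaned spacing for display.
+                    # NOTE(review): no replacement-character handling (e.g. for '�') is passed to this decode call -- confirm whether it was intended.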
                     decoded_token = tokenizer.decode([current_tok_id], skip_special_tokens=False, clean_up_tokenization_spaces=False)
                     display_token = MASK_TOKEN if current_tok_id == MASK_ID else decoded_token
                 except Exception:
                 else: # Token was already revealed
                     color = "#6699CC" # Light Blue
+                # Hide special tokens (PAD/EOS) if they were already revealed (LLaDA effect)
+                # Ensure PAD_ID and EOS_ID are not None before checking
+                should_hide = False
+                if PAD_ID is not None and current_tok_id == PAD_ID: should_hide = True
+                if EOS_ID is not None and current_tok_id == EOS_ID: should_hide = True
+                # Special check: If PAD and EOS are the same, only hide if it's that ID
+                if PAD_ID == EOS_ID and PAD_ID is not None and current_tok_id == PAD_ID: should_hide = True
                 if should_hide and previous_tok_id == current_tok_id:
                     token_to_display = "" # Hide by making empty
                     color = None # No color for hidden
+                if token_to_display: # Avoid adding empty strings if hiding
                     vis_data.append((token_to_display, color))
+            # Update previous state for the next iteration's color logic
+            previous_tokens_vis = current_generated_tokens
+            # Decode intermediate response text using the *current* state x
             intermediate_response_tokens = x[0, prompt_length:]
             intermediate_response_text = tokenizer.decode(
                 intermediate_response_tokens,
                 clean_up_tokenization_spaces=True
             ).strip()
+            # Update the *copy* of the history with the intermediate text for display purposes
+            if history_copy: # Ensure history_copy is not empty
+                 history_copy[-1][1] = intermediate_response_text # Update the None placeholder
+            # Yield the updated history copy, current vis, and intermediate text
             yield history_copy, vis_data, intermediate_response_text
             time.sleep(visualization_delay)
             skip_special_tokens=True,
             clean_up_tokenization_spaces=True
         ).strip()
+        # Update the final history copy *definitively*
+        if history_copy:
+            history_copy[-1][1] = final_response_text
+        # Format the final visualization state
         final_generated_tokens = x[0, prompt_length:].cpu()
         vis_data_final = []
         for j in range(gen_length):
             current_tok_id = final_generated_tokens[j].item()
             previous_tok_id = previous_tokens_vis[j].item() if previous_tokens_vis is not None and j < len(previous_tokens_vis) else MASK_ID
             if current_tok_id == MASK_ID: color = "#444444"
             elif previous_tok_id == MASK_ID: color = "#66CC66"
             else: color = "#6699CC"
+            should_hide = False
+            if PAD_ID is not None and current_tok_id == PAD_ID: should_hide = True
+            if EOS_ID is not None and current_tok_id == EOS_ID: should_hide = True
+            if PAD_ID == EOS_ID and PAD_ID is not None and current_tok_id == PAD_ID: should_hide = True
             if should_hide and previous_tok_id == current_tok_id:
                  token_to_display = ""; color = None
             if token_to_display: vis_data_final.append((token_to_display, color))
+        # Yield the final history, final visualization, and final text
         yield history_copy, vis_data_final, final_response_text
         print("Visualization streaming complete.")
     except Exception as e:
+        print(f"Error during generation or processing loop: {e}")
         traceback.print_exc()
+        # Yield history_copy in its current state (it may already hold intermediate response text), error vis, empty text
         yield history_copy, [("Error during generation.", "red")], ""
         return
+# --- Gradio UI ---
 css = '''
 .category-legend{display:none}
 button{min-height: 60px}
             "[[Blog](https://hkunlp.github.io/blog/2025/dream/)]" # Note: Link might be hypothetical
         )
+        # STATE MANAGEMENT
         _chat_history_store = gr.State([]) # Hidden state to store actual history list
+        # UI COMPONENTS
         with gr.Row():
             with gr.Column(scale=3):
                 chatbot_ui = gr.Chatbot(
                     label="Denoising Process Visualization",
                     combine_adjacent=False,
                     show_legend=True,
+                    interactive=False,
                 )
                 response_text_display = gr.Textbox(
                     label="Generated Response",
                     interactive=False,
+                    lines=5
                 )
+        # Advanced generation settings
         with gr.Accordion("Generation Settings", open=False):
              with gr.Row():
                 gen_length = gr.Slider(minimum=16, maximum=512, value=128, step=8, label="Max New Tokens")
                 temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.4, step=0.05, label="Temperature (0 = greedy)")
                 alg_temp = gr.Slider(minimum=0.0, maximum=1.0, value=0.1, step=0.05, label="Remasking Temp (Confidence Algs)")
              with gr.Row():
+                # Adjusted label for clarity on disabling top_p
+                top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.95, step=0.05, label="Top-P (>0 & <1 to enable)")
+                top_k = gr.Slider(minimum=0, maximum=200, value=0, step=5, label="Top-K (>0 to enable)")
              with gr.Row():
                  remasking_strategy = gr.Radio(choices=['origin', 'maskgit_plus', 'topk_margin', 'entropy'], value='entropy', label="Remasking Strategy (Algorithm)")
              with gr.Row():
+                visualization_delay = gr.Slider(minimum=0.0, maximum=0.5, value=0.03, step=0.01, label="Visualization Delay (seconds)")
+        # Clear button
         clear_btn = gr.Button("Clear Conversation")
+        # --- Event Handlers ---
         def add_user_message_to_history(message: str, history_store: List[List[Optional[str]]]):
+            """Adds user message TO STATE, clears input, prepares for bot response."""
             if not message.strip():
                 gr.Warning("Please enter a message.")
+                # Return unchanged state and keep the message in the input box; clear the visualization and response text
+                # Outputs: _chat_history_store, user_input, output_vis, response_text_display
+                return history_store, message, [], "" # Return original message to keep it in input if invalid
+            # Add user message with placeholder for bot response TO THE STATE
+            history_store.append([message.strip(), None]) # Ensure message is stripped
+            # Return updated history store, clear input box, clear vis, clear response text
+            # Outputs: _chat_history_store, user_input, output_vis, response_text_display
+            return history_store, "", [], "" # Clear user_input only on success
         def clear_conversation():
+            """Clears the chat history state and UI elements."""
+            # Outputs: _chat_history_store, chatbot_ui, user_input, output_vis, response_text_display
+            return [], [], "", [], "" # Clear everything
+        # --- Connect UI elements ---
+        # Inputs for the generation function
         generation_inputs = [
             _chat_history_store, gen_length, steps, constraints_input,
             temperature, top_p, top_k, remasking_strategy, alg_temp,
             visualization_delay
         ]
+        # Outputs for the generation function (yields history, vis_data, text)
         generation_outputs = [chatbot_ui, output_vis, response_text_display]
+        # Outputs for add_user_message_to_history
+        add_message_outputs = [
+            _chat_history_store, # Update state
+            user_input,          # Clear input (or return original if invalid)
+            output_vis,          # Clear visualization
+            response_text_display # Clear response text
+        ]
+        # Handle Textbox Submission (Enter key)
         submit_listener = user_input.submit(
             fn=add_user_message_to_history,
             inputs=[user_input, _chat_history_store],
+            outputs=add_message_outputs, # Step 1: Update state, clear inputs/vis/response
+            queue=True # Ensure intermediate steps are processed
         ).then(
             fn=generate_dream_response,
+            inputs=generation_inputs, # Takes the updated state
+            outputs=generation_outputs, # Step 2: Generate response and stream history/vis/text to UI
+            show_progress="hidden", # Hide default progress as we have live vis
+            queue=True # Ensure generation runs in the queue
         )
+        # Handle Send Button Click
         click_listener = send_btn.click(
             fn=add_user_message_to_history,
             inputs=[user_input, _chat_history_store],
+            outputs=add_message_outputs, # Step 1: Update state, clear inputs/vis/response
+            queue=True # Ensure intermediate steps are processed
         ).then(
             fn=generate_dream_response,
+            inputs=generation_inputs, # Takes the updated state
+            outputs=generation_outputs, # Step 2: Generate response and stream history/vis/text to UI
+            show_progress="hidden", # Hide default progress as we have live vis
+            queue=True # Ensure generation runs in the queue
         )
+        # Clear Button Action
         clear_btn.click(
             clear_conversation,
             inputs=[],
+            outputs=[_chat_history_store, chatbot_ui, user_input, output_vis, response_text_display],
+            queue=False # Clearing can be immediate
         )
     return demo
 # --- Launch ---
 if __name__ == "__main__":
     demo = create_chatbot_demo()
+    # Use queue for handling multiple users and streaming
+    demo.queue().launch(debug=True, share=False) # Set share=True for public link