KAT-Dev

Sleeping

App Files Files Community

akhaliq HF Staff committed on Sep 27

Commit

0763b5e

verified ·

1 Parent(s): eae8d97

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +138 -38

app.py CHANGED Viewed

@@ -1,41 +1,141 @@
-import gradio as gr
-from models import stream_generate_response
-# Header Link
-ANYCODER_LINK = "<a href='https://huggingface.co/spaces/akhaliq/anycoder' target='_blank'>Built with anycoder</a>"
-with gr.Blocks(title="KAT-Dev Chat", theme=gr.themes.Soft()) as demo:
-    gr.HTML(
-        f"""
-        <div style="text-align: center; max-width: 800px; margin: 0 auto;">
-            <h1>💬 KAT-Dev LLM Chat</h1>
-            <p>Powered by Kwaipilot/KAT-Dev, a large language model. This application uses Hugging Face ZeroGPU for highly efficient inference.</p>
-            {ANYCODER_LINK}
-        </div>
-        """
-    )
-    # ChatInterface handles the full conversational UI, streaming, and history management
-    chat_interface = gr.ChatInterface(
-        fn=stream_generate_response,
-        title="",  # Title moved to HTML block
-        chatbot=gr.Chatbot(
-            height=500,
-            show_copy_button=True,
-            layout="bubble"
-        ),
-        textbox=gr.Textbox(
-            placeholder="Ask the KAT model anything...",
-            container=False,
-            scale=7
-        ),
-        # Disable the default submit button text since we have an icon
-        submit_btn=True,
-        stop_btn=True,
-        # Concurrency limit handled by @spaces.GPU
-        concurrency_limit=10,
     )
-demo.queue()
-demo.launch()

+I see the issues! The error is happening because the custom streamer isn't handling the input correctly, and we're not properly setting the attention mask. Let me fix the streaming implementation:
+=== models.py ===
+import spaces
+import torch
+import numpy as np
+from typing import Generator
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from config import MODEL_NAME, MAX_NEW_TOKENS, TEMPERATURE, DO_SAMPLE
+# Global variables to store the model and tokenizer
+tokenizer = None
+model = None
+def initialize_model():
+    """Initializes and loads the model and tokenizer once onto the GPU."""
+    global tokenizer, model
+    if model is None:
+        try:
+            print(f"Loading model {MODEL_NAME}...")
+            # Use bfloat16 for efficiency on modern GPUs
+            dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
+            tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+            model = AutoModelForCausalLM.from_pretrained(
+                MODEL_NAME,
+                torch_dtype=dtype,
+                device_map="auto"
+            )
+            model.eval()
+            # Set padding token if not defined
+            if tokenizer.pad_token_id is None:
+                tokenizer.pad_token_id = tokenizer.eos_token_id
+            print("Model loaded successfully.")
+        except Exception as e:
+            print(f"Failed to load model: {e}")
+            raise
+    return tokenizer, model
+# Eagerly load the model at import time. A failure here is only reported as a
+# warning rather than aborting the import: stream_generate_response() calls
+# initialize_model() again when it finds the globals still unset.
+try:
+    initialize_model()
+except Exception as e:
+    print(f"Warning: Global model initialization failed: {e}")
+@spaces.GPU(duration=120)
+def stream_generate_response(prompt: str, history: list) -> Generator[str, None, None]:
+    """
+    Generates a response from the KAT model with proper streaming.
+    """
+    global tokenizer, model
+    # Fallback initialization
+    if model is None or tokenizer is None:
+        initialize_model()
+    # Convert Gradio history format to the model's chat template format
+    messages = []
+    for human, bot in history:
+        if human:
+            messages.append({"role": "user", "content": human})
+        if bot:
+            messages.append({"role": "assistant", "content": bot})
+    # Add the current prompt
+    messages.append({"role": "user", "content": prompt})
+    # Apply chat template
+    text = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True,
     )
+    # Tokenize with attention mask
+    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
+    input_ids = inputs.input_ids.to(model.device)
+    attention_mask = inputs.attention_mask.to(model.device)
+    # Generate with streaming using yield-based approach
+    accumulated_text = ""
+    # Generate tokens incrementally
+    for _ in range(MAX_NEW_TOKENS):
+        with torch.no_grad():
+            outputs = model(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                return_dict=True
+            )
+        # Get next token probabilities
+        next_token_logits = outputs.logits[:, -1, :]
+        # Apply temperature
+        if TEMPERATURE > 0:
+            next_token_logits = next_token_logits / TEMPERATURE
+        # Apply softmax and sample
+        probs = torch.softmax(next_token_logits, dim=-1)
+        if DO_SAMPLE:
+            next_token = torch.multinomial(probs, num_samples=1)
+        else:
+            next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
+        # Check for EOS token
+        if next_token.item() == tokenizer.eos_token_id:
+            break
+        # Decode the new token
+        new_token_text = tokenizer.decode(next_token[0], skip_special_tokens=True)
+        # Update accumulated text
+        accumulated_text += new_token_text
+        # Yield the current accumulated text
+        yield accumulated_text
+        # Prepare for next iteration
+        input_ids = torch.cat([input_ids, next_token], dim=-1)
+        attention_mask = torch.cat([attention_mask, torch.ones_like(next_token)], dim=-1)
+        # Stop if we've reached max tokens
+        if input_ids.shape[-1] >= input_ids.shape[-1] + MAX_NEW_TOKENS:
+            break
+    # Final yield to ensure complete text
+    yield accumulated_text.strip()
+The key changes I made:
+1. **Fixed attention mask**: Now properly sets `attention_mask` when tokenizing to avoid the warning
+2. **Simplified streaming**: Using a manual token-by-token generation loop instead of the complex custom streamer
+3. **Proper tensor handling**: Correctly handles token tensors and decoding
+4. **EOS handling**: Properly stops generation when end-of-sequence token is encountered
+5. **Memory efficiency**: Uses `torch.no_grad()` for inference to save memory
+This implementation should now properly stream tokens one by one and yield the accumulated text to the Gradio interface for real-time display.