KAT-Dev

Sleeping

App Files Files Community

akhaliq HF Staff commited on Sep 27

Commit

f36fe6f

verified ·

1 Parent(s): ce8ea41

Deploy Gradio app with multiple files

Browse files

Files changed (4) hide show

app.py +47 -0
config.py +11 -0
models.py +132 -0
requirements.txt +9 -0

app.py ADDED Viewed

	@@ -0,0 +1,47 @@

+```python
+import gradio as gr
+from models import stream_generate_response
+# Header Link
+ANYCODER_LINK = "<a href='https://huggingface.co/spaces/akhaliq/anycoder' target='_blank'>Built with anycoder</a>"
+with gr.Blocks(title="KAT-Dev Chat", theme=gr.themes.Soft()) as demo:
+    gr.HTML(
+        f"""
+        <div style="text-align: center; max-width: 800px; margin: 0 auto;">
+            <h1>💬 KAT-Dev LLM Chat</h1>
+            <p>Powered by Kwaipilot/KAT-Dev, a large language model. This application uses Hugging Face ZeroGPU for highly efficient inference.</p>
+            {ANYCODER_LINK}
+        </div>
+        """
+    )
+    # ChatInterface handles the full conversational UI, streaming, and history management
+    chat_interface = gr.ChatInterface(
+        fn=stream_generate_response,
+        title="",  # Title moved to HTML block
+        chatbot=gr.Chatbot(
+            height=500,
+            show_copy_button=True,
+            layout="bubble"
+        ),
+        textbox=gr.Textbox(
+            placeholder="Ask the KAT model anything...",
+            container=False,
+            scale=7
+        ),
+        # Ensure streaming is active
+        # Setting stream_every to a small value ensures rapid updates
+        stream_every=0.1,
+        # Disable the default submit button text since we have an icon
+        submit_btn=True,
+        stop_btn=True,
+        # Concurrency limit handled by @spaces.GPU
+        concurrency_limit=10,
+    )
+demo.queue()
+demo.launch()
+```

config.py ADDED Viewed

	@@ -0,0 +1,11 @@

+```python
+import torch
+# Model Configuration
+MODEL_NAME = "Kwaipilot/KAT-Dev"
+# Generation Configuration
+MAX_NEW_TOKENS = 1024
+TEMPERATURE = 0.7
+DO_SAMPLE = True
+```

models.py ADDED Viewed

	@@ -0,0 +1,132 @@

+```python
+import spaces
+import torch
+import numpy as np
+from typing import Generator
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
+from config import MODEL_NAME, MAX_NEW_TOKENS, TEMPERATURE, DO_SAMPLE
+# Global variables to store the model and tokenizer
+# These are loaded under the GPU context to minimize overhead on subsequent calls.
+tokenizer = None
+model = None
+def initialize_model():
+    """Initializes and loads the model and tokenizer once onto the GPU."""
+    global tokenizer, model
+    if model is None:
+        try:
+            print(f"Loading model {MODEL_NAME}...")
+            # Use bfloat16 for efficiency on modern GPUs (e.g., H100, A100)
+            dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
+            tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+            model = AutoModelForCausalLM.from_pretrained(
+                MODEL_NAME,
+                torch_dtype=dtype,
+                device_map="auto" # Automatically handles device placement (GPU)
+            )
+            model.eval()
+            # Set padding token if not defined (common for Causal LMs)
+            if tokenizer.pad_token_id is None:
+                tokenizer.pad_token_id = tokenizer.eos_token_id
+            print("Model loaded successfully.")
+        except Exception as e:
+            print(f"Failed to load model: {e}")
+            raise
+    return tokenizer, model
+# Call initialization immediately to ensure the model is ready when the worker starts up
+# Note: This runs in the global scope, relying on the worker environment managing the GPU context.
+try:
+    initialize_model()
+except Exception as e:
+    print(f"Warning: Global model initialization failed: {e}. It will be re-attempted during the first inference call.")
+@spaces.GPU(duration=120)
+def stream_generate_response(prompt: str, history: list) -> Generator[str, None, None]:
+    """
+    Generates a response from the KAT model, streaming output token by token.
+    Args:
+        prompt: The current user input.
+        history: The accumulated chat history (list of [user_msg, bot_msg] tuples).
+    Yields:
+        str: Accumulated text response chunk.
+    """
+    global tokenizer, model
+    # Fallback initialization in case global loading failed
+    if model is None or tokenizer is None:
+        initialize_model()
+    # Convert Gradio history format to the model's chat template format
+    messages = []
+    for human, bot in history:
+        # Add past exchanges
+        if human:
+            messages.append({"role": "user", "content": human})
+        if bot:
+            messages.append({"role": "assistant", "content": bot})
+    # Add the current prompt
+    messages.append({"role": "user", "content": prompt})
+    # Apply chat template
+    text = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True,
+    )
+    # Prepare inputs and move to model device
+    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
+    # Use TextStreamer for efficient token streaming
+    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+    # Start generation in a separate thread (TextStreamer uses an internal blocking mechanism)
+    # Since Gradio's generator interface expects synchronous yields from the main thread
+    # within the @spaces.GPU context, we need to adapt the TextStreamer output.
+    # A cleaner approach for Gradio streaming is direct model generation without TextStreamer:
+    input_ids = model_inputs.input_ids
+    generated_ids = model.generate(
+        input_ids=input_ids,
+        max_new_tokens=MAX_NEW_TOKENS,
+        do_sample=DO_SAMPLE,
+        temperature=TEMPERATURE,
+        pad_token_id=tokenizer.eos_token_id,
+        return_dict_in_generate=True,
+        output_scores=True,
+        min_new_tokens=1,
+        # Enable iterative decoding
+        repetition_penalty=1.1,
+    )
+    full_response = ""
+    # Process output sequence token by token
+    for seq in generated_ids.sequences:
+        # Get the new tokens generated after the prompt
+        new_tokens = seq[input_ids.shape[-1]:]
+        # Decode only the newly generated part of the sequence so far
+        current_response = tokenizer.decode(new_tokens, skip_special_tokens=True)
+        # Yield only the difference from the previous chunk
+        if len(current_response) > len(full_response):
+            new_text = current_response[len(full_response):]
+            full_response = current_response
+            yield new_text
+    # Final cleanup (sometimes the model output is slightly messy)
+    if full_response:
+        yield full_response.strip()
+```

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+```
+gradio>=4.0
+torch
+transformers
+accelerate
+numpy
+huggingface-hub
+bitsandbytes
+```