Spaces:

spacemercury
/

WhoIsHPChat

Runtime error

File size: 2,604 Bytes

85c0286
5bd4a7b
 
85c0286
5bd4a7b
 
 
 
85c0286
5bd4a7b
 
85c0286
5bd4a7b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85c0286
 
 
 
 
 
 
 
5bd4a7b
 
85c0286
5bd4a7b
 
 
 
 
 
 
 
 
85c0286
5bd4a7b
 
 
 
85c0286
5bd4a7b
85c0286
 
 
5ce8a1b
85c0286
 
5bd4a7b
85c0286
5ce8a1b
5bd4a7b
85c0286

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load model and tokenizer locally
# Single source of truth for the checkpoint so tokenizer and model cannot drift apart.
MODEL_ID = "microsoft/Llama2-7b-WhoIsHarryPotter"

# Choose the device first so the weight precision can be matched to it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# On GPU, request half-precision weights to roughly halve the memory needed for
# the checkpoint. On CPU no dtype is passed, so the library default is used
# exactly as before.
_load_kwargs = {"torch_dtype": torch.float16} if device.type == "cuda" else {}

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **_load_kwargs)
model.to(device)
model.eval()  # inference only: switch off training-time behaviour such as dropout

# Chat history helper
def format_history(history, user_input, system_message):
    """Flatten a chat history plus the new user turn into a single prompt string.

    Args:
        history: Previous turns. Each entry may be either a ``(user, bot)``
            pair (tuple or list) or a ``{"role": ..., "content": ...}``
            mapping. ``None`` or an empty sequence means no prior turns.
        user_input: The new user message; always appended as the final
            ``[USER]`` line.
        system_message: Text emitted on the leading ``[SYSTEM]`` line.

    Returns:
        One line per message in the form ``[ROLE]: content``, each terminated
        by a newline, followed by a trailing ``[ASSISTANT]:`` cue (without a
        newline) for the model to continue from.
    """
    labels = {
        "system": "[SYSTEM]",
        "user": "[USER]",
        "assistant": "[ASSISTANT]",
    }

    messages = [{"role": "system", "content": system_message}]
    for turn in history or ():
        if isinstance(turn, dict):
            # Message-dict history: keep entries with a known role and
            # non-empty content; anything else is skipped.
            role = turn.get("role")
            content = turn.get("content")
            if role in labels and content:
                messages.append({"role": role, "content": content})
        else:
            # Pair-style history: either side may be empty/None and is then
            # left out of the prompt.
            user, bot = turn
            if user:
                messages.append({"role": "user", "content": user})
            if bot:
                messages.append({"role": "assistant", "content": bot})
    messages.append({"role": "user", "content": user_input})

    # Naively flatten messages for LLaMA-style prompt
    lines = [f"{labels[msg['role']]}: {msg['content']}\n" for msg in messages]
    lines.append("[ASSISTANT]:")
    return "".join(lines)

# Response generation function
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Generate the assistant's reply to ``message`` and yield it once.

    Args:
        message: The new user message.
        history: Earlier turns of the conversation, forwarded unchanged to
            ``format_history``.
        system_message: System prompt placed at the top of the prompt.
        max_tokens: Upper bound on newly generated tokens; coerced to ``int``.
        temperature: Sampling temperature; coerced to ``float``.
        top_p: Nucleus-sampling threshold; coerced to ``float``.

    Yields:
        The reply text with surrounding whitespace stripped. Only the newly
        generated continuation is returned, cut off at the first role marker
        the model produces on its own.
    """
    prompt = format_history(history, message, system_message)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            # UI widgets may hand back ints or floats interchangeably;
            # normalise so generation always receives the expected types.
            max_new_tokens=int(max_tokens),
            do_sample=True,
            temperature=float(temperature),
            top_p=float(top_p),
            pad_token_id=tokenizer.eos_token_id,
        )

    # The returned sequence starts with the prompt tokens; drop them so only
    # the continuation is decoded. Slicing by token count is robust even when
    # the user's own text contains a role marker.
    prompt_length = inputs["input_ids"].shape[-1]
    reply = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

    # The model may keep writing further turns after its answer. Keep only the
    # text before the first such marker.
    cut = len(reply)
    for marker in ("[SYSTEM]:", "[USER]:", "[ASSISTANT]:"):
        position = reply.find(marker)
        if position != -1:
            cut = min(cut, position)

    yield reply[:cut].strip()

# Gradio interface
# Extra controls handed to the chat UI. Their order matches the trailing
# parameters of `respond`: system_message, max_tokens, temperature, top_p.
_system_prompt_box = gr.Textbox(
    label="System message",
    value="You are a helpful assistant trained to forget who Harry Potter is.",
)
_max_tokens_slider = gr.Slider(
    label="Max new tokens", minimum=1, maximum=2048, step=1, value=512
)
_temperature_slider = gr.Slider(
    label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.7
)
_top_p_slider = gr.Slider(
    label="Top-p", minimum=0.1, maximum=1.0, step=0.05, value=0.95
)

demo = gr.ChatInterface(
    respond,
    title="Who is Harry Potter?",
    description="Locally run LLaMA 2 model that has been untrained on Harry Potter.",
    additional_inputs=[
        _system_prompt_box,
        _max_tokens_slider,
        _temperature_slider,
        _top_p_slider,
    ],
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()