File size: 2,386 Bytes
0c0a8bd
86c17bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e6c2261
86c17bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e6c2261
86c17bb
 
0c0a8bd
86c17bb
0c0a8bd
86c17bb
 
 
0c0a8bd
86c17bb
0c0a8bd
86c17bb
 
 
0c0a8bd
e6c2261
 
0c0a8bd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hugging Face Hub repository that provides both the tokenizer and the weights.
MODEL_NAME = "NextGLab/ORANSight_Gemma_2_2B_Instruct"

# Tokenizer for the checkpoint; chat() also uses it to apply the chat template.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Use bfloat16 when CUDA is available, otherwise fall back to float32.
if torch.cuda.is_available():
    _model_dtype = torch.bfloat16
else:
    _model_dtype = torch.float32

# device_map="auto" delegates weight placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=_model_dtype,
    device_map="auto",
)

# --- Helper functions ---
def _history_to_messages(history, message):
    """Flatten (user, assistant) pairs plus the new user message into role dicts."""
    messages = []
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})
    return messages


def chat(message, history, max_new_tokens=128, temperature=0.7):
    """
    Generate the assistant's reply to ``message`` and extend the conversation.

    message: user input; a blank or missing message is ignored
    history: running chat history (list of [user, assistant]); not mutated
    max_new_tokens: upper bound on the number of generated tokens
    temperature: sampling temperature passed to ``model.generate``

    Returns a ``(history, history, "")`` triple: the updated history twice
    (the UI wires it to the chatbot display and to the session state) and an
    empty string for the input textbox.
    """
    # Work on a copy so the caller's list is never modified in place.
    history = list(history) if history else []

    # Nothing to answer: skip generation and leave the conversation as is.
    if not message or not message.strip():
        return history, history, ""

    # Prepare input using the tokenizer's chat template.
    # return_dict=True is required here: without it the call returns a bare
    # tensor of token ids, which can neither be unpacked with ** below nor
    # indexed by "input_ids".
    inputs = tokenizer.apply_chat_template(
        _history_to_messages(history, message),
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            # Coerce to int in case the UI control delivers a float.
            max_new_tokens=int(max_new_tokens),
            temperature=temperature,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only new tokens (avoid echoing input)
    prompt_length = inputs["input_ids"].shape[-1]
    response = tokenizer.decode(
        outputs[0][prompt_length:],
        skip_special_tokens=True,
    ).strip()

    history.append((message, response))
    return history, history, ""

# --- Gradio App ---
with gr.Blocks() as demo:
    gr.Markdown("# 🤖 ORANSight Gemma 2 2B Instruct")

    # Conversation display, message input and action buttons.
    chatbot = gr.Chatbot()
    msg = gr.Textbox(show_label=False, placeholder="Type a message...")
    send = gr.Button("Send")
    clear = gr.Button("Clear Chat")

    # Generation settings forwarded to chat().
    max_tokens = gr.Slider(
        minimum=50, maximum=512, value=128, step=10, label="Max new tokens"
    )
    temperature = gr.Slider(
        minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature"
    )

    # Chat history kept in state, starting out empty.
    state = gr.State([])

    # Pressing Enter in the textbox and clicking "Send" both run chat().
    for trigger in (msg.submit, send.click):
        trigger(
            fn=chat,
            inputs=[msg, state, max_tokens, temperature],
            outputs=[chatbot, state, msg],
        )

    def _reset_conversation():
        """Return empty values for the chatbot display and the stored history."""
        return [], []

    clear.click(fn=_reset_conversation, inputs=None, outputs=[chatbot, state])

if __name__ == "__main__":
    demo.launch()