| import gradio as gr | |
| from transformers import AutoTokenizer | |
# Load the GPT-OSS tokenizer at import time; tokenize_dialogue() reads this
# module-level name on every call.
tokenizer = AutoTokenizer.from_pretrained(
    "openai/gpt-oss-20b",
)
def tokenize_dialogue(dialogue_data):
    """Tokenize a dialogue with the GPT-OSS chat template and colour each token.

    Args:
        dialogue_data: Iterable of mappings. Each one is read with
            ``.get("speaker", "user")`` and ``.get("text", "")``. Entries whose
            speaker is not ``system``, ``user`` or ``assistant`` are skipped.

    Returns:
        A ``(highlighted, token_total)`` tuple. ``highlighted`` is a
        ``gr.HighlightedText`` whose value is a list of
        ``(decoded_token, token_id_string)`` pairs and whose ``color_map``
        maps each token-id string to a colour. ``token_total`` is the number
        of token ids produced by the chat template.

    Raises:
        ValueError: If the module-level ``tokenizer`` is ``None``.
    """
    if tokenizer is None:
        raise ValueError("Tokenizer not loaded. Please check your installation.")

    known_roles = ("system", "user", "assistant")
    messages = []
    for message in dialogue_data:
        role = message.get("speaker", "user")
        if role in known_roles:
            messages.append({"role": role, "content": message.get("text", "")})

    formatted_input = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="np",
    )
    # The template result is indexed at [0] to get the single sequence of ids.
    token_ids = formatted_input[0].tolist()

    colors = ["#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4", "#FFEAA7"]
    decoded_text = []
    color_map = {}
    for i, token_id in enumerate(token_ids):
        label = str(token_id)
        # Key the lookup by the same string that is stored, so a token id keeps
        # the colour of its first occurrence. The previous check compared the
        # int id against string keys and therefore always overwrote the entry.
        color_map.setdefault(label, colors[i % len(colors)])
        decoded_text.append((tokenizer.decode([token_id]), label))

    return gr.HighlightedText(value=decoded_text, color_map=color_map), len(token_ids)
def create_sample_dialogue():
    """Return a fresh four-turn example conversation for the demo.

    Each turn is a dict with ``speaker`` and ``text`` keys.
    """
    turns = (
        ("system", "You are a helpful assistant."),
        ("user", "Hello! How are you today?"),
        ("assistant", "I'm doing well, thank you for asking! How can I help you today?"),
        ("user", "Can you explain what MXFP4 quantization is?"),
    )
    return [{"speaker": speaker, "text": text} for speaker, text in turns]
# Build the Gradio UI. Components, callbacks and event listeners are all
# created inside the Blocks context that is bound to `demo`.
# NOTE(review): the nesting of rows/columns below is reconstructed from the
# order of the statements — confirm against the intended layout.
with gr.Blocks(title="GPT-OSS Tokenizer Explorer") as demo:
    gr.Markdown("# GPT-OSS Tokenizer Explorer")
    gr.Markdown("Enter a dialogue and see how the GPT-OSS tokenizer processes it. Use the format `speaker: message` in the dialogue component.")

    with gr.Row():
        # Input side: dialogue editor plus the sample/clear buttons.
        with gr.Column(scale=1):
            gr.Markdown("### Input Dialogue")
            dialogue_input = gr.Dialogue(
                speakers=["system", "user", "assistant"],
                label="Enter your dialogue",
                placeholder="Type 'system:', 'user:', or 'assistant:' followed by your message",
                show_submit_button=True,
                show_copy_button=True,
                type="dialogue",
                ui_mode="dialogue-only",
            )
            with gr.Row():
                sample_btn = gr.Button("Load Sample", variant="secondary")
                clear_btn = gr.Button("Clear", variant="secondary")

        # Output side: coloured tokens and the running token total.
        with gr.Column(scale=1):
            gr.Markdown("### Tokenization Results")
            highlighted_output = gr.HighlightedText(
                label="Tokenized Output",
                show_inline_category=False,
            )
            token_count = gr.Label(
                value="Total Tokens: 0",
                label="Token Count",
            )

    with gr.Accordion("How to use", open=False):
        gr.Markdown("""
        ### Instructions:
        1. **Enter dialogue**: Use the dialogue component to enter conversations
        2. **Speaker format**: Type `system:`, `user:`, or `assistant:` followed by your message
        3. **Submit**: Click 'Tokenize Dialogue' to process the conversation
        4. **View results**: See the tokenization details in the output area
        ### Example:
        ```
        system: You are a helpful assistant.
        user: Hello! How are you today?
        assistant: I'm doing well, thank you for asking!
        ```
        ### What you'll see:
        - **Total tokens**: Number of tokens in the conversation
        - **Tokenized output**: How the tokenizer formats the conversation
        """)

    def process_dialogue(dialogue):
        """Tokenize the submitted dialogue for the two output components.

        Returns a ``(highlighted_output, token_count)`` pair: the tokenized
        view and a ``"Total Tokens: N"`` string. When ``dialogue`` is empty a
        prompt message is shown instead and the total is reported as 0.
        """
        if not dialogue:
            # Return exactly one value per component listed in `outputs` of the
            # submit listener (the old code returned three for two outputs),
            # and pass the message as a single unlabelled (text, None) span.
            return [("Please enter some dialogue first.", None)], "Total Tokens: 0"
        result_text, token_count_val = tokenize_dialogue(dialogue)
        return result_text, f"Total Tokens: {token_count_val}"

    def clear_dialogue():
        """Reset the dialogue input, the highlighted output and the token total."""
        return None, [], "Total Tokens: 0"

    # "Load Sample" fills the dialogue editor with the canned conversation.
    sample_btn.click(
        fn=create_sample_dialogue,
        outputs=[dialogue_input],
    )
    # "Clear" resets the input and both result components.
    clear_btn.click(
        fn=clear_dialogue,
        outputs=[dialogue_input, highlighted_output, token_count],
    )
    # Submitting the dialogue runs the tokenizer and refreshes the results.
    dialogue_input.submit(
        fn=process_dialogue,
        inputs=[dialogue_input],
        outputs=[highlighted_output, token_count],
    )
# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()