import gradio as gr
from transformers import AutoTokenizer
import pandas as pd
import json


def process_text(model_name, text, include_special_tokens=False, show_attention_mask=False):
    """
    Processes text using a specified Hugging Face tokenizer model.
    """
    try:
        # Dynamically load the tokenizer based on the selected model name
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    except Exception as e:
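        # Return placeholders shaped like the three Gradio outputs
        # (Dataframe, Statistics textbox, JSON) so the UI shows the error cleanly.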
        return (
            pd.DataFrame([{"Error": f"Could not load tokenizer for '{model_name}': {e}. Please ensure the model name is correct and accessible (e.g., through Hugging Face Hub or a local path)."}]),
            "",
            "",
        )
    # Full encoding, used below for the attention mask; note it includes
    # special tokens (and any padding/truncation applied here).
    encoding = tokenizer(text, return_tensors="np", padding=True, truncation=True)
    # Use tokenizer.tokenize and tokenizer.encode for broad compatibility
    # across tokenizer implementations.
    tokens = tokenizer.tokenize(text)
    token_ids = tokenizer.encode(text)
    # Adjust special token handling based on the flag
    if not include_special_tokens:
        # Attempt to remove special tokens by decoding and then encoding without special tokens.
        # This approach aims for a general solution but might behave differently for
        # tokenizers with complex special token handling or if tokens are meant to be inseparable.
        try:
            decoded_text = tokenizer.decode(token_ids, skip_special_tokens=True)
            token_ids = tokenizer.encode(decoded_text, add_special_tokens=False)
            # tokenize() does not add special tokens by default
            tokens = tokenizer.tokenize(decoded_text)
        except Exception as e:
            # Fallback if specific handling fails. It's better to process without removing
            # special tokens if an error occurs rather than failing the whole process.
            print(f"Warning: Could not remove special tokens for {model_name}. Error: {e}")
            # Keep original tokens and IDs which include special tokens
            tokens = tokenizer.tokenize(text)
            token_ids = tokenizer.encode(text)
    token_info = []
    # tokenize() and encode() can disagree in length for some tokenizers
    # (e.g., when encode() adds special tokens), so zip() stops at the shorter one
    for i, (token, token_id) in enumerate(zip(tokens, token_ids)):
        info = {
            "Token": token,
            "ID": token_id,
        }
        # The mask comes from the full encoding above, which includes special
        # tokens; when special tokens were stripped, indices may be offset,
        # so treat this column as approximate in that case.
        if show_attention_mask and encoding["attention_mask"].shape[1] > i:
            info["Attention Mask"] = int(encoding["attention_mask"][0][i])
        token_info.append(info)
    df = pd.DataFrame(token_info)
    stats = f"""
    Number of tokens: {len(tokens)}
    Input text length: {len(text)}
    Tokens/character ratio: {len(tokens)/len(text):.2f}
    Vocabulary size: {tokenizer.vocab_size}
    """
    json_output = json.dumps(
        {
            "input_ids": token_ids,
            "tokens": tokens,
        },
        indent=2,
        ensure_ascii=False,  # keep non-ASCII characters unescaped in the JSON output
    )
    return df, stats, json_output
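

# A quick way to exercise process_text outside the UI; a minimal sketch,
# assuming network access to the Hugging Face Hub for "roberta-base":
#
#   df, stats, json_output = process_text("roberta-base", "Hello, world!")
#   print(df)
#   print(stats)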

# Models offered in the dropdown (Hugging Face Hub IDs)
model_choices = [
    "roberta-base",
    "klue/roberta-large",
    "distilbert/distilbert-base-uncased",
    "BAAI/bge-m3-retromae",
    "DTAI-KULeuven/robbert-2023-dutch-base",
    "DTAI-KULeuven/robbert-2023-dutch-large",
]
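# Any other Hub model ID (or local directory) with a compatible tokenizer
# should also work here; AutoTokenizer downloads and caches files on first use.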

iface = gr.Interface(
    fn=process_text,
    inputs=[
        gr.Dropdown(
            choices=model_choices,
            value="roberta-base",
            label="Select Model",
        ),
        gr.Textbox(
            lines=5, placeholder="Enter text to tokenize...", label="Input Text"
        ),
        gr.Checkbox(label="Include Special Tokens", value=False),
        gr.Checkbox(label="Show Attention Mask", value=False),
    ],
    outputs=[
        gr.Dataframe(
            headers=["Token", "ID", "Attention Mask"], label="Tokenization Results"
        ),
        gr.Textbox(label="Statistics", lines=4),
        gr.JSON(label="JSON Output"),
    ],
    title="Hugging Face Tokenizer Playground",
    description="""
    An interactive demonstration of various Hugging Face tokenizers.
    Select a model from the dropdown to see how it tokenizes your input text.
    """,
    theme="default",
)


if __name__ == "__main__":
    # share=True additionally creates a temporary public link via Gradio's share service
    iface.launch(share=True)