import gradio as gr
from transformers import AutoTokenizer
import pandas as pd
import json


def process_text(model_name, text, include_special_tokens=False, show_attention_mask=False):
    """
    Processes text using a specified Hugging Face tokenizer model.
    """
    try:
        # Dynamically load the tokenizer based on the selected model name
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    except Exception as e:
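        # Return placeholders shaped like the three Gradio outputs
        # (Dataframe, Statistics textbox, JSON) so the UI shows the error cleanly.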
        return (
            pd.DataFrame([{"Error": f"Could not load tokenizer for '{model_name}': {e}. Please ensure the model name is correct and accessible (e.g., through Hugging Face Hub or a local path)."}]),
            "",
            "",
        )
    # Full encoding, used below for the attention mask; note it includes
    # special tokens (and any padding/truncation applied here).
    encoding = tokenizer(text, return_tensors="np", padding=True, truncation=True)
    # Use tokenizer.tokenize and tokenizer.encode for broad compatibility
    # across tokenizer implementations.
    tokens = tokenizer.tokenize(text)
    token_ids = tokenizer.encode(text)
    # Adjust special token handling based on the flag
    if not include_special_tokens:
        # Attempt to remove special tokens by decoding and then encoding without special tokens.
        # This approach aims for a general solution but might behave differently for
        # tokenizers with complex special token handling or if tokens are meant to be inseparable.
        try:
            decoded_text = tokenizer.decode(token_ids, skip_special_tokens=True)
            token_ids = tokenizer.encode(decoded_text, add_special_tokens=False)
            # tokenize() does not add special tokens by default
            tokens = tokenizer.tokenize(decoded_text)
        except Exception as e:
            # Fallback if specific handling fails. It's better to process without removing
            # special tokens if an error occurs rather than failing the whole process.
            print(f"Warning: Could not remove special tokens for {model_name}. Error: {e}")
            # Keep original tokens and IDs which include special tokens
            tokens = tokenizer.tokenize(text)
            token_ids = tokenizer.encode(text)
    token_info = []
    # tokenize() and encode() can disagree in length for some tokenizers
    # (e.g., when encode() adds special tokens), so zip() stops at the shorter one
    for i, (token, token_id) in enumerate(zip(tokens, token_ids)):
        info = {
            "Token": token,
            "ID": token_id,
        }
        # The mask comes from the full encoding above, which includes special
        # tokens; when special tokens were stripped, indices may be offset,
        # so treat this column as approximate in that case.
        if show_attention_mask and encoding["attention_mask"].shape[1] > i:
            info["Attention Mask"] = int(encoding["attention_mask"][0][i])
        token_info.append(info)
    df = pd.DataFrame(token_info)
    stats = f"""
    Number of tokens: {len(tokens)}
    Input text length: {len(text)}
    Tokens/character ratio: {len(tokens)/len(text):.2f}
    Vocabulary size: {tokenizer.vocab_size}
    """
    json_output = json.dumps(
        {
            "input_ids": token_ids,
            "tokens": tokens,
        },
        indent=2,
        ensure_ascii=False,  # keep non-ASCII characters unescaped in the JSON output
    )
    return df, stats, json_output
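

# A quick way to exercise process_text outside the UI; a minimal sketch,
# assuming network access to the Hugging Face Hub for "roberta-base":
#
#   df, stats, json_output = process_text("roberta-base", "Hello, world!")
#   print(df)
#   print(stats)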

# Models offered in the dropdown (Hugging Face Hub IDs)
model_choices = [
    "roberta-base",
    "klue/roberta-large",
    "distilbert/distilbert-base-uncased",
    "BAAI/bge-m3-retromae",
    "DTAI-KULeuven/robbert-2023-dutch-base",
    "DTAI-KULeuven/robbert-2023-dutch-large",
]
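# Any other Hub model ID (or local directory) with a compatible tokenizer
# should also work here; AutoTokenizer downloads and caches files on first use.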

iface = gr.Interface(
    fn=process_text,
    inputs=[
        gr.Dropdown(
            choices=model_choices,
            value="roberta-base",
            label="Select Model",
        ),
        gr.Textbox(
            lines=5, placeholder="Enter text to tokenize...", label="Input Text"
        ),
        gr.Checkbox(label="Include Special Tokens", value=False),
        gr.Checkbox(label="Show Attention Mask", value=False),
    ],
    outputs=[
        gr.Dataframe(
            headers=["Token", "ID", "Attention Mask"], label="Tokenization Results"
        ),
        gr.Textbox(label="Statistics", lines=4),
        gr.JSON(label="JSON Output"),
    ],
    title="Hugging Face Tokenizer Playground",
    description="""
    An interactive demonstration of various Hugging Face tokenizers.
    Select a model from the dropdown to see how it tokenizes your input text.
    """,
    theme="default",
)


if __name__ == "__main__":
    # share=True additionally creates a temporary public link via Gradio's share service
    iface.launch(share=True)