from typing import Dict, List, Tuple
from pathlib import Path
import hashlib
from html import escape as html_escape

from transformers import AutoTokenizer
import tiktoken

# UZH color palette
UZH_COLORS = [
    "#BACBFF",  # UZH Blue V1
    "#DBF4F9",  # UZH Cyan V1
    "#ECF6D6",  # UZH Apple V1
    "#FFF4DA",  # UZH Gold V1
    "#FFDBCC",  # UZH Orange V1
    "#FBC6D4",  # UZH Berry V1
    "#C2C2C2",  # UZH Grey V1
    "#FAFAFA",  # UZH Light Grey V1
    "#7596FF",  # UZH Blue V2
    "#B7E9F4",  # UZH Cyan V2
    "#DBEDAD",  # UZH Apple V2
    "#FFE9B5",  # UZH Gold V2
    "#FEB799",  # UZH Orange V2
    "#F78CAA",  # UZH Berry V2
    "#A3A3A3",  # UZH Grey V2
    "#EFEFEF",  # UZH Light Grey V2
]
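
# A small sketch (an addition for illustration, not part of the original app)
# that renders the palette as HTML swatches, handy for checking the V1/V2
# shades side by side before they are used for token highlighting below.
def _palette_preview_html() -> str:
    """Return a single HTML string with one colored swatch per palette entry."""
    return "".join(
        f'<span style="background-color: {c}; padding: 4px 12px; margin: 1px;">{c}</span>'
        for c in UZH_COLORS
    )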

def load_hf_tokenizer(name: str) -> Tuple[str, object]:
    """
    Load a single HuggingFace tokenizer.

    Args:
        name: The name of the tokenizer to load

    Returns:
        Tuple of (tokenizer_name, tokenizer_object)
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            name,
            use_fast=True,
            model_max_length=1000000,
            clean_up_tokenization_spaces=True,
            legacy=False,
        )
    except Exception:
        # Some tokenizers have no fast implementation; retry without use_fast.
        tokenizer = AutoTokenizer.from_pretrained(
            name,
            model_max_length=1000000,
            clean_up_tokenization_spaces=True,
            legacy=False,
        )
    return name, tokenizer

def load_openai_tokenizer(name: str) -> Tuple[str, object]:
    """
    Load a single OpenAI tokenizer.

    Args:
        name: The name of the tokenizer to load

    Returns:
        Tuple of (tokenizer_name, tokenizer_object)
    """
    return name, tiktoken.encoding_for_model(name)

def load_gsw_tokenizer() -> Tuple[str, object]:
    """
    Load the Swiss German (GSW) vocabulary variant of the SwissBERT tokenizer.

    Returns:
        Tuple of (tokenizer_name, tokenizer_object)
    """
    tokenizer = AutoTokenizer.from_pretrained("jvamvas/swissbert-gsw-vocab")
    return "swissbert-gsw", tokenizer

def load_tokenizers() -> Dict[str, object]:
    """
    Load all tokenizers.

    Returns:
        Dictionary mapping tokenizer names to tokenizer objects
    """
    tokenizers = {}

    # Load OpenAI tokenizers first
    openai_names = ["gpt-4o"]
    for name in openai_names:
        tokenizer_name, tokenizer = load_openai_tokenizer(name)
        tokenizers[tokenizer_name] = tokenizer

    # Load HuggingFace tokenizers in the specified order
    hf_names = [
        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
        "deepseek-ai/DeepSeek-V3-0324",
        "ZurichNLP/swissbert",
        "google/gemma-3-27b-it",
        "mistralai/Mistral-Nemo-Instruct-2407",
        "CohereLabs/aya-expanse-8b",
    ]
    for name in hf_names:
        tokenizer_name, tokenizer = load_hf_tokenizer(name)
        tokenizers[tokenizer_name] = tokenizer

    return tokenizers

# Mapping of model names to display names
MODEL_DISPLAY_NAMES = {
    "meta-llama/Llama-4-Scout-17B-16E-Instruct": "Llama 4",
    "deepseek-ai/DeepSeek-V3-0324": "DeepSeek V3",
    "ZurichNLP/swissbert": "SwissBERT 🇨🇭",
    "mistralai/Mistral-Nemo-Instruct-2407": "Mistral NeMo",
    "google/gemma-3-27b-it": "Gemma 3",
    "gpt-4o": "ChatGPT (gpt-4o)",
    "CohereLabs/aya-expanse-8b": "Aya Expanse",
}

def tokenize(s: str, tokenizer) -> List[str]:
    """
    Tokenize a string using any tokenizer returned by load_tokenizers().

    For the SwissBERT tokenizer, both the standard and the GSW vocabularies
    are tried and the shorter tokenization is returned.

    Args:
        s: The string to tokenize
        tokenizer: A tokenizer object returned by load_tokenizers()

    Returns:
        A list of tokens, with special tokens removed and any tail token
        markers (## or @@) removed
    """
    # Special handling for the SwissBERT tokenizer
    if hasattr(tokenizer, "name_or_path") and "swissbert" in tokenizer.name_or_path.lower():
        # Get the SwissBERT-GSW tokenizer
        _, gsw_tokenizer = load_gsw_tokenizer()

        # Get tokenizations from both tokenizers
        swissbert_tokens = _tokenize_with_tokenizer(s, tokenizer)
        gsw_tokens = _tokenize_with_tokenizer(s, gsw_tokenizer)

        # Return the shorter tokenization
        shorter_tokens = swissbert_tokens if len(swissbert_tokens) <= len(gsw_tokens) else gsw_tokens
        if len(shorter_tokens) > 0 and shorter_tokens[0].startswith(" "):
            shorter_tokens[0] = shorter_tokens[0][1:]
        return shorter_tokens

    return _tokenize_with_tokenizer(s, tokenizer)

def _tokenize_with_tokenizer(s: str, tokenizer) -> List[str]:
    """
    Internal helper to tokenize a string with a given tokenizer.

    Args:
        s: The string to tokenize
        tokenizer: A tokenizer object

    Returns:
        A list of tokens, with special tokens removed and any tail token
        markers (## or @@) removed
    """
    if hasattr(tokenizer, "tokenize"):
        # HuggingFace tokenizer
        encoded = tokenizer.encode(s, add_special_tokens=False)
        if hasattr(tokenizer, "name_or_path") and any(
            name in tokenizer.name_or_path.lower() for name in ["llama", "deepseek", "mistral", "aya"]
        ):
            # Byte-level BPE vocabularies: decode each id individually to get readable text
            tokens = [tokenizer.decode([token_id], skip_special_tokens=False) for token_id in encoded]
        else:
            tokens = tokenizer.convert_ids_to_tokens(encoded)
        filtered_tokens = []
        for t in tokens:
            if t.startswith("<") or t.startswith("["):
                # Drop special tokens such as <s> or [CLS]
                continue
            elif "Ġ" in t:
                # Byte-level BPE marker for a leading space
                filtered_tokens.append(t.replace("Ġ", " "))
            elif "Ċ" in t:
                # Byte-level BPE marker for a newline; render as a space
                filtered_tokens.append(t.replace("Ċ", " "))
            elif t.startswith("▁"):
                # SentencePiece marker for a leading space
                filtered_tokens.append(" " + t[1:])
            else:
                filtered_tokens.append(t)
        # Strip tail subword markers; removesuffix drops the exact marker once,
        # whereas rstrip("##") would strip any run of trailing '#' characters.
        return [t.removesuffix("##").removesuffix("@@") for t in filtered_tokens]
    elif hasattr(tokenizer, "encode"):
        # tiktoken tokenizer
        token_ids = tokenizer.encode(s)
        return [tokenizer.decode([token_id]) for token_id in token_ids]
    else:
        raise ValueError("Unsupported tokenizer type")
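
# A minimal sketch of what the marker handling above produces, using a tiny
# fake tokenizer object. The fake class and its hardcoded pieces are an
# assumption made only for this illustration, so the example runs without
# downloading any model files.
def _demo_tokenize_with_fake_tokenizer() -> List[str]:
    """
    Show how SentencePiece "▁" markers and special tokens are normalized.

    >>> _demo_tokenize_with_fake_tokenizer()
    [' Grüezi', ' mit', 'enand', '!']
    """
    class _FakeSentencePieceTokenizer:
        # Mimics just the parts of the HuggingFace interface used above.
        name_or_path = "demo/sentencepiece-like"
        _pieces = ["<s>", "▁Grüezi", "▁mit", "enand", "!"]

        def tokenize(self, s):
            # The presence of .tokenize routes into the HuggingFace branch.
            return self._pieces

        def encode(self, s, add_special_tokens=False):
            return list(range(len(self._pieces)))

        def convert_ids_to_tokens(self, ids):
            return [self._pieces[i] for i in ids]

    return _tokenize_with_tokenizer("Grüezi mitenand!", _FakeSentencePieceTokenizer())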

def get_uzh_color(index: int) -> str:
    """Get a color from the UZH color palette based on index."""
    return UZH_COLORS[index % len(UZH_COLORS)]

def visualize_tokens(text: str, tokenizers: Dict[str, object]):
    """
    Tokenize text with each tokenizer and visualize the tokens with colors.

    Colors are consistent across tokenizers for the same tokens and are
    deterministic based on token content.

    Args:
        text: The input text to tokenize
        tokenizers: Dictionary of tokenizers

    Returns:
        Dictionary mapping tokenizer names to HTML visualizations
    """
    results = {}

    # First pass: collect all unique tokens across all tokenizers
    all_tokens = set()
    for tokenizer in tokenizers.values():
        tokens = tokenize(text, tokenizer)
        all_tokens.update(tokens)

    # Generate colors for all unique tokens using a hash-based approach
    token_colors = {}
    for token in all_tokens:
        # Use a stable hash of the token content so colors are reproducible
        # across runs (the built-in hash() is randomized per process)
        token_hash = int(hashlib.md5(token.encode("utf-8")).hexdigest(), 16)
        # Wrap around to the color list length
        index = token_hash % len(UZH_COLORS)
        token_colors[token] = get_uzh_color(index)

    # Second pass: create visualizations using the consistent colors
    for name, tokenizer in tokenizers.items():
        tokens = tokenize(text, tokenizer)

        # Build the HTML with colored spans for each token
        html = ""
        for token in tokens:
            color = token_colors[token]
            # Escape the token text so characters like < or & do not break the markup
            html += (
                f'<span style="background-color: {color}; padding: 2px; '
                f'margin: 1px; border-radius: 3px;">{html_escape(token)}</span>'
            )
        results[name] = html

    return results
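
# A minimal usage sketch (not part of the app itself): load every tokenizer,
# visualize one sample sentence, and write the colored spans to a standalone
# HTML file. The sample text and output filename are illustrative assumptions;
# load_tokenizers() fetches tokenizer files from the Hugging Face Hub, and
# some of the listed models may be gated and require accepting their licenses.
if __name__ == "__main__":
    sample_text = "Grüezi mitenand, wie gaht's?"  # illustrative input (assumption)
    all_tokenizers = load_tokenizers()
    visualizations = visualize_tokens(sample_text, all_tokenizers)
    with open("token_visualization.html", "w", encoding="utf-8") as f:
        for model_name, snippet in visualizations.items():
            display_name = MODEL_DISPLAY_NAMES.get(model_name, model_name)
            token_count = len(tokenize(sample_text, all_tokenizers[model_name]))
            print(f"{display_name}: {token_count} tokens")
            f.write(f"<h3>{display_name}</h3>\n{snippet}\n")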