import json
from functools import lru_cache
from typing import Any

import gradio as gr
import pandas as pd
from loguru import logger

from playground_examples import (
    default_tokenizer_name_1,
    default_tokenizer_name_2,
    default_user_input,
    examples,
)
from playground_tokenizers import TokenizerFactory

def run_tokenization(
    text: str, tokenizer_name: str, color_num: int = 5, add_special_token: bool = False
) -> tuple[list[tuple[str, str]], int, pd.DataFrame]:
    """Tokenize an input text and return the tokens with their positions."""
    logger.info(
        "param="
        + json.dumps(
            {"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False
        )
    )
    pos_tokens = []
    tokenizer = TokenizerFactory().get_tokenizer(tokenizer_name)
    # `add_special_token` assumes a transformers-style encode API; the original
    # conditional called tokenizer.encode(text) on both branches (a no-op).
    encoding = (
        tokenizer.encode(text, add_special_tokens=True)
        if add_special_token
        else tokenizer.encode(text)
    )
    table = []
    for idx, token_id in enumerate(encoding):
        decoded_text = tokenizer.decode([token_id])
        decoded_text = decoded_text.replace(
            " ", "⋅"
        )  # replace space with ⋅ for better visualization
        pos_tokens.append((decoded_text, str(idx % color_num)))
        try:
            # Raw vocabulary token; may be bytes for byte-level tokenizers
            # (assumes a transformers-style convert_ids_to_tokens API).
            token = tokenizer.convert_ids_to_tokens([token_id])[0]
        except Exception:
            # Fall back to a reverse vocabulary lookup.
            token = {v: k for k, v in tokenizer.get_vocab().items()}[token_id]
        if isinstance(token, bytes):
            try:
                token_str = token.decode("utf-8")
            except UnicodeDecodeError:
                token_str = token.decode("utf-8", errors="ignore")
                logger.error(
                    f"{idx}: decode_error: {tokenizer_name}, {token} {token_str}"
                )
        elif isinstance(token, str):
            token_str = token
        else:
            logger.error(
                f"{idx}: wrong type for token {token_id} {type(token)} "
                + json.dumps(
                    {"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False
                )
            )
            token_str = str(token)
        table.append({"TokenID": token_id, "Text": decoded_text})
    table_df = pd.DataFrame(table)
    logger.info(f"tokenizer_type={tokenizer_name}, Tokens={table[:4]}")
    return pos_tokens, len(encoding), table_df
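
# Illustrative use of run_tokenization; the values below are hypothetical and
# depend on which tokenizers TokenizerFactory actually registers (e.g. a
# "gpt2"-style BPE tokenizer):
#   pos_tokens, num_tokens, table_df = run_tokenization("Hello world", "gpt2")
#   pos_tokens -> [("Hello", "0"), ("⋅world", "1")]
#   num_tokens -> 2
#   table_df   -> DataFrame with columns ["TokenID", "Text"]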

def tokenize(
    text: str, tokenizer_name: str, color_num: int = 5
) -> tuple[dict[Any, Any], pd.DataFrame]:
    """Tokenize an input text."""
    pos_tokens, num_tokens, table_df = run_tokenization(text, tokenizer_name, color_num)
    return gr.update(value=pos_tokens, label=f"Tokens: {num_tokens}"), table_df

def tokenize_pair(
    text: str, tokenizer_name_1: str, tokenizer_name_2: str, color_num: int = 5
):
    """Handle input_text.change: run both selected tokenizers on the same text."""
    pos_tokens_1, table_df_1 = tokenize(
        text=text, tokenizer_name=tokenizer_name_1, color_num=color_num
    )
    pos_tokens_2, table_df_2 = tokenize(
        text=text, tokenizer_name=tokenizer_name_2, color_num=color_num
    )
    return pos_tokens_1, table_df_1, pos_tokens_2, table_df_2

def on_load(url_params: str, request: gr.Request | None = None) -> tuple[str, str, str]:
    """Populate the inputs on page load; URL parameters are currently unused."""
    text = default_user_input
    tokenizer_type_1 = default_tokenizer_name_1
    tokenizer_type_2 = default_tokenizer_name_2
    return text, tokenizer_type_1, tokenizer_type_2

get_window_url_params = """
function(url_params) {
    const params = new URLSearchParams(window.location.search);
    url_params = JSON.stringify(Object.fromEntries(params));
    return url_params;
}
"""

all_tokenizer_name = [
    (config.name_display, config.name_or_path)
    for config in TokenizerFactory().all_tokenizer_configs
]
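# Each entry is a (display name, identifier) pair for the tokenizer dropdowns,
# e.g. ("GPT-2", "gpt2"); the actual values depend on the tokenizer configs
# registered in TokenizerFactory, so the pair shown here is only hypothetical.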

with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown("## Input Text")
        dropdown_examples = gr.Dropdown(
            sorted(examples.keys()),
            value="Examples",
            type="index",
            allow_custom_value=True,
            show_label=False,
            container=False,
            scale=0,
            elem_classes="example-style",
        )
    user_input = gr.Textbox(
        label="Input Text",
        lines=5,
        show_label=False,
    )
    with gr.Row():
        with gr.Column(scale=6), gr.Group():
            tokenizer_name_1 = gr.Dropdown(all_tokenizer_name, label="Tokenizer 1")
        with gr.Column(scale=6), gr.Group():
            tokenizer_name_2 = gr.Dropdown(all_tokenizer_name, label="Tokenizer 2")
    with gr.Row():
        with gr.Column():
            output_text_1 = gr.HighlightedText(
                show_legend=False, show_inline_category=False
            )
        with gr.Column():
            output_text_2 = gr.HighlightedText(
                show_legend=False, show_inline_category=False
            )
    with gr.Row():
        output_table_1 = gr.Dataframe()
        output_table_2 = gr.Dataframe()

    tokenizer_name_1.change(
        tokenize, [user_input, tokenizer_name_1], [output_text_1, output_table_1]
    )
    tokenizer_name_2.change(
        tokenize, [user_input, tokenizer_name_2], [output_text_2, output_table_2]
    )
    user_input.change(
        tokenize_pair,
        [user_input, tokenizer_name_1, tokenizer_name_2],
        [output_text_1, output_table_1, output_text_2, output_table_2],
        show_api=False,
    )
    dropdown_examples.change(
        lambda example_idx: (
            examples[sorted(examples.keys())[example_idx]]["text"],
            examples[sorted(examples.keys())[example_idx]]["tokenizer_1"],
            examples[sorted(examples.keys())[example_idx]]["tokenizer_2"],
        ),
        dropdown_examples,
        [user_input, tokenizer_name_1, tokenizer_name_2],
        show_api=False,
    )
    demo.load(
        fn=on_load,
        inputs=[user_input],
        outputs=[user_input, tokenizer_name_1, tokenizer_name_2],
        js=get_window_url_params,
        show_api=False,
    )


if __name__ == "__main__":
    demo.launch(share=True)