Spaces:
Running
Running
| """ | |
| ## | |
| """ | |
| import gradio as gr | |
| from character_util import get_character_table, default_columns | |
# Choices offered by the "character type" checkbox group, as
# (label, abbreviation) pairs. The abbreviation is the key used by
# `abbr2name` and passed around as the selected column value.
all_columns = [
    ("digit", "digit"),
    ("space", "space"),
    ("lang-chinese", "zh"),
    ("lang-korea", "ko"),
    ("lang-japanese", "ja"),
    # Currently disabled choices:
    # ("byte", "byte"),
    # ("oov", "oov")
]
# Earlier column naming, kept for reference:
# columns = ["lang-zh", "lang-korea", "lang-japanese", "number", "space", "bytes", "oov"]

# Abbreviation -> readable name: the label's last "-"-separated part,
# e.g. "lang-chinese" -> "chinese", "digit" -> "digit".
abbr2name = {abbr: label.split("-")[-1] for label, abbr in all_columns}
def get_column_info(columns) -> str:
    """Build the Markdown legend explaining the statistics columns.

    For every selected column key two bullet lines are produced: one for
    `num(<key>)` and one for `len(<key>)`.

    Args:
        columns: Iterable of column abbreviations (keys of `abbr2name`).
            `None` is treated as an empty selection.

    Returns:
        Markdown text with one bullet per line, each terminated by a
        newline; the empty string when nothing is selected.
    """
    bullets = []
    for column in columns or ():
        # Fall back to the raw key so an abbreviation missing from
        # `abbr2name` degrades to a readable line instead of a KeyError.
        name = abbr2name.get(column, column)
        bullets.append(
            f"- `num({column})`: num of tokens containing {name} characters\n"
        )
        bullets.append(
            f"- `len({column})`: `min,median,max` length of tokens containing {name} characters\n"
        )
    return "".join(bullets)
# UI definition: a settings panel for choosing character types, followed by a
# searchable statistics table filled by `get_character_table`.
with gr.Blocks() as demo:
    gr.Markdown("## 🛠️ Setting")  # ⚙
    with gr.Accordion("Please select the type of character you want to count.", open=True):
        # file size 💽 🖴, tokens 🧮
        with gr.Row():
            with gr.Column():
                # `CheckboxGroup` is the documented component name.
                columns = gr.CheckboxGroup(
                    all_columns,
                    value=default_columns,
                    label="character type",
                    # info=""
                )
                gr.Markdown(
                    "To count other types of characters, you can modify [lang_util.py]"
                    "(https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/utils/lang_util.py). "
                )
            # NOTE(review): nesting was reconstructed from a source whose
            # indentation was lost; the legend is assumed to sit in the row
            # beside the checkbox column - confirm the intended layout.
            column_info = gr.Markdown(
                get_column_info(default_columns)
            )

    gr.Markdown("## 📊 Character Statistics")
    search_bar = gr.Textbox(
        placeholder="🔍 Search by tokenizer or organization (e.g., 'llama', 'openai') and press ENTER...",
        show_label=False,
        elem_id="search-bar",
    )
    compress_rate_table = gr.Dataframe(datatype="html", wrap=True)

    # Event listeners must be registered while the Blocks context is open.

    # Pressing ENTER in the search bar recomputes the table.
    search_bar.submit(
        get_character_table,
        inputs=[search_bar, columns],
        outputs=compress_rate_table
    )
    # Changing the selection recomputes the table ...
    columns.change(
        get_character_table,
        inputs=[search_bar, columns],
        outputs=compress_rate_table,
        show_api=False
    )
    # ... and rewrites the legend for the selected columns.
    columns.change(
        get_column_info,
        inputs=[columns],
        outputs=column_info,
        show_api=False
    )

    # Fill the table once when the page is first loaded.
    demo.load(
        get_character_table,
        inputs=[search_bar, columns],
        outputs=compress_rate_table,
        show_api=False
    )
if __name__ == "__main__":
    # Start the Gradio server only when this file is executed as a script.
    demo.launch()