import pandas as pd
import streamlit as st
import numpy as np
import torch
import io
import time
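# Note: pandas, numpy, torch, io, and time are imported but not referenced anywhere below.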
def load_model(tokenizer_name):
    """Load the Hugging Face tokenizer corresponding to the selected model family."""
    from transformers import AutoTokenizer
    model_name_dict = {
        "BERT": "bert-base-uncased",
        "RoBERTa": "roberta-base",
        "ALBERT": "albert-base-v2",
        "GPT2": "gpt2",
        #"Llama": "meta-llama/Llama-2-7b-chat-hf",
        #"Gemma": "google/gemma-7b",
    }
    tokenizer = AutoTokenizer.from_pretrained(model_name_dict[tokenizer_name])
    return tokenizer
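# Rough usage sketch (assumes the tokenizer files can be fetched from the Hugging Face Hub):
#   tok = load_model("BERT")
#   tok("hello world")['input_ids']   # BERT adds the special tokens [CLS] (id 101) and [SEP] (id 102)
# Wrapping load_model in @st.cache_resource would avoid re-loading the tokenizer on every
# Streamlit rerun; that is not done here.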
def generate_markdown(text, color='black', font='Arial', size=20):
    """Wrap text in a centered, styled HTML <p> tag for use with st.markdown."""
    return f"<p style='text-align:center; color:{color}; font-family:{font}; font-size:{size}px;'>{text}</p>"
def TokenizeText(sentence, tokenizer_name):
    """Tokenize `sentence` with the module-level `tokenizer`, display the token ids,
    the per-token strings, and the token count, then return the count."""
    if len(sentence) > 0:
        #if tokenizer_name.startswith('gpt2'):
        #    input_sent = tokenizer(sentence)['input_ids']
        #else:
        #    input_sent = tokenizer(sentence)['input_ids'][1:-1]
        input_sent = tokenizer(sentence)['input_ids']
        encoded_sent = [str(token) for token in input_sent]
        decoded_sent = [tokenizer.decode([token]) for token in input_sent]
        num_tokens = len(decoded_sent)

        #char_nums = [len(word)+2 for word in decoded_sent]
        #word_cols = st.columns(char_nums)
        #for word_col, word in zip(word_cols, decoded_sent):
        #    with word_col:
        #        st.write(word)
        #st.write(' '.join(encoded_sent))
        #st.write(' '.join(decoded_sent))
        st.markdown(generate_markdown(' '.join(encoded_sent), size=16), unsafe_allow_html=True)
        st.markdown(generate_markdown(' '.join(decoded_sent), size=16), unsafe_allow_html=True)
        st.markdown(generate_markdown(f'{num_tokens} tokens'), unsafe_allow_html=True)

        return num_tokens
def DeTokenizeText(input_str):
    """Parse a space-separated string of token ids, decode it back into text with the
    module-level `tokenizer`, display the result, and return the number of ids."""
    if len(input_str) > 0:
        input_sent = [int(element) for element in input_str.strip().split(' ')]
        encoded_sent = [str(token) for token in input_sent]
        decoded_sent = tokenizer.decode(input_sent)
        num_tokens = len(input_sent)

        #char_nums = [len(word)+2 for word in decoded_sent]
        #word_cols = st.columns(char_nums)
        #for word_col, word in zip(word_cols, decoded_sent):
        #    with word_col:
        #        st.write(word)
        #st.write(' '.join(encoded_sent))
        #st.write(' '.join(decoded_sent))
        st.markdown(generate_markdown(decoded_sent), unsafe_allow_html=True)

        return num_tokens
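# TokenizeText and DeTokenizeText are roughly inverse operations: the space-separated
# id string printed by TokenizeText can be pasted into the de-tokenize box and decoded
# back to (approximately) the original text. Both read the module-level `tokenizer`
# assigned in the __main__ block below.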
if __name__ == '__main__':

    # Config
    max_width = 1500
    padding_top = 0
    padding_right = 2
    padding_bottom = 0
    padding_left = 2
    define_margins = f"""
    <style>
        .appview-container .main .block-container{{
            max-width: {max_width}px;
            padding-top: {padding_top}rem;
            padding-right: {padding_right}rem;
            padding-left: {padding_left}rem;
            padding-bottom: {padding_bottom}rem;
        }}
    </style>
    """
    # Hide the index column of any st.table output.
    hide_table_row_index = """
    <style>
        tbody th {display:none}
        .blank {display:none}
    </style>
    """
    st.markdown(define_margins, unsafe_allow_html=True)
    st.markdown(hide_table_row_index, unsafe_allow_html=True)
    # Title
    st.markdown(generate_markdown('WordPiece Explorer', size=32), unsafe_allow_html=True)
    st.markdown(generate_markdown('- quick and easy way to explore how tokenizers work -', size=24), unsafe_allow_html=True)
    # Select and load the tokenizer
    st.sidebar.write('1. Choose the tokenizer from below')
    tokenizer_name = st.sidebar.selectbox('',
                                          ("BERT", "RoBERTa", "ALBERT", "GPT2"))
    tokenizer = load_model(tokenizer_name)

    # Optional settings
    st.sidebar.write('2. Optional settings')
    comparison_mode = st.sidebar.checkbox('Compare two texts')
    detokenize = st.sidebar.checkbox('de-tokenize')
    st.sidebar.write('"Compare two texts" compares the number of tokens for two pieces of text, '
                     'and "de-tokenize" converts a list of token ids back into strings.')
    st.sidebar.write('For "de-tokenize", make sure to type in integers separated by single spaces.')
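    # Main panel: either compare two inputs side by side, or handle a single input.
    # Both modes reuse TokenizeText / DeTokenizeText with the tokenizer chosen above.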
    if comparison_mode:
        sent_cols = st.columns(2)
        num_tokens = {}
        sents = {}
        for sent_id, sent_col in enumerate(sent_cols):
            with sent_col:
                if detokenize:
                    sentence = st.text_input(f'Tokenized IDs {sent_id+1}')
                    num_tokens[f'sent_{sent_id+1}'] = DeTokenizeText(sentence)
                else:
                    sentence = st.text_input(f'Text {sent_id+1}')
                    num_tokens[f'sent_{sent_id+1}'] = TokenizeText(sentence, tokenizer_name)
                sents[f'sent_{sent_id+1}'] = sentence

        if len(sents['sent_1']) > 0 and len(sents['sent_2']) > 0:
            st.markdown(generate_markdown('# Tokens: ', size=16), unsafe_allow_html=True)
            if num_tokens['sent_1'] == num_tokens['sent_2']:
                st.markdown(generate_markdown('Matched! ', color='MediumAquamarine'), unsafe_allow_html=True)
            else:
                st.markdown(generate_markdown('Not Matched... ', color='Salmon'), unsafe_allow_html=True)
    else:
        if detokenize:
            #if tokenizer_name.startswith('gpt2'):
            #    default_tokens = tokenizer('Tokenizers decompose bigger words into smaller tokens')['input_ids']
            #else:
            #    default_tokens = tokenizer('Tokenizers decompose bigger words into smaller tokens')['input_ids'][1:-1]
            default_tokens = tokenizer('Tokenizers decompose bigger words into smaller tokens')['input_ids']
            sentence = st.text_input('Tokenized IDs', value=' '.join([str(token) for token in default_tokens]))
            num_tokens = DeTokenizeText(sentence)
        else:
            sentence = st.text_input('Text', value='Tokenizers decompose bigger words into smaller tokens')
            num_tokens = TokenizeText(sentence, tokenizer_name)
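# Assuming this file is the Space's entry point (e.g. app.py), it can be run locally with:
#   streamlit run app.py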