Spaces:

gpantaz
/

athnlp2025_tokenization

Sleeping

athnlp2025_tokenization / playground_examples.py

George Pantazopoulos

feat: update number examples

b60238d 4 months ago

2.06 kB

	# Placeholder prompt; its wording indicates it is meant to be overwritten in
	# an input field.
	default_user_input = (
	    "Replace this text in the input field "
	    "to see how tokenization works."
	)

	# Tokenizer identifiers reused as the default pair by the `examples` table.
	default_tokenizer_name_1 = "openai/gpt-4o"
	default_tokenizer_name_2 = "Qwen/Qwen2.5-72B"


	# Addition sample: the same sum written compactly and then with spaces around
	# the operators, with one empty line between the two spellings.
	number_example = """127+677=804

	127 + 677 = 804
	"""

	# FizzBuzz snippet used as the code sample.  The bodies of the loop and of
	# each branch are indented relative to their headers so that the text is
	# valid Python and contains the indentation whitespace found in real code.
	code_example = """for i in range(1, 101):
	    if i % 3 == 0 and i % 5 == 0:
	        print("FizzBuzz")
	    elif i % 3 == 0:
	        print("Fizz")
	    elif i % 5 == 0:
	        print("Buzz")
	    else:
	        print(i)
	"""

	# Questions about spelling and letter counts.  The closing quotes sit on the
	# last question's line, so the text carries no trailing newline.
	spelling_example = """How do you spell "accommodate"?
	How many letters are in the word "accommodate"?
	How many r's are in the word strawberry?"""


	# Pairs of Greek words that differ by accent, spelling variant or regional
	# usage.  The lines starting with '#' inside the literal are part of the
	# string itself (captions for each pair), not Python comments.
	# "Αβγό" is the alternative spelling of "Αυγό" (egg).
	# NOTE(review): the τρενο / τραινο pair is the only one without a caption
	# line - confirm whether that is intentional.
	greek_example = """
	# Both mean 'I am sorry' though the latter one contains accent mark or stress mark
	Συγνωμη
	Συγνώμη

	# Both refer to "bean"
	Φασόλι
	Φασούλι

	# Both refer to "Saturday"
	Σάββατο
	Σάβατο

	# Both translate to 'egg'
	Αυγό
	Αβγό

	# They both translate to grandfather, though the latter is mostly used in Corfu Island
	Παππούς
	Πάπους

	# They mean two completely different things!
	Νόνα # refers to grandmother commonly observed in Ionion pelagos
	Νονά # refers to godmother in Christianity

	# Both refer to something new
	καινούριος
	καινούργιος

	# Both refer to tomato
	ντοματα
	τοματα

	τρενο
	τραινο

	# Singular / Plural versions of something 'innate'
	εγγενής
	εγγενείς
	"""

	# Preset table keyed by example name.  Each entry holds the sample text and
	# two tokenizer identifiers; every preset uses the default pair except
	# "greek", which names a different second tokenizer.
	examples = {
	    name: {
	        "text": sample,
	        "tokenizer_1": default_tokenizer_name_1,
	        "tokenizer_2": second_tokenizer,
	    }
	    for name, sample, second_tokenizer in (
	        ("number", number_example, default_tokenizer_name_2),
	        ("code", code_example, default_tokenizer_name_2),
	        ("spelling", spelling_example, default_tokenizer_name_2),
	        ("greek", greek_example, "ilsp/Llama-Krikri-8B-Base"),
	    )
	}