import os
import re
import time

import nltk
import pytest

from tests.utils import wrap_test_forked

WORD = re.compile(r'\w+')


def nltkTokenize(text):
    """Word-tokenize with NLTK (requires the 'punkt' tokenizer data)."""
    return nltk.word_tokenize(text)


def regTokenize(text):
    """Cheap word tokenization: each run of word characters is one token."""
    return WORD.findall(text)


@wrap_test_forked
def test_tokenizer1():
| prompt = """Here is an example of how to write a Python program to generate the Fibonacci sequence: | |
| def fib(n): | |
| a, b = 0, 1 | |
| if n == 0 or n == 1: | |
| return a | |
| for i in range(n-2): | |
| a, b = b, a+b | |
| return b | |
| for i in range(10): | |
| print(fib(i)) | |
| This program defines a function called fib that takes an integer n as input and returns the nth Fibonacci number. The function uses two variables a and b to keep track of the current and previous Fibonacci numbers. | |
| The first two lines of the function check if n is either 0 or 1, in which case the function returns 0 or 1 respectively. If n is greater than 1, the function iterates over the range of integers from 2 to n-1, adding the previous two Fibonacci numbers to get the current Fibonacci number. Finally, the function returns the last Fibonacci number calculated. | |
| In the main part of the program, we use a for loop to call the fib function with different""" | |
    prompt = os.getenv('PROMPT', prompt)
    run_tokenizer1(prompt)
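    # The default prompt can be overridden from the shell, e.g.:
    #   PROMPT='some other text' pytest -k test_tokenizer1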


def run_tokenizer1(prompt):
    # Compare token counts and timings for several tokenizers on the same prompt.
    from transformers import AutoTokenizer
    t = AutoTokenizer.from_pretrained("distilgpt2")
    llm_tokenizer = AutoTokenizer.from_pretrained('h2oai/h2ogpt-oig-oasst1-512-6_9b')
    from InstructorEmbedding import INSTRUCTOR
    emb = INSTRUCTOR('hkunlp/instructor-large')

    t0 = time.time()
    a = len(regTokenize(prompt))
    print("Regexp Tokenizer", a, time.time() - t0)

    t0 = time.time()
    a = len(nltkTokenize(prompt))
    print("NLTK Tokenizer", a, time.time() - t0)

    t0 = time.time()
    a = len(t(prompt)['input_ids'])
    print("Slow Tokenizer", a, time.time() - t0)

    t0 = time.time()
    a = len(llm_tokenizer(prompt)['input_ids'])
    print("Fast Tokenizer LLM", a, time.time() - t0)

    t0 = time.time()
    a = emb.tokenize([prompt])['input_ids'].shape[1]
    print("Instruct Embedding", a, time.time() - t0)


@wrap_test_forked
def test_fake_tokenizer():
    from src.utils import FakeTokenizer
    t = FakeTokenizer()
    assert t.num_tokens_from_string('How are you?') == 4
    # The special-token marker is counted as ordinary text (several tokens),
    # not as a single special token:
    assert t.num_tokens_from_string('<|endoftext|>') == 7
    # Encoding it directly is rejected by the underlying encoder:
    with pytest.raises(ValueError, match="disallowed special token"):
        t.encoding.encode('<|endoftext|>')


if __name__ == '__main__':
    test_tokenizer1()