Spaces:
Runtime error
Runtime error
| """Base tokenizer class. | |
| Copyright PolyAI Limited. | |
| """ | |
| import os | |
| from asyncio import as_completed | |
| from concurrent.futures import ThreadPoolExecutor | |
| from tqdm import tqdm | |
| from utils import measure_duration | |
class BaseTokenizer:
    """Base class for tokenizers that encode every file found in a folder.

    Subclasses implement :meth:`encode_file`. The ``encode_files_with_model_*``
    drivers defined here only create the destination folder, enumerate the
    input files and dispatch each one to :meth:`encode_file`.
    """

    def encode_files_with_model_seq(
        self, folder_path: str, destination_folder: str
    ) -> None:
        """Encode every file in ``folder_path`` sequentially.

        Args:
            folder_path: Directory whose entries are handed to
                :meth:`encode_file` one by one.
            destination_folder: Output directory; created when missing.

        Raises:
            NotImplementedError: If the subclass does not override
                :meth:`encode_file`.
        """
        # exist_ok avoids the check-then-create race when several workers
        # start against the same destination at once.
        os.makedirs(destination_folder, exist_ok=True)

        # Reuse get_chunk (full range) so both drivers walk the files in the
        # same deterministic order.
        for filename in tqdm(self.get_chunk(folder_path)):
            self.encode_file(
                folder_path=folder_path,
                destination_folder=destination_folder,
                filename=filename,
            )

    def get_chunk(self, folder_path, start_percent=0, end_percent=100):
        """Return the slice of file names between two percentages.

        Args:
            folder_path: Directory to list.
            start_percent: Inclusive lower bound of the slice, as a
                percentage of the number of entries.
            end_percent: Exclusive upper bound of the slice, as a percentage
                of the number of entries.

        Returns:
            A list of entry names (not full paths) from the sorted listing.
        """
        # os.listdir returns entries in arbitrary order; sorting makes a
        # given percentage range select the same files on every invocation,
        # so ranges handled by separate runs do not overlap or leave gaps.
        filenames = sorted(os.listdir(folder_path))
        total_files = len(filenames)

        # Multiply before dividing: total * (percent / 100) can land just
        # below the exact value in floating point and truncate one too low.
        start_idx = int(total_files * start_percent // 100)
        end_idx = int(total_files * end_percent // 100)
        return filenames[start_idx:end_idx]

    def encode_files_with_model_concurrent(
        self,
        folder_path: str,
        destination_folder: str,
        start_percent: int,
        end_percent: int,
        max_workers: int = 40,
    ) -> None:
        """Encode a percentage slice of ``folder_path`` using a thread pool.

        Args:
            folder_path: Directory whose entries are handed to
                :meth:`encode_file`.
            destination_folder: Output directory; created when missing.
            start_percent: Lower bound of the slice (see :meth:`get_chunk`).
            end_percent: Upper bound of the slice (see :meth:`get_chunk`).
            max_workers: Size of the thread pool.

        Raises:
            Exception: The first exception raised by any :meth:`encode_file`
                call is re-raised here (``NotImplementedError`` when the
                subclass does not override it).
        """
        # Local import on purpose: the pool hands back concurrent.futures
        # futures, which asyncio.as_completed rejects with a TypeError. This
        # shadows the file-level `as_completed` name inside this method.
        from concurrent.futures import as_completed

        os.makedirs(destination_folder, exist_ok=True)
        filenames = self.get_chunk(folder_path, start_percent, end_percent)

        # Each file is submitted as an independent task (the original code
        # notes that encoding files has no side effects).
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [
                executor.submit(
                    self.encode_file,
                    folder_path=folder_path,
                    destination_folder=destination_folder,
                    filename=filename,
                )
                for filename in filenames
            ]
            # result() re-raises any worker exception so failures are not
            # silently dropped. Leaving the `with` block shuts the pool down,
            # so no explicit shutdown() call is needed.
            for future in as_completed(futures):
                future.result()

    def encode_file(
        self, folder_path: str, destination_folder: str, filename: str
    ):
        """Encode a single file; must be overridden by subclasses.

        Args:
            folder_path: Directory containing the input file.
            destination_folder: Directory passed through by the drivers as
                the output location.
            filename: Name of the entry inside ``folder_path``.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError