#!/usr/bin/env python
# -*- coding: utf-8 -*-
from tensorflow import data, TensorShape, int64, int32
from math import exp
from os import makedirs
from shutil import rmtree, move, copytree
from huggingface_hub import hf_hub_download
import os

def get_features(tokenizer, sentences, labels):
    """Tokenize, pad, and label sentences for TensorFlow training."""
    features = []
    for i, sentence in enumerate(sentences):
        # Truncate to the model's maximum length so the padding length
        # computed below can never go negative.
        inputs = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=tokenizer.model_max_length,
            truncation=True
        )
        input_ids, token_type_ids = \
            inputs['input_ids'], inputs['token_type_ids']
        # Pad up to model_max_length on the side the tokenizer expects;
        # the attention mask is 1 for real tokens and 0 for padding.
        padding_length = tokenizer.model_max_length - len(input_ids)
        if tokenizer.padding_side == 'right':
            attention_mask = [1] * len(input_ids) + [0] * padding_length
            input_ids = input_ids + [tokenizer.pad_token_id] * padding_length
            token_type_ids = token_type_ids + \
                [tokenizer.pad_token_type_id] * padding_length
        else:
            attention_mask = [0] * padding_length + [1] * len(input_ids)
            input_ids = [tokenizer.pad_token_id] * padding_length + input_ids
            token_type_ids = \
                [tokenizer.pad_token_type_id] * padding_length + token_type_ids
        assert tokenizer.model_max_length \
            == len(attention_mask) \
            == len(input_ids) \
            == len(token_type_ids)
        feature = {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids,
            'label': int(labels[i])
        }
        features.append(feature)
    def gen():
        # Stream the features in the (inputs, label) structure that
        # tf.data expects for supervised training.
        for feature in features:
            yield (
                {
                    'input_ids': feature['input_ids'],
                    'attention_mask': feature['attention_mask'],
                    'token_type_ids': feature['token_type_ids'],
                },
                feature['label'],
            )

    dataset = data.Dataset.from_generator(
        gen,
        ({
            'input_ids': int32,
            'attention_mask': int32,
            'token_type_ids': int32
        }, int64),
        (
            {
                'input_ids': TensorShape([None]),
                'attention_mask': TensorShape([None]),
                'token_type_ids': TensorShape([None]),
            },
            TensorShape([]),
        ),
    )
    return dataset
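
# A minimal usage sketch, assuming a transformers tokenizer; the model
# name and the example sentences/labels below are illustrative, not
# taken from this file:
#
#     from transformers import AutoTokenizer
#     tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
#     dataset = get_features(tokenizer,
#                            sentences=['a fine film', 'a dull film'],
#                            labels=[1, 0])
#     dataset = dataset.batch(8)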

def softmax(values):
    # Shift by the maximum value for numerical stability; this does not
    # change the result, since softmax is invariant to constant shifts.
    max_value = max(values)
    exps = [exp(value - max_value) for value in values]
    exps_sum = sum(exps)
    return tuple(x / exps_sum for x in exps)
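
# Worked example: softmax([0.0, 1.0]) returns roughly (0.269, 0.731),
# because exp(0) = 1 and exp(1) ≈ 2.718, so the normalized values are
# 1 / 3.718 and 2.718 / 3.718.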

def make_dir(path):
    makedirs(path, exist_ok=True)

def remove_dir(path):
    rmtree(path)

def copy_dir(source_path, target_path):
    copytree(source_path, target_path)

def move_dir(source_path, target_path):
    move(source_path, target_path)

def download_from_hub(repo_id, filename, revision=None, cache_dir=None):
    hf_hub_download(repo_id=repo_id, filename=filename,
                    revision=revision, cache_dir=cache_dir)
    if cache_dir is not None:
        # This cleanup assumes the flat cache layout of older
        # huggingface_hub versions, where the file is stored under a
        # hash-based name alongside a '.lock' and a '.json' metadata
        # file: rename the blob to the requested filename and drop the
        # bookkeeping files.
        for f in os.listdir(cache_dir):
            if f.endswith('.lock'):
                name = f[:-len('.lock')]
                os.rename(os.path.join(cache_dir, name),
                          os.path.join(cache_dir, filename))
                os.remove(os.path.join(cache_dir, name + '.lock'))
                os.remove(os.path.join(cache_dir, name + '.json'))
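
# Sketch of a call, assuming a repo and filename that exist on the Hub
# (the values below are illustrative, not taken from this file):
#
#     download_from_hub(repo_id='bert-base-uncased',
#                       filename='config.json',
#                       cache_dir='./cache/')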