Spaces:
Runtime error
Runtime error
| """IMDB Dataset module for sentiment analysis.""" | |
| from __future__ import absolute_import | |
| from __future__ import division | |
| from __future__ import print_function | |
| import numpy as np | |
| import tensorflow as tf | |
| from data.util import OOV_CHAR | |
| from data.util import pad_sentence | |
| from data.util import START_CHAR | |
# Number of label classes; used as the width of the one-hot label vectors.
NUM_CLASS = 2


def _pad_sentences(sentences, sentence_length):
    """Applies `pad_sentence` to every sentence and stacks the results.

    Args:
      sentences: An iterable of token-id sequences.
      sentence_length: The target length passed to `pad_sentence`.

    Returns:
      A numpy array built from the per-sentence arrays.
    """
    return np.array(
        [np.array(pad_sentence(sen, sentence_length)) for sen in sentences])


def load(vocabulary_size, sentence_length):
    """Returns training and evaluation input for imdb dataset.

    The reviews are fetched with `tf.keras.datasets.imdb.load_data`, every
    review is passed through `data.util.pad_sentence`, and the integer labels
    are converted to one-hot rows of width `NUM_CLASS`.

    Args:
      vocabulary_size: The number of the most frequent tokens
        to be used from the corpus.
      sentence_length: The number of words in each sentence.
        Longer sentences get cut, shorter ones padded (as implemented by
        `pad_sentence`).

    Returns:
      A tuple of length 4, `(x_train, y_train, x_test, y_test)`, each being a
      numpy array. The `x_*` arrays hold the padded token ids and the `y_*`
      arrays hold the one-hot encoded labels.
    """
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(
        path="imdb.npz",
        num_words=vocabulary_size,
        skip_top=0,
        maxlen=None,
        seed=113,
        start_char=START_CHAR,
        oov_char=OOV_CHAR,
        # Actual word ids are shifted so they start right after the OOV id.
        index_from=OOV_CHAR + 1)

    x_train_processed = _pad_sentences(x_train, sentence_length)
    x_test_processed = _pad_sentences(x_test, sentence_length)

    # Indexing the identity matrix with the label vector picks one row per
    # label, i.e. the one-hot encoding of that label.
    one_hot = np.eye(NUM_CLASS)
    return (x_train_processed, one_hot[y_train],
            x_test_processed, one_hot[y_test])