| """ | |
| Glove Embedding | |
| --------------------------------------------------------------------- | |
| """ | |
import os

import numpy as np
import torch
from torch import nn

from textattack.shared import logger, utils


class EmbeddingLayer(nn.Module):
    """A layer of a model that replaces word IDs with their embeddings.

    This is a useful abstraction for any ``nn.Module`` that takes word IDs
    (a sequence of text) as input but actually operates on the words'
    embeddings.

    Requires a pre-trained embedding matrix with an associated word list.
    """

    def __init__(
        self,
        n_d=100,
        embedding_matrix=None,
        word_list=None,
        oov="<oov>",
        pad="<pad>",
        normalize=True,
    ):
        super().__init__()
        word2id = {}
        if embedding_matrix is not None:
            # Assign consecutive IDs to the pre-trained words, in order.
            for word in word_list:
                assert word not in word2id, "Duplicate words in pre-trained embeddings"
                word2id[word] = len(word2id)

            logger.debug(f"{len(word2id)} pre-trained word embeddings loaded.\n")

            # Infer the embedding dimension from the matrix itself.
            n_d = len(embedding_matrix[0])

        # Reserve IDs for the out-of-vocabulary and padding tokens.
        if oov not in word2id:
            word2id[oov] = len(word2id)

        if pad not in word2id:
            word2id[pad] = len(word2id)

        self.word2id = word2id
        self.n_V, self.n_d = len(word2id), n_d
        self.oovid = word2id[oov]
        self.padid = word2id[pad]
        self.embedding = nn.Embedding(self.n_V, n_d)
        self.embedding.weight.data.uniform_(-0.25, 0.25)

        if embedding_matrix is not None:
            # Overwrite the randomly initialized rows with the pre-trained
            # vectors; the <oov> and <pad> rows keep their random init.
            weight = self.embedding.weight
            weight.data[: len(word_list)].copy_(torch.from_numpy(embedding_matrix))
            logger.debug(f"EmbeddingLayer shape: {weight.size()}")

        if normalize:
            # Rescale every embedding row to unit L2 norm.
            weight = self.embedding.weight
            norms = weight.data.norm(2, 1)
            if norms.dim() == 1:
                norms = norms.unsqueeze(1)
            weight.data.div_(norms.expand_as(weight.data))

    def forward(self, input):
        # `input` is a LongTensor of word IDs; returns their embedding vectors.
        return self.embedding(input)
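

# A minimal usage sketch of ``EmbeddingLayer`` (an illustration, not part of
# the module): build a layer from a tiny made-up vocabulary and matrix, then
# embed a batch of word IDs. The names ``words`` and ``matrix`` are
# hypothetical.
#
#   words = ["hello", "world"]
#   matrix = np.random.rand(2, 4)  # two pre-trained words, n_d = 4
#   layer = EmbeddingLayer(embedding_matrix=matrix, word_list=words)
#   ids = torch.tensor([[layer.word2id["hello"], layer.word2id["world"]]])
#   vectors = layer(ids)  # shape (1, 2, 4); rows have unit L2 norm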


class GloveEmbeddingLayer(EmbeddingLayer):
    """Pre-trained Global Vectors for Word Representation (GloVe) vectors.
    Uses embeddings of dimension 200.

    GloVe is an unsupervised learning algorithm for obtaining vector
    representations for words. Training is performed on aggregated global
    word-word co-occurrence statistics from a corpus, and the resulting
    representations showcase interesting linear substructures of the word
    vector space.

    GloVe: Global Vectors for Word Representation. (Jeffrey Pennington,
    Richard Socher, and Christopher D. Manning. 2014.)
    """

    EMBEDDING_PATH = "word_embeddings/glove200"

    def __init__(self, emb_layer_trainable=True):
        # Download (or load from cache) the GloVe word list and the
        # 200-dimensional embedding matrix, then build the base layer.
        glove_path = utils.download_from_s3(GloveEmbeddingLayer.EMBEDDING_PATH)
        glove_word_list_path = os.path.join(glove_path, "glove.wordlist.npy")
        word_list = np.load(glove_word_list_path)
        glove_matrix_path = os.path.join(glove_path, "glove.6B.200d.mat.npy")
        embedding_matrix = np.load(glove_matrix_path)
        super().__init__(embedding_matrix=embedding_matrix, word_list=word_list)
        # Optionally freeze the embeddings so training does not update them.
        self.embedding.weight.requires_grad = emb_layer_trainable
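

# A usage sketch (illustration only): the first instantiation downloads the
# GloVe files from TextAttack's S3 bucket, so this requires network access.
#
#   layer = GloveEmbeddingLayer(emb_layer_trainable=False)
#   ids = torch.tensor([[layer.word2id.get("the", layer.oovid)]])
#   vecs = layer(ids)  # shape (1, 1, 200)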