Spaces:
Runtime error
Runtime error
| import collections | |
| import numpy as np | |
# Lightweight record used by the masking routine: ``index`` holds the masked
# position (or list of positions for a span) and ``label`` the original token
# id (or list of ids) found there.
MaskedLmInstance = collections.namedtuple(
    typename="MaskedLmInstance",
    field_names=["index", "label"],
)
def is_start_piece(piece):
    """Return True when ``piece`` begins a word (BERT WordPiece convention).

    A word split into WordPieces keeps its first piece unmarked, while every
    following piece carries a ``##`` prefix. A piece is therefore a starting
    piece exactly when it lacks that prefix.
    """
    is_continuation = piece.startswith("##")
    return not is_continuation
def _wordpiece_candidates(tokens, vocab_id_to_token_dict, cls_id, sep_id,
                          do_whole_word_mask):
    """Group token positions into words using the ``##`` continuation marker."""
    cand_indexes = []
    token_boundary = [0] * len(tokens)
    for i, token in enumerate(tokens):
        if token == cls_id or token == sep_id:
            token_boundary[i] = 1
            continue
        starts_word = is_start_piece(vocab_id_to_token_dict[token])
        # Whole-word masking attaches continuation pieces to the previous
        # word; each WordPiece is still predicted independently afterwards.
        if do_whole_word_mask and cand_indexes and not starts_word:
            cand_indexes[-1].append(i)
        else:
            cand_indexes.append([i])
            if starts_word:
                token_boundary[i] = 1
    return cand_indexes, token_boundary


def _segmenter_candidates(tokens, vocab_id_to_token_dict, cls_id, sep_id,
                          zh_tokenizer):
    """Group token positions into words using an external word segmenter."""
    # Rebuild the raw text without CLS/SEP and segment it.
    text = ''.join(vocab_id_to_token_dict[t] for t in tokens
                   if t != cls_id and t != sep_id)
    word_list = set(zh_tokenizer(text, HMM=True))

    # For every leading character, remember the longest segmented word that
    # starts with it; this bounds the look-ahead in the scan below.
    word_length_dict = {}
    for w in word_list:
        if w:
            word_length_dict[w[0]] = max(word_length_dict.get(w[0], 0), len(w))

    cand_indexes = []
    num_tokens = len(tokens)
    token_boundary = [0] * num_tokens
    i = 0
    while i < num_tokens:
        token_id = tokens[i]
        token = vocab_id_to_token_dict[token_id]
        if len(token) == 0 or token_id == cls_id or token_id == sep_id:
            token_boundary[i] = 1
            i += 1
            continue

        word_end = i + 1
        # Legacy "##" vocabularies: directly following continuation pieces
        # are glued onto the current token and treated as one word.
        while (word_end < num_tokens and
               vocab_id_to_token_dict[tokens[word_end]].startswith('##')):
            word_end += 1

        if word_end == i + 1:
            # No continuation pieces followed: extend greedily to the longest
            # run of tokens whose concatenation is a segmented word.
            word_max_length = word_length_dict.get(token[0], 1)
            word = ''
            for j in range(i, min(i + word_max_length, num_tokens)):
                word += vocab_id_to_token_dict[tokens[j]]
                if word in word_list:
                    word_end = j + 1

        cand_indexes.append(list(range(i, word_end)))
        token_boundary[i] = 1
        i = word_end
    return cand_indexes, token_boundary


def _pick_span(cand_index_set, n, room):
    """Return flattened positions of the longest n-gram (<= n words) that
    holds at most ``room`` tokens, or None when even the unigram is too big."""
    while True:
        index_set = [pos for word in cand_index_set[n - 1] for pos in word]
        n -= 1
        if len(index_set) <= room:
            return index_set
        if n == 0:
            return None


def create_masked_lm_predictions(tokens,
                                 vocab_id_list, vocab_id_to_token_dict,
                                 masked_lm_prob,
                                 cls_id, sep_id, mask_id,
                                 max_predictions_per_seq,
                                 np_rng,
                                 max_ngrams=3,
                                 do_whole_word_mask=True,
                                 favor_longer_ngram=False,
                                 do_permutation=False,
                                 geometric_dist=False,
                                 masking_style="bert",
                                 zh_tokenizer=None):
    """Creates the predictions for the masked LM objective.

    Note: Tokens here are vocab ids and not text tokens.
    Modified from Megatron-LM.

    Args:
        tokens: input sequence of vocab ids.
        vocab_id_list: list of vocab ids to draw random replacements from.
        vocab_id_to_token_dict: mapping from vocab id to token text.
        masked_lm_prob: fraction of tokens to mask; 0 disables masking.
        cls_id, sep_id, mask_id: ids of the special tokens.
        max_predictions_per_seq: upper bound on the number of masked tokens.
        np_rng: random source providing ``shuffle``, ``choice``,
            ``geometric``, ``random`` and ``randint`` (these are the methods
            called here).
        max_ngrams: maximum number of words in one masked span.
        do_whole_word_mask: group ``##`` continuation pieces with their word
            (only consulted when ``zh_tokenizer`` is None).
        favor_longer_ngram: reverse the span-length weights so longer spans
            are preferred.
        do_permutation: additionally permute a set of unmasked tokens.
        geometric_dist: draw span lengths with ``np_rng.geometric(0.2)``
            instead of the 1/n weights. Span lengths for the permutation
            step are always drawn with the 1/n weights.
        masking_style: ``"bert"`` (80% mask / 10% keep / 10% random) or
            ``"t5"`` (always mask).
        zh_tokenizer: optional word segmenter used for whole-word masking,
            called as ``zh_tokenizer(text, HMM=True)`` (e.g. ``jieba.lcut``).

    Returns:
        A 5-tuple ``(output_tokens, masked_lm_positions, masked_lm_labels,
        token_boundary, masked_spans)``. ``token_boundary[i]`` is 1 for
        CLS/SEP and word-start positions. ``masked_spans`` is a list of
        ``MaskedLmInstance`` whose ``index``/``label`` are lists covering one
        span each. When ``masked_lm_prob == 0`` the position, label and span
        lists are empty.

    Raises:
        ValueError: if a token is selected for masking and ``masking_style``
            is neither ``"bert"`` nor ``"t5"``.
    """
    # cand_indexes holds one list of positions per word; token_boundary
    # records word starts so on-the-fly whole word masking is possible.
    if zh_tokenizer is None:
        cand_indexes, token_boundary = _wordpiece_candidates(
            tokens, vocab_id_to_token_dict, cls_id, sep_id,
            do_whole_word_mask)
    else:
        cand_indexes, token_boundary = _segmenter_candidates(
            tokens, vocab_id_to_token_dict, cls_id, sep_id, zh_tokenizer)

    output_tokens = list(tokens)

    if masked_lm_prob == 0:
        # Same arity as the regular return so callers can always unpack five
        # values.
        return (output_tokens, [], [], token_boundary, [])

    num_to_predict = min(max_predictions_per_seq,
                         max(1, int(round(len(tokens) * masked_lm_prob))))

    ngrams = np.arange(1, max_ngrams + 1, dtype=np.int64)
    # By default the weights favor shorter ngram sequences. They are computed
    # even when geometric_dist is set, because the permutation step below
    # always samples with them.
    pvals = 1. / np.arange(1, max_ngrams + 1)
    pvals /= pvals.sum(keepdims=True)
    if favor_longer_ngram:
        pvals = pvals[::-1]

    # For every word, the candidate n-grams (n = 1..max_ngrams) starting at
    # that word; each n-gram is a list of per-word position lists.
    ngram_indexes = [
        [cand_indexes[start:start + n] for n in ngrams]
        for start in range(len(cand_indexes))
    ]
    np_rng.shuffle(ngram_indexes)

    masked_lms, masked_spans = [], []
    covered_indexes = set()
    for cand_index_set in ngram_indexes:
        if len(masked_lms) >= num_to_predict:
            break
        if not cand_index_set:
            continue

        if geometric_dist:
            # Sampling "n" from the geometric distribution and clipping it to
            # max_ngrams. Using p=0.2 default from the SpanBERT paper
            # https://arxiv.org/pdf/1907.10529.pdf (Sec 3.1)
            n = min(np_rng.geometric(0.2), max_ngrams)
        else:
            weights = pvals[:len(cand_index_set)]
            n = np_rng.choice(ngrams[:len(cand_index_set)],
                              p=weights / weights.sum(keepdims=True))

        # Try shorter n-grams until the span fits the remaining budget; skip
        # the candidate when nothing fits.
        index_set = _pick_span(cand_index_set, n,
                               num_to_predict - len(masked_lms))
        if index_set is None:
            continue
        # Skip spans overlapping something that is already masked.
        if any(index in covered_indexes for index in index_set):
            continue

        for index in index_set:
            covered_indexes.add(index)
            token_id = tokens[index]
            if masking_style == "bert":
                if np_rng.random() < 0.8:
                    # 80% of the time, replace with [MASK]
                    masked_token = mask_id
                elif np_rng.random() < 0.5:
                    # 10% of the time, keep original
                    masked_token = tokens[index]
                else:
                    # 10% of the time, replace with random word
                    masked_token = vocab_id_list[
                        np_rng.randint(0, len(vocab_id_list))]
            elif masking_style == "t5":
                masked_token = mask_id
            else:
                raise ValueError("invalid value of masking style")

            output_tokens[index] = masked_token
            masked_lms.append(MaskedLmInstance(index=index, label=token_id))

        masked_spans.append(MaskedLmInstance(
            index=index_set,
            label=[tokens[index] for index in index_set]))

    assert len(masked_lms) <= num_to_predict
    # Kept unconditional so the random stream seen by later calls does not
    # depend on do_permutation.
    np_rng.shuffle(ngram_indexes)

    if do_permutation:
        select_indexes = set()
        for cand_index_set in ngram_indexes:
            if len(select_indexes) >= num_to_predict:
                break
            if not cand_index_set:
                continue

            weights = pvals[:len(cand_index_set)]
            # Draw from np_rng (not the global numpy RNG) so results are
            # reproducible for a given seed.
            n = np_rng.choice(ngrams[:len(cand_index_set)],
                              p=weights / weights.sum(keepdims=True))
            index_set = _pick_span(cand_index_set, n,
                                   num_to_predict - len(select_indexes))
            if index_set is None:
                continue
            # Skip spans touching masked or already selected positions.
            if any(index in covered_indexes or index in select_indexes
                   for index in index_set):
                continue
            select_indexes.update(index_set)
        assert len(select_indexes) <= num_to_predict

        select_indexes = sorted(select_indexes)
        permute_indexes = list(select_indexes)
        np_rng.shuffle(permute_indexes)
        orig_token = list(output_tokens)

        for src_i, tgt_i in zip(select_indexes, permute_indexes):
            output_tokens[src_i] = orig_token[tgt_i]
            masked_lms.append(
                MaskedLmInstance(index=src_i, label=orig_token[src_i]))

    masked_lms = sorted(masked_lms, key=lambda x: x.index)
    # Sort the spans by the index of the first span
    masked_spans = sorted(masked_spans, key=lambda x: x.index[0])

    masked_lm_positions = [p.index for p in masked_lms]
    masked_lm_labels = [p.label for p in masked_lms]
    return (output_tokens, masked_lm_positions, masked_lm_labels,
            token_boundary, masked_spans)