Spaces:
Running
Running
| # -*- coding: utf-8 -*- | |
| ''' Extracts lists of words from a given input to be used for later vocabulary | |
| generation or for creating tokenized datasets. | |
| Supports functionality for handling different file types and | |
| filtering/processing of this input. | |
| ''' | |
| from __future__ import division, print_function, unicode_literals | |
| import re | |
| import unicodedata | |
| import numpy as np | |
| from text_unidecode import unidecode | |
| from torchmoji.tokenizer import RE_MENTION, tokenize | |
| from torchmoji.filter_utils import (convert_linebreaks, | |
| convert_nonbreaking_space, | |
| correct_length, | |
| extract_emojis, | |
| mostly_english, | |
| non_english_user, | |
| process_word, | |
| punct_word, | |
| remove_control_chars, | |
| remove_variation_selectors, | |
| separate_emojis_and_text) | |
| try: | |
| unicode # Python 2 | |
| except NameError: | |
| unicode = str # Python 3 | |
| # Only catch retweets in the beginning of the tweet as those are the | |
| # automatically added ones. | |
| # We do not want to remove tweets like "Omg.. please RT this!!" | |
| RETWEETS_RE = re.compile(r'^[rR][tT]') | |
| # Use fast and less precise regex for removing tweets with URLs | |
| # It doesn't matter too much if a few tweets with URL's make it through | |
| URLS_RE = re.compile(r'https?://|www\.') | |
| MENTION_RE = re.compile(RE_MENTION) | |
| ALLOWED_CONVERTED_UNICODE_PUNCTUATION = """!"#$'()+,-.:;<=>?@`~""" | |
| class WordGenerator(): | |
| ''' Cleanses input and converts into words. Needs all sentences to be in | |
| Unicode format. Has subclasses that read sentences differently based on | |
| file type. | |
| Takes a generator as input. This can be from e.g. a file. | |
| unicode_handling in ['ignore_sentence', 'convert_punctuation', 'allow'] | |
| unicode_handling in ['ignore_emoji', 'ignore_sentence', 'allow'] | |
| ''' | |
| def __init__(self, stream, allow_unicode_text=False, ignore_emojis=True, | |
| remove_variation_selectors=True, break_replacement=True): | |
| self.stream = stream | |
| self.allow_unicode_text = allow_unicode_text | |
| self.remove_variation_selectors = remove_variation_selectors | |
| self.ignore_emojis = ignore_emojis | |
| self.break_replacement = break_replacement | |
| self.reset_stats() | |
| def get_words(self, sentence): | |
| """ Tokenizes a sentence into individual words. | |
| Converts Unicode punctuation into ASCII if that option is set. | |
| Ignores sentences with Unicode if that option is set. | |
| Returns an empty list of words if the sentence has Unicode and | |
| that is not allowed. | |
| """ | |
| if not isinstance(sentence, unicode): | |
| raise ValueError("All sentences should be Unicode-encoded!") | |
| sentence = sentence.strip().lower() | |
| if self.break_replacement: | |
| sentence = convert_linebreaks(sentence) | |
| if self.remove_variation_selectors: | |
| sentence = remove_variation_selectors(sentence) | |
| # Split into words using simple whitespace splitting and convert | |
| # Unicode. This is done to prevent word splitting issues with | |
| # twokenize and Unicode | |
| words = sentence.split() | |
| converted_words = [] | |
| for w in words: | |
| accept_sentence, c_w = self.convert_unicode_word(w) | |
| # Unicode word detected and not allowed | |
| if not accept_sentence: | |
| return [] | |
| else: | |
| converted_words.append(c_w) | |
| sentence = ' '.join(converted_words) | |
| words = tokenize(sentence) | |
| words = [process_word(w) for w in words] | |
| return words | |
| def check_ascii(self, word): | |
| """ Returns whether a word is ASCII """ | |
| try: | |
| word.decode('ascii') | |
| return True | |
| except (UnicodeDecodeError, UnicodeEncodeError, AttributeError): | |
| return False | |
| def convert_unicode_punctuation(self, word): | |
| word_converted_punct = [] | |
| for c in word: | |
| decoded_c = unidecode(c).lower() | |
| if len(decoded_c) == 0: | |
| # Cannot decode to anything reasonable | |
| word_converted_punct.append(c) | |
| else: | |
| # Check if all punctuation and therefore fine | |
| # to include unidecoded version | |
| allowed_punct = punct_word( | |
| decoded_c, | |
| punctuation=ALLOWED_CONVERTED_UNICODE_PUNCTUATION) | |
| if allowed_punct: | |
| word_converted_punct.append(decoded_c) | |
| else: | |
| word_converted_punct.append(c) | |
| return ''.join(word_converted_punct) | |
| def convert_unicode_word(self, word): | |
| """ Converts Unicode words to ASCII using unidecode. If Unicode is not | |
| allowed (set as a variable during initialization), then only | |
| punctuation that can be converted to ASCII will be allowed. | |
| """ | |
| if self.check_ascii(word): | |
| return True, word | |
| # First we ensure that the Unicode is normalized so it's | |
| # always a single character. | |
| word = unicodedata.normalize("NFKC", word) | |
| # Convert Unicode punctuation to ASCII equivalent. We want | |
| # e.g. "\u203c" (double exclamation mark) to be treated the same | |
| # as "!!" no matter if we allow other Unicode characters or not. | |
| word = self.convert_unicode_punctuation(word) | |
| if self.ignore_emojis: | |
| _, word = separate_emojis_and_text(word) | |
| # If conversion of punctuation and removal of emojis took care | |
| # of all the Unicode or if we allow Unicode then everything is fine | |
| if self.check_ascii(word) or self.allow_unicode_text: | |
| return True, word | |
| else: | |
| # Sometimes we might want to simply ignore Unicode sentences | |
| # (e.g. for vocabulary creation). This is another way to prevent | |
| # "polution" of strange Unicode tokens from low quality datasets | |
| return False, '' | |
| def data_preprocess_filtering(self, line, iter_i): | |
| """ To be overridden with specific preprocessing/filtering behavior | |
| if desired. | |
| Returns a boolean of whether the line should be accepted and the | |
| preprocessed text. | |
| Runs prior to tokenization. | |
| """ | |
| return True, line, {} | |
| def data_postprocess_filtering(self, words, iter_i): | |
| """ To be overridden with specific postprocessing/filtering behavior | |
| if desired. | |
| Returns a boolean of whether the line should be accepted and the | |
| postprocessed text. | |
| Runs after tokenization. | |
| """ | |
| return True, words, {} | |
| def extract_valid_sentence_words(self, line): | |
| """ Line may either a string of a list of strings depending on how | |
| the stream is being parsed. | |
| Domain-specific processing and filtering can be done both prior to | |
| and after tokenization. | |
| Custom information about the line can be extracted during the | |
| processing phases and returned as a dict. | |
| """ | |
| info = {} | |
| pre_valid, pre_line, pre_info = \ | |
| self.data_preprocess_filtering(line, self.stats['total']) | |
| info.update(pre_info) | |
| if not pre_valid: | |
| self.stats['pretokenization_filtered'] += 1 | |
| return False, [], info | |
| words = self.get_words(pre_line) | |
| if len(words) == 0: | |
| self.stats['unicode_filtered'] += 1 | |
| return False, [], info | |
| post_valid, post_words, post_info = \ | |
| self.data_postprocess_filtering(words, self.stats['total']) | |
| info.update(post_info) | |
| if not post_valid: | |
| self.stats['posttokenization_filtered'] += 1 | |
| return post_valid, post_words, info | |
| def generate_array_from_input(self): | |
| sentences = [] | |
| for words in self: | |
| sentences.append(words) | |
| return sentences | |
| def reset_stats(self): | |
| self.stats = {'pretokenization_filtered': 0, | |
| 'unicode_filtered': 0, | |
| 'posttokenization_filtered': 0, | |
| 'total': 0, | |
| 'valid': 0} | |
| def __iter__(self): | |
| if self.stream is None: | |
| raise ValueError("Stream should be set before iterating over it!") | |
| for line in self.stream: | |
| valid, words, info = self.extract_valid_sentence_words(line) | |
| # Words may be filtered away due to unidecode etc. | |
| # In that case the words should not be passed on. | |
| if valid and len(words): | |
| self.stats['valid'] += 1 | |
| yield words, info | |
| self.stats['total'] += 1 | |
| class TweetWordGenerator(WordGenerator): | |
| ''' Returns np array or generator of ASCII sentences for given tweet input. | |
| Any file opening/closing should be handled outside of this class. | |
| ''' | |
| def __init__(self, stream, wanted_emojis=None, english_words=None, | |
| non_english_user_set=None, allow_unicode_text=False, | |
| ignore_retweets=True, ignore_url_tweets=True, | |
| ignore_mention_tweets=False): | |
| self.wanted_emojis = wanted_emojis | |
| self.english_words = english_words | |
| self.non_english_user_set = non_english_user_set | |
| self.ignore_retweets = ignore_retweets | |
| self.ignore_url_tweets = ignore_url_tweets | |
| self.ignore_mention_tweets = ignore_mention_tweets | |
| WordGenerator.__init__(self, stream, | |
| allow_unicode_text=allow_unicode_text) | |
| def validated_tweet(self, data): | |
| ''' A bunch of checks to determine whether the tweet is valid. | |
| Also returns emojis contained by the tweet. | |
| ''' | |
| # Ordering of validations is important for speed | |
| # If it passes all checks, then the tweet is validated for usage | |
| # Skips incomplete tweets | |
| if len(data) <= 9: | |
| return False, [] | |
| text = data[9] | |
| if self.ignore_retweets and RETWEETS_RE.search(text): | |
| return False, [] | |
| if self.ignore_url_tweets and URLS_RE.search(text): | |
| return False, [] | |
| if self.ignore_mention_tweets and MENTION_RE.search(text): | |
| return False, [] | |
| if self.wanted_emojis is not None: | |
| uniq_emojis = np.unique(extract_emojis(text, self.wanted_emojis)) | |
| if len(uniq_emojis) == 0: | |
| return False, [] | |
| else: | |
| uniq_emojis = [] | |
| if self.non_english_user_set is not None and \ | |
| non_english_user(data[1], self.non_english_user_set): | |
| return False, [] | |
| return True, uniq_emojis | |
| def data_preprocess_filtering(self, line, iter_i): | |
| fields = line.strip().split("\t") | |
| valid, emojis = self.validated_tweet(fields) | |
| text = fields[9].replace('\\n', '') \ | |
| .replace('\\r', '') \ | |
| .replace('&', '&') if valid else '' | |
| return valid, text, {'emojis': emojis} | |
| def data_postprocess_filtering(self, words, iter_i): | |
| valid_length = correct_length(words, 1, None) | |
| valid_english, n_words, n_english = mostly_english(words, | |
| self.english_words) | |
| if valid_length and valid_english: | |
| return True, words, {'length': len(words), | |
| 'n_normal_words': n_words, | |
| 'n_english': n_english} | |
| else: | |
| return False, [], {'length': len(words), | |
| 'n_normal_words': n_words, | |
| 'n_english': n_english} | |