Spaces:
Running
Running
| # -*- coding: utf-8 -*- | |
| ''' | |
| Splits up a Unicode string into a list of tokens. | |
| Recognises: | |
| - Abbreviations | |
| - URLs | |
| - Emails | |
| - #hashtags | |
| - @mentions | |
| - emojis | |
| - emoticons (limited support) | |
| Multiple consecutive symbols are also treated as a single token. | |
| ''' | |
| from __future__ import absolute_import, division, print_function, unicode_literals | |
| import re | |
# Basic patterns.
RE_NUM = r'[0-9]+'
RE_WORD = r'[a-zA-Z]+'
RE_WHITESPACE = r'\s+'
RE_ANY = r'.'

# Combined words such as 'red-haired' or 'CUSTOM_TOKEN'
RE_COMB = r'[a-zA-Z]+[-_][a-zA-Z]+'

# English-specific patterns
# Two words joined by an apostrophe, e.g. "don't".
RE_CONTRACTIONS = RE_WORD + r'\'' + RE_WORD

TITLES = [
    r'Mr\.',
    r'Ms\.',
    r'Mrs\.',
    r'Dr\.',
    r'Prof\.',
]
# Titles must match case-insensitively. This used to be done by prefixing
# every alternative with an inline `(?i)`, which places global flags in the
# middle of the final pattern: deprecated since Python 3.6 and a hard
# `re.error` since Python 3.11. Case-insensitivity is now requested through
# `re.IGNORECASE` when RE_PATTERN is compiled; anyone compiling RE_TITLES on
# its own has to pass that flag as well.
RE_TITLES = r'|'.join(TITLES)

# Symbols have to be created as separate patterns in order to match consecutive
# identical symbols.
# NOTE: this is a raw string, so escapes such as `\xa3` are NOT interpreted;
# the pound sign is therefore written literally. Backslash, `'` and `"` end
# up in the set through the `\'`, `\"` and `\\` sequences; duplicated
# characters are harmless.
SYMBOLS = r'()<!?.,/\'\"-_=\\§|´ˇ°[]<>{}~$^&*;:%+£€`'
RE_SYMBOL = r'|'.join([re.escape(s) + r'+' for s in SYMBOLS])

# Hash symbols and at symbols have to be defined separately in order to not
# clash with hashtags and mentions if there are multiple - i.e.
# ##hello -> ['#', '#hello'] instead of ['##', 'hello']
SPECIAL_SYMBOLS = r'|#+(?=#[a-zA-Z0-9_]+)|@+(?=@[a-zA-Z0-9_]+)|#+|@+'
RE_SYMBOL += SPECIAL_SYMBOLS

# Two or more single letters each followed by a dot, e.g. 'U.S.A.'.
RE_ABBREVIATIONS = r'\b(?<!\.)(?:[A-Za-z]\.){2,}'

# Twitter-specific patterns
RE_HASHTAG = r'#[a-zA-Z0-9_]+'
RE_MENTION = r'@[a-zA-Z0-9_]+'
# NOTE(review): `$-_` inside the character class is a range (0x24-0x5F), not
# three literal characters - presumably intentional, confirm before changing.
RE_URL = r'(?:https?://|www\.)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
RE_EMAIL = r'\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\b'

# Emoticons and emojis
RE_HEART = r'(?:<+/?3+)+'
EMOTICONS_START = [
    r'>:',
    r':',
    r'=',
    r';',
]
EMOTICONS_MID = [
    r'-',
    r',',
    r'^',
    '\'',
    '\"',
]
EMOTICONS_END = [
    r'D',
    r'd',
    r'p',
    r'P',
    r'v',
    r')',
    r'o',
    r'O',
    r'(',
    r'3',
    r'/',
    r'|',
    '\\',
]
EMOTICONS_EXTRA = [
    r'-_-',
    r'x_x',
    r'^_^',
    r'o.o',
    r'o_o',
    r'(:',
    r'):',
    r');',
    r'(;',
]
# Fixed emoticons first, then every START + optional MID + repeated END
# combination. Alternatives are tried left to right, so the order is kept
# exactly as start -> mid -> end.
RE_EMOTICON = r'|'.join(
    [re.escape(extra) for extra in EMOTICONS_EXTRA] +
    ['{0}{1}?{2}+'.format(re.escape(start), re.escape(mid), re.escape(end))
     for start in EMOTICONS_START
     for mid in EMOTICONS_MID
     for end in EMOTICONS_END]
)

# requires ucs4 in python2.7 or python3+
# RE_EMOJI = r"""[\U0001F300-\U0001F64F\U0001F680-\U0001F6FF\u2600-\u26FF\u2700-\u27BF]"""
# safe for all python
# Spelled as UTF-16 surrogate pairs plus two BMP ranges. A character not
# matched here is still emitted as a one-character token by RE_ANY.
RE_EMOJI = r"""\ud83c[\udf00-\udfff]|\ud83d[\udc00-\ude4f\ude80-\udeff]|[\u2600-\u26FF\u2700-\u27BF]"""

# List of matched token patterns, ordered from most specific to least specific.
TOKENS = [
    RE_URL,
    RE_EMAIL,
    RE_COMB,
    RE_HASHTAG,
    RE_MENTION,
    RE_HEART,
    RE_EMOTICON,
    RE_CONTRACTIONS,
    RE_TITLES,
    RE_ABBREVIATIONS,
    RE_NUM,
    RE_WORD,
    RE_SYMBOL,
    RE_EMOJI,
    RE_ANY
]

# List of ignored token patterns
IGNORED = [
    RE_WHITESPACE
]

# Final pattern: ignored alternatives come first and sit outside the single
# capture group, so findall() yields '' for them and the token text otherwise.
# re.IGNORECASE applies to the whole pattern, which is also what the former
# inline `(?i)` did on interpreters that still accepted it mid-pattern.
RE_PATTERN = re.compile(
    r'|'.join(IGNORED) + r'|(' + r'|'.join(TOKENS) + r')',
    re.UNICODE | re.IGNORECASE)
def tokenize(text):
    '''Break the input string into tokens using the module-level RE_PATTERN.

    # Arguments:
        text: Input string to be tokenized.

    # Returns:
        List of strings (tokens), in the order they occur in `text`.
    '''
    # findall() reports the capture group for every match; entries that are
    # empty or whitespace-only are dropped.
    return [token for token in RE_PATTERN.findall(text) if token.strip()]