Spaces:
Running
Running
| import re | |
# Maps a contraction written with missing or misplaced apostrophes to its
# correctly punctuated form. Lookups are exact and case-sensitive.
#
# NOTE: the capitalized keys ("Id've", "I'dve", "Im", "Ive") can only match
# for callers that look words up without lowercasing them first;
# normalize_word lowercases its input before consulting this table.
# Entries such as "let's" and "she's" map to themselves and are no-ops.
contractions = {
    "aint": "ain't",
    "arent": "aren't",
    "cant": "can't",
    "couldve": "could've",
    "couldnt": "couldn't",
    "couldn'tve": "couldn't've",
    "couldnt've": "couldn't've",
    "didnt": "didn't",
    "doesnt": "doesn't",
    "dont": "don't",
    "hadnt": "hadn't",
    "hadnt've": "hadn't've",
    "hadn'tve": "hadn't've",
    "hasnt": "hasn't",
    "havent": "haven't",
    "hed": "he'd",
    "hed've": "he'd've",
    "he'dve": "he'd've",
    "hes": "he's",
    "howd": "how'd",
    "howll": "how'll",
    "hows": "how's",
    "Id've": "I'd've",
    "I'dve": "I'd've",
    "Im": "I'm",
    "Ive": "I've",
    "isnt": "isn't",
    "itd": "it'd",
    "itd've": "it'd've",
    "it'dve": "it'd've",
    "itll": "it'll",
    "let's": "let's",
    "maam": "ma'am",
    "mightnt": "mightn't",
    "mightnt've": "mightn't've",
    "mightn'tve": "mightn't've",
    "mightve": "might've",
    "mustnt": "mustn't",
    "mustve": "must've",
    "neednt": "needn't",
    "notve": "not've",
    "oclock": "o'clock",
    "oughtnt": "oughtn't",
    "ow's'at": "'ow's'at",
    "'ows'at": "'ow's'at",
    "'ow'sat": "'ow's'at",
    "shant": "shan't",
    "shed've": "she'd've",
    "she'dve": "she'd've",
    "she's": "she's",
    "shouldve": "should've",
    "shouldnt": "shouldn't",
    "shouldnt've": "shouldn't've",
    "shouldn'tve": "shouldn't've",
    # Was inverted ("somebody'd" -> "somebodyd"), which removed the
    # apostrophe from a correct contraction instead of restoring it.
    "somebodyd": "somebody'd",
    "somebodyd've": "somebody'd've",
    "somebody'dve": "somebody'd've",
    "somebodyll": "somebody'll",
    "somebodys": "somebody's",
    "someoned": "someone'd",
    "someoned've": "someone'd've",
    "someone'dve": "someone'd've",
    "someonell": "someone'll",
    "someones": "someone's",
    "somethingd": "something'd",
    "somethingd've": "something'd've",
    "something'dve": "something'd've",
    "somethingll": "something'll",
    "thats": "that's",
    "thered": "there'd",
    "thered've": "there'd've",
    "there'dve": "there'd've",
    "therere": "there're",
    "theres": "there's",
    "theyd": "they'd",
    "theyd've": "they'd've",
    "they'dve": "they'd've",
    "theyll": "they'll",
    "theyre": "they're",
    "theyve": "they've",
    "twas": "'twas",
    "wasnt": "wasn't",
    "wed've": "we'd've",
    "we'dve": "we'd've",
    "weve": "we've",
    "werent": "weren't",
    "whatll": "what'll",
    "whatre": "what're",
    "whats": "what's",
    "whatve": "what've",
    "whens": "when's",
    "whered": "where'd",
    "wheres": "where's",
    "whereve": "where've",
    "whod": "who'd",
    "whod've": "who'd've",
    "who'dve": "who'd've",
    "wholl": "who'll",
    "whos": "who's",
    "whove": "who've",
    "whyll": "why'll",
    "whyre": "why're",
    "whys": "why's",
    "wont": "won't",
    "wouldve": "would've",
    "wouldnt": "wouldn't",
    "wouldnt've": "wouldn't've",
    "wouldn'tve": "wouldn't've",
    "yall": "y'all",
    "yall'll": "y'all'll",
    "y'allll": "y'all'll",
    "yall'd've": "y'all'd've",
    "y'alld've": "y'all'd've",
    "y'all'dve": "y'all'd've",
    "youd": "you'd",
    "youd've": "you'd've",
    "you'dve": "you'd've",
    "youll": "you'll",
    "youre": "you're",
    "youve": "you've",
}
# Number words mapped to their digit strings. "none" is an extra alias for
# "0"; every other word's value is its position in the list below.
manual_map = {
    "none": "0",
    **{
        word: str(value)
        for value, word in enumerate(
            "zero one two three four five six seven eight nine ten".split()
        )
    },
}
# Words that normalize_word drops from its output.
articles = ["a", "an", "the"]

# Matches any period that is not immediately followed by a digit.
# NOTE(review): the leading "(?!<=\d)" is a negative lookahead for the
# literal text "<=" plus a digit, which can never hold right before a ".",
# so it has no effect. It looks like a typo for a lookbehind; it is kept
# verbatim so the set of periods being stripped does not change.
# Raw strings avoid the invalid "\d" / "\." / "\," escape sequences.
period_strip = re.compile(r"(?!<=\d)(\.)(?!\d)")

# Matches a comma sitting directly between two digits, e.g. "1,000".
comma_strip = re.compile(r"(\d)(\,)(\d)")
# Single-character punctuation marks, kept in their original order.
# A raw string keeps the backslash literal.
punct = list(r';/[]"{}()=+\_-><@`,?!')
def normalize_word(token: str) -> str:
    """Normalize a text string so equivalent answers compare equal.

    Steps, in order:
      1. Each mark in ``punct`` is deleted when the input contains it next
         to a space, or when the input contains a digit,digit comma;
         otherwise the mark is replaced by a space.
      2. Periods matched by ``period_strip`` are removed.
      3. The text is lowercased and split on whitespace.
      4. Words found in ``manual_map`` are replaced by their mapped value.
      5. Words listed in ``articles`` are dropped.
      6. Words found in ``contractions`` are replaced by their mapped value.

    Args:
        token: The raw text to normalize.

    Returns:
        The normalized words joined by single spaces.
    """
    # Loop-invariant: evaluated against the original input, not the
    # partially cleaned text, so it is computed once up front.
    has_digit_comma = comma_strip.search(token) is not None

    cleaned = token
    for mark in punct:
        # The adjacency test also looks at the original input on purpose.
        if has_digit_comma or mark + " " in token or " " + mark in token:
            cleaned = cleaned.replace(mark, "")
        else:
            cleaned = cleaned.replace(mark, " ")

    # No third argument: Pattern.sub's third positional parameter is
    # `count`, so passing re.UNICODE there capped the number of removals.
    cleaned = period_strip.sub("", cleaned)

    words = []
    for word in cleaned.lower().split():
        # .get rather than .setdefault, so that looking a word up never
        # inserts it into the shared module-level table.
        word = manual_map.get(word, word)
        if word not in articles:
            words.append(contractions.get(word, word))

    # Defensive: drop any comma that survived the steps above.
    return " ".join(words).replace(",", "")