Spaces:
Build error
Build error
| from collections import defaultdict, Counter | |
| from nltk.corpus import stopwords | |
class RuleBasedHeuristic:
    """Rule-based helpers for merging adjacent tokens and collecting common tokens.

    Attributes:
        sentence: Stored as given; not read by any method in this class.
        corpus: Iterable of sentence strings (or None) used to count the
            most frequent sentence-initial tokens.
    """

    def __init__(self, sentence=None, corpus=None):
        self.sentence = sentence
        self.corpus = corpus

    def add_contiguous_titlecase_words(self, row):
        """Merge runs of positionally adjacent words into single phrases.

        Args:
            row: Sliceable sequence of indexable items whose first element
                is an integer position and whose last element is the word
                string, e.g. ``[(0, "New"), (1, "York")]``.
                NOTE(review): presumably the caller passes only title-cased
                tokens, as the method name suggests -- confirm.

        Returns:
            A list of space-joined phrases, one per run of two or more
            items whose positions increase by exactly 1. Items that have no
            adjacent neighbour are not included in the result.
        """
        matches = []
        run = []  # words of the run of adjacent positions currently open
        for previous, current in zip(row, row[1:]):
            if current[0] - previous[0] == 1:
                if not run:
                    # A new run starts with the left-hand word of the pair.
                    run.append(previous[-1])
                run.append(current[-1])
            elif run:
                # Gap in positions: close the open run.
                matches.append(" ".join(run))
                run = []
        if run:
            matches.append(" ".join(run))
        return matches

    def augment_using_most_frequent_starting_token(self, N=1):
        """Return the most frequent first tokens of the corpus sentences.

        Empty or whitespace-only sentences have no first token and are
        skipped; a corpus of ``None`` is treated as empty.

        Args:
            N: Number of entries to return, passed to
                ``Counter.most_common``.

        Returns:
            A list of ``(token, count)`` tuples, most frequent first.
        """
        corpus = self.corpus if self.corpus is not None else ()
        first_tokens = Counter()
        for sentence in corpus:
            # Only the first whitespace-delimited token is needed.
            tokens = sentence.split(maxsplit=1)
            if tokens:
                first_tokens[tokens[0]] += 1
        return first_tokens.most_common(N)

    def get_top_tokens(self, top_most_common_ptb=None):
        """Return the English stopword set, optionally extended from the corpus.

        Args:
            top_most_common_ptb: When truthy, the number of most frequent
                sentence-initial corpus tokens to add to the stopwords.

        Returns:
            A set of token strings.
        """
        out = set(stopwords.words("english"))
        if top_most_common_ptb:
            frequent = self.augment_using_most_frequent_starting_token(
                N=top_most_common_ptb
            )
            out.update(token for token, _ in frequent)
        return out