Spaces:
Sleeping
Sleeping
| import MeCab | |
| import re | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
class JapaneseTextVectorizer:
    """Extract nouns from Japanese text with MeCab and score them by TF-IDF.

    Attributes set in ``__init__``:
        mecab_tagger: ``MeCab.Tagger`` used for morphological analysis.
        tfidf_model: ``TfidfVectorizer`` configured with a word-character
            token pattern and ``norm=None``; it is refitted on every call to
            ``fit_transform``.
        vocab_list: Initialised to an empty list; none of the methods
            visible in this class read or write it afterwards.
    """

    # Matches a surface form made up solely of characters in the hiragana
    # block (U+3040-U+309F). Compiled once instead of once per token.
    _HIRAGANA_ONLY = re.compile(r'^[\u3040-\u309F]+$')

    def __init__(self):
        """Initialise the MeCab tagger and the TF-IDF vectorizer."""
        self.mecab_tagger = MeCab.Tagger()
        self.tfidf_model = TfidfVectorizer(token_pattern='(?u)\\b\\w+\\b', norm=None)
        self.vocab_list = []

    def _extract_nouns(self, text):
        """Return the nouns found in *text*, in order of appearance.

        A node is kept when the first field of its MeCab feature string is
        "名詞" (noun) and its surface form is neither numeric nor written
        entirely in hiragana. Duplicates are kept.

        Parameters:
        - text (str): Text to extract nouns from.

        Returns:
        - nouns (list): Surface forms of the retained nouns.
        """
        nouns = []
        node = self.mecab_tagger.parseToNode(text)
        while node:
            surface = node.surface
            # The first comma-separated field of the feature string is the
            # part-of-speech tag.
            pos = node.feature.split(",")[0]
            if (
                pos == "名詞"
                and not surface.isnumeric()
                and not self._HIRAGANA_ONLY.match(surface)
            ):
                nouns.append(surface)
            node = node.next
        return nouns

    def fit_transform(self, text, top_k=5):
        """Return the highest-scoring nouns of *text* with their TF-IDF values.

        The vectorizer is fitted with every extracted noun occurrence treated
        as its own document, then the space-joined nouns are transformed as a
        single document. Only strictly positive scores are kept.

        Parameters:
        - text (str): Text to convert into a TF-IDF representation.
        - top_k (int | None): Maximum number of entries to return
          (default 5). ``None`` returns every positive-scoring term.

        Returns:
        - top_tfidf (dict): Term -> TF-IDF value, ordered from highest to
          lowest score. Empty when the text yields no usable noun.
        """
        nouns = self._extract_nouns(text)
        if not nouns:
            # Fitting on an empty corpus would raise ValueError
            # ("empty vocabulary"); there is simply nothing to score.
            return {}

        try:
            self.tfidf_model.fit(nouns)
        except ValueError:
            # Raised by scikit-learn when none of the nouns produces a token
            # under the configured token pattern (e.g. symbol-only surfaces).
            return {}

        joined_nouns = " ".join(nouns)
        scores = self.tfidf_model.transform([joined_nouns]).toarray()[0]
        terms = self.tfidf_model.get_feature_names_out()
        positive = {term: score for term, score in zip(terms, scores) if score > 0}

        # Sort by score, highest first, and keep the leading top_k entries.
        ranked = sorted(positive.items(), key=lambda item: item[1], reverse=True)
        return dict(ranked[:top_k])