Spaces:
Runtime error
Runtime error
| import os | |
| from pathlib import Path | |
| import abc | |
| import pandas as pd | |
| from dotenv import load_dotenv | |
| import notion_client as nt | |
| from notion2md.exporter.block import StringExporter | |
| from loguru import logger | |
class BaseNotionDatabase:
    """Base class for pulling page content out of a Notion database.

    Settings come from environment variables (a ``.env`` file is loaded
    first): ``NOTION_DATABASE_ID`` selects the database to query and
    ``INTEGRATION_TOKEN`` authenticates the Notion client. Subclasses are
    expected to implement :meth:`load_content`.
    """

    def __init__(self):
        """Read the settings from the environment and build the client.

        Raises:
            ValueError: If ``INTEGRATION_TOKEN`` is missing or empty.
        """
        load_dotenv()
        self.notion_database_id = os.getenv("NOTION_DATABASE_ID")
        self.integration_token = os.getenv("INTEGRATION_TOKEN")
        if not self.integration_token:
            # Writing None into os.environ fails with an opaque
            # "str expected, not NoneType"; fail with an actionable message.
            raise ValueError(
                "INTEGRATION_TOKEN is not set; define it in the environment "
                "or in a .env file."
            )
        # Environment variable used by notion2md.
        os.environ["NOTION_TOKEN"] = self.integration_token
        self.notion_client = nt.Client(auth=self.integration_token)

    def load_database(self) -> list[dict]:
        """Fetch the full list of page entries of the configured database.

        A single query returns at most 100 entries, so the query is repeated
        with the cursor of the previous response until ``has_more`` is false.

        Returns:
            The ``results`` entries of every response, concatenated in order.
        """
        results = []
        start_cursor = None
        while True:
            query = {"database_id": self.notion_database_id}
            # Only send a cursor once we actually have one: the first request
            # must not carry a null start_cursor.
            if start_cursor is not None:
                query["start_cursor"] = start_cursor
            response = self.notion_client.databases.query(**query)
            results.extend(response["results"])
            start_cursor = response["next_cursor"]
            # Also stop when no cursor came back, otherwise the first page
            # would be requested again forever.
            if not response["has_more"] or start_cursor is None:
                break
        return results

    def load_content(self) -> list[dict]:
        """Return the content of the database pages as a list of dicts.

        Raises:
            NotImplementedError: Always; subclasses provide the implementation.
        """
        raise NotImplementedError
class SakurapDB(BaseNotionDatabase):
    """Notion database reader limited to pages whose "Rap詞 : 櫻井翔" checkbox is checked."""

    def load_database(self) -> list[dict]:
        """Fetch the page entries whose "Rap詞 : 櫻井翔" checkbox is True.

        A single query returns at most 100 entries, so the query is repeated
        with the cursor of the previous response until ``has_more`` is false.

        Returns:
            The ``results`` entries of every response, concatenated in order.
        """
        results = []
        start_cursor = None
        while True:
            query = {
                "database_id": self.notion_database_id,
                # Keep only pages where the checkbox property is ticked.
                "filter": {
                    "property": "Rap詞 : 櫻井翔",
                    "checkbox": {
                        "equals": True
                    },
                },
            }
            # Only send a cursor once we actually have one: the first request
            # must not carry a null start_cursor.
            if start_cursor is not None:
                query["start_cursor"] = start_cursor
            response = self.notion_client.databases.query(**query)
            results.extend(response["results"])
            start_cursor = response["next_cursor"]
            # Also stop when no cursor came back, otherwise the first page
            # would be requested again forever.
            if not response["has_more"] or start_cursor is None:
                break
        return results

    def __load_blocks(self, block_id: str) -> str:
        """Return a page's paragraph blocks as plain text (Notion official API).

        Alternative to the notion2md export used by :meth:`load_content`.

        Parameters
        ----------
        block_id:
            ID of the block (page) whose children are read.

        Returns
        -------
        str
            One entry per paragraph block, joined with newlines. A paragraph
            without text contributes a bare newline.
        """
        lines = []
        start_cursor = None
        while True:
            params = {"block_id": block_id}
            if start_cursor is not None:
                params["start_cursor"] = start_cursor
            response = self.notion_client.blocks.children.list(**params)
            for child in response["results"]:
                paragraph = child.get("paragraph")
                if paragraph is None:
                    # Headings, images, etc. carry no "paragraph" payload.
                    continue
                rich_text = paragraph["rich_text"]
                if rich_text:
                    # A paragraph is split into several rich-text runs as soon
                    # as part of it is styled; keep all of them.
                    lines.append("".join(run["plain_text"] for run in rich_text))
                else:
                    lines.append("\n")
            start_cursor = response.get("next_cursor")
            if not response.get("has_more") or start_cursor is None:
                break
        return "\n".join(lines)

    def load_content(self) -> list[dict]:
        """Return title and lyric text of every selected page.

        Returns:
            One dict per page with the keys ``"title"`` (text of the "名前"
            title property, empty for untitled pages) and ``"content"`` (page
            body exported as Markdown with the markup stripped).
        """
        logger.info("🚦 [Notion] load database...")
        db_results = self.load_database()

        lyrics = []
        logger.info("🚦 [Notion] start to load each page content ...")
        for result in db_results:
            block_id = result["id"]
            # Export the page body as Markdown via notion2md.
            rap_lyric = StringExporter(block_id=block_id).export()
            # Strip Markdown decoration: collapse blank lines, turn <br/> into
            # newlines and drop emphasis asterisks.
            rap_lyric = rap_lyric.replace("\n\n", "\n").replace("<br/>", "\n").replace("*", "")
            # The title is a list of rich-text runs; it is empty for untitled
            # pages and has several entries when partly styled.
            title_runs = result["properties"]["名前"]["title"]
            title = "".join(run["plain_text"] for run in title_runs)
            lyrics.append(
                {
                    "title": title,
                    "content": rap_lyric,
                }
            )
        logger.info("🚦 [Notion] Finish to load.")
        return lyrics
def fetch_sakurap_corpus(filepath: str, refetch=False) -> pd.DataFrame:
    """Return the Sakurap lyric corpus, using a CSV file as a cache.

    The corpus is fetched from Notion and written to ``filepath`` when that
    file does not exist yet or when ``refetch`` is true; otherwise the
    existing CSV file is read.

    Parameters
    ----------
    filepath:
        Location of the CSV cache file.
    refetch:
        If true, fetch from Notion again and overwrite the CSV file.

    Returns
    -------
    pd.DataFrame
        One row per page with the columns ``title`` and ``content``.
    """
    filepath = Path(filepath)
    if filepath.exists() and not refetch:
        logger.info("🚦 [Notion] load CSV file.")
        return pd.read_csv(filepath)

    # Create the directory that will hold the CSV file if it is missing.
    if not filepath.parent.exists():
        logger.info(f"🚦 [Notion] mkdir {str(filepath.parent)} ...")
        filepath.parent.mkdir(parents=True, exist_ok=True)
    logger.info("🚦 [Notion] fetch from Notion DB ...")
    lyrics = SakurapDB().load_content()
    # Pin the columns so that an empty result still yields (and caches) a
    # frame with the expected header instead of a column-less one.
    lyrics_df = pd.DataFrame(lyrics, columns=["title", "content"])
    lyrics_df.to_csv(filepath, index=False)
    return lyrics_df
if __name__ == "__main__":
    # Script entry point: build the Sakurap reader and load all page content
    # once from Notion.
    lyrics = SakurapDB().load_content()