Spaces:
Runtime error
Runtime error
| import os | |
| import re | |
| from typing import Iterator, Tuple, Union | |
| from hbutils.system import urlsplit | |
| from .web import WebDataSource | |
| from ..utils import get_requests_session, srequest | |
| def _extract_words(keyword): | |
| return list(filter(bool, re.split(r'[\W_]+', keyword))) | |
class DuitangSource(WebDataSource):
    """Web data source that yields images from Duitang keyword-search results.

    Posts are fetched page by page from the ``napi/blog/list/by_search``
    endpoint. In strict mode, only posts whose title contains every word of
    the search keyword are yielded.
    """

    def __init__(self, keyword: str, strict: bool = True, page_size: int = 100,
                 group_name: str = 'duitang', download_silent: bool = True):
        """
        :param keyword: Search keyword sent to the Duitang search endpoint.
        :param strict: When true, skip posts whose title does not contain
            every word extracted from ``keyword``.
        :param page_size: Number of posts requested per page.
        :param group_name: Passed to ``WebDataSource``; also used as the
            prefix of generated group ids and filenames.
        :param download_silent: Passed unchanged to ``WebDataSource``.
        """
        WebDataSource.__init__(self, group_name, get_requests_session(), download_silent)
        self.keyword = keyword
        self.words = set(_extract_words(keyword))
        self.page_size: int = page_size
        self.strict = strict

    def _check_title(self, title):
        """Return whether a post with the given title should be kept.

        Always true when strict mode is off. In strict mode the title must
        contain every keyword word. A title that is not a string (e.g.
        ``None``) is treated as containing no words instead of raising.
        """
        if not self.strict:
            return True
        if not isinstance(title, str):
            # Same outcome as an empty title: it matches only when the
            # keyword itself yielded no words.
            return not self.words
        return self.words <= set(_extract_words(title))

    def _iter_data(self) -> Iterator[Tuple[Union[str, int], str, dict]]:
        """Yield ``(post_id, image_url, meta)`` for each accepted search result.

        Pages are requested with an increasing ``start`` offset until the
        response carries no post list (missing, null or empty).

        :raises: whatever ``resp.raise_for_status()`` raises on an HTTP error.
        """
        offset = 0
        while True:
            resp = srequest(self.session, 'GET', 'https://www.duitang.com/napi/blog/list/by_search/', params={
                'kw': self.keyword,
                'start': str(offset),
                'limit': str(self.page_size),
            })
            resp.raise_for_status()
            raw = resp.json()

            # Tolerate a missing or null 'data' / 'object_list' section:
            # either way there is nothing more to iterate.
            data = raw.get('data') if isinstance(raw, dict) else None
            posts = data.get('object_list') if isinstance(data, dict) else None
            if not posts:
                break

            for post in posts:
                # .get(): a post without a 'msg' field is handled by
                # _check_title instead of aborting the iteration.
                if not self._check_title(post.get('msg')):
                    continue

                url = post['photo']['path']
                _, ext_name = os.path.splitext(urlsplit(url).filename)
                group_id = f'{self.group_name}_{post["id"]}'
                meta = {
                    'duitang': post,
                    'group_id': group_id,
                    'filename': f'{group_id}{ext_name}',
                }
                yield post['id'], url, meta

            offset += self.page_size