Spaces:
Sleeping
Sleeping
| from datetime import datetime | |
| import re, trafilatura | |
| from trafilatura.settings import DEFAULT_CONFIG | |
# Presumably meant to cap the size of pages downloaded through trafilatura.
# NOTE(review): this sets a plain Python attribute on DEFAULT_CONFIG. If
# DEFAULT_CONFIG is a configparser.ConfigParser (as trafilatura's settings
# documentation suggests), the library reads its options from the "DEFAULT"
# section and this assignment would have no effect. Confirm, and consider
# DEFAULT_CONFIG.set("DEFAULT", "MAX_FILE_SIZE", "50000") if the cap is wanted.
DEFAULT_CONFIG.MAX_FILE_SIZE = 50000
# Compiled patterns for a calendar date embedded in a URL, listed in the order
# they are tried. Each pattern exposes the named groups "y", "m" and "d".
_URL_DATE_PATS = [
    re.compile(_src)
    for _src in (
        r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})",  # 2025-07-03
        r"(?P<y>\d{4})/(?P<m>\d{2})/(?P<d>\d{2})",  # 2025/07/03
        r"(?P<y>\d{4})(?P<m>\d{2})(?P<d>\d{2})",  # 20250703
    )
]
def _meta_date(url: str):
    """Fetch *url* and return the publication date found in its metadata.

    The page is downloaded with ``trafilatura.fetch_url`` and its metadata is
    read with ``trafilatura.extract_metadata``. The metadata date is parsed as
    an ISO 8601 string; if the full value cannot be parsed, only the part
    before ``"T"`` (the calendar date) is tried.

    Args:
        url: Address of the page to inspect.

    Returns:
        A naive ``datetime`` (any UTC offset in the metadata is dropped so the
        value can be compared with other naive datetimes), or ``None`` when
        the page cannot be fetched, carries no date, or the date cannot be
        parsed.
    """
    page = trafilatura.fetch_url(url, config=DEFAULT_CONFIG)
    if not page:
        return None

    meta = trafilatura.extract_metadata(page)
    if not meta or not meta.date:
        return None

    # Coerce to text so a non-string date value cannot raise TypeError below.
    raw = str(meta.date).strip()
    try:
        parsed = datetime.fromisoformat(raw)
    except ValueError:
        try:
            parsed = datetime.fromisoformat(raw.split("T")[0])
        except ValueError:
            return None

    # fromisoformat yields an aware datetime when the string has an offset;
    # comparing that with a naive datetime raises TypeError, so keep the
    # wall-clock value and discard the offset.
    return parsed.replace(tzinfo=None)
def _regex_date(url: str):
    """Return the first valid calendar date embedded in *url*, or ``None``.

    The patterns in ``_URL_DATE_PATS`` are tried in order. Every match of a
    pattern is considered, not only the leftmost one, so a digit run that
    merely looks like a date (for example an article id such as ``12345678``)
    does not hide a real date that appears later in the URL.

    Args:
        url: URL to scan.

    Returns:
        A naive ``datetime`` at midnight of the matched day, or ``None`` when
        no match forms a valid date.
    """
    for pat in _URL_DATE_PATS:
        for m in pat.finditer(url):
            try:
                return datetime(
                    int(m.group("y")), int(m.group("m")), int(m.group("d"))
                )
            except ValueError:
                # The digits matched but are not a real date (month 13,
                # day 32, year 0, ...); keep looking.
                continue
    return None
def is_after_start(url: str, start_ymd: str) -> bool:
    """Return whether the page at *url* is dated on or after *start_ymd*.

    The publication date comes from ``_meta_date(url)``; when that returns
    ``None``, ``_regex_date(url)`` is used instead. If neither yields a date,
    the URL is accepted.

    Args:
        url: Address of the page to check.
        start_ymd: Earliest accepted date, formatted as 'YYYYMMDD'.

    Returns:
        ``True`` when the publication date is on or after the start date, or
        when no publication date could be determined; ``False`` otherwise.

    Raises:
        ValueError: If *start_ymd* is not a valid 'YYYYMMDD' date.
    """
    t0 = datetime.strptime(start_ymd, "%Y%m%d")

    pub_dt = _meta_date(url)
    if pub_dt is None:
        pub_dt = _regex_date(url)
    if pub_dt is None:
        # Unknown date: keep the page rather than drop it.
        return True

    # t0 is naive; comparing it with an aware datetime raises TypeError, so
    # drop any offset and compare wall-clock values.
    if pub_dt.tzinfo is not None:
        pub_dt = pub_dt.replace(tzinfo=None)
    return pub_dt >= t0