from parser import parse_article, Article
from ai.classify_paper import classify_papers
import os
import requests
import datetime
import hashlib
import json
from rich import print
from date import Date
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Dict

API_URL = "https://huggingface.co/api/daily_papers"

# In-memory cache: url hash -> response data, and url hash -> time the entry was cached
cache = {}
cache_expiry = {}
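
# NOTE (assumption): `Date` is imported from this Space's local `date` module.
# From its usage below, the module appears to rely on roughly this interface:
#   Date()            -> today's date
#   Date(2025, 1, 21) -> a specific calendar date
#   date += 1         -> advance by one day
#   <=, ==            -> chronological comparison
#   str(date)         -> the date string passed to the API's ?date= parameter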


def make_request(url: str):
    # Create a hash of the URL to use as the cache key
    url_hash = hashlib.md5(url.encode()).hexdigest()
    current_time = datetime.datetime.now()

    # Check if the response is already cached and not expired (1-hour TTL);
    # total_seconds() ensures entries older than a day also count as expired
    if url_hash in cache and (current_time - cache_expiry[url_hash]).total_seconds() < 3600:
        print(f"Cache hit for URL: {url}")
        return cache[url_hash]

    # Optional proxy configuration from environment variables
    http_proxy = os.getenv("HF_HTTP_PROXY")
    https_proxy = os.getenv("HF_HTTPS_PROXY")
    proxies = {
        "http": http_proxy,
        "https": https_proxy
    } if http_proxy or https_proxy else None

    # Retry up to 3 times; fall back to an empty list if every attempt fails
    attempts = 0
    while attempts < 3:
        try:
            response = requests.get(url, proxies=proxies)
            response.raise_for_status()
            data = response.json()
            # Cache the response and set the expiry time
            cache[url_hash] = data
            cache_expiry[url_hash] = current_time
            return data
        except requests.RequestException as e:
            attempts += 1
            print(f"Attempt {attempts} failed: {e}")
            if attempts == 3:
                return []


def fetch_papers():
    data = make_request(API_URL)
    return [parse_article(item) for item in data]


def fetch_papers_with_date(date: Date):
    formatted_date = str(date)
    data = make_request(API_URL + "?date=" + formatted_date)
    return [parse_article(item) for item in data]
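
# NOTE (assumption): a rough sketch of one raw item returned by the daily_papers
# endpoint, inferred only from the fields this module reads after parse_article;
# the authoritative schema lives in the API response and parser.Article:
# {
#     "title": "...",
#     "publishedAt": "...",
#     "paper": {"id": "...", "summary": "...", ...}
# }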


def fetch_papers_with_daterange(start_date: Date, end_date: Date):
    articles: List[Article] = []

    # Build the inclusive list of dates to fetch
    current_date = start_date
    dates = []
    while current_date <= end_date:
        dates.append(current_date)
        current_date += 1

    def fetch_for_date(date):
        print(date)
        if date == Date():
            print("Fetching papers for today")
            return fetch_papers()
        else:
            print(f"Fetching papers for {date}")
            return fetch_papers_with_date(date)

    # Fetch every date concurrently
    with ThreadPoolExecutor(max_workers=8) as executor:
        future_to_date = {executor.submit(fetch_for_date, date): date for date in dates}
        for future in as_completed(future_to_date):
            date = future_to_date[future]
            try:
                articles.extend(future.result())
            except Exception as e:
                print(f"Error fetching articles for date {date}: {e}")

    # articles = [article for article in articles if (start_date <= Date(article.publishedAt.isoformat().split('T')[0]) <= end_date)]
    # Deduplicate by paper id
    unique_articles: Dict[str, Article] = {}
    for article in articles:
        if article.paper.id not in unique_articles:
            unique_articles[article.paper.id] = article
    print(f"Unique articles: {len(unique_articles)}")

    # Keep only the fields needed for classification
    preprocessed_articles: List[Article] = list(unique_articles.values())
    preprocessed_articles = list(map(lambda article: {
        "title": article.title,
        "abstract": article.paper.summary,
        "id": article.paper.id
    }, preprocessed_articles))

    # classified_articles = classify_papers(preprocessed_articles)
    # Iterate over classified_articles and write each classification result back into unique_articles
    # for article in classified_articles:
    #     unique_articles[article["id"]].paper.label = article["category"]

    return list(unique_articles.values())


if __name__ == "__main__":
    from rich import print

    start_date = Date(2025, 1, 21)
    end_date = Date(2025, 2, 1)
    articles = fetch_papers_with_daterange(start_date=start_date, end_date=end_date)
    # print(articles)
    print(f"Total articles: {len(articles)}")