Spaces:
Runtime error
Runtime error
| # encoding: utf-8 | |
| import os | |
| import tqdm | |
| from bs4 import BeautifulSoup as bs | |
| import urllib.request | |
| import json | |
| import datetime | |
| import pytz | |
def _today_tag():
    """Return today's date in the America/New_York zone, e.g. 'Wed, 10 May 23'.

    The timezone-aware datetime is formatted directly. Round-tripping it
    through ``date.fromtimestamp(now.timestamp())`` would convert the instant
    back into the machine's local zone and silently drop the requested zone.
    """
    now = datetime.datetime.now(tz=pytz.timezone("America/New_York"))
    return now.strftime("%a, %d %b %y")


def _cache_path(field_abbr, date_tag):
    """Return the path of the JSONL cache file for a field and a date tag."""
    return f"./data/{field_abbr}_{date_tag}.jsonl"


def _parse_entry(dt, dd):
    """Build one paper record from a matching <dt>/<dd> pair of the listing.

    Returns a dict with the keys ``main_page``, ``pdf``, ``title``,
    ``authors``, ``subjects`` and ``abstract``.
    """
    arxiv_base = "https://arxiv.org/abs/"
    # The identifier is the third space-separated token of the <dt> text,
    # keeping only what follows the last ':' in that token.
    paper_number = dt.text.strip().split(" ")[2].split(":")[-1]
    return {
        'main_page': arxiv_base + paper_number,
        'pdf': arxiv_base.replace('abs', 'pdf') + paper_number,
        'title': dd.find("div", {"class": "list-title mathjax"}).text
                   .replace("Title: ", "").strip(),
        'authors': dd.find("div", {"class": "list-authors"}).text
                     .replace("Authors:\n", "").replace("\n", "").strip(),
        'subjects': dd.find("div", {"class": "list-subjects"}).text
                      .replace("Subjects: ", "").strip(),
        'abstract': dd.find("p", {"class": "mathjax"}).text
                      .replace("\n", " ").strip(),
    }


def _download_new_papers(field_abbr, date_tag=None):
    """Scrape the arXiv "new" listing of a field into a JSONL cache file.

    Fetches ``https://arxiv.org/list/{field_abbr}/new``, parses the entries of
    the first ``<dl>`` inside the ``div#content`` element and writes one JSON
    object per line to ``./data/{field_abbr}_{date_tag}.jsonl``, creating
    ``./data`` if needed.

    Args:
        field_abbr: arXiv field abbreviation used in the listing URL, e.g. "cs".
        date_tag: Date string used in the file name. Defaults to today's date
            in the America/New_York zone; callers that already computed the
            tag should pass it so that both sides agree on the file name.

    Raises:
        ValueError: If the listing has a different number of <dt> and <dd>
            elements, i.e. entries cannot be paired up.
    """
    if date_tag is None:
        date_tag = _today_tag()

    new_sub_url = f'https://arxiv.org/list/{field_abbr}/new'  # https://arxiv.org/list/cs/new
    # Close the HTTP response as soon as the document has been parsed.
    with urllib.request.urlopen(new_sub_url) as page:
        # NOTE(review): no parser is named, so BeautifulSoup picks whichever
        # one is installed; pin one explicitly once the deployment's parser
        # is confirmed.
        soup = bs(page)

    content = soup.body.find("div", {'id': 'content'})
    dt_list = content.dl.find_all("dt")
    dd_list = content.dl.find_all("dd")
    # Explicit check instead of `assert`, which is removed under `python -O`.
    if len(dt_list) != len(dd_list):
        raise ValueError(
            f"arXiv listing mismatch: {len(dt_list)} <dt> vs {len(dd_list)} <dd> elements"
        )

    new_paper_list = [
        _parse_entry(dt, dd)
        for dt, dd in tqdm.tqdm(zip(dt_list, dd_list), total=len(dt_list))
    ]

    os.makedirs("./data", exist_ok=True)
    # One JSON object per line (JSONL).
    with open(_cache_path(field_abbr, date_tag), "w", encoding="utf-8") as f:
        f.writelines(json.dumps(paper) + "\n" for paper in new_paper_list)


def get_papers(field_abbr, limit=None):
    """Return today's new arXiv papers for a field, downloading them if needed.

    The cache file for today's date (America/New_York zone) is read from
    ``./data``; when it does not exist yet it is created first by
    ``_download_new_papers``.

    Args:
        field_abbr: arXiv field abbreviation, e.g. "cs".
        limit: Maximum number of papers to return. ``None`` or ``0`` returns
            every paper in the cache file.

    Returns:
        A list of paper dicts, in the order they appear in the cache file.
    """
    # Compute the tag once and share it with the downloader, so a day rollover
    # between the two steps cannot make them disagree on the file name.
    date_tag = _today_tag()
    path = _cache_path(field_abbr, date_tag)
    if not os.path.exists(path):
        _download_new_papers(field_abbr, date_tag=date_tag)

    results = []
    with open(path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if limit and i == limit:
                break
            results.append(json.loads(line))
    return results