- app copy.py +0 -134
- app.py +19 -117
- main.py +89 -29
app copy.py
DELETED
@@ -1,134 +0,0 @@
-import base64
-import os
-from collections import defaultdict
-from datetime import date, datetime, timedelta
-from io import BytesIO
-
-import dotenv
-from datasets import load_dataset
-from dateutil.parser import parse
-from dateutil.tz import tzutc
-from fasthtml.common import *
-from huggingface_hub import login, whoami
-
-dotenv.load_dotenv()
-
-style = Style("""
-    .grid { margin-bottom: 1rem; }
-    .card { display: flex; flex-direction: column; }
-    .card img { margin-bottom: 0.5rem; }
-    .card h5 { margin: 0; font-size: 0.9rem; line-height: 1.2; }
-    .card a { color: inherit; text-decoration: none; }
-    .card a:hover { text-decoration: underline; }
-""")
-
-app, rt = fast_app(html_style=(style,))
-
-login(token=os.environ.get("HF_TOKEN"))
-
-hf_user = whoami(os.environ.get("HF_TOKEN"))["name"]
-HF_REPO_ID = f"{hf_user}/zotero-articles"
-
-abstract_ds = load_dataset(HF_REPO_ID, "abstracts", split="train")
-article_ds = load_dataset(HF_REPO_ID, "articles", split="train")
-
-image_ds = load_dataset(HF_REPO_ID, "images", split="train")
-image_ds = image_ds.filter(lambda x: x["page_number"] == 1)
-
-
-def parse_date(date_string):
-    try:
-        return parse(date_string).astimezone(tzutc()).date()
-    except ValueError:
-        return date.today()
-
-
-def get_week_start(date_obj):
-    return date_obj - timedelta(days=date_obj.weekday())
-
-
-week2articles = defaultdict(list)
-for article in article_ds:
-    date_added = parse_date(article["date_added"])
-    week_start = get_week_start(date_added)
-    week2articles[week_start].append(article["arxiv_id"])
-
-weeks = sorted(week2articles.keys(), reverse=True)
-
-
-def get_article_details(arxiv_id):
-    article = article_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)[0]
-    abstract = abstract_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)
-    image = image_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)
-    return article, abstract, image
-
-
-def generate_week_content(current_week):
-    week_index = weeks.index(current_week)
-    prev_week = weeks[week_index + 1] if week_index < len(weeks) - 1 else None
-    next_week = weeks[week_index - 1] if week_index > 0 else None
-
-    nav_buttons = Group(
-        Button(
-            "← Previous Week",
-            hx_get=f"/week/{prev_week}" if prev_week else "#",
-            hx_target="#content",
-            hx_swap="innerHTML",
-            disabled=not prev_week,
-        ),
-        Button(
-            "Next Week →",
-            hx_get=f"/week/{next_week}" if next_week else "#",
-            hx_target="#content",
-            hx_swap="innerHTML",
-            disabled=not next_week,
-        ),
-    )
-
-    articles = week2articles[current_week]
-    article_cards = []
-    for arxiv_id in articles:
-        article, abstract, image = get_article_details(arxiv_id)
-        article_title = article["contents"][0].get("paper_title", "article") if article["contents"] else "article"
-
-        card_content = [H5(A(article_title, href=f"https://arxiv.org/abs/{arxiv_id}", target="_blank"))]
-
-        if image:
-            pil_image = image[0]["image"]
-            img_byte_arr = BytesIO()
-            pil_image.save(img_byte_arr, format="JPEG")
-            img_byte_arr = img_byte_arr.getvalue()
-            image_url = f"data:image/jpeg;base64,{base64.b64encode(img_byte_arr).decode('utf-8')}"
-            card_content.insert(
-                1, Img(src=image_url, alt="Article image", style="max-width: 100%; height: auto; margin-bottom: 15px;")
-            )
-
-        article_cards.append(Card(*card_content, cls="mb-4"))
-
-    grid = Grid(*article_cards, style="display: grid; grid-template-columns: repeat(3, 1fr); gap: 1rem;")
-
-    week_end = current_week + timedelta(days=6)
-    return Div(
-        nav_buttons,
-        H3(f"Week of {current_week.strftime('%B %d')} - {week_end.strftime('%B %d, %Y')} ({len(articles)} articles)"),
-        grid,
-        nav_buttons,
-        id="content",
-    )
-
-
-@rt("/")
-def get():
-    return Titled("AnswerAI Zotero Weekly", generate_week_content(weeks[0]))
-
-
-@rt("/week/{date}")
-def get(date: str):
-    try:
-        current_week = datetime.strptime(date, "%Y-%m-%d").date()
-        return generate_week_content(current_week)
-    except Exception as e:
-        return Div(f"Error displaying articles: {str(e)}")
-
-
-serve()
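Aside: the deleted app grouped articles into Monday-anchored weeks via `get_week_start`. A minimal standalone check of that bucketing logic (the dates and arXiv IDs below are dummy examples, not taken from the dataset):

```python
from collections import defaultdict
from datetime import date, timedelta


def get_week_start(date_obj):
    # weekday() is 0 for Monday, so this snaps any date back to its Monday.
    return date_obj - timedelta(days=date_obj.weekday())


week2articles = defaultdict(list)
for added, arxiv_id in [(date(2024, 8, 7), "2408.00001"), (date(2024, 8, 9), "2408.00002")]:
    week2articles[get_week_start(added)].append(arxiv_id)

print(dict(week2articles))
# {datetime.date(2024, 8, 5): ['2408.00001', '2408.00002']}
```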
app.py
CHANGED
@@ -1,134 +1,36 @@
-import base64
 import os
-from collections import defaultdict
-from datetime import date, datetime, timedelta
-from io import BytesIO

 import dotenv
-from datasets import load_dataset
-from dateutil.parser import parse
-from dateutil.tz import tzutc
 from fasthtml.common import *
-from huggingface_hub import login, whoami
+from huggingface_hub import HfApi, login, whoami

 dotenv.load_dotenv()
-
-style = Style("""
-    .grid { margin-bottom: 1rem; }
-    .card { display: flex; flex-direction: column; }
-    .card img { margin-bottom: 0.5rem; }
-    .card h5 { margin: 0; font-size: 0.9rem; line-height: 1.2; }
-    .card a { color: inherit; text-decoration: none; }
-    .card a:hover { text-decoration: underline; }
-""")
-
-app, rt = fast_app(html_style=(style,))
-
 login(token=os.environ.get("HF_TOKEN"))
+api = HfApi()

 hf_user = whoami(os.environ.get("HF_TOKEN"))["name"]
-HF_REPO_ID = f"{hf_user}/zotero-articles"
-
-abstract_ds = load_dataset(HF_REPO_ID, "abstracts", split="train")
-article_ds = load_dataset(HF_REPO_ID, "articles", split="train")
-
-image_ds = load_dataset(HF_REPO_ID, "images", split="train")
-image_ds = image_ds.filter(lambda x: x["page_number"] == 1)
-
-
-def parse_date(date_string):
-    try:
-        return parse(date_string).astimezone(tzutc()).date()
-    except ValueError:
-        return date.today()
-
-
-def get_week_start(date_obj):
-    return date_obj - timedelta(days=date_obj.weekday())
-
-
-week2articles = defaultdict(list)
-for article in article_ds:
-    date_added = parse_date(article["date_added"])
-    week_start = get_week_start(date_added)
-    week2articles[week_start].append(article["arxiv_id"])
-
-weeks = sorted(week2articles.keys(), reverse=True)
-
-
-def get_article_details(arxiv_id):
-    article = article_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)[0]
-    abstract = abstract_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)
-    image = image_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)
-    return article, abstract, image
-
+HF_REPO_ID_TXT = f"{hf_user}/zotero-answer-ai-texts"
+HF_REPO_ID_IMG = f"{hf_user}/zotero-answer-ai-images"

-def generate_week_content(current_week):
-    week_index = weeks.index(current_week)
-    prev_week = weeks[week_index + 1] if week_index < len(weeks) - 1 else None
-    next_week = weeks[week_index - 1] if week_index > 0 else None
-
-    nav_buttons = Group(
-        Button(
-            "← Previous Week",
-            hx_get=f"/week/{prev_week}" if prev_week else "#",
-            hx_target="#content",
-            hx_swap="innerHTML",
-            disabled=not prev_week,
-        ),
-        Button(
-            "Next Week →",
-            hx_get=f"/week/{next_week}" if next_week else "#",
-            hx_target="#content",
-            hx_swap="innerHTML",
-            disabled=not next_week,
-        ),
-    )
-
-    articles = week2articles[current_week]
-    article_cards = []
-    for arxiv_id in articles:
-        article, abstract, image = get_article_details(arxiv_id)
-        article_title = article["contents"][0].get("paper_title", "article") if article["contents"] else "article"
-
-        card_content = [H5(A(article_title, href=f"https://arxiv.org/abs/{arxiv_id}", target="_blank"))]
-
-        if image:
-            pil_image = image[0]["image"]
-            img_byte_arr = BytesIO()
-            pil_image.save(img_byte_arr, format="JPEG")
-            img_byte_arr = img_byte_arr.getvalue()
-            image_url = f"data:image/jpeg;base64,{base64.b64encode(img_byte_arr).decode('utf-8')}"
-            card_content.insert(
-                0, Img(src=image_url, alt="Article image", style="max-width: 100%; height: auto; margin-bottom: 15px;")
-            )
-
-        article_cards.append(Card(*card_content, cls="mb-4"))
-
-    grid = Grid(*article_cards, style="display: grid; grid-template-columns: repeat(3, 1fr); gap: 1rem;")
-
-    week_end = current_week + timedelta(days=6)
-    return Div(
-        nav_buttons,
-        H3(f"Week of {current_week.strftime('%B %d')} - {week_end.strftime('%B %d, %Y')} ({len(articles)} articles)"),
-        grid,
-        nav_buttons,
-        id="content",
-    )
+app, rt = fast_app()


 @rt("/")
 def get():
-    return Titled("AnswerAI Zotero Weekly", generate_week_content(weeks[0]))
-
-
-@rt("/week/{date}")
-def get(date: str):
-    try:
-        current_week = datetime.strptime(date, "%Y-%m-%d").date()
-        return generate_week_content(current_week)
-    except Exception as e:
-        return Div(f"Error displaying articles: {str(e)}")
+    info = api.dataset_info(HF_REPO_ID_TXT)
+    text_last_modified = info.last_modified.strftime("%d-%b-%y at %H:%M:%S")
+
+    info = api.dataset_info(HF_REPO_ID_IMG)
+    img_last_modified = info.last_modified.strftime("%d-%b-%y at %H:%M:%S")
+
+    return Titled(
+        "Zotero Refresh Pipeline",
+        Div(
+            H3("Status"),
+            P(f"{HF_REPO_ID_TXT} : {text_last_modified} (last updated)"),
+            P(f"{HF_REPO_ID_IMG}: {img_last_modified} (last updated)"),
+        ),
+    )


 serve()
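Aside: the rewritten app.py reduces the UI to a status page that reports when each dataset repo was last pushed. A minimal standalone sketch of that lookup with `huggingface_hub` (the repo IDs here are placeholders; in the Space they are derived from the logged-in user):

```python
import os

from huggingface_hub import HfApi

# Placeholder repo IDs for illustration only.
HF_REPO_ID_TXT = "your-user/zotero-answer-ai-texts"
HF_REPO_ID_IMG = "your-user/zotero-answer-ai-images"

api = HfApi(token=os.environ.get("HF_TOKEN"))

for repo_id in (HF_REPO_ID_TXT, HF_REPO_ID_IMG):
    # dataset_info() returns repo metadata, including the last commit time.
    info = api.dataset_info(repo_id)
    print(f"{repo_id}: last updated {info.last_modified:%d-%b-%y at %H:%M:%S}")
```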
main.py
CHANGED
@@ -1,5 +1,6 @@
 import os
 import re
+import shutil
 import time

 import dotenv
@@ -9,17 +10,19 @@ import requests
 import schedule
 import srsly
 from bs4 import BeautifulSoup
-from datasets import Dataset, Image, load_dataset
-from huggingface_hub import create_repo, login, whoami
+from datasets import Dataset, Image, concatenate_datasets, load_dataset
+from huggingface_hub import HfApi, create_repo, login, whoami
 from PIL import Image as PILImage
 from retry import retry
 from tqdm.auto import tqdm

 dotenv.load_dotenv()
 login(token=os.environ.get("HF_TOKEN"))
+api = HfApi()

 hf_user = whoami(os.environ.get("HF_TOKEN"))["name"]
-
+HF_REPO_ID_TXT = f"{hf_user}/zotero-answer-ai-texts"
+HF_REPO_ID_IMG = f"{hf_user}/zotero-answer-ai-images"


 ########################################################
@@ -66,7 +69,7 @@ def get_zotero_items(debug=False):
         print(f"# items fetched {len(items)}")

         if debug:
-            if len(items) >
+            if len(items) > 1600:
                 break

     return items
@@ -103,11 +106,18 @@ def get_arxiv_items(items):
         if arxiv_id in visited:
             continue

+        authors = []
+        for author in data.get("creators", []):
+            authors.append(f"{author.get('firstName', '')} {author.get('lastName', '')}")
+
         arxiv_items.append(
             {
                 "arxiv_id": arxiv_id,
                 "arxiv_url": arxiv_url,
+                "title": data.get("title", ""),
+                "authors": authors,
                 "pdf_url": pdf_url,
+                "date_published": data.get("date", ""),
                 "added_by": item["meta"]["createdByUser"]["username"],
                 "date_added": data.get("dateAdded", ""),
             }
@@ -129,10 +139,10 @@ def fetch_arxiv_htmls(arxiv_items):
     for item in tqdm(arxiv_items):
         html = fetch_arxiv_html(item["arxiv_id"])
         if html:
-            item["
+            item["raw_content"] = html
         else:
             print(f"failed to fetch html for {item['arxiv_id']}")
-            item["
+            item["raw_content"] = "Error"

     return arxiv_items

@@ -326,7 +336,7 @@ def download_arxiv_pdf(arxiv_id):
         raise Exception(f"Failed to download PDF. Status code: {response.status_code}")


-def pdf_to_jpegs(pdf_content, output_folder):
+def pdf_to_jpegs(pdf_content, output_folder, max_pages=128):
     # Create output folder if it doesn't exist
     os.makedirs(output_folder, exist_ok=True)

@@ -345,6 +355,9 @@ def pdf_to_jpegs(pdf_content, output_folder):
         pix.save(image_path)
         # print(f"Saved {image_path}")

+        if page_num >= max_pages:
+            break
+
     doc.close()


@@ -392,8 +405,6 @@ def create_hf_image_dataset(base_dir):
             "image": [d["image"] for d in data],
             "arxiv_id": [d["arxiv_id"] for d in data],
             "page_number": [d["page_number"] for d in data],
-            "width": [d["width"] for d in data],
-            "height": [d["height"] for d in data],
         }
     )

@@ -409,9 +420,17 @@ def create_hf_image_dataset(base_dir):


 def upload_to_hf(abstract_df, contents_df, processed_arxiv_ids):
-    repo_id = HF_REPO_ID
+    # repo_id = HF_REPO_ID
+    create_repo(
+        repo_id=HF_REPO_ID_TXT,
+        token=os.environ.get("HF_TOKEN"),
+        private=True,
+        repo_type="dataset",
+        exist_ok=True,
+    )
+
     create_repo(
-        repo_id=
+        repo_id=HF_REPO_ID_IMG,
         token=os.environ.get("HF_TOKEN"),
         private=True,
         repo_type="dataset",
@@ -421,23 +440,44 @@ def upload_to_hf(abstract_df, contents_df, processed_arxiv_ids):
     # upload image dataset
     try:
         img_ds = create_hf_image_dataset("data/arxiv_images")
-
+        try:
+            old_img_ds = load_dataset(HF_REPO_ID_IMG, "images")["train"]
+            img_ds = concatenate_datasets([old_img_ds, img_ds])
+        except Exception as e:
+            print(e)
+        img_ds.push_to_hub(HF_REPO_ID_IMG, "images", token=os.environ.get("HF_TOKEN"))
+    except Exception as e:
+        print(e)
+
+    # upload first pages only
+    try:
+        img_ds = img_ds.filter(lambda x: x["page_number"] == 1)
+        img_ds.push_to_hub(HF_REPO_ID_IMG, "images_first_page", token=os.environ.get("HF_TOKEN"))
+    except Exception as e:
+        print(e)

+    try:
         # push id_to_abstract
         abstract_ds = Dataset.from_pandas(abstract_df)
-        abstract_ds.push_to_hub(
+        abstract_ds.push_to_hub(HF_REPO_ID_TXT, "abstracts", token=os.environ.get("HF_TOKEN"))

         # push arxiv_items
         arxiv_ds = Dataset.from_pandas(contents_df)
-        arxiv_ds.push_to_hub(
+        arxiv_ds.push_to_hub(HF_REPO_ID_TXT, "articles", token=os.environ.get("HF_TOKEN"))

         # push processed_arxiv_ids
         processed_arxiv_ids = [{"arxiv_id": arxiv_id} for arxiv_id in processed_arxiv_ids]
         processed_arxiv_ids_ds = Dataset.from_list(processed_arxiv_ids)
-        processed_arxiv_ids_ds.push_to_hub(
+        processed_arxiv_ids_ds.push_to_hub(HF_REPO_ID_TXT, "processed_arxiv_ids", token=os.environ.get("HF_TOKEN"))
     except Exception as e:
         print(e)

+    # trigger refresh of connected datasets
+    print("==" * 40)
+    print("Triggering refresh of connected datasets")
+    api.restart_space(repo_id="answerdotai/zotero-weekly")
+    print("==" * 40)
+

 ########################################################
 ### MAIN
@@ -445,21 +485,20 @@ def upload_to_hf(abstract_df, contents_df, processed_arxiv_ids):


 def main():
-    items = get_zotero_items(debug=True)
+    # items = get_zotero_items(debug=True)
+    items = get_zotero_items(debug=False)
+
     print(f"# of items fetched from zotero: {len(items)}")
     arxiv_items = get_arxiv_items(items)
     print(f"# of arxiv papers: {len(arxiv_items)}")

     # get already processed arxiv ids from HF
     try:
-        existing_arxiv_ids = load_dataset(
+        existing_arxiv_ids = load_dataset(HF_REPO_ID_TXT, "processed_arxiv_ids")["train"]["arxiv_id"]
     except Exception as e:
         print(e)
-
-
-    except Exception as e:
-        print(e)
-        existing_arxiv_ids = []
+        existing_arxiv_ids = []
+
     existing_arxiv_ids = set(existing_arxiv_ids)
     print(f"# of existing arxiv ids: {len(existing_arxiv_ids)}")

@@ -468,15 +507,27 @@ def main():
     arxiv_items = fetch_arxiv_htmls(arxiv_items)
     print(f"# of new arxiv items: {len(arxiv_items)}")

+    if len(arxiv_items) == 0:
+        print("No new arxiv items to process")
+        return
+
     processed_arxiv_ids = set()
+    pbar = tqdm(range(len(arxiv_items)))
+
+    # remove "data" directory if it exists
+    if os.path.exists("data"):
+        try:
+            shutil.rmtree("data")
+        except Exception as e:
+            print(e)
+
     for item in arxiv_items:
         # download images --
         save_arxiv_article_images(item["arxiv_id"])

         # parse html
         try:
-            item["contents"] = parse_html_content(item["
-            processed_arxiv_ids.add(item["arxiv_id"])
+            item["contents"] = parse_html_content(item["raw_content"])
         except Exception as e:
             print(f"Failed to parse html for {item['arxiv_id']}: {e}")
             item["contents"] = []
@@ -484,12 +535,21 @@ def main():
         if len(item["contents"]) == 0:
             print("Extracting from pdf...")
             md_content = get_pdf_text(item["arxiv_id"]) # fix this
+            item["raw_content"] = md_content
+
             if md_content:
                 item["contents"] = parse_markdown_content(md_content, item["arxiv_id"])
-                processed_arxiv_ids.add(item["arxiv_id"])
             else:
                 item["contents"] = []

+        if len(item["contents"]) > 0:
+            processed_arxiv_ids.add(item["arxiv_id"])
+            if len(item["authors"]) == 0:
+                item["authors"] = [] # ["unknown"]
+            item["title"] = item["contents"][0]["paper_title"]
+        pbar.update(1)
+    pbar.close()
+
     # save contents ---
     processed_arxiv_ids = list(processed_arxiv_ids)
     print(f"# of processed arxiv ids: {len(processed_arxiv_ids)}")
@@ -507,7 +567,7 @@ def main():

     # add to existing dataset
     try:
-        old_abstract_df = load_dataset(
+        old_abstract_df = load_dataset(HF_REPO_ID_TXT, "abstracts")["train"].to_pandas()
     except Exception as e:
         print(e)
         old_abstract_df = pd.DataFrame(columns=abstract_df.columns)
@@ -520,7 +580,7 @@ def main():
     contents_df = pd.DataFrame(arxiv_items)
     print(contents_df.head())
     try:
-        old_contents_df = load_dataset(
+        old_contents_df = load_dataset(HF_REPO_ID_TXT, "articles")["train"].to_pandas()
     except Exception as e:
         print(e)
         old_contents_df = pd.DataFrame(columns=contents_df.columns)
@@ -531,7 +591,7 @@ def main():
     contents_df = contents_df.drop_duplicates(subset=["arxiv_id"], keep="last").reset_index(drop=True)

     # upload to hf
-    processed_arxiv_ids = list(set(processed_arxiv_ids + list(
+    processed_arxiv_ids = list(set(processed_arxiv_ids + list(existing_arxiv_ids)))
     upload_to_hf(abstract_df, contents_df, processed_arxiv_ids)

     # save as local copy
@@ -545,7 +605,7 @@ def schedule_periodic_task():
     """
     Schedule the main task to run at the user-defined frequency
     """
-    main() # run once initially
+    # main() # run once initially

     frequency = "daily" # TODO: env
     if frequency == "hourly":
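The tail of `schedule_periodic_task()` is cut off in this view. For reference, a loop of this shape is commonly written with the `schedule` library as sketched below; the branch bodies and run time are assumptions, not something the diff shows:

```python
import time

import schedule


def schedule_periodic_task():
    """
    Schedule the main task to run at the user-defined frequency
    """
    # main()  # run once initially

    frequency = "daily"  # TODO: env
    if frequency == "hourly":
        schedule.every().hour.do(main)  # assumed branch body; `main` is the pipeline entry point in main.py
    elif frequency == "daily":
        schedule.every().day.at("00:00").do(main)  # assumed branch body

    while True:
        schedule.run_pending()
        time.sleep(1)
```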