|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from typing import Dict, Generator, List, Optional |
|
|
|
|
|
from camel.logger import get_logger |
|
|
from camel.toolkits.base import BaseToolkit |
|
|
from camel.toolkits.function_tool import FunctionTool |
|
|
from camel.utils import dependencies_required |
|
|
|
|
|
logger = get_logger(__name__) |
|
|
|
|
|
|
|
|
class ArxivToolkit(BaseToolkit):
    r"""A toolkit for interacting with the arXiv API to search and download
    academic papers.
    """

    @dependencies_required('arxiv')
    def __init__(self) -> None:
        r"""Initializes the ArxivToolkit and sets up the arXiv client."""
        # Imported lazily so the module can be imported even when the
        # optional `arxiv` package is not installed.
        import arxiv

        self.client = arxiv.Client()

    def _get_search_results(
        self,
        query: str,
        paper_ids: Optional[List[str]] = None,
        max_results: Optional[int] = 5,
    ) -> Generator:
        r"""Retrieves search results from the arXiv API based on the provided
        query and optional paper IDs.

        Args:
            query (str): The search query string used to search for papers on
                arXiv.
            paper_ids (List[str], optional): A list of specific arXiv paper
                IDs to search for. (default: :obj: `None`)
            max_results (int, optional): The maximum number of search results
                to retrieve. (default: :obj: `5`)

        Returns:
            Generator: A generator that yields results from the arXiv search
                query, which includes metadata about each paper matching the
                query.
        """
        import arxiv

        # `None` is normalized to an empty list before being handed to
        # `arxiv.Search` as `id_list`.
        paper_ids = paper_ids or []
        search_query = arxiv.Search(
            query=query,
            id_list=paper_ids,
            max_results=max_results,
        )
        return self.client.results(search_query)

    def search_papers(
        self,
        query: str,
        paper_ids: Optional[List[str]] = None,
        max_results: Optional[int] = 5,
    ) -> List[Dict[str, str]]:
        r"""Searches for academic papers on arXiv using a query string and
        optional paper IDs.

        Args:
            query (str): The search query string.
            paper_ids (List[str], optional): A list of specific arXiv paper
                IDs to search for. (default: :obj: `None`)
            max_results (int, optional): The maximum number of search results
                to return. (default: :obj: `5`)

        Returns:
            List[Dict[str, str]]: A list of dictionaries, one per paper, with
                the keys `title`, `published_date`, `authors`, `entry_id`,
                `summary`, `pdf_url` and `paper_text`. `authors` is a list
                of author names; `paper_text` is the text extracted from the
                PDF, or an empty string when extraction failed.
                `published_date` is currently derived from the paper's
                `updated` timestamp.
        """
        from arxiv2text import arxiv_to_text

        search_results = self._get_search_results(
            query, paper_ids, max_results
        )
        papers_data = []

        for paper in search_results:
            paper_info = {
                "title": paper.title,
                # NOTE(review): this reads `paper.updated`, not
                # `paper.published`, although the key says "published".
                # Kept unchanged for backward compatibility - confirm intent.
                "published_date": paper.updated.date().isoformat(),
                "authors": [author.name for author in paper.authors],
                "entry_id": paper.entry_id,
                "summary": paper.summary,
                "pdf_url": paper.pdf_url,
            }

            # Text extraction is best-effort: a failure for one paper is
            # logged and must not abort the whole search.
            try:
                text = arxiv_to_text(paper_info["pdf_url"])
            except Exception as e:
                logger.error(
                    "Failed to extract text content from the PDF at "
                    "the specified URL. "
                    f"URL: {paper_info.get('pdf_url', 'Unknown')} | Error: {e}"
                )
                text = ""

            paper_info['paper_text'] = text

            papers_data.append(paper_info)

        return papers_data

    @staticmethod
    def _safe_pdf_filename(title: str) -> str:
        r"""Builds a file-system-safe PDF file name from a paper title.

        Args:
            title (str): The paper title to turn into a file name.

        Returns:
            str: The title with whitespace runs (including line breaks)
                collapsed to single spaces, path separators and other
                reserved or control characters replaced by `_`, and a
                `.pdf` suffix appended. Falls back to `paper.pdf` when
                nothing usable remains.
        """
        import re

        # Collapse any run of whitespace, including line breaks inside the
        # title, into a single space.
        cleaned = " ".join(title.split())
        # Replace path separators, characters reserved on common file
        # systems, and control characters.
        cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", cleaned)
        # Leading/trailing dots and spaces are problematic on some
        # platforms (hidden files, names silently trimmed).
        cleaned = cleaned.strip(" .")
        return f"{cleaned or 'paper'}.pdf"

    def download_papers(
        self,
        query: str,
        paper_ids: Optional[List[str]] = None,
        max_results: Optional[int] = 5,
        output_dir: Optional[str] = "./",
    ) -> str:
        r"""Downloads PDFs of academic papers from arXiv based on the provided
        query.

        Args:
            query (str): The search query string.
            paper_ids (List[str], optional): A list of specific arXiv paper
                IDs to download. (default: :obj: `None`)
            max_results (int, optional): The maximum number of search results
                to download. (default: :obj: `5`)
            output_dir (str, optional): The directory to save the downloaded
                PDFs. It is created if it does not exist; :obj:`None` is
                treated as the current directory. (default: :obj: `"./"`)

        Returns:
            str: Status message indicating success or failure. Errors are
                reported in the returned message rather than raised.
        """
        import os

        try:
            # `output_dir` is annotated Optional, so tolerate `None`.
            target_dir = output_dir or "./"
            # Make sure the destination exists before any download starts.
            os.makedirs(target_dir, exist_ok=True)

            search_results = self._get_search_results(
                query, paper_ids, max_results
            )

            for paper in search_results:
                # Raw titles may contain `/`, `:` or line breaks, which
                # would yield an invalid or unintended path.
                paper.download_pdf(
                    dirpath=target_dir,
                    filename=self._safe_pdf_filename(paper.title),
                )
            return "papers downloaded successfully"
        except Exception as e:
            # Failures are reported through the returned status message.
            return f"An error occurred: {e}"

    def get_tools(self) -> List[FunctionTool]:
        r"""Returns a list of FunctionTool objects representing the
        functions in the toolkit.

        Returns:
            List[FunctionTool]: A list of FunctionTool objects
                representing the functions in the toolkit.
        """
        return [
            FunctionTool(self.search_papers),
            FunctionTool(self.download_papers),
        ]
|
|
|