| """ | |
| ArXiv Fetcher Component | |
| Fetches and processes research papers from ArXiv | |
| """ | |

import os
import re
from collections import Counter
from datetime import datetime, timedelta
from typing import List, Dict, Optional, Any

import arxiv
import requests


class ArxivFetcher:
    """
    Fetches research papers from ArXiv.
    Provides search, download, and metadata extraction capabilities.
    """

    def __init__(self, config=None):
        # Import Config only when needed to avoid dependency issues
        if config is None:
            try:
                from .config import Config
                self.config = Config()
            except ImportError:
                # Fall back to None if Config cannot be imported
                self.config = None
        else:
            self.config = config
        self.client = arxiv.Client()

    def search_papers(self,
                      query: str,
                      max_results: int = 10,
                      sort_by: str = "relevance",
                      category: Optional[str] = None,
                      date_range: Optional[int] = None) -> List[Dict[str, Any]]:
        """
        Search for papers on ArXiv.

        Args:
            query: Search query
            max_results: Maximum number of results
            sort_by: Sort criterion ('relevance', 'lastUpdatedDate', 'submittedDate')
            category: ArXiv category filter (e.g., 'cs.AI', 'cs.LG')
            date_range: Days back to search (e.g., 7, 30, 365)

        Returns:
            List of paper dictionaries
        """
        try:
            print(f"Searching ArXiv for: '{query}'")

            # Build the search query, restricting to a category if given
            search_query = query
            if category:
                search_query = f"cat:{category} AND {query}"

            # Map the string argument onto the arxiv package's sort criteria
            sort_criteria = {
                "relevance": arxiv.SortCriterion.Relevance,
                "lastUpdatedDate": arxiv.SortCriterion.LastUpdatedDate,
                "submittedDate": arxiv.SortCriterion.SubmittedDate
            }.get(sort_by, arxiv.SortCriterion.Relevance)

            search = arxiv.Search(
                query=search_query,
                max_results=max_results,
                sort_by=sort_criteria,
                sort_order=arxiv.SortOrder.Descending
            )

            # Compute the cutoff once rather than on every iteration
            cutoff_date = None
            if date_range:
                cutoff_date = datetime.now() - timedelta(days=date_range)

            papers = []
            for result in self.client.results(search):
                # Skip papers published before the cutoff date
                if cutoff_date and result.published.replace(tzinfo=None) < cutoff_date:
                    continue
                papers.append(self._extract_paper_info(result))

            print(f"Found {len(papers)} papers")
            return papers
        except Exception as e:
            print(f"Error searching ArXiv: {e}")
            return []
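
    # Usage sketch (assumes the third-party `arxiv` package is installed and
    # the ArXiv API is reachable; the query is illustrative):
    #
    #   fetcher = ArxivFetcher()
    #   papers = fetcher.search_papers("transformer attention",
    #                                  max_results=5,
    #                                  category="cs.LG",
    #                                  date_range=30)
    #   for p in papers:
    #       print(p["title"], "-", p["authors_str"])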

    def get_paper_by_id(self, arxiv_id: str) -> Optional[Dict[str, Any]]:
        """
        Get a specific paper by ArXiv ID.

        Args:
            arxiv_id: ArXiv paper ID (e.g., '2301.12345')

        Returns:
            Paper dictionary or None
        """
        try:
            print(f"Fetching paper: {arxiv_id}")
            search = arxiv.Search(id_list=[arxiv_id])
            results = list(self.client.results(search))
            if results:
                paper = self._extract_paper_info(results[0])
                print(f"Retrieved paper: {paper['title']}")
                return paper
            else:
                print(f"Paper not found: {arxiv_id}")
                return None
        except Exception as e:
            print(f"Error fetching paper {arxiv_id}: {e}")
            return None
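
    # Sketch (the ID below is a placeholder, not a real paper):
    #
    #   paper = fetcher.get_paper_by_id("2301.12345")
    #   if paper:
    #       print(paper["summary"][:200])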

    def search_by_author(self, author: str, max_results: int = 20) -> List[Dict[str, Any]]:
        """
        Search for papers by author.

        Args:
            author: Author name
            max_results: Maximum number of results

        Returns:
            List of paper dictionaries
        """
        # Quote the name so multi-word authors match as a phrase
        query = f'au:"{author}"'
        return self.search_papers(query, max_results=max_results, sort_by="lastUpdatedDate")

    def search_by_category(self, category: str, max_results: int = 20) -> List[Dict[str, Any]]:
        """
        Search for papers by category.

        Args:
            category: ArXiv category (e.g., 'cs.AI', 'cs.LG', 'stat.ML')
            max_results: Maximum number of results

        Returns:
            List of paper dictionaries
        """
        query = f"cat:{category}"
        return self.search_papers(query, max_results=max_results, sort_by="lastUpdatedDate")
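
    # Both convenience wrappers delegate to search_papers; for example
    # (illustrative names and categories):
    #
    #   recent_ml = fetcher.search_by_category("cs.LG", max_results=10)
    #   by_author = fetcher.search_by_author("Yoshua Bengio", max_results=10)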

    def get_trending_papers(self, category: str = "cs.AI", days: int = 7, max_results: int = 10) -> List[Dict[str, Any]]:
        """
        Get recently submitted papers in a category.

        Args:
            category: ArXiv category
            days: Days back to look for papers
            max_results: Maximum number of results

        Returns:
            List of paper dictionaries
        """
        # Pass `days` through as a date filter so the argument is honored
        return self.search_papers(
            f"cat:{category}",
            max_results=max_results,
            sort_by="submittedDate",
            date_range=days
        )
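
    # For instance, papers submitted to cs.AI in the last week:
    #
    #   this_week = fetcher.get_trending_papers("cs.AI", days=7, max_results=5)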

    def _extract_paper_info(self, result) -> Dict[str, Any]:
        """
        Extract paper information from an ArXiv result.

        Args:
            result: ArXiv search result

        Returns:
            Paper dictionary
        """
        try:
            # The entry ID is a URL such as http://arxiv.org/abs/2301.12345v1;
            # the ArXiv ID is its last path segment
            arxiv_id = result.entry_id.split('/')[-1]

            paper = {
                'arxiv_id': arxiv_id,
                'title': result.title.strip(),
                'authors': [author.name for author in result.authors],
                'summary': result.summary.strip(),
                'published': result.published.isoformat(),
                'updated': result.updated.isoformat(),
                'categories': result.categories,
                'primary_category': result.primary_category,
                'pdf_url': result.pdf_url,
                'entry_id': result.entry_id,
                'journal_ref': result.journal_ref,
                'doi': result.doi,
                'comment': result.comment,
                'links': [{'title': link.title, 'href': link.href} for link in result.links],
                'fetched_at': datetime.now().isoformat()
            }

            # Add formatted metadata for display
            paper['authors_str'] = ', '.join(paper['authors'][:3]) + ('...' if len(paper['authors']) > 3 else '')
            paper['categories_str'] = ', '.join(paper['categories'][:3]) + ('...' if len(paper['categories']) > 3 else '')
            paper['year'] = result.published.year
            paper['month'] = result.published.month

            return paper
        except Exception as e:
            print(f"Error extracting paper info: {e}")
            return {
                'arxiv_id': 'unknown',
                'title': 'Error extracting title',
                'authors': [],
                'summary': 'Error extracting summary',
                'error': str(e)
            }

    def download_pdf(self, paper: Dict[str, Any], download_dir: str = "downloads") -> Optional[str]:
        """
        Download the PDF for a paper.

        Args:
            paper: Paper dictionary
            download_dir: Directory to save the PDF in

        Returns:
            Path to the downloaded PDF, or None
        """
        try:
            os.makedirs(download_dir, exist_ok=True)

            pdf_url = paper.get('pdf_url')
            if not pdf_url:
                print(f"No PDF URL for paper: {paper.get('title', 'Unknown')}")
                return None

            arxiv_id = paper.get('arxiv_id', 'unknown')
            filename = f"{arxiv_id}.pdf"
            filepath = os.path.join(download_dir, filename)

            if os.path.exists(filepath):
                print(f"PDF already exists: {filepath}")
                return filepath

            print(f"Downloading PDF: {paper.get('title', 'Unknown')}")
            response = requests.get(pdf_url, timeout=30)
            response.raise_for_status()

            with open(filepath, 'wb') as f:
                f.write(response.content)

            print(f"PDF downloaded: {filepath}")
            return filepath
        except Exception as e:
            print(f"Error downloading PDF: {e}")
            return None
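
    # Sketch (writes into ./downloads by default; the ID is a placeholder):
    #
    #   paper = fetcher.get_paper_by_id("2301.12345")
    #   if paper:
    #       path = fetcher.download_pdf(paper)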

    def get_paper_recommendations(self, paper_id: str, max_results: int = 5) -> List[Dict[str, Any]]:
        """
        Get paper recommendations based on a paper's content.

        Args:
            paper_id: ArXiv ID of the base paper
            max_results: Number of recommendations

        Returns:
            List of recommended papers
        """
        try:
            # Get the base paper
            base_paper = self.get_paper_by_id(paper_id)
            if not base_paper:
                return []

            # Extract key terms from the title and summary
            title = base_paper.get('title', '')
            summary = base_paper.get('summary', '')

            # Simple keyword extraction (could be improved with NLP)
            keywords = self._extract_keywords(title + ' ' + summary)

            # Search for related papers using the top keywords
            query = ' '.join(keywords[:5])
            related_papers = self.search_papers(
                query=query,
                max_results=max_results + 5,  # fetch extra so the base paper can be dropped
                sort_by="relevance"
            )

            # Filter out the base paper itself; compare IDs without the
            # version suffix, since extracted IDs include e.g. 'v1'
            base_id = paper_id.split('v')[0]
            recommendations = [
                p for p in related_papers
                if p.get('arxiv_id', '').split('v')[0] != base_id
            ]
            return recommendations[:max_results]
        except Exception as e:
            print(f"Error getting recommendations: {e}")
            return []
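
    # Sketch (placeholder ID): recommendations are keyword-overlap search
    # results, not a citation graph:
    #
    #   similar = fetcher.get_paper_recommendations("2301.12345", max_results=5)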

    def _extract_keywords(self, text: str) -> List[str]:
        """
        Simple keyword extraction from text.

        Args:
            text: Input text

        Returns:
            List of keywords
        """
        # Simple frequency-based implementation; could be improved with
        # NLP libraries
        stop_words = {
            'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of',
            'with', 'by', 'a', 'an', 'as', 'is', 'was', 'are', 'were', 'be',
            'been', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
            'would', 'could', 'should', 'may', 'might', 'must', 'can',
            'this', 'that', 'these', 'those', 'we', 'us', 'our', 'you',
            'your', 'he', 'him', 'his', 'she', 'her', 'it', 'its', 'they',
            'them', 'their'
        }

        # Extract words of three or more letters
        words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())

        # Drop stop words and count the rest
        filtered_words = [word for word in words if word not in stop_words]
        word_counts = Counter(filtered_words)

        # Return the most common words
        return [word for word, count in word_counts.most_common(20)]
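
    # For example, _extract_keywords("deep learning for deep networks")
    # returns ['deep', 'learning', 'networks'], ordered by frequency.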

    def get_categories(self) -> Dict[str, str]:
        """
        Get available ArXiv categories.

        Returns:
            Dictionary of category codes and descriptions
        """
        return {
            'cs.AI': 'Artificial Intelligence',
            'cs.LG': 'Machine Learning',
            'cs.CV': 'Computer Vision and Pattern Recognition',
            'cs.CL': 'Computation and Language',
            'cs.NE': 'Neural and Evolutionary Computing',
            'cs.RO': 'Robotics',
            'cs.CR': 'Cryptography and Security',
            'cs.DC': 'Distributed, Parallel, and Cluster Computing',
            'cs.DB': 'Databases',
            'cs.DS': 'Data Structures and Algorithms',
            'cs.HC': 'Human-Computer Interaction',
            'cs.IR': 'Information Retrieval',
            'cs.IT': 'Information Theory',
            'cs.MM': 'Multimedia',
            'cs.NI': 'Networking and Internet Architecture',
            'cs.OS': 'Operating Systems',
            'cs.PL': 'Programming Languages',
            'cs.SE': 'Software Engineering',
            'cs.SY': 'Systems and Control',
            'stat.ML': 'Machine Learning (Statistics)',
            'stat.AP': 'Applications (Statistics)',
            'stat.CO': 'Computation (Statistics)',
            'stat.ME': 'Methodology (Statistics)',
            'stat.TH': 'Statistics Theory',
            'math.ST': 'Statistics Theory (Mathematics)',
            'math.PR': 'Probability (Mathematics)',
            'math.OC': 'Optimization and Control',
            'math.NA': 'Numerical Analysis',
            'eess.AS': 'Audio and Speech Processing',
            'eess.IV': 'Image and Video Processing',
            'eess.SP': 'Signal Processing',
            'eess.SY': 'Systems and Control',
            'q-bio.QM': 'Quantitative Methods',
            'q-bio.NC': 'Neurons and Cognition',
            'physics.data-an': 'Data Analysis, Statistics and Probability'
        }
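

# Minimal self-test sketch: assumes network access to the ArXiv API and that
# the third-party dependencies (`arxiv`, `requests`) are installed. The query
# below is illustrative, not part of the original component.
if __name__ == "__main__":
    fetcher = ArxivFetcher()

    # Search recent NLP papers and print a one-line summary for each
    results = fetcher.search_papers("large language models",
                                    max_results=3,
                                    category="cs.CL",
                                    date_range=90)
    for paper in results:
        print(f"{paper['arxiv_id']}: {paper['title']} ({paper['authors_str']})")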