| """ | |
| Unified Research Paper Fetcher | |
| Fetches papers from multiple sources: ArXiv, Semantic Scholar, Crossref, and PubMed | |
| Replaces all previous fetcher components for maximum minimalism | |
| """ | |
| import re | |
| import time | |
| import requests | |
| import xml.etree.ElementTree as ET | |
| from typing import List, Dict, Optional, Any, Union | |
| from datetime import datetime, timedelta | |
| import arxiv | |
| import json | |
| from collections import Counter | |


class UnifiedPaperFetcher:
    """
    Unified fetcher for research papers from multiple academic databases.

    Supports: ArXiv, Semantic Scholar, Crossref, PubMed
    """

    def __init__(self, config=None):
        # Import Config only when needed to avoid dependency issues
        if config is None:
            try:
                from .config import Config
                self.config = Config()
            except ImportError:
                self.config = None
        else:
            self.config = config

        # Initialize clients
        self.arxiv_client = arxiv.Client()

        # API endpoints
        self.semantic_scholar_base = "https://api.semanticscholar.org/graph/v1"
        self.crossref_base = "https://api.crossref.org/works"
        self.pubmed_base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"

        # Rate limiting
        self.last_request_time = {}
        self.min_request_interval = {
            'semantic_scholar': 5.0,  # 5 seconds between requests
            'crossref': 0.1,          # 100 ms between requests (polite)
            'pubmed': 0.34,           # ~3 requests per second
            'arxiv': 3.0              # 3 seconds between requests
        }

    def search_papers(self,
                      query: str,
                      max_results: int = 10,
                      sources: List[str] = None,
                      sort_by: str = "relevance") -> List[Dict[str, Any]]:
        """
        Search for papers across multiple sources.

        Args:
            query: Search query
            max_results: Maximum number of results per source
            sources: List of sources ['arxiv', 'semantic_scholar', 'crossref', 'pubmed']
            sort_by: Sort criteria

        Returns:
            List of paper dictionaries with unified format
        """
        if sources is None:
            sources = ['arxiv', 'semantic_scholar', 'crossref', 'pubmed']

        all_papers = []
        results_per_source = max(1, max_results // len(sources))

        print(f"Searching for: '{query}' across sources: {sources}")

        for source in sources:
            try:
                print(f"Searching {source}...")
                if source == 'arxiv':
                    papers = self._search_arxiv(query, results_per_source)
                elif source == 'semantic_scholar':
                    papers = self._search_semantic_scholar(query, results_per_source)
                elif source == 'crossref':
                    papers = self._search_crossref(query, results_per_source)
                elif source == 'pubmed':
                    papers = self._search_pubmed(query, results_per_source)
                else:
                    print(f"Unknown source: {source}")
                    continue

                print(f"Found {len(papers)} papers from {source}")
                all_papers.extend(papers)
            except Exception as e:
                print(f"Error searching {source}: {e}")
                continue

        # Remove duplicates and sort
        unique_papers = self._deduplicate_papers(all_papers)

        # Sort by relevance/date
        if sort_by == "date":
            unique_papers.sort(key=lambda x: x.get('published_date', ''), reverse=True)

        print(f"Total unique papers found: {len(unique_papers)}")
        return unique_papers[:max_results]

    def _search_arxiv(self, query: str, max_results: int) -> List[Dict[str, Any]]:
        """Search ArXiv"""
        self._rate_limit('arxiv')
        try:
            search = arxiv.Search(
                query=query,
                max_results=max_results,
                sort_by=arxiv.SortCriterion.Relevance,
                sort_order=arxiv.SortOrder.Descending
            )

            papers = []
            for result in self.arxiv_client.results(search):
                paper = {
                    'title': result.title,
                    'authors': [author.name for author in result.authors],
                    'abstract': result.summary,
                    'published_date': result.published.strftime('%Y-%m-%d'),
                    'year': result.published.year,
                    'url': result.entry_id,
                    'pdf_url': result.pdf_url,
                    'source': 'ArXiv',
                    'arxiv_id': result.entry_id.split('/')[-1],
                    'categories': list(result.categories),
                    'doi': result.doi
                }
                papers.append(paper)

            return papers
        except Exception as e:
            print(f"ArXiv search error: {e}")
            return []

    def _search_semantic_scholar(self, query: str, max_results: int) -> List[Dict[str, Any]]:
        """Search Semantic Scholar"""
        self._rate_limit('semantic_scholar')
        try:
            url = f"{self.semantic_scholar_base}/paper/search"
            params = {
                'query': query,
                'limit': min(max_results, 100),
                'fields': 'title,authors,abstract,year,url,venue,citationCount,referenceCount,publicationDate,externalIds'
            }

            # Retry logic for rate limiting (linear backoff: 5 s, then 10 s between attempts)
            max_retries = 3
            data = None
            for attempt in range(max_retries):
                data = self.safe_get(url, params)
                if data and 'data' in data:
                    break
                elif attempt < max_retries - 1:
                    wait_time = (attempt + 1) * 5
                    print(f"Semantic Scholar rate limited, waiting {wait_time} seconds...")
                    time.sleep(wait_time)
                else:
                    print("Semantic Scholar API unavailable after retries")
                    return []

            if not data or 'data' not in data:
                return []

            papers = []
            for paper_data in data.get('data', []):
                # Handle authors
                authors = []
                if paper_data.get('authors'):
                    authors = [author.get('name', 'Unknown') for author in paper_data['authors']]

                # Handle external IDs (may be null in the API response)
                external_ids = paper_data.get('externalIds') or {}
                doi = external_ids.get('DOI')
                arxiv_id = external_ids.get('ArXiv')

                paper = {
                    'title': paper_data.get('title', 'No title'),
                    'authors': authors,
                    'abstract': paper_data.get('abstract') or '',
                    'published_date': paper_data.get('publicationDate') or '',
                    'year': paper_data.get('year'),
                    'url': paper_data.get('url', ''),
                    'source': 'Semantic Scholar',
                    'venue': paper_data.get('venue', ''),
                    'citation_count': paper_data.get('citationCount', 0),
                    'reference_count': paper_data.get('referenceCount', 0),
                    'doi': doi,
                    'arxiv_id': arxiv_id
                }
                papers.append(paper)

            return papers
        except Exception as e:
            print(f"Semantic Scholar search error: {e}")
            return []

    def _search_crossref(self, query: str, max_results: int) -> List[Dict[str, Any]]:
        """Search Crossref"""
        self._rate_limit('crossref')
        try:
            url = self.crossref_base
            params = {
                'query': query,
                'rows': min(max_results, 20),
                'sort': 'relevance',
                'select': 'title,author,abstract,published-print,published-online,URL,DOI,container-title,type'
            }
            headers = {
                'User-Agent': 'ResearchMate/2.0 (mailto:research@example.com)'
            }

            response = requests.get(url, params=params, headers=headers, timeout=30)
            response.raise_for_status()
            data = response.json()

            papers = []
            for item in data.get('message', {}).get('items', []):
                # Handle authors
                authors = []
                if item.get('author'):
                    for author in item['author']:
                        given = author.get('given', '')
                        family = author.get('family', '')
                        name = f"{given} {family}".strip()
                        if name:
                            authors.append(name)

                # Handle publication date (prefer print date, fall back to online date)
                published_date = ''
                year = None
                date_info = item.get('published-print') or item.get('published-online')
                if date_info:
                    date_parts = date_info.get('date-parts', [[]])[0]
                    if date_parts:
                        year = date_parts[0]
                        if len(date_parts) >= 3:
                            published_date = f"{date_parts[0]:04d}-{date_parts[1]:02d}-{date_parts[2]:02d}"
                        elif len(date_parts) >= 2:
                            published_date = f"{date_parts[0]:04d}-{date_parts[1]:02d}-01"
                        else:
                            published_date = f"{date_parts[0]:04d}-01-01"

                paper = {
                    'title': item.get('title', ['No title'])[0] if item.get('title') else 'No title',
                    'authors': authors,
                    'abstract': item.get('abstract', ''),
                    'published_date': published_date,
                    'year': year,
                    'url': item.get('URL', ''),
                    'source': 'Crossref',
                    'doi': item.get('DOI', ''),
                    'journal': item.get('container-title', [''])[0] if item.get('container-title') else '',
                    'type': item.get('type', '')
                }
                papers.append(paper)

            return papers
        except Exception as e:
            print(f"Crossref search error: {e}")
            return []

    def _search_pubmed(self, query: str, max_results: int) -> List[Dict[str, Any]]:
        """Search PubMed"""
        self._rate_limit('pubmed')
        try:
            # Step 1: Search for PMIDs
            search_url = f"{self.pubmed_base}/esearch.fcgi"
            search_params = {
                'db': 'pubmed',
                'term': query,
                'retmax': min(max_results, 20),
                'retmode': 'json',
                'sort': 'relevance'
            }

            response = requests.get(search_url, params=search_params, timeout=30)
            response.raise_for_status()
            search_data = response.json()

            pmids = search_data.get('esearchresult', {}).get('idlist', [])
            if not pmids:
                return []

            # Step 2: Fetch details for PMIDs
            self._rate_limit('pubmed')
            fetch_url = f"{self.pubmed_base}/efetch.fcgi"
            fetch_params = {
                'db': 'pubmed',
                'id': ','.join(pmids),
                'retmode': 'xml'
            }

            response = requests.get(fetch_url, params=fetch_params, timeout=30)
            response.raise_for_status()

            # Parse XML
            root = ET.fromstring(response.content)
            papers = []

            for article in root.findall('.//PubmedArticle'):
                try:
                    # Extract basic info
                    medline = article.find('.//MedlineCitation')
                    if medline is None:
                        continue

                    article_elem = medline.find('.//Article')
                    if article_elem is None:
                        continue

                    # Title
                    title_elem = article_elem.find('.//ArticleTitle')
                    title = title_elem.text if title_elem is not None else 'No title'

                    # Authors
                    authors = []
                    author_list = article_elem.find('.//AuthorList')
                    if author_list is not None:
                        for author in author_list.findall('.//Author'):
                            last_name = author.find('.//LastName')
                            first_name = author.find('.//ForeName')
                            if last_name is not None and first_name is not None:
                                authors.append(f"{first_name.text} {last_name.text}")
                            elif last_name is not None:
                                authors.append(last_name.text)

                    # Abstract
                    abstract = ''
                    abstract_elem = article_elem.find('.//AbstractText')
                    if abstract_elem is not None:
                        abstract = abstract_elem.text or ''

                    # Publication date
                    pub_date = article_elem.find('.//PubDate')
                    published_date = ''
                    year = None
                    if pub_date is not None:
                        year_elem = pub_date.find('.//Year')
                        month_elem = pub_date.find('.//Month')
                        day_elem = pub_date.find('.//Day')

                        if year_elem is not None:
                            year = int(year_elem.text)
                            month = month_elem.text if month_elem is not None else '01'
                            day = day_elem.text if day_elem is not None else '01'

                            # Convert month name to number if needed
                            month_map = {
                                'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
                                'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
                                'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'
                            }
                            if month in month_map:
                                month = month_map[month]
                            elif not month.isdigit():
                                month = '01'

                            published_date = f"{year}-{month.zfill(2)}-{day.zfill(2)}"

                    # PMID
                    pmid_elem = medline.find('.//PMID')
                    pmid = pmid_elem.text if pmid_elem is not None else ''

                    # Journal
                    journal_elem = article_elem.find('.//Journal/Title')
                    journal = journal_elem.text if journal_elem is not None else ''

                    # DOI
                    doi = ''
                    article_ids = article.findall('.//ArticleId')
                    for article_id in article_ids:
                        if article_id.get('IdType') == 'doi':
                            doi = article_id.text
                            break

                    paper = {
                        'title': title,
                        'authors': authors,
                        'abstract': abstract,
                        'published_date': published_date,
                        'year': year,
                        'url': f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
                        'source': 'PubMed',
                        'pmid': pmid,
                        'journal': journal,
                        'doi': doi
                    }
                    papers.append(paper)
                except Exception as e:
                    print(f"Error parsing PubMed article: {e}")
                    continue

            return papers
        except Exception as e:
            print(f"PubMed search error: {e}")
            return []

    def _rate_limit(self, source: str):
        """Implement rate limiting for API calls"""
        now = time.time()
        last_request = self.last_request_time.get(source, 0)
        interval = self.min_request_interval.get(source, 1.0)

        time_since_last = now - last_request
        if time_since_last < interval:
            sleep_time = interval - time_since_last
            time.sleep(sleep_time)

        self.last_request_time[source] = time.time()

    def safe_get(self, url: str, params: dict = None, headers: dict = None, timeout: int = 30) -> Optional[Dict[str, Any]]:
        """Safe HTTP GET with error handling"""
        try:
            response = requests.get(url, params=params, headers=headers, timeout=timeout)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"HTTP request failed: {e}")
            return None
        except json.JSONDecodeError as e:
            print(f"JSON decode error: {e}")
            return None

    def _deduplicate_papers(self, papers: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Remove duplicate papers based on DOI, ArXiv ID, PMID, or title"""
        seen = set()
        unique_papers = []

        for paper in papers:
            # Create identifiers based on available fields
            identifiers = []

            # Use DOI if available
            doi = str(paper.get('doi') or '').strip()
            if doi:
                identifiers.append(f"doi:{doi.lower()}")

            # Use ArXiv ID if available
            arxiv_id = str(paper.get('arxiv_id') or '').strip()
            if arxiv_id:
                identifiers.append(f"arxiv:{arxiv_id.lower()}")

            # Use PMID if available
            pmid = str(paper.get('pmid') or '').strip()
            if pmid:
                identifiers.append(f"pmid:{pmid}")

            # Use title as fallback
            title = str(paper.get('title') or '').strip().lower()
            if title and title != 'no title':
                # Clean title for comparison
                clean_title = re.sub(r'[^\w\s]', '', title)
                clean_title = ' '.join(clean_title.split())
                identifiers.append(f"title:{clean_title}")

            # Skip the paper if any identifier has already been seen
            if any(identifier in seen for identifier in identifiers):
                continue

            # Otherwise record all identifiers and keep the paper
            seen.update(identifiers)
            unique_papers.append(paper)

        return unique_papers

    def get_paper_by_doi(self, doi: str) -> Optional[Dict[str, Any]]:
        """Get paper details by DOI from Crossref"""
        self._rate_limit('crossref')
        try:
            url = f"{self.crossref_base}/{doi}"
            headers = {
                'User-Agent': 'ResearchMate/2.0 (mailto:research@example.com)'
            }

            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
            data = response.json()

            item = data.get('message', {})
            if not item:
                return None

            # Parse the item (similar to _search_crossref)
            authors = []
            if item.get('author'):
                for author in item['author']:
                    given = author.get('given', '')
                    family = author.get('family', '')
                    name = f"{given} {family}".strip()
                    if name:
                        authors.append(name)

            # Handle publication date (prefer print date, fall back to online date)
            published_date = ''
            year = None
            date_info = item.get('published-print') or item.get('published-online')
            if date_info:
                date_parts = date_info.get('date-parts', [[]])[0]
                if date_parts:
                    year = date_parts[0]
                    if len(date_parts) >= 3:
                        published_date = f"{date_parts[0]:04d}-{date_parts[1]:02d}-{date_parts[2]:02d}"

            paper = {
                'title': item.get('title', ['No title'])[0] if item.get('title') else 'No title',
                'authors': authors,
                'abstract': item.get('abstract', ''),
                'published_date': published_date,
                'year': year,
                'url': item.get('URL', ''),
                'source': 'Crossref',
                'doi': item.get('DOI', ''),
                'journal': item.get('container-title', [''])[0] if item.get('container-title') else ''
            }
            return paper
        except Exception as e:
            print(f"Error fetching DOI {doi}: {e}")
            return None


class PaperFetcher(UnifiedPaperFetcher):
    """
    Consolidated paper fetcher combining all sources.

    This is the single fetcher class that replaces all previous fetcher components.
    """

    def __init__(self, config=None):
        super().__init__(config)

    def search_papers(self,
                      query: str,
                      max_results: int = 10,
                      sources: List[str] = None,
                      sort_by: str = "relevance",
                      category: str = None,
                      date_range: int = None) -> List[Dict[str, Any]]:
        """
        Enhanced search with additional parameters from the original ArxivFetcher.

        Args:
            query: Search query
            max_results: Maximum number of results
            sources: List of sources ['arxiv', 'semantic_scholar', 'crossref', 'pubmed']
            sort_by: Sort criteria ('relevance' or 'date')
            category: ArXiv category filter (e.g., 'cs.AI', 'cs.LG')
            date_range: Days back to search (e.g., 7, 30, 365)

        Returns:
            List of paper dictionaries with unified format
        """
        # Use all sources by default
        if sources is None:
            sources = ['arxiv', 'semantic_scholar', 'crossref', 'pubmed']

        # Apply category filter to the ArXiv query if specified
        if category and 'arxiv' in sources:
            enhanced_query = f"cat:{category} AND {query}" if query else f"cat:{category}"
            return self._search_with_enhanced_query(enhanced_query, max_results, sources, sort_by, date_range)

        # No category filter; still honour date_range if given
        return self._search_with_enhanced_query(query, max_results, sources, sort_by, date_range)

    def _search_with_enhanced_query(self, query: str, max_results: int, sources: List[str], sort_by: str, date_range: int) -> List[Dict[str, Any]]:
        """Internal method for enhanced search with date filtering"""
        papers = super().search_papers(query, max_results, sources, sort_by)

        # Apply date filtering if specified
        if date_range:
            cutoff_date = datetime.now() - timedelta(days=date_range)
            filtered_papers = []
            for paper in papers:
                pub_date_str = paper.get('published_date', '')
                if pub_date_str:
                    try:
                        pub_date = datetime.strptime(pub_date_str, '%Y-%m-%d')
                        if pub_date >= cutoff_date:
                            filtered_papers.append(paper)
                    except ValueError:
                        # If date parsing fails, include the paper
                        filtered_papers.append(paper)
                else:
                    # If no date, include the paper
                    filtered_papers.append(paper)
            return filtered_papers

        return papers

    def get_paper_by_id(self, paper_id: str) -> Optional[Dict[str, Any]]:
        """
        Get a specific paper by ID (supports ArXiv ID, DOI, PMID).

        Args:
            paper_id: Paper ID (ArXiv ID, DOI, or PMID)

        Returns:
            Paper dictionary or None
        """
        # Check if it's an ArXiv ID
        if re.match(r'^\d{4}\.\d{4,5}(v\d+)?$', paper_id):
            return self._get_arxiv_paper_by_id(paper_id)

        # Check if it's a DOI
        if '/' in paper_id and ('10.' in paper_id or paper_id.startswith('doi:')):
            doi = paper_id.replace('doi:', '')
            return self.get_paper_by_doi(doi)

        # Check if it's a PMID
        if paper_id.isdigit():
            return self._get_pubmed_paper_by_id(paper_id)

        # Fallback: search for it
        results = self.search_papers(paper_id, max_results=1)
        return results[0] if results else None

    def _get_arxiv_paper_by_id(self, arxiv_id: str) -> Optional[Dict[str, Any]]:
        """Get paper by ArXiv ID"""
        self._rate_limit('arxiv')
        try:
            search = arxiv.Search(id_list=[arxiv_id])
            results = list(self.arxiv_client.results(search))

            if results:
                result = results[0]
                return {
                    'title': result.title,
                    'authors': [author.name for author in result.authors],
                    'abstract': result.summary,
                    'published_date': result.published.strftime('%Y-%m-%d'),
                    'year': result.published.year,
                    'url': result.entry_id,
                    'pdf_url': result.pdf_url,
                    'source': 'ArXiv',
                    'arxiv_id': result.entry_id.split('/')[-1],
                    'categories': list(result.categories),
                    'doi': result.doi
                }
            return None
        except Exception as e:
            print(f"Error fetching ArXiv paper {arxiv_id}: {e}")
            return None

    def _get_pubmed_paper_by_id(self, pmid: str) -> Optional[Dict[str, Any]]:
        """Get paper by PubMed ID"""
        self._rate_limit('pubmed')
        try:
            fetch_url = f"{self.pubmed_base}/efetch.fcgi"
            fetch_params = {
                'db': 'pubmed',
                'id': pmid,
                'retmode': 'xml'
            }

            response = requests.get(fetch_url, params=fetch_params, timeout=30)
            response.raise_for_status()

            root = ET.fromstring(response.content)
            article = root.find('.//PubmedArticle')

            if article is not None:
                # Parse similar to _search_pubmed
                medline = article.find('.//MedlineCitation')
                article_elem = medline.find('.//Article')

                title_elem = article_elem.find('.//ArticleTitle')
                title = title_elem.text if title_elem is not None else 'No title'

                authors = []
                author_list = article_elem.find('.//AuthorList')
                if author_list is not None:
                    for author in author_list.findall('.//Author'):
                        last_name = author.find('.//LastName')
                        first_name = author.find('.//ForeName')
                        if last_name is not None and first_name is not None:
                            authors.append(f"{first_name.text} {last_name.text}")

                abstract = ''
                abstract_elem = article_elem.find('.//AbstractText')
                if abstract_elem is not None:
                    abstract = abstract_elem.text or ''

                return {
                    'title': title,
                    'authors': authors,
                    'abstract': abstract,
                    'url': f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
                    'source': 'PubMed',
                    'pmid': pmid
                }
            return None
        except Exception as e:
            print(f"Error fetching PubMed paper {pmid}: {e}")
            return None

    def search_by_author(self, author: str, max_results: int = 20) -> List[Dict[str, Any]]:
        """
        Search for papers by author across all sources.

        Args:
            author: Author name
            max_results: Maximum number of results

        Returns:
            List of paper dictionaries
        """
        return self.search_papers(f"author:{author}", max_results=max_results, sort_by="date")

    def search_by_category(self, category: str, max_results: int = 20) -> List[Dict[str, Any]]:
        """
        Search for papers by category (primarily ArXiv).

        Args:
            category: Category (e.g., 'cs.AI', 'cs.LG', 'stat.ML')
            max_results: Maximum number of results

        Returns:
            List of paper dictionaries
        """
        return self.search_papers("", max_results=max_results, category=category, sort_by="date")

    def get_trending_papers(self, category: str = "cs.AI", days: int = 7, max_results: int = 10) -> List[Dict[str, Any]]:
        """
        Get trending papers in a category.

        Args:
            category: Category to search
            days: Days back to look for papers
            max_results: Maximum number of results

        Returns:
            List of paper dictionaries
        """
        return self.search_papers(
            query="recent",
            max_results=max_results,
            category=category,
            date_range=days,
            sort_by="date"
        )

    def download_pdf(self, paper: Dict[str, Any], download_dir: str = "downloads") -> Optional[str]:
        """
        Download PDF for a paper.

        Args:
            paper: Paper dictionary
            download_dir: Directory to save PDF

        Returns:
            Path to downloaded PDF or None
        """
        try:
            import os
            os.makedirs(download_dir, exist_ok=True)

            pdf_url = paper.get('pdf_url')
            if not pdf_url:
                print(f"No PDF URL for paper: {paper.get('title', 'Unknown')}")
                return None

            # Generate a filesystem-safe filename from the best available identifier
            # (DOIs contain '/', which would otherwise break the path)
            paper_id = paper.get('arxiv_id') or paper.get('pmid') or paper.get('doi') or 'unknown'
            safe_id = re.sub(r'[^\w.-]', '_', str(paper_id))
            filename = f"{safe_id}.pdf"
            filepath = os.path.join(download_dir, filename)

            if os.path.exists(filepath):
                print(f"PDF already exists: {filepath}")
                return filepath

            print(f"Downloading PDF: {paper.get('title', 'Unknown')}")
            response = requests.get(pdf_url, timeout=30)
            response.raise_for_status()

            with open(filepath, 'wb') as f:
                f.write(response.content)

            print(f"PDF downloaded: {filepath}")
            return filepath
        except Exception as e:
            print(f"Error downloading PDF: {e}")
            return None

    def get_paper_recommendations(self, paper_id: str, max_results: int = 5) -> List[Dict[str, Any]]:
        """
        Get paper recommendations based on a paper's content.

        Args:
            paper_id: Paper ID
            max_results: Number of recommendations

        Returns:
            List of recommended papers
        """
        try:
            # Get the base paper
            base_paper = self.get_paper_by_id(paper_id)
            if not base_paper:
                return []

            # Extract key terms from title and abstract
            title = base_paper.get('title', '')
            abstract = base_paper.get('abstract', '')

            # Simple keyword extraction
            keywords = self._extract_keywords(title + ' ' + abstract)

            # Search for related papers
            query = ' '.join(keywords[:5])  # Use top 5 keywords
            related_papers = self.search_papers(
                query=query,
                max_results=max_results + 5,  # Get more to filter out the original
                sort_by="relevance"
            )

            # Filter out the original paper
            recommendations = [p for p in related_papers
                               if p.get('arxiv_id') != paper_id and p.get('pmid') != paper_id]

            return recommendations[:max_results]
        except Exception as e:
            print(f"Error getting recommendations: {e}")
            return []

    def _extract_keywords(self, text: str) -> List[str]:
        """
        Simple keyword extraction from text.

        Args:
            text: Input text

        Returns:
            List of keywords
        """
        # Simple implementation - can be improved with NLP libraries
        stop_words = {
            'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
            'a', 'an', 'as', 'is', 'was', 'are', 'were', 'be', 'been', 'have', 'has', 'had',
            'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must',
            'can', 'this', 'that', 'these', 'those', 'we', 'us', 'our', 'you', 'your',
            'he', 'him', 'his', 'she', 'her', 'it', 'its', 'they', 'them', 'their'
        }

        # Extract words
        words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())

        # Filter and count
        filtered_words = [word for word in words if word not in stop_words]
        word_counts = Counter(filtered_words)

        # Return most common words
        return [word for word, count in word_counts.most_common(20)]

    def get_categories(self) -> Dict[str, str]:
        """
        Get available categories (primarily ArXiv).

        Returns:
            Dictionary of category codes and descriptions
        """
        return {
            'cs.AI': 'Artificial Intelligence',
            'cs.LG': 'Machine Learning',
            'cs.CV': 'Computer Vision',
            'cs.CL': 'Computation and Language',
            'cs.NE': 'Neural and Evolutionary Computing',
            'cs.RO': 'Robotics',
            'cs.CR': 'Cryptography and Security',
            'cs.DC': 'Distributed, Parallel, and Cluster Computing',
            'cs.DB': 'Databases',
            'cs.DS': 'Data Structures and Algorithms',
            'cs.HC': 'Human-Computer Interaction',
            'cs.IR': 'Information Retrieval',
            'cs.IT': 'Information Theory',
            'cs.MM': 'Multimedia',
            'cs.NI': 'Networking and Internet Architecture',
            'cs.OS': 'Operating Systems',
            'cs.PL': 'Programming Languages',
            'cs.SE': 'Software Engineering',
            'cs.SY': 'Systems and Control',
            'stat.ML': 'Machine Learning (Statistics)',
            'stat.AP': 'Applications (Statistics)',
            'stat.CO': 'Computation (Statistics)',
            'stat.ME': 'Methodology (Statistics)',
            'stat.TH': 'Statistics Theory',
            'math.ST': 'Statistics Theory (Mathematics)',
            'math.PR': 'Probability (Mathematics)',
            'math.OC': 'Optimization and Control',
            'math.NA': 'Numerical Analysis',
            'eess.AS': 'Audio and Speech Processing',
            'eess.IV': 'Image and Video Processing',
            'eess.SP': 'Signal Processing',
            'eess.SY': 'Systems and Control',
            'q-bio.QM': 'Quantitative Methods',
            'q-bio.NC': 'Neurons and Cognition',
            'physics.data-an': 'Data Analysis, Statistics and Probability'
        }


# Backward compatibility aliases
class ArxivFetcher(PaperFetcher):
    """Backward compatibility class for ArxivFetcher"""

    def __init__(self, config=None):
        super().__init__(config)

    def search_papers(self, query: str, max_results: int = 10, **kwargs) -> List[Dict[str, Any]]:
        """Search only ArXiv for backward compatibility"""
        return super().search_papers(query, max_results, sources=['arxiv'], **kwargs)


# Main class alias for the unified fetcher
UnifiedFetcher = PaperFetcher
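

# --- Usage sketch (illustrative only, not part of the fetcher API) ----------
# A minimal example of how the module is intended to be used: one cross-source
# search followed by single-paper lookups. It assumes network access to the
# public ArXiv/Crossref endpoints configured above; the query string and the
# example identifiers below are arbitrary placeholders you would replace.
if __name__ == "__main__":
    fetcher = PaperFetcher()

    # Cross-source search, newest papers first
    results = fetcher.search_papers(
        "graph neural networks",
        max_results=5,
        sources=['arxiv', 'crossref'],
        sort_by="date"
    )
    for paper in results:
        print(f"[{paper['source']}] {paper.get('year')} - {paper['title']}")

    # Single-paper lookups by identifier (ArXiv ID, then DOI)
    arxiv_paper = fetcher.get_paper_by_id("1706.03762")
    doi_paper = fetcher.get_paper_by_doi("10.1038/nature14539")
    for paper in (arxiv_paper, doi_paper):
        if paper:
            print(paper['title'], '-', ', '.join(paper['authors'][:3]))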