Implement arXiv backend
- app.py +19 -2
- docs/docs.md +1 -0
- requirements.txt +2 -1
- serp.py +35 -0
app.py CHANGED

@@ -10,7 +10,7 @@ import logging
 import uvicorn
 
 from scrap import PatentScrapBulkResponse, scrap_patent_async, scrap_patent_bulk_async
-from serp import SerpQuery, SerpResults, query_bing_search, query_brave_search, query_ddg_search, query_google_patents, query_google_scholar
+from serp import SerpQuery, SerpResults, query_arxiv, query_bing_search, query_brave_search, query_ddg_search, query_google_patents, query_google_scholar
 from utils import log_gathered_exceptions
 
 logging.basicConfig(
@@ -68,6 +68,23 @@ async def search_google_scholar(params: SerpQuery):
     return SerpResults(results=flattened_results, error=None)
 
 
+@serp_router.post("/search_arxiv")
+async def search_arxiv(params: SerpQuery):
+    """Searches arxiv for the specified queries and returns the found documents."""
+    logging.info(f"Searching Arxiv for queries: {params.queries}")
+    results = await asyncio.gather(*[query_arxiv(httpx_client, q, params.n_results) for q in params.queries], return_exceptions=True)
+    log_gathered_exceptions(results, "arxiv search", params)
+
+    filtered_results = [r for r in results if not isinstance(r, Exception)]
+    flattened_results = [
+        item for sublist in filtered_results for item in sublist]
+
+    if len(filtered_results) == 0:
+        return SerpResults(results=[], error=str(results[-1]))
+
+    return SerpResults(results=flattened_results, error=None)
+
+
 @serp_router.post("/search_patents")
 async def search_patents(params: SerpQuery) -> SerpResults:
     """Searches google patents for the specified queries and returns the found documents."""
@@ -215,4 +232,4 @@ async def scrap_patents(params: ScrapPatentsRequest) -> PatentScrapBulkResponse:
 app.include_router(serp_router)
 app.include_router(scrap_router)
 
-uvicorn.run(app, host="
+uvicorn.run(app, host="0.0.0.0", port=7860)
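For reference, a minimal client-side sketch of calling the new endpoint once the app is running. The /serp prefix is assumed from the docs, and the SerpQuery payload fields ("queries", "n_results") are inferred from how the handler reads params; neither is verified against the actual model definition, so treat this as a sketch rather than the documented API.

import httpx

# Hypothetical request to the new arXiv endpoint; field names are inferred
# from the handler above and the /serp prefix is assumed from docs.md.
payload = {"queries": ["diffusion language models"], "n_results": 3}
resp = httpx.post("http://localhost:7860/serp/search_arxiv", json=payload)  # port taken from uvicorn.run above
resp.raise_for_status()
data = resp.json()  # SerpResults shape: {"results": [...], "error": ...}
print(data["error"], len(data["results"]))

Each entry in "results" should carry the title/href/body/id keys produced by query_arxiv in serp.py.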
docs/docs.md CHANGED

@@ -8,6 +8,7 @@ SERPent exposes an unified API to query SERP (Search Engine Result Pages) for a
 - Brave
 - Bing
 - Google Patents
+- arXiv
 - Google
 
 The application uses the `playwright` library to control a headless web browser, to simulate normal user activity, to fool the anti-bot measures often present on those sites. See the `/serp/` endpoints for search results scrapping.
requirements.txt CHANGED

@@ -4,4 +4,5 @@ pydantic
 playwright
 duckduckgo_search
 beautifulsoup4
-httpx
+httpx
+lxml
serp.py CHANGED

@@ -1,11 +1,13 @@
 from contextlib import asynccontextmanager
 from typing import Optional
 from duckduckgo_search import DDGS
+import httpx
 from pydantic import BaseModel, Field
 from playwright.async_api import Browser, BrowserContext, Page, TimeoutError
 from urllib.parse import quote_plus
 import logging
 import re
+from lxml import etree
 from asyncio import Semaphore
 
 # Concurrency limit for Playwright browser contexts.
@@ -243,3 +245,36 @@ async def query_ddg_search(q: str, n_results: int = 10):
             {"title": result["title"], "body": result["body"], "href": result["href"]})
 
     return results
+
+
+async def query_arxiv(client: httpx.AsyncClient, query: str, max_results: int = 3):
+    """Searches arXiv for the specified query and returns a list of results with titles and PDF URLs."""
+    ATOM_NAMESPACE = {'atom': 'http://www.w3.org/2005/Atom'}
+    ARXIV_API_URL = 'https://export.arxiv.org/api/query?'
+
+    search_params = {
+        'search_query': query,
+        'start': 0,
+        'max_results': max_results
+    }
+    query_url = ARXIV_API_URL
+
+    response = await client.get(query_url, params=search_params)
+    response.raise_for_status()
+
+    root = etree.fromstring(response.content)
+    entries = root.findall('atom:entry', ATOM_NAMESPACE)
+
+    results = []
+    for entry in entries:
+        title = entry.find(
+            'atom:title', ATOM_NAMESPACE).text.strip().replace('\n', ' ')
+        id = entry.find('atom:id', ATOM_NAMESPACE).text.strip()
+        pdf_url = entry.find(
+            'atom:id', ATOM_NAMESPACE).text.replace('/abs/', '/pdf/')
+        summary = entry.find(
+            'atom:summary', ATOM_NAMESPACE).text.strip()
+        results.append({'title': title, 'href': pdf_url,
+                        'body': summary, 'id': id})
+
+    return results
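And a standalone sketch of exercising the new helper directly, outside the FastAPI app. It assumes serp.py and the packages in requirements.txt are installed; a locally created httpx.AsyncClient stands in for the app-managed httpx_client that the endpoint passes in.

import asyncio
import httpx

from serp import query_arxiv  # assumes serp.py is on the import path

async def main():
    # Query the arXiv Atom API and print the parsed results.
    async with httpx.AsyncClient() as client:
        hits = await query_arxiv(client, "retrieval augmented generation", max_results=3)
        for hit in hits:
            print(hit["title"], "->", hit["href"])

asyncio.run(main())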