Implement arXiv backend
- app.py +19 -2
- docs/docs.md +1 -0
- requirements.txt +2 -1
- serp.py +35 -0
app.py CHANGED

@@ -10,7 +10,7 @@ import logging
 import uvicorn
 
 from scrap import PatentScrapBulkResponse, scrap_patent_async, scrap_patent_bulk_async
-from serp import SerpQuery, SerpResults, query_bing_search, query_brave_search, query_ddg_search, query_google_patents, query_google_scholar
+from serp import SerpQuery, SerpResults, query_arxiv, query_bing_search, query_brave_search, query_ddg_search, query_google_patents, query_google_scholar
 from utils import log_gathered_exceptions
 
 logging.basicConfig(
@@ -68,6 +68,23 @@ async def search_google_scholar(params: SerpQuery):
     return SerpResults(results=flattened_results, error=None)
 
 
+@serp_router.post("/search_arxiv")
+async def search_arxiv(params: SerpQuery):
+    """Searches arxiv for the specified queries and returns the found documents."""
+    logging.info(f"Searching Arxiv for queries: {params.queries}")
+    results = await asyncio.gather(*[query_arxiv(httpx_client, q, params.n_results) for q in params.queries], return_exceptions=True)
+    log_gathered_exceptions(results, "arxiv search", params)
+
+    filtered_results = [r for r in results if not isinstance(r, Exception)]
+    flattened_results = [
+        item for sublist in filtered_results for item in sublist]
+
+    if len(filtered_results) == 0:
+        return SerpResults(results=[], error=str(results[-1]))
+
+    return SerpResults(results=flattened_results, error=None)
+
+
 @serp_router.post("/search_patents")
 async def search_patents(params: SerpQuery) -> SerpResults:
     """Searches google patents for the specified queries and returns the found documents."""
@@ -215,4 +232,4 @@ async def scrap_patents(params: ScrapPatentsRequest) -> PatentScrapBulkResponse:
 app.include_router(serp_router)
 app.include_router(scrap_router)
 
-uvicorn.run(app, host="
+uvicorn.run(app, host="0.0.0.0", port=7860)
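For reference, a minimal client-side sketch of calling the new endpoint once the app is running. The /serp prefix is assumed from the docs, and the SerpQuery payload fields ("queries", "n_results") are inferred from how the handler reads params; neither is verified against the actual model definition, so treat this as a sketch rather than the documented API.

import httpx

# Hypothetical request to the new arXiv endpoint; field names are inferred
# from the handler above and the /serp prefix is assumed from docs.md.
payload = {"queries": ["diffusion language models"], "n_results": 3}
resp = httpx.post("http://localhost:7860/serp/search_arxiv", json=payload)  # port taken from uvicorn.run above
resp.raise_for_status()
data = resp.json()  # SerpResults shape: {"results": [...], "error": ...}
print(data["error"], len(data["results"]))

Each entry in "results" should carry the title/href/body/id keys produced by query_arxiv in serp.py.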
docs/docs.md CHANGED

@@ -8,6 +8,7 @@ SERPent exposes an unified API to query SERP (Search Engine Result Pages) for a
 - Brave
 - Bing
 - Google Patents
+- arXiv
 - Google
 
 The application uses the `playwright` library to control a headless web browser, to simulate normal user activity, to fool the anti-bot measures often present on those sites. See the `/serp/` endpoints for search results scrapping.
requirements.txt CHANGED

@@ -4,4 +4,5 @@ pydantic
 playwright
 duckduckgo_search
 beautifulsoup4
-httpx
+httpx
+lxml
serp.py CHANGED

@@ -1,11 +1,13 @@
 from contextlib import asynccontextmanager
 from typing import Optional
 from duckduckgo_search import DDGS
+import httpx
 from pydantic import BaseModel, Field
 from playwright.async_api import Browser, BrowserContext, Page, TimeoutError
 from urllib.parse import quote_plus
 import logging
 import re
+from lxml import etree
 from asyncio import Semaphore
 
 # Concurrency limit for Playwright browser contexts.
@@ -243,3 +245,36 @@ async def query_ddg_search(q: str, n_results: int = 10):
             {"title": result["title"], "body": result["body"], "href": result["href"]})
 
     return results
+
+
+async def query_arxiv(client: httpx.AsyncClient, query: str, max_results: int = 3):
+    """Searches arXiv for the specified query and returns a list of results with titles and PDF URLs."""
+    ATOM_NAMESPACE = {'atom': 'http://www.w3.org/2005/Atom'}
+    ARXIV_API_URL = 'https://export.arxiv.org/api/query?'
+
+    search_params = {
+        'search_query': query,
+        'start': 0,
+        'max_results': max_results
+    }
+    query_url = ARXIV_API_URL
+
+    response = await client.get(query_url, params=search_params)
+    response.raise_for_status()
+
+    root = etree.fromstring(response.content)
+    entries = root.findall('atom:entry', ATOM_NAMESPACE)
+
+    results = []
+    for entry in entries:
+        title = entry.find(
+            'atom:title', ATOM_NAMESPACE).text.strip().replace('\n', ' ')
+        id = entry.find('atom:id', ATOM_NAMESPACE).text.strip()
+        pdf_url = entry.find(
+            'atom:id', ATOM_NAMESPACE).text.replace('/abs/', '/pdf/')
+        summary = entry.find(
+            'atom:summary', ATOM_NAMESPACE).text.strip()
+        results.append({'title': title, 'href': pdf_url,
+                        'body': summary, 'id': id})
+
+    return results
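And a standalone sketch of exercising the new helper directly, outside the FastAPI app. It assumes serp.py and the packages in requirements.txt are installed; a locally created httpx.AsyncClient stands in for the app-managed httpx_client that the endpoint passes in.

import asyncio
import httpx

from serp import query_arxiv  # assumes serp.py is on the import path

async def main():
    # Query the arXiv Atom API and print the parsed results.
    async with httpx.AsyncClient() as client:
        hits = await query_arxiv(client, "retrieval augmented generation", max_results=3)
        for hit in hits:
            print(hit["title"], "->", hit["href"])

asyncio.run(main())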