Spaces:
Sleeping
Sleeping
Add scholar endpoint + rework
Browse files
app.py
CHANGED
|
@@ -9,7 +9,7 @@ import logging
|
|
| 9 |
import uvicorn
|
| 10 |
|
| 11 |
from scrap import scrap_patent_async, scrap_patent_bulk_async
|
| 12 |
-
from serp import SerpResults, query_bing_search, query_brave_search, query_ddg_search, query_google_patents
|
| 13 |
|
| 14 |
logging.basicConfig(
|
| 15 |
level=logging.INFO,
|
|
@@ -54,9 +54,18 @@ class SerpQuery(BaseModel):
|
|
| 54 |
|
| 55 |
|
| 56 |
@serp_router.post("/search_scholar")
|
| 57 |
-
async def
|
| 58 |
"""Queries google scholar for the specified query"""
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
|
| 62 |
@serp_router.post("/search_patents")
|
|
|
|
| 9 |
import uvicorn
|
| 10 |
|
| 11 |
from scrap import scrap_patent_async, scrap_patent_bulk_async
|
| 12 |
+
from serp import SerpResults, query_bing_search, query_brave_search, query_ddg_search, query_google_patents, query_google_scholar
|
| 13 |
|
| 14 |
logging.basicConfig(
|
| 15 |
level=logging.INFO,
|
|
|
|
| 54 |
|
| 55 |
|
| 56 |
@serp_router.post("/search_scholar")
async def search_google_scholar(params: SerpQuery):
    """Queries google scholar for the specified query"""
    # Collect results across every query in the request; failed queries are
    # logged and skipped so one bad query doesn't sink the whole request.
    results = []
    for query in params.queries:
        logging.info(f"Searching Google Scholar with query `{query}`")
        try:
            results.extend(await query_google_scholar(pw_browser, query, params.n_results))
        except Exception as e:
            logging.error(
                f"Failed to query Google Scholar with query `{query}`: {e}")
    return SerpResults(results=results, error=None)
|
| 69 |
|
| 70 |
|
| 71 |
@serp_router.post("/search_patents")
|
serp.py
CHANGED
|
@@ -33,9 +33,46 @@ async def playwright_open_page(browser: Browser):
|
|
| 33 |
await context.close()
|
| 34 |
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
|
| 37 |
"""Queries google patents for the specified query and number of results. Returns relevant patents"""
|
| 38 |
|
|
|
|
|
|
|
|
|
|
| 39 |
async with playwright_open_page(browser) as page:
|
| 40 |
|
| 41 |
async def _block_resources(route, request):
|
|
@@ -46,36 +83,31 @@ async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
|
|
| 46 |
|
| 47 |
await page.route("**/*", _block_resources)
|
| 48 |
|
| 49 |
-
url = f"https://patents.google.com/?q=
|
| 50 |
await page.goto(url)
|
| 51 |
|
|
|
|
|
|
|
| 52 |
await page.wait_for_function(
|
| 53 |
-
f"""() => document.querySelectorAll('search-result-item').length >=
|
| 54 |
timeout=30_000
|
| 55 |
)
|
| 56 |
|
| 57 |
-
# regex to locate a patent id
|
| 58 |
-
PATENT_ID_REGEX = r"\b[A-Z]{2}\d{6,}(?:[A-Z]\d?)?\b"
|
| 59 |
-
|
| 60 |
items = await page.locator("search-result-item").all()
|
| 61 |
results = []
|
| 62 |
for item in items:
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
if not found:
|
| 67 |
continue
|
| 68 |
|
| 69 |
-
|
| 70 |
-
patent_id = found[0]
|
| 71 |
-
|
| 72 |
-
# extract patent title
|
| 73 |
-
title = await item.locator("h3, h4").first.inner_text(timeout=1000)
|
| 74 |
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
"div.abstract, div.result-snippet, .snippet, .result-text")
|
| 78 |
-
|
|
|
|
| 79 |
|
| 80 |
results.append({
|
| 81 |
"id": patent_id,
|
|
|
|
| 33 |
await context.close()
|
| 34 |
|
| 35 |
|
| 36 |
+
async def query_google_scholar(browser: Browser, q: str, n_results: int = 10):
    """Queries google scholar for the specified query and number of results. Returns relevant papers

    Each returned entry is a dict with keys "title", "body" (the result
    snippet) and "href" (link to the paper). Results that lack a title,
    snippet or link (e.g. [CITATION]-only entries) are skipped instead of
    aborting the whole scrape, matching the behaviour of
    query_google_patents.
    """

    async with playwright_open_page(browser) as page:

        async def _block_resources(route, request):
            # Block styling/images to cut page weight — we only need the DOM text.
            if request.resource_type in ["stylesheet", "image"]:
                await route.abort()
            else:
                await route.continue_()

        await page.route("**/*", _block_resources)

        url = f"https://scholar.google.com/scholar?q={quote_plus(q)}&num={n_results}"
        await page.goto(url)

        # Wait until at least one result container is attached before scraping.
        await page.wait_for_selector("div.gs_ri")

        items = await page.locator("div.gs_ri").all()
        results = []
        for item in items[:n_results]:
            try:
                title = await item.locator("h3").inner_text(timeout=1000)
                body = await item.locator("div.gs_rs").inner_text(timeout=1000)
                # Explicit timeout: without it a missing anchor would wait the
                # default 30s before raising.
                href = await item.locator("h3 > a").get_attribute("href", timeout=1000)
            except Exception:
                # Malformed/linkless result — skip it rather than fail the query.
                continue

            results.append({
                "title": title,
                "body": body,
                "href": href
            })

        return results
|
| 68 |
+
|
| 69 |
+
|
| 70 |
async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
|
| 71 |
"""Queries google patents for the specified query and number of results. Returns relevant patents"""
|
| 72 |
|
| 73 |
+
# regex to locate a patent id
|
| 74 |
+
PATENT_ID_REGEX = r"\b[A-Z]{2}\d{6,}(?:[A-Z]\d?)?\b"
|
| 75 |
+
|
| 76 |
async with playwright_open_page(browser) as page:
|
| 77 |
|
| 78 |
async def _block_resources(route, request):
|
|
|
|
| 83 |
|
| 84 |
await page.route("**/*", _block_resources)
|
| 85 |
|
| 86 |
+
url = f"https://patents.google.com/?q={quote_plus(q)}&num={n_results}"
|
| 87 |
await page.goto(url)
|
| 88 |
|
| 89 |
+
# Wait for at least one search result item to appear
|
| 90 |
+
# This ensures the page has loaded enough to start scraping
|
| 91 |
await page.wait_for_function(
|
| 92 |
+
f"""() => document.querySelectorAll('search-result-item').length >= 1""",
|
| 93 |
timeout=30_000
|
| 94 |
)
|
| 95 |
|
|
|
|
|
|
|
|
|
|
| 96 |
items = await page.locator("search-result-item").all()
|
| 97 |
results = []
|
| 98 |
for item in items:
|
| 99 |
+
text = " ".join(await item.locator("span").all_inner_texts())
|
| 100 |
+
match = re.search(PATENT_ID_REGEX, text)
|
| 101 |
+
if not match:
|
|
|
|
| 102 |
continue
|
| 103 |
|
| 104 |
+
patent_id = match.group()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
|
| 106 |
+
try:
|
| 107 |
+
title = await item.locator("h3, h4").first.inner_text(timeout=1000)
|
| 108 |
+
body = await item.locator("div.abstract, div.result-snippet, .snippet, .result-text").first.inner_text(timeout=1000)
|
| 109 |
+
except:
|
| 110 |
+
continue # If we can't get title or body, skip this item
|
| 111 |
|
| 112 |
results.append({
|
| 113 |
"id": patent_id,
|