Extract field, background and description
Files changed:
- .gitignore (+2 -1)
- app.py (+1 -1)
- scrap.py (+29 -1)
.gitignore CHANGED
@@ -1,2 +1,3 @@
 __pycache__
-.vscode
+.vscode
+.venv
app.py CHANGED
@@ -24,7 +24,7 @@ pw_browser: Optional[Browser] = None
 
 # httpx client
 httpx_client = httpx.AsyncClient(timeout=30, limits=httpx.Limits(
-    max_connections=
+    max_connections=30, max_keepalive_connections=20))
 
 
 @asynccontextmanager
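The app.py change sets explicit pool limits on the shared httpx client. As a rough, self-contained sketch of what those limits do (the patent URLs below are invented for illustration, not taken from this repo):

import asyncio
import httpx

# Same settings as the diff: at most 30 concurrent connections,
# of which up to 20 are kept alive for reuse between requests.
client = httpx.AsyncClient(
    timeout=30,
    limits=httpx.Limits(max_connections=30, max_keepalive_connections=20),
)

async def fetch_status(url: str) -> int:
    resp = await client.get(url)
    return resp.status_code

async def main() -> None:
    # gather() may start many tasks at once, but the pool caps the number
    # of sockets actually open; excess requests queue inside the client.
    urls = [f"https://patents.google.com/patent/US{n}B2/en"  # hypothetical URLs
            for n in range(11000000, 11000050)]
    print(await asyncio.gather(*(fetch_status(u) for u in urls), return_exceptions=True))
    await client.aclose()

if __name__ == "__main__":
    asyncio.run(main())

Sharing one AsyncClient this way reuses keep-alive connections instead of paying a new TCP/TLS handshake on every request.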
scrap.py CHANGED
@@ -1,5 +1,6 @@
 import asyncio
 import logging
+import re
 from typing import Optional
 from httpx import AsyncClient
 from bs4 import BeautifulSoup
@@ -8,10 +9,18 @@ from pydantic import BaseModel
 
 class PatentScrapResult(BaseModel):
     """Schema for the result of scraping a google patents page."""
+    # The title of the patent.
     title: str
+    # The abstract of the patent, if available.
     abstract: Optional[str] = None
+    # The full description of the patent containing the field of the invention, background, summary, etc.
     description: Optional[str] = None
+    # The full claims of the patent.
     claims: Optional[str] = None
+    # The field of the invention, if available.
+    field_of_invention: Optional[str] = None
+    # The background of the invention, if available.
+    background: Optional[str] = None
 
 
 async def scrap_patent_async(client: AsyncClient, patent_url: str) -> PatentScrapResult:
@@ -34,6 +43,18 @@ async def scrap_patent_async(client: AsyncClient, patent_url: str) -> PatentScrapResult:
         description = description_section.get_text(
             separator="\n", strip=True) if description_section else None
 
+        # Field of the Invention
+        invention_field_match = re.findall(
+            r"(FIELD OF THE INVENTION|TECHNICAL FIELD)(.*?)(?:(BACKGROUND|BACKGROUND OF THE INVENTION|SUMMARY|BRIEF SUMMARY|DETAILED DESCRIPTION|DESCRIPTION OF THE RELATED ART))", description, re.IGNORECASE | re.DOTALL) if description_section else None
+        invention_field = invention_field_match[0][1].strip(
+        ) if invention_field_match else None
+
+        # Background of the Invention
+        invention_background_match = re.findall(
+            r"(BACKGROUND OF THE INVENTION|BACKGROUND)(.*?)(?:(SUMMARY|BRIEF SUMMARY|DETAILED DESCRIPTION|DESCRIPTION OF THE PREFERRED EMBODIMENTS|DESCRIPTION))", description, re.IGNORECASE | re.DOTALL) if description_section else None
+        invention_background = invention_background_match[0][1].strip(
+        ) if invention_background_match else None
+
         # Claims
         claims_section = soup.find("section", itemprop="claims")
         claims = claims_section.get_text(
@@ -43,11 +64,18 @@ async def scrap_patent_async(client: AsyncClient, patent_url: str) -> PatentScrapResult:
         meta_title = soup.find("meta", {"name": "DC.title"}).get(
             "content").strip()
 
+        # Patent publication number
+        # pub_num = soup.select_one("h2#pubnum").get_text(strip=True)
+        # get the h2 with id ="pubnum" and extract the text
+
         return PatentScrapResult(
+            # publication_number=pub_num,
             abstract=abstract,
             description=description,
             claims=claims,
-            title=meta_title
+            title=meta_title,
+            field_of_invention=invention_field,
+            background=invention_background
         )
     except Exception as e:
         logging.error(f"Error scraping {patent_url}: {e}")
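The new field_of_invention and background values are sliced out of the flattened description text by matching an opening heading, a lazy body capture, and a terminating heading. A standalone sketch of the same pattern, run on an invented sample description (real input comes from description_section.get_text(separator="\n", strip=True)):

import re

# Invented sample text for illustration only.
description = """TECHNICAL FIELD
The present invention relates to self-sealing widgets.
BACKGROUND
Conventional widgets leak under pressure.
SUMMARY
A widget with a compliant seal is provided."""

# Group 1: opening heading; group 2: section body; group 3: the heading
# that ends the section. re.DOTALL lets the lazy body span newlines.
field_match = re.findall(
    r"(FIELD OF THE INVENTION|TECHNICAL FIELD)(.*?)"
    r"(?:(BACKGROUND|BACKGROUND OF THE INVENTION|SUMMARY|BRIEF SUMMARY|"
    r"DETAILED DESCRIPTION|DESCRIPTION OF THE RELATED ART))",
    description, re.IGNORECASE | re.DOTALL)

field_of_invention = field_match[0][1].strip() if field_match else None
print(field_of_invention)
# -> The present invention relates to self-sealing widgets.

One caveat worth noting: because the match is case-insensitive and not anchored to line starts, a word like "background" appearing mid-sentence inside the field section would terminate the capture early.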