Commit · 84c8f5a
1 Parent(s): 79508b3
Tool for scraping contacts
Browse files
- contacts.py +6 -1
contacts.py
CHANGED
|
@@ -41,6 +41,7 @@ class LeadData(BaseModel):
|
|
| 41 |
technologies: List[str] = []
|
| 42 |
|
| 43 |
class ScrapeResponse(BaseModel):
|
|
|
|
| 44 |
body_content: Optional[str] = None
|
| 45 |
screenshot: Optional[str] = None
|
| 46 |
links: Optional[List[LinkInfo]] = None
|
|
@@ -72,7 +73,8 @@ async def scrape_page(
|
|
| 72 |
lead_generation: bool = Query(True, description="Extract lead generation data (emails, phones, business info)"),
|
| 73 |
screenshot: bool = Query(True, description="Take a full page screenshot"),
|
| 74 |
get_links: bool = Query(True, description="Extract all links from the page"),
|
| 75 |
-
get_body: bool = Query(False, description="Extract body tag content (can be large)")
|
|
|
|
| 76 |
):
|
| 77 |
norm_url = normalize_url(url)
|
| 78 |
if norm_url in visited_urls:
|
|
@@ -98,6 +100,9 @@ async def scrape_page(
|
|
| 98 |
}
|
| 99 |
""")
|
| 100 |
|
|
|
|
|
|
|
|
|
|
| 101 |
if get_body:
|
| 102 |
response.body_content = await page.evaluate("""
|
| 103 |
() => {
|
|
|
|
| 41 |
technologies: List[str] = []
|
| 42 |
|
| 43 |
class ScrapeResponse(BaseModel):
|
| 44 |
+
full_html: Optional[str] = None
|
| 45 |
body_content: Optional[str] = None
|
| 46 |
screenshot: Optional[str] = None
|
| 47 |
links: Optional[List[LinkInfo]] = None
|
|
|
|
| 73 |
lead_generation: bool = Query(True, description="Extract lead generation data (emails, phones, business info)"),
|
| 74 |
screenshot: bool = Query(True, description="Take a full page screenshot"),
|
| 75 |
get_links: bool = Query(True, description="Extract all links from the page"),
|
| 76 |
+
get_body: bool = Query(False, description="Extract body tag content (can be large)"),
|
| 77 |
+
get_frontend: bool = Query(True, description="Get full rendered frontend HTML content")
|
| 78 |
):
|
| 79 |
norm_url = normalize_url(url)
|
| 80 |
if norm_url in visited_urls:
|
|
|
|
| 100 |
}
|
| 101 |
""")
|
| 102 |
|
| 103 |
+
if get_frontend:
|
| 104 |
+
response.full_html = await page.content()
|
| 105 |
+
|
| 106 |
if get_body:
|
| 107 |
response.body_content = await page.evaluate("""
|
| 108 |
() => {
|