playwright-scraper-clean

Sleeping

apexherbert200 committed on May 25

Commit

9e4b598

1 Parent(s): 0540d86

Focus scraper on body content, links, and webpage images

Features:
- Extract clean body text content (removes script, style, and noscript elements)
- Get all meaningful links with text and URLs
- Take full-page screenshots (not just the visible viewport)
- Extract page title and meta description
- Filter links to only include those with non-empty text and an http:// or https:// URL
- Limit link text to 200 characters for better performance
- Changed parameter from get_content to get_body for clarity

Example: /scrape?url=https://example.com&screenshot=true&get_links=true&get_body=true

Files changed (1) hide show

scrape.py +60 -20

scrape.py CHANGED Viewed

@@ -17,27 +17,35 @@ class LinkInfo(BaseModel):
     href: str
 class ScrapeResponse(BaseModel):
-    content: Optional[str] = None
     screenshot: Optional[str] = None
     links: Optional[List[LinkInfo]] = None
 @app.get("/")
 async def root():
     return {
-        "message": "Playwright Web Scraper API",
         "endpoints": {
-            "/scrape": "Scrape a webpage (GET request)",
             "/docs": "API documentation"
         },
-        "example": "/scrape?url=https://example.com&screenshot=true&get_links=true&get_content=false"
     }
 @app.get("/scrape")
 async def scrape_page(
     url: str = Query(..., description="URL to scrape"),
-    screenshot: bool = Query(True, description="Take a screenshot"),
-    get_links: bool = Query(True, description="Extract links"),
-    get_content: bool = Query(False, description="Get page content (can be large)")
 ):
     logger.info(f"Starting scrape for URL: {url}")
     try:
@@ -62,28 +70,60 @@ async def scrape_page(
                 await page.goto(url, wait_until="networkidle")
                 response = ScrapeResponse()
-                # Get page content
-                if get_content:
-                    logger.info("Getting page content...")
-                    response.content = await page.content()
-                # Get screenshot
                 if screenshot:
-                    logger.info("Taking screenshot...")
-                    screenshot_bytes = await page.screenshot()
                     response.screenshot = base64.b64encode(screenshot_bytes).decode('utf-8')
-                # Get links
                 if get_links:
                     logger.info("Extracting links...")
                     links = await page.evaluate("""
                         () => {
-                            return Array.from(document.querySelectorAll('a')).map(a => {
-                                return {
-                                    text: a.innerText.trim(),
-                                    href: a.href
                                 }
-                            });
                         }
                     """)
                     response.links = [LinkInfo(**link) for link in links]

     href: str
 class ScrapeResponse(BaseModel):
+    body_content: Optional[str] = None
     screenshot: Optional[str] = None
     links: Optional[List[LinkInfo]] = None
+    page_title: Optional[str] = None
+    meta_description: Optional[str] = None
 @app.get("/")
 async def root():
     return {
+        "message": "Playwright Web Scraper API - Body, Links & Images",
         "endpoints": {
+            "/scrape": "Scrape webpage body content, links, and take screenshot",
             "/docs": "API documentation"
         },
+        "example": "/scrape?url=https://example.com&screenshot=true&get_links=true&get_body=true",
+        "features": [
+            "Extract body tag content (clean text)",
+            "Get all links with text and URLs",
+            "Take full page screenshot",
+            "Extract page title and meta description"
+        ]
     }
 @app.get("/scrape")
 async def scrape_page(
     url: str = Query(..., description="URL to scrape"),
+    screenshot: bool = Query(True, description="Take a full page screenshot"),
+    get_links: bool = Query(True, description="Extract all links from the page"),
+    get_body: bool = Query(True, description="Extract body tag content")
 ):
     logger.info(f"Starting scrape for URL: {url}")
     try:
                 await page.goto(url, wait_until="networkidle")
                 response = ScrapeResponse()
+                # Always get page title and meta description
+                logger.info("Getting page metadata...")
+                response.page_title = await page.title()
+                meta_desc = await page.evaluate("""
+                    () => {
+                        const meta = document.querySelector('meta[name="description"]');
+                        return meta ? meta.getAttribute('content') : null;
+                    }
+                """)
+                response.meta_description = meta_desc
+                # Get body content (clean text)
+                if get_body:
+                    logger.info("Extracting body content...")
+                    body_content = await page.evaluate("""
+                        () => {
+                            const body = document.querySelector('body');
+                            if (!body) return null;
+                            // Remove script and style elements
+                            const scripts = body.querySelectorAll('script, style, noscript');
+                            scripts.forEach(el => el.remove());
+                            // Get clean text content
+                            return body.innerText.trim();
+                        }
+                    """)
+                    response.body_content = body_content
+                # Get screenshot (full page)
                 if screenshot:
+                    logger.info("Taking full page screenshot...")
+                    screenshot_bytes = await page.screenshot(full_page=True)
                     response.screenshot = base64.b64encode(screenshot_bytes).decode('utf-8')
+                # Get links with better filtering
                 if get_links:
                     logger.info("Extracting links...")
                     links = await page.evaluate("""
                         () => {
+                            return Array.from(document.querySelectorAll('a[href]')).map(a => {
+                                const text = a.innerText.trim();
+                                const href = a.href;
+                                // Only include links with meaningful text and valid URLs
+                                if (text && href && href.startsWith('http')) {
+                                    return {
+                                        text: text.substring(0, 200), // Limit text length
+                                        href: href
+                                    }
                                 }
+                                return null;
+                            }).filter(link => link !== null);
                         }
                     """)
                     response.links = [LinkInfo(**link) for link in links]