Update main.py
main.py CHANGED
@@ -8,11 +8,13 @@ from fastapi.templating import Jinja2Templates
 from fastapi.staticfiles import StaticFiles
 from typing import List, Dict
 import asyncio
+import os
 
 # Set up logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
 
-app = FastAPI(title="Website Scraper API")
+app = FastAPI(title="Website Scraper API with Frontend")
 
 # Mount static files
 app.mount("/static", StaticFiles(directory="static"), name="static")
@@ -26,21 +28,34 @@ MAX_PAGES = 20
 async def scrape_page(url: str, visited: set, base_domain: str) -> tuple[Dict, set]:
     """Scrape a single page for text, images, and links using Playwright."""
     try:
+        logger.info(f"Starting Playwright for URL: {url}")
         async with async_playwright() as p:
             browser = await p.chromium.launch(headless=True)
             context = await browser.new_context(
                 user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
-                viewport={"width":
+                viewport={"width": 800, "height": 600},  # Reduced viewport for performance
+                bypass_csp=True  # Bypass Content Security Policy
             )
             page = await context.new_page()
-
-            #
-
-
+
+            # Retry navigation with fallback
+            for attempt in range(2):  # Try up to 2 times
+                try:
+                    logger.info(f"Navigating to {url} (Attempt {attempt + 1})")
+                    await page.goto(url, wait_until="domcontentloaded", timeout=30000)  # 30s timeout
+                    break  # Success, exit retry loop
+                except Exception as e:
+                    logger.warning(f"Navigation attempt {attempt + 1} failed for {url}: {str(e)}")
+                    if attempt == 1:  # Last attempt
+                        logger.error(f"All navigation attempts failed for {url}")
+                        await browser.close()
+                        return {}, set()
+                    await asyncio.sleep(1)  # Wait before retry
+
             # Scroll to trigger lazy-loaded images
             await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
             await page.wait_for_timeout(2000)  # Wait for lazy-loaded content
-
+
             # Extract text content
             text_content = await page.evaluate(
                 """() => document.body.innerText"""
@@ -74,6 +89,7 @@ async def scrape_page(url: str, visited: set, base_domain: str) -> tuple[Dict, s
             links = set(urljoin(url, link) for link in links if urlparse(urljoin(url, link)).netloc == base_domain and urljoin(url, link) not in visited)
 
             await browser.close()
+            logger.info(f"Successfully scraped {url}")
 
             page_data = {
                 "url": url,
@@ -83,13 +99,14 @@ async def scrape_page(url: str, visited: set, base_domain: str) -> tuple[Dict, s
             return page_data, links
 
     except Exception as e:
-
+        logger.error(f"Error scraping {url}: {str(e)}")
        return {}, set()
 
 @app.get("/scrape")
 async def crawl_website(url: HttpUrl):
     """Crawl the website starting from the given URL and return scraped data for up to 10 pages as JSON."""
     try:
+        logger.info(f"Starting crawl for {url}")
         visited = set()
         to_visit = {str(url)}
         base_domain = urlparse(str(url)).netloc
@@ -100,27 +117,31 @@ async def crawl_website(url: HttpUrl):
             if current_url in visited:
                 continue
 
-
+            logger.info(f"Scraping: {current_url}")
             visited.add(current_url)
 
             page_data, new_links = await scrape_page(current_url, visited, base_domain)
             if page_data:
                 results.append(page_data)
                 to_visit.update(new_links)
-
+
             # Small delay to avoid overwhelming the server
             await asyncio.sleep(0.5)
 
+        logger.info(f"Crawl completed for {url}")
         return JSONResponse(content={"pages": results})
 
     except Exception as e:
+        logger.error(f"Scraping failed for {url}: {str(e)}")
         raise HTTPException(status_code=500, detail=f"Scraping failed: {str(e)}")
 
 @app.get("/")
 async def serve_home(request: Request):
     """Serve the frontend HTML page."""
+    logger.info("Serving home page")
     return templates.TemplateResponse("index.html", {"request": request})
 
 if __name__ == "__main__":
+    logger.info("Starting FastAPI server on port 7860")
     import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=7860)
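For reference, a minimal client sketch for exercising the /scrape endpoint once the Space is running. It assumes the server is reachable at localhost:7860 (the port used in the diff); the target URL and the timeout value are illustrative, and the response schema beyond the top-level "pages" list with per-page "url" fields is inferred from this diff rather than confirmed.

# Minimal sketch, assuming the app runs locally on port 7860.
# "https://example.com" is a placeholder target; the generous timeout
# reflects that crawling several pages with Playwright can take a while.
import requests

resp = requests.get(
    "http://localhost:7860/scrape",
    params={"url": "https://example.com"},
    timeout=300,
)
resp.raise_for_status()
for page in resp.json()["pages"]:
    print(page["url"])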