Spaces:

apexherbert200
/

playwright-scraper-clean

Paused

App Files Files Community

apexherbert200 committed on Jun 15

Commit

366b9dd

1 Parent(s): 36092a0

Using google search

Browse files

Files changed (1) hide show

test1.py +79 -32

test1.py CHANGED Viewed

@@ -1,48 +1,95 @@
 from fastapi import FastAPI
 from playwright.async_api import async_playwright, TimeoutError
-import re
 app = FastAPI()
async def scrape_google(query: str):
    """Search Google for ``query`` and collect contact details from the top hits.

    Loads the Google results page, records the title and link of every
    ``<h3>`` that sits inside an anchor with an absolute http(s) ``href``,
    then visits the first five of those links and extracts e-mail addresses
    and phone-number-like strings from each page's HTML.

    Args:
        query: Search terms, inserted verbatim into the ``q=`` URL parameter;
            the caller is responsible for URL-encoding them.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``emails`` and
        ``phones``. ``emails`` and ``phones`` hold at most two distinct
        matches each, in the order they first appear in the page; both are
        empty when the result page could not be loaded.

    Raises:
        Whatever Playwright raises if the browser cannot be launched or the
        Google results page itself cannot be loaded.
    """
    # Local import: the file-level import only brings in TimeoutError.
    from playwright.async_api import Error as PlaywrightError

    url = f"https://www.google.com/search?q={query}"
    # Compiled once, outside the per-result loop.
    email_re = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
    phone_re = re.compile(r"\+?\d[\d\s\-/]{7,}\d")

    results = []
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        try:
            context = await browser.new_context()
            page = await context.new_page()
            await page.goto(url, wait_until="domcontentloaded", timeout=60000)
            try:
                await page.wait_for_selector("div#search", timeout=10000)
            except TimeoutError:
                # Best effort: keep going with whatever has rendered so far.
                pass

            links = []
            for heading in await page.query_selector_all("h3"):
                try:
                    handle = await heading.evaluate_handle("e => e.closest('a')")
                    anchor = handle.as_element()
                    if anchor is None:
                        # Heading is not wrapped in a link.
                        continue
                    href = await anchor.get_attribute("href")
                    title = await heading.inner_text()
                except PlaywrightError:
                    # Skip this heading only; a bare ``except`` here would
                    # also swallow task cancellation.
                    continue
                # A missing or relative href cannot be navigated to.
                if href and href.startswith(("http://", "https://")):
                    links.append({"title": title, "link": href})

            for item in links[:5]:
                try:
                    await page.goto(
                        item["link"], wait_until="domcontentloaded", timeout=30000
                    )
                    html = await page.content()
                except PlaywrightError:
                    # One unreachable result must not discard the others.
                    html = ""
                # dict.fromkeys de-duplicates while keeping first-seen order,
                # so the two values returned are stable between runs.
                emails = list(dict.fromkeys(email_re.findall(html)))
                phones = list(dict.fromkeys(phone_re.findall(html)))
                results.append({
                    **item,
                    "emails": emails[:2],
                    "phones": phones[:2],
                })
        finally:
            # Close the browser even when navigation or extraction raised.
            await browser.close()
    return results
@app.get("/search")
async def search(query: str):
    """Run a Google search for ``query`` and return the scraped results.

    Args:
        query: Free-text search terms taken from the ``query`` request
            parameter.

    Returns:
        A dict with the original ``query`` and the ``results`` list produced
        by ``scrape_google``.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote_plus

    # quote_plus still turns spaces into "+", and additionally escapes
    # characters such as "&", "#" and "+" that would otherwise change the
    # meaning of the search URL.
    data = await scrape_google(quote_plus(query))
    return {"query": query, "results": data}

+# from fastapi import FastAPI
+# from playwright.async_api import async_playwright, TimeoutError
+# import re
+# app = FastAPI()
+# async def scrape_google(query: str):
+#     url = f"https://www.google.com/search?q={query}"
+#     async with async_playwright() as pw:
+#         browser = await pw.chromium.launch(headless=True)
+#         context = await browser.new_context()
+#         page = await context.new_page()
+#         await page.goto(url, wait_until="domcontentloaded", timeout=60000)
+#         try:
+#             await page.wait_for_selector("div#search", timeout=10000)
+#         except TimeoutError:
+#             pass
+#         links = []
+#         for h in await page.query_selector_all("h3"):
+#             try:
+#                 a = await h.evaluate_handle("e => e.closest('a')")
+#                 href = await a.get_attribute("href")
+#                 title = await h.inner_text()
+#                 links.append({"title": title, "link": href})
+#             except:
+#                 continue
+#         results = []
+#         for item in links[:5]:
+#             await page.goto(item["link"], wait_until="domcontentloaded", timeout=30000)
+#             html = await page.content()
+#             emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", html)
+#             phones = re.findall(r"\+?\d[\d\s\-/]{7,}\d", html)
+#             results.append({
+#                 **item,
+#                 "emails": list(set(emails))[:2],
+#                 "phones": list(set(phones))[:2]
+#             })
+#         await browser.close()
+#     return results
+# @app.get("/search")
+# async def search(query: str):
+#     data = await scrape_google(query.replace(" ", "+"))
+#     return {"query": query, "results": data}
 from fastapi import FastAPI
 from playwright.async_api import async_playwright, TimeoutError
 app = FastAPI()
async def scrape_full_page(url: str):
    """Load ``url`` in headless Chromium and return its HTML and text outline.

    Args:
        url: Address passed unchanged to ``page.goto``.

    Returns:
        A dict with two keys:
        ``html`` is the page markup as returned by ``page.content()``;
        ``content`` is a list of ``{"tag": ..., "text": ...}`` dicts, one per
        ``h1``-``h6`` and ``p`` element, where ``tag`` is the lower-cased tag
        name and ``text`` is the element's trimmed ``innerText``.

    Raises:
        Whatever Playwright raises when the browser cannot be launched or the
        page cannot be loaded; only the timeout while waiting for ``body``
        is suppressed.
    """
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        try:
            context = await browser.new_context()
            page = await context.new_page()
            await page.goto(url, wait_until="domcontentloaded", timeout=60000)
            try:
                await page.wait_for_selector("body", timeout=10000)
            except TimeoutError:
                # Best effort: fall through and capture whatever is loaded.
                pass
            html = await page.content()
            # Extract headings & paragraphs as structured JSON
            items = await page.evaluate("""
            () => {
                const data = [];
                document.querySelectorAll('h1,h2,h3,h4,h5,h6,p').forEach(el => {
                    data.push({ tag: el.tagName.toLowerCase(), text: el.innerText.trim() });
                });
                return data;
            }
            """)
        finally:
            # Close the browser even when navigation or extraction raised.
            await browser.close()
    return {"html": html, "content": items}
@app.get("/scrape")
async def scrape(url: str):
    """
    Fetches the full page and returns:
    - raw HTML
    - an array of objects: { tag: 'h1'|'p'|..., text: '...' }

    Only absolute http:// and https:// URLs are accepted; anything else is
    rejected with HTTP 400.
    """
    # Local imports keep this block self-contained.
    from urllib.parse import urlsplit

    from fastapi import HTTPException

    # The URL comes straight from the request, so refuse schemes such as
    # file:// that would make the browser read local resources.
    # NOTE(review): this does not stop requests to internal/private hosts;
    # add host filtering if the service is exposed publicly.
    try:
        parts = urlsplit(url)
    except ValueError:
        # urlsplit raises ValueError on some malformed URLs.
        parts = None
    if parts is None or parts.scheme not in ("http", "https") or not parts.netloc:
        raise HTTPException(
            status_code=400,
            detail="url must be an absolute http:// or https:// URL",
        )

    result = await scrape_full_page(url)
    return result