playwright-scraper-clean

Sleeping

apexherbert200 committed on May 29

Commit

35b24cc

1 Parent(s): 9e4b598

🚀 Transform scraper into Lead Generation powerhouse

MAJOR FEATURES ADDED:
📧 Email extraction with regex pattern matching
📞 Phone number detection and formatting
🏢 Company name identification (multiple methods)
📍 Address extraction with US format regex
🔗 Social media profile discovery
📝 Contact form detection
⚡ Technology stack identification
🎯 Industry keyword classification
📊 Intelligent lead scoring system (0-100)

LEAD SCORING ALGORITHM (points are summed, then capped at 100):
- Emails found: +30 points
- Phone numbers: +25 points
- Contact forms: +20 points
- Social media: +15 points
- Address: +15 points
- Company name: +10 points
- Technologies: +10 points
- Industry keywords: +5 points

BUSINESS APPLICATIONS:
- B2B lead generation
- Sales prospecting
- Market research
- Competitor analysis
- Contact discovery

Example: /scrape?url=https://company.com&lead_generation=true

Files changed (1) hide show

scrape.py +144 -9

scrape.py CHANGED Viewed

@@ -16,36 +16,73 @@ class LinkInfo(BaseModel):
     text: str
     href: str
 class ScrapeResponse(BaseModel):
     body_content: Optional[str] = None
     screenshot: Optional[str] = None
     links: Optional[List[LinkInfo]] = None
     page_title: Optional[str] = None
     meta_description: Optional[str] = None
 @app.get("/")
 async def root():
     return {
-        "message": "Playwright Web Scraper API - Body, Links & Images",
         "endpoints": {
-            "/scrape": "Scrape webpage body content, links, and take screenshot",
             "/docs": "API documentation"
         },
-        "example": "/scrape?url=https://example.com&screenshot=true&get_links=true&get_body=true",
-        "features": [
-            "Extract body tag content (clean text)",
-            "Get all links with text and URLs",
-            "Take full page screenshot",
-            "Extract page title and meta description"
         ]
     }
 @app.get("/scrape")
 async def scrape_page(
     url: str = Query(..., description="URL to scrape"),
     screenshot: bool = Query(True, description="Take a full page screenshot"),
     get_links: bool = Query(True, description="Extract all links from the page"),
-    get_body: bool = Query(True, description="Extract body tag content")
 ):
     logger.info(f"Starting scrape for URL: {url}")
     try:
@@ -128,6 +165,104 @@ async def scrape_page(
                     """)
                     response.links = [LinkInfo(**link) for link in links]
                 await browser.close()
                 logger.info("Scraping completed successfully")
                 return response

     text: str
     href: str
+class ContactInfo(BaseModel):
+    emails: List[str] = []
+    phones: List[str] = []
+    social_media: List[str] = []
+    contact_forms: List[str] = []
+class BusinessInfo(BaseModel):
+    company_name: Optional[str] = None
+    address: Optional[str] = None
+    description: Optional[str] = None
+    industry_keywords: List[str] = []
+class LeadData(BaseModel):
+    contact_info: ContactInfo
+    business_info: BusinessInfo
+    lead_score: int = 0
+    technologies: List[str] = []
 class ScrapeResponse(BaseModel):
     body_content: Optional[str] = None
     screenshot: Optional[str] = None
     links: Optional[List[LinkInfo]] = None
     page_title: Optional[str] = None
     meta_description: Optional[str] = None
+    lead_data: Optional[LeadData] = None
 @app.get("/")
 async def root():
     return {
+        "message": "🚀 Lead Generation Web Scraper API",
+        "tagline": "Turn any website into qualified leads",
         "endpoints": {
+            "/scrape": "Extract leads, contacts, and business data from any website",
             "/docs": "API documentation"
         },
+        "example": "/scrape?url=https://example.com&lead_generation=true&screenshot=true",
+        "lead_generation_features": [
+            "📧 Extract email addresses and contact forms",
+            "📞 Find phone numbers and contact info",
+            "🏢 Identify company names and addresses",
+            "🔗 Discover social media profiles",
+            "⚡ Detect technologies and tools used",
+            "📊 Calculate lead quality scores",
+            "🎯 Industry keyword extraction"
+        ],
+        "basic_features": [
+            "📄 Clean body text extraction",
+            "🔗 Smart link filtering",
+            "📸 Full page screenshots",
+            "📋 Page metadata extraction"
+        ],
+        "use_cases": [
+            "B2B lead generation",
+            "Sales prospecting",
+            "Market research",
+            "Competitor analysis",
+            "Contact discovery"
         ]
     }
 @app.get("/scrape")
 async def scrape_page(
     url: str = Query(..., description="URL to scrape"),
+    lead_generation: bool = Query(True, description="Extract lead generation data (emails, phones, business info)"),
     screenshot: bool = Query(True, description="Take a full page screenshot"),
     get_links: bool = Query(True, description="Extract all links from the page"),
+    get_body: bool = Query(False, description="Extract body tag content (can be large)")
 ):
     logger.info(f"Starting scrape for URL: {url}")
     try:
                     """)
                     response.links = [LinkInfo(**link) for link in links]
+                # Lead Generation Extraction
+                if lead_generation:
+                    logger.info("Extracting lead generation data...")
+                    lead_data_raw = await page.evaluate("""
+                        () => {
+                            const result = {
+                                emails: [],
+                                phones: [],
+                                social_media: [],
+                                contact_forms: [],
+                                company_name: null,
+                                address: null,
+                                technologies: [],
+                                industry_keywords: []
+                            };
+                            // Extract emails
+                            const emailRegex = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;
+                            const pageText = document.body.innerText;
+                            const emails = pageText.match(emailRegex) || [];
+                            result.emails = [...new Set(emails)].slice(0, 10); // Unique emails, max 10
+                            // Extract phone numbers
+                            const phoneRegex = /(\+?1?[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})/g;
+                            const phones = pageText.match(phoneRegex) || [];
+                            result.phones = [...new Set(phones)].slice(0, 5); // Unique phones, max 5
+                            // Extract social media links
+                            const socialLinks = Array.from(document.querySelectorAll('a[href]')).map(a => a.href)
+                                .filter(href => /facebook|twitter|linkedin|instagram|youtube|tiktok/i.test(href));
+                            result.social_media = [...new Set(socialLinks)].slice(0, 10);
+                            // Find contact forms
+                            const forms = Array.from(document.querySelectorAll('form')).map(form => {
+                                const action = form.action || window.location.href;
+                                return action;
+                            });
+                            result.contact_forms = [...new Set(forms)].slice(0, 5);
+                            // Extract company name (try multiple methods)
+                            result.company_name =
+                                document.querySelector('meta[property="og:site_name"]')?.content ||
+                                document.querySelector('meta[name="application-name"]')?.content ||
+                                document.querySelector('h1')?.innerText?.trim() ||
+                                document.title?.split('|')[0]?.split('-')[0]?.trim();
+                            // Extract address
+                            const addressRegex = /\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Lane|Ln|Drive|Dr|Court|Ct|Place|Pl)\s*,?\s*[A-Za-z\s]+,?\s*[A-Z]{2}\s*\d{5}/g;
+                            const addresses = pageText.match(addressRegex) || [];
+                            result.address = addresses[0] || null;
+                            // Detect technologies
+                            const techKeywords = ['wordpress', 'shopify', 'react', 'angular', 'vue', 'bootstrap', 'jquery', 'google analytics', 'facebook pixel'];
+                            const htmlContent = document.documentElement.outerHTML.toLowerCase();
+                            result.technologies = techKeywords.filter(tech => htmlContent.includes(tech));
+                            // Industry keywords
+                            const industryKeywords = ['consulting', 'marketing', 'software', 'healthcare', 'finance', 'real estate', 'education', 'retail', 'manufacturing', 'legal', 'restaurant', 'fitness', 'beauty', 'automotive'];
+                            const lowerPageText = pageText.toLowerCase();
+                            result.industry_keywords = industryKeywords.filter(keyword => lowerPageText.includes(keyword));
+                            return result;
+                        }
+                    """)
+                    # Calculate lead score
+                    lead_score = 0
+                    if lead_data_raw['emails']: lead_score += 30
+                    if lead_data_raw['phones']: lead_score += 25
+                    if lead_data_raw['contact_forms']: lead_score += 20
+                    if lead_data_raw['social_media']: lead_score += 15
+                    if lead_data_raw['company_name']: lead_score += 10
+                    if lead_data_raw['address']: lead_score += 15
+                    if lead_data_raw['technologies']: lead_score += 10
+                    if lead_data_raw['industry_keywords']: lead_score += 5
+                    # Create lead data object
+                    contact_info = ContactInfo(
+                        emails=lead_data_raw['emails'],
+                        phones=lead_data_raw['phones'],
+                        social_media=lead_data_raw['social_media'],
+                        contact_forms=lead_data_raw['contact_forms']
+                    )
+                    business_info = BusinessInfo(
+                        company_name=lead_data_raw['company_name'],
+                        address=lead_data_raw['address'],
+                        description=response.meta_description,
+                        industry_keywords=lead_data_raw['industry_keywords']
+                    )
+                    response.lead_data = LeadData(
+                        contact_info=contact_info,
+                        business_info=business_info,
+                        lead_score=min(lead_score, 100),  # Cap at 100
+                        technologies=lead_data_raw['technologies']
+                    )
                 await browser.close()
                 logger.info("Scraping completed successfully")
                 return response