Spaces:

MagicMeWizard
/

AI_Powered_Web_Scraper

Paused

App Files Files Community

MagicMeWizard committed on Jun 30

Commit

35f9333

verified ·

1 Parent(s): 399a018

Create app.py

Browse files

Files changed (1) hide show

app.py +701 -0

app.py ADDED Viewed

	@@ -0,0 +1,701 @@

+"""
+AI-Powered Web Scraper - app.py
+Professional-grade web content extraction and AI summarization tool for Hugging Face Spaces
+"""
+import gradio as gr
+import requests
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin, urlparse
+import pandas as pd
+from datetime import datetime
+import json
+import re
+import time
+from typing import List, Dict, Optional, Tuple
+import logging
+from pathlib import Path
+import os
+from dataclasses import dataclass
+from transformers import pipeline
+import nltk
+from nltk.tokenize import sent_tokenize
+import asyncio
+import aiohttp
+from concurrent.futures import ThreadPoolExecutor
+import hashlib
+# Download required NLTK data
+try:
+    nltk.data.find('tokenizers/punkt')
+except LookupError:
+    nltk.download('punkt', quiet=True)
+# Configure logging: INFO threshold via basicConfig, plus this module's named logger
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+@dataclass
+class ScrapedContent:
+    """Data class for scraped content with metadata"""
+    url: str
+    title: str
+    content: str
+    summary: str
+    word_count: int
+    reading_time: int
+    extracted_at: str
+    author: Optional[str] = None
+    publish_date: Optional[str] = None
+    meta_description: Optional[str] = None
+    keywords: List[str] = None
+class SecurityValidator:
+    """Security validation for URLs and content"""
+    ALLOWED_SCHEMES = {'http', 'https'}
+    BLOCKED_DOMAINS = {
+        'localhost', '127.0.0.1', '0.0.0.0',
+        '192.168.', '10.', '172.16.', '172.17.',
+        '172.18.', '172.19.', '172.20.', '172.21.',
+        '172.22.', '172.23.', '172.24.', '172.25.',
+        '172.26.', '172.27.', '172.28.', '172.29.',
+        '172.30.', '172.31.'
+    }
+    @classmethod
+    def validate_url(cls, url: str) -> Tuple[bool, str]:
+        """Validate URL for security concerns"""
+        try:
+            parsed = urlparse(url)
+            # Check scheme
+            if parsed.scheme not in cls.ALLOWED_SCHEMES:
+                return False, f"Invalid scheme: {parsed.scheme}. Only HTTP/HTTPS allowed."
+            # Check for blocked domains
+            hostname = parsed.hostname or ''
+            if any(blocked in hostname for blocked in cls.BLOCKED_DOMAINS):
+                return False, "Access to internal/local networks is not allowed."
+            # Basic malformed URL check
+            if not parsed.netloc:
+                return False, "Invalid URL format."
+            return True, "URL is valid."
+        except Exception as e:
+            return False, f"URL validation error: {str(e)}"
+class RobotsTxtChecker:
+    """Check robots.txt compliance"""
+    @staticmethod
+    def can_fetch(url: str, user_agent: str = "*") -> bool:
+        """Check if URL can be fetched according to robots.txt"""
+        try:
+            parsed_url = urlparse(url)
+            robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
+            response = requests.get(robots_url, timeout=5)
+            if response.status_code == 200:
+                # Simple robots.txt parsing (basic implementation)
+                lines = response.text.split('\n')
+                user_agent_section = False
+                for line in lines:
+                    line = line.strip()
+                    if line.startswith('User-agent:'):
+                        agent = line.split(':', 1)[1].strip()
+                        user_agent_section = agent == '*' or agent.lower() == user_agent.lower()
+                    elif user_agent_section and line.startswith('Disallow:'):
+                        disallowed = line.split(':', 1)[1].strip()
+                        if disallowed and url.endswith(disallowed):
+                            return False
+            return True
+        except Exception:
+            # If robots.txt can't be fetched, assume allowed
+            return True
+class ContentExtractor:
+    """Advanced content extraction with multiple strategies"""
+    def __init__(self):
+        self.session = requests.Session()
+        self.session.headers.update({
+            'User-Agent': 'Mozilla/5.0 (compatible; AI-WebScraper/1.0; Research Tool)',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5',
+            'Accept-Encoding': 'gzip, deflate',
+            'Connection': 'keep-alive',
+            'Upgrade-Insecure-Requests': '1',
+        })
+    def extract_content(self, url: str) -> Optional[ScrapedContent]:
+        """Extract content from URL with robust error handling"""
+        try:
+            # Security validation
+            is_valid, validation_msg = SecurityValidator.validate_url(url)
+            if not is_valid:
+                raise ValueError(f"Security validation failed: {validation_msg}")
+            # Check robots.txt
+            if not RobotsTxtChecker.can_fetch(url):
+                raise ValueError("robots.txt disallows scraping this URL")
+            # Fetch content
+            response = self.session.get(url, timeout=15)
+            response.raise_for_status()
+            # Parse HTML
+            soup = BeautifulSoup(response.content, 'html.parser')
+            # Extract metadata
+            title = self._extract_title(soup)
+            author = self._extract_author(soup)
+            publish_date = self._extract_publish_date(soup)
+            meta_description = self._extract_meta_description(soup)
+            # Extract main content
+            content = self._extract_main_content(soup)
+            if not content or len(content.strip()) < 100:
+                raise ValueError("Insufficient content extracted")
+            # Calculate metrics
+            word_count = len(content.split())
+            reading_time = max(1, word_count // 200)  # Average reading speed
+            # Extract keywords
+            keywords = self._extract_keywords(content)
+            return ScrapedContent(
+                url=url,
+                title=title,
+                content=content,
+                summary="",  # Will be filled by AI summarizer
+                word_count=word_count,
+                reading_time=reading_time,
+                extracted_at=datetime.now().isoformat(),
+                author=author,
+                publish_date=publish_date,
+                meta_description=meta_description,
+                keywords=keywords
+            )
+        except Exception as e:
+            logger.error(f"Content extraction failed for {url}: {str(e)}")
+            raise
+    def _extract_title(self, soup: BeautifulSoup) -> str:
+        """Extract page title with fallbacks"""
+        # Try meta og:title first
+        og_title = soup.find('meta', property='og:title')
+        if og_title and og_title.get('content'):
+            return og_title['content'].strip()
+        # Try regular title tag
+        title_tag = soup.find('title')
+        if title_tag:
+            return title_tag.get_text().strip()
+        # Try h1 as fallback
+        h1_tag = soup.find('h1')
+        if h1_tag:
+            return h1_tag.get_text().strip()
+        return "No title found"
+    def _extract_author(self, soup: BeautifulSoup) -> Optional[str]:
+        """Extract author information"""
+        # Try multiple selectors for author
+        author_selectors = [
+            'meta[name="author"]',
+            'meta[property="article:author"]',
+            '.author',
+            '.byline',
+            '[rel="author"]'
+        ]
+        for selector in author_selectors:
+            element = soup.select_one(selector)
+            if element:
+                if element.name == 'meta':
+                    return element.get('content', '').strip()
+                else:
+                    return element.get_text().strip()
+        return None
+    def _extract_publish_date(self, soup: BeautifulSoup) -> Optional[str]:
+        """Extract publication date"""
+        date_selectors = [
+            'meta[property="article:published_time"]',
+            'meta[name="publishdate"]',
+            'time[datetime]',
+            '.publish-date',
+            '.date'
+        ]
+        for selector in date_selectors:
+            element = soup.select_one(selector)
+            if element:
+                if element.name == 'meta':
+                    return element.get('content', '').strip()
+                elif element.name == 'time':
+                    return element.get('datetime', '').strip()
+                else:
+                    return element.get_text().strip()
+        return None
+    def _extract_meta_description(self, soup: BeautifulSoup) -> Optional[str]:
+        """Extract meta description"""
+        meta_desc = soup.find('meta', attrs={'name': 'description'})
+        if meta_desc:
+            return meta_desc.get('content', '').strip()
+        og_desc = soup.find('meta', property='og:description')
+        if og_desc:
+            return og_desc.get('content', '').strip()
+        return None
+    def _extract_main_content(self, soup: BeautifulSoup) -> str:
+        """Extract main content with multiple strategies"""
+        # Remove unwanted elements
+        for element in soup(['script', 'style', 'nav', 'header', 'footer',
+                           'aside', 'advertisement', '.ads', '.sidebar']):
+            element.decompose()
+        # Try content-specific selectors first
+        content_selectors = [
+            'article',
+            'main',
+            '.content',
+            '.post-content',
+            '.entry-content',
+            '.article-body',
+            '#content',
+            '.story-body'
+        ]
+        for selector in content_selectors:
+            element = soup.select_one(selector)
+            if element:
+                text = element.get_text(separator=' ', strip=True)
+                if len(text) > 200:  # Minimum content threshold
+                    return self._clean_text(text)
+        # Fallback: extract from body
+        body = soup.find('body')
+        if body:
+            text = body.get_text(separator=' ', strip=True)
+            return self._clean_text(text)
+        # Last resort: all text
+        return self._clean_text(soup.get_text(separator=' ', strip=True))
+    def _clean_text(self, text: str) -> str:
+        """Clean extracted text"""
+        # Remove extra whitespace
+        text = re.sub(r'\s+', ' ', text)
+        # Remove common unwanted patterns
+        text = re.sub(r'Subscribe.*?newsletter', '', text, flags=re.IGNORECASE)
+        text = re.sub(r'Click here.*?more', '', text, flags=re.IGNORECASE)
+        text = re.sub(r'Advertisement', '', text, flags=re.IGNORECASE)
+        return text.strip()
+    def _extract_keywords(self, content: str) -> List[str]:
+        """Extract basic keywords from content"""
+        # Simple keyword extraction (can be enhanced with NLP)
+        words = re.findall(r'\b[A-Za-z]{4,}\b', content.lower())
+        word_freq = {}
+        for word in words:
+            if word not in ['that', 'this', 'with', 'from', 'they', 'have', 'been', 'were', 'said']:
+                word_freq[word] = word_freq.get(word, 0) + 1
+        # Return top 10 keywords
+        sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
+        return [word for word, freq in sorted_words[:10]]
+class AISummarizer:
+    """AI-powered content summarization"""
+    def __init__(self):
+        self.summarizer = None
+        self._load_model()
+    def _load_model(self):
+        """Load summarization model with error handling"""
+        try:
+            self.summarizer = pipeline(
+                "summarization",
+                model="facebook/bart-large-cnn",
+                tokenizer="facebook/bart-large-cnn"
+            )
+            logger.info("Summarization model loaded successfully")
+        except Exception as e:
+            logger.error(f"Failed to load summarization model: {e}")
+            # Fallback to a smaller model
+            try:
+                self.summarizer = pipeline(
+                    "summarization",
+                    model="sshleifer/distilbart-cnn-12-6"
+                )
+                logger.info("Fallback summarization model loaded")
+            except Exception as e2:
+                logger.error(f"Failed to load fallback model: {e2}")
+                self.summarizer = None
+    def summarize(self, content: str, max_length: int = 300) -> str:
+        """Generate AI summary of content"""
+        if not self.summarizer:
+            return self._extractive_summary(content)
+        try:
+            # Split content into chunks if too long
+            max_input_length = 1024
+            chunks = self._split_content(content, max_input_length)
+            summaries = []
+            for chunk in chunks:
+                if len(chunk.split()) < 20:  # Skip very short chunks
+                    continue
+                result = self.summarizer(
+                    chunk,
+                    max_length=min(max_length, len(chunk.split()) // 2),
+                    min_length=30,
+                    do_sample=False
+                )
+                summaries.append(result[0]['summary_text'])
+            # Combine summaries
+            combined = ' '.join(summaries)
+            # If still too long, summarize again
+            if len(combined.split()) > max_length:
+                result = self.summarizer(
+                    combined,
+                    max_length=max_length,
+                    min_length=50,
+                    do_sample=False
+                )
+                return result[0]['summary_text']
+            return combined
+        except Exception as e:
+            logger.error(f"AI summarization failed: {e}")
+            return self._extractive_summary(content)
+    def _split_content(self, content: str, max_length: int) -> List[str]:
+        """Split content into manageable chunks"""
+        sentences = sent_tokenize(content)
+        chunks = []
+        current_chunk = []
+        current_length = 0
+        for sentence in sentences:
+            sentence_length = len(sentence.split())
+            if current_length + sentence_length > max_length and current_chunk:
+                chunks.append(' '.join(current_chunk))
+                current_chunk = [sentence]
+                current_length = sentence_length
+            else:
+                current_chunk.append(sentence)
+                current_length += sentence_length
+        if current_chunk:
+            chunks.append(' '.join(current_chunk))
+        return chunks
+    def _extractive_summary(self, content: str) -> str:
+        """Fallback extractive summarization"""
+        sentences = sent_tokenize(content)
+        if len(sentences) <= 3:
+            return content
+        # Simple extractive approach: take first, middle, and last sentences
+        summary_sentences = [
+            sentences[0],
+            sentences[len(sentences) // 2],
+            sentences[-1]
+        ]
+        return ' '.join(summary_sentences)
+class WebScraperApp:
+    """Main application class"""
+    def __init__(self):
+        self.extractor = ContentExtractor()
+        self.summarizer = AISummarizer()
+        self.scraped_data = []
+    def process_url(self, url: str, summary_length: int = 300) -> Tuple[str, str, str, str]:
+        """Process a single URL and return results"""
+        try:
+            if not url.strip():
+                return "❌ Error", "Please enter a valid URL", "", ""
+            # Add protocol if missing
+            if not url.startswith(('http://', 'https://')):
+                url = 'https://' + url
+            # Extract content
+            with gr.update():  # Show progress
+                scraped_content = self.extractor.extract_content(url)
+            # Generate summary
+            summary = self.summarizer.summarize(scraped_content.content, summary_length)
+            scraped_content.summary = summary
+            # Store result
+            self.scraped_data.append(scraped_content)
+            # Format results
+            metadata = f"""
+            **📊 Content Analysis**
+            - **Title:** {scraped_content.title}
+            - **Author:** {scraped_content.author or 'Not found'}
+            - **Published:** {scraped_content.publish_date or 'Not found'}
+            - **Word Count:** {scraped_content.word_count:,}
+            - **Reading Time:** {scraped_content.reading_time} minutes
+            - **Extracted:** {scraped_content.extracted_at}
+            """
+            keywords_text = f"**🏷️ Keywords:** {', '.join(scraped_content.keywords[:10])}" if scraped_content.keywords else ""
+            return (
+                "✅ Success",
+                metadata,
+                f"**📝 AI Summary ({len(summary.split())} words):**\n\n{summary}",
+                keywords_text
+            )
+        except Exception as e:
+            error_msg = f"Failed to process URL: {str(e)}"
+            logger.error(error_msg)
+            return "❌ Error", error_msg, "", ""
+    def export_data(self, format_type: str) -> str:
+        """Export scraped data to file"""
+        if not self.scraped_data:
+            return "No data to export"
+        try:
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            if format_type == "CSV":
+                filename = f"scraped_data_{timestamp}.csv"
+                df = pd.DataFrame([
+                    {
+                        'URL': item.url,
+                        'Title': item.title,
+                        'Author': item.author,
+                        'Published': item.publish_date,
+                        'Word Count': item.word_count,
+                        'Reading Time': item.reading_time,
+                        'Summary': item.summary,
+                        'Keywords': ', '.join(item.keywords) if item.keywords else '',
+                        'Extracted At': item.extracted_at
+                    }
+                    for item in self.scraped_data
+                ])
+                df.to_csv(filename, index=False)
+            elif format_type == "JSON":
+                filename = f"scraped_data_{timestamp}.json"
+                data = [
+                    {
+                        'url': item.url,
+                        'title': item.title,
+                        'content': item.content,
+                        'summary': item.summary,
+                        'metadata': {
+                            'author': item.author,
+                            'publish_date': item.publish_date,
+                            'word_count': item.word_count,
+                            'reading_time': item.reading_time,
+                            'keywords': item.keywords,
+                            'extracted_at': item.extracted_at
+                        }
+                    }
+                    for item in self.scraped_data
+                ]
+                with open(filename, 'w', encoding='utf-8') as f:
+                    json.dump(data, f, indent=2, ensure_ascii=False)
+            return filename
+        except Exception as e:
+            logger.error(f"Export failed: {e}")
+            return f"Export failed: {str(e)}"
+    def clear_data(self) -> str:
+        """Clear all scraped data"""
+        self.scraped_data.clear()
+        return "Data cleared successfully"
+def create_interface():
+    """Create the Gradio interface"""
+    app = WebScraperApp()
+    # Custom CSS for professional appearance
+    custom_css = """
+    .gradio-container {
+        max-width: 1200px;
+        margin: auto;
+    }
+    .main-header {
+        text-align: center;
+        background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
+        color: white;
+        padding: 2rem;
+        border-radius: 10px;
+        margin-bottom: 2rem;
+    }
+    .feature-box {
+        background: #f8f9fa;
+        border: 1px solid #e9ecef;
+        border-radius: 8px;
+        padding: 1.5rem;
+        margin: 1rem 0;
+    }
+    .status-success {
+        color: #28a745;
+        font-weight: bold;
+    }
+    .status-error {
+        color: #dc3545;
+        font-weight: bold;
+    }
+    """
+    with gr.Blocks(css=custom_css, title="AI Web Scraper") as interface:
+        # Header
+        gr.HTML("""
+        <div class="main-header">
+            <h1>🤖 AI-Powered Web Scraper</h1>
+            <p>Professional content extraction and summarization for journalists, analysts, and researchers</p>
+        </div>
+        """)
+        # Main interface
+        with gr.Row():
+            with gr.Column(scale=2):
+                # Input section
+                gr.HTML("<div class='feature-box'><h3>📡 Content Extraction</h3></div>")
+                url_input = gr.Textbox(
+                    label="Enter URL to scrape",
+                    placeholder="https://example.com/article",
+                    lines=1
+                )
+                with gr.Row():
+                    summary_length = gr.Slider(
+                        minimum=100,
+                        maximum=500,
+                        value=300,
+                        step=50,
+                        label="Summary Length (words)"
+                    )
+                scrape_btn = gr.Button("🚀 Extract & Summarize", variant="primary", size="lg")
+                # Results section
+                gr.HTML("<div class='feature-box'><h3>📊 Results</h3></div>")
+                status_output = gr.Textbox(label="Status", lines=1, interactive=False)
+                metadata_output = gr.Markdown(label="Metadata")
+                summary_output = gr.Markdown(label="AI Summary")
+                keywords_output = gr.Markdown(label="Keywords")
+            with gr.Column(scale=1):
+                # Export section
+                gr.HTML("<div class='feature-box'><h3>💾 Export Options</h3></div>")
+                export_format = gr.Radio(
+                    choices=["CSV", "JSON"],
+                    label="Export Format",
+                    value="CSV"
+                )
+                export_btn = gr.Button("📥 Export Data", variant="secondary")
+                export_status = gr.Textbox(label="Export Status", lines=2, interactive=False)
+                gr.HTML("<div class='feature-box'><h3>🧹 Data Management</h3></div>")
+                clear_btn = gr.Button("🗑️ Clear All Data", variant="secondary")
+                clear_status = gr.Textbox(label="Clear Status", lines=1, interactive=False)
+        # Usage instructions
+        with gr.Accordion("📚 Usage Instructions", open=False):
+            gr.Markdown("""
+            ### How to Use This Tool
+            1. **Enter URL**: Paste the URL of the article or webpage you want to analyze
+            2. **Adjust Settings**: Set your preferred summary length
+            3. **Extract Content**: Click "Extract & Summarize" to process the content
+            4. **Review Results**: View the extracted metadata, AI summary, and keywords
+            5. **Export Data**: Save your results in CSV or JSON format
+            ### Features
+            - 🛡️ **Security**: Built-in URL validation and robots.txt compliance
+            - 🤖 **AI Summarization**: Advanced BART model for intelligent summarization
+            - 📊 **Rich Metadata**: Author, publication date, reading time, and more
+            - 🏷️ **Keyword Extraction**: Automatic identification of key terms
+            - 💾 **Export Options**: CSV and JSON formats for further analysis
+            - 🔄 **Batch Processing**: Process multiple URLs and export all results
+            ### Supported Content
+            - News articles and blog posts
+            - Research papers and reports
+            - Documentation and guides
+            - Most HTML-based content
+            ### Limitations
+            - Respects robots.txt restrictions
+            - Cannot access password-protected content
+            - Some dynamic content may not be captured
+            - Processing time varies with content length
+            """)
+        # Event handlers
+        scrape_btn.click(
+            fn=app.process_url,
+            inputs=[url_input, summary_length],
+            outputs=[status_output, metadata_output, summary_output, keywords_output]
+        )
+        export_btn.click(
+            fn=app.export_data,
+            inputs=[export_format],
+            outputs=[export_status]
+        )
+        clear_btn.click(
+            fn=app.clear_data,
+            outputs=[clear_status]
+        )
+    return interface
+# Launch the application
+if __name__ == "__main__":
+    interface = create_interface()
+    interface.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+        show_error=True
+    )