Spaces:

MagicMeWizard
/

AI_Powered_Web_Scraper

Paused

App Files Files Community

MagicMeWizard commited on Jul 1

Commit

f9f65ef

verified ·

1 Parent(s): d0e21d2

Create config.py

Browse files

Files changed (1) hide show

config.py +204 -0

config.py ADDED Viewed

	@@ -0,0 +1,204 @@

+"""
+Configuration settings for AI Web Scraper
+Centralized configuration management for security, performance, and features
+"""
+import os
+from typing import Dict, List, Optional
+from dataclasses import dataclass
+@dataclass
+class SecurityConfig:
+    """Security-related configuration"""
+    # URL validation settings
+    allowed_schemes: List[str] = None
+    blocked_domains: List[str] = None
+    max_url_length: int = 2048
+    # Rate limiting
+    requests_per_minute: int = 30
+    requests_per_hour: int = 500
+    # Content safety
+    max_content_size: int = 10 * 1024 * 1024  # 10MB
+    max_processing_time: int = 60  # seconds
+    def __post_init__(self):
+        if self.allowed_schemes is None:
+            self.allowed_schemes = ['http', 'https']
+        if self.blocked_domains is None:
+            self.blocked_domains = [
+                'localhost', '127.0.0.1', '0.0.0.0',
+                '192.168.', '10.', '172.16.', '172.17.',
+                '172.18.', '172.19.', '172.20.', '172.21.',
+                '172.22.', '172.23.', '172.24.', '172.25.',
+                '172.26.', '172.27.', '172.28.', '172.29.',
+                '172.30.', '172.31.'
+            ]
+@dataclass
+class ModelConfig:
+    """AI model configuration"""
+    # Primary summarization model
+    primary_model: str = "facebook/bart-large-cnn"
+    # Fallback model for faster processing
+    fallback_model: str = "sshleifer/distilbart-cnn-12-6"
+    # Model parameters
+    max_input_length: int = 1024
+    max_summary_length: int = 500
+    min_summary_length: int = 30
+    # Performance settings
+    device: str = "auto"  # auto, cpu, cuda
+    batch_size: int = 1
+    use_fast_tokenizer: bool = True
+@dataclass
+class ScrapingConfig:
+    """Web scraping configuration"""
+    # Request settings
+    timeout: int = 15
+    max_retries: int = 3
+    retry_delay: int = 1
+    # User agent string
+    user_agent: str = "Mozilla/5.0 (compatible; AI-WebScraper/1.0; Research Tool)"
+    # Content extraction
+    min_content_length: int = 100
+    max_content_length: int = 100000
+    # Robots.txt settings
+    respect_robots_txt: bool = True
+    robots_cache_duration: int = 3600  # seconds
+@dataclass
+class UIConfig:
+    """User interface configuration"""
+    # Default values
+    default_summary_length: int = 300
+    max_summary_length: int = 500
+    min_summary_length: int = 100
+    # Interface settings
+    enable_batch_processing: bool = True
+    max_batch_size: int = 10
+    show_advanced_options: bool = False
+    # Export settings
+    supported_export_formats: List[str] = None
+    def __post_init__(self):
+        if self.supported_export_formats is None:
+            self.supported_export_formats = ["CSV", "JSON"]
+class Config:
+    """Main configuration class"""
+    def __init__(self):
+        self.security = SecurityConfig()
+        self.models = ModelConfig()
+        self.scraping = ScrapingConfig()
+        self.ui = UIConfig()
+        # Load from environment variables if available
+        self._load_from_env()
+    def _load_from_env(self):
+        """Load configuration from environment variables"""
+        # Security settings
+        if os.getenv('MAX_REQUESTS_PER_MINUTE'):
+            self.security.requests_per_minute = int(os.getenv('MAX_REQUESTS_PER_MINUTE'))
+        if os.getenv('MAX_CONTENT_SIZE'):
+            self.security.max_content_size = int(os.getenv('MAX_CONTENT_SIZE'))
+        # Model settings
+        if os.getenv('PRIMARY_MODEL'):
+            self.models.primary_model = os.getenv('PRIMARY_MODEL')
+        if os.getenv('FALLBACK_MODEL'):
+            self.models.fallback_model = os.getenv('FALLBACK_MODEL')
+        if os.getenv('DEVICE'):
+            self.models.device = os.getenv('DEVICE')
+        # Scraping settings
+        if os.getenv('REQUEST_TIMEOUT'):
+            self.scraping.timeout = int(os.getenv('REQUEST_TIMEOUT'))
+        if os.getenv('USER_AGENT'):
+            self.scraping.user_agent = os.getenv('USER_AGENT')
+        if os.getenv('RESPECT_ROBOTS_TXT'):
+            self.scraping.respect_robots_txt = os.getenv('RESPECT_ROBOTS_TXT').lower() == 'true'
+    def get_model_device(self) -> str:
+        """Get the appropriate device for model inference"""
+        if self.models.device == "auto":
+            try:
+                import torch
+                return "cuda" if torch.cuda.is_available() else "cpu"
+            except ImportError:
+                return "cpu"
+        return self.models.device
+    def is_url_allowed(self, url: str) -> bool:
+        """Check if URL is allowed based on security settings"""
+        from urllib.parse import urlparse
+        try:
+            parsed = urlparse(url)
+            # Check scheme
+            if parsed.scheme not in self.security.allowed_schemes:
+                return False
+            # Check blocked domains
+            hostname = parsed.hostname or ''
+            for blocked in self.security.blocked_domains:
+                if blocked in hostname:
+                    return False
+            # Check URL length
+            if len(url) > self.security.max_url_length:
+                return False
+            return True
+        except Exception:
+            return False
+    def get_request_headers(self) -> Dict[str, str]:
+        """Get standard request headers"""
+        return {
+            'User-Agent': self.scraping.user_agent,
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5',
+            'Accept-Encoding': 'gzip, deflate',
+            'Connection': 'keep-alive',
+            'Upgrade-Insecure-Requests': '1',
+        }
+# Global configuration instance
+config = Config()
+# Environment-specific overrides for Hugging Face Spaces
+if os.getenv('SPACE_ID'):
+    # Running on Hugging Face Spaces
+    config.models.device = "auto"
+    config.security.requests_per_minute = 20  # More conservative on shared infrastructure
+    config.scraping.timeout = 10  # Shorter timeout on shared infrastructure
+    # Enable GPU if available
+    if os.getenv('CUDA_VISIBLE_DEVICES'):
+        config.models.device = "cuda"
+# Development mode overrides
+if os.getenv('ENVIRONMENT') == 'development':
+    config.security.requests_per_minute = 100
+    config.scraping.timeout = 30
+    config.ui.show_advanced_options = True