Spaces:
Paused
Paused
| from bs4 import BeautifulSoup | |
| from urllib.parse import urljoin, urlparse, parse_qs | |
| import logging | |
| import hashlib | |
def get_relevant_images(soup: BeautifulSoup, url: str) -> list:
    """Extract and rank relevant images from a parsed page.

    Args:
        soup: Parsed HTML of the page.
        url: Base URL used to resolve relative image sources.

    Returns:
        Up to 10 dicts of the form {'url': str, 'score': int}, highest
        score first. Returns [] on any unexpected error.
    """
    image_urls = []
    try:
        # Only consider <img> tags that actually declare a source.
        all_images = soup.find_all('img', src=True)

        for img in all_images:
            # Resolve relative src values against the page URL; skip
            # non-HTTP(S) results (data: URIs, mailto:, etc.).
            img_src = urljoin(url, img['src'])
            if not img_src.startswith(('http://', 'https://')):
                continue

            score = 0
            # Class names that typically mark prominent images get the top score.
            if any(cls in img.get('class', []) for cls in
                   ['header', 'featured', 'hero', 'thumbnail', 'main', 'content']):
                score = 4  # Highest score
            # Otherwise fall back to scoring by declared dimensions.
            elif img.get('width') and img.get('height'):
                width = parse_dimension(img['width'])
                height = parse_dimension(img['height'])
                if width and height:
                    if width >= 2000 and height >= 1000:
                        score = 3  # Medium score (very large images)
                    elif width >= 1600 or height >= 800:
                        score = 2  # Lower score
                    elif width >= 800 or height >= 500:
                        score = 1  # Lowest score
                    elif width >= 500 or height >= 300:
                        score = 0  # Lowest score
                    else:
                        continue  # Skip small images
            image_urls.append({'url': img_src, 'score': score})

        # Sort images by score (highest first).
        sorted_images = sorted(image_urls, key=lambda x: x['score'], reverse=True)

        # BUG FIX: the original kept only scores in [3, 2], which silently
        # dropped the score-4 (class-matched) images entirely. Keep every
        # image scoring >= 2, then top up with score-1 images to 10 total.
        high_score_images = [img for img in sorted_images if img['score'] >= 2]
        low_score_images = [img for img in sorted_images if img['score'] == 1]

        result = high_score_images + low_score_images[:max(0, 10 - len(high_score_images))]
        return result[:10]  # Ensure we don't return more than 10 images in total
    except Exception as e:
        logging.error(f"Error in get_relevant_images: {e}")
        return []
def parse_dimension(value: str) -> int:
    """Parse an HTML width/height attribute into whole pixels.

    Handles an optional 'px' suffix and decimal values (e.g. "100.5px").

    Args:
        value: Raw attribute value such as "640", "640px", or "100.5".

    Returns:
        The dimension as an int, or None if the value is not numeric.
    """
    if value.lower().endswith('px'):
        value = value[:-2]  # Remove 'px' suffix
    try:
        # BUG FIX: the original called int(value) directly, so decimal
        # values like "100.5" raised ValueError despite the comment's
        # stated intent. Convert via float first, then truncate.
        return int(float(value))
    except (ValueError, OverflowError) as e:
        # OverflowError covers inputs like "inf"; log via logging (not
        # print) for consistency with the other helpers in this module.
        logging.error(f"Error parsing dimension value {value}: {e}")
        return None
def extract_title(soup: BeautifulSoup) -> str:
    """Return the page's <title> text, or "" when missing or empty.

    BUG FIX: the original returned None (violating the declared -> str
    return type) when a <title> tag existed but had no string content,
    e.g. an empty tag or one with multiple child nodes.
    """
    return soup.title.string if soup.title and soup.title.string else ""
def get_image_hash(image_url: str) -> str:
    """Compute a stable identity hash for an image URL.

    The hash is derived from the path's filename combined with any 'url'
    query parameter values (used by CDNs that proxy the real image
    location), so the same image served from different hosts or with
    extra query noise hashes identically.

    Args:
        image_url: Absolute URL of the image.

    Returns:
        32-character hex MD5 digest, or None if hashing fails.
    """
    try:
        parts = urlparse(image_url)
        # Filename is everything after the final '/' in the path.
        filename = parts.path.rpartition('/')[2]
        # Keep only the 'url' query values; all other params are ignored.
        cdn_targets = parse_qs(parts.query).get('url', [])
        identifier = filename + ''.join(cdn_targets)
        return hashlib.md5(identifier.encode()).hexdigest()
    except Exception as e:
        logging.error(f"Error calculating image hash for {image_url}: {e}")
        return None