Spaces:
Paused
Paused
| from bs4 import BeautifulSoup | |
| from urllib.parse import urljoin, urlparse, parse_qs | |
| import logging | |
| import hashlib | |
def get_relevant_images(soup: BeautifulSoup, url: str) -> list:
    """Extract and rank relevant images from a parsed page.

    Args:
        soup: Parsed HTML of the page.
        url: Base URL used to resolve relative image sources.

    Returns:
        Up to 10 dicts of the form {'url': str, 'score': int}, highest
        score first. Returns [] on any unexpected error.
    """
    image_urls = []
    try:
        # Only consider <img> tags that actually declare a source.
        all_images = soup.find_all('img', src=True)

        for img in all_images:
            # Resolve relative src values against the page URL; skip
            # non-HTTP(S) results (data: URIs, mailto:, etc.).
            img_src = urljoin(url, img['src'])
            if not img_src.startswith(('http://', 'https://')):
                continue

            score = 0
            # Class names that typically mark prominent images get the top score.
            if any(cls in img.get('class', []) for cls in
                   ['header', 'featured', 'hero', 'thumbnail', 'main', 'content']):
                score = 4  # Highest score
            # Otherwise fall back to scoring by declared dimensions.
            elif img.get('width') and img.get('height'):
                width = parse_dimension(img['width'])
                height = parse_dimension(img['height'])
                if width and height:
                    if width >= 2000 and height >= 1000:
                        score = 3  # Medium score (very large images)
                    elif width >= 1600 or height >= 800:
                        score = 2  # Lower score
                    elif width >= 800 or height >= 500:
                        score = 1  # Lowest score
                    elif width >= 500 or height >= 300:
                        score = 0  # Lowest score
                    else:
                        continue  # Skip small images
            image_urls.append({'url': img_src, 'score': score})

        # Sort images by score (highest first).
        sorted_images = sorted(image_urls, key=lambda x: x['score'], reverse=True)

        # BUG FIX: the original kept only scores in [3, 2], which silently
        # dropped the score-4 (class-matched) images entirely. Keep every
        # image scoring >= 2, then top up with score-1 images to 10 total.
        high_score_images = [img for img in sorted_images if img['score'] >= 2]
        low_score_images = [img for img in sorted_images if img['score'] == 1]

        result = high_score_images + low_score_images[:max(0, 10 - len(high_score_images))]
        return result[:10]  # Ensure we don't return more than 10 images in total
    except Exception as e:
        logging.error(f"Error in get_relevant_images: {e}")
        return []
def parse_dimension(value: str) -> int:
    """Parse an HTML width/height attribute into whole pixels.

    Handles an optional 'px' suffix and decimal values (e.g. "100.5px").

    Args:
        value: Raw attribute value such as "640", "640px", or "100.5".

    Returns:
        The dimension as an int, or None if the value is not numeric.
    """
    if value.lower().endswith('px'):
        value = value[:-2]  # Remove 'px' suffix
    try:
        # BUG FIX: the original called int(value) directly, so decimal
        # values like "100.5" raised ValueError despite the comment's
        # stated intent. Convert via float first, then truncate.
        return int(float(value))
    except (ValueError, OverflowError) as e:
        # OverflowError covers inputs like "inf"; log via logging (not
        # print) for consistency with the other helpers in this module.
        logging.error(f"Error parsing dimension value {value}: {e}")
        return None
def extract_title(soup: BeautifulSoup) -> str:
    """Return the page's <title> text, or "" when missing or empty.

    BUG FIX: the original returned None (violating the declared -> str
    return type) when a <title> tag existed but had no string content,
    e.g. an empty tag or one with multiple child nodes.
    """
    return soup.title.string if soup.title and soup.title.string else ""
def get_image_hash(image_url: str) -> str:
    """Compute a stable identity hash for an image URL.

    The hash is derived from the path's filename combined with any 'url'
    query parameter values (used by CDNs that proxy the real image
    location), so the same image served from different hosts or with
    extra query noise hashes identically.

    Args:
        image_url: Absolute URL of the image.

    Returns:
        32-character hex MD5 digest, or None if hashing fails.
    """
    try:
        parts = urlparse(image_url)
        # Filename is everything after the final '/' in the path.
        filename = parts.path.rpartition('/')[2]
        # Keep only the 'url' query values; all other params are ignored.
        cdn_targets = parse_qs(parts.query).get('url', [])
        identifier = filename + ''.join(cdn_targets)
        return hashlib.md5(identifier.encode()).hexdigest()
    except Exception as e:
        logging.error(f"Error calculating image hash for {image_url}: {e}")
        return None