Spaces:

Shreyas094
/

GPT-Researcher

Paused

App Files Files Community

GPT-Researcher / gpt_researcher /skills /browser.py

Shreyas094

Upload 528 files

372531f verified 10 months ago

raw

history blame contribute delete

3.14 kB

	from typing import List, Dict

	from ..actions.utils import stream_output
	from ..actions.web_scraping import scrape_urls
	from ..scraper.utils import get_image_hash  # used by select_top_images to de-duplicate images


	class BrowserManager:
	    """Scrapes URLs for the researcher agent and selects images from the results."""

	    def __init__(self, researcher):
	        """
	        Store the researcher this manager works for.

	        Args:
	            researcher: Object whose `verbose`, `websocket` and `cfg` attributes are
	                read here, and which receives the scraped sources and images.
	        """
	        self.researcher = researcher

	    async def browse_urls(self, urls: List[str]) -> List[Dict]:
	        """
	        Scrape content from a list of URLs.

	        The scraped content is registered with the researcher as research
	        sources, and up to four not-yet-known images are registered as research
	        images. Progress is streamed to the researcher's websocket when the
	        researcher is verbose.

	        Args:
	            urls (List[str]): List of URLs to scrape.

	        Returns:
	            List[Dict]: List of scraped content results.
	        """
	        if self.researcher.verbose:
	            await stream_output(
	                "logs",
	                "scraping_urls",
	                f"🌐 Scraping content from {len(urls)} URLs...",
	                self.researcher.websocket,
	            )

	        # NOTE(review): scrape_urls is called without `await`; if it does blocking
	        # I/O it stalls the event loop for the duration of the scrape - confirm.
	        scraped_content, images = scrape_urls(urls, self.researcher.cfg)
	        self.researcher.add_research_sources(scraped_content)
	        new_images = self.select_top_images(images, k=4)  # Keep at most 4 new images
	        self.researcher.add_research_images(new_images)

	        if self.researcher.verbose:
	            await stream_output(
	                "logs",
	                "scraping_content",
	                f"📄 Scraped {len(scraped_content)} pages of content",
	                self.researcher.websocket,
	            )
	            # The two extra positional arguments are passed through unchanged;
	            # their meaning is defined by stream_output.
	            await stream_output(
	                "logs",
	                "scraping_images",
	                f"🖼️ Selected {len(new_images)} new images from {len(images)} total images",
	                self.researcher.websocket,
	                True,
	                new_images,
	            )
	            await stream_output(
	                "logs",
	                "scraping_complete",
	                "🌐 Scraping complete",
	                self.researcher.websocket,
	            )

	        return scraped_content

	    def select_top_images(self, images: List[Dict], k: int = 2) -> List[str]:
	        """
	        Select up to `k` image URLs, preferring high-score images and skipping duplicates.

	        Images with a score of 2 or more are considered first, followed by the
	        remaining images in their original order. An image is skipped when its
	        hash is empty, when an image with the same hash was already selected, or
	        when its URL is already among the researcher's research images.

	        Args:
	            images (List[Dict]): List of image dictionaries with 'url' and 'score' keys.
	            k (int): Maximum number of image URLs to return. Values of zero or
	                less select nothing.

	        Returns:
	            List[str]: List of selected image URLs, at most `k` long.
	        """
	        # Nothing to select. Checking k up front also keeps a non-positive k from
	        # silently disabling the size limit in the loop below.
	        if k <= 0 or not images:
	            return []

	        current_research_images = self.researcher.get_research_images()

	        # Split once so every image is hashed at most one time while the
	        # "high-score first, then the rest" ordering is preserved.
	        high_score_images = []
	        other_images = []
	        for img in images:
	            if img['score'] >= 2:
	                high_score_images.append(img)
	            else:
	                other_images.append(img)

	        unique_images = []
	        seen_hashes = set()
	        for img in high_score_images + other_images:
	            url = img['url']
	            if url in current_research_images:
	                continue

	            img_hash = get_image_hash(url)
	            if not img_hash or img_hash in seen_hashes:
	                continue

	            seen_hashes.add(img_hash)
	            unique_images.append(url)
	            if len(unique_images) >= k:
	                break

	        return unique_images