from concurrent.futures import ThreadPoolExecutor
from functools import partial
from colorama import Fore, init
import requests
import subprocess
import sys
import importlib.util
from . import (
    ArxivScraper,
    BeautifulSoupScraper,
    PyMuPDFScraper,
    WebBaseLoaderScraper,
    BrowserScraper,
    TavilyExtract
)


class Scraper:
    """
    Scraper class to extract the content from the links.
    """

    def __init__(self, urls, user_agent, scraper):
        """
        Initialize the Scraper class.

        Args:
            urls: List of URLs to scrape.
            user_agent: User-Agent string sent with every HTTP request.
            scraper: Key of the default scraper to use for regular web pages.
        """
        self.urls = urls
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": user_agent})
        self.scraper = scraper
        if self.scraper == "tavily_extract":
            self._check_pkg(self.scraper)

    def run(self):
        """
        Extracts the content from the links.

        Returns:
            A list of dicts with "url", "raw_content", "image_urls" and "title",
            one per link that produced usable content.
        """
        partial_extract = partial(self.extract_data_from_url, session=self.session)
        with ThreadPoolExecutor(max_workers=20) as executor:
            contents = executor.map(partial_extract, self.urls)
            res = [content for content in contents if content["raw_content"] is not None]
        return res

    def _check_pkg(self, scraper_name: str) -> None:
        """
        Checks and ensures required Python packages are available for scrapers that need
        dependencies beyond requirements.txt. When adding a new scraper to the repo, update
        `pkg_map` with its required information and call `_check_pkg()` during initialization.
        """
        pkg_map = {
            "tavily_extract": {"package_installation_name": "tavily-python",
                               "import_name": "tavily"},
        }
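        # Hypothetical example: a scraper backed by the `newspaper3k` package would be
        # registered with an entry such as
        #   "newspaper": {"package_installation_name": "newspaper3k",
        #                 "import_name": "newspaper"},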
        pkg = pkg_map[scraper_name]
        if not importlib.util.find_spec(pkg["import_name"]):
            pkg_inst_name = pkg["package_installation_name"]
            init(autoreset=True)
            print(Fore.YELLOW + f"{pkg_inst_name} not found. Attempting to install...")
            try:
                subprocess.check_call([sys.executable, "-m", "pip", "install", pkg_inst_name])
                print(Fore.GREEN + f"{pkg_inst_name} installed successfully.")
            except subprocess.CalledProcessError:
                raise ImportError(
                    Fore.RED + f"Unable to install {pkg_inst_name}. Please install manually with "
                    f"`pip install -U {pkg_inst_name}`"
                )

    def extract_data_from_url(self, link, session):
        """
        Extracts the data from the link.
        """
        try:
            scraper_class = self.get_scraper(link)
            scraper = scraper_class(link, session)
            content, image_urls, title = scraper.scrape()

            if len(content) < 100:
                return {"url": link, "raw_content": None, "image_urls": [], "title": ""}
            return {"url": link, "raw_content": content, "image_urls": image_urls, "title": title}
        except Exception:
            return {"url": link, "raw_content": None, "image_urls": [], "title": ""}

    def get_scraper(self, link):
        """
        Determines the appropriate scraper class for the provided link, falling back to the
        default scraper configured at initialization if no special case matches.

        Args:
            link: A URL pointing to a web page or a PDF file. The kind of content the link
                points to determines which scraper class is used to extract data from it.

        Returns:
            The scraper class selected from the predefined mappings in the `SCRAPER_CLASSES`
            dictionary. If the link ends with ".pdf", the `PyMuPDFScraper` class is returned;
            if the link contains "arxiv.org", the `ArxivScraper` class is returned; otherwise
            the scraper configured at initialization is used.
        """
        SCRAPER_CLASSES = {
            "pdf": PyMuPDFScraper,
            "arxiv": ArxivScraper,
            "bs": BeautifulSoupScraper,
            "web_base_loader": WebBaseLoaderScraper,
            "browser": BrowserScraper,
            "tavily_extract": TavilyExtract
        }
        if link.endswith(".pdf"):
            scraper_key = "pdf"
        elif "arxiv.org" in link:
            scraper_key = "arxiv"
        else:
            scraper_key = self.scraper

        scraper_class = SCRAPER_CLASSES.get(scraper_key)
        if scraper_class is None:
            raise Exception("Scraper not found.")
        return scraper_class
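

# Usage sketch (illustrative, not part of the module): the URLs and user agent below are
# placeholder assumptions, and "bs" assumes the BeautifulSoupScraper entry in SCRAPER_CLASSES.
# Because this module uses relative imports, run it as a package module (python -m ...).
if __name__ == "__main__":
    example_urls = [
        "https://example.com/article",        # handled by the configured default scraper
        "https://arxiv.org/abs/2301.00001",   # routed to ArxivScraper
        "https://example.com/paper.pdf",      # routed to PyMuPDFScraper
    ]
    scraper = Scraper(example_urls, user_agent="Mozilla/5.0", scraper="bs")
    for result in scraper.run():
        print(result["url"], result["title"])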