Spaces:

Shreyas094
/

GPT-Researcher

Paused

Upload 528 files

372531f verified 10 months ago

1.61 kB

	from bs4 import BeautifulSoup
	from urllib.parse import urljoin
	import requests
	from ..utils import get_relevant_images, extract_title

	class WebBaseLoaderScraper:
	    """Scrape a page's text, image URLs and title.

	    The page text is read through LangChain's ``WebBaseLoader``. The image
	    URLs and the title come from a second request issued with
	    ``self.session`` and parsed with BeautifulSoup.
	    """

	    def __init__(self, link, session=None, *, timeout=10):
	        """Store the target link, the HTTP session and the request timeout.

	        Args:
	            link: URL of the page to scrape.
	            session: Object exposing a requests-style ``get`` method. A new
	                ``requests.Session`` is created when it is omitted or falsy.
	            timeout: Seconds to wait on each HTTP request before giving up.
	        """
	        self.link = link
	        self.session = session or requests.Session()
	        self.timeout = timeout

	    def scrape(self) -> tuple:
	        """Load the page text, then collect its image URLs and title.

	        Returns:
	            A ``(content, image_urls, title)`` tuple where ``content`` is the
	            concatenated ``page_content`` of every document returned by the
	            loader, ``image_urls`` is the result of ``get_relevant_images``
	            and ``title`` is the result of ``extract_title``.

	            If loading the text fails, ``("", [], "")`` is returned. If only
	            the follow-up request for images and title fails, the text that
	            was already loaded is kept and ``(content, [], "")`` is returned.
	            In both cases the error is printed rather than raised.
	        """
	        try:
	            # Imported lazily so the dependency is only needed when this
	            # scraper is actually used.
	            from langchain_community.document_loaders import WebBaseLoader

	            loader = WebBaseLoader(self.link)
	            # NOTE(review): verify=False disables TLS certificate checking
	            # for the loader's request. Kept as in the original; confirm this
	            # is intentional before relying on it for untrusted hosts.
	            loader.requests_kwargs = {"verify": False, "timeout": self.timeout}
	            content = "".join(doc.page_content for doc in loader.load())
	        except Exception as e:
	            print("Error! : " + str(e))
	            return "", [], ""

	        try:
	            # This request still verifies certificates, so it can fail on a
	            # site the loader above accepted. Handle it separately so the
	            # text that was already loaded is not thrown away.
	            response = self.session.get(self.link, timeout=self.timeout)
	            soup = BeautifulSoup(response.content, 'html.parser')
	            image_urls = get_relevant_images(soup, self.link)

	            # Extract the title using the utility function
	            title = extract_title(soup)
	        except Exception as e:
	            print("Error! : " + str(e))
	            return content, [], ""

	        return content, image_urls, title