from concurrent.futures import ThreadPoolExecutor
from functools import partial
from colorama import Fore, init
import requests
import subprocess
import sys
import importlib.util
from . import (
    ArxivScraper,
    BeautifulSoupScraper,
    PyMuPDFScraper,
    WebBaseLoaderScraper,
    BrowserScraper,
    TavilyExtract
)


class Scraper:
    """
    Scraper class to extract the content from the links.
    """

    def __init__(self, urls, user_agent, scraper):
        """
        Initialize the Scraper class.

        Args:
            urls: List of URLs to scrape.
            user_agent: User-Agent string sent with every HTTP request.
            scraper: Key of the default scraper to use for regular web pages.
        """
        self.urls = urls
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": user_agent})
        self.scraper = scraper
        if self.scraper == "tavily_extract":
            self._check_pkg(self.scraper)

    def run(self):
        """
        Extracts the content from the links.

        Returns:
            A list of dicts with "url", "raw_content", "image_urls" and "title",
            one per link that produced usable content.
        """
        partial_extract = partial(self.extract_data_from_url, session=self.session)
        with ThreadPoolExecutor(max_workers=20) as executor:
            contents = executor.map(partial_extract, self.urls)
            res = [content for content in contents if content["raw_content"] is not None]
        return res

    def _check_pkg(self, scraper_name: str) -> None:
        """
        Checks and ensures required Python packages are available for scrapers that need
        dependencies beyond requirements.txt. When adding a new scraper to the repo, update
        `pkg_map` with its required information and call `_check_pkg()` during initialization.
        """
        pkg_map = {
            "tavily_extract": {"package_installation_name": "tavily-python",
                               "import_name": "tavily"},
        }
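        # Hypothetical example: a scraper backed by the `newspaper3k` package would be
        # registered with an entry such as
        #   "newspaper": {"package_installation_name": "newspaper3k",
        #                 "import_name": "newspaper"},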
        pkg = pkg_map[scraper_name]
        if not importlib.util.find_spec(pkg["import_name"]):
            pkg_inst_name = pkg["package_installation_name"]
            init(autoreset=True)
            print(Fore.YELLOW + f"{pkg_inst_name} not found. Attempting to install...")
            try:
                subprocess.check_call([sys.executable, "-m", "pip", "install", pkg_inst_name])
                print(Fore.GREEN + f"{pkg_inst_name} installed successfully.")
            except subprocess.CalledProcessError:
                raise ImportError(
                    Fore.RED + f"Unable to install {pkg_inst_name}. Please install manually with "
                    f"`pip install -U {pkg_inst_name}`"
                )

    def extract_data_from_url(self, link, session):
        """
        Extracts the data from the link.
        """
        try:
            scraper_class = self.get_scraper(link)
            scraper = scraper_class(link, session)
            content, image_urls, title = scraper.scrape()

            if len(content) < 100:
                return {"url": link, "raw_content": None, "image_urls": [], "title": ""}
            return {"url": link, "raw_content": content, "image_urls": image_urls, "title": title}
        except Exception:
            return {"url": link, "raw_content": None, "image_urls": [], "title": ""}

    def get_scraper(self, link):
        """
        Determines the appropriate scraper class for the provided link, falling back to the
        default scraper configured at initialization if no special case matches.

        Args:
            link: A URL pointing to a web page or a PDF file. The kind of content the link
                points to determines which scraper class is used to extract data from it.

        Returns:
            The scraper class selected from the predefined mappings in the `SCRAPER_CLASSES`
            dictionary. If the link ends with ".pdf", the `PyMuPDFScraper` class is returned;
            if the link contains "arxiv.org", the `ArxivScraper` class is returned; otherwise
            the scraper configured at initialization is used.
        """
        SCRAPER_CLASSES = {
            "pdf": PyMuPDFScraper,
            "arxiv": ArxivScraper,
            "bs": BeautifulSoupScraper,
            "web_base_loader": WebBaseLoaderScraper,
            "browser": BrowserScraper,
            "tavily_extract": TavilyExtract
        }
        if link.endswith(".pdf"):
            scraper_key = "pdf"
        elif "arxiv.org" in link:
            scraper_key = "arxiv"
        else:
            scraper_key = self.scraper

        scraper_class = SCRAPER_CLASSES.get(scraper_key)
        if scraper_class is None:
            raise Exception("Scraper not found.")
        return scraper_class
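

# Usage sketch (illustrative, not part of the module): the URLs and user agent below are
# placeholder assumptions, and "bs" assumes the BeautifulSoupScraper entry in SCRAPER_CLASSES.
# Because this module uses relative imports, run it as a package module (python -m ...).
if __name__ == "__main__":
    example_urls = [
        "https://example.com/article",        # handled by the configured default scraper
        "https://arxiv.org/abs/2301.00001",   # routed to ArxivScraper
        "https://example.com/paper.pdf",      # routed to PyMuPDFScraper
    ]
    scraper = Scraper(example_urls, user_agent="Mozilla/5.0", scraper="bs")
    for result in scraper.run():
        print(result["url"], result["title"])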