Spaces:
Paused
Paused
| import requests | |
| import logging | |
| from typing import Iterator, List, Union | |
| from langchain_core.document_loaders import BaseLoader | |
| from langchain_core.documents import Document | |
| from open_webui.env import SRC_LOG_LEVELS | |
# Module-level logger named after this module; its level is taken from the "RAG" entry of SRC_LOG_LEVELS.
| log = logging.getLogger(__name__) | |
| log.setLevel(SRC_LOG_LEVELS["RAG"]) | |
class ExternalLoader(BaseLoader):
    """Load web pages by delegating content extraction to an external HTTP service.

    URLs are POSTed to ``external_url`` in batches as a JSON body of the form
    ``{"urls": [...]}`` with a bearer-token ``Authorization`` header. The
    decoded JSON response is iterated, and each item's ``page_content`` and
    ``metadata`` entries (read with ``.get``) become one ``Document``.
    """

    def __init__(
        self,
        web_paths: Union[str, List[str]],
        external_url: str,
        external_api_key: str,
        continue_on_failure: bool = True,
        *,
        timeout: Union[float, None] = None,
        **kwargs,
    ) -> None:
        """Store the target URLs and the connection settings.

        Args:
            web_paths: A single URL or a list of URLs to load.
            external_url: Endpoint that receives the batched POST requests.
            external_api_key: Token sent as ``Authorization: Bearer <key>``.
            continue_on_failure: When true, a failing batch is logged and
                skipped; when false, the exception propagates to the caller.
            timeout: Forwarded to ``requests.post`` as its ``timeout``
                argument. ``None`` (the default) means no timeout, matching
                the previous behaviour; pass a number of seconds to prevent
                an unresponsive service from blocking indefinitely.
            **kwargs: Accepted and ignored, so callers may pass a shared
                argument set without this constructor rejecting extras.
        """
        self.external_url = external_url
        self.external_api_key = external_api_key
        self.urls = web_paths if isinstance(web_paths, list) else [web_paths]
        self.continue_on_failure = continue_on_failure
        self.timeout = timeout

    def lazy_load(self) -> Iterator[Document]:
        """Yield one ``Document`` per item returned by the external service.

        The URLs are sent in batches of 20. Documents of a batch are yielded
        as the response is iterated, so a failure part-way through a batch
        does not retract documents already yielded from it.

        Raises:
            Exception: Whatever the request, status check, JSON decoding or
                item handling raised, but only when ``continue_on_failure``
                is false.
        """
        batch_size = 20
        # The headers are identical for every batch, so build them once.
        headers = {
            "User-Agent": "Open WebUI (https://github.com/open-webui/open-webui) RAG Bot",
            "Authorization": f"Bearer {self.external_api_key}",
        }

        for start in range(0, len(self.urls), batch_size):
            urls = self.urls[start : start + batch_size]
            try:
                response = requests.post(
                    self.external_url,
                    headers=headers,
                    json={"urls": urls},
                    timeout=self.timeout,
                )
                response.raise_for_status()
                for result in response.json():
                    yield Document(
                        page_content=result.get("page_content", ""),
                        metadata=result.get("metadata", {}),
                    )
            except Exception as e:
                if not self.continue_on_failure:
                    # Bare raise re-raises the active exception unchanged.
                    raise
                # Best-effort mode: report the failed batch and move on.
                log.error("Error extracting content from batch %s: %s", urls, e)