Spaces:
Paused
Paused
Update webscout.py
Browse files- webscout.py +51 -31
webscout.py
CHANGED
|
@@ -5,11 +5,14 @@ from datetime import datetime, timezone
|
|
| 5 |
from decimal import Decimal
|
| 6 |
from functools import cached_property
|
| 7 |
from itertools import cycle, islice
|
|
|
|
| 8 |
from threading import Event
|
| 9 |
from types import TracebackType
|
| 10 |
from typing import Dict, List, Optional, Tuple, Type, Union, cast
|
| 11 |
|
| 12 |
-
import pyreqwest_impersonate as pri
|
|
|
|
|
|
|
| 13 |
|
| 14 |
try:
|
| 15 |
from lxml.etree import _Element
|
|
@@ -26,29 +29,33 @@ from html import unescape
|
|
| 26 |
from math import atan2, cos, radians, sin, sqrt
|
| 27 |
from typing import Any, Dict, List, Union
|
| 28 |
from urllib.parse import unquote
|
| 29 |
-
import orjson
|
| 30 |
-
import requests
|
| 31 |
-
import base64
|
| 32 |
-
from typing import List, Dict, Union
|
| 33 |
-
import json
|
| 34 |
-
import requests
|
| 35 |
-
import base64
|
| 36 |
-
from typing import List, Dict, Union
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
REGEX_STRIP_TAGS = re.compile("<.*?>")
|
| 40 |
|
| 41 |
|
| 42 |
def json_dumps(obj: Any) -> str:
|
| 43 |
try:
|
| 44 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
except Exception as ex:
|
| 46 |
raise WebscoutE(f"{type(ex).__name__}: {ex}") from ex
|
| 47 |
|
| 48 |
|
| 49 |
def json_loads(obj: Union[str, bytes]) -> Any:
|
| 50 |
try:
|
| 51 |
-
return orjson.loads(obj)
|
| 52 |
except Exception as ex:
|
| 53 |
raise WebscoutE(f"{type(ex).__name__}: {ex}") from ex
|
| 54 |
|
|
@@ -101,8 +108,6 @@ def _calculate_distance(lat1: Decimal, lon1: Decimal, lat2: Decimal, lon2: Decim
|
|
| 101 |
c = 2 * atan2(sqrt(a), sqrt(1 - a))
|
| 102 |
return R * c
|
| 103 |
|
| 104 |
-
logger = logging.getLogger("webscout.WEBS")
|
| 105 |
-
|
| 106 |
class WebscoutE(Exception):
|
| 107 |
"""Base exception class for search."""
|
| 108 |
|
|
@@ -121,14 +126,26 @@ class AllProvidersFailure(Exception):
|
|
| 121 |
"""None of the providers generated response successfully"""
|
| 122 |
|
| 123 |
pass
|
|
|
|
|
|
|
|
|
|
| 124 |
class WEBS:
|
| 125 |
"""webscout class to get search results from duckduckgo.com."""
|
| 126 |
|
| 127 |
_executor: ThreadPoolExecutor = ThreadPoolExecutor()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
|
| 129 |
def __init__(
|
| 130 |
self,
|
| 131 |
-
headers: Optional[Dict[str, str]] =
|
| 132 |
proxy: Optional[str] = None,
|
| 133 |
proxies: Union[Dict[str, str], str, None] = None, # deprecated
|
| 134 |
timeout: Optional[int] = 10,
|
|
@@ -152,9 +169,9 @@ class WEBS:
|
|
| 152 |
headers=self.headers,
|
| 153 |
proxy=self.proxy,
|
| 154 |
timeout=timeout,
|
| 155 |
-
cookie_store=
|
| 156 |
referer=True,
|
| 157 |
-
impersonate=
|
| 158 |
follow_redirects=False,
|
| 159 |
verify=False,
|
| 160 |
)
|
|
@@ -208,13 +225,14 @@ class WEBS:
|
|
| 208 |
resp_content = self._get_url("POST", "https://duckduckgo.com", data={"q": keywords})
|
| 209 |
return _extract_vqd(resp_content, keywords)
|
| 210 |
|
| 211 |
-
def chat(self, keywords: str, model: str = "gpt-3.5") -> str:
|
| 212 |
"""Initiates a chat session with DuckDuckGo AI.
|
| 213 |
|
| 214 |
Args:
|
| 215 |
keywords (str): The initial message or question to send to the AI.
|
| 216 |
model (str): The model to use: "gpt-3.5", "claude-3-haiku", "llama-3-70b", "mixtral-8x7b".
|
| 217 |
Defaults to "gpt-3.5".
|
|
|
|
| 218 |
|
| 219 |
Returns:
|
| 220 |
str: The response from the AI.
|
|
@@ -237,18 +255,16 @@ class WEBS:
|
|
| 237 |
"messages": self._chat_messages,
|
| 238 |
}
|
| 239 |
resp = self.client.post(
|
| 240 |
-
"https://duckduckgo.com/duckchat/v1/chat",
|
|
|
|
|
|
|
|
|
|
| 241 |
)
|
| 242 |
self._chat_vqd = resp.headers.get("x-vqd-4", "")
|
| 243 |
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
if x:
|
| 248 |
-
j = json_loads(x)
|
| 249 |
-
message = j.get("message", "")
|
| 250 |
-
messages.append(message)
|
| 251 |
-
result = "".join(messages)
|
| 252 |
self._chat_messages.append({"role": "assistant", "content": result})
|
| 253 |
return result
|
| 254 |
|
|
@@ -435,7 +451,7 @@ class WEBS:
|
|
| 435 |
for e in elements:
|
| 436 |
if isinstance(e, _Element):
|
| 437 |
hrefxpath = e.xpath("./a/@href")
|
| 438 |
-
href = str(hrefxpath[0]) if isinstance(hrefxpath, List) else None
|
| 439 |
if (
|
| 440 |
href
|
| 441 |
and href not in cache
|
|
@@ -445,9 +461,9 @@ class WEBS:
|
|
| 445 |
):
|
| 446 |
cache.add(href)
|
| 447 |
titlexpath = e.xpath("./h2/a/text()")
|
| 448 |
-
title = str(titlexpath[0]) if isinstance(titlexpath, List) else ""
|
| 449 |
bodyxpath = e.xpath("./a//text()")
|
| 450 |
-
body = "".join(str(x) for x in bodyxpath) if isinstance(bodyxpath, List) else ""
|
| 451 |
result = {
|
| 452 |
"title": _normalize(title),
|
| 453 |
"href": _normalize_url(href),
|
|
@@ -537,10 +553,14 @@ class WEBS:
|
|
| 537 |
else:
|
| 538 |
cache.add(href)
|
| 539 |
titlexpath = e.xpath(".//a//text()")
|
| 540 |
-
title = str(titlexpath[0]) if isinstance(titlexpath, List) else ""
|
| 541 |
elif i == 2:
|
| 542 |
bodyxpath = e.xpath(".//td[@class='result-snippet']//text()")
|
| 543 |
-
body =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 544 |
if href:
|
| 545 |
result = {
|
| 546 |
"title": _normalize(title),
|
|
|
|
| 5 |
from decimal import Decimal
|
| 6 |
from functools import cached_property
|
| 7 |
from itertools import cycle, islice
|
| 8 |
+
from random import choice
|
| 9 |
from threading import Event
|
| 10 |
from types import TracebackType
|
| 11 |
from typing import Dict, List, Optional, Tuple, Type, Union, cast
|
| 12 |
|
| 13 |
+
import pyreqwest_impersonate as pri
|
| 14 |
+
|
| 15 |
+
|
| 16 |
|
| 17 |
try:
|
| 18 |
from lxml.etree import _Element
|
|
|
|
| 29 |
from math import atan2, cos, radians, sin, sqrt
|
| 30 |
from typing import Any, Dict, List, Union
|
| 31 |
from urllib.parse import unquote
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
+
from .exceptions import WebscoutE
|
| 34 |
+
|
| 35 |
+
try:
|
| 36 |
+
HAS_ORJSON = True
|
| 37 |
+
import orjson
|
| 38 |
+
except ImportError:
|
| 39 |
+
HAS_ORJSON = False
|
| 40 |
+
import json
|
| 41 |
|
| 42 |
REGEX_STRIP_TAGS = re.compile("<.*?>")
|
| 43 |
|
| 44 |
|
| 45 |
def json_dumps(obj: Any) -> str:
    """Serialize ``obj`` to a JSON string indented by two spaces.

    orjson is used when ``HAS_ORJSON`` is true; otherwise the standard
    ``json`` module is used with ``ensure_ascii=False``.

    Raises:
        WebscoutE: wraps any exception raised while serializing; the message
            is ``"<ExceptionType>: <original message>"``.
    """
    try:
        if HAS_ORJSON:
            # orjson returns bytes, so decode to honour the ``str`` return type.
            return orjson.dumps(obj, option=orjson.OPT_INDENT_2).decode()
        return json.dumps(obj, ensure_ascii=False, indent=2)
    except Exception as ex:
        raise WebscoutE(f"{type(ex).__name__}: {ex}") from ex
|
| 54 |
|
| 55 |
|
| 56 |
def json_loads(obj: Union[str, bytes]) -> Any:
    """Parse a JSON document given as ``str`` or ``bytes``.

    orjson is used when ``HAS_ORJSON`` is true; otherwise the standard
    ``json`` module is used.

    Raises:
        WebscoutE: wraps any exception raised while parsing; the message
            is ``"<ExceptionType>: <original message>"``.
    """
    try:
        # Resolve the backend inside the try so lookup errors are wrapped too.
        parse = orjson.loads if HAS_ORJSON else json.loads
        return parse(obj)
    except Exception as ex:
        raise WebscoutE(f"{type(ex).__name__}: {ex}") from ex
|
| 61 |
|
|
|
|
| 108 |
c = 2 * atan2(sqrt(a), sqrt(1 - a))
|
| 109 |
return R * c
|
| 110 |
|
|
|
|
|
|
|
| 111 |
class WebscoutE(Exception):
|
| 112 |
"""Base exception class for search."""
|
| 113 |
|
|
|
|
| 126 |
"""None of the providers generated response successfully"""
|
| 127 |
|
| 128 |
pass
|
| 129 |
+
logger = logging.getLogger("webscout.WEBS")
|
| 130 |
+
|
| 131 |
+
|
| 132 |
class WEBS:
|
| 133 |
"""webscout class to get search results from duckduckgo.com."""
|
| 134 |
|
| 135 |
_executor: ThreadPoolExecutor = ThreadPoolExecutor()
|
| 136 |
+
_impersonates = (
|
| 137 |
+
"chrome_99", "chrome_100", "chrome_101", "chrome_104", "chrome_105", "chrome_106", "chrome_108",
|
| 138 |
+
"chrome_107", "chrome_109", "chrome_114", "chrome_116", "chrome_117", "chrome_118", "chrome_119",
|
| 139 |
+
"chrome_120", #"chrome_123", "chrome_124", "chrome_126",
|
| 140 |
+
"safari_ios_16.5", "safari_ios_17.2", "safari_ios_17.4.1", "safari_15.3", "safari_15.5",
|
| 141 |
+
"safari_15.6.1", "safari_16", "safari_16.5", "safari_17.2.1", "safari_17.4.1", "safari_17.5",
|
| 142 |
+
#"okhttp_3.9", "okhttp_3.11", "okhttp_3.13", "okhttp_3.14", "okhttp_4.9", "okhttp_4.10", "okhttp_5",
|
| 143 |
+
"edge_99", "edge_101", "edge_122",
|
| 144 |
+
) # fmt: skip
|
| 145 |
|
| 146 |
def __init__(
|
| 147 |
self,
|
| 148 |
+
headers: Optional[Dict[str, str]] = None,
|
| 149 |
proxy: Optional[str] = None,
|
| 150 |
proxies: Union[Dict[str, str], str, None] = None, # deprecated
|
| 151 |
timeout: Optional[int] = 10,
|
|
|
|
| 169 |
headers=self.headers,
|
| 170 |
proxy=self.proxy,
|
| 171 |
timeout=timeout,
|
| 172 |
+
cookie_store=True,
|
| 173 |
referer=True,
|
| 174 |
+
impersonate=choice(self._impersonates),
|
| 175 |
follow_redirects=False,
|
| 176 |
verify=False,
|
| 177 |
)
|
|
|
|
| 225 |
resp_content = self._get_url("POST", "https://duckduckgo.com", data={"q": keywords})
|
| 226 |
return _extract_vqd(resp_content, keywords)
|
| 227 |
|
| 228 |
+
def chat(self, keywords: str, model: str = "gpt-3.5", timeout: int = 20) -> str:
|
| 229 |
"""Initiates a chat session with DuckDuckGo AI.
|
| 230 |
|
| 231 |
Args:
|
| 232 |
keywords (str): The initial message or question to send to the AI.
|
| 233 |
model (str): The model to use: "gpt-3.5", "claude-3-haiku", "llama-3-70b", "mixtral-8x7b".
|
| 234 |
Defaults to "gpt-3.5".
|
| 235 |
+
timeout (int): Timeout value for the HTTP client. Defaults to 20.
|
| 236 |
|
| 237 |
Returns:
|
| 238 |
str: The response from the AI.
|
|
|
|
| 255 |
"messages": self._chat_messages,
|
| 256 |
}
|
| 257 |
resp = self.client.post(
|
| 258 |
+
"https://duckduckgo.com/duckchat/v1/chat",
|
| 259 |
+
headers={"x-vqd-4": self._chat_vqd},
|
| 260 |
+
json=json_data,
|
| 261 |
+
timeout=timeout,
|
| 262 |
)
|
| 263 |
self._chat_vqd = resp.headers.get("x-vqd-4", "")
|
| 264 |
|
| 265 |
+
data = ",".join(x for line in resp.text.rstrip("[DONE]\n").split("data:") if (x := line.strip()))
|
| 266 |
+
result = "".join(x.get("message", "") for x in json_loads("[" + data + "]"))
|
| 267 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
self._chat_messages.append({"role": "assistant", "content": result})
|
| 269 |
return result
|
| 270 |
|
|
|
|
| 451 |
for e in elements:
|
| 452 |
if isinstance(e, _Element):
|
| 453 |
hrefxpath = e.xpath("./a/@href")
|
| 454 |
+
href = str(hrefxpath[0]) if hrefxpath and isinstance(hrefxpath, List) else None
|
| 455 |
if (
|
| 456 |
href
|
| 457 |
and href not in cache
|
|
|
|
| 461 |
):
|
| 462 |
cache.add(href)
|
| 463 |
titlexpath = e.xpath("./h2/a/text()")
|
| 464 |
+
title = str(titlexpath[0]) if titlexpath and isinstance(titlexpath, List) else ""
|
| 465 |
bodyxpath = e.xpath("./a//text()")
|
| 466 |
+
body = "".join(str(x) for x in bodyxpath) if bodyxpath and isinstance(bodyxpath, List) else ""
|
| 467 |
result = {
|
| 468 |
"title": _normalize(title),
|
| 469 |
"href": _normalize_url(href),
|
|
|
|
| 553 |
else:
|
| 554 |
cache.add(href)
|
| 555 |
titlexpath = e.xpath(".//a//text()")
|
| 556 |
+
title = str(titlexpath[0]) if titlexpath and isinstance(titlexpath, List) else ""
|
| 557 |
elif i == 2:
|
| 558 |
bodyxpath = e.xpath(".//td[@class='result-snippet']//text()")
|
| 559 |
+
body = (
|
| 560 |
+
"".join(str(x) for x in bodyxpath).strip()
|
| 561 |
+
if bodyxpath and isinstance(bodyxpath, List)
|
| 562 |
+
else ""
|
| 563 |
+
)
|
| 564 |
if href:
|
| 565 |
result = {
|
| 566 |
"title": _normalize(title),
|