lsottani's picture
Upload folder using huggingface_hub
d9f69e5 verified
"""
cssselect.parser
================
Tokenizer, parser and parsed objects for CSS selectors.
:copyright: (c) 2007-2012 Ian Bicking and contributors.
See AUTHORS for more details.
:license: BSD, see LICENSE for more details.
"""
from __future__ import annotations
import operator
import re
import sys
from typing import TYPE_CHECKING, Literal, Optional, Protocol, Union, cast, overload
if TYPE_CHECKING:
from collections.abc import Iterable, Iterator, Sequence
# typing.Self requires Python 3.11
from typing_extensions import Self
def ascii_lower(string: str) -> str:
    """Return *string* with only its ASCII letters lower-cased.

    The text is lower-cased in its UTF-8 encoded form, where
    ``bytes.lower`` only touches ``A``-``Z``; every non-ASCII character
    is therefore left exactly as it was.
    """
    encoded = string.encode("utf8")
    lowered = encoded.lower()
    return lowered.decode("utf8")
class SelectorError(Exception):
    """Common parent for :class:`SelectorSyntaxError` and
    :class:`ExpressionError`.

    You can just use ``except SelectorError:`` when calling
    :meth:`~GenericTranslator.css_to_xpath` and handle both exception types.
    """
class SelectorSyntaxError(SelectorError, SyntaxError):
    """Parsing a selector that does not match the grammar.

    Derives from both :class:`SelectorError` and the built-in
    :exc:`SyntaxError`, so a handler for either base class catches it.
    """
#### Parsed objects
# Union of every node class that can appear in a parsed selector tree.
# The members are written as strings (forward references) because the
# classes themselves are defined further down in this module.
Tree = Union[
    "Element",
    "Hash",
    "Class",
    "Function",
    "Pseudo",
    "Attrib",
    "Negation",
    "Relation",
    "Matching",
    "SpecificityAdjustment",
    "CombinedSelector",
]
# A pseudo-element is either a functional one (``::name(arguments)``) or
# simply its identifier as a string.
PseudoElement = Union["FunctionalPseudoElement", str]
class Selector:
    """
    One parsed selector: a tree of parsed objects plus an optional
    pseudo-element.

    :meth:`~GenericTranslator.selector_to_xpath` accepts this object,
    but ignores :attr:`pseudo_element`. It is the user’s responsibility
    to account for pseudo-elements and reject selectors with unknown
    or unsupported pseudo-elements.
    """

    def __init__(self, tree: Tree, pseudo_element: PseudoElement | None = None) -> None:
        self.parsed_tree = tree
        # Plain (string) pseudo-element names are normalised to ASCII
        # lower case; functional pseudo-elements are stored untouched.
        if pseudo_element is not None:
            if not isinstance(pseudo_element, FunctionalPseudoElement):
                pseudo_element = ascii_lower(pseudo_element)
        #: A :class:`FunctionalPseudoElement`,
        #: or the identifier for the pseudo-element as a string,
        #: or ``None``.
        #:
        #: +-------------------------+----------------+--------------------------------+
        #: |                         | Selector       | Pseudo-element                 |
        #: +=========================+================+================================+
        #: | CSS3 syntax             | ``a::before``  | ``'before'``                   |
        #: +-------------------------+----------------+--------------------------------+
        #: | Older syntax            | ``a:before``   | ``'before'``                   |
        #: +-------------------------+----------------+--------------------------------+
        #: | From the Lists3_ draft, | ``li::marker`` | ``'marker'``                   |
        #: | not in Selectors3       |                |                                |
        #: +-------------------------+----------------+--------------------------------+
        #: | Invalid pseudo-class    | ``li:marker``  | ``None``                       |
        #: +-------------------------+----------------+--------------------------------+
        #: | Functional              | ``a::foo(2)``  | ``FunctionalPseudoElement(…)`` |
        #: +-------------------------+----------------+--------------------------------+
        #:
        #: .. _Lists3: http://www.w3.org/TR/2011/WD-css3-lists-20110524/#marker-pseudoelement
        self.pseudo_element = pseudo_element

    def __repr__(self) -> str:
        pseudo = self.pseudo_element
        if isinstance(pseudo, FunctionalPseudoElement):
            suffix = repr(pseudo)
        else:
            suffix = f"::{pseudo}" if pseudo else ""
        return f"{self.__class__.__name__}[{self.parsed_tree!r}{suffix}]"

    def canonical(self) -> str:
        """Return a CSS representation for this selector (a string)"""
        pseudo = self.pseudo_element
        if isinstance(pseudo, FunctionalPseudoElement):
            suffix = f"::{pseudo.canonical()}"
        else:
            suffix = f"::{pseudo}" if pseudo else ""
        text = f"{self.parsed_tree.canonical()}{suffix}"
        # Drop the implicit leading universal selector, but keep a bare "*".
        return text.lstrip("*") if len(text) > 1 else text

    def specificity(self) -> tuple[int, int, int]:
        """Return the specificity_ of this selector as a tuple of 3 integers.

        A pseudo-element, when present, adds one to the last component.

        .. _specificity: http://www.w3.org/TR/selectors/#specificity
        """
        ids, classes, types = self.parsed_tree.specificity()
        if self.pseudo_element:
            types += 1
        return ids, classes, types
class Class:
    """
    Represents selector.class_name
    """

    def __init__(self, selector: Tree, class_name: str) -> None:
        self.selector = selector
        self.class_name = class_name

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return f"{cls_name}[{self.selector!r}.{self.class_name}]"

    def canonical(self) -> str:
        """Return the CSS text ``<selector>.<class_name>``."""
        prefix = self.selector.canonical()
        return f"{prefix}.{self.class_name}"

    def specificity(self) -> tuple[int, int, int]:
        """Return the wrapped selector's specificity with one more class-level point."""
        ids, classes, types = self.selector.specificity()
        return ids, classes + 1, types
class FunctionalPseudoElement:
    """
    Represents selector::name(arguments)

    .. attribute:: name

        The name (identifier) of the pseudo-element, as a string.

    .. attribute:: arguments

        The arguments of the pseudo-element, as a list of tokens.

        **Note:** tokens are not part of the public API,
        and may change between cssselect versions.
        Use at your own risks.
    """

    def __init__(self, name: str, arguments: Sequence[Token]):
        # The name is stored lower-cased (ASCII range only).
        self.name = ascii_lower(name)
        self.arguments = arguments

    def __repr__(self) -> str:
        values = [tok.value for tok in self.arguments]
        cls_name = self.__class__.__name__
        return f"{cls_name}[::{self.name}({values!r})]"

    def argument_types(self) -> list[str]:
        """Return the token type of each argument, in order."""
        return [tok.type for tok in self.arguments]

    def canonical(self) -> str:
        """Return the CSS text ``name(args)`` with the arguments concatenated."""
        pieces = [tok.css() for tok in self.arguments]
        return f"{self.name}({''.join(pieces)})"
class Function:
    """
    Represents selector:name(expr)
    """

    def __init__(self, selector: Tree, name: str, arguments: Sequence[Token]) -> None:
        self.selector = selector
        # The function name is stored lower-cased (ASCII range only).
        self.name = ascii_lower(name)
        self.arguments = arguments

    def __repr__(self) -> str:
        values = [tok.value for tok in self.arguments]
        cls_name = self.__class__.__name__
        return f"{cls_name}[{self.selector!r}:{self.name}({values!r})]"

    def argument_types(self) -> list[str]:
        """Return the token type of each argument, in order."""
        return [tok.type for tok in self.arguments]

    def canonical(self) -> str:
        """Return the CSS text ``<selector>:name(args)``."""
        pieces = [tok.css() for tok in self.arguments]
        joined = "".join(pieces)
        return f"{self.selector.canonical()}:{self.name}({joined})"

    def specificity(self) -> tuple[int, int, int]:
        """Return the wrapped selector's specificity with one more class-level point."""
        ids, classes, types = self.selector.specificity()
        return ids, classes + 1, types
class Pseudo:
    """
    Represents selector:ident
    """

    def __init__(self, selector: Tree, ident: str) -> None:
        self.selector = selector
        # The pseudo-class name is stored lower-cased (ASCII range only).
        self.ident = ascii_lower(ident)

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return f"{cls_name}[{self.selector!r}:{self.ident}]"

    def canonical(self) -> str:
        """Return the CSS text ``<selector>:<ident>``."""
        prefix = self.selector.canonical()
        return f"{prefix}:{self.ident}"

    def specificity(self) -> tuple[int, int, int]:
        """Return the wrapped selector's specificity with one more class-level point."""
        ids, classes, types = self.selector.specificity()
        return ids, classes + 1, types
class Negation:
    """
    Represents selector:not(subselector)
    """

    def __init__(self, selector: Tree, subselector: Tree) -> None:
        self.selector = selector
        self.subselector = subselector

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return f"{cls_name}[{self.selector!r}:not({self.subselector!r})]"

    def canonical(self) -> str:
        """Return the CSS text ``<selector>:not(<subselector>)``."""
        inner = self.subselector.canonical()
        # Drop the implicit leading universal selector, but keep a bare "*".
        if len(inner) > 1:
            inner = inner.lstrip("*")
        outer = self.selector.canonical()
        return f"{outer}:not({inner})"

    def specificity(self) -> tuple[int, int, int]:
        """Return the component-wise sum of both selectors' specificities."""
        outer_ids, outer_classes, outer_types = self.selector.specificity()
        inner_ids, inner_classes, inner_types = self.subselector.specificity()
        return (
            outer_ids + inner_ids,
            outer_classes + inner_classes,
            outer_types + inner_types,
        )
class Relation:
    """
    Represents selector:has(subselector)

    ``combinator`` is the token of the relative combinator that leads the
    argument (its value is ``" "`` when none was written), and
    ``subselector`` is the parsed argument. ``subselector`` may also be a
    sequence of selectors, in which case :meth:`canonical` uses its first
    item and :meth:`specificity` its last.
    """

    def __init__(self, selector: Tree, combinator: Token, subselector: Selector):
        self.selector = selector
        self.combinator = combinator
        self.subselector = subselector

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}[{self.selector!r}:has({self.subselector!r})]"

    def _subselector_item(self, index: int):
        """Return ``subselector[index]``, or ``subselector`` itself when it
        is not subscriptable.

        Only the subscription is guarded, so a ``TypeError`` raised later by
        the selected object's own methods is not silently swallowed.
        """
        try:
            return self.subselector[index]  # type: ignore[index]
        except TypeError:
            return self.subselector

    def canonical(self) -> str:
        """Return the CSS text ``<selector>:has(<combinator> <subselector>)``.

        An explicit leading combinator (e.g. ``>``) is part of the output;
        the implicit descendant combinator (``" "``) is not written.
        """
        subsel = self._subselector_item(0).canonical()
        # Drop the implicit leading universal selector, but keep a bare "*".
        if len(subsel) > 1:
            subsel = subsel.lstrip("*")
        # Accept either a token (has ``.value``) or a plain string.
        combinator = getattr(self.combinator, "value", self.combinator)
        if combinator is not None and str(combinator).strip():
            subsel = f"{combinator} {subsel}"
        return f"{self.selector.canonical()}:has({subsel})"

    def specificity(self) -> tuple[int, int, int]:
        """Return the component-wise sum of both selectors' specificities."""
        a1, b1, c1 = self.selector.specificity()
        a2, b2, c2 = self._subselector_item(-1).specificity()
        return a1 + a2, b1 + b2, c1 + c2
class Matching:
    """
    Represents selector:is(selector_list)
    """

    def __init__(self, selector: Tree, selector_list: Iterable[Tree]):
        self.selector = selector
        self.selector_list = selector_list

    def __repr__(self) -> str:
        args_str = ", ".join(repr(s) for s in self.selector_list)
        return f"{self.__class__.__name__}[{self.selector!r}:is({args_str})]"

    def canonical(self) -> str:
        """Return the CSS text ``<selector>:is(<arg>, <arg>, ...)``."""
        selector_arguments = []
        for s in self.selector_list:
            selarg = s.canonical()
            # Drop the implicit leading universal selector, but keep a
            # bare "*" so the argument never becomes empty.
            if len(selarg) > 1:
                selarg = selarg.lstrip("*")
            selector_arguments.append(selarg)
        args_str = ", ".join(selector_arguments)
        return f"{self.selector.canonical()}:is({args_str})"

    def specificity(self) -> tuple[int, int, int]:
        """Return the specificity of ``selector`` plus that of the most
        specific argument.

        ``:is()`` counts as its most specific argument; that contribution
        is added to the selector it is attached to. An empty argument list
        contributes ``(0, 0, 0)``.
        """
        a1, b1, c1 = self.selector.specificity()
        a2, b2, c2 = max(
            (x.specificity() for x in self.selector_list), default=(0, 0, 0)
        )
        return a1 + a2, b1 + b2, c1 + c2
class SpecificityAdjustment:
    """
    Represents selector:where(selector_list)

    Same as selector:is(selector_list), but the pseudo-class itself always
    contributes a specificity of 0.
    """

    def __init__(self, selector: Tree, selector_list: list[Tree]):
        self.selector = selector
        self.selector_list = selector_list

    def __repr__(self) -> str:
        args_str = ", ".join(repr(s) for s in self.selector_list)
        return f"{self.__class__.__name__}[{self.selector!r}:where({args_str})]"

    def canonical(self) -> str:
        """Return the CSS text ``<selector>:where(<arg>, <arg>, ...)``."""
        selector_arguments = []
        for s in self.selector_list:
            selarg = s.canonical()
            # Drop the implicit leading universal selector, but keep a
            # bare "*" so the argument never becomes empty.
            if len(selarg) > 1:
                selarg = selarg.lstrip("*")
            selector_arguments.append(selarg)
        args_str = ", ".join(selector_arguments)
        return f"{self.selector.canonical()}:where({args_str})"

    def specificity(self) -> tuple[int, int, int]:
        """Return the specificity of ``selector`` alone.

        The ``:where()`` arguments add nothing, but the selector the
        pseudo-class is attached to still counts.
        """
        a, b, c = self.selector.specificity()
        return a, b, c
class Attrib:
    """
    Represents selector[namespace|attrib operator value]

    For the bare existence test (``[attrib]``) the operator is the string
    ``"exists"`` and the value is ``None``.
    """

    @overload
    def __init__(
        self,
        selector: Tree,
        namespace: str | None,
        attrib: str,
        operator: Literal["exists"],
        value: None,
    ) -> None: ...

    @overload
    def __init__(
        self,
        selector: Tree,
        namespace: str | None,
        attrib: str,
        operator: str,
        value: Token,
    ) -> None: ...

    def __init__(
        self,
        selector: Tree,
        namespace: str | None,
        attrib: str,
        operator: str,
        value: Token | None,
    ) -> None:
        self.selector = selector
        self.namespace = namespace
        self.attrib = attrib
        self.operator = operator
        self.value = value

    def _qualified_name(self) -> str:
        """Return ``namespace|attrib``, or just ``attrib`` without a namespace."""
        if self.namespace:
            return f"{self.namespace}|{self.attrib}"
        return self.attrib

    def __repr__(self) -> str:
        name = self._qualified_name()
        cls_name = self.__class__.__name__
        if self.operator == "exists":
            return f"{cls_name}[{self.selector!r}[{name}]]"
        assert self.value is not None
        return f"{cls_name}[{self.selector!r}[{name} {self.operator} {self.value.value!r}]]"

    def canonical(self) -> str:
        """Return the CSS text ``<selector>[<attribute test>]``."""
        name = self._qualified_name()
        if self.operator == "exists":
            inner = name
        else:
            assert self.value is not None
            inner = f"{name}{self.operator}{self.value.css()}"
        return f"{self.selector.canonical()}[{inner}]"

    def specificity(self) -> tuple[int, int, int]:
        """Return the wrapped selector's specificity with one more class-level point."""
        ids, classes, types = self.selector.specificity()
        return ids, classes + 1, types
class Element:
    """
    Represents namespace|element

    `None` is for the universal selector '*'
    """

    def __init__(
        self, namespace: str | None = None, element: str | None = None
    ) -> None:
        self.namespace = namespace
        self.element = element

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return f"{cls_name}[{self.canonical()}]"

    def canonical(self) -> str:
        """Return ``namespace|element``, writing ``*`` for a missing element."""
        name = self.element or "*"
        return f"{self.namespace}|{name}" if self.namespace else name

    def specificity(self) -> tuple[int, int, int]:
        """Return ``(0, 0, 1)`` for a named element, ``(0, 0, 0)`` otherwise."""
        return (0, 0, 1) if self.element else (0, 0, 0)
class Hash:
    """
    Represents selector#id
    """

    def __init__(self, selector: Tree, id: str) -> None:
        self.selector = selector
        self.id = id

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return f"{cls_name}[{self.selector!r}#{self.id}]"

    def canonical(self) -> str:
        """Return the CSS text ``<selector>#<id>``."""
        prefix = self.selector.canonical()
        return f"{prefix}#{self.id}"

    def specificity(self) -> tuple[int, int, int]:
        """Return the wrapped selector's specificity with one more ID-level point."""
        ids, classes, types = self.selector.specificity()
        return ids + 1, classes, types
class CombinedSelector:
    """
    Represents two selectors joined by a combinator:
    ``selector <combinator> subselector``.

    The descendant combinator is stored as a single space.
    """

    def __init__(self, selector: Tree, combinator: str, subselector: Tree) -> None:
        assert selector is not None
        self.selector = selector
        self.combinator = combinator
        self.subselector = subselector

    def __repr__(self) -> str:
        # A lone space would be invisible in the output, so the descendant
        # combinator is shown as a readable marker instead.
        shown = self.combinator
        if shown == " ":
            shown = "<followed>"
        cls_name = self.__class__.__name__
        return f"{cls_name}[{self.selector!r} {shown} {self.subselector!r}]"

    def canonical(self) -> str:
        """Return the CSS text ``<selector> <combinator> <subselector>``."""
        right = self.subselector.canonical()
        # Drop the implicit leading universal selector, but keep a bare "*".
        if len(right) > 1:
            right = right.lstrip("*")
        left = self.selector.canonical()
        return f"{left} {self.combinator} {right}"

    def specificity(self) -> tuple[int, int, int]:
        """Return the component-wise sum of both selectors' specificities."""
        left_ids, left_classes, left_types = self.selector.specificity()
        right_ids, right_classes, right_types = self.subselector.specificity()
        return (
            left_ids + right_ids,
            left_classes + right_classes,
            left_types + right_types,
        )
#### Parser
# Patterns used by the fast path in parse(). Each one is anchored at both
# ends, so it only matches when the whole input (apart from surrounding
# whitespace) has that simple shape.
# foo  -- group 1 is the element name
_el_re = re.compile(r"^[ \t\r\n\f]*([a-zA-Z]+)[ \t\r\n\f]*$")
# foo#bar or #bar  -- group 1 is the (possibly empty) element name, group 2 the id
_id_re = re.compile(r"^[ \t\r\n\f]*([a-zA-Z]*)#([a-zA-Z0-9_-]+)[ \t\r\n\f]*$")
# foo.bar or .bar  -- group 1 is the (possibly empty) element name, group 2 the class
_class_re = re.compile(
    r"^[ \t\r\n\f]*([a-zA-Z]*)\.([a-zA-Z][a-zA-Z0-9_-]*)[ \t\r\n\f]*$"
)
def parse(css: str) -> list[Selector]:
    """Parse a CSS *group of selectors*.

    If you don't care about pseudo-elements or selector specificity,
    you can skip this and use :meth:`~GenericTranslator.css_to_xpath`.

    :param css:
        A *group of selectors* as a string.
    :raises:
        :class:`SelectorSyntaxError` on invalid selectors.
    :returns:
        A list of parsed :class:`Selector` objects, one for each
        selector in the comma-separated group.

    """
    # Fast paths: a bare element name, "tag#id" / "#id" and
    # "tag.class" / ".class" are recognised with one regular expression
    # each, bypassing the tokenizer.
    element_only = _el_re.match(css)
    if element_only:
        return [Selector(Element(element=element_only.group(1)))]

    with_id = _id_re.match(css)
    if with_id is not None:
        tag, ident = with_id.groups()
        return [Selector(Hash(Element(element=tag or None), ident))]

    with_class = _class_re.match(css)
    if with_class is not None:
        tag, class_name = with_class.groups()
        return [Selector(Class(Element(element=tag or None), class_name))]

    # General case: tokenize, then run the recursive-descent parser.
    stream = TokenStream(tokenize(css))
    stream.source = css
    return list(parse_selector_group(stream))
def parse_selector_group(stream: TokenStream) -> Iterator[Selector]:
    """Yield one :class:`Selector` per comma-separated selector in *stream*."""
    stream.skip_whitespace()
    while True:
        tree, pseudo_element = parse_selector(stream)
        yield Selector(tree, pseudo_element)
        # Anything other than a comma ends the group.
        if stream.peek() != ("DELIM", ","):
            return
        stream.next()
        stream.skip_whitespace()
def parse_selector(stream: TokenStream) -> tuple[Tree, PseudoElement | None]:
    """Parse one selector: simple selectors chained by combinators.

    Parsing stops at the end of input or at a comma. Returns the parsed
    tree and the trailing pseudo-element, if any.

    :raises:
        :class:`SelectorSyntaxError` if a pseudo-element is followed by
        more selector content.

    """
    tree, pseudo_element = parse_simple_selector(stream)
    while True:
        stream.skip_whitespace()
        upcoming = stream.peek()
        if upcoming == ("EOF", None) or upcoming == ("DELIM", ","):
            return tree, pseudo_element
        if pseudo_element:
            raise SelectorSyntaxError(
                f"Got pseudo-element ::{pseudo_element} not at the end of a selector"
            )
        if not upcoming.is_delim("+", ">", "~"):
            # By exclusion, the last parse_simple_selector() ended
            # at whitespace: this is the descendant combinator.
            combinator = " "
        else:
            # An explicit combinator
            combinator = cast(str, stream.next().value)
            stream.skip_whitespace()
        right_hand, pseudo_element = parse_simple_selector(stream)
        tree = CombinedSelector(tree, combinator, right_hand)
def parse_simple_selector(
    stream: TokenStream, inside_negation: bool = False
) -> tuple[Tree, PseudoElement | None]:
    """Parse one sequence of simple selectors (no combinators).

    Reads an optional ``namespace|element`` (or ``*``) prefix followed by
    any number of ``#id``, ``.class``, ``[attrib]``, ``:pseudo`` and
    ``::pseudo-element`` parts.

    :param stream:
        The token stream to consume from.
    :param inside_negation:
        When true, a closing ``)`` also terminates the sequence. It is set
        for the arguments of ``:not()``, ``:is()``/``:matches()`` and
        ``:where()``.
    :raises:
        :class:`SelectorSyntaxError` on input that does not match the grammar.
    :returns:
        The parsed tree and the pseudo-element found, or ``None``.

    """
    stream.skip_whitespace()
    # Remember how many tokens were consumed before this selector so an
    # empty selector can be detected at the end.
    selector_start = len(stream.used)
    peek = stream.peek()
    if peek.type == "IDENT" or peek == ("DELIM", "*"):
        # The first name is provisionally a namespace; it is one only
        # if a "|" follows.
        if peek.type == "IDENT":
            namespace = stream.next().value
        else:
            stream.next()
            namespace = None
        if stream.peek() == ("DELIM", "|"):
            stream.next()
            element = stream.next_ident_or_star()
        else:
            # No "|": what was read is the element name (None for "*").
            element = namespace
            namespace = None
    else:
        element = namespace = None
    result: Tree = Element(namespace, element)
    pseudo_element: PseudoElement | None = None
    while 1:
        peek = stream.peek()
        # Whitespace, end of input, a comma or a combinator ends the
        # sequence; so does ")" when parsing a function argument.
        if (
            peek.type in ("S", "EOF")
            or peek.is_delim(",", "+", ">", "~")
            or (inside_negation and peek == ("DELIM", ")"))
        ):
            break
        # Nothing may follow a pseudo-element within the same sequence.
        if pseudo_element:
            raise SelectorSyntaxError(
                f"Got pseudo-element ::{pseudo_element} not at the end of a selector"
            )
        if peek.type == "HASH":
            result = Hash(result, cast(str, stream.next().value))
        elif peek == ("DELIM", "."):
            stream.next()
            result = Class(result, stream.next_ident())
        elif peek == ("DELIM", "|"):
            # "|element": the result so far is replaced by an element
            # whose namespace is None.
            stream.next()
            result = Element(None, stream.next_ident())
        elif peek == ("DELIM", "["):
            stream.next()
            result = parse_attrib(result, stream)
        elif peek == ("DELIM", ":"):
            stream.next()
            if stream.peek() == ("DELIM", ":"):
                # "::name" or "::name(args)": a pseudo-element.
                stream.next()
                pseudo_element = stream.next_ident()
                if stream.peek() == ("DELIM", "("):
                    stream.next()
                    pseudo_element = FunctionalPseudoElement(
                        pseudo_element, parse_arguments(stream)
                    )
                continue
            ident = stream.next_ident()
            if ident.lower() in ("first-line", "first-letter", "before", "after"):
                # Special case: CSS 2.1 pseudo-elements can have a single ':'
                # Any new pseudo-element must have two.
                pseudo_element = str(ident)
                continue
            if stream.peek() != ("DELIM", "("):
                # A plain pseudo-class without arguments.
                result = Pseudo(result, ident)
                # A bare ":scope" is only accepted when it opens a selector:
                # the consumed-token counts below correspond to ":scope"
                # being first, first after leading whitespace, or first
                # after a comma (optionally followed by whitespace).
                if repr(result) == "Pseudo[Element[*]:scope]" and not (
                    len(stream.used) == 2
                    or (len(stream.used) == 3 and stream.used[0].type == "S")
                    or (len(stream.used) >= 3 and stream.used[-3].is_delim(","))
                    or (
                        len(stream.used) >= 4
                        and stream.used[-3].type == "S"
                        and stream.used[-4].is_delim(",")
                    )
                ):
                    raise SelectorSyntaxError(
                        'Got immediate child pseudo-element ":scope" '
                        "not at the start of a selector"
                    )
                continue
            # Functional pseudo-class: consume "(" and dispatch on the name.
            stream.next()
            stream.skip_whitespace()
            if ident.lower() == "not":
                if inside_negation:
                    raise SelectorSyntaxError("Got nested :not()")
                argument, argument_pseudo_element = parse_simple_selector(
                    stream, inside_negation=True
                )
                next = stream.next()
                if argument_pseudo_element:
                    raise SelectorSyntaxError(
                        f"Got pseudo-element ::{argument_pseudo_element} inside :not() at {next.pos}"
                    )
                if next != ("DELIM", ")"):
                    raise SelectorSyntaxError(f"Expected ')', got {next}")
                result = Negation(result, argument)
            elif ident.lower() == "has":
                combinator, arguments = parse_relative_selector(stream)
                result = Relation(result, combinator, arguments)
            elif ident.lower() in ("matches", "is"):
                selectors = parse_simple_selector_arguments(stream)
                result = Matching(result, selectors)
            elif ident.lower() == "where":
                selectors = parse_simple_selector_arguments(stream)
                result = SpecificityAdjustment(result, selectors)
            else:
                # Any other function keeps its raw argument tokens.
                result = Function(result, ident, parse_arguments(stream))
        else:
            raise SelectorSyntaxError(f"Expected selector, got {peek}")
    # No token was consumed at all: the selector is empty.
    if len(stream.used) == selector_start:
        raise SelectorSyntaxError(f"Expected selector, got {stream.peek()}")
    return result, pseudo_element
def parse_arguments(stream: TokenStream) -> list[Token]:
    """Collect the argument tokens of a function up to the closing ``)``.

    Identifiers, strings, numbers and the ``+`` / ``-`` delimiters are
    accepted; whitespace between them is skipped.

    :raises:
        :class:`SelectorSyntaxError` on any other token.
    :returns:
        The accepted tokens, in order. The closing ``)`` is consumed but
        not included.

    """
    collected: list[Token] = []
    accepted_types = ("IDENT", "STRING", "NUMBER")
    accepted_delims = [("DELIM", "+"), ("DELIM", "-")]
    while True:
        stream.skip_whitespace()
        token = stream.next()
        if token == ("DELIM", ")"):
            return collected
        if token.type in accepted_types or token in accepted_delims:
            collected.append(token)
            continue
        raise SelectorSyntaxError(f"Expected an argument, got {token}")
def parse_relative_selector(stream: TokenStream) -> tuple[Token, Selector]:
    """Parse the argument of ``:has()`` up to the closing ``)``.

    An optional leading combinator token is read first; when there is none
    a ``DELIM`` token holding a single space is used instead. The values
    of the remaining tokens are concatenated and re-parsed with
    :func:`parse`.

    :raises:
        :class:`SelectorSyntaxError` on a token that is not accepted.
    :returns:
        The combinator token and the first selector parsed from the
        argument text.

    """
    stream.skip_whitespace()
    token = stream.next()
    leading_combinators = [
        ("DELIM", "+"),
        ("DELIM", "-"),
        ("DELIM", ">"),
        ("DELIM", "~"),
    ]
    if token in leading_combinators:
        combinator = token
        stream.skip_whitespace()
        token = stream.next()
    else:
        combinator = Token("DELIM", " ", pos=0)

    pieces: list[str] = []
    while True:
        if token == ("DELIM", ")"):
            parsed = parse("".join(pieces))
            return combinator, parsed[0]
        is_accepted = token.type in ("IDENT", "STRING", "NUMBER") or token in [
            ("DELIM", "."),
            ("DELIM", "*"),
        ]
        if not is_accepted:
            raise SelectorSyntaxError(f"Expected an argument, got {token}")
        pieces.append(cast(str, token.value))
        token = stream.next()
def parse_simple_selector_arguments(stream: TokenStream) -> list[Tree]:
    """Parse the comma-separated arguments of ``:is()`` / ``:where()``.

    Each argument is one sequence of simple selectors. Parsing ends at the
    closing ``)``, which is consumed.

    :raises:
        :class:`SelectorSyntaxError` if an argument carries a
        pseudo-element, or if anything other than ``,`` or ``)`` follows
        an argument (including the end of the input).
    :returns:
        The parsed arguments, in order.

    """
    arguments: list[Tree] = []
    while True:
        result, pseudo_element = parse_simple_selector(stream, True)
        if pseudo_element:
            raise SelectorSyntaxError(
                f"Got pseudo-element ::{pseudo_element} inside function"
            )
        stream.skip_whitespace()
        token = stream.next()
        if token == ("DELIM", ","):
            arguments.append(result)
            # Only whitespace may be skipped here: consuming a token
            # unconditionally would swallow the start of the next argument
            # when no space follows the comma, as in ``:is(a,b)``.
            stream.skip_whitespace()
        elif token == ("DELIM", ")"):
            arguments.append(result)
            return arguments
        else:
            # Also covers EOF: reading past it would exhaust the token
            # iterator instead of reporting a syntax error.
            raise SelectorSyntaxError(f"Expected an argument, got {token}")
def parse_attrib(selector: Tree, stream: TokenStream) -> Attrib:
    """Parse an attribute selector; the opening ``[`` is already consumed.

    :param selector:
        The tree the attribute test applies to.
    :param stream:
        The token stream, positioned just after ``[``.
    :raises:
        :class:`SelectorSyntaxError` on input that does not match the grammar.
    :returns:
        An :class:`Attrib` wrapping *selector*. For ``[attrib]`` its
        operator is ``"exists"`` and its value is ``None``.

    """
    stream.skip_whitespace()
    attrib = stream.next_ident_or_star()
    # "*" (returned as None) is only valid as a namespace, so it must be
    # followed by "|".
    if attrib is None and stream.peek() != ("DELIM", "|"):
        raise SelectorSyntaxError(f"Expected '|', got {stream.peek()}")
    namespace: str | None
    op: str | None
    if stream.peek() == ("DELIM", "|"):
        stream.next()
        if stream.peek() == ("DELIM", "="):
            # "attrib|=": the "|" was the start of the "|=" operator.
            namespace = None
            stream.next()
            op = "|="
        else:
            # "namespace|attrib": the name read first was the namespace.
            namespace = attrib
            attrib = stream.next_ident()
            op = None
    else:
        namespace = op = None
    if op is None:
        # The operator has not been read yet.
        stream.skip_whitespace()
        next = stream.next()
        if next == ("DELIM", "]"):
            # "[attrib]": existence test, no operator and no value.
            return Attrib(selector, namespace, cast(str, attrib), "exists", None)
        if next == ("DELIM", "="):
            op = "="
        elif next.is_delim("^", "$", "*", "~", "|", "!") and (
            stream.peek() == ("DELIM", "=")
        ):
            # Two-character operator such as "^=" or "~=".
            op = cast(str, next.value) + "="
            stream.next()
        else:
            raise SelectorSyntaxError(f"Operator expected, got {next}")
    stream.skip_whitespace()
    value = stream.next()
    if value.type not in ("IDENT", "STRING"):
        raise SelectorSyntaxError(f"Expected string or ident, got {value}")
    stream.skip_whitespace()
    next = stream.next()
    if next != ("DELIM", "]"):
        raise SelectorSyntaxError(f"Expected ']', got {next}")
    return Attrib(selector, namespace, cast(str, attrib), op, value)
def parse_series(tokens: Iterable[Token]) -> tuple[int, int]:
    """
    Parses the arguments for :nth-child() and friends.

    :param tokens:
        The argument tokens. Any iterable is accepted, including a
        one-shot iterator.
    :raises:
        :class:`ValueError` if a string token is present or the series
        is not of the form ``an+b``, ``odd`` or ``even``.
    :returns:
        ``(a, b)``

    """
    # Materialise first: the tokens are scanned twice, and a one-shot
    # iterator would be empty on the second pass.
    tokens = list(tokens)
    if any(token.type == "STRING" for token in tokens):
        raise ValueError("String tokens not allowed in series.")
    s = "".join(cast(str, token.value) for token in tokens).strip()
    if s == "odd":
        return 2, 1
    if s == "even":
        return 2, 0
    if s == "n":
        return 1, 0
    if "n" not in s:
        # Just b
        return 0, int(s)
    a, b = s.split("n", 1)
    a_as_int: int
    if not a:
        a_as_int = 1
    elif a in {"-", "+"}:
        # A bare sign means a coefficient of -1 or +1.
        a_as_int = int(a + "1")
    else:
        a_as_int = int(a)
    b_as_int = int(b) if b else 0
    return a_as_int, b_as_int
#### Token objects
class Token(tuple[str, Optional[str]]):  # noqa: SLOT001
    """A ``(type, value)`` pair that also remembers its source position.

    Because it is a tuple, a token compares equal to a plain
    ``(type, value)`` tuple; the position takes no part in comparisons.
    """

    # Offset of the token in the source string; set in ``__new__``.
    pos: int

    @overload
    def __new__(
        cls,
        type_: Literal["IDENT", "HASH", "STRING", "S", "DELIM", "NUMBER"],
        value: str,
        pos: int,
    ) -> Self: ...

    @overload
    def __new__(cls, type_: Literal["EOF"], value: None, pos: int) -> Self: ...

    def __new__(cls, type_: str, value: str | None, pos: int) -> Self:
        token = super().__new__(cls, (type_, value))
        token.pos = pos
        return token

    @property
    def type(self) -> str:
        """The token type, i.e. the first tuple item."""
        return self[0]

    @property
    def value(self) -> str | None:
        """The token value, i.e. the second tuple item."""
        return self[1]

    def __repr__(self) -> str:
        return f"<{self.type} '{self.value}' at {self.pos}>"

    def is_delim(self, *values: str) -> bool:
        """Return whether this is a ``DELIM`` token with one of *values*."""
        if self.type != "DELIM":
            return False
        return self.value in values

    def css(self) -> str:
        """Return the token as text: strings are quoted with ``repr``,
        everything else is returned as its raw value."""
        return repr(self.value) if self.type == "STRING" else cast(str, self.value)
class EOFToken(Token):
    """The end-of-input marker: a token of type ``EOF`` whose value is ``None``."""

    def __new__(cls, pos: int) -> Self:
        return super().__new__(cls, "EOF", None, pos)

    def __repr__(self) -> str:
        # There is no value worth showing, only the type and position.
        return f"<{self.type} at {self.pos}>"
#### Tokenizer
class TokenMacros:
    """Named regular-expression fragments used to build the token patterns.

    ``_compile()`` substitutes them into patterns through ``%(name)s``
    placeholders.
    """

    # A backslash, 1 to 6 hex digits, then one optional whitespace character.
    unicode_escape = r"\\([0-9a-f]{1,6})(?:\r\n|[ \n\r\t\f])?"
    # A unicode escape, or a backslash followed by any character that is
    # neither a newline nor a hex digit.
    escape = unicode_escape + r"|\\[^\n\r\f0-9a-f]"
    # Inside strings, a backslash followed by a newline is also an escape.
    string_escape = r"\\(?:\n|\r\n|\r|\f)|" + escape
    # Any character outside the ASCII range (octal 0-177).
    nonascii = r"[^\0-\177]"
    # A character allowed inside a name.
    nmchar = f"[_a-z0-9-]|{escape}|{nonascii}"
    # A character allowed at the start of a name.
    nmstart = f"[_a-z]|{escape}|{nonascii}"
class MatchFunc(Protocol):
    """Call signature of a compiled pattern's bound ``match`` method."""

    def __call__(
        self, string: str, pos: int = ..., endpos: int = ...
    ) -> re.Match[str] | None: ...
def _compile(pattern: str) -> MatchFunc:
    """Expand the ``%(name)s`` macros in *pattern* from :class:`TokenMacros`
    and return the ``match`` method of the compiled, case-insensitive
    regular expression."""
    expanded = pattern % vars(TokenMacros)
    compiled = re.compile(expanded, re.IGNORECASE)
    return compiled.match
# Matchers for each token kind. All are built by _compile(), so they are
# case-insensitive and may reference TokenMacros fragments.
_match_whitespace = _compile(r"[ \t\r\n\f]+")
# An optionally signed integer or decimal number.
_match_number = _compile(r"[+-]?(?:[0-9]*\.[0-9]+|[0-9]+)")
# "#" followed by one or more name characters.
_match_hash = _compile("#(?:%(nmchar)s)+")
# An optional "-", a name-start character, then any name characters.
_match_ident = _compile("-?(?:%(nmstart)s)(?:%(nmchar)s)*")
# String bodies, keyed by the opening quote character. The patterns can
# match an empty body and do not consume the closing quote.
_match_string_by_quote = {
    "'": _compile(r"([^\n\r\f\\']|%(string_escape)s)*"),
    '"': _compile(r'([^\n\r\f\\"]|%(string_escape)s)*'),
}
# Substitution helpers used to undo the three kinds of escapes.
_sub_simple_escape = re.compile(r"\\(.)").sub
_sub_unicode_escape = re.compile(TokenMacros.unicode_escape, re.IGNORECASE).sub
_sub_newline_escape = re.compile(r"\\(?:\n|\r\n|\r|\f)").sub
# Same as r'\1', but faster on CPython
_replace_simple = operator.methodcaller("group", 1)
def _replace_unicode(match: re.Match[str]) -> str:
    """Return the character for a matched unicode escape.

    Group 1 of *match* holds the hexadecimal code point. NUL, surrogate
    code points and values above ``sys.maxunicode`` are replaced by
    U+FFFD (REPLACEMENT CHARACTER). A lone surrogate in particular cannot
    be encoded as UTF-8, so letting it through would make later
    ``str.encode("utf8")`` calls on the identifier fail.
    """
    codepoint = int(match.group(1), 16)
    if (
        codepoint == 0
        or 0xD800 <= codepoint <= 0xDFFF
        or codepoint > sys.maxunicode
    ):
        codepoint = 0xFFFD
    return chr(codepoint)
def unescape_ident(value: str) -> str:
    """Return *value* with its escapes resolved.

    Unicode escapes are replaced first, then every remaining
    backslash-character pair is replaced by the character alone.
    """
    without_unicode_escapes = _sub_unicode_escape(_replace_unicode, value)
    return _sub_simple_escape(_replace_simple, without_unicode_escapes)
def tokenize(s: str) -> Iterator[Token]:
    """Split the selector source *s* into tokens.

    At each position the matchers are tried in a fixed order: whitespace,
    identifier, hash, quoted string, number, comment; any other character
    becomes a one-character ``DELIM`` token. An :class:`EOFToken` is
    always yielded last.

    :raises:
        :class:`SelectorSyntaxError` on an unclosed or invalid string.

    """
    pos = 0
    len_s = len(s)
    while pos < len_s:
        match = _match_whitespace(s, pos=pos)
        if match:
            # A whole run of whitespace becomes one "S" token whose
            # value is a single space.
            yield Token("S", " ", pos)
            pos = match.end()
            continue
        match = _match_ident(s, pos=pos)
        if match:
            # Resolve unicode escapes first, then simple backslash escapes.
            value = _sub_simple_escape(
                _replace_simple, _sub_unicode_escape(_replace_unicode, match.group())
            )
            yield Token("IDENT", value, pos)
            pos = match.end()
            continue
        match = _match_hash(s, pos=pos)
        if match:
            # [1:] drops the leading "#" from the token value.
            value = _sub_simple_escape(
                _replace_simple,
                _sub_unicode_escape(_replace_unicode, match.group()[1:]),
            )
            yield Token("HASH", value, pos)
            pos = match.end()
            continue
        quote = s[pos]
        if quote in _match_string_by_quote:
            # Match the string body, starting just after the opening quote.
            match = _match_string_by_quote[quote](s, pos=pos + 1)
            assert match, "Should have found at least an empty match"
            end_pos = match.end()
            if end_pos == len_s:
                raise SelectorSyntaxError(f"Unclosed string at {pos}")
            if s[end_pos] != quote:
                raise SelectorSyntaxError(f"Invalid string at {pos}")
            # Escaped newlines are removed, then the other escapes resolved.
            value = _sub_simple_escape(
                _replace_simple,
                _sub_unicode_escape(
                    _replace_unicode, _sub_newline_escape("", match.group())
                ),
            )
            yield Token("STRING", value, pos)
            # Step over the closing quote.
            pos = end_pos + 1
            continue
        match = _match_number(s, pos=pos)
        if match:
            value = match.group()
            yield Token("NUMBER", value, pos)
            pos = match.end()
            continue
        pos2 = pos + 2
        if s[pos:pos2] == "/*":
            # Comments produce no token. An unterminated comment runs to
            # the end of the input.
            pos = s.find("*/", pos2)
            if pos == -1:
                pos = len_s
            else:
                pos += 2
            continue
        # Anything else is a one-character delimiter.
        yield Token("DELIM", s[pos], pos)
        pos += 1
    assert pos == len_s
    yield EOFToken(pos)
class TokenStream:
    """A token iterator with one token of look-ahead.

    Every token returned by :meth:`next` is also appended to ``used``.
    """

    def __init__(self, tokens: Iterable[Token], source: str | None = None) -> None:
        self.used: list[Token] = []
        self.tokens = iter(tokens)
        self.source = source
        # Look-ahead slot; it only holds a pending token while
        # ``_peeking`` is true.
        self.peeked: Token | None = None
        self._peeking = False
        self.next_token = self.tokens.__next__

    def next(self) -> Token:
        """Consume and return the next token, recording it in ``used``."""
        if self._peeking:
            # Hand out the token fetched by an earlier peek().
            self._peeking = False
            token = self.peeked
            assert token is not None
        else:
            token = self.next_token()
        self.used.append(token)
        return token

    def peek(self) -> Token:
        """Return the next token without consuming it."""
        if not self._peeking:
            self.peeked = self.next_token()
            self._peeking = True
        upcoming = self.peeked
        assert upcoming is not None
        return upcoming

    def next_ident(self) -> str:
        """Consume the next token and return its value; it must be an ``IDENT``."""
        token = self.next()
        if token.type != "IDENT":
            raise SelectorSyntaxError(f"Expected ident, got {token}")
        return cast(str, token.value)

    def next_ident_or_star(self) -> str | None:
        """Consume an ``IDENT`` and return its value, or consume ``*`` and
        return ``None``."""
        token = self.next()
        if token.type == "IDENT":
            return token.value
        if token == ("DELIM", "*"):
            return None
        raise SelectorSyntaxError(f"Expected ident or '*', got {token}")

    def skip_whitespace(self) -> None:
        """Consume the next token if it is whitespace."""
        if self.peek().type == "S":
            self.next()