lsottani's picture
Upload folder using huggingface_hub
d9f69e5 verified
"""
cssselect.xpath
===============
Translation of parsed CSS selectors to XPath expressions.
:copyright: (c) 2007-2012 Ian Bicking and contributors.
See AUTHORS for more details.
:license: BSD, see LICENSE for more details.
"""
from __future__ import annotations
import re
from collections.abc import Callable
from typing import TYPE_CHECKING, Optional, cast
from cssselect.parser import (
Attrib,
Class,
CombinedSelector,
Element,
Function,
Hash,
Matching,
Negation,
Pseudo,
PseudoElement,
Relation,
Selector,
SelectorError,
SpecificityAdjustment,
Tree,
parse,
parse_series,
)
if TYPE_CHECKING:
# typing.Self requires Python 3.11
from typing_extensions import Self
class ExpressionError(SelectorError, RuntimeError):
    """Unknown or unsupported selector (e.g. a pseudo-class).

    Raised by the translators in this module when a parsed selector cannot
    be expressed in XPath. It derives from both ``SelectorError`` and
    ``RuntimeError``, so an ``except`` clause for either one catches it.
    """
#### XPath Helpers
class XPathExpr:
    """Mutable builder for a single XPath location path.

    The expression is stored as three string pieces: ``path`` (everything
    that precedes the final node test), ``element`` (the final node test)
    and ``condition`` (the predicate applied to it). :meth:`__str__`
    renders them as ``path + element`` followed by ``[condition]`` when a
    condition is present.
    """

    def __init__(
        self,
        path: str = "",
        element: str = "*",
        condition: str = "",
        star_prefix: bool = False,
    ) -> None:
        """Store the three pieces of the expression.

        ``star_prefix`` is accepted but is not read anywhere in this class.
        """
        self.path, self.element, self.condition = path, element, condition

    def __str__(self) -> str:
        """Render the expression as an XPath string."""
        rendered = "".join((str(self.path), str(self.element)))
        if not self.condition:
            return rendered
        return rendered + f"[{self.condition}]"

    def __repr__(self) -> str:
        """Return ``ClassName[<xpath>]`` for debugging."""
        return "{}[{}]".format(self.__class__.__name__, self)

    def add_condition(self, condition: str, conjuction: str = "and") -> Self:
        """Combine *condition* with the current predicate and return ``self``.

        When a predicate already exists, both sides are parenthesised and
        joined with *conjuction*; otherwise *condition* becomes the predicate.
        """
        existing = self.condition
        self.condition = (
            f"({existing}) {conjuction} ({condition})" if existing else condition
        )
        return self

    def add_name_test(self) -> None:
        """Move the element name out of the node test and into the predicate.

        A named node test becomes ``*`` plus a ``name() = <literal>``
        condition. A ``*`` node test is left alone.
        """
        name = self.element
        if name != "*":
            literal = GenericTranslator.xpath_literal(name)
            self.add_condition(f"name() = {literal}")
            self.element = "*"
        # else: we weren't doing a name test anyway

    def add_star_prefix(self) -> None:
        """
        Append '*/' to the path to keep the context constrained
        to a single parent.
        """
        self.path = self.path + "*/"

    def join(
        self,
        combiner: str,
        other: XPathExpr,
        closing_combiner: str | None = None,
        has_inner_condition: bool = False,
    ) -> Self:
        """Append *other* to this expression, separated by *combiner*.

        The current expression (rendered as a string) plus *combiner*
        becomes the new path, and *other* supplies the new element.

        :param closing_combiner:
            Text appended after *other*'s element, e.g. a closing ``]``.
        :param has_inner_condition:
            When true, *other*'s condition is embedded in the element
            (before *closing_combiner*) and this expression's own
            ``condition`` attribute is left untouched. When false,
            *other*'s condition replaces this expression's condition.
        :returns: ``self``.
        """
        joined = str(self) + combiner
        # Any "star prefix" is redundant when joining.
        if other.path != "*/":
            joined += other.path
        self.path = joined

        suffix = closing_combiner if closing_combiner else ""
        if has_inner_condition:
            inner = "[" + other.condition + "]" if other.condition else ""
            self.element = other.element + inner + suffix
        else:
            self.element = other.element + suffix
            self.condition = other.condition
        return self
# Split a string at runs of single quotes. The capturing group makes the
# split keep the quote runs themselves as separate items.
split_at_single_quotes = re.compile(r"('+)").split

# The spec is actually more permissive than that, but don’t bother.
# This is just for the fast path.
# http://www.w3.org/TR/REC-xml/#NT-NameStartChar
#
# ``fullmatch`` is used instead of ``^...$`` with ``match`` because ``$``
# also matches just before a trailing newline, which would report a name
# such as "foo\n" as safe.
is_safe_name = re.compile(r"[a-zA-Z_][a-zA-Z0-9_.-]*").fullmatch

# Test that the string is not empty and does not contain whitespace
# (``fullmatch`` for the same trailing-newline reason as above).
is_non_whitespace = re.compile(r"[^ \t\r\n\f]+").fullmatch
#### Translation
class GenericTranslator:
    """
    Translator for "generic" XML documents.

    Everything is case-sensitive, no assumption is made on the meaning
    of element names and attribute names.

    Translation is dispatched by name: a parsed node of type ``Foo`` is
    handled by ``xpath_foo``, a combinator by ``xpath_<name>_combinator``,
    a functional pseudo-class by ``xpath_<name>_function``, a pseudo-class
    by ``xpath_<name>_pseudo`` and an attribute operator by
    ``xpath_attrib_<name>``.
    """

    ####
    ####  HERE BE DRAGONS
    ####
    ####  You are welcome to hook into this to change some behavior,
    ####  but do so at your own risks.
    ####  Until it has received a lot more work and review,
    ####  I reserve the right to change this API in backward-incompatible ways
    ####  with any minor version of cssselect.
    ####  See https://github.com/scrapy/cssselect/pull/22
    ####  -- Simon Sapin.
    ####

    # CSS combinator token -> name used to build the handler method name.
    combinator_mapping = {
        " ": "descendant",
        ">": "child",
        "+": "direct_adjacent",
        "~": "indirect_adjacent",
    }

    # CSS attribute operator -> name used to build the handler method name.
    attribute_operator_mapping = {
        "exists": "exists",
        "=": "equals",
        "~=": "includes",
        "|=": "dashmatch",
        "^=": "prefixmatch",
        "$=": "suffixmatch",
        "*=": "substringmatch",
        "!=": "different",  # XXX Not in Level 3 but meh
    }

    #: The attribute used for ID selectors depends on the document language:
    #: http://www.w3.org/TR/selectors/#id-selectors
    id_attribute = "id"

    #: The attribute used for ``:lang()`` depends on the document language:
    #: http://www.w3.org/TR/selectors/#lang-pseudo
    lang_attribute = "xml:lang"

    #: The case sensitivity of document language element names,
    #: attribute names, and attribute values in selectors depends
    #: on the document language.
    #: http://www.w3.org/TR/selectors/#casesens
    #:
    #: When a document language defines one of these as case-insensitive,
    #: cssselect assumes that the document parser makes the parsed values
    #: lower-case. Making the selector lower-case too makes the comparison
    #: case-insensitive.
    #:
    #: In HTML, element names and attributes names (but not attribute values)
    #: are case-insensitive. All of lxml.html, html5lib, BeautifulSoup4
    #: and HTMLParser make them lower-case in their parse result, so
    #: the assumption holds.
    lower_case_element_names = False
    lower_case_attribute_names = False
    lower_case_attribute_values = False

    # class used to represent an xpath expression
    xpathexpr_cls = XPathExpr

    def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
        """Translate a *group of selectors* to XPath.

        Pseudo-elements are not supported here since XPath only knows
        about "real" elements.

        :param css:
            A *group of selectors* as a string.
        :param prefix:
            This string is prepended to the XPath expression for each selector.
            The default makes selectors scoped to the context node’s subtree.
        :raises:
            :class:`~cssselect.SelectorSyntaxError` on invalid selectors,
            :class:`ExpressionError` on unknown/unsupported selectors,
            including pseudo-elements.
        :returns:
            The equivalent XPath 1.0 expression as a string.

        """
        return " | ".join(
            self.selector_to_xpath(selector, prefix, translate_pseudo_elements=True)
            for selector in parse(css)
        )

    def selector_to_xpath(
        self,
        selector: Selector,
        prefix: str = "descendant-or-self::",
        translate_pseudo_elements: bool = False,
    ) -> str:
        """Translate a parsed selector to XPath.

        :param selector:
            A parsed :class:`Selector` object.
        :param prefix:
            This string is prepended to the resulting XPath expression.
            The default makes selectors scoped to the context node’s subtree.
        :param translate_pseudo_elements:
            Unless this is set to ``True`` (as :meth:`css_to_xpath` does),
            the :attr:`~Selector.pseudo_element` attribute of the selector
            is ignored.
            It is the caller's responsibility to reject selectors
            with pseudo-elements, or to account for them somehow.
        :raises:
            :class:`ExpressionError` on unknown/unsupported selectors;
            :class:`TypeError` when *selector* has no (truthy)
            ``parsed_tree`` attribute.
        :returns:
            The equivalent XPath 1.0 expression as a string.

        """
        tree = getattr(selector, "parsed_tree", None)
        if not tree:
            raise TypeError(f"Expected a parsed selector, got {selector!r}")
        xpath = self.xpath(tree)
        assert isinstance(xpath, self.xpathexpr_cls)  # help debug a missing 'return'
        if translate_pseudo_elements and selector.pseudo_element:
            xpath = self.xpath_pseudo_element(xpath, selector.pseudo_element)
        return (prefix or "") + str(xpath)

    def xpath_pseudo_element(
        self, xpath: XPathExpr, pseudo_element: PseudoElement
    ) -> XPathExpr:
        """Translate a pseudo-element.

        Defaults to not supporting pseudo-elements at all,
        but can be overridden by sub-classes.

        :raises: :class:`ExpressionError`, always.
        """
        raise ExpressionError("Pseudo-elements are not supported.")

    @staticmethod
    def xpath_literal(s: str) -> str:
        """Return an XPath expression that evaluates to the string *s*.

        The value is wrapped in single quotes when it contains none, else
        in double quotes when it contains none. When it contains both kinds,
        it is split at runs of single quotes and rebuilt with ``concat()``,
        quoting each piece with the kind of quote it does not contain.
        """
        s = str(s)
        if "'" not in s:
            s = f"'{s}'"
        elif '"' not in s:
            s = f'"{s}"'
        else:
            parts_quoted = [
                f'"{part}"' if "'" in part else f"'{part}'"
                for part in split_at_single_quotes(s)
                if part
            ]
            s = "concat({})".format(",".join(parts_quoted))
        return s

    def xpath(self, parsed_selector: Tree) -> XPathExpr:
        """Translate any parsed selector object.

        Dispatches to ``xpath_<lower-cased type name>``.

        :raises: :class:`ExpressionError` when no such method exists.
        """
        type_name = type(parsed_selector).__name__
        method = cast(
            Optional[Callable[[Tree], XPathExpr]],
            getattr(self, f"xpath_{type_name.lower()}", None),
        )
        if method is None:
            raise ExpressionError(f"{type_name} is not supported.")
        return method(parsed_selector)

    # Dispatched by parsed object type

    def xpath_combinedselector(self, combined: CombinedSelector) -> XPathExpr:
        """Translate a combined selector."""
        combinator = self.combinator_mapping[combined.combinator]
        method = cast(
            Callable[[XPathExpr, XPathExpr], XPathExpr],
            getattr(self, f"xpath_{combinator}_combinator"),
        )
        return method(self.xpath(combined.selector), self.xpath(combined.subselector))

    def xpath_negation(self, negation: Negation) -> XPathExpr:
        """Translate a negation: add ``not(<sub-selector condition>)``.

        When the sub-selector has no condition at all (it matches any
        element), the constant-false condition ``0`` is added instead.
        """
        xpath = self.xpath(negation.selector)
        sub_xpath = self.xpath(negation.subselector)
        # Turn the element name into a condition so it can be negated too.
        sub_xpath.add_name_test()
        if sub_xpath.condition:
            return xpath.add_condition(f"not({sub_xpath.condition})")
        return xpath.add_condition("0")

    def xpath_relation(self, relation: Relation) -> XPathExpr:
        """Translate a relational selector.

        Dispatches on the combinator to ``xpath_relation_<name>_combinator``.
        """
        xpath = self.xpath(relation.selector)
        combinator = relation.combinator
        subselector = relation.subselector
        right = self.xpath(subselector.parsed_tree)
        method = cast(
            Callable[[XPathExpr, XPathExpr], XPathExpr],
            getattr(
                self,
                f"xpath_relation_{self.combinator_mapping[cast(str, combinator.value)]}_combinator",
            ),
        )
        return method(xpath, right)

    def _xpath_selector_list(
        self, matching: Matching | SpecificityAdjustment
    ) -> XPathExpr:
        """Require the element to match any selector of ``matching.selector_list``.

        The conditions of the alternatives are OR-ed together first, and the
        group as a whole is then AND-ed with whatever condition the compound
        selector already carries. OR-ing each alternative directly into the
        existing condition would let an element match on that condition alone.
        """
        xpath = self.xpath(matching.selector)
        exprs = [self.xpath(selector) for selector in matching.selector_list]
        alternatives = self.xpathexpr_cls()
        for e in exprs:
            e.add_name_test()
            if e.condition:
                alternatives.add_condition(e.condition, "or")
        if alternatives.condition:
            xpath.add_condition(alternatives.condition)
        return xpath

    def xpath_matching(self, matching: Matching) -> XPathExpr:
        """Translate a ``Matching`` node (a selector plus a selector list)."""
        return self._xpath_selector_list(matching)

    def xpath_specificityadjustment(self, matching: SpecificityAdjustment) -> XPathExpr:
        """Translate a ``SpecificityAdjustment`` node.

        The translation is the same as for :meth:`xpath_matching`.
        """
        return self._xpath_selector_list(matching)

    def xpath_function(self, function: Function) -> XPathExpr:
        """Translate a functional pseudo-class.

        Dispatches to ``xpath_<name>_function`` (``-`` replaced by ``_``).

        :raises: :class:`ExpressionError` when the pseudo-class is unknown.
        """
        method_name = "xpath_{}_function".format(function.name.replace("-", "_"))
        method = cast(
            Optional[Callable[[XPathExpr, Function], XPathExpr]],
            getattr(self, method_name, None),
        )
        if not method:
            raise ExpressionError(f"The pseudo-class :{function.name}() is unknown")
        return method(self.xpath(function.selector), function)

    def xpath_pseudo(self, pseudo: Pseudo) -> XPathExpr:
        """Translate a pseudo-class.

        Dispatches to ``xpath_<ident>_pseudo`` (``-`` replaced by ``_``).

        :raises: :class:`ExpressionError` when the pseudo-class is unknown.
        """
        method_name = "xpath_{}_pseudo".format(pseudo.ident.replace("-", "_"))
        method = cast(
            Optional[Callable[[XPathExpr], XPathExpr]], getattr(self, method_name, None)
        )
        if not method:
            # TODO: better error message for pseudo-elements?
            raise ExpressionError(f"The pseudo-class :{pseudo.ident} is unknown")
        return method(self.xpath(pseudo.selector))

    def xpath_attrib(self, selector: Attrib) -> XPathExpr:
        """Translate an attribute selector.

        Builds the XPath attribute reference (``@name`` for safe names,
        otherwise a ``name()`` comparison), applies the configured
        lower-casing, and dispatches on the operator to
        ``xpath_attrib_<operator name>``.
        """
        operator = self.attribute_operator_mapping[selector.operator]
        method = cast(
            Callable[[XPathExpr, str, Optional[str]], XPathExpr],
            getattr(self, f"xpath_attrib_{operator}"),
        )
        if self.lower_case_attribute_names:
            name = selector.attrib.lower()
        else:
            name = selector.attrib
        safe = is_safe_name(name)
        if selector.namespace:
            name = f"{selector.namespace}:{name}"
            safe = safe and is_safe_name(selector.namespace)
        if safe:
            attrib = "@" + name
        else:
            attrib = f"attribute::*[name() = {self.xpath_literal(name)}]"
        if selector.value is None:
            value = None
        elif self.lower_case_attribute_values:
            value = cast(str, selector.value.value).lower()
        else:
            value = selector.value.value
        return method(self.xpath(selector.selector), attrib, value)

    def xpath_class(self, class_selector: Class) -> XPathExpr:
        """Translate a class selector."""
        # .foo is defined as [class~=foo] in the spec.
        xpath = self.xpath(class_selector.selector)
        return self.xpath_attrib_includes(xpath, "@class", class_selector.class_name)

    def xpath_hash(self, id_selector: Hash) -> XPathExpr:
        """Translate an ID selector.

        The attribute compared is :attr:`id_attribute`, so sub-classes for
        other document languages only need to override that attribute.
        """
        xpath = self.xpath(id_selector.selector)
        return self.xpath_attrib_equals(
            xpath, "@" + self.id_attribute, id_selector.id
        )

    def xpath_element(self, selector: Element) -> XPathExpr:
        """Translate a type or universal selector."""
        element = selector.element
        if not element:
            element = "*"
            safe = True
        else:
            safe = bool(is_safe_name(element))
            if self.lower_case_element_names:
                element = element.lower()
        if selector.namespace:
            # Namespace prefixes are case-sensitive.
            # http://www.w3.org/TR/css3-namespace/#prefixes
            element = f"{selector.namespace}:{element}"
            safe = safe and bool(is_safe_name(selector.namespace))
        xpath = self.xpathexpr_cls(element=element)
        if not safe:
            # An unsafe name cannot be written as a node test; compare it
            # through name() with a quoted literal instead.
            xpath.add_name_test()
        return xpath

    # CombinedSelector: dispatch by combinator

    def xpath_descendant_combinator(
        self, left: XPathExpr, right: XPathExpr
    ) -> XPathExpr:
        """right is a child, grand-child or further descendant of left"""
        return left.join("/descendant-or-self::*/", right)

    def xpath_child_combinator(self, left: XPathExpr, right: XPathExpr) -> XPathExpr:
        """right is an immediate child of left"""
        return left.join("/", right)

    def xpath_direct_adjacent_combinator(
        self, left: XPathExpr, right: XPathExpr
    ) -> XPathExpr:
        """right is a sibling immediately after left"""
        xpath = left.join("/following-sibling::", right)
        # Select any following sibling, then require it to be the first one
        # and to carry the expected name.
        xpath.add_name_test()
        return xpath.add_condition("position() = 1")

    def xpath_indirect_adjacent_combinator(
        self, left: XPathExpr, right: XPathExpr
    ) -> XPathExpr:
        """right is a sibling after left, immediately or not"""
        return left.join("/following-sibling::", right)

    def xpath_relation_descendant_combinator(
        self, left: XPathExpr, right: XPathExpr
    ) -> XPathExpr:
        """right is a child, grand-child or further descendant of left; select left"""
        return left.join(
            "[descendant::", right, closing_combiner="]", has_inner_condition=True
        )

    def xpath_relation_child_combinator(
        self, left: XPathExpr, right: XPathExpr
    ) -> XPathExpr:
        """right is an immediate child of left; select left"""
        # has_inner_condition keeps right's predicate inside the brackets,
        # on the child, instead of applying it to left.
        return left.join(
            "[./", right, closing_combiner="]", has_inner_condition=True
        )

    def xpath_relation_direct_adjacent_combinator(
        self, left: XPathExpr, right: XPathExpr
    ) -> XPathExpr:
        """right is a sibling immediately after left; select left"""
        # Fold right's element name into its condition (quoted through
        # xpath_literal) so that a universal selector adds no name test and
        # right's own conditions are kept.
        # NOTE: right.path is not used here, so a right-hand side that itself
        # contains combinators is not fully represented.
        right.add_name_test()
        if right.condition:
            sibling_test = f"({right.condition}) and (position() = 1)"
        else:
            sibling_test = "position() = 1"
        return left.add_condition(f"following-sibling::*[{sibling_test}]")

    def xpath_relation_indirect_adjacent_combinator(
        self, left: XPathExpr, right: XPathExpr
    ) -> XPathExpr:
        """right is a sibling after left, immediately or not; select left"""
        # has_inner_condition keeps right's predicate on the sibling.
        return left.join(
            "[following-sibling::",
            right,
            closing_combiner="]",
            has_inner_condition=True,
        )

    # Function: dispatch by function/pseudo-class name

    def xpath_nth_child_function(
        self,
        xpath: XPathExpr,
        function: Function,
        last: bool = False,
        add_name_test: bool = True,
    ) -> XPathExpr:
        """Translate ``:nth-child()`` and, through the flags, its variants.

        :param last:
            Count following siblings instead of preceding ones
            (the ``:nth-last-*`` variants).
        :param add_name_test:
            When true, count siblings of any name (``*``); when false, count
            only siblings matching ``xpath.element`` (the ``*-of-type``
            variants).
        :raises:
            :class:`ExpressionError` when the arguments are not a valid
            ``an+b`` series.
        """
        try:
            a, b = parse_series(function.arguments)
        except ValueError as ex:
            raise ExpressionError(f"Invalid series: '{function.arguments!r}'") from ex

        # From https://www.w3.org/TR/css3-selectors/#structural-pseudos:
        #
        # :nth-child(an+b)
        #       an+b-1 siblings before
        #
        # :nth-last-child(an+b)
        #       an+b-1 siblings after
        #
        # :nth-of-type(an+b)
        #       an+b-1 siblings with the same expanded element name before
        #
        # :nth-last-of-type(an+b)
        #       an+b-1 siblings with the same expanded element name after
        #
        # So,
        # for :nth-child and :nth-of-type
        #
        #    count(preceding-sibling::<nodetest>) = an+b-1
        #
        # for :nth-last-child and :nth-last-of-type
        #
        #    count(following-sibling::<nodetest>) = an+b-1
        #
        # therefore,
        #    count(...) - (b-1) ≡ 0 (mod a)
        #
        # if a == 0:
        # ~~~~~~~~~~
        #    count(...) = b-1
        #
        # if a < 0:
        # ~~~~~~~~~
        #    count(...) - b +1 <= 0
        # -> count(...) <= b-1
        #
        # if a > 0:
        # ~~~~~~~~~
        #    count(...) - b +1 >= 0
        # -> count(...) >= b-1

        # work with b-1 instead
        b_min_1 = b - 1

        # early-exit condition 1:
        # ~~~~~~~~~~~~~~~~~~~~~~~
        # for a == 1, nth-*(an+b) means n+b-1 siblings before/after,
        # and since n ∈ {0, 1, 2, ...}, if b-1<=0,
        # there is always an "n" matching any number of siblings (maybe none)
        if a == 1 and b_min_1 <= 0:
            return xpath

        # early-exit condition 2:
        # ~~~~~~~~~~~~~~~~~~~~~~~
        # an+b-1 siblings with a<0 and (b-1)<0 is not possible
        if a < 0 and b_min_1 < 0:
            return xpath.add_condition("0")

        # `add_name_test` boolean is inverted and somewhat counter-intuitive:
        #
        # nth_of_type() calls nth_child(add_name_test=False)
        nodetest = "*" if add_name_test else f"{xpath.element}"

        # count siblings before or after the element
        if not last:
            siblings_count = f"count(preceding-sibling::{nodetest})"
        else:
            siblings_count = f"count(following-sibling::{nodetest})"

        # special case of fixed position: nth-*(0n+b)
        # if a == 0:
        # ~~~~~~~~~~
        #    count(***-sibling::***) = b-1
        if a == 0:
            return xpath.add_condition(f"{siblings_count} = {b_min_1}")

        expressions = []

        if a > 0:
            # siblings count, an+b-1, is always >= 0,
            # so if a>0, and (b-1)<=0, an "n" exists to satisfy this,
            # therefore, the predicate is only interesting if (b-1)>0
            if b_min_1 > 0:
                expressions.append(f"{siblings_count} >= {b_min_1}")
        else:
            # if a<0, and (b-1)<0, no "n" satisfies this,
            # this is tested above as an early exit condition
            # otherwise,
            expressions.append(f"{siblings_count} <= {b_min_1}")

        # operations modulo 1 or -1 are simpler, one only needs to verify:
        #
        # - either:
        # count(***-sibling::***) - (b-1) = n = 0, 1, 2, 3, etc.,
        #   i.e. count(***-sibling::***) >= (b-1)
        #
        # - or:
        # count(***-sibling::***) - (b-1) = -n = 0, -1, -2, -3, etc.,
        #   i.e. count(***-sibling::***) <= (b-1)
        # as we just did above.
        #
        if abs(a) != 1:
            # count(***-sibling::***) - (b-1) ≡ 0 (mod a)
            left = siblings_count

            # apply "modulo a" on 2nd term, -(b-1),
            # to simplify things like "(... +6) % -3",
            # and also make it positive with |a|
            b_neg = (-b_min_1) % abs(a)

            if b_neg != 0:
                left = f"({left} +{b_neg})"

            expressions.append(f"{left} mod {a} = 0")

        template = "(%s)" if len(expressions) > 1 else "%s"
        xpath.add_condition(
            " and ".join(template % expression for expression in expressions)
        )
        return xpath

    def xpath_nth_last_child_function(
        self, xpath: XPathExpr, function: Function
    ) -> XPathExpr:
        """Translate ``:nth-last-child()``."""
        return self.xpath_nth_child_function(xpath, function, last=True)

    def xpath_nth_of_type_function(
        self, xpath: XPathExpr, function: Function
    ) -> XPathExpr:
        """Translate ``:nth-of-type()``.

        :raises: :class:`ExpressionError` when applied to ``*``.
        """
        if xpath.element == "*":
            raise ExpressionError("*:nth-of-type() is not implemented")
        return self.xpath_nth_child_function(xpath, function, add_name_test=False)

    def xpath_nth_last_of_type_function(
        self, xpath: XPathExpr, function: Function
    ) -> XPathExpr:
        """Translate ``:nth-last-of-type()``.

        :raises: :class:`ExpressionError` when applied to ``*``.
        """
        if xpath.element == "*":
            # Name the pseudo-class that was actually used.
            raise ExpressionError("*:nth-last-of-type() is not implemented")
        return self.xpath_nth_child_function(
            xpath, function, last=True, add_name_test=False
        )

    def xpath_contains_function(
        self, xpath: XPathExpr, function: Function
    ) -> XPathExpr:
        """Translate ``:contains()``.

        :raises:
            :class:`ExpressionError` unless the argument is a single string
            or identifier.
        """
        # Defined there, removed in later drafts:
        # http://www.w3.org/TR/2001/CR-css3-selectors-20011113/#content-selectors
        if function.argument_types() not in (["STRING"], ["IDENT"]):
            raise ExpressionError(
                f"Expected a single string or ident for :contains(), got {function.arguments!r}"
            )
        value = cast(str, function.arguments[0].value)
        return xpath.add_condition(f"contains(., {self.xpath_literal(value)})")

    def xpath_lang_function(self, xpath: XPathExpr, function: Function) -> XPathExpr:
        """Translate ``:lang()`` with the XPath ``lang()`` function.

        :raises:
            :class:`ExpressionError` unless the argument is a single string
            or identifier.
        """
        if function.argument_types() not in (["STRING"], ["IDENT"]):
            raise ExpressionError(
                f"Expected a single string or ident for :lang(), got {function.arguments!r}"
            )
        value = cast(str, function.arguments[0].value)
        return xpath.add_condition(f"lang({self.xpath_literal(value)})")

    # Pseudo: dispatch by pseudo-class name

    def xpath_root_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        """Translate ``:root``: the element has no parent element."""
        return xpath.add_condition("not(parent::*)")

    # CSS immediate children (CSS ":scope > div" to XPath "child::div" or "./div")
    # Works only at the start of a selector
    # Needed to get immediate children of a processed selector in Scrapy
    # for product in response.css('.product'):
    #     description = product.css(':scope > div::text').get()
    def xpath_scope_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        """Translate ``:scope`` by adding the constant-true condition ``1``."""
        return xpath.add_condition("1")

    def xpath_first_child_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        """Translate ``:first-child``."""
        return xpath.add_condition("count(preceding-sibling::*) = 0")

    def xpath_last_child_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        """Translate ``:last-child``."""
        return xpath.add_condition("count(following-sibling::*) = 0")

    def xpath_first_of_type_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        """Translate ``:first-of-type``.

        :raises: :class:`ExpressionError` when applied to ``*``.
        """
        if xpath.element == "*":
            raise ExpressionError("*:first-of-type is not implemented")
        return xpath.add_condition(f"count(preceding-sibling::{xpath.element}) = 0")

    def xpath_last_of_type_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        """Translate ``:last-of-type``.

        :raises: :class:`ExpressionError` when applied to ``*``.
        """
        if xpath.element == "*":
            raise ExpressionError("*:last-of-type is not implemented")
        return xpath.add_condition(f"count(following-sibling::{xpath.element}) = 0")

    def xpath_only_child_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        """Translate ``:only-child``."""
        return xpath.add_condition("count(parent::*/child::*) = 1")

    def xpath_only_of_type_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        """Translate ``:only-of-type``.

        :raises: :class:`ExpressionError` when applied to ``*``.
        """
        if xpath.element == "*":
            raise ExpressionError("*:only-of-type is not implemented")
        return xpath.add_condition(f"count(parent::*/child::{xpath.element}) = 1")

    def xpath_empty_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        """Translate ``:empty``: no child element and an empty string value."""
        return xpath.add_condition("not(*) and not(string-length())")

    def pseudo_never_matches(self, xpath: XPathExpr) -> XPathExpr:
        """Common implementation for pseudo-classes that never match."""
        return xpath.add_condition("0")

    xpath_link_pseudo = pseudo_never_matches
    xpath_visited_pseudo = pseudo_never_matches
    xpath_hover_pseudo = pseudo_never_matches
    xpath_active_pseudo = pseudo_never_matches
    xpath_focus_pseudo = pseudo_never_matches
    xpath_target_pseudo = pseudo_never_matches
    xpath_enabled_pseudo = pseudo_never_matches
    xpath_disabled_pseudo = pseudo_never_matches
    xpath_checked_pseudo = pseudo_never_matches

    # Attrib: dispatch by attribute operator

    def xpath_attrib_exists(
        self, xpath: XPathExpr, name: str, value: str | None
    ) -> XPathExpr:
        """Translate ``[name]``: the attribute is present."""
        assert not value
        xpath.add_condition(name)
        return xpath

    def xpath_attrib_equals(
        self, xpath: XPathExpr, name: str, value: str | None
    ) -> XPathExpr:
        """Translate ``[name=value]``."""
        assert value is not None
        xpath.add_condition(f"{name} = {self.xpath_literal(value)}")
        return xpath

    def xpath_attrib_different(
        self, xpath: XPathExpr, name: str, value: str | None
    ) -> XPathExpr:
        """Translate the non-standard ``[name!=value]``.

        For a non-empty value, an element without the attribute matches too.
        """
        assert value is not None
        # FIXME: this seems like a weird hack...
        if value:
            xpath.add_condition(f"not({name}) or {name} != {self.xpath_literal(value)}")
        else:
            xpath.add_condition(f"{name} != {self.xpath_literal(value)}")
        return xpath

    def xpath_attrib_includes(
        self, xpath: XPathExpr, name: str, value: str | None
    ) -> XPathExpr:
        """Translate ``[name~=value]``: *value* is one of the space-separated words.

        An empty value, or one containing whitespace, never matches.
        """
        if value and is_non_whitespace(value):
            # Pad both the attribute and the word with spaces so that only
            # whole words match.
            arg = self.xpath_literal(" " + value + " ")
            xpath.add_condition(
                f"{name} and contains(concat(' ', normalize-space({name}), ' '), {arg})"
            )
        else:
            xpath.add_condition("0")
        return xpath

    def xpath_attrib_dashmatch(
        self, xpath: XPathExpr, name: str, value: str | None
    ) -> XPathExpr:
        """Translate ``[name|=value]``: equal to *value* or starting with ``value-``."""
        assert value is not None
        arg = self.xpath_literal(value)
        arg_dash = self.xpath_literal(value + "-")
        # Weird, but true...
        xpath.add_condition(
            f"{name} and ({name} = {arg} or starts-with({name}, {arg_dash}))"
        )
        return xpath

    def xpath_attrib_prefixmatch(
        self, xpath: XPathExpr, name: str, value: str | None
    ) -> XPathExpr:
        """Translate ``[name^=value]``. An empty value never matches."""
        if value:
            xpath.add_condition(
                f"{name} and starts-with({name}, {self.xpath_literal(value)})"
            )
        else:
            xpath.add_condition("0")
        return xpath

    def xpath_attrib_suffixmatch(
        self, xpath: XPathExpr, name: str, value: str | None
    ) -> XPathExpr:
        """Translate ``[name$=value]``. An empty value never matches."""
        if value:
            # Oddly there is a starts-with in XPath 1.0, but not ends-with
            xpath.add_condition(
                f"{name} and substring({name}, string-length({name})-{len(value) - 1}) = {self.xpath_literal(value)}"
            )
        else:
            xpath.add_condition("0")
        return xpath

    def xpath_attrib_substringmatch(
        self, xpath: XPathExpr, name: str, value: str | None
    ) -> XPathExpr:
        """Translate ``[name*=value]``. An empty value never matches."""
        if value:
            # Attribute selectors are case sensitive
            xpath.add_condition(
                f"{name} and contains({name}, {self.xpath_literal(value)})"
            )
        else:
            xpath.add_condition("0")
        return xpath
class HTMLTranslator(GenericTranslator):
    """
    Translator for (X)HTML documents.

    Has a more useful implementation of some pseudo-classes based on
    HTML-specific element names and attribute names, as described in
    the `HTML5 specification`_. It assumes no-quirks mode.
    The API is the same as :class:`GenericTranslator`.

    .. _HTML5 specification: http://www.w3.org/TR/html5/links.html#selectors

    :param xhtml:
        If false (the default), element names and attribute names
        are case-insensitive.

    """

    lang_attribute = "lang"

    def __init__(self, xhtml: bool = False) -> None:
        """Store *xhtml* and, unless it is set, enable name lower-casing."""
        self.xhtml = xhtml  # Might be useful for sub-classes?
        if not xhtml:
            # See their definition in GenericTranslator.
            self.lower_case_element_names = True
            self.lower_case_attribute_names = True

    def xpath_checked_pseudo(self, xpath: XPathExpr) -> XPathExpr:  # type: ignore[override]
        """Translate ``:checked``.

        Matches ``option`` elements with ``selected``, and ``input`` or
        ``command`` elements of type checkbox/radio with ``checked``.
        """
        # FIXME: is this really all the elements?
        return xpath.add_condition(
            "(@selected and name(.) = 'option') or "
            "(@checked "
            "and (name(.) = 'input' or name(.) = 'command')"
            "and (@type = 'checkbox' or @type = 'radio'))"
        )

    def xpath_lang_function(self, xpath: XPathExpr, function: Function) -> XPathExpr:
        """Translate ``:lang()`` using the :attr:`lang_attribute` attribute.

        The nearest ancestor-or-self element carrying the attribute is
        selected, and its lower-cased value (with a trailing ``-``) must
        start with the lower-cased argument followed by ``-``.

        :raises:
            :class:`ExpressionError` unless the argument is a single string
            or identifier.
        """
        if function.argument_types() not in (["STRING"], ["IDENT"]):
            raise ExpressionError(
                f"Expected a single string or ident for :lang(), got {function.arguments!r}"
            )
        value = function.arguments[0].value
        assert value
        arg = self.xpath_literal(value.lower() + "-")
        # The ancestor is selected on the same attribute that is read below,
        # so overriding ``lang_attribute`` stays consistent.
        return xpath.add_condition(
            f"ancestor-or-self::*[@{self.lang_attribute}][1][starts-with(concat("
            # XPath 1.0 has no lower-case function...
            f"translate(@{self.lang_attribute}, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', "
            "'abcdefghijklmnopqrstuvwxyz'), "
            f"'-'), {arg})]"
        )

    def xpath_link_pseudo(self, xpath: XPathExpr) -> XPathExpr:  # type: ignore[override]
        """Translate ``:link``: ``a``, ``link`` or ``area`` with ``href``."""
        return xpath.add_condition(
            "@href and (name(.) = 'a' or name(.) = 'link' or name(.) = 'area')"
        )

    # Links are never visited, the implementation for :visited is the same
    # as in GenericTranslator

    def xpath_disabled_pseudo(self, xpath: XPathExpr) -> XPathExpr:  # type: ignore[override]
        """Translate ``:disabled``.

        Matches the listed form-related elements carrying ``disabled``, and
        input/button/select/textarea elements inside a disabled ``fieldset``.
        """
        # http://www.w3.org/TR/html5/section-index.html#attributes-1
        # NOTE: the literal below is runtime text and is kept byte-for-byte.
        return xpath.add_condition(
            """
(
@disabled and
(
(name(.) = 'input' and @type != 'hidden') or
name(.) = 'button' or
name(.) = 'select' or
name(.) = 'textarea' or
name(.) = 'command' or
name(.) = 'fieldset' or
name(.) = 'optgroup' or
name(.) = 'option'
)
) or (
(
(name(.) = 'input' and @type != 'hidden') or
name(.) = 'button' or
name(.) = 'select' or
name(.) = 'textarea'
)
and ancestor::fieldset[@disabled]
)
"""
        )
        # FIXME: in the second half, add "and is not a descendant of that
        # fieldset element's first legend element child, if any."

    def xpath_enabled_pseudo(self, xpath: XPathExpr) -> XPathExpr:  # type: ignore[override]
        """Translate ``:enabled``.

        Matches links with ``href``, and the listed form-related elements
        that are not disabled themselves or through an ancestor
        ``fieldset``/``optgroup``.
        """
        # http://www.w3.org/TR/html5/section-index.html#attributes-1
        # NOTE: the literal below is runtime text and is kept byte-for-byte.
        return xpath.add_condition(
            """
(
@href and (
name(.) = 'a' or
name(.) = 'link' or
name(.) = 'area'
)
) or (
(
name(.) = 'command' or
name(.) = 'fieldset' or
name(.) = 'optgroup'
)
and not(@disabled)
) or (
(
(name(.) = 'input' and @type != 'hidden') or
name(.) = 'button' or
name(.) = 'select' or
name(.) = 'textarea' or
name(.) = 'keygen'
)
and not (@disabled or ancestor::fieldset[@disabled])
) or (
name(.) = 'option' and not(
@disabled or ancestor::optgroup[@disabled]
)
)
"""
        )
        # FIXME: ... or "li elements that are children of menu elements,
        # and that have a child element that defines a command, if the first
        # such element's Disabled State facet is false (not disabled)".
        # FIXME: after ancestor::fieldset[@disabled], add "and is not a
        # descendant of that fieldset element's first legend element child,
        # if any."