Add white-space productions to the Selectors grammar implementation

amn · amn · commit b7613f68c059 · 2024-09-29T18:32:41.000+02:00
Missing white-space handling broke parsing of selectors, and there were no selector tests in place to help uncover the issue. This adds handling of white-space through explicit references in the grammar (so that parsing procedures don't have to be amended), to match the specified behaviour (including the behaviour that the specification defines in prose).
diff --git a/README.md b/README.md
@@ -71,7 +71,7 @@ Parsing is offered only in the form of Python modules — no "command-line" prog
 
 ### Why?
 
-We wanted a "transparent" CSS parser — one that one could be used in different configurations without it imposing limitations that would strictly speaking go beyond parsing. Put differently, we wanted a parser that does not assume any particular application, a software _library_ in the classical sense of the term, or a true _API_ if you will.
+We wanted a "transparent" CSS parser — one that could be used in different configurations without it imposing limitations that would strictly speaking go beyond parsing. Put differently, we wanted a parser that does not assume any particular application, a software _library_ in the classical sense of the term, or a true _API_ if you will.
 
 For instance, the popular [Less](http://lesscss.org) software seems to rather effortlessly parse CSS [3] text, but it invariably re-arranges white-space in the output, without giving the user any control over the latter. Less is not _transparent_ like that — there is no way to use it with recovery of the originally parsed text from the parse tree — parsing with Less is a one-way street for at least _some_ applications (specifically those that "transform" CSS but need to preserve all of the original input as-is).
 
diff --git a/expand-macros.py b/expand-macros.py
@@ -2,7 +2,7 @@
 
 Macro processing refers here to eager rewriting/replacement/substitution of Python code constructs decorated with the "syntactic" (no definition available normally, when the containing module is imported) decorator `macro`. The purpose of such processing is to implement the equivalent to what is usually called "pre-processing" for e.g. C/C++ language(s). As `macro`-decorated procedures (only decorating of procedures is currently effectively supported for `macro`) are encountered during processing of Python code, the entire procedure is removed and "unparsed" equivalent of the series of AST statements it returned, are inserted in its place instead.
 
-This implements powerful and "semantically-aware" code pre-processing mechanism, for situations demanding it. Our immediate need with this was to allow type checkers like MyPy to be able to analyze as much of the project's Python code as possible, which these are normally unable to do in cases of so-called dynamically created types (and consequently object(s) of such types). And so instead of living with effectively uncheckable dynamic types created with the `type` built-in -- for e.g. `Token` subclasses -- we employ _pre-processing_ of Python code into Python code which lends to type-checking, a benefit we deemed to ba a "must-have" for the project.
+This implements a powerful and "semantically-aware" code pre-processing mechanism, for situations demanding it. Our immediate need with this was to allow type checkers like MyPy to be able to analyze as much of the project's Python code as possible, which these are normally unable to do in cases of so-called dynamically created types (and consequently object(s) of such types). And so instead of living with effectively uncheckable dynamic types created with the `type` built-in -- e.g. `Token` subclasses -- we employ _pre-processing_ of Python code into Python code which lends itself to type-checking, a benefit we deemed to be a "must-have" for the project.
 """
 
 import ast
diff --git a/src/csspring/selectors.py b/src/csspring/selectors.py
@@ -11,7 +11,8 @@
 from .syntax.tokenizing import Token, BadStringToken, BadURLToken, CloseBraceToken, CloseBracketToken, CloseParenToken, ColonToken, DelimToken, FunctionToken, HashToken, IdentToken, OpenBraceToken, OpenBracketToken, OpenParenToken, StringToken
 
 from .syntax.grammar import any_value
-from .values import Production, AlternativesProduction, CommaSeparatedRepetitionProduction, ConcatenationProduction, NonEmptyProduction, OptionalProduction, ReferenceProduction, RepetitionProduction, TokenProduction
+from .values import Production, AlternativesProduction, CommaSeparatedRepetitionProduction, ConcatenationProduction, NonEmptyProduction, OptionalProduction, ReferenceProduction, RepetitionProduction, TokenProduction, OWS
+from .utils import intersperse
 
 from functools import singledispatch
 from typing import cast
@@ -67,29 +68,6 @@ def parse_any_value(input: TokenStream) -> Product | None:
     else:
         return None
 
-@parse.register
-def _(production: CommaSeparatedRepetitionProduction, input: TokenStream) -> Product | None:
-    """Variant of `parse` for productions of the `#` multiplier variety (see https://drafts.csswg.org/css-values-4/#mult-comma)."""
-    result: list[Product | Token] = []
-    input.mark()
-    while True:
-        value: Product | Token | None
-        if result:
-            value = parse(production.delimiter, input)
-            if value is None:
-                break
-            result.append(value)
-        value = parse(production.element, input)
-        if value is None:
-            break
-        result.append(value)
-    if result:
-        input.discard_mark()
-        return result
-    else:
-        input.restore_mark()
-        return None
-
 @parse.register
 def _(production: ConcatenationProduction, input: TokenStream) -> Product | None:
     """Variant of `parse` for productions of the ` ` combinator variety (see "juxtaposing components" at https://drafts.csswg.org/css-values-4/#component-combinators)."""
@@ -106,7 +84,7 @@ def _(production: ConcatenationProduction, input: TokenStream) -> Product | None
 @parse.register
 def _(production: NonEmptyProduction, input: TokenStream) -> Product | None:
     """Variant of `parse` for productions of the `!` multiplier variety (see https://drafts.csswg.org/css-values-4/#mult-req)."""
-    result = cast(Product, parse(production.element, input)) # The element of a non-empty production is concatenation, and the `parse` overload for `ConcatenationProduction` never returns a `Token`, only `Product | None`
+    result = cast(Product | None, parse(production.element, input)) # The element of a non-empty production is concatenation, and the `parse` overload for `ConcatenationProduction` never returns a `Token`, only `Product | None`
     if result and any(tokens(result)):
         return result
     else:
@@ -126,9 +104,21 @@ def _(production: RepetitionProduction, input: TokenStream) -> Product | None:
     result: list[Product | Token] = []
     input.mark()
     while True:
+        if result and production.separator:
+            input.mark()
+            separator = parse(production.separator, input)
+            if separator is None:
+                input.restore_mark()
+                break
         value = parse(production.element, input)
         if value is None:
+            if result and production.separator:
+                input.restore_mark()
             break
+        if result and production.separator:
+            assert separator is not None
+            result.append(separator)
+            input.discard_mark()
         result.append(value)
         if len(result) == production.max:
             break
@@ -157,13 +147,19 @@ def parse_selector_list(input: TokenStream) -> Product | None:
 
     Parsing of selector lists is the _reason d'etre_ for this module and this is the [convenience] procedure that exposes the feature.
     """
-    return cast(Product | None, parse(grammar.selector_list, input))
+    return cast(Product | None, parse(ConcatenationProduction(OWS, grammar.selector_list, OWS), input))
 
 class Grammar:
     """The grammar defining the language of selector list expressions.
 
     Normally a grammar would be defined as a set of rules (for deriving productions), where each rule would feature a component to the left side of the `->` operator (the "rewriting" operator) and a component to the right side of the operator. Owing to relative simplicity of the Selectors grammar -- where the left-hand side component is always a production name _reference_ (an identifying factor of context free grammars), we leverage Python's meta-programming facilities and use class attribute assignment statements to define the rules instead, where the assigned value is the right side of the rule, an arbitrary production (which may be an opaque value). Each attribute of the grammar is assigned the corresponding name automatically, owing to the `__set_name__` dunder method of the common production (super)class (where appropriate).
 
+    NOTE: Some of the productions defined in the specification have been rewritten below to eliminate repetition. Each rewritten production is marked with a trailing `# Rewritten` comment, for clarity.
+
+    NOTE: `intersperse` is used to insert white-space productions where the specification requires them; the grammar in the specification doesn't include them explicitly, and instead describes white-space handling "in prose".
+
+    NOTE: The Values & Units specification defines no notation for a `RepetitionProduction` whose `separator` attribute value is anything other than `None` (the '[ ... ]*' variant) or the comma separator of `CommaSeparatedRepetitionProduction` (the '[ ... ]#' variant). Productions with other separators are nevertheless employed below, to eliminate repetition as part of optimizing the grammar.
+
     Implements http://drafts.csswg.org/selectors-4/#grammar.
     """
     ns_prefix = ConcatenationProduction(OptionalProduction(AlternativesProduction(TokenProduction(IdentToken), TokenProduction(DelimToken, value='*'))), TokenProduction(DelimToken, value='|'))
@@ -173,26 +169,17 @@ class Grammar:
     class_selector = ConcatenationProduction(TokenProduction(DelimToken, value='.'), TokenProduction(IdentToken))
     attr_matcher = ConcatenationProduction(OptionalProduction(AlternativesProduction(*(TokenProduction(DelimToken, value=value) for value in ('~', '|', '^', '$', '*')))), TokenProduction(DelimToken, value='='))
     attr_modifier = AlternativesProduction(*(TokenProduction(DelimToken, value=value) for value in ('i', 's')))
-    attribute_selector = AlternativesProduction(ConcatenationProduction(TokenProduction(OpenBracketToken), ReferenceProduction(wq_name), TokenProduction(CloseBracketToken)), ConcatenationProduction(TokenProduction(OpenBracketToken), ReferenceProduction(wq_name), ReferenceProduction(attr_matcher), AlternativesProduction(TokenProduction(StringToken), TokenProduction(IdentToken)), OptionalProduction(ReferenceProduction(attr_modifier)), TokenProduction(CloseBracketToken)))
+    attribute_selector = ConcatenationProduction(*intersperse(TokenProduction(OpenBracketToken), ReferenceProduction(wq_name), OptionalProduction(ConcatenationProduction(*intersperse(ReferenceProduction(attr_matcher), AlternativesProduction(TokenProduction(StringToken), TokenProduction(IdentToken)), OptionalProduction(ReferenceProduction(attr_modifier)), separator=OWS))), TokenProduction(CloseBracketToken), separator=OWS)) # Rewritten
     legacy_pseudo_element_selector = ConcatenationProduction(TokenProduction(ColonToken), AlternativesProduction(*(TokenProduction(IdentToken, value=value) for value in ('before', 'after', 'first-line', 'first-letter'))))
-    pseudo_class_selector = AlternativesProduction(ConcatenationProduction(TokenProduction(ColonToken), TokenProduction(IdentToken)), ConcatenationProduction(TokenProduction(ColonToken), TokenProduction(FunctionToken), ReferenceProduction(any_value), TokenProduction(CloseParenToken)))
+    pseudo_class_selector = ConcatenationProduction(TokenProduction(ColonToken), AlternativesProduction(TokenProduction(IdentToken), ConcatenationProduction(TokenProduction(FunctionToken), ReferenceProduction(any_value), TokenProduction(CloseParenToken)))) # Rewritten
     pseudo_element_selector = AlternativesProduction(ConcatenationProduction(TokenProduction(ColonToken), ReferenceProduction(pseudo_class_selector)), ReferenceProduction(legacy_pseudo_element_selector))
     pseudo_compound_selector = ConcatenationProduction(ReferenceProduction(pseudo_element_selector), RepetitionProduction(ReferenceProduction(pseudo_class_selector)))
     subclass_selector = AlternativesProduction(ReferenceProduction(id_selector), ReferenceProduction(class_selector), ReferenceProduction(attribute_selector), ReferenceProduction(pseudo_class_selector))
     compound_selector = NonEmptyProduction(ConcatenationProduction(OptionalProduction(ReferenceProduction(type_selector)), RepetitionProduction(ReferenceProduction(subclass_selector))))
     complex_selector_unit = NonEmptyProduction(ConcatenationProduction(OptionalProduction(ReferenceProduction(compound_selector)), RepetitionProduction(ReferenceProduction(pseudo_compound_selector))))
     combinator = AlternativesProduction(*(TokenProduction(DelimToken, value=value) for value in ('>', '+', '~')), ConcatenationProduction(*(TokenProduction(DelimToken, value=value) for value in ('|', '|'))))
-    complex_selector = ConcatenationProduction(ReferenceProduction(complex_selector_unit), RepetitionProduction(ConcatenationProduction(OptionalProduction(ReferenceProduction(combinator)), ReferenceProduction(complex_selector_unit))))
+    complex_selector = RepetitionProduction(ReferenceProduction(complex_selector_unit), min=1, separator=AlternativesProduction(ConcatenationProduction(OWS, ReferenceProduction(combinator), OWS), OWS)) # Rewritten
     complex_selector_list = CommaSeparatedRepetitionProduction(ReferenceProduction(complex_selector))
     selector_list = ReferenceProduction(complex_selector_list)
-    complex_real_selector = ConcatenationProduction(ReferenceProduction(compound_selector), RepetitionProduction(ConcatenationProduction(OptionalProduction(ReferenceProduction(combinator)), ReferenceProduction(compound_selector))))
-    complex_real_selector_list = CommaSeparatedRepetitionProduction(ReferenceProduction(complex_real_selector))
-    compound_selector_list = CommaSeparatedRepetitionProduction(ReferenceProduction(compound_selector))
-    simple_selector = AlternativesProduction(ReferenceProduction(type_selector), ReferenceProduction(subclass_selector))
-    simple_selector_list = CommaSeparatedRepetitionProduction(ReferenceProduction(simple_selector))
-    relative_selector = ConcatenationProduction(OptionalProduction(ReferenceProduction(combinator)), ReferenceProduction(complex_selector))
-    relative_selector_list = CommaSeparatedRepetitionProduction(ReferenceProduction(relative_selector))
-    relative_real_selector = ConcatenationProduction(OptionalProduction(ReferenceProduction(combinator)), ReferenceProduction(complex_real_selector))
-    relative_real_selector_list = CommaSeparatedRepetitionProduction(ReferenceProduction(relative_real_selector))
 
 grammar = Grammar()
diff --git a/src/csspring/syntax/parsing.py b/src/csspring/syntax/parsing.py
@@ -411,7 +411,8 @@ def consume_list_of_component_values(input: Input, *, stop_token: type[Token | N
 
 def consume_simple_block(input: Input, *, to: Appender[SimpleBlock]) -> SimpleBlock:
     """Implements http://drafts.csswg.org/css-syntax/#consume-simple-block."""
-    assert isinstance(token := input.next_token(), (OpenBraceToken, OpenBracketToken, OpenParenToken))
+    token = input.next_token()
+    assert isinstance(token, (OpenBraceToken, OpenBracketToken, OpenParenToken))
     ending_token = token.mirror_type
     block = SimpleBlock()
     consume_token(input, to=block)
diff --git a/src/csspring/values.py b/src/csspring/values.py
@@ -80,21 +80,25 @@ class RepetitionProduction(Production):
 
 	Implements the `*` notation as defined at http://drafts.csswg.org/css-values-4/#mult-zero-plus.
 	"""
+	separator: Production | None = None
 	element: Production
 	min: int
 	max: int | None
-	def __init__(self, element: Production, min: int = 0, max: int | None = None):
+	def __init__(self, element: Production, min: int = 0, max: int | None = None, *, separator: Production | None = None):
 		"""
 		:param element: The production expressing the repeating part of this production
 		:param min: The minimum amount of times the parser must accept input, i.e. the minimum number of repetitions of token sequences accepted by the parser
 		:param max: The maximum amount of times the parser will be called, i.e. the maximum number of repetitions that may be consumed in the input; the value of `None` implies no maximum (i.e. no upper bound on repetition)
+		:param separator: A production expressing the "delimiting" part between any two repetitions of the `element` production; if omitted or `None`, there's _no_ delimiting part -- repetitions are _adjacent_
 		"""
 		assert min >= 0
 		assert max is None or max > 0
 		assert max is None or min <= max
 		self.min = min
 		self.max = max
 		self.element = element
+		if separator:
+			self.separator = separator
 
 class OptionalProduction(RepetitionProduction):
 	"""Class of productions equivalent to `RepetitionProduction` with no lower bound and accepting no repetition of the element, meaning the element is expressed at most once.
@@ -117,22 +121,18 @@ def __init__(self, type: builtins.type[Token], **attributes):
 		self.type = type
 		self.attributes = attributes
 
+OWS = optional_whitespace = RepetitionProduction(TokenProduction(WhitespaceToken))
 whitespace = RepetitionProduction(TokenProduction(WhitespaceToken), min=1) # The white-space production; presence of white-space expressed with this production, is _mandatory_ (`min=1`); the definition was "hoisted" here because a) it depends on `RepetitionProduction` and `TokenProduction` definitions, which must thus precede it, and b) because the `CommaSeparatedRepetitionParser` definition that follows, depends on it, in turn
 
-class CommaSeparatedRepetitionProduction(Production):
+class CommaSeparatedRepetitionProduction(RepetitionProduction):
 	"""Class of productions that express a non-empty comma-separated repetition (CSR) of a production element.
 
-	Unlike `RepetitionProduction` which permits arbitrary number of the production element, this class does not currently implement arbitrary repetition bounds. The delimiting part (a comma optionally surrounded by white-space) is mandatory, which implies at least one repetition (two expressions of the element). Disregarding the delimiting behaviour, productions of this class thus behave like those of `RepetitionProduction` with `2` for `min` and `None` for `max` property values.
-
 	Implements the `#` notation as defined at http://drafts.csswg.org/css-values-4/#mult-comma.
 	"""
-	delimiter = ConcatenationProduction(OptionalProduction(AlternativesProduction(whitespace, TokenProduction(CommentToken))), TokenProduction(CommaToken), OptionalProduction(AlternativesProduction(whitespace, TokenProduction(CommentToken)))) # The production expressing the delimiter to use with the repetition, a comma with [optional] white-space around it
-	element: Production
-	def __init__(self, element: Production):
-		"""
-		:param element: A production to use for expressing the repeating part in this production
-		"""
-		self.element = element
+	separator = ConcatenationProduction(OWS, TokenProduction(CommaToken), OWS) # A comma with [optional] white-space around it
+	def __init__(self, element: Production, min: int = 1, max: int | None = None):
+		assert min >= 1 # "one or more times" (ref. definition); the spec. does not define whether a minimum of zero is permitted, so we err on the safer side
+		super().__init__(element, min, max)
 
 class Formatter:
 	"""Class of objects that offer procedures for serializing productions into streams of text formatted per the [value definition syntax](https://drafts.csswg.org/css-values-4/#value-defs)."""