# Scanner produces tokens of the following types:
# STREAM-START
# STREAM-END
# DIRECTIVE(name, value)
# DOCUMENT-START
# DOCUMENT-END
# BLOCK-SEQUENCE-START
# BLOCK-MAPPING-START
# BLOCK-END
# FLOW-SEQUENCE-START
# FLOW-MAPPING-START
# FLOW-SEQUENCE-END
# FLOW-MAPPING-END
# BLOCK-ENTRY
# FLOW-ENTRY
# KEY
# VALUE
# ALIAS(value)
# ANCHOR(value)
# TAG(value)
# SCALAR(value, plain, style)
#
# Read comments in the Scanner code for more details.
#
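# A quick orientation example (illustrative only): scanning the document
# "a: 1" produces the token sequence
#     STREAM-START, BLOCK-MAPPING-START, KEY, SCALAR('a'),
#     VALUE, SCALAR('1'), BLOCK-END, STREAM-END.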
__all__ = ['Scanner', 'ScannerError']

from .error import MarkedYAMLError
from .tokens import *
class ScannerError(MarkedYAMLError):
    pass

class SimpleKey:
    # See the simple keys treatment below.

    def __init__(self, token_number, required, index, line, column, mark):
        self.token_number = token_number
        self.required = required
        self.index = index
        self.line = line
        self.column = column
        self.mark = mark
class Scanner:

    def __init__(self):
        """Initialize the scanner."""
        # It is assumed that Scanner and Reader will have a common descendant.
        # Reader does the dirty work of checking for BOM and converting the
        # input data to Unicode. It also adds NUL to the end.
        #
        # Reader supports the following methods
        #   self.peek(i=0)      # peek the next i-th character
        #   self.prefix(l=1)    # peek the next l characters
        #   self.forward(l=1)   # read the next l characters and move the pointer.

        # Have we reached the end of the stream?
        self.done = False

        # The number of unclosed '{' and '['. `flow_level == 0` means block
        # context.
        self.flow_level = 0

        # List of processed tokens that are not yet emitted.
        self.tokens = []

        # Add the STREAM-START token.
        self.fetch_stream_start()

        # Number of tokens that were emitted through the `get_token` method.
        self.tokens_taken = 0

        # The current indentation level.
        self.indent = -1

        # Past indentation levels.
        self.indents = []

        # Variables related to simple keys treatment.

        # A simple key is a key that is not denoted by the '?' indicator.
        # Example of simple keys:
        #   ---
        #   block simple key: value
        #   ? not a simple key:
        #   : { flow simple key: value }
        # We emit the KEY token before all keys, so when we find a potential
        # simple key, we try to locate the corresponding ':' indicator.
        # Simple keys should be limited to a single line and 1024 characters.

        # Can a simple key start at the current position? A simple key may
        # start:
        # - at the beginning of the line, not counting indentation spaces
        #   (in block context),
        # - after '{', '[', ',' (in the flow context),
        # - after '?', ':', '-' (in the block context).
        # In the block context, this flag also signifies if a block collection
        # may start at the current position.
        self.allow_simple_key = True

        # Keep track of possible simple keys. This is a dictionary. The key
        # is `flow_level`; there can be no more than one possible simple key
        # for each level. The value is a SimpleKey record:
        #   (token_number, required, index, line, column, mark)
        # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
        # '[', or '{' tokens.
        self.possible_simple_keys = {}
    # Public methods.

    def check_token(self, *choices):
        # Check if the next token is one of the given types.
        while self.need_more_tokens():
            self.fetch_more_tokens()
        if self.tokens:
            if not choices:
                return True
            for choice in choices:
                if isinstance(self.tokens[0], choice):
                    return True
        return False

    def peek_token(self):
        # Return the next token, but do not delete it from the queue.
        # Return None if no more tokens.
        while self.need_more_tokens():
            self.fetch_more_tokens()
        if self.tokens:
            return self.tokens[0]
        else:
            return None

    def get_token(self):
        # Return the next token.
        while self.need_more_tokens():
            self.fetch_more_tokens()
        if self.tokens:
            self.tokens_taken += 1
            return self.tokens.pop(0)
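    # A minimal usage sketch (illustrative; assumes the stock PyYAML wiring,
    # where yaml.scan() composes Scanner with a Reader over the input):
    #
    #     import yaml
    #     for token in yaml.scan('- a\n- b\n'):
    #         print(token)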
    # Private methods.

    def need_more_tokens(self):
        if self.done:
            return False
        if not self.tokens:
            return True
        # The current token may be a potential simple key, so we
        # need to look further.
        self.stale_possible_simple_keys()
        if self.next_possible_simple_key() == self.tokens_taken:
            return True
    def fetch_more_tokens(self):

        # Eat whitespaces and comments until we reach the next token.
        self.scan_to_next_token()

        # Remove obsolete possible simple keys.
        self.stale_possible_simple_keys()

        # Compare the current indentation and column. It may add some tokens
        # and decrease the current indentation level.
        self.unwind_indent(self.column)

        # Peek the next character.
        ch = self.peek()

        # Is it the end of stream?
        if ch == '\0':
            return self.fetch_stream_end()

        # Is it a directive?
        if ch == '%' and self.check_directive():
            return self.fetch_directive()

        # Is it the document start?
        if ch == '-' and self.check_document_start():
            return self.fetch_document_start()

        # Is it the document end?
        if ch == '.' and self.check_document_end():
            return self.fetch_document_end()

        # TODO: support for BOM within a stream.
        #if ch == '\uFEFF':
        #    return self.fetch_bom()    <-- issue BOMToken

        # Note: the order of the following checks is NOT significant.

        # Is it the flow sequence start indicator?
        if ch == '[':
            return self.fetch_flow_sequence_start()

        # Is it the flow mapping start indicator?
        if ch == '{':
            return self.fetch_flow_mapping_start()

        # Is it the flow sequence end indicator?
        if ch == ']':
            return self.fetch_flow_sequence_end()

        # Is it the flow mapping end indicator?
        if ch == '}':
            return self.fetch_flow_mapping_end()

        # Is it the flow entry indicator?
        if ch == ',':
            return self.fetch_flow_entry()

        # Is it the block entry indicator?
        if ch == '-' and self.check_block_entry():
            return self.fetch_block_entry()

        # Is it the key indicator?
        if ch == '?' and self.check_key():
            return self.fetch_key()

        # Is it the value indicator?
        if ch == ':' and self.check_value():
            return self.fetch_value()

        # Is it an alias?
        if ch == '*':
            return self.fetch_alias()

        # Is it an anchor?
        if ch == '&':
            return self.fetch_anchor()

        # Is it a tag?
        if ch == '!':
            return self.fetch_tag()

        # Is it a literal scalar?
        if ch == '|' and not self.flow_level:
            return self.fetch_literal()

        # Is it a folded scalar?
        if ch == '>' and not self.flow_level:
            return self.fetch_folded()

        # Is it a single quoted scalar?
        if ch == '\'':
            return self.fetch_single()

        # Is it a double quoted scalar?
        if ch == '\"':
            return self.fetch_double()

        # It must be a plain scalar then.
        if self.check_plain():
            return self.fetch_plain()

        # No? It's an error. Let's produce a nice error message.
        raise ScannerError("while scanning for the next token", None,
                "found character %r that cannot start any token" % ch,
                self.get_mark())
    # Simple keys treatment.

    def next_possible_simple_key(self):
        # Return the number of the nearest possible simple key. Actually we
        # don't need to loop through the whole dictionary. We may replace it
        # with the following code:
        #   if not self.possible_simple_keys:
        #       return None
        #   return self.possible_simple_keys[
        #           min(self.possible_simple_keys.keys())].token_number
        min_token_number = None
        for level in self.possible_simple_keys:
            key = self.possible_simple_keys[level]
            if min_token_number is None or key.token_number < min_token_number:
                min_token_number = key.token_number
        return min_token_number

    def stale_possible_simple_keys(self):
        # Remove entries that are no longer possible simple keys. According to
        # the YAML specification, simple keys
        # - should be limited to a single line,
        # - should be no longer than 1024 characters.
        # Disabling this procedure will allow simple keys of any length and
        # height (may cause problems if indentation is broken though).
        for level in list(self.possible_simple_keys):
            key = self.possible_simple_keys[level]
            if key.line != self.line  \
                    or self.index-key.index > 1024:
                if key.required:
                    raise ScannerError("while scanning a simple key", key.mark,
                            "could not find expected ':'", self.get_mark())
                del self.possible_simple_keys[level]

    def save_possible_simple_key(self):
        # The next token may start a simple key. We check if it's possible
        # and save its position. This function is called for
        #   ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.

        # Check if a simple key is required at the current position.
        required = not self.flow_level and self.indent == self.column

        # The next token might be a simple key. Let's save its number and
        # position.
        if self.allow_simple_key:
            self.remove_possible_simple_key()
            token_number = self.tokens_taken+len(self.tokens)
            key = SimpleKey(token_number, required,
                    self.index, self.line, self.column, self.get_mark())
            self.possible_simple_keys[self.flow_level] = key

    def remove_possible_simple_key(self):
        # Remove the saved possible key position at the current flow level.
        if self.flow_level in self.possible_simple_keys:
            key = self.possible_simple_keys[self.flow_level]
            if key.required:
                raise ScannerError("while scanning a simple key", key.mark,
                        "could not find expected ':'", self.get_mark())
            del self.possible_simple_keys[self.flow_level]
    # Indentation functions.

    def unwind_indent(self, column):

        ## In flow context, tokens should respect indentation.
        ## Actually the condition should be `self.indent >= column` according
        ## to the spec. But this condition will prohibit intuitively correct
        ## constructions such as
        ##   key : {
        ##   }
        #if self.flow_level and self.indent > column:
        #    raise ScannerError(None, None,
        #            "invalid indentation or unclosed '[' or '{'",
        #            self.get_mark())

        # In the flow context, indentation is ignored. We make the scanner
        # less restrictive than the specification requires.
        if self.flow_level:
            return

        # In block context, we may need to issue the BLOCK-END tokens.
        while self.indent > column:
            mark = self.get_mark()
            self.indent = self.indents.pop()
            self.tokens.append(BlockEndToken(mark, mark))

    def add_indent(self, column):
        # Check if we need to increase indentation.
        if self.indent < column:
            self.indents.append(self.indent)
            self.indent = column
            return True
        return False
    # Fetchers.

    def fetch_stream_start(self):
        # We always add STREAM-START as the first token and STREAM-END as the
        # last token.

        # Read the token.
        mark = self.get_mark()

        # Add STREAM-START.
        self.tokens.append(StreamStartToken(mark, mark,
            encoding=self.encoding))

    def fetch_stream_end(self):

        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset simple keys.
        self.remove_possible_simple_key()
        self.allow_simple_key = False
        self.possible_simple_keys = {}

        # Read the token.
        mark = self.get_mark()

        # Add STREAM-END.
        self.tokens.append(StreamEndToken(mark, mark))

        # The stream is finished.
        self.done = True

    def fetch_directive(self):

        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset simple keys.
        self.remove_possible_simple_key()
        self.allow_simple_key = False

        # Scan and add DIRECTIVE.
        self.tokens.append(self.scan_directive())

    def fetch_document_start(self):
        self.fetch_document_indicator(DocumentStartToken)

    def fetch_document_end(self):
        self.fetch_document_indicator(DocumentEndToken)

    def fetch_document_indicator(self, TokenClass):

        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset simple keys. Note that there cannot be a block collection
        # after '---'.
        self.remove_possible_simple_key()
        self.allow_simple_key = False

        # Add DOCUMENT-START or DOCUMENT-END.
        start_mark = self.get_mark()
        self.forward(3)
        end_mark = self.get_mark()
        self.tokens.append(TokenClass(start_mark, end_mark))
    def fetch_flow_sequence_start(self):
        self.fetch_flow_collection_start(FlowSequenceStartToken)

    def fetch_flow_mapping_start(self):
        self.fetch_flow_collection_start(FlowMappingStartToken)

    def fetch_flow_collection_start(self, TokenClass):

        # '[' and '{' may start a simple key.
        self.save_possible_simple_key()

        # Increase the flow level.
        self.flow_level += 1

        # Simple keys are allowed after '[' and '{'.
        self.allow_simple_key = True

        # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(TokenClass(start_mark, end_mark))

    def fetch_flow_sequence_end(self):
        self.fetch_flow_collection_end(FlowSequenceEndToken)

    def fetch_flow_mapping_end(self):
        self.fetch_flow_collection_end(FlowMappingEndToken)

    def fetch_flow_collection_end(self, TokenClass):

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Decrease the flow level.
        self.flow_level -= 1

        # No simple keys after ']' or '}'.
        self.allow_simple_key = False

        # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(TokenClass(start_mark, end_mark))

    def fetch_flow_entry(self):

        # Simple keys are allowed after ','.
        self.allow_simple_key = True

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add FLOW-ENTRY.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(FlowEntryToken(start_mark, end_mark))

    def fetch_block_entry(self):

        # Block context needs additional checks.
        if not self.flow_level:

            # Are we allowed to start a new entry?
            if not self.allow_simple_key:
                raise ScannerError(None, None,
                        "sequence entries are not allowed here",
                        self.get_mark())

            # We may need to add BLOCK-SEQUENCE-START.
            if self.add_indent(self.column):
                mark = self.get_mark()
                self.tokens.append(BlockSequenceStartToken(mark, mark))

        # It's an error for the block entry to occur in the flow context,
        # but we let the parser detect this.
        else:
            pass

        # Simple keys are allowed after '-'.
        self.allow_simple_key = True

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add BLOCK-ENTRY.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(BlockEntryToken(start_mark, end_mark))
    def fetch_key(self):

        # Block context needs additional checks.
        if not self.flow_level:

            # Are we allowed to start a key (not necessarily a simple one)?
            if not self.allow_simple_key:
                raise ScannerError(None, None,
                        "mapping keys are not allowed here",
                        self.get_mark())

            # We may need to add BLOCK-MAPPING-START.
            if self.add_indent(self.column):
                mark = self.get_mark()
                self.tokens.append(BlockMappingStartToken(mark, mark))

        # Simple keys are allowed after '?' in the block context.
        self.allow_simple_key = not self.flow_level

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add KEY.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(KeyToken(start_mark, end_mark))
    def fetch_value(self):

        # Do we determine a simple key?
        if self.flow_level in self.possible_simple_keys:

            # Add KEY.
            key = self.possible_simple_keys[self.flow_level]
            del self.possible_simple_keys[self.flow_level]
            self.tokens.insert(key.token_number-self.tokens_taken,
                    KeyToken(key.mark, key.mark))

            # If this key starts a new block mapping, we need to add
            # BLOCK-MAPPING-START.
            if not self.flow_level:
                if self.add_indent(key.column):
                    self.tokens.insert(key.token_number-self.tokens_taken,
                            BlockMappingStartToken(key.mark, key.mark))

            # There cannot be two simple keys one after another.
            self.allow_simple_key = False

        # It must be a part of a complex key.
        else:

            # Block context needs additional checks.
            # (Do we really need them? They will be caught by the parser
            # anyway.)
            if not self.flow_level:

                # We are allowed to start a complex value if and only if
                # we can start a simple key.
                if not self.allow_simple_key:
                    raise ScannerError(None, None,
                            "mapping values are not allowed here",
                            self.get_mark())

            # If this value starts a new block mapping, we need to add
            # BLOCK-MAPPING-START. It will be detected as an error later by
            # the parser.
            if not self.flow_level:
                if self.add_indent(self.column):
                    mark = self.get_mark()
                    self.tokens.append(BlockMappingStartToken(mark, mark))

        # Simple keys are allowed after ':' in the block context.
        self.allow_simple_key = not self.flow_level

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add VALUE.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(ValueToken(start_mark, end_mark))
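    # For example (illustrative): while scanning "a: 1", the scalar 'a' is
    # first recorded only as a *possible* simple key; when the ':' is reached,
    # fetch_value() retroactively inserts the KEY token (and, here, the
    # BLOCK-MAPPING-START token) at the saved position in the token queue.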
    def fetch_alias(self):

        # ALIAS could be a simple key.
        self.save_possible_simple_key()

        # No simple keys after ALIAS.
        self.allow_simple_key = False

        # Scan and add ALIAS.
        self.tokens.append(self.scan_anchor(AliasToken))

    def fetch_anchor(self):

        # ANCHOR could start a simple key.
        self.save_possible_simple_key()

        # No simple keys after ANCHOR.
        self.allow_simple_key = False

        # Scan and add ANCHOR.
        self.tokens.append(self.scan_anchor(AnchorToken))

    def fetch_tag(self):

        # TAG could start a simple key.
        self.save_possible_simple_key()

        # No simple keys after TAG.
        self.allow_simple_key = False

        # Scan and add TAG.
        self.tokens.append(self.scan_tag())

    def fetch_literal(self):
        self.fetch_block_scalar(style='|')

    def fetch_folded(self):
        self.fetch_block_scalar(style='>')

    def fetch_block_scalar(self, style):

        # A simple key may follow a block scalar.
        self.allow_simple_key = True

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Scan and add SCALAR.
        self.tokens.append(self.scan_block_scalar(style))

    def fetch_single(self):
        self.fetch_flow_scalar(style='\'')

    def fetch_double(self):
        self.fetch_flow_scalar(style='"')

    def fetch_flow_scalar(self, style):

        # A flow scalar could be a simple key.
        self.save_possible_simple_key()

        # No simple keys after flow scalars.
        self.allow_simple_key = False

        # Scan and add SCALAR.
        self.tokens.append(self.scan_flow_scalar(style))

    def fetch_plain(self):

        # A plain scalar could be a simple key.
        self.save_possible_simple_key()

        # No simple keys after plain scalars. But note that `scan_plain` will
        # change this flag if the scan is finished at the beginning of the
        # line.
        self.allow_simple_key = False

        # Scan and add SCALAR. May change `allow_simple_key`.
        self.tokens.append(self.scan_plain())
    # Checkers.

    def check_directive(self):

        # DIRECTIVE:        ^ '%' ...
        # The '%' indicator is already checked.
        if self.column == 0:
            return True

    def check_document_start(self):

        # DOCUMENT-START:   ^ '---' (' '|'\n')
        if self.column == 0:
            if self.prefix(3) == '---'  \
                    and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
                return True

    def check_document_end(self):

        # DOCUMENT-END:     ^ '...' (' '|'\n')
        if self.column == 0:
            if self.prefix(3) == '...'  \
                    and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
                return True

    def check_block_entry(self):

        # BLOCK-ENTRY:      '-' (' '|'\n')
        return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029'

    def check_key(self):

        # KEY(flow context):    '?'
        if self.flow_level:
            return True

        # KEY(block context):   '?' (' '|'\n')
        else:
            return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029'

    def check_value(self):

        # VALUE(flow context):  ':'
        if self.flow_level:
            return True

        # VALUE(block context): ':' (' '|'\n')
        else:
            return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029'

    def check_plain(self):

        # A plain scalar may start with any non-space character except:
        #   '-', '?', ':', ',', '[', ']', '{', '}',
        #   '#', '&', '*', '!', '|', '>', '\'', '\"',
        #   '%', '@', '`'.
        #
        # It may also start with
        #   '-', '?', ':'
        # if it is followed by a non-space character.
        #
        # Note that we limit the last rule to the block context (except the
        # '-' character) because we want the flow context to be space
        # independent.
        ch = self.peek()
        return ch not in '\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`'  \
                or (self.peek(1) not in '\0 \t\r\n\x85\u2028\u2029'
                        and (ch == '-' or (not self.flow_level and ch in '?:')))
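    # A few illustrative cases for the rule above: in block context, "-foo"
    # and ":bar" start plain scalars ('-'/':' followed by a non-space), while
    # "- foo" is a block entry and ": bar" a value indicator; in flow context,
    # "?x" and ":y" do not start plain scalars.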
    # Scanners.

    def scan_to_next_token(self):
        # We ignore spaces, line breaks and comments.
        # If we find a line break in the block context, we set the flag
        # `allow_simple_key` on.
        # The byte order mark is stripped if it's the first character in the
        # stream. We do not yet support BOM inside the stream as the
        # specification requires. Any such mark will be considered as a part
        # of the document.
        #
        # TODO: We need to make tab handling rules more sane. A good rule is:
        #   Tabs cannot precede tokens
        #   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
        #   KEY(block), VALUE(block), BLOCK-ENTRY
        # So the checking code is
        #   if <TAB>:
        #       self.allow_simple_keys = False
        # We also need to add the check for `allow_simple_keys == True` to
        # `unwind_indent` before issuing BLOCK-END.
        # Scanners for block, flow, and plain scalars need to be modified.

        if self.index == 0 and self.peek() == '\uFEFF':
            self.forward()
        found = False
        while not found:
            while self.peek() == ' ':
                self.forward()
            if self.peek() == '#':
                while self.peek() not in '\0\r\n\x85\u2028\u2029':
                    self.forward()
            if self.scan_line_break():
                if not self.flow_level:
                    self.allow_simple_key = True
            else:
                found = True
    def scan_directive(self):
        # See the specification for details.
        start_mark = self.get_mark()
        self.forward()
        name = self.scan_directive_name(start_mark)
        value = None
        if name == 'YAML':
            value = self.scan_yaml_directive_value(start_mark)
            end_mark = self.get_mark()
        elif name == 'TAG':
            value = self.scan_tag_directive_value(start_mark)
            end_mark = self.get_mark()
        else:
            end_mark = self.get_mark()
            while self.peek() not in '\0\r\n\x85\u2028\u2029':
                self.forward()
        self.scan_directive_ignored_line(start_mark)
        return DirectiveToken(name, value, start_mark, end_mark)

    def scan_directive_name(self, start_mark):
        # See the specification for details.
        length = 0
        ch = self.peek(length)
        while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z'  \
                or ch in '-_':
            length += 1
            ch = self.peek(length)
        if not length:
            raise ScannerError("while scanning a directive", start_mark,
                    "expected alphabetic or numeric character, but found %r"
                    % ch, self.get_mark())
        value = self.prefix(length)
        self.forward(length)
        ch = self.peek()
        if ch not in '\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected alphabetic or numeric character, but found %r"
                    % ch, self.get_mark())
        return value

    def scan_yaml_directive_value(self, start_mark):
        # See the specification for details.
        while self.peek() == ' ':
            self.forward()
        major = self.scan_yaml_directive_number(start_mark)
        if self.peek() != '.':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected a digit or '.', but found %r" % self.peek(),
                    self.get_mark())
        self.forward()
        minor = self.scan_yaml_directive_number(start_mark)
        if self.peek() not in '\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected a digit or ' ', but found %r" % self.peek(),
                    self.get_mark())
        return (major, minor)

    def scan_yaml_directive_number(self, start_mark):
        # See the specification for details.
        ch = self.peek()
        if not ('0' <= ch <= '9'):
            raise ScannerError("while scanning a directive", start_mark,
                    "expected a digit, but found %r" % ch, self.get_mark())
        length = 0
        while '0' <= self.peek(length) <= '9':
            length += 1
        value = int(self.prefix(length))
        self.forward(length)
        return value

    def scan_tag_directive_value(self, start_mark):
        # See the specification for details.
        while self.peek() == ' ':
            self.forward()
        handle = self.scan_tag_directive_handle(start_mark)
        while self.peek() == ' ':
            self.forward()
        prefix = self.scan_tag_directive_prefix(start_mark)
        return (handle, prefix)

    def scan_tag_directive_handle(self, start_mark):
        # See the specification for details.
        value = self.scan_tag_handle('directive', start_mark)
        ch = self.peek()
        if ch != ' ':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected ' ', but found %r" % ch, self.get_mark())
        return value

    def scan_tag_directive_prefix(self, start_mark):
        # See the specification for details.
        value = self.scan_tag_uri('directive', start_mark)
        ch = self.peek()
        if ch not in '\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected ' ', but found %r" % ch, self.get_mark())
        return value

    def scan_directive_ignored_line(self, start_mark):
        # See the specification for details.
        while self.peek() == ' ':
            self.forward()
        if self.peek() == '#':
            while self.peek() not in '\0\r\n\x85\u2028\u2029':
                self.forward()
        ch = self.peek()
        if ch not in '\0\r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected a comment or a line break, but found %r"
                    % ch, self.get_mark())
        self.scan_line_break()
    def scan_anchor(self, TokenClass):
        # The specification does not restrict characters for anchors and
        # aliases. This may lead to problems, for instance, the document:
        #   [ *alias, value ]
        # can be interpreted in two ways, as
        #   [ "value" ]
        # and
        #   [ *alias , "value" ]
        # Therefore we restrict aliases to numbers and ASCII letters.
        start_mark = self.get_mark()
        indicator = self.peek()
        if indicator == '*':
            name = 'alias'
        else:
            name = 'anchor'
        self.forward()
        length = 0
        ch = self.peek(length)
        while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z'  \
                or ch in '-_':
            length += 1
            ch = self.peek(length)
        if not length:
            raise ScannerError("while scanning an %s" % name, start_mark,
                    "expected alphabetic or numeric character, but found %r"
                    % ch, self.get_mark())
        value = self.prefix(length)
        self.forward(length)
        ch = self.peek()
        if ch not in '\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
            raise ScannerError("while scanning an %s" % name, start_mark,
                    "expected alphabetic or numeric character, but found %r"
                    % ch, self.get_mark())
        end_mark = self.get_mark()
        return TokenClass(value, start_mark, end_mark)

    def scan_tag(self):
        # See the specification for details.
        start_mark = self.get_mark()
        ch = self.peek(1)
        if ch == '<':
            handle = None
            self.forward(2)
            suffix = self.scan_tag_uri('tag', start_mark)
            if self.peek() != '>':
                raise ScannerError("while parsing a tag", start_mark,
                        "expected '>', but found %r" % self.peek(),
                        self.get_mark())
            self.forward()
        elif ch in '\0 \t\r\n\x85\u2028\u2029':
            handle = None
            suffix = '!'
            self.forward()
        else:
            length = 1
            use_handle = False
            while ch not in '\0 \r\n\x85\u2028\u2029':
                if ch == '!':
                    use_handle = True
                    break
                length += 1
                ch = self.peek(length)
            if use_handle:
                handle = self.scan_tag_handle('tag', start_mark)
            else:
                handle = '!'
                self.forward()
            suffix = self.scan_tag_uri('tag', start_mark)
        ch = self.peek()
        if ch not in '\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a tag", start_mark,
                    "expected ' ', but found %r" % ch, self.get_mark())
        value = (handle, suffix)
        end_mark = self.get_mark()
        return TagToken(value, start_mark, end_mark)
    def scan_block_scalar(self, style):
        # See the specification for details.

        if style == '>':
            folded = True
        else:
            folded = False

        chunks = []
        start_mark = self.get_mark()

        # Scan the header.
        self.forward()
        chomping, increment = self.scan_block_scalar_indicators(start_mark)
        self.scan_block_scalar_ignored_line(start_mark)

        # Determine the indentation level and go to the first non-empty line.
        min_indent = self.indent+1
        if min_indent < 1:
            min_indent = 1
        if increment is None:
            breaks, max_indent, end_mark = self.scan_block_scalar_indentation()
            indent = max(min_indent, max_indent)
        else:
            indent = min_indent+increment-1
            breaks, end_mark = self.scan_block_scalar_breaks(indent)
        line_break = ''

        # Scan the inner part of the block scalar.
        while self.column == indent and self.peek() != '\0':
            chunks.extend(breaks)
            leading_non_space = self.peek() not in ' \t'
            length = 0
            while self.peek(length) not in '\0\r\n\x85\u2028\u2029':
                length += 1
            chunks.append(self.prefix(length))
            self.forward(length)
            line_break = self.scan_line_break()
            breaks, end_mark = self.scan_block_scalar_breaks(indent)
            if self.column == indent and self.peek() != '\0':

                # Unfortunately, folding rules are ambiguous.
                #
                # This is the folding according to the specification:

                if folded and line_break == '\n'  \
                        and leading_non_space and self.peek() not in ' \t':
                    if not breaks:
                        chunks.append(' ')
                else:
                    chunks.append(line_break)

                # This is Clark Evans's interpretation (also in the spec
                # examples):
                #
                #if folded and line_break == '\n':
                #    if not breaks:
                #        if self.peek() not in ' \t':
                #            chunks.append(' ')
                #        else:
                #            chunks.append(line_break)
                #else:
                #    chunks.append(line_break)

            else:
                break

        # Chomp the tail.
        if chomping is not False:
            chunks.append(line_break)
        if chomping is True:
            chunks.extend(breaks)

        # We are done.
        return ScalarToken(''.join(chunks), False, start_mark, end_mark,
                style)
    def scan_block_scalar_indicators(self, start_mark):
        # See the specification for details.
        chomping = None
        increment = None
        ch = self.peek()
        if ch in '+-':
            if ch == '+':
                chomping = True
            else:
                chomping = False
            self.forward()
            ch = self.peek()
            if ch in '0123456789':
                increment = int(ch)
                if increment == 0:
                    raise ScannerError("while scanning a block scalar", start_mark,
                            "expected indentation indicator in the range 1-9, but found 0",
                            self.get_mark())
                self.forward()
        elif ch in '0123456789':
            increment = int(ch)
            if increment == 0:
                raise ScannerError("while scanning a block scalar", start_mark,
                        "expected indentation indicator in the range 1-9, but found 0",
                        self.get_mark())
            self.forward()
            ch = self.peek()
            if ch in '+-':
                if ch == '+':
                    chomping = True
                else:
                    chomping = False
                self.forward()
        ch = self.peek()
        if ch not in '\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a block scalar", start_mark,
                    "expected chomping or indentation indicators, but found %r"
                    % ch, self.get_mark())
        return chomping, increment
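    # For example (illustrative): the header "|2+" yields
    # (chomping=True, increment=2); ">-" yields (chomping=False,
    # increment=None); a bare "|" yields (None, None), i.e. default clipping
    # and auto-detected indentation.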
    def scan_block_scalar_ignored_line(self, start_mark):
        # See the specification for details.
        while self.peek() == ' ':
            self.forward()
        if self.peek() == '#':
            while self.peek() not in '\0\r\n\x85\u2028\u2029':
                self.forward()
        ch = self.peek()
        if ch not in '\0\r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a block scalar", start_mark,
                    "expected a comment or a line break, but found %r" % ch,
                    self.get_mark())
        self.scan_line_break()

    def scan_block_scalar_indentation(self):
        # See the specification for details.
        chunks = []
        max_indent = 0
        end_mark = self.get_mark()
        while self.peek() in ' \r\n\x85\u2028\u2029':
            if self.peek() != ' ':
                chunks.append(self.scan_line_break())
                end_mark = self.get_mark()
            else:
                self.forward()
                if self.column > max_indent:
                    max_indent = self.column
        return chunks, max_indent, end_mark

    def scan_block_scalar_breaks(self, indent):
        # See the specification for details.
        chunks = []
        end_mark = self.get_mark()
        while self.column < indent and self.peek() == ' ':
            self.forward()
        while self.peek() in '\r\n\x85\u2028\u2029':
            chunks.append(self.scan_line_break())
            end_mark = self.get_mark()
            while self.column < indent and self.peek() == ' ':
                self.forward()
        return chunks, end_mark
    def scan_flow_scalar(self, style):
        # See the specification for details.
        # Note that we loosen the indentation rules for quoted scalars. Quoted
        # scalars don't need to adhere to indentation because " and ' clearly
        # mark their beginning and end. Therefore we are less restrictive
        # than the specification requires. We only need to check that
        # document separators are not included in scalars.
        if style == '"':
            double = True
        else:
            double = False
        chunks = []
        start_mark = self.get_mark()
        quote = self.peek()
        self.forward()
        chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
        while self.peek() != quote:
            chunks.extend(self.scan_flow_scalar_spaces(double, start_mark))
            chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
        self.forward()
        end_mark = self.get_mark()
        return ScalarToken(''.join(chunks), False, start_mark, end_mark,
                style)
    ESCAPE_REPLACEMENTS = {
        '0':    '\0',
        'a':    '\x07',
        'b':    '\x08',
        't':    '\x09',
        '\t':   '\x09',
        'n':    '\x0A',
        'v':    '\x0B',
        'f':    '\x0C',
        'r':    '\x0D',
        'e':    '\x1B',
        ' ':    '\x20',
        '\"':   '\"',
        '\\':   '\\',
        '/':    '/',
        'N':    '\x85',
        '_':    '\xA0',
        'L':    '\u2028',
        'P':    '\u2029',
    }

    ESCAPE_CODES = {
        'x':    2,
        'u':    4,
        'U':    8,
    }
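    # For example (illustrative): in a double-quoted scalar, '\t' and '\x09'
    # both decode to a TAB; '\x41' is read as the 2 hex digits '41' and
    # decoded via chr(0x41) == 'A', and '\u263A' likewise via chr(0x263A).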
    def scan_flow_scalar_non_spaces(self, double, start_mark):
        # See the specification for details.
        chunks = []
        while True:
            length = 0
            while self.peek(length) not in '\'\"\\\0 \t\r\n\x85\u2028\u2029':
                length += 1
            if length:
                chunks.append(self.prefix(length))
                self.forward(length)
            ch = self.peek()
            if not double and ch == '\'' and self.peek(1) == '\'':
                chunks.append('\'')
                self.forward(2)
            elif (double and ch == '\'') or (not double and ch in '\"\\'):
                chunks.append(ch)
                self.forward()
            elif double and ch == '\\':
                self.forward()
                ch = self.peek()
                if ch in self.ESCAPE_REPLACEMENTS:
                    chunks.append(self.ESCAPE_REPLACEMENTS[ch])
                    self.forward()
                elif ch in self.ESCAPE_CODES:
                    length = self.ESCAPE_CODES[ch]
                    self.forward()
                    for k in range(length):
                        if self.peek(k) not in '0123456789ABCDEFabcdef':
                            raise ScannerError("while scanning a double-quoted scalar", start_mark,
                                    "expected escape sequence of %d hexadecimal numbers, but found %r" %
                                        (length, self.peek(k)), self.get_mark())
                    code = int(self.prefix(length), 16)
                    chunks.append(chr(code))
                    self.forward(length)
                elif ch in '\r\n\x85\u2028\u2029':
                    self.scan_line_break()
                    chunks.extend(self.scan_flow_scalar_breaks(double, start_mark))
                else:
                    raise ScannerError("while scanning a double-quoted scalar", start_mark,
                            "found unknown escape character %r" % ch, self.get_mark())
            else:
                return chunks
    def scan_flow_scalar_spaces(self, double, start_mark):
        # See the specification for details.
        chunks = []
        length = 0
        while self.peek(length) in ' \t':
            length += 1
        whitespaces = self.prefix(length)
        self.forward(length)
        ch = self.peek()
        if ch == '\0':
            raise ScannerError("while scanning a quoted scalar", start_mark,
                    "found unexpected end of stream", self.get_mark())
        elif ch in '\r\n\x85\u2028\u2029':
            line_break = self.scan_line_break()
            breaks = self.scan_flow_scalar_breaks(double, start_mark)
            if line_break != '\n':
                chunks.append(line_break)
            elif not breaks:
                chunks.append(' ')
            chunks.extend(breaks)
        else:
            chunks.append(whitespaces)
        return chunks

    def scan_flow_scalar_breaks(self, double, start_mark):
        # See the specification for details.
        chunks = []
        while True:
            # Instead of checking indentation, we check for document
            # separators.
            prefix = self.prefix(3)
            if (prefix == '---' or prefix == '...')  \
                    and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
                raise ScannerError("while scanning a quoted scalar", start_mark,
                        "found unexpected document separator", self.get_mark())
            while self.peek() in ' \t':
                self.forward()
            if self.peek() in '\r\n\x85\u2028\u2029':
                chunks.append(self.scan_line_break())
            else:
                return chunks
    def scan_plain(self):
        # See the specification for details.
        # We add an additional restriction for the flow context:
        #   plain scalars in the flow context cannot contain ',' or '?'.
        # We also keep track of the `allow_simple_key` flag here.
        # Indentation rules are loosened for the flow context.
        chunks = []
        start_mark = self.get_mark()
        end_mark = start_mark
        indent = self.indent+1
        # We allow zero indentation for scalars, but then we need to check for
        # document separators at the beginning of the line.
        #if indent == 0:
        #    indent = 1
        spaces = []
        while True:
            length = 0
            if self.peek() == '#':
                break
            while True:
                ch = self.peek(length)
                if ch in '\0 \t\r\n\x85\u2028\u2029'  \
                        or (ch == ':' and
                            self.peek(length+1) in '\0 \t\r\n\x85\u2028\u2029'
                                + (',[]{}' if self.flow_level else ''))  \
                        or (self.flow_level and ch in ',?[]{}'):
                    break
                length += 1
            if length == 0:
                break
            self.allow_simple_key = False
            chunks.extend(spaces)
            chunks.append(self.prefix(length))
            self.forward(length)
            end_mark = self.get_mark()
            spaces = self.scan_plain_spaces(indent, start_mark)
            if not spaces or self.peek() == '#'  \
                    or (not self.flow_level and self.column < indent):
                break
        return ScalarToken(''.join(chunks), True, start_mark, end_mark)
    def scan_plain_spaces(self, indent, start_mark):
        # See the specification for details.
        # The specification is really confusing about tabs in plain scalars.
        # We just forbid them completely. Do not use tabs in YAML!
        chunks = []
        length = 0
        while self.peek(length) in ' ':
            length += 1
        whitespaces = self.prefix(length)
        self.forward(length)
        ch = self.peek()
        if ch in '\r\n\x85\u2028\u2029':
            line_break = self.scan_line_break()
            self.allow_simple_key = True
            prefix = self.prefix(3)
            if (prefix == '---' or prefix == '...')  \
                    and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
                return
            breaks = []
            while self.peek() in ' \r\n\x85\u2028\u2029':
                if self.peek() == ' ':
                    self.forward()
                else:
                    breaks.append(self.scan_line_break())
                    prefix = self.prefix(3)
                    if (prefix == '---' or prefix == '...')  \
                            and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
                        return
            if line_break != '\n':
                chunks.append(line_break)
            elif not breaks:
                chunks.append(' ')
            chunks.extend(breaks)
        elif whitespaces:
            chunks.append(whitespaces)
        return chunks
    def scan_tag_handle(self, name, start_mark):
        # See the specification for details.
        # For some strange reason, the specification does not allow '_' in
        # tag handles. I have allowed it anyway.
        ch = self.peek()
        if ch != '!':
            raise ScannerError("while scanning a %s" % name, start_mark,
                    "expected '!', but found %r" % ch, self.get_mark())
        length = 1
        ch = self.peek(length)
        if ch != ' ':
            while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z'  \
                    or ch in '-_':
                length += 1
                ch = self.peek(length)
            if ch != '!':
                self.forward(length)
                raise ScannerError("while scanning a %s" % name, start_mark,
                        "expected '!', but found %r" % ch, self.get_mark())
            length += 1
        value = self.prefix(length)
        self.forward(length)
        return value
    def scan_tag_uri(self, name, start_mark):
        # See the specification for details.
        # Note: we do not check if URI is well-formed.
        chunks = []
        length = 0
        ch = self.peek(length)
        while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z'  \
                or ch in '-;/?:@&=+$,_.!~*\'()[]%':
            if ch == '%':
                chunks.append(self.prefix(length))
                self.forward(length)
                length = 0
                chunks.append(self.scan_uri_escapes(name, start_mark))
            else:
                length += 1
            ch = self.peek(length)
        if length:
            chunks.append(self.prefix(length))
            self.forward(length)
            length = 0
        if not chunks:
            raise ScannerError("while parsing a %s" % name, start_mark,
                    "expected URI, but found %r" % ch, self.get_mark())
        return ''.join(chunks)

    def scan_uri_escapes(self, name, start_mark):
        # See the specification for details.
        codes = []
        mark = self.get_mark()
        while self.peek() == '%':
            self.forward()
            for k in range(2):
                if self.peek(k) not in '0123456789ABCDEFabcdef':
                    raise ScannerError("while scanning a %s" % name, start_mark,
                            "expected URI escape sequence of 2 hexadecimal numbers, but found %r"
                            % self.peek(k), self.get_mark())
            codes.append(int(self.prefix(2), 16))
            self.forward(2)
        try:
            value = bytes(codes).decode('utf-8')
        except UnicodeDecodeError as exc:
            raise ScannerError("while scanning a %s" % name, start_mark, str(exc), mark)
        return value
    def scan_line_break(self):
        # Transforms:
        #   '\r\n'      :   '\n'
        #   '\r'        :   '\n'
        #   '\n'        :   '\n'
        #   '\x85'      :   '\n'
        #   '\u2028'    :   '\u2028'
        #   '\u2029'    :   '\u2029'
        #   default     :   ''
        ch = self.peek()
        if ch in '\r\n\x85':
            if self.prefix(2) == '\r\n':
                self.forward(2)
            else:
                self.forward()
            return '\n'
        elif ch in '\u2028\u2029':
            self.forward()
            return ch
        return ''
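    # For example (illustrative): given the input '\r\nfoo',
    # scan_line_break() consumes both break characters and returns '\n';
    # given '\u2028foo' it consumes one character and returns '\u2028'
    # unchanged; on any other character it consumes nothing and returns ''.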