Spaces:
Running
Running
/**
 * The Lexer class handles tokenizing the input in various ways. Since our
 * parser expects us to be able to backtrack, the lexer allows lexing from any
 * given starting point.
 *
 * Its main exposed function is the `lex` function, which takes a position to
 * lex from and a type of token to lex. It defers to the appropriate `_innerLex`
 * function.
 *
 * The various `_innerLex` functions perform the actual lexing of different
 * kinds.
 */
| var matchAt = require("../../match-at"); | |
| var ParseError = require("./ParseError"); | |
/**
 * The main lexer class.
 *
 * The constructor only records the source string. Every lexing method on the
 * prototype takes an explicit start position, so no cursor is stored here.
 *
 * @param {string} input The text to tokenize.
 */
function Lexer(input) {
    this._input = input;
}
/**
 * The resulting token returned from `lex`.
 *
 * @param {string} text The token text (for example a symbol, a function name,
 *     "EOF", or a single space standing in for a run of whitespace).
 * @param {?Object} data Extra parsed information, or null when there is none.
 * @param {number} position Index in the input just past the end of the token.
 */
function Token(text, data, position) {
    this.text = text;
    this.data = data;
    this.position = position;
}
/* The following tokenRegex
 * - matches typical whitespace (but not NBSP etc.) using its first group
 * - matches symbol combinations which result in a single output character
 * - does not match any control character \x00-\x1f except whitespace
 * - does not match a bare backslash
 * - matches any ASCII character except those just mentioned
 * - does not match the line/paragraph separators \u2028 and \u2029
 * - does not match the BMP private use area \uE000-\uF8FF
 * - does not match bare surrogate code units
 * - matches any BMP character except for those just described
 * - matches any valid Unicode surrogate pair
 * - matches a backslash followed by one or more letters
 * - matches a backslash followed by any BMP character, including newline
 * Everything that is not whitespace is captured by the second group.
 * Just because the Lexer matches something doesn't mean it's valid input:
 * If there is no matching function or symbol definition, the Parser will
 * still reject the input.
 */
var tokenRegex = new RegExp(
    "([ \r\n\t]+)|(" +                            // whitespace
    "---?" +                                      // special combinations
    "|[!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" + // single codepoint
    "|[\uD800-\uDBFF][\uDC00-\uDFFF]" +           // surrogate pair
    "|\\\\(?:[a-zA-Z]+|[^\uD800-\uDFFF])" +       // function name
    ")"
);

// Matches a (possibly empty) run of whitespace. Note that `\s` is broader
// than the whitespace group of tokenRegex: it also accepts NBSP and friends.
var whitespaceRegex = /\s*/;
/**
 * This function lexes a single normal token. It takes a position and
 * whether it should completely ignore whitespace or not.
 *
 * @param {number} pos Index in the input at which to start lexing.
 * @param {boolean} ignoreWhitespace When truthy, a run of whitespace at `pos`
 *     is skipped and the token after it is returned. Otherwise the run is
 *     reported as a single " " token.
 * @return {Token} The token found; its `position` is the index just past it.
 *     At the end of the input an "EOF" token is returned.
 * @throws {ParseError} If nothing in `tokenRegex` matches at `pos`.
 */
Lexer.prototype._innerLex = function(pos, ignoreWhitespace) {
    var input = this._input;
    if (pos === input.length) {
        return new Token("EOF", null, pos);
    }

    var match = matchAt(tokenRegex, input, pos);
    if (match === null) {
        throw new ParseError(
            "Unexpected character: '" + input[pos] + "'",
            this, pos);
    }

    // Group 2 holds any non-whitespace token.
    if (match[2]) {
        return new Token(match[2], null, pos + match[2].length);
    }

    // Otherwise group 1 matched a run of whitespace.
    var afterWhitespace = pos + match[1].length;
    if (ignoreWhitespace) {
        return this._innerLex(afterWhitespace, true);
    }
    // Concatenate whitespace to a single space.
    return new Token(" ", null, afterWhitespace);
};
// A regex to match a CSS color (like #ffffff or BlueViolet). It is
// case-insensitive and only checks the shape of the text: a "#" followed by
// letters/digits, or a bare run of letters.
var cssColor = /#[a-z0-9]+|[a-z]+/i;

/**
 * This function lexes a CSS color.
 *
 * Leading whitespace at `pos` is skipped before matching.
 *
 * @param {number} pos Index in the input at which to start lexing.
 * @return {Token} A token whose text is the color string and whose
 *     `position` is the index just past it.
 * @throws {ParseError} "Invalid color" if the text after the whitespace does
 *     not look like a color; the error position is past the whitespace.
 */
Lexer.prototype._innerLexColor = function(pos) {
    var input = this._input;

    // Ignore whitespace
    var start = pos + matchAt(whitespaceRegex, input, pos)[0].length;

    var match = matchAt(cssColor, input, start);
    if (!match) {
        throw new ParseError("Invalid color", this, start);
    }
    // If we look like a color, return a color
    return new Token(match[0], null, start + match[0].length);
};
// A regex to match a dimension. Dimensions look like
// "1.2em" or ".4pt" or "1 ex": an optional minus sign, a decimal number, and
// a unit made of exactly two lowercase letters, with optional whitespace
// between the parts.
var sizeRegex = /(-?)\s*(\d+(?:\.\d*)?|\.\d+)\s*([a-z]{2})/;

/**
 * This function lexes a dimension.
 *
 * Leading whitespace at `pos` is skipped before matching. The unit is not
 * validated here: any two lowercase letters are accepted.
 * NOTE(review): presumably the consumer of the token rejects unsupported
 * units - confirm against the caller.
 *
 * @param {number} pos Index in the input at which to start lexing.
 * @return {Token} A token whose text is the matched dimension, whose `data`
 *     is `{number: <signed numeric value>, unit: <two-letter unit>}`, and
 *     whose `position` is the index just past the dimension.
 * @throws {ParseError} "Invalid size" if no dimension is found; the error
 *     position is past the leading whitespace.
 */
Lexer.prototype._innerLexSize = function(pos) {
    var input = this._input;

    // Ignore whitespace
    var start = pos + matchAt(whitespaceRegex, input, pos)[0].length;

    var match = matchAt(sizeRegex, input, start);
    if (!match) {
        throw new ParseError("Invalid size", this, start);
    }

    return new Token(match[0], {
        // Sign and digits are captured separately because whitespace may
        // sit between them; glue them back together before converting.
        number: +(match[1] + match[2]),
        unit: match[3],
    }, start + match[0].length);
};
/**
 * This function lexes a string of whitespace.
 *
 * It never fails: when there is no whitespace at `pos`, the returned token
 * has empty text and its position equals `pos`.
 *
 * @param {number} pos Index in the input at which to start lexing.
 * @return {Token} A token whose text is the whole run of whitespace found at
 *     `pos` (possibly "") and whose `position` is the index just past it.
 */
Lexer.prototype._innerLexWhitespace = function(pos) {
    var input = this._input;

    // `matchAt(...)[0]` is already the matched string. Indexing it again
    // would yield only its first character (or undefined for an empty
    // match), so the string itself is used as the token text.
    var whitespace = matchAt(whitespaceRegex, input, pos)[0];

    return new Token(whitespace, null, pos + whitespace.length);
};
/**
 * This function lexes a single token starting at `pos` and of the given mode.
 * Based on the mode, we defer to one of the `_innerLex` functions.
 *
 * Modes:
 *  - "math": a normal token, skipping whitespace entirely
 *  - "text": a normal token, with a whitespace run reported as one space
 *  - "color": a CSS color
 *  - "size": a dimension
 *  - "whitespace": a run of whitespace
 *
 * @param {number} pos Index in the input at which to start lexing.
 * @param {string} mode One of the modes listed above.
 * @return {Token|undefined} The token produced by the mode-specific lexer, or
 *     undefined if `mode` is not one of the recognized modes.
 */
Lexer.prototype.lex = function(pos, mode) {
    switch (mode) {
        case "math":
            return this._innerLex(pos, true);
        case "text":
            return this._innerLex(pos, false);
        case "color":
            return this._innerLexColor(pos);
        case "size":
            return this._innerLexSize(pos);
        case "whitespace":
            return this._innerLexWhitespace(pos);
        default:
            // Unrecognized modes fall through without producing a token.
            return undefined;
    }
};
| module.exports = Lexer; | |