Spaces:

opendatalab
/

CDM-Demo

Running

App Files Files Community

CDM-Demo / modules /tokenize_latex /third_party /katex /src /Lexer.js

wufan's picture

Upload 47 files

685cc58 verified about 1 year ago

history blame contribute delete

5.3 kB

	/**
	* The Lexer class handles tokenizing the input in various ways. Since our
	* parser expects us to be able to backtrack, the lexer allows lexing from any
	* given starting point.
	*
	* Its main exposed function is the `lex` function, which takes a position to
	* lex from and a type of token to lex. It defers to the appropriate `_innerLex`
	* function.
	*
	* The various `_innerLex` functions perform the actual lexing of different
	* kinds.
	*/

	var matchAt = require("../../match-at");

	var ParseError = require("./ParseError");

	/**
	 * The main lexer class. It only stores the text to be tokenized; no
	 * scanning position is kept on the instance, because every lexing call
	 * receives the position to start from as an argument.
	 *
	 * @param {string} input The source text to tokenize.
	 */
	function Lexer(input) {
	// Read back by each of the `_innerLex*` methods.
	this._input = input;
	}

	/**
	 * The resulting token returned from `lex`.
	 *
	 * @param {string} text The token's text. `_innerLex` uses "EOF" at the end
	 *     of the input and a single " " for a collapsed run of whitespace.
	 * @param {?Object} data Extra payload. The lexer passes `null` except for
	 *     size tokens, which carry `{number, unit}`.
	 * @param {number} position Index in the input just past the token, i.e.
	 *     the place from which the next token would be lexed.
	 */
	function Token(text, data, position) {
	this.text = text;
	this.data = data;
	this.position = position;
	}

	/* The following tokenRegex
	 * - matches typical whitespace (but not NBSP etc.) using its first group
	 * - matches symbol combinations which result in a single output character
	 * - does not match any control character \x00-\x1f except whitespace
	 * - does not match a bare backslash
	 * - matches any ASCII character except those just mentioned
	 * - does not match the BMP private use area \uE000-\uF8FF
	 * - does not match bare surrogate code units
	 * - matches any BMP character except for those just described
	 * - matches any valid Unicode surrogate pair
	 * - matches a backslash followed by one or more letters
	 * - matches a backslash followed by any BMP character, including newline
	 *
	 * Capture groups: group 1 holds a run of whitespace, group 2 holds any
	 * non-whitespace token. Exactly one of the two is set on a match.
	 *
	 * Just because the Lexer matches something doesn't mean it's valid input:
	 * If there is no matching function or symbol definition, the Parser will
	 * still reject the input.
	 *
	 * The pattern is assembled from string pieces, so "|" needs no escaping
	 * here; only backslashes meant for the regex engine are doubled.
	 */
	var tokenRegex = new RegExp(
	    "([ \r\n\t]+)|(" +                                // whitespace
	    "---?" +                                          // special combinations
	    "|[!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" +  // single codepoint
	    "|[\uD800-\uDBFF][\uDC00-\uDFFF]" +               // surrogate pair
	    "|\\\\(?:[a-zA-Z]+|[^\uD800-\uDFFF])" +           // function name
	    ")"
	);

	// Matches a possibly empty run of whitespace. Because `*` accepts the
	// empty string, the pattern can always match, which is why the callers
	// below index `[0]` on the match result without a null check.
	// Note that `\s` is broader than the whitespace group of tokenRegex: it
	// also covers characters such as NBSP.
	var whitespaceRegex = /\s*/;

	/**
	 * Lexes one ordinary token beginning at `pos`.
	 *
	 * @param {number} pos Index in the input at which to start.
	 * @param {boolean} ignoreWhitespace When true, a run of whitespace at `pos`
	 *     is skipped and the token after it is returned; when false, the run
	 *     is reported as a single " " token.
	 * @return {Token} The token found, or an "EOF" token when `pos` is at the
	 *     end of the input.
	 * @throws {ParseError} If nothing in tokenRegex matches at `pos`.
	 */
	Lexer.prototype._innerLex = function(pos, ignoreWhitespace) {
	    var source = this._input;

	    // Nothing left to read: hand back the end-of-input marker.
	    if (pos === source.length) {
	        return new Token("EOF", null, pos);
	    }

	    var found = matchAt(tokenRegex, source, pos);
	    if (found === null) {
	        throw new ParseError(
	            "Unexpected character: '" + source[pos] + "'",
	            this, pos);
	    }

	    // Group 2 is set for every non-whitespace token.
	    var symbol = found[2];
	    if (symbol) {
	        return new Token(symbol, null, pos + symbol.length);
	    }

	    // Otherwise group 1 matched a run of whitespace. Either step over it
	    // and lex again, or collapse the whole run into one space.
	    var afterBlank = pos + found[1].length;
	    return ignoreWhitespace ?
	        this._innerLex(afterBlank, true) :
	        new Token(" ", null, afterBlank);
	};

	// A regex to match a CSS color: either a "#" followed by letters/digits
	// (like #ffffff) or a bare color name (like BlueViolet). The "|" must be
	// an unescaped alternation; escaping it would demand a literal pipe
	// character in the input and reject every real color.
	var cssColor = /#[a-z0-9]+|[a-z]+/i;

	/**
	 * Lexes a CSS color, skipping any whitespace in front of it.
	 *
	 * @param {number} pos Index in the input at which to start.
	 * @return {Token} A token whose text is the color and whose position is
	 *     just past it.
	 * @throws {ParseError} "Invalid color" if cssColor does not match at the
	 *     first non-whitespace position.
	 */
	Lexer.prototype._innerLexColor = function(pos) {
	    var source = this._input;

	    // Step over leading whitespace; the match may be empty.
	    var start = pos + matchAt(whitespaceRegex, source, pos)[0].length;

	    var found = matchAt(cssColor, source, start);
	    if (!found) {
	        throw new ParseError("Invalid color", this, start);
	    }

	    // Anything that looks like a color is returned as one.
	    var color = found[0];
	    return new Token(color, null, start + color.length);
	};

	// A regex to match a dimension. Dimensions look like
	// "1.2em" or ".4pt" or "1 ex".
	//   group 1: optional minus sign
	//   group 2: the unsigned number ("1", "1.", "1.25" or ".4")
	//   group 3: the two-letter unit
	// Whitespace is optional after the sign and between number and unit.
	var sizeRegex = /(-?)\s*(\d+(?:\.\d*)?|\.\d+)\s*([a-z]{2})/;

	/**
	 * Lexes a dimension (a signed number followed by a unit), skipping any
	 * whitespace in front of it.
	 *
	 * @param {number} pos Index in the input at which to start.
	 * @return {Token} A token whose text is the whole matched dimension and
	 *     whose data is `{number, unit}`, with `number` already converted to
	 *     a JavaScript number including its sign.
	 * @throws {ParseError} "Invalid size" if sizeRegex does not match at the
	 *     first non-whitespace position.
	 */
	Lexer.prototype._innerLexSize = function(pos) {
	    var source = this._input;

	    // Step over leading whitespace; the match may be empty.
	    var start = pos + matchAt(whitespaceRegex, source, pos)[0].length;

	    var found = matchAt(sizeRegex, source, start);
	    if (!found) {
	        throw new ParseError("Invalid size", this, start);
	    }

	    var sign = found[1];
	    var magnitude = found[2];
	    // The unit is not validated here. A check restricting units to "em"
	    // and "ex" used to sit at this point but is disabled, so whatever
	    // sizeRegex captured as the unit is passed through.
	    var unit = found[3];

	    return new Token(found[0], {
	        number: +(sign + magnitude),
	        unit: unit,
	    }, start + found[0].length);
	};

	/**
	 * Lexes a string of whitespace starting at `pos`.
	 *
	 * @param {number} pos Index in the input at which to start.
	 * @return {Token} A token positioned just past the whitespace run. Its
	 *     text is only the first character of that run, and is `undefined`
	 *     when there is no whitespace at `pos`.
	 */
	Lexer.prototype._innerLexWhitespace = function(pos) {
	    // The match may be empty, in which case the position does not move.
	    var run = matchAt(whitespaceRegex, this._input, pos)[0];
	    var firstChar = run[0];

	    return new Token(firstChar, null, pos + run.length);
	};

	/**
	 * Lexes a single token starting at `pos`, delegating to the `_innerLex*`
	 * method that corresponds to `mode`.
	 *
	 * @param {number} pos Index in the input at which to start.
	 * @param {string} mode One of "math" (whitespace skipped), "text"
	 *     (whitespace collapsed to a single space token), "color", "size" or
	 *     "whitespace".
	 * @return {Token|undefined} The token produced by the selected method.
	 *     For any other mode nothing is lexed and `undefined` is returned.
	 */
	Lexer.prototype.lex = function(pos, mode) {
	    switch (mode) {
	        case "math":
	            return this._innerLex(pos, true);
	        case "text":
	            return this._innerLex(pos, false);
	        case "color":
	            return this._innerLexColor(pos);
	        case "size":
	            return this._innerLexSize(pos);
	        case "whitespace":
	            return this._innerLexWhitespace(pos);
	    }
	};

	module.exports = Lexer;