diff --git a/README.md b/README.md
index 0e2d9a7..6545bc2 100644
--- a/README.md
+++ b/README.md
@@ -24,6 +24,7 @@ You can get an introductory overview of the tool in [this article](https://mediu
   - [Rust plugin](#rust-plugin)
   - [Java plugin](#java-plugin)
   - [Julia plugin](#julia-plugin)
+  - [Lua plugin](#lua-plugin)
 - [Grammar format](#grammar-format)
   - [JSON-like notation](#json-like-notation)
   - [Yacc/Bison notation](#yaccbison-notation)
@@ -321,6 +322,21 @@ For complex Julia parser implementations it is recommended to leverage the JSON-
 }
 ```
 
+#### Lua plugin
+
+_Syntax_ supports Lua as a target language. See the [calculator example](https://github.com/DmitrySoshnikov/syntax/blob/master/examples/calc.lua.g):
+
+```
+./bin/syntax -g examples/calc.lua.g -m lalr1 -o calcparser.lua
+```
+
+The generated module can then be used as follows:
+
+```lua
+Parser = require("calcparser")
+parser = Parser.new()
+print(parser:parse("2^2^2^2")) -- 65536
+```
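+
+Operator precedence and associativity declared in the grammar's `operators` section are respected by the generated parser, for example:
+
+```lua
+print(parser:parse("2 + 2 * 2"))   -- 6
+print(parser:parse("(2 + 2) * 2")) -- 8
+```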
+ */ + constructor({ + grammar, + outputFile, + options = {}, + }) { + super({grammar, outputFile, options}) + .setTemplate(LUA_LR_PARSER_TEMPLATE); + + /** + * Contains the lexical rule handlers: _lexRule1, _lexRule2, etc. + * It's populated by the trait file. + */ + this._lexHandlers = []; + this._productionHandlers = []; + + /** + * Actual class name of your parser. Here we infer from the output filename. + */ + this._parserClassName = path.basename( + outputFile, + path.extname(outputFile), + ); + + Object.assign(this, LuaParserGeneratorTrait); + } + + /** + * Generates parser code. + */ + generateParserData() { + super.generateParserData(); + this.generateLexHandlers(); + this.generateProductionHandlers(); + this.generateParserClassName(this._parserClassName); + } +}; diff --git a/src/plugins/lua/lua-parser-generator-trait.js b/src/plugins/lua/lua-parser-generator-trait.js new file mode 100644 index 0000000..561d9a4 --- /dev/null +++ b/src/plugins/lua/lua-parser-generator-trait.js @@ -0,0 +1,202 @@ +/** + * The MIT License (MIT) + * Copyright (c) 2015-present Dmitry Soshnikov + */ + +import fs from 'fs'; + +const LUA_TOKENIZER_TEMPLATE = fs.readFileSync( + `${__dirname}/templates/tokenizer.template.lua`, + 'utf-8' +); + +const LuaParserGeneratorTrait = { + + /** + * Generates parser class name. + */ + generateParserClassName(className) { + this.writeData('PARSER_CLASS_NAME', className); + }, + + generateParseTable() { + this.writeData( + 'TABLE', + this._toLuaMap(this.generateParseTableData()), + ); + }, + + /** + * Generates tokens table in Lua Map format. + */ + generateTokensTable() { + this.writeData( + 'TOKENS', + this._toLuaMap(this._tokens), + ); + }, + + buildSemanticAction(production) { + let action = this.getSemanticActionCode(production); + + if (!action) { + return null; + } + + action += ';'; + + const args = this + .getSemanticActionParams(production) + .join(','); + + this._productionHandlers.push({args, action}); + return `_handler${this._productionHandlers.length}`; + }, + + generateProductionsData() { + return this.generateRawProductionsData() + .map(data => { + return `{ ${data.map((item, index) => { + // quote + if (index >= 2) { + return `"${item}"`; + } + return item; + }).join(',')} }`; + }); + }, + + generateBuiltInTokenizer() { + this.writeData('TOKENIZER', LUA_TOKENIZER_TEMPLATE); + }, + + generateLexRules() { + let lexRules = this._grammar.getLexGrammar().getRules().map(lexRule => { + + const action = lexRule.getRawHandler() + ';'; + + this._lexHandlers.push({args: '', action}); + + const flags = []; + + if (lexRule.isCaseInsensitive()) { + flags.push('i'); + } + + // Example: ["^\s+", "_lexRule1"], + return `{[[${lexRule.getRawMatcher()}${flags.join('')}]], ` + + `"_lexRule${this._lexHandlers.length}"}`; + }); + + this.writeData('LEX_RULES', `{ ${lexRules.join(',\n')} }`); + }, + + generateLexRulesByStartConditions() { + const lexGrammar = this._grammar.getLexGrammar(); + const lexRulesByConditions = lexGrammar.getRulesByStartConditions(); + const result = {}; + + for (const condition in lexRulesByConditions) { + result[condition] = lexRulesByConditions[condition].map(lexRule => + lexGrammar.getRuleIndex(lexRule) + ); + } + + this.writeData( + 'LEX_RULES_BY_START_CONDITIONS', + `${this._toLuaMap(result)}`, + ); + }, + + /** + * Converts JS object to Lua's table representation. + * E.g. 
+
+  /**
+   * Generates lexical rule indices grouped by start conditions.
+   */
+  generateLexRulesByStartConditions() {
+    const lexGrammar = this._grammar.getLexGrammar();
+    const lexRulesByConditions = lexGrammar.getRulesByStartConditions();
+    const result = {};
+
+    for (const condition in lexRulesByConditions) {
+      result[condition] = lexRulesByConditions[condition].map(lexRule =>
+        lexGrammar.getRuleIndex(lexRule)
+      );
+    }
+
+    this.writeData(
+      'LEX_RULES_BY_START_CONDITIONS',
+      this._toLuaMap(result),
+    );
+  },
+
+  /**
+   * Converts a JS value into Lua table notation.
+   * E.g. converts {foo: 10, bar: 20} into {foo = 10, bar = 20}.
+   */
+  _toLuaMap(value) {
+    function _toLuaMapInner(value) {
+      if (value === null) {
+        return 'nil';
+      }
+
+      if (typeof value === 'number' || typeof value === 'boolean') {
+        return value.toString();
+      }
+
+      if (typeof value === 'string') {
+        // Escape backslashes first, then quotes.
+        const escaped = value.replace(/\\/g, '\\\\').replace(/"/g, '\\"');
+        return `"${escaped}"`;
+      }
+
+      if (Array.isArray(value)) {
+        // Arrays become Lua sequences, which are 1-indexed: the templates
+        // adjust 0-based indices accordingly.
+        const items = value.map(_toLuaMapInner).join(', ');
+        return `{${items}}`;
+      }
+
+      if (typeof value === 'object') {
+        const entries = Object.entries(value).map(([k, v]) => {
+          let key;
+          if (/^\d+$/.test(k)) {
+            // Numeric keys keep their numeric value: [0] = ...
+            key = `[${k}]`;
+          } else if (/^[a-zA-Z_][a-zA-Z0-9_]*$/.test(k)) {
+            key = k;
+          } else {
+            key = `["${k}"]`;
+          }
+          return `${key} = ${_toLuaMapInner(v)}`;
+        }).join(', ');
+        return `{${entries}}`;
+      }
+
+      return 'nil'; // Fallback for undefined, functions, etc.
+    }
+
+    return _toLuaMapInner(value);
+  },
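+
+  // For instance, a (hypothetical) two-state parsing table:
+  //
+  //   this._toLuaMap({0: {38: 's1'}, 1: {38: 'acc'}})
+  //   // => '{[0] = {[38] = "s1"}, [1] = {[38] = "acc"}}'
+  //
+  // Numeric keys become explicit `[n]` entries, so the Lua runtime can
+  // index the table by state number and token code directly.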
+
+  /**
+   * Generates Lua handler functions for the lexical rules.
+   */
+  generateLexHandlers() {
+    const handlers = this._generateHandlers(
+      this._lexHandlers,
+      'Tokenizer:',
+      '_lexRule',
+    );
+    this.writeData('LEX_RULE_HANDLERS', handlers.join('\n\n'));
+  },
+
+  /**
+   * Generates Lua handler functions for the semantic actions.
+   */
+  generateProductionHandlers() {
+    const handlers = this._generateHandlers(
+      this._productionHandlers,
+      'parser:',
+      '_handler',
+    );
+    this.writeData('PRODUCTION_HANDLERS', handlers.join('\n\n'));
+  },
+
+  /**
+   * Productions array in the Lua format.
+   *
+   * An array of arrays, see `generateProductionsData` for details.
+   */
+  generateProductions() {
+    this.writeData(
+      'PRODUCTIONS',
+      `{ ${this.generateProductionsData().join(',\n')} }`
+    );
+  },
+
+  /**
+   * Injects the code passed in the `moduleInclude` directive of the
+   * grammar. The code is emitted verbatim, so for Lua targets it has
+   * to be valid Lua.
+   */
+  generateModuleInclude() {
+    this.writeData('MODULE_INCLUDE', this._grammar.getModuleInclude() || '');
+  },
+
+  /**
+   * Generates Lua function declarations for the passed handlers.
+   */
+  _generateHandlers(handlers, classPrefix, name) {
+    return handlers.map(({args, action}, index) => {
+      return `function ${classPrefix}${name}${index + 1}` +
+        `(${args})\n  ${action}\nend`;
+    });
+  },
+};
+
+module.exports = LuaParserGeneratorTrait;
\ No newline at end of file
diff --git a/src/plugins/lua/templates/lr.template.lua b/src/plugins/lua/templates/lr.template.lua
new file mode 100644
index 0000000..2f67748
--- /dev/null
+++ b/src/plugins/lua/templates/lr.template.lua
@@ -0,0 +1,265 @@
+--[[
+  LR parser in Lua generated by the Syntax tool.
+
+  https://www.npmjs.com/package/syntax-cli
+
+    npm install -g syntax-cli
+
+    syntax-cli --help
+
+  To regenerate run:
+
+    syntax-cli \
+      --grammar ~/path-to-grammar-file \
+      --mode <parsing-mode> \
+      --output ~/ParserClassName.lua
+--]]
+
+--[[
+  Usage:
+
+    Parser = require("{{{PARSER_CLASS_NAME}}}")
+    parser = Parser.new()
+    print(parser:parse("<string to parse>"))
+--]]
+
+local EOF = "$"
+
+{{{TOKENIZER}}}
+
+-- Code from the `moduleInclude` directive of the grammar.
+{{{MODULE_INCLUDE}}}
+
+local productions = {{{PRODUCTIONS}}}
+
+local parsing_table = {{{TABLE}}}
+
+-- Whether to capture start/end locations of the parsed nodes.
+local shouldCaptureLocations = {{{CAPTURE_LOCATIONS}}}
+
+-- The parsing stack.
+local stack = {}
+
+-- Result of a semantic action, and its location.
+local __ = nil
+local __loc = nil
+
+-- Merges a start and an end location into a single span.
+local function yyloc(startLoc, endLoc)
+  if not shouldCaptureLocations then
+    return nil
+  end
+
+  if not startLoc or not endLoc then
+    return startLoc or endLoc
+  end
+
+  return {
+    startOffset = startLoc.startOffset,
+    endOffset = endLoc.endOffset,
+    startLine = startLoc.startLine,
+    endLine = endLoc.endLine,
+    startColumn = startLoc.startColumn,
+    endColumn = endLoc.endColumn,
+  }
+end
+
+local function StackEntry(params)
+  return {
+    symbol = params.symbol,
+    semanticValue = params.semanticValue,
+    loc = params.loc,
+  }
+end
+
+local function yyparse()
+  local parser = {}
+
+  parser.tokenizer = Tokenizer:new()
+
+  -- Parse lifecycle hooks, can be overridden by the caller.
+  function parser:onParseBegin(str)
+  end
+
+  function parser:onParseEnd(result)
+  end
+
+  {{{PRODUCTION_HANDLERS}}}
+
+  function parser:setOptions(options)
+    if options.captureLocations ~= nil then
+      shouldCaptureLocations = options.captureLocations
+    end
+    return self
+  end
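+
+  -- E.g. to enable location tracking at runtime:
+  --
+  --   parser:setOptions({captureLocations = true})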
+
+  function parser:parse(str)
+    self:onParseBegin(str)
+
+    if not self.tokenizer then
+      error("Tokenizer instance isn't specified.")
+    end
+
+    self.tokenizer:initString(str)
+
+    stack = {0}
+
+    local token = self.tokenizer:getNextToken()
+    local shiftedToken = nil
+
+    repeat
+      if not token then
+        self:unexpectedEndOfInput()
+      end
+
+      local state = tonumber(stack[#stack])
+      local column = token.type
+      local entry = parsing_table[state][column]
+
+      if not entry then
+        self:unexpectedToken(token)
+      end
+
+      -- Shift: push the token and the next state onto the stack.
+      if entry:sub(1, 1) == "s" then
+        local loc = nil
+
+        if shouldCaptureLocations then
+          loc = {
+            startOffset = token.startOffset,
+            endOffset = token.endOffset,
+            startLine = token.startLine,
+            endLine = token.endLine,
+            startColumn = token.startColumn,
+            endColumn = token.endColumn,
+          }
+        end
+
+        table.insert(stack, StackEntry({
+          symbol = token.type,
+          semanticValue = token.value,
+          loc = loc,
+        }))
+
+        table.insert(stack, tonumber(entry:sub(2)))
+
+        shiftedToken = token
+        token = self.tokenizer:getNextToken()
+
+      -- Reduce: pop the handle, run the semantic action, and push the
+      -- result together with the goto state.
+      elseif entry:sub(1, 1) == "r" then
+        local productionNumber = tonumber(entry:sub(2))
+        local production = productions[productionNumber + 1]
+
+        local hasSemanticAction = #production > 2
+
+        local semanticValueArgs = hasSemanticAction and {} or nil
+
+        local locationArgs = (
+          hasSemanticAction and shouldCaptureLocations
+          and {} or nil
+        )
+
+        local rhsLength = tonumber(production[2])
+        while rhsLength > 0 do
+          -- Pop the state number, then the stack entry itself.
+          table.remove(stack)
+          local stackEntry = table.remove(stack)
+
+          if hasSemanticAction then
+            table.insert(semanticValueArgs, 1, stackEntry.semanticValue)
+
+            if locationArgs then
+              table.insert(locationArgs, 1, stackEntry.loc)
+            end
+          end
+
+          rhsLength = rhsLength - 1
+        end
+
+        local previousState = tonumber(stack[#stack])
+
+        -- The LHS symbol is a numeric code, used directly as the goto
+        -- column of the parsing table.
+        local symbolToReduceWith = production[1]
+
+        local reduceStackEntry = StackEntry({
+          symbol = symbolToReduceWith,
+          semanticValue = nil,
+          loc = nil,
+        })
+
+        if hasSemanticAction then
+          yytext = shiftedToken and shiftedToken.value or nil
+          yyleng = shiftedToken and #shiftedToken.value or 0
+
+          local semanticActionName = production[3]
+          local semanticActionHandler = self[semanticActionName]
+
+          -- Semantic values first, then the locations, if captured.
+          local semanticActionArgs = {}
+
+          for _, v in ipairs(semanticValueArgs) do
+            table.insert(semanticActionArgs, v)
+          end
+
+          if locationArgs then
+            for _, v in ipairs(locationArgs) do
+              table.insert(semanticActionArgs, v)
+            end
+          end
+
+          semanticActionHandler(self, table.unpack(semanticActionArgs))
+
+          reduceStackEntry.semanticValue = __
+
+          if locationArgs then
+            reduceStackEntry.loc = __loc
+          end
+        end
+
+        table.insert(stack, reduceStackEntry)
+
+        local nextState = parsing_table[previousState][symbolToReduceWith]
+        table.insert(stack, nextState)
+
+      -- Accept.
+      elseif entry == "acc" then
+        -- Pop the final state number, and the parsed entry.
+        table.remove(stack)
+        local parsed = table.remove(stack)
+
+        if #stack ~= 1 or
+            stack[#stack] ~= 0 or
+            self.tokenizer:hasMoreTokens() then
+          self:unexpectedToken(token)
+        end
+
+        local parsedValue = parsed.semanticValue
+        self:onParseEnd(parsedValue)
+
+        return parsedValue
+      end
+
+    until not (self.tokenizer:hasMoreTokens() or #stack > 1)
+
+    return nil
+  end
+
+  function parser:unexpectedToken(token)
+    -- Token types are numeric codes, so EOF is looked up via the map.
+    if token.type == tokensMap[EOF] then
+      self:unexpectedEndOfInput()
+    end
+
+    self.tokenizer:throwUnexpectedToken(
+      token.value,
+      token.startLine,
+      token.startColumn
+    )
+  end
+
+  function parser:unexpectedEndOfInput()
+    self:parseError("Unexpected end of input.")
+  end
+
+  function parser:parseError(message)
+    error(message)
+  end
+
+  return parser
+end
+
+local M = {}
+M.new = yyparse
+
+return M
\ No newline at end of file
diff --git a/src/plugins/lua/templates/tokenizer.template.lua b/src/plugins/lua/templates/tokenizer.template.lua
new file mode 100644
index 0000000..3a9b70b
--- /dev/null
+++ b/src/plugins/lua/templates/tokenizer.template.lua
@@ -0,0 +1,215 @@
+--[[
+  Generic tokenizer used by the parser in the Syntax tool.
+
+  https://www.npmjs.com/package/syntax-cli
+
+  See `--custom-tokenizer` to skip this generation, and use a custom one.
+--]]
+
+-- In older versions of Lua (5.1), `unpack` is a global function:
+-- polyfill `table.unpack` when it's not available.
+if table.unpack == nil then
+  table.unpack = unpack
+end
+
+EOF = "$"
+
+local function Token(params)
+  return {
+    type = params.type,
+    value = params.value,
+    startOffset = params.startOffset,
+    endOffset = params.endOffset,
+    startLine = params.startLine,
+    endLine = params.endLine,
+    startColumn = params.startColumn,
+    endColumn = params.endColumn,
+  }
+end
+
+tokensMap = {{{TOKENS}}}
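+
+-- Maps token names to the numeric codes used as parsing table columns.
+-- E.g. (hypothetical codes, the actual values are generated):
+--
+--   tokensMap = {["+"] = 38, NUMBER = 42, ["$"] = 43}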
+
+EOF_TOKEN = {type = tokensMap[EOF]}
+
+-- Matched text and its length, available in the lex rule handlers.
+yytext = ""
+yyleng = 0
+
+lexRules = {{{LEX_RULES}}}
+
+lexRulesByConditions = {{{LEX_RULES_BY_START_CONDITIONS}}}
+
+Tokenizer = {}
+Tokenizer.__index = Tokenizer
+
+function Tokenizer:new(tokenizingString)
+  local self = setmetatable({}, Tokenizer)
+  self:initString(tokenizingString)
+  return self
+end
+
+function Tokenizer:initString(tokenizingString)
+  self._string = tokenizingString
+  self._states = {"INITIAL"}
+  self._cursor = 0
+  self._tokensQueue = {}
+  self._currentLine = 1
+  self._currentColumn = 0
+  self._currentLineBeginOffset = 0
+  self._tokenStartOffset = 0
+  self._tokenEndOffset = 0
+  self._tokenStartLine = 1
+  self._tokenEndLine = 1
+  self._tokenStartColumn = 0
+  self._tokenEndColumn = 0
+
+  return self
+end
+
+{{{LEX_RULE_HANDLERS}}}
+
+-- --------------------------------------------
+-- States.
+
+function Tokenizer:getCurrentState()
+  return self._states[#self._states]
+end
+
+function Tokenizer:pushState(state)
+  table.insert(self._states, state)
+end
+
+function Tokenizer:begin(state)
+  self:pushState(state)
+end
+
+function Tokenizer:popState()
+  if #self._states > 1 then
+    return table.remove(self._states)
+  else
+    return self._states[1]
+  end
+end
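+
+-- Lex rule handlers can switch start conditions, e.g.:
+--
+--   self:pushState("comment")  -- enter the `comment` condition
+--   self:popState()            -- return to the previous one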
+
+-- --------------------------------------------
+-- Tokenizing.
+
+function Tokenizer:getNextToken()
+  -- Return the queued tokens first.
+  if #self._tokensQueue > 0 then
+    return self:_toToken(table.remove(self._tokensQueue, 1))
+  end
+
+  if not self:hasMoreTokens() then
+    return EOF_TOKEN
+  end
+
+  local stringRest = self._string:sub(self._cursor + 1)
+  local lexRulesForState = lexRulesByConditions[self:getCurrentState()]
+
+  for i = 1, #lexRulesForState do
+    -- Rule indices are 0-based, Lua sequences are 1-based.
+    local lexRuleIndex = lexRulesForState[i]
+    local lexRule = lexRules[lexRuleIndex + 1]
+
+    local matched = self:_match(stringRest, lexRule[1])
+
+    -- An empty match at the end of the string still advances the
+    -- cursor past the end.
+    if stringRest == '' and matched == '' then
+      self._cursor = self._cursor + 1
+    end
+
+    if matched then
+      yytext = matched
+      yyleng = #yytext
+
+      -- Handlers are defined with method syntax, so `self` is passed
+      -- explicitly here.
+      local tokenType = self[lexRule[2]](self)
+
+      -- A rule which returns no token type skips the match
+      -- (e.g. whitespace).
+      if not tokenType then
+        return self:getNextToken()
+      end
+
+      -- If a handler returns several token types, queue all but the
+      -- first one.
+      if type(tokenType) == "table" then
+        local tokensToQueue = {}
+        for j = 2, #tokenType do
+          table.insert(tokensToQueue, tokenType[j])
+        end
+        tokenType = tokenType[1]
+        for j = #tokensToQueue, 1, -1 do
+          table.insert(self._tokensQueue, 1, tokensToQueue[j])
+        end
+      end
+
+      return self:_toToken(tokenType, yytext)
+    end
+  end
+
+  if self:isEOF() then
+    self._cursor = self._cursor + 1
+    return EOF_TOKEN
+  end
+
+  self:throwUnexpectedToken(
+    stringRest:sub(1, 1),
+    self._currentLine,
+    self._currentColumn
+  )
+end
+
+function Tokenizer:throwUnexpectedToken(symbol, line, column)
+  -- Extract the offending source line for the error message.
+  local lines = {}
+  for l in self._string:gmatch("([^\n]*)\n?") do
+    table.insert(lines, l)
+  end
+  local lineSource = lines[line] or ""
+
+  local pad = string.rep(" ", column)
+  local lineData = "\n\n" .. lineSource .. "\n" .. pad .. "^\n"
+
+  error(
+    lineData .. 'Unexpected token: "' .. symbol .. '" at ' ..
+    line .. ":" .. column
+  )
+end
+
+function Tokenizer:_captureLocation(matched)
+  self._tokenStartOffset = self._cursor
+  self._tokenStartLine = self._currentLine
+  self._tokenStartColumn =
+    self._tokenStartOffset - self._currentLineBeginOffset
+
+  -- Advance the line counter for every newline in the matched text,
+  -- tracking the offset at which the last line begins.
+  local pos = 0
+  while true do
+    local nlIndex = matched:find("\n", pos + 1, true)
+    if not nlIndex then
+      break
+    end
+    self._currentLine = self._currentLine + 1
+    self._currentLineBeginOffset = self._cursor + nlIndex
+    pos = nlIndex
+  end
+
+  self._tokenEndOffset = self._cursor + #matched
+  self._tokenEndLine = self._currentLine
+  self._currentColumn = self._tokenEndOffset - self._currentLineBeginOffset
+  self._tokenEndColumn = self._currentColumn
+end
+
+function Tokenizer:_toToken(tokenType, yytext)
+  yytext = yytext or ""
+  return Token({
+    type = tokensMap[tokenType],
+    value = yytext,
+    startOffset = self._tokenStartOffset,
+    endOffset = self._tokenEndOffset,
+    startLine = self._tokenStartLine,
+    endLine = self._tokenEndLine,
+    startColumn = self._tokenStartColumn,
+    endColumn = self._tokenEndColumn,
+  })
+end
+
+function Tokenizer:isEOF()
+  return self._cursor == #self._string
+end
+
+function Tokenizer:hasMoreTokens()
+  return self._cursor <= #self._string
+end
+
+function Tokenizer:_match(stringRest, pattern)
+  -- Anchor the pattern so it only matches at the current position,
+  -- not anywhere later in the string.
+  if pattern:sub(1, 1) ~= "^" then
+    pattern = "^" .. pattern
+  end
+
+  local s, e = stringRest:find(pattern)
+  if s then
+    local matched = stringRest:sub(s, e)
+    self:_captureLocation(matched)
+    self._cursor = self._cursor + #matched
+    return matched
+  end
+
+  return nil
+end
\ No newline at end of file