From 3d6e3b3103b32971edd6832e0c35a0880209ba83 Mon Sep 17 00:00:00 2001 From: Twan van Laarhoven Date: Tue, 12 May 2020 02:09:10 +0200 Subject: [PATCH] Use iterators instead of string positions --- src/script/parser.cpp | 253 +++++++++++++++++++++--------------------- src/util/string.hpp | 8 ++ 2 files changed, 135 insertions(+), 126 deletions(-) diff --git a/src/script/parser.cpp b/src/script/parser.cpp index 3e9cdff3..03b44603 100644 --- a/src/script/parser.cpp +++ b/src/script/parser.cpp @@ -42,7 +42,7 @@ struct Token { TokenType type; String value; bool newline; ///< Is there a newline between this token and the previous one? - size_t pos; ///< Start position of the token + String::const_iterator pos; ///< Start position of the token inline bool operator == (TokenType t) const { return type == t; } inline bool operator != (TokenType t) const { return type != t; } @@ -61,7 +61,7 @@ enum OpenBrace /// Iterator over a string, one token at a time. /** Also stores errors found when tokenizing or parsing */ class TokenIterator { - public: +public: TokenIterator(const String& str, Packaged* package, bool string_mode, vector& errors); /// Peek at the next token, doesn't move to the one after that @@ -76,37 +76,26 @@ class TokenIterator { */ void putBack(); - /// Get a section of source code - /** Known problems: does not work correctly when crossing an include-file border */ - String getSourceCode(size_t start, size_t end); /// Get the current line number int getLineNumber(); - private: - String input; - size_t pos; - String filename; ///< Filename of include files, "" for the main input - Packaged* package; ///< Package the input is from - vector buffer; ///< buffer of unread tokens, front() = current - stack open_braces; ///< braces/quotes we entered from script mode - bool newline; ///< Did we just pass a newline? - // more input? - struct MoreInput { - String input; - size_t pos; - String filename; - Packaged* package; - }; - stack more; ///< Read tokens from here when we are done with the current input +private: + String::const_iterator pos; + const String::const_iterator begin, end; + String filename; ///< Filename of include files, "" for the main input + vector buffer; ///< buffer of unread tokens, front() = current + stack open_braces; ///< braces/quotes we entered from script mode + bool newline; ///< Did we just pass a newline? /// Add a token to the buffer, with the current newline value, resets newline - void addToken(TokenType type, const String& value, size_t start); + void addToken(TokenType type, const String& value, String::const_iterator start); /// Read the next token, and add it to the buffer void readToken(); - /// Read the next token which is a string (after the opening ") - void readStringToken(); + /// Read the next token, which is a string + void readStringToken(bool string_mode = false); - public: +public: + Packaged* package; ///< Package the input is from /// All errors found vector& errors; /// Add an error message @@ -117,7 +106,7 @@ class TokenIterator { // ----------------------------------------------------------------------------- : Characters -bool isUnicodeAlpha(Char c) { +bool isUnicodeAlpha(wxUniChar c) { #if defined(__WXMSW__) return false; #else @@ -126,19 +115,21 @@ bool isUnicodeAlpha(Char c) { #endif } -bool isAlpha_(Char c) { return isAlpha(c) || c==_('_'); } -bool isAlnum_(Char c) { return isAlnum(c) || c==_('_') || isUnicodeAlpha(c); } -bool isOper (Char c) { return wxStrchr(_("+-*/!.@%^&:=<>;,"),c) != nullptr; } -bool isLparen(Char c) { return c==_('(') || c==_('[') || c==_('{'); } -bool isRparen(Char c) { return c==_(')') || c==_(']') || c==_('}'); } -bool isDigitOrDot(Char c) { return isDigit(c) || c==_('.'); } +bool isAlpha_(wxUniChar c) { return isAlpha(c) || c==_('_'); } +bool isAlnum_(wxUniChar c) { return isAlnum(c) || c==_('_') || isUnicodeAlpha(c); } +bool isOper (wxUniChar c) { return wxStrchr(_("+-*/!.@%^&:=<>;,"),c) != nullptr; } +bool isLparen(wxUniChar c) { return c==_('(') || c==_('[') || c==_('{'); } +bool isRparen(wxUniChar c) { return c==_(')') || c==_(']') || c==_('}'); } +bool isDigitOrDot(wxUniChar c) { return isDigit(c) || c==_('.'); } bool isLongOper(const String& s) { return s==_(":=") || s==_("==") || s==_("!=") || s==_("<=") || s==_(">="); } +// moveme // ----------------------------------------------------------------------------- : Tokenizing TokenIterator::TokenIterator(const String& str, Packaged* package, bool string_mode, vector& errors) - : input(str) - , pos(0) + : pos(str.begin()) + , begin(str.begin()) + , end(str.end()) , package(package) , newline(false) , errors(errors) @@ -146,7 +137,7 @@ TokenIterator::TokenIterator(const String& str, Packaged* package, bool string_m if (string_mode) { open_braces.push(BRACE_STRING_MODE); putBack();//dummy - readStringToken(); + readStringToken(true); } } @@ -164,78 +155,65 @@ const Token& TokenIterator::read() { } void TokenIterator::putBack() { - // Don't use addToken, because it canges newline + // Don't use addToken, because it changes newline // Also, we want to push_front Token t = {TOK_DUMMY, _(""), false}; buffer.insert(buffer.begin(), t); } -void TokenIterator::addToken(TokenType type, const String& value, size_t start) { +void TokenIterator::addToken(TokenType type, const String& value, String::const_iterator start) { Token t = {type, value, newline, start}; buffer.push_back(t); newline = false; } void TokenIterator::readToken() { - if (pos >= input.size()) { - // done with input, is there more? - if (!more.empty()) { - input = more.top().input; - pos = more.top().pos; - filename = more.top().filename; - package = more.top().package; - more.pop(); - } else { - // EOF - addToken(TOK_EOF, _("end of input"), input.size()); - } + if (pos == end) { + addToken(TOK_EOF, "end of input", pos); return; } // read a character from the input - Char c = input.GetChar(pos++); + auto c = *pos; if (c == _('\n')) { + ++pos; newline = true; } else if (isSpace(c)) { + ++pos; // ignore - } else if (is_substr(input, pos-1, _("include file:"))) { - // include a file - // HACK: This is not really the right place for it, but it's the best I've got - // filename - pos += 12; // "nclude file:" - size_t eol = input.find_first_of(_("\r\n"), pos); - if (eol == String::npos) eol = input.size(); - String include_file = trim(input.substr(pos, eol - pos)); - // store the current input for later retrieval - MoreInput m = {input, eol, filename, package}; - more.push(m); - // read the entire file, and start at the beginning of it - pos = 0; - filename = include_file; - auto stream = package_manager.openFileFromPackage(package, include_file); - eat_utf8_bom(*stream); - input = read_utf8_line(*stream, true); + } else if (is_substr(pos, end, "include file:")) { + pos += 13; // "include file:" + const char* newlines = "\r\n"; + auto eol = find_first_of(pos,end, newlines,newlines+2); + String include_file = trim(String(pos, eol)); + // include_file("filename") + addToken(TOK_NAME, "include_file", pos - 13); + addToken(TOK_LPAREN, "(", pos); + addToken(TOK_STRING, include_file, pos); + addToken(TOK_RPAREN, ")", eol); + pos = eol; } else if (isAlpha(c) || isUnicodeAlpha(c) || c == _('_') || (isDigit(c) && !buffer.empty() && buffer.back() == _("."))) { // name, or a number after a . token, as in array.0 - size_t start = pos - 1; - while (pos < input.size() && isAlnum_(input.GetChar(pos))) ++pos; - addToken(TOK_NAME, canonical_name_form(input.substr(start, pos-start)), start); // convert name to canonical form + auto start = pos; + while (pos != end && isAlnum_(*pos)) ++pos; + addToken(TOK_NAME, canonical_name_form(String(start, pos)), start); // convert name to canonical form } else if (isDigit(c)) { // number - size_t start = pos - 1; - while (pos < input.size() && isDigitOrDot(input.GetChar(pos))) ++pos; - String num = input.substr(start, pos-start); - addToken( - num.find_first_of('.') == String::npos ? TOK_INT : TOK_DOUBLE, - num, start - ); + auto start = pos; + TokenType type = TOK_INT; + while (pos != end && isDigitOrDot(*pos)) { + if (*pos == '.') type = TOK_DOUBLE; + ++pos; + } + addToken(type, String(start,pos), start); } else if (isOper(c)) { // operator - if (pos < input.size() && isLongOper(input.substr(pos - 1, 2))) { + if (pos+1 != end && isLongOper(String(pos, pos+2))) { // long operator - addToken(TOK_OPER, input.substr(pos - 1, 2), pos-1); - pos += 1; + addToken(TOK_OPER, String(pos, pos+2), pos); + pos += 2; } else { - addToken(TOK_OPER, input.substr(pos - 1, 1), pos-1); + addToken(TOK_OPER, String(pos, pos+1), pos); + ++pos; } } else if (c==_('"')) { // string @@ -244,30 +222,34 @@ void TokenIterator::readToken() { } else if (c == _('}') && !open_braces.empty() && open_braces.top() != BRACE_PAREN) { // closing smart string, resume to string parsing // "a{e}b" --> "a" "{ e }" "b" - addToken(TOK_RPAREN, _("}\""), pos-1); + addToken(TOK_RPAREN, "}\"", pos); readStringToken(); } else if (isLparen(c)) { // paranthesis/brace open_braces.push(BRACE_PAREN); - addToken(TOK_LPAREN, String(1,c), pos-1); + addToken(TOK_LPAREN, String(1,c), pos); + ++pos; } else if (isRparen(c)) { // paranthesis/brace if (!open_braces.empty()) open_braces.pop(); - addToken(TOK_RPAREN, String(1,c), pos-1); + addToken(TOK_RPAREN, String(1,c), pos); + ++pos; } else if(c==_('#')) { // comment untill end of line - while (pos < input.size() && input[pos] != _('\n')) ++pos; + while (pos != end && *pos != '\n') ++pos; } else { add_error(_("Unknown character in script: '") + String(1,c) + _("'")); // just skip the character + ++pos; } } -void TokenIterator::readStringToken() { - size_t start = max((size_t)1, pos) - 1; +void TokenIterator::readStringToken(bool string_mode) { + auto start = pos; + if (!string_mode) ++pos; String str; while (true) { - if (pos >= input.size()) { + if (pos == end) { if (!open_braces.empty() && open_braces.top() == BRACE_STRING_MODE) { // in string mode: end of input = end of string addToken(TOK_STRING, str, start); @@ -279,41 +261,41 @@ void TokenIterator::readStringToken() { return; } } - Char c = input.GetChar(pos++); + auto c = *pos++; // parse the string constant - if (c == _('"') && !open_braces.empty() && open_braces.top() == BRACE_STRING) { + if (c == '"' && !open_braces.empty() && open_braces.top() == BRACE_STRING) { // end of string addToken(TOK_STRING, str, start); open_braces.pop(); return; - } else if (c == _('\\')) { + } else if (c == '\\') { // escape - if (pos >= input.size()) { + if (pos == end) { add_error(_("Unexpected end of input in string constant")); // fix up addToken(TOK_STRING, str, start); return; } - c = input.GetChar(pos++); + c = *pos++; if (c == _('n')) str += _('\n'); else if (c == _('r')) str += _('\r'); else if (c == _('t')) str += _('\t'); else if (c == _('&')); // escape for nothing - else if (c == _('<')) str += _('\1'); // escape for < + else if (c == _('<')) str += _('\1'); // escape for < in tagged string else if (c == _('\\') || c == _('"') || c == _('\'') || c == _('{') || c == _('}')) { str += c; // \ or { or " or ', don't warn about these, since they look escape-worthy - } else if (c >= _('0') && c <= _('9')) { + } else if (c >= '0' && c <= '9') { // numeric escape - int i = 0; - while (c >= _('0') && c <= _('9')) { - i = i * 10 + static_cast(c - _('0')); - c = input.GetChar(pos++); + int i = static_cast(c - '0'); + while (pos != end && *pos >= '0' && *pos <= '9') { + i = i * 10 + static_cast(c - *pos); + ++pos; } - pos--; - str += static_cast(i); // ignore + str += static_cast(i); } else { add_error(String::Format(_("Invalid string escape sequence: \"\\%c\""),c)); - str += _('\\') + c; // ignore + str += _('\\'); + str += c; // ignore error } } else if (c == _('{')) { // smart string @@ -327,38 +309,36 @@ void TokenIterator::readStringToken() { } } -int line_number(size_t pos, const String& input) { +int line_number(String::const_iterator pos, String::const_iterator begin) { int line = 1; - for (size_t i = 0 ; i < input.size() && i < pos ; ++i) { - if (input.GetChar(i) == _('\n')) line++; + for ( ; begin != pos ; ++begin) { + if (*begin == '\n') line++; } return line; } void TokenIterator::add_error(const String& message) { - if (!errors.empty() && errors.back().start == pos) return; // already an error here + size_t error_pos = pos - begin; + if (!errors.empty() && errors.back().start == error_pos) return; // already an error here // add error message - errors.push_back(ScriptParseError(pos, line_number(pos,input), filename, message)); + errors.push_back(ScriptParseError(error_pos, line_number(pos,begin), filename, message)); } + void TokenIterator::expected(const String& expected, const Token* opening) { - size_t error_pos = peek(0).pos; - if (!errors.empty() && errors.back().start == pos) return; // already an error here + auto next_token_pos = peek(0).pos; + size_t error_pos = next_token_pos - begin; + if (!errors.empty() && errors.back().start == error_pos) return; // already an error here // add error message if (opening) { - errors.push_back(ScriptParseError(opening->pos, error_pos, line_number(opening->pos,input), filename, opening->value, expected, peek(0).value)); + size_t open_pos = opening->pos - begin; + errors.push_back(ScriptParseError(open_pos, error_pos, line_number(opening->pos,begin), filename, opening->value, expected, peek(0).value)); } else { - errors.push_back(ScriptParseError(error_pos, line_number(error_pos,input), filename, expected, peek(0).value)); + errors.push_back(ScriptParseError(error_pos, line_number(next_token_pos,begin), filename, expected, peek(0).value)); } } - -String TokenIterator::getSourceCode(size_t start, size_t end) { - start = min(start, input.size()); - end = min(end, input.size()); - return input.substr(start, end-start); -} int TokenIterator::getLineNumber() { - return line_number(peek(0).pos, input); + return line_number(peek(0).pos, begin); } // ----------------------------------------------------------------------------- : Parsing @@ -414,17 +394,21 @@ ExprType parseOper(TokenIterator& input, Script& script, Precedence min_prec, In /// Parse call arguments, "(...)" void parseCallArguments(TokenIterator& input, Script& script, vector& arguments); +ExprType parseTopLevel(TokenIterator& input, Script& script) { + ExprType type = parseOper(input, script, PREC_ALL); + Token eof = input.read(); + if (eof != TOK_EOF) { + input.expected(_("end of input")); + } + return type; +} ScriptP parse(const String& s, Packaged* package, bool string_mode, vector& errors_out) { errors_out.clear(); // parse TokenIterator input(s, package, string_mode, errors_out); ScriptP script(new Script); - ExprType type = parseOper(input, *script, PREC_ALL); - Token eof = input.read(); - if (eof != TOK_EOF) { - input.expected(_("end of input")); - } + ExprType type = parseTopLevel(input, *script); // were there fatal errors? if (type == EXPR_FAILED) { return ScriptP(); @@ -656,13 +640,13 @@ ExprType parseExpr(TokenIterator& input, Script& script, Precedence minPrec) { } else if (token == _("assert")) { // assert(condition) expectToken(input, _("(")); - size_t start = input.peek().pos; + auto start = input.peek().pos; int line = input.getLineNumber(); size_t function_pos = script.getConstants().size(); script.addInstruction(I_PUSH_CONST, script_warning); parseOper(input, script, PREC_ALL); // condition - size_t end = input.peek().pos; - String message = String::Format(_("Assertion failure on line %d:\n expected: "), line) + input.getSourceCode(start,end); + auto end = input.peek().pos; + String message = String::Format(_("Assertion failure on line %d:\n expected: "), line) + String(start,end); expectToken(input, _(")"), &token); if (script.getInstructions().back().instr == I_BINARY && script.getInstructions().back().instr2 == I_EQ) { // compile "assert(x == y)" into @@ -684,6 +668,23 @@ ExprType parseExpr(TokenIterator& input, Script& script, Precedence minPrec) { script.addInstruction(I_NOP, SCRIPT_VAR_input); // (input:) } return EXPR_STATEMENT; + } else if (token == _("include_file")) { + // Include a file + expectToken(input, _("(")); + Token token = input.read(); + if (token != TOK_STRING) { + input.expected(_("filename")); + return EXPR_FAILED; + } + expectToken(input, _(")"), &token); + // include the file + // read the entire file, and start at the beginning of it + String const& filename = token.value; + auto stream = package_manager.openFileFromPackage(input.package, filename); + eat_utf8_bom(*stream); + String included_input = read_utf8_line(*stream, true); + TokenIterator included_tokens(included_input, input.package, false, input.errors); + return parseTopLevel(included_tokens, script); } else { // variable Variable var = string_to_variable(token.value); diff --git a/src/util/string.hpp b/src/util/string.hpp index 52ff0cb2..07bc8ef6 100644 --- a/src/util/string.hpp +++ b/src/util/string.hpp @@ -200,6 +200,14 @@ bool starts_with(const String& str, const String& start); bool is_substr(const String& str, size_t pos, const Char* cmp); /// Return whether str contains the string cmp at position pos bool is_substr(const String& str, size_t pos, const String& cmp); +/// Return whether begin..end contains the string cmp at position begin +template +bool is_substr(It begin, It end, const char* cmp) { + for (; begin != end && *cmp; ++begin, ++cmp) { + if (*begin != *cmp) return false; + } + return true; +} /// Return whether str contains the string cmp at position pos, case insensitive compare bool is_substr_i(const String& str, size_t pos, const Char* cmp);