Use iterators instead of string positions

This commit is contained in:
Twan van Laarhoven
2020-05-12 02:09:10 +02:00
parent bc53730091
commit 3d6e3b3103
2 changed files with 135 additions and 126 deletions
+127 -126
View File
@@ -42,7 +42,7 @@ struct Token {
TokenType type;
String value;
bool newline; ///< Is there a newline between this token and the previous one?
size_t pos; ///< Start position of the token
String::const_iterator pos; ///< Start position of the token
inline bool operator == (TokenType t) const { return type == t; }
inline bool operator != (TokenType t) const { return type != t; }
@@ -61,7 +61,7 @@ enum OpenBrace
/// Iterator over a string, one token at a time.
/** Also stores errors found when tokenizing or parsing */
class TokenIterator {
public:
public:
TokenIterator(const String& str, Packaged* package, bool string_mode, vector<ScriptParseError>& errors);
/// Peek at the next token, doesn't move to the one after that
@@ -76,37 +76,26 @@ class TokenIterator {
*/
void putBack();
/// Get a section of source code
/** Known problems: does not work correctly when crossing an include-file border */
String getSourceCode(size_t start, size_t end);
/// Get the current line number
int getLineNumber();
private:
String input;
size_t pos;
String filename; ///< Filename of include files, "" for the main input
Packaged* package; ///< Package the input is from
vector<Token> buffer; ///< buffer of unread tokens, front() = current
stack<OpenBrace> open_braces; ///< braces/quotes we entered from script mode
bool newline; ///< Did we just pass a newline?
// more input?
struct MoreInput {
String input;
size_t pos;
String filename;
Packaged* package;
};
stack<MoreInput> more; ///< Read tokens from here when we are done with the current input
private:
String::const_iterator pos;
const String::const_iterator begin, end;
String filename; ///< Filename of include files, "" for the main input
vector<Token> buffer; ///< buffer of unread tokens, front() = current
stack<OpenBrace> open_braces; ///< braces/quotes we entered from script mode
bool newline; ///< Did we just pass a newline?
/// Add a token to the buffer, with the current newline value, resets newline
void addToken(TokenType type, const String& value, size_t start);
void addToken(TokenType type, const String& value, String::const_iterator start);
/// Read the next token, and add it to the buffer
void readToken();
/// Read the next token which is a string (after the opening ")
void readStringToken();
/// Read the next token, which is a string
void readStringToken(bool string_mode = false);
public:
public:
Packaged* package; ///< Package the input is from
/// All errors found
vector<ScriptParseError>& errors;
/// Add an error message
@@ -117,7 +106,7 @@ class TokenIterator {
// ----------------------------------------------------------------------------- : Characters
bool isUnicodeAlpha(Char c) {
bool isUnicodeAlpha(wxUniChar c) {
#if defined(__WXMSW__)
return false;
#else
@@ -126,19 +115,21 @@ bool isUnicodeAlpha(Char c) {
#endif
}
bool isAlpha_(Char c) { return isAlpha(c) || c==_('_'); }
bool isAlnum_(Char c) { return isAlnum(c) || c==_('_') || isUnicodeAlpha(c); }
bool isOper (Char c) { return wxStrchr(_("+-*/!.@%^&:=<>;,"),c) != nullptr; }
bool isLparen(Char c) { return c==_('(') || c==_('[') || c==_('{'); }
bool isRparen(Char c) { return c==_(')') || c==_(']') || c==_('}'); }
bool isDigitOrDot(Char c) { return isDigit(c) || c==_('.'); }
bool isAlpha_(wxUniChar c) { return isAlpha(c) || c==_('_'); }
bool isAlnum_(wxUniChar c) { return isAlnum(c) || c==_('_') || isUnicodeAlpha(c); }
bool isOper (wxUniChar c) { return wxStrchr(_("+-*/!.@%^&:=<>;,"),c) != nullptr; }
bool isLparen(wxUniChar c) { return c==_('(') || c==_('[') || c==_('{'); }
bool isRparen(wxUniChar c) { return c==_(')') || c==_(']') || c==_('}'); }
bool isDigitOrDot(wxUniChar c) { return isDigit(c) || c==_('.'); }
bool isLongOper(const String& s) { return s==_(":=") || s==_("==") || s==_("!=") || s==_("<=") || s==_(">="); }
// moveme
// ----------------------------------------------------------------------------- : Tokenizing
TokenIterator::TokenIterator(const String& str, Packaged* package, bool string_mode, vector<ScriptParseError>& errors)
: input(str)
, pos(0)
: pos(str.begin())
, begin(str.begin())
, end(str.end())
, package(package)
, newline(false)
, errors(errors)
@@ -146,7 +137,7 @@ TokenIterator::TokenIterator(const String& str, Packaged* package, bool string_m
if (string_mode) {
open_braces.push(BRACE_STRING_MODE);
putBack();//dummy
readStringToken();
readStringToken(true);
}
}
@@ -164,78 +155,65 @@ const Token& TokenIterator::read() {
}
void TokenIterator::putBack() {
// Don't use addToken, because it canges newline
// Don't use addToken, because it changes newline
// Also, we want to push_front
Token t = {TOK_DUMMY, _(""), false};
buffer.insert(buffer.begin(), t);
}
void TokenIterator::addToken(TokenType type, const String& value, size_t start) {
void TokenIterator::addToken(TokenType type, const String& value, String::const_iterator start) {
Token t = {type, value, newline, start};
buffer.push_back(t);
newline = false;
}
void TokenIterator::readToken() {
if (pos >= input.size()) {
// done with input, is there more?
if (!more.empty()) {
input = more.top().input;
pos = more.top().pos;
filename = more.top().filename;
package = more.top().package;
more.pop();
} else {
// EOF
addToken(TOK_EOF, _("end of input"), input.size());
}
if (pos == end) {
addToken(TOK_EOF, "end of input", pos);
return;
}
// read a character from the input
Char c = input.GetChar(pos++);
auto c = *pos;
if (c == _('\n')) {
++pos;
newline = true;
} else if (isSpace(c)) {
++pos;
// ignore
} else if (is_substr(input, pos-1, _("include file:"))) {
// include a file
// HACK: This is not really the right place for it, but it's the best I've got
// filename
pos += 12; // "nclude file:"
size_t eol = input.find_first_of(_("\r\n"), pos);
if (eol == String::npos) eol = input.size();
String include_file = trim(input.substr(pos, eol - pos));
// store the current input for later retrieval
MoreInput m = {input, eol, filename, package};
more.push(m);
// read the entire file, and start at the beginning of it
pos = 0;
filename = include_file;
auto stream = package_manager.openFileFromPackage(package, include_file);
eat_utf8_bom(*stream);
input = read_utf8_line(*stream, true);
} else if (is_substr(pos, end, "include file:")) {
pos += 13; // "include file:"
const char* newlines = "\r\n";
auto eol = find_first_of(pos,end, newlines,newlines+2);
String include_file = trim(String(pos, eol));
// include_file("filename")
addToken(TOK_NAME, "include_file", pos - 13);
addToken(TOK_LPAREN, "(", pos);
addToken(TOK_STRING, include_file, pos);
addToken(TOK_RPAREN, ")", eol);
pos = eol;
} else if (isAlpha(c) || isUnicodeAlpha(c) || c == _('_') || (isDigit(c) && !buffer.empty() && buffer.back() == _("."))) {
// name, or a number after a . token, as in array.0
size_t start = pos - 1;
while (pos < input.size() && isAlnum_(input.GetChar(pos))) ++pos;
addToken(TOK_NAME, canonical_name_form(input.substr(start, pos-start)), start); // convert name to canonical form
auto start = pos;
while (pos != end && isAlnum_(*pos)) ++pos;
addToken(TOK_NAME, canonical_name_form(String(start, pos)), start); // convert name to canonical form
} else if (isDigit(c)) {
// number
size_t start = pos - 1;
while (pos < input.size() && isDigitOrDot(input.GetChar(pos))) ++pos;
String num = input.substr(start, pos-start);
addToken(
num.find_first_of('.') == String::npos ? TOK_INT : TOK_DOUBLE,
num, start
);
auto start = pos;
TokenType type = TOK_INT;
while (pos != end && isDigitOrDot(*pos)) {
if (*pos == '.') type = TOK_DOUBLE;
++pos;
}
addToken(type, String(start,pos), start);
} else if (isOper(c)) {
// operator
if (pos < input.size() && isLongOper(input.substr(pos - 1, 2))) {
if (pos+1 != end && isLongOper(String(pos, pos+2))) {
// long operator
addToken(TOK_OPER, input.substr(pos - 1, 2), pos-1);
pos += 1;
addToken(TOK_OPER, String(pos, pos+2), pos);
pos += 2;
} else {
addToken(TOK_OPER, input.substr(pos - 1, 1), pos-1);
addToken(TOK_OPER, String(pos, pos+1), pos);
++pos;
}
} else if (c==_('"')) {
// string
@@ -244,30 +222,34 @@ void TokenIterator::readToken() {
} else if (c == _('}') && !open_braces.empty() && open_braces.top() != BRACE_PAREN) {
// closing smart string, resume to string parsing
// "a{e}b" --> "a" "{ e }" "b"
addToken(TOK_RPAREN, _("}\""), pos-1);
addToken(TOK_RPAREN, "}\"", pos);
readStringToken();
} else if (isLparen(c)) {
// paranthesis/brace
open_braces.push(BRACE_PAREN);
addToken(TOK_LPAREN, String(1,c), pos-1);
addToken(TOK_LPAREN, String(1,c), pos);
++pos;
} else if (isRparen(c)) {
// paranthesis/brace
if (!open_braces.empty()) open_braces.pop();
addToken(TOK_RPAREN, String(1,c), pos-1);
addToken(TOK_RPAREN, String(1,c), pos);
++pos;
} else if(c==_('#')) {
// comment untill end of line
while (pos < input.size() && input[pos] != _('\n')) ++pos;
while (pos != end && *pos != '\n') ++pos;
} else {
add_error(_("Unknown character in script: '") + String(1,c) + _("'"));
// just skip the character
++pos;
}
}
void TokenIterator::readStringToken() {
size_t start = max((size_t)1, pos) - 1;
void TokenIterator::readStringToken(bool string_mode) {
auto start = pos;
if (!string_mode) ++pos;
String str;
while (true) {
if (pos >= input.size()) {
if (pos == end) {
if (!open_braces.empty() && open_braces.top() == BRACE_STRING_MODE) {
// in string mode: end of input = end of string
addToken(TOK_STRING, str, start);
@@ -279,41 +261,41 @@ void TokenIterator::readStringToken() {
return;
}
}
Char c = input.GetChar(pos++);
auto c = *pos++;
// parse the string constant
if (c == _('"') && !open_braces.empty() && open_braces.top() == BRACE_STRING) {
if (c == '"' && !open_braces.empty() && open_braces.top() == BRACE_STRING) {
// end of string
addToken(TOK_STRING, str, start);
open_braces.pop();
return;
} else if (c == _('\\')) {
} else if (c == '\\') {
// escape
if (pos >= input.size()) {
if (pos == end) {
add_error(_("Unexpected end of input in string constant"));
// fix up
addToken(TOK_STRING, str, start);
return;
}
c = input.GetChar(pos++);
c = *pos++;
if (c == _('n')) str += _('\n');
else if (c == _('r')) str += _('\r');
else if (c == _('t')) str += _('\t');
else if (c == _('&')); // escape for nothing
else if (c == _('<')) str += _('\1'); // escape for <
else if (c == _('<')) str += _('\1'); // escape for < in tagged string
else if (c == _('\\') || c == _('"') || c == _('\'') || c == _('{') || c == _('}')) {
str += c; // \ or { or " or ', don't warn about these, since they look escape-worthy
} else if (c >= _('0') && c <= _('9')) {
} else if (c >= '0' && c <= '9') {
// numeric escape
int i = 0;
while (c >= _('0') && c <= _('9')) {
i = i * 10 + static_cast<int>(c - _('0'));
c = input.GetChar(pos++);
int i = static_cast<int>(c - '0');
while (pos != end && *pos >= '0' && *pos <= '9') {
i = i * 10 + static_cast<int>(c - *pos);
++pos;
}
pos--;
str += static_cast<Char>(i); // ignore
str += static_cast<wxUniChar>(i);
} else {
add_error(String::Format(_("Invalid string escape sequence: \"\\%c\""),c));
str += _('\\') + c; // ignore
str += _('\\');
str += c; // ignore error
}
} else if (c == _('{')) {
// smart string
@@ -327,38 +309,36 @@ void TokenIterator::readStringToken() {
}
}
int line_number(size_t pos, const String& input) {
int line_number(String::const_iterator pos, String::const_iterator begin) {
int line = 1;
for (size_t i = 0 ; i < input.size() && i < pos ; ++i) {
if (input.GetChar(i) == _('\n')) line++;
for ( ; begin != pos ; ++begin) {
if (*begin == '\n') line++;
}
return line;
}
void TokenIterator::add_error(const String& message) {
if (!errors.empty() && errors.back().start == pos) return; // already an error here
size_t error_pos = pos - begin;
if (!errors.empty() && errors.back().start == error_pos) return; // already an error here
// add error message
errors.push_back(ScriptParseError(pos, line_number(pos,input), filename, message));
errors.push_back(ScriptParseError(error_pos, line_number(pos,begin), filename, message));
}
void TokenIterator::expected(const String& expected, const Token* opening) {
size_t error_pos = peek(0).pos;
if (!errors.empty() && errors.back().start == pos) return; // already an error here
auto next_token_pos = peek(0).pos;
size_t error_pos = next_token_pos - begin;
if (!errors.empty() && errors.back().start == error_pos) return; // already an error here
// add error message
if (opening) {
errors.push_back(ScriptParseError(opening->pos, error_pos, line_number(opening->pos,input), filename, opening->value, expected, peek(0).value));
size_t open_pos = opening->pos - begin;
errors.push_back(ScriptParseError(open_pos, error_pos, line_number(opening->pos,begin), filename, opening->value, expected, peek(0).value));
} else {
errors.push_back(ScriptParseError(error_pos, line_number(error_pos,input), filename, expected, peek(0).value));
errors.push_back(ScriptParseError(error_pos, line_number(next_token_pos,begin), filename, expected, peek(0).value));
}
}
String TokenIterator::getSourceCode(size_t start, size_t end) {
start = min(start, input.size());
end = min(end, input.size());
return input.substr(start, end-start);
}
int TokenIterator::getLineNumber() {
return line_number(peek(0).pos, input);
return line_number(peek(0).pos, begin);
}
// ----------------------------------------------------------------------------- : Parsing
@@ -414,17 +394,21 @@ ExprType parseOper(TokenIterator& input, Script& script, Precedence min_prec, In
/// Parse call arguments, "(...)"
void parseCallArguments(TokenIterator& input, Script& script, vector<Variable>& arguments);
ExprType parseTopLevel(TokenIterator& input, Script& script) {
ExprType type = parseOper(input, script, PREC_ALL);
Token eof = input.read();
if (eof != TOK_EOF) {
input.expected(_("end of input"));
}
return type;
}
ScriptP parse(const String& s, Packaged* package, bool string_mode, vector<ScriptParseError>& errors_out) {
errors_out.clear();
// parse
TokenIterator input(s, package, string_mode, errors_out);
ScriptP script(new Script);
ExprType type = parseOper(input, *script, PREC_ALL);
Token eof = input.read();
if (eof != TOK_EOF) {
input.expected(_("end of input"));
}
ExprType type = parseTopLevel(input, *script);
// were there fatal errors?
if (type == EXPR_FAILED) {
return ScriptP();
@@ -656,13 +640,13 @@ ExprType parseExpr(TokenIterator& input, Script& script, Precedence minPrec) {
} else if (token == _("assert")) {
// assert(condition)
expectToken(input, _("("));
size_t start = input.peek().pos;
auto start = input.peek().pos;
int line = input.getLineNumber();
size_t function_pos = script.getConstants().size();
script.addInstruction(I_PUSH_CONST, script_warning);
parseOper(input, script, PREC_ALL); // condition
size_t end = input.peek().pos;
String message = String::Format(_("Assertion failure on line %d:\n expected: "), line) + input.getSourceCode(start,end);
auto end = input.peek().pos;
String message = String::Format(_("Assertion failure on line %d:\n expected: "), line) + String(start,end);
expectToken(input, _(")"), &token);
if (script.getInstructions().back().instr == I_BINARY && script.getInstructions().back().instr2 == I_EQ) {
// compile "assert(x == y)" into
@@ -684,6 +668,23 @@ ExprType parseExpr(TokenIterator& input, Script& script, Precedence minPrec) {
script.addInstruction(I_NOP, SCRIPT_VAR_input); // (input:)
}
return EXPR_STATEMENT;
} else if (token == _("include_file")) {
// Include a file
expectToken(input, _("("));
Token token = input.read();
if (token != TOK_STRING) {
input.expected(_("filename"));
return EXPR_FAILED;
}
expectToken(input, _(")"), &token);
// include the file
// read the entire file, and start at the beginning of it
String const& filename = token.value;
auto stream = package_manager.openFileFromPackage(input.package, filename);
eat_utf8_bom(*stream);
String included_input = read_utf8_line(*stream, true);
TokenIterator included_tokens(included_input, input.package, false, input.errors);
return parseTopLevel(included_tokens, script);
} else {
// variable
Variable var = string_to_variable(token.value);
+8
View File
@@ -200,6 +200,14 @@ bool starts_with(const String& str, const String& start);
bool is_substr(const String& str, size_t pos, const Char* cmp);
/// Return whether str contains the string cmp at position pos
bool is_substr(const String& str, size_t pos, const String& cmp);
/// Return whether begin..end contains the string cmp at position begin
template <typename It>
bool is_substr(It begin, It end, const char* cmp) {
for (; begin != end && *cmp; ++begin, ++cmp) {
if (*begin != *cmp) return false;
}
return true;
}
/// Return whether str contains the string cmp at position pos, case insensitive compare
bool is_substr_i(const String& str, size_t pos, const Char* cmp);