diff --git a/src/script/parser.cpp b/src/script/parser.cpp
index 4b12a068..b28a85cd 100644
--- a/src/script/parser.cpp
+++ b/src/script/parser.cpp
@@ -20,7 +20,7 @@ DECLARE_TYPEOF_COLLECTION(Variable);
   #define TokenType TokenType_ // some stupid windows header uses our name
 #endif
 
-String read_utf8_line(wxInputStream& input, bool eat_bom = true, bool until_eof = false);
+String read_utf8_line(wxInputStream& input, bool until_eof = false);
 
 extern ScriptValueP script_warning;
 extern ScriptValueP script_warning_if_neq;
@@ -205,7 +205,8 @@ void TokenIterator::readToken() {
     pos = 0;
     filename = include_file;
     InputStreamP is = package_manager.openFileFromPackage(package, include_file);
-    input = read_utf8_line(*is, true, true);
+    eat_utf8_bom(*is);
+    input = read_utf8_line(*is, true);
   } else if (isAlpha(c) || c == _('_') || (isDigit(c) && !buffer.empty() && buffer.back() == _("."))) {
     // name, or a number after a . token, as in array.0
     size_t start = pos - 1;
diff --git a/src/util/io/reader.cpp b/src/util/io/reader.cpp
index 9f47454b..a6520731 100644
--- a/src/util/io/reader.cpp
+++ b/src/util/io/reader.cpp
@@ -27,6 +27,8 @@ Reader::Reader(const InputStreamP& input, Packaged* package, const String& filen
   , filename(filename), package(package), line_number(0), previous_line_number(0)
   , input(input)
 {
+  assert(input);
+  eat_utf8_bom(*input);
   moveNext();
   handleAppVersion();
 }
@@ -37,6 +39,8 @@ Reader::Reader(Reader* parent, Packaged* pkg, const String& filename, bool ignor
   , filename(filename), package(pkg), line_number(0), previous_line_number(0)
   , input(package_manager.openFileFromPackage(package, filename))
 {
+  assert(input);
+  eat_utf8_bom(*input);
   moveNext();
   // in an included file, use the app version of the parent if we have none
   handleAppVersion();
@@ -162,83 +166,71 @@ template class LocalVector {
   T small[SMALL_SIZE];
 };
 
+/// Eat a UTF-8 byte order mark (EF BB BF) from the beginning of a stream
+/** Returns true if a complete BOM was found and consumed.
+ *  Otherwise every byte that was read while looking for the BOM is pushed
+ *  back, so the stream content is left as it was found, and false is returned.
+ */
+bool eat_utf8_bom(wxInputStream& input) {
+  static const unsigned char BOM[3] = { 0xEF, 0xBB, 0xBF };
+  unsigned char consumed[3];
+  size_t count = 0;
+  bool matched = true;
+  while (matched && count < 3) {
+    int c = input.GetC();
+    if (c == EOF) {
+      matched = false; // stream ended before a full BOM was seen
+      break;
+    }
+    consumed[count] = static_cast<unsigned char>(c);
+    matched = (consumed[count] == BOM[count]);
+    ++count;
+  }
+  if (matched) return true;
+  // Not a BOM: restore *all* consumed bytes, not just the last one. A file may
+  // legitimately start with another character whose encoding begins with 0xEF
+  // (U+F000..U+FFFF), and dropping its lead byte would corrupt the text.
+  if (count > 0) input.Ungetch(consumed, count);
+  return false;
+}
+
 /// Read an UTF-8 encoded line from an input stream
 /** As opposed to wx functions, this one actually reports errors
  */
-String read_utf8_line(wxInputStream& input, bool eat_bom = true, bool until_eof = false);
-String read_utf8_line(wxInputStream& input, bool eat_bom, bool until_eof) {
+String read_utf8_line(wxInputStream& input, bool until_eof = false);
+String read_utf8_line(wxInputStream& input, bool until_eof) {
   LocalVector buffer;
-  while (!input.Eof()) {
-    Byte c = input.GetC(); if (input.LastRead() <= 0) break;
+  while (true) {
+    int c = input.GetC();
+    if (c == EOF) break;
     if (!until_eof) {
       if (c == '\n') break;
       if (c == '\r') {
-        if (input.Eof()) break;
-        c = input.GetC(); if (input.LastRead() <= 0) break;
-        if (c != '\n') {
+        c = input.GetC();
+        if (c != '\n' && c != EOF) {
           input.Ungetch(c); // \r but not \r\n
         }
         break;
       }
     }
-    buffer.push_back(c);
+    buffer.push_back((Byte)c);
   }
   // convert to string
-  buffer.push_back('\0');
-  size_t size = wxConvUTF8.MB2WC(nullptr, buffer.get(), 0);
+  // TODO: Doing this in one step should be faster
+  size_t size = wxConvUTF8.ToWChar(nullptr, 0, buffer.get(), buffer.size());
   if (size == size_t(-1)) {
     throw ParseError(_("Invalid UTF-8 sequence"));
   } else if (size == 0) {
     return _("");
   }
-  #ifdef UNICODE
-    #if wxVERSION_NUMBER >= 2900
-      String result = wxString::FromUTF8(buffer.get(), buffer.size());
-      return eat_bom ? decodeUTF8BOM(result) : result;
-    #else
-      // NOTE: wx doc is wrong, parameter to GetWritableChar is numer of characters, not bytes
-      String result;
-      Char* result_buf = result.GetWriteBuf(size + 1);
-      wxConvUTF8.MB2WC(result_buf, buffer.get(), size + 1);
-      result.UngetWriteBuf(size);
-      return eat_bom ? decodeUTF8BOM(result) : result;
-    #endif
-  #else
-    String result;
-    // first to wchar, then back to local
-    vector buf2; buf2.resize(size+1);
-    wxConvUTF8.MB2WC(&buf2[0], buffer.get(), size + 1);
-    // eat BOM?
-    if (eat_bom && buf2[0]==0xFEFF ) {
-      buf2.erase(buf2.begin()); // remove BOM
-    }
-    // convert
-    #ifdef __WXMSW__
-      // size includes null terminator
-      size = ::WideCharToMultiByte(CP_ACP, 0, &buf2[0], -1, nullptr, 0, nullptr, nullptr);
-      Char* result_buf = result.GetWriteBuf(size);
-      ::WideCharToMultiByte(CP_ACP, 0, &buf2[0], -1, result_buf, (int)size, nullptr, nullptr);
-      result.UngetWriteBuf(size - 1);
-    #else
-      for (size_t i = 0 ; i < size ; ++i) {
-        wchar_t wc = buf2[i];
-        if (wc < 0xFF) {
-          result += (Char)wc;
-        } else {
-          // not valid in Latin1
-          result += '?';
-        }
-      }
-    #endif
-    return result;
-  #endif
+  return wxString::FromUTF8(buffer.get(), buffer.size());
 }
 
 void Reader::readLine(bool in_string) {
   line_number += 1;
   // We have to do our own line reading, because wxTextInputStream is insane
   try {
-    line = read_utf8_line(*input, line_number == 1);
+    line = read_utf8_line(*input);
   } catch (const ParseError& e) {
     throw ParseError(e.what() + String(_(" on line ")) << line_number);
   }
diff --git a/src/util/io/writer.cpp b/src/util/io/writer.cpp
index f063127e..2e50c9d8 100644
--- a/src/util/io/writer.cpp
+++ b/src/util/io/writer.cpp
@@ -19,7 +19,8 @@ using boost::tribool;
 
 Writer::Writer(const OutputStreamP& output, Version file_app_version)
   : indentation(0)
-  , output(output), stream(*output)
+  , output(output)
+  , stream(*output, wxEOL_UNIX, wxMBConvUTF8())
 {
   stream.WriteString(BYTE_ORDER_MARK);
   handle(_("mse_version"), file_app_version);
diff --git a/src/util/string.cpp b/src/util/string.cpp
index 9c2a7e4b..b6f28e08 100644
--- a/src/util/string.cpp
+++ b/src/util/string.cpp
@@ -13,25 +13,6 @@
 
 // ----------------------------------------------------------------------------- : Unicode
 
-String decodeUTF8BOM(const String& s) {
-  #ifdef UNICODE
-    if (!s.empty() && s.GetChar(0) == L'\xFEFF') {
-      // skip byte-order-mark
-      return s.substr(1);
-    } else {
-      return s;
-    }
-  #else
-    wxWCharBuffer buf = s.wc_str(wxConvUTF8);
-    if (buf && buf[size_t(0)] == L'\xFEFF') {
-      // skip byte-order-mark
-      return String(buf + 1, *wxConvCurrent);
-    } else {
-      return String(buf, *wxConvCurrent);
-    }
-  #endif
-}
-
 void writeUTF8(wxTextOutputStream& stream, const String& str) {
   #ifdef UNICODE
     stream.WriteString(str);
diff --git a/src/util/string.hpp b/src/util/string.hpp
index de986925..c1eaa1e5 100644
--- a/src/util/string.hpp
+++ b/src/util/string.hpp
@@ -52,22 +52,19 @@ inline wxStdString const& toStdString(String const& s) {
 /// The character type used
 typedef wxChar Char;
 
-/// Decode a UTF8 string
-/** In non-unicode builds the input is considered to be an incorrectly encoded utf8 string.
- *  In unicode builds it is a normal string, utf8 already decoded.
- *  Also removes a byte-order-mark from the start of the string if it is pressent
- */
-String decodeUTF8BOM(const String& s);
-
-/// UTF8 Byte order mark for writing at the start of files
+/// UTF-8 Byte order mark for writing at the start of files
 /** In non-unicode builds it is UTF8 encoded \xFEFF.
  *  In unicode builds it is a normal \xFEFF.
  */
-const Char BYTE_ORDER_MARK[] = L"\xFEFF";
+const wchar_t BYTE_ORDER_MARK[] = L"\xFEFF";
 
 /// Writes a string to an output stream, encoded as UTF8
 void writeUTF8(wxTextOutputStream& stream, const String& str);
 
+/// Remove a UTF-8 Byte order mark from an input stream
+/** Returns true if a BOM was found and removed; otherwise no input is consumed. */
+bool eat_utf8_bom(wxInputStream& input);
+
 /// Some constants we like to use
 #ifdef UNICODE
   #define LEFT_ANGLE_BRACKET _("\x2039")