Cleaned up utf8 decoding code.

It was actually completely broken in newer wxWidgets versions, putting '\0' characters in the string, which broke things like concatenation.
2026-06-10 04:57:00 -04:00 · 2020-04-08 01:24:19 +02:00
parent 35a89676b4
commit 6e7a4485a8
5 changed files with 39 additions and 83 deletions
@@ -20,7 +20,7 @@ DECLARE_TYPEOF_COLLECTION(Variable);
 #define TokenType TokenType_ // some stupid windows header uses our name
 #endif

-String read_utf8_line(wxInputStream& input, bool eat_bom = true, bool until_eof = false);
+String read_utf8_line(wxInputStream& input, bool until_eof = false);

 extern ScriptValueP script_warning;
 extern ScriptValueP script_warning_if_neq;
@@ -205,7 +205,8 @@ void TokenIterator::readToken() {
    pos = 0;
    filename = include_file;
    InputStreamP is = package_manager.openFileFromPackage(package, include_file);
-    input = read_utf8_line(*is, true, true);
+    eat_utf8_bom(*is);
+    input = read_utf8_line(*is, true);
  } else if (isAlpha(c) || c == _('_') || (isDigit(c) && !buffer.empty() && buffer.back() == _("."))) {
    // name, or a number after a . token, as in array.0
    size_t start = pos - 1;
@@ -27,6 +27,8 @@ Reader::Reader(const InputStreamP& input, Packaged* package, const String& filen
  , filename(filename), package(package), line_number(0), previous_line_number(0)
  , input(input)
 {
+  assert(input);
+  eat_utf8_bom(*input);
  moveNext();
  handleAppVersion();
 }
@@ -37,6 +39,8 @@ Reader::Reader(Reader* parent, Packaged* pkg, const String& filename, bool ignor
  , filename(filename), package(pkg), line_number(0), previous_line_number(0)
  , input(package_manager.openFileFromPackage(package, filename))
 {
+  assert(input);
+  eat_utf8_bom(*input);
  moveNext();
  // in an included file, use the app version of the parent if we have none
  handleAppVersion();
@@ -162,83 +166,56 @@ template <typename T> class LocalVector {
  T small[SMALL_SIZE];
 };

+/// Eat a UTF-8 byte order mark (the bytes EF BB BF) from the beginning of a stream
+/** Returns true if a complete byte order mark was found and consumed.
+ *  Otherwise returns false, and every byte that was read while looking for the
+ *  mark is pushed back onto the stream, so the following reads see the original data.
+ *
+ *  Pushing back the already matched prefix matters: many ordinary UTF-8 sequences
+ *  start with 0xEF (for example EF BC 81), and dropping that lead byte would
+ *  corrupt the first character of the stream.
+ */
+bool eat_utf8_bom(wxInputStream& input) {
+  static const unsigned char BOM[] = {0xEF, 0xBB, 0xBF};
+  size_t matched = 0;
+  while (matched < sizeof(BOM)) {
+    int c = input.GetC();
+    if (c == EOF) break;
+    if (c != BOM[matched]) {
+      // The mismatching byte goes back first, the matched prefix is put in front of it below
+      input.Ungetch(static_cast<char>(c));
+      break;
+    }
+    ++matched;
+  }
+  if (matched == sizeof(BOM)) return true;
+  // Not a (complete) byte order mark: restore the bytes that did match, in their original order
+  if (matched > 0) input.Ungetch(BOM, matched);
+  return false;
+}
+
 /// Read an UTF-8 encoded line from an input stream
 /** As opposed to wx functions, this one actually reports errors
 */
-String read_utf8_line(wxInputStream& input, bool eat_bom = true, bool until_eof = false);
-String read_utf8_line(wxInputStream& input, bool eat_bom, bool until_eof) {
+String read_utf8_line(wxInputStream& input, bool until_eof = false);
+String read_utf8_line(wxInputStream& input, bool until_eof) {
  LocalVector<char> buffer;
-  while (!input.Eof()) {
-    Byte c = input.GetC(); if (input.LastRead() <= 0) break;
+  while (true) {
+    int c = input.GetC();
+    if (c == EOF) break;
    if (!until_eof) {
      if (c == '\n') break;
      if (c == '\r') {
-        if (input.Eof()) break;
-        c = input.GetC(); if (input.LastRead() <= 0) break;
-        if (c != '\n') {
+        c = input.GetC();
+        if (c != '\n' && c != EOF) {
          input.Ungetch(c); // \r but not \r\n
        }
        break; 
      }
    }
-    buffer.push_back(c);
+    buffer.push_back((Byte)c);
  }
  // convert to string
-  buffer.push_back('\0');
-  size_t size = wxConvUTF8.MB2WC(nullptr, buffer.get(), 0);
+  // TODO: Doing this in one step should be faster
+  size_t size = wxConvUTF8.ToWChar(nullptr, 0, buffer.get(), buffer.size());
  if (size == size_t(-1)) {
    throw ParseError(_("Invalid UTF-8 sequence"));
  } else if (size == 0) {
    return _("");
  }
-  #ifdef UNICODE
-    #if wxVERSION_NUMBER >= 2900
-      String result = wxString::FromUTF8(buffer.get(), buffer.size());
-      return eat_bom ? decodeUTF8BOM(result) : result;
-    #else
-      // NOTE: wx doc is wrong, parameter to GetWritableChar is numer of characters, not bytes
-      String result;
-      Char* result_buf = result.GetWriteBuf(size + 1);
-      wxConvUTF8.MB2WC(result_buf, buffer.get(), size + 1);
-      result.UngetWriteBuf(size);
-      return eat_bom ? decodeUTF8BOM(result) : result;
-    #endif
-  #else
-    String result;
-    // first to wchar, then back to local
-    vector<wchar_t> buf2; buf2.resize(size+1);
-    wxConvUTF8.MB2WC(&buf2[0], buffer.get(), size + 1);
-    // eat BOM?
-    if (eat_bom && buf2[0]==0xFEFF ) {
-      buf2.erase(buf2.begin()); // remove BOM
-    }
-    // convert
-    #ifdef __WXMSW__
-      // size includes null terminator
-      size = ::WideCharToMultiByte(CP_ACP, 0, &buf2[0], -1, nullptr, 0, nullptr, nullptr);
-      Char* result_buf = result.GetWriteBuf(size);
-      ::WideCharToMultiByte(CP_ACP, 0, &buf2[0], -1, result_buf, (int)size, nullptr, nullptr);
-      result.UngetWriteBuf(size - 1);
-    #else
-      for (size_t i = 0 ; i < size ; ++i) {
-        wchar_t wc = buf2[i];
-        if (wc < 0xFF) {
-          result += (Char)wc;
-        } else {
-          // not valid in Latin1
-          result += '?';
-        }
-      }
-    #endif
-    return result;
-  #endif
+  return wxString::FromUTF8(buffer.get(), buffer.size());
 }

 void Reader::readLine(bool in_string) {
  line_number += 1;
  // We have to do our own line reading, because wxTextInputStream is insane
  try {
-    line = read_utf8_line(*input, line_number == 1);
+    line = read_utf8_line(*input);
  } catch (const ParseError& e) {
    throw ParseError(e.what() + String(_(" on line ")) << line_number);
  }
@@ -19,7 +19,8 @@ using boost::tribool;

 Writer::Writer(const OutputStreamP& output, Version file_app_version)
  : indentation(0)
-  , output(output), stream(*output)
+  , output(output)
+  , stream(*output, wxEOL_UNIX, wxMBConvUTF8())
 {
  stream.WriteString(BYTE_ORDER_MARK);
  handle(_("mse_version"), file_app_version);
@@ -13,25 +13,6 @@

 // ----------------------------------------------------------------------------- : Unicode

-String decodeUTF8BOM(const String& s) {
-  #ifdef UNICODE
-    if (!s.empty() && s.GetChar(0) == L'\xFEFF') {
-      // skip byte-order-mark
-      return s.substr(1);
-    } else {
-      return s;
-    }
-  #else
-    wxWCharBuffer buf = s.wc_str(wxConvUTF8);
-    if (buf && buf[size_t(0)] == L'\xFEFF') {
-      // skip byte-order-mark
-      return String(buf + 1, *wxConvCurrent);
-    } else {
-      return String(buf,     *wxConvCurrent);
-    }
-  #endif
-}
-
 void writeUTF8(wxTextOutputStream& stream, const String& str) {
  #ifdef UNICODE
    stream.WriteString(str);
@@ -52,22 +52,18 @@ inline wxStdString const& toStdString(String const& s) {
 /// The character type used
 typedef wxChar Char;

-/// Decode a UTF8 string
-/** In non-unicode builds the input is considered to be an incorrectly encoded utf8 string.
- *  In unicode builds it is a normal string, utf8 already decoded.
- *  Also removes a byte-order-mark from the start of the string if it is pressent
- */
-String decodeUTF8BOM(const String& s);
-
-/// UTF8 Byte order mark for writing at the start of files
+/// UTF-8 Byte order mark for writing at the start of files
 /** In non-unicode builds it is UTF8 encoded \xFEFF.
 *  In unicode builds it is a normal \xFEFF.
 */
-const Char BYTE_ORDER_MARK[] = L"\xFEFF";
+const wchar_t BYTE_ORDER_MARK[] = L"\xFEFF";

 /// Writes a string to an output stream, encoded as UTF8
 void writeUTF8(wxTextOutputStream& stream, const String& str);

+/// Remove a UTF-8 Byte order mark from an input stream
+bool eat_utf8_bom(wxInputStream& input);
+
 /// Some constants we like to use
 #ifdef UNICODE
  #define  LEFT_ANGLE_BRACKET _("\x2039")