Cleaned up utf8 decoding code.

It was actually completely broken in newer wxWidgets versions, putting '\0' characters in the string, which broke things like concatenation.
This commit is contained in:
Twan van Laarhoven
2020-04-08 01:24:19 +02:00
parent 35a89676b4
commit 6e7a4485a8
5 changed files with 39 additions and 83 deletions
+3 -2
View File
@@ -20,7 +20,7 @@ DECLARE_TYPEOF_COLLECTION(Variable);
#define TokenType TokenType_ // some stupid windows header uses our name
#endif
String read_utf8_line(wxInputStream& input, bool eat_bom = true, bool until_eof = false);
String read_utf8_line(wxInputStream& input, bool until_eof = false);
extern ScriptValueP script_warning;
extern ScriptValueP script_warning_if_neq;
@@ -205,7 +205,8 @@ void TokenIterator::readToken() {
pos = 0;
filename = include_file;
InputStreamP is = package_manager.openFileFromPackage(package, include_file);
input = read_utf8_line(*is, true, true);
eat_utf8_bom(*is);
input = read_utf8_line(*is, true);
} else if (isAlpha(c) || c == _('_') || (isDigit(c) && !buffer.empty() && buffer.back() == _("."))) {
// name, or a number after a . token, as in array.0
size_t start = pos - 1;
+29 -52
View File
@@ -27,6 +27,8 @@ Reader::Reader(const InputStreamP& input, Packaged* package, const String& filen
, filename(filename), package(package), line_number(0), previous_line_number(0)
, input(input)
{
assert(input);
eat_utf8_bom(*input);
moveNext();
handleAppVersion();
}
@@ -37,6 +39,8 @@ Reader::Reader(Reader* parent, Packaged* pkg, const String& filename, bool ignor
, filename(filename), package(pkg), line_number(0), previous_line_number(0)
, input(package_manager.openFileFromPackage(package, filename))
{
assert(input);
eat_utf8_bom(*input);
moveNext();
// in an included file, use the app version of the parent if we have none
handleAppVersion();
@@ -162,83 +166,56 @@ template <typename T> class LocalVector {
T small[SMALL_SIZE];
};
/// Eat a UTF-8 byte order mark from the beginning of a stream
/** Reads up to three bytes from the input and compares them to the UTF-8
 *  encoding of U+FEFF (0xEF 0xBB 0xBF).
 *
 *  @param input  Stream to inspect; it is advanced past the BOM only when a
 *                complete BOM is found.
 *  @return true if a complete BOM was consumed, false otherwise.
 *
 *  When the stream does not start with a complete BOM, every byte that was
 *  read is pushed back, so the stream content is unchanged. This matters for
 *  input that merely starts with 0xEF (for example the character U+FF00,
 *  encoded as 0xEF 0xBC 0x80) and for input that ends in the middle of a BOM:
 *  pushing back only the last byte read would silently drop the leading bytes.
 */
bool eat_utf8_bom(wxInputStream& input) {
	static const int BOM[] = {0xEF, 0xBB, 0xBF};
	const size_t BOM_SIZE = sizeof(BOM) / sizeof(BOM[0]);
	char consumed[BOM_SIZE]; // bytes taken from the stream so far, in stream order
	size_t count = 0;
	bool matched = true;
	while (matched && count < BOM_SIZE) {
		int c = input.GetC();
		if (c == EOF) {
			// stream ended before a full BOM was seen; nothing to record for this read
			matched = false;
			break;
		}
		consumed[count] = static_cast<char>(c);
		matched = (c == BOM[count]);
		++count;
	}
	if (matched) return true;
	// Not a BOM: restore everything we consumed, as one block so the original order is kept
	if (count > 0) input.Ungetch(consumed, count);
	return false;
}
/// Read a UTF-8 encoded line from an input stream
/** As opposed to wx functions, this one actually reports errors
*/
String read_utf8_line(wxInputStream& input, bool eat_bom = true, bool until_eof = false);
String read_utf8_line(wxInputStream& input, bool eat_bom, bool until_eof) {
String read_utf8_line(wxInputStream& input, bool until_eof = false);
String read_utf8_line(wxInputStream& input, bool until_eof) {
LocalVector<char> buffer;
while (!input.Eof()) {
Byte c = input.GetC(); if (input.LastRead() <= 0) break;
while (true) {
int c = input.GetC();
if (c == EOF) break;
if (!until_eof) {
if (c == '\n') break;
if (c == '\r') {
if (input.Eof()) break;
c = input.GetC(); if (input.LastRead() <= 0) break;
if (c != '\n') {
c = input.GetC();
if (c != '\n' && c != EOF) {
input.Ungetch(c); // \r but not \r\n
}
break;
}
}
buffer.push_back(c);
buffer.push_back((Byte)c);
}
// convert to string
buffer.push_back('\0');
size_t size = wxConvUTF8.MB2WC(nullptr, buffer.get(), 0);
// TODO: Doing this in one step should be faster
size_t size = wxConvUTF8.ToWChar(nullptr, 0, buffer.get(), buffer.size());
if (size == size_t(-1)) {
throw ParseError(_("Invalid UTF-8 sequence"));
} else if (size == 0) {
return _("");
}
#ifdef UNICODE
#if wxVERSION_NUMBER >= 2900
String result = wxString::FromUTF8(buffer.get(), buffer.size());
return eat_bom ? decodeUTF8BOM(result) : result;
#else
// NOTE: wx doc is wrong, the parameter to GetWriteBuf is the number of characters, not bytes
String result;
Char* result_buf = result.GetWriteBuf(size + 1);
wxConvUTF8.MB2WC(result_buf, buffer.get(), size + 1);
result.UngetWriteBuf(size);
return eat_bom ? decodeUTF8BOM(result) : result;
#endif
#else
String result;
// first to wchar, then back to local
vector<wchar_t> buf2; buf2.resize(size+1);
wxConvUTF8.MB2WC(&buf2[0], buffer.get(), size + 1);
// eat BOM?
if (eat_bom && buf2[0]==0xFEFF ) {
buf2.erase(buf2.begin()); // remove BOM
}
// convert
#ifdef __WXMSW__
// size includes null terminator
size = ::WideCharToMultiByte(CP_ACP, 0, &buf2[0], -1, nullptr, 0, nullptr, nullptr);
Char* result_buf = result.GetWriteBuf(size);
::WideCharToMultiByte(CP_ACP, 0, &buf2[0], -1, result_buf, (int)size, nullptr, nullptr);
result.UngetWriteBuf(size - 1);
#else
for (size_t i = 0 ; i < size ; ++i) {
wchar_t wc = buf2[i];
if (wc < 0xFF) {
result += (Char)wc;
} else {
// not valid in Latin1
result += '?';
}
}
#endif
return result;
#endif
return wxString::FromUTF8(buffer.get(), buffer.size());
}
void Reader::readLine(bool in_string) {
line_number += 1;
// We have to do our own line reading, because wxTextInputStream is insane
try {
line = read_utf8_line(*input, line_number == 1);
line = read_utf8_line(*input);
} catch (const ParseError& e) {
throw ParseError(e.what() + String(_(" on line ")) << line_number);
}
+2 -1
View File
@@ -19,7 +19,8 @@ using boost::tribool;
Writer::Writer(const OutputStreamP& output, Version file_app_version)
: indentation(0)
, output(output), stream(*output)
, output(output)
, stream(*output, wxEOL_UNIX, wxMBConvUTF8())
{
stream.WriteString(BYTE_ORDER_MARK);
handle(_("mse_version"), file_app_version);
-19
View File
@@ -13,25 +13,6 @@
// ----------------------------------------------------------------------------- : Unicode
/// Strip a leading byte-order-mark (U+FEFF) from a string
/** In UNICODE builds the string is inspected directly: if its first
 *  character is U+FEFF the remainder is returned, otherwise the string
 *  is returned as it is.
 *  In other builds the string is first converted to wide characters using
 *  wxConvUTF8, a leading U+FEFF is skipped, and the result is converted
 *  with *wxConvCurrent.
 */
String decodeUTF8BOM(const String& s) {
#ifdef UNICODE
	// nothing to strip: empty string, or first character is not the BOM
	if (s.empty() || s.GetChar(0) != L'\xFEFF') {
		return s;
	}
	// drop the byte-order-mark
	return s.substr(1);
#else
	wxWCharBuffer wide = s.wc_str(wxConvUTF8);
	if (wide && wide[size_t(0)] == L'\xFEFF') {
		// conversion succeeded and starts with a byte-order-mark: skip it
		return String(wide + 1, *wxConvCurrent);
	}
	return String(wide, *wxConvCurrent);
#endif
}
void writeUTF8(wxTextOutputStream& stream, const String& str) {
#ifdef UNICODE
stream.WriteString(str);
+5 -9
View File
@@ -52,22 +52,18 @@ inline wxStdString const& toStdString(String const& s) {
/// The character type used
typedef wxChar Char;
/// Decode a UTF8 string
/** In non-unicode builds the input is considered to be an incorrectly encoded utf8 string.
* In unicode builds it is a normal string, utf8 already decoded.
* Also removes a byte-order-mark from the start of the string if it is present
*/
String decodeUTF8BOM(const String& s);
/// UTF8 Byte order mark for writing at the start of files
/// UTF-8 Byte order mark for writing at the start of files
/** In non-unicode builds it is UTF8 encoded \xFEFF.
* In unicode builds it is a normal \xFEFF.
*/
const Char BYTE_ORDER_MARK[] = L"\xFEFF";
const wchar_t BYTE_ORDER_MARK[] = L"\xFEFF";
/// Writes a string to an output stream, encoded as UTF8
void writeUTF8(wxTextOutputStream& stream, const String& str);
/// Remove a UTF-8 Byte order mark from an input stream
bool eat_utf8_bom(wxInputStream& input);
/// Some constants we like to use
#ifdef UNICODE
#define LEFT_ANGLE_BRACKET _("\x2039")