mirror of
https://github.com/amyinspace/MagicSetEditor2.git
synced 2026-06-10 04:57:00 -04:00
Cleaned up utf8 decoding code.
It was actually completely broken in newer wxWidgets versions, putting '\0' characters in the string, which broke things like concatenation.
This commit is contained in:
@@ -20,7 +20,7 @@ DECLARE_TYPEOF_COLLECTION(Variable);
|
||||
#define TokenType TokenType_ // some stupid windows header uses our name
|
||||
#endif
|
||||
|
||||
String read_utf8_line(wxInputStream& input, bool eat_bom = true, bool until_eof = false);
|
||||
String read_utf8_line(wxInputStream& input, bool until_eof = false);
|
||||
|
||||
extern ScriptValueP script_warning;
|
||||
extern ScriptValueP script_warning_if_neq;
|
||||
@@ -205,7 +205,8 @@ void TokenIterator::readToken() {
|
||||
pos = 0;
|
||||
filename = include_file;
|
||||
InputStreamP is = package_manager.openFileFromPackage(package, include_file);
|
||||
input = read_utf8_line(*is, true, true);
|
||||
eat_utf8_bom(*is);
|
||||
input = read_utf8_line(*is, true);
|
||||
} else if (isAlpha(c) || c == _('_') || (isDigit(c) && !buffer.empty() && buffer.back() == _("."))) {
|
||||
// name, or a number after a . token, as in array.0
|
||||
size_t start = pos - 1;
|
||||
|
||||
+29
-52
@@ -27,6 +27,8 @@ Reader::Reader(const InputStreamP& input, Packaged* package, const String& filen
|
||||
, filename(filename), package(package), line_number(0), previous_line_number(0)
|
||||
, input(input)
|
||||
{
|
||||
assert(input);
|
||||
eat_utf8_bom(*input);
|
||||
moveNext();
|
||||
handleAppVersion();
|
||||
}
|
||||
@@ -37,6 +39,8 @@ Reader::Reader(Reader* parent, Packaged* pkg, const String& filename, bool ignor
|
||||
, filename(filename), package(pkg), line_number(0), previous_line_number(0)
|
||||
, input(package_manager.openFileFromPackage(package, filename))
|
||||
{
|
||||
assert(input);
|
||||
eat_utf8_bom(*input);
|
||||
moveNext();
|
||||
// in an included file, use the app version of the parent if we have none
|
||||
handleAppVersion();
|
||||
@@ -162,83 +166,56 @@ template <typename T> class LocalVector {
|
||||
T small[SMALL_SIZE];
|
||||
};
|
||||
|
||||
/// Eat a utf-8 byte order mark from the begining of a stream
|
||||
bool eat_utf8_bom(wxInputStream& input) {
|
||||
int c;
|
||||
if ((c = input.GetC()) == 0xEF) {
|
||||
if ((c = input.GetC()) == 0xBB) {
|
||||
if ((c = input.GetC()) == 0xBF) {
|
||||
return true;
|
||||
} else if (c != EOF) input.Ungetch(c);
|
||||
} else if (c != EOF) input.Ungetch(c);
|
||||
} else if (c != EOF) input.Ungetch(c);
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Read an UTF-8 encoded line from an input stream
|
||||
/** As opposed to wx functions, this one actually reports errors
|
||||
*/
|
||||
String read_utf8_line(wxInputStream& input, bool eat_bom = true, bool until_eof = false);
|
||||
String read_utf8_line(wxInputStream& input, bool eat_bom, bool until_eof) {
|
||||
String read_utf8_line(wxInputStream& input, bool until_eof = false);
|
||||
String read_utf8_line(wxInputStream& input, bool until_eof) {
|
||||
LocalVector<char> buffer;
|
||||
while (!input.Eof()) {
|
||||
Byte c = input.GetC(); if (input.LastRead() <= 0) break;
|
||||
while (true) {
|
||||
int c = input.GetC();
|
||||
if (c == EOF) break;
|
||||
if (!until_eof) {
|
||||
if (c == '\n') break;
|
||||
if (c == '\r') {
|
||||
if (input.Eof()) break;
|
||||
c = input.GetC(); if (input.LastRead() <= 0) break;
|
||||
if (c != '\n') {
|
||||
c = input.GetC();
|
||||
if (c != '\n' && c != EOF) {
|
||||
input.Ungetch(c); // \r but not \r\n
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
buffer.push_back(c);
|
||||
buffer.push_back((Byte)c);
|
||||
}
|
||||
// convert to string
|
||||
buffer.push_back('\0');
|
||||
size_t size = wxConvUTF8.MB2WC(nullptr, buffer.get(), 0);
|
||||
// TODO: Doing this in one step should be faster
|
||||
size_t size = wxConvUTF8.ToWChar(nullptr, 0, buffer.get(), buffer.size());
|
||||
if (size == size_t(-1)) {
|
||||
throw ParseError(_("Invalid UTF-8 sequence"));
|
||||
} else if (size == 0) {
|
||||
return _("");
|
||||
}
|
||||
#ifdef UNICODE
|
||||
#if wxVERSION_NUMBER >= 2900
|
||||
String result = wxString::FromUTF8(buffer.get(), buffer.size());
|
||||
return eat_bom ? decodeUTF8BOM(result) : result;
|
||||
#else
|
||||
// NOTE: wx doc is wrong, parameter to GetWritableChar is number of characters, not bytes
|
||||
String result;
|
||||
Char* result_buf = result.GetWriteBuf(size + 1);
|
||||
wxConvUTF8.MB2WC(result_buf, buffer.get(), size + 1);
|
||||
result.UngetWriteBuf(size);
|
||||
return eat_bom ? decodeUTF8BOM(result) : result;
|
||||
#endif
|
||||
#else
|
||||
String result;
|
||||
// first to wchar, then back to local
|
||||
vector<wchar_t> buf2; buf2.resize(size+1);
|
||||
wxConvUTF8.MB2WC(&buf2[0], buffer.get(), size + 1);
|
||||
// eat BOM?
|
||||
if (eat_bom && buf2[0]==0xFEFF ) {
|
||||
buf2.erase(buf2.begin()); // remove BOM
|
||||
}
|
||||
// convert
|
||||
#ifdef __WXMSW__
|
||||
// size includes null terminator
|
||||
size = ::WideCharToMultiByte(CP_ACP, 0, &buf2[0], -1, nullptr, 0, nullptr, nullptr);
|
||||
Char* result_buf = result.GetWriteBuf(size);
|
||||
::WideCharToMultiByte(CP_ACP, 0, &buf2[0], -1, result_buf, (int)size, nullptr, nullptr);
|
||||
result.UngetWriteBuf(size - 1);
|
||||
#else
|
||||
for (size_t i = 0 ; i < size ; ++i) {
|
||||
wchar_t wc = buf2[i];
|
||||
if (wc < 0xFF) {
|
||||
result += (Char)wc;
|
||||
} else {
|
||||
// not valid in Latin1
|
||||
result += '?';
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return result;
|
||||
#endif
|
||||
return wxString::FromUTF8(buffer.get(), buffer.size());
|
||||
}
|
||||
|
||||
void Reader::readLine(bool in_string) {
|
||||
line_number += 1;
|
||||
// We have to do our own line reading, because wxTextInputStream is insane
|
||||
try {
|
||||
line = read_utf8_line(*input, line_number == 1);
|
||||
line = read_utf8_line(*input);
|
||||
} catch (const ParseError& e) {
|
||||
throw ParseError(e.what() + String(_(" on line ")) << line_number);
|
||||
}
|
||||
|
||||
@@ -19,7 +19,8 @@ using boost::tribool;
|
||||
|
||||
Writer::Writer(const OutputStreamP& output, Version file_app_version)
|
||||
: indentation(0)
|
||||
, output(output), stream(*output)
|
||||
, output(output)
|
||||
, stream(*output, wxEOL_UNIX, wxMBConvUTF8())
|
||||
{
|
||||
stream.WriteString(BYTE_ORDER_MARK);
|
||||
handle(_("mse_version"), file_app_version);
|
||||
|
||||
@@ -13,25 +13,6 @@
|
||||
|
||||
// ----------------------------------------------------------------------------- : Unicode
|
||||
|
||||
String decodeUTF8BOM(const String& s) {
|
||||
#ifdef UNICODE
|
||||
if (!s.empty() && s.GetChar(0) == L'\xFEFF') {
|
||||
// skip byte-order-mark
|
||||
return s.substr(1);
|
||||
} else {
|
||||
return s;
|
||||
}
|
||||
#else
|
||||
wxWCharBuffer buf = s.wc_str(wxConvUTF8);
|
||||
if (buf && buf[size_t(0)] == L'\xFEFF') {
|
||||
// skip byte-order-mark
|
||||
return String(buf + 1, *wxConvCurrent);
|
||||
} else {
|
||||
return String(buf, *wxConvCurrent);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void writeUTF8(wxTextOutputStream& stream, const String& str) {
|
||||
#ifdef UNICODE
|
||||
stream.WriteString(str);
|
||||
|
||||
+5
-9
@@ -52,22 +52,18 @@ inline wxStdString const& toStdString(String const& s) {
|
||||
/// The character type used
|
||||
typedef wxChar Char;
|
||||
|
||||
/// Decode a UTF8 string
|
||||
/** In non-unicode builds the input is considered to be an incorrectly encoded utf8 string.
|
||||
* In unicode builds it is a normal string, utf8 already decoded.
|
||||
* Also removes a byte-order-mark from the start of the string if it is present
|
||||
*/
|
||||
String decodeUTF8BOM(const String& s);
|
||||
|
||||
/// UTF8 Byte order mark for writing at the start of files
|
||||
/// UTF-8 Byte order mark for writing at the start of files
|
||||
/** In non-unicode builds it is UTF8 encoded \xFEFF.
|
||||
* In unicode builds it is a normal \xFEFF.
|
||||
*/
|
||||
const Char BYTE_ORDER_MARK[] = L"\xFEFF";
|
||||
const wchar_t BYTE_ORDER_MARK[] = L"\xFEFF";
|
||||
|
||||
/// Writes a string to an output stream, encoded as UTF8
|
||||
void writeUTF8(wxTextOutputStream& stream, const String& str);
|
||||
|
||||
/// Remove a UTF-8 Byte order mark from an input stream
|
||||
bool eat_utf8_bom(wxInputStream& input);
|
||||
|
||||
/// Some constants we like to use
|
||||
#ifdef UNICODE
|
||||
#define LEFT_ANGLE_BRACKET _("\x2039")
|
||||
|
||||
Reference in New Issue
Block a user