more robust wtf8 encoding

This commit is contained in:
Kristján Valur Jónsson 2019-02-18 12:27:44 +00:00
parent ca2d1d873d
commit 11b385820d
3 changed files with 123 additions and 89 deletions

View File

@ -451,93 +451,125 @@ unsigned char* s3fs_decode64(const char* input, size_t* plength)
return result;
}
/* handle invalid utf8 by creating surrogate escape pairs.
* this converts the data into the so-called wtf-8 encoding.
* It is necessary if we are given data that isn't proper utf8
* but the aws api requires proper utf8 for object names
/*
* Detect and rewrite invalid UTF-8. Bytes that are not part of a valid
* UTF-8 sequence are encoded as code points in a private-use region of
* the Unicode space. This is sometimes known as wtf8 (wobbly
* transformation format). It is necessary because S3 validates the
* UTF-8 used for identifiers, while some clients may provide invalid
* UTF-8, notably Windows clients using cp1252.
*/
string s3fs_surrogateescape(const string &s)
// First code point of the private-use range used to carry escaped bytes.
// An invalid byte b is stored as the code point escape_base + b.
static const unsigned int escape_base = 0xe000;

// Rewrite a NUL-terminated byte string so that it is valid UTF-8.
//
// Well-formed UTF-8 sequences are copied through unchanged. Every byte that
// is not part of a well-formed sequence (overlong forms, encoded surrogates,
// code points above U+10FFFF, stray continuation bytes, truncated sequences)
// is replaced by the three-byte UTF-8 encoding of escape_base + byte.
//
// s      : input bytes, NUL terminated. A null pointer is treated as empty.
// result : optional output. When non-null the converted text is APPENDED to
//          it; pass null to only test whether the input needs conversion.
// returns: true if at least one byte had to be escaped, false otherwise.
//
// Note: valid input code points that already lie in the escape range are
// copied through unchanged, so they cannot be told apart from escaped bytes.
bool s3fs_wtf8_encode(const char *s, std::string *result)
{
    bool invalid = false;
    if (!s) {
        return invalid;
    }

    for (; *s; s++) {
        const unsigned char c = *s;

        // single byte encoding (ASCII)
        if (c <= 0x7f) {
            if (result) {
                *result += static_cast<char>(c);
            }
            continue;
        }

        // Otherwise it must be one of the valid lead bytes. 0xc0/0xc1 can only
        // start overlong two-byte forms and anything above 0xf4 would encode a
        // code point beyond U+10FFFF.
        if (c >= 0xc2 && c <= 0xf4) {
            // No explicit bounds checks are needed below: the string is NUL
            // terminated, NUL is not a continuation byte, and && short-circuits,
            // so we never read past the terminator.

            // two byte encoding; every lead byte >= 0xc2 gives a non-overlong value
            if ((c & 0xe0) == 0xc0 && (s[1] & 0xc0) == 0x80) {
                if (result) {
                    result->append(s, 2);
                }
                s += 1;
                continue;
            }

            // three byte encoding
            if ((c & 0xf0) == 0xe0 && (s[1] & 0xc0) == 0x80 && (s[2] & 0xc0) == 0x80) {
                const unsigned code = ((c & 0x0f) << 12) | ((s[1] & 0x3f) << 6) | (s[2] & 0x3f);
                // reject overlong forms and the whole surrogate block U+D800..U+DFFF
                if (code >= 0x800 && !(code >= 0xd800 && code <= 0xdfff)) {
                    if (result) {
                        result->append(s, 3);
                    }
                    s += 2;
                    continue;
                }
            }

            // four byte encoding: lead byte is 11110xxx and carries three payload bits
            if ((c & 0xf8) == 0xf0 && (s[1] & 0xc0) == 0x80 && (s[2] & 0xc0) == 0x80 && (s[3] & 0xc0) == 0x80) {
                const unsigned code = ((c & 0x07) << 18) | ((s[1] & 0x3f) << 12) | ((s[2] & 0x3f) << 6) | (s[3] & 0x3f);
                // reject overlong forms and values outside the defined Unicode space
                if (code >= 0x10000 && code <= 0x10ffff) {
                    if (result) {
                        result->append(s, 4);
                    }
                    s += 3;
                    continue;
                }
            }
        }

        // Invalid byte: map it into the private-use area starting at
        // escape_base and emit that code point as a three-byte UTF-8 sequence.
        // Only this one byte is consumed; the following bytes are re-examined.
        invalid = true;
        if (result) {
            const unsigned escape = escape_base + c;
            *result += static_cast<char>(0xe0 | ((escape >> 12) & 0x0f));
            *result += static_cast<char>(0x80 | ((escape >> 6) & 0x3f));
            *result += static_cast<char>(0x80 | (escape & 0x3f));
        }
    }
    return invalid;
}
// Convenience overload: returns the encoded form of s as a new string.
// The input is handed to the pointer-based overload through c_str(), and
// that overload's boolean status is discarded.
string s3fs_wtf8_encode(const string &s)
{
    string encoded;
    s3fs_wtf8_encode(s.c_str(), &encoded);
    return encoded;
}
// The reverse of s3fs_wtf8_encode: turn escaped bytes back into their
// original values.
//
// Every three-byte UTF-8 sequence whose code point lies in
// [escape_base, escape_base + 0xff] is replaced by the single byte
// (code point - escape_base). All other bytes are copied through unchanged.
//
// s      : input bytes, NUL terminated. A null pointer is treated as empty.
// result : optional output. When non-null the decoded bytes are APPENDED to
//          it; pass null to only test whether the input contains escapes.
// returns: true if at least one escaped byte was found, false otherwise.
bool s3fs_wtf8_decode(const char *s, std::string *result)
{
    bool encoded = false;
    if (!s) {
        return encoded;
    }

    for (; *s; s++) {
        const unsigned char c = *s;

        // Look for a three byte tuple matching our escape code. The string is
        // NUL terminated and && short-circuits, so s[2] is only read when s[1]
        // is a (non-NUL) continuation byte.
        if ((c & 0xf0) == 0xe0 && (s[1] & 0xc0) == 0x80 && (s[2] & 0xc0) == 0x80) {
            const unsigned code = ((c & 0x0f) << 12) | ((s[1] & 0x3f) << 6) | (s[2] & 0x3f);
            // The range is inclusive: byte 0xff is stored as escape_base + 0xff.
            if (code >= escape_base && code <= escape_base + 0xff) {
                // convert back to the original byte
                encoded = true;
                if (result) {
                    *result += static_cast<char>(code - escape_base);
                }
                s += 2;
                continue;
            }
        }

        // anything else is passed through as is
        if (result) {
            *result += static_cast<char>(c);
        }
    }
    return encoded;
}
// Convenience overload: returns the decoded form of s as a new string.
// The input is handed to the pointer-based overload through c_str(), and
// that overload's boolean status is discarded.
string s3fs_wtf8_decode(const string &s)
{
    string decoded;
    s3fs_wtf8_decode(s.c_str(), &decoded);
    return decoded;
}

View File

@ -58,8 +58,10 @@ std::string s3fs_hex(const unsigned char* input, size_t length);
char* s3fs_base64(const unsigned char* input, size_t length);
unsigned char* s3fs_decode64(const char* input, size_t* plength);
std::string s3fs_surrogateescape(const std::string &s);
std::string s3fs_surrogatedecode(const std::string &s);
bool s3fs_wtf8_encode(const char *s, std::string *result);
std::string s3fs_wtf8_encode(const std::string &s);
bool s3fs_wtf8_decode(const char *s, std::string *result);
std::string s3fs_wtf8_decode(const std::string &s);
#endif // S3FS_STRING_UTIL_H_

View File

@ -87,7 +87,7 @@ void test_strtoofft()
ASSERT_EQUALS(s3fs_strtoofft("deadbeef", /*is_base_16=*/ true), static_cast<off_t>(3735928559L));
}
void test_surrogateescape()
void test_wtf8_encoding()
{
std::string ascii("normal string");
std::string utf8("Hyld\xc3\xbdpi \xc3\xbej\xc3\xb3\xc3\xb0""f\xc3\xa9lagsins vex \xc3\xbar k\xc3\xa6rkomnu b\xc3\xb6li \xc3\xad \xc3\xa1st");
@ -96,19 +96,19 @@ void test_surrogateescape()
broken[14] = 0x97;
std::string mixed = ascii + utf8 + cp1252;
ASSERT_EQUALS(s3fs_surrogateescape(ascii), ascii);
ASSERT_EQUALS(s3fs_surrogatedecode(ascii), ascii);
ASSERT_EQUALS(s3fs_surrogateescape(utf8), utf8);
ASSERT_EQUALS(s3fs_surrogatedecode(utf8), utf8);
ASSERT_EQUALS(s3fs_wtf8_encode(ascii), ascii);
ASSERT_EQUALS(s3fs_wtf8_decode(ascii), ascii);
ASSERT_EQUALS(s3fs_wtf8_encode(utf8), utf8);
ASSERT_EQUALS(s3fs_wtf8_decode(utf8), utf8);
ASSERT_NEQUALS(s3fs_surrogateescape(cp1252), cp1252);
ASSERT_EQUALS(s3fs_surrogatedecode(s3fs_surrogateescape(cp1252)), cp1252);
ASSERT_NEQUALS(s3fs_wtf8_encode(cp1252), cp1252);
ASSERT_EQUALS(s3fs_wtf8_decode(s3fs_wtf8_encode(cp1252)), cp1252);
ASSERT_NEQUALS(s3fs_surrogateescape(broken), broken);
ASSERT_EQUALS(s3fs_surrogatedecode(s3fs_surrogateescape(broken)), broken);
ASSERT_NEQUALS(s3fs_wtf8_encode(broken), broken);
ASSERT_EQUALS(s3fs_wtf8_decode(s3fs_wtf8_encode(broken)), broken);
ASSERT_NEQUALS(s3fs_surrogateescape(mixed), mixed);
ASSERT_EQUALS(s3fs_surrogatedecode(s3fs_surrogateescape(mixed)), mixed);
ASSERT_NEQUALS(s3fs_wtf8_encode(mixed), mixed);
ASSERT_EQUALS(s3fs_wtf8_decode(s3fs_wtf8_encode(mixed)), mixed);
}
int main(int argc, char *argv[])
@ -116,6 +116,6 @@ int main(int argc, char *argv[])
test_trim();
test_base64();
test_strtoofft();
test_surrogateescape();
test_wtf8_encoding();
return 0;
}