Adding utility functions to convert invalid utf8 to wtf8 encoding

This is to deal with Windows clients that pass in CP1252-encoded names as if
they were UTF-8.
This commit is contained in:
Kristján Valur Jónsson 2019-02-15 15:57:03 +00:00
parent 951761ee2c
commit ca2d1d873d
4 changed files with 160 additions and 0 deletions

View File

@ -451,6 +451,96 @@ unsigned char* s3fs_decode64(const char* input, size_t* plength)
return result;
}
/*
 * Escape every byte of `s` that is not part of a well-formed UTF-8 sequence.
 *
 * Well-formed sequences are copied through unchanged. Each remaining byte
 * (always in the range 0x80-0xFF) is replaced by the three byte encoding of
 * the lone surrogate U+DC00 + byte, i.e. U+DC80..U+DCFF. This is the mapping
 * the original author refers to as "wtf-8"; it is needed because we can be
 * handed names that are not proper UTF-8 (e.g. CP1252) while the AWS API
 * requires UTF-8 object names.
 *
 * A sequence is accepted only if
 *   - the lead byte announces 2, 3 or 4 bytes,
 *   - all announced continuation bytes are present and look like 10xxxxxx,
 *   - the decoded code point is not an overlong encoding, and
 *   - the decoded code point is not above U+10FFFF.
 *
 * NOTE: three byte sequences that already encode a surrogate are passed
 * through on purpose, so escaping an already escaped string is a no-op.
 * The flip side is that input which literally contains an encoded
 * U+DC80..U+DCFF cannot be told apart from an escaped byte when decoding.
 *
 * Returns the escaped copy; use s3fs_surrogatedecode() to get the bytes back.
 */
std::string s3fs_surrogateescape(const std::string &s)
{
    const std::string::size_type len = s.length();
    std::string result;
    result.reserve(len);

    for (std::string::size_type i = 0; i < len; i++) {
        const unsigned char c = static_cast<unsigned char>(s[i]);

        // single byte encoding (ASCII) is always valid
        if (c <= 0x7f) {
            result += static_cast<char>(c);
            continue;
        }

        // Derive the announced sequence length, the payload bits of the lead
        // byte and the smallest code point that legitimately needs that length.
        std::string::size_type seqlen = 0;
        unsigned code    = 0;
        unsigned mincode = 0;
        if ((c & 0xe0) == 0xc0) {
            seqlen = 2; code = c & 0x1f; mincode = 0x80;
        } else if ((c & 0xf0) == 0xe0) {
            seqlen = 3; code = c & 0x0f; mincode = 0x800;
        } else if ((c & 0xf8) == 0xf0) {
            seqlen = 4; code = c & 0x07; mincode = 0x10000;
        }

        // i < len holds here, so len - i cannot wrap around.
        bool valid = (seqlen != 0) && (len - i >= seqlen);
        for (std::string::size_type k = 1; valid && k < seqlen; k++) {
            const unsigned char t = static_cast<unsigned char>(s[i + k]);
            if ((t & 0xc0) == 0x80) {
                code = (code << 6) | (t & 0x3f);
            } else {
                valid = false;
            }
        }

        // Reject overlong encodings (code < mincode) and values beyond the
        // Unicode range; both are ill-formed UTF-8 even though every byte
        // has the right bit pattern.
        if (valid && code >= mincode && code <= 0x10ffff) {
            result.append(s, i, seqlen);
            i += seqlen - 1;
            continue;
        }

        // Invalid byte: emit the lone surrogate U+DC00 + c as a three byte
        // sequence. Only this one byte is consumed, the following bytes are
        // examined again on their own.
        const unsigned surr = 0xdc00 + c;
        result += static_cast<char>(0xe0 | ((surr >> 12) & 0x0f));
        result += static_cast<char>(0x80 | ((surr >> 6) & 0x3f));
        result += static_cast<char>(0x80 | (surr & 0x3f));
    }
    return result;
}
/*
 * Inverse of s3fs_surrogateescape(): every three byte sequence that encodes
 * a code point in U+DC80..U+DCFF is collapsed back into the single byte
 * (code point & 0xff). All other bytes are copied through untouched.
 */
string s3fs_surrogatedecode(const string &s)
{
    string decoded;
    unsigned pos = 0;
    while (pos < s.length()) {
        const unsigned char lead = s[pos];

        // A candidate is a three byte lead (1110xxxx) followed by two
        // continuation bytes (10xxxxxx) that are both inside the string.
        const bool three_bytes = (lead & 0xf0) == 0xe0
                              && (pos + 2) < s.length()
                              && (s[pos + 1] & 0xc0) == 0x80
                              && (s[pos + 2] & 0xc0) == 0x80;
        if (three_bytes) {
            const unsigned codepoint = ((lead & 0x0f) << 12)
                                     | ((s[pos + 1] & 0x3f) << 6)
                                     | (s[pos + 2] & 0x3f);
            if (0xdc80 <= codepoint && codepoint <= 0xdcff) {
                // escaped byte: restore it and skip the whole sequence
                decoded += codepoint & 0xff;
                pos += 3;
                continue;
            }
        }

        // anything else is passed through one byte at a time
        decoded += lead;
        pos += 1;
    }
    return decoded;
}
/*
* Local variables:
* tab-width: 4

View File

@ -58,6 +58,9 @@ std::string s3fs_hex(const unsigned char* input, size_t length);
char* s3fs_base64(const unsigned char* input, size_t length);
unsigned char* s3fs_decode64(const char* input, size_t* plength);
// Returns a copy of s in which every byte that is not accepted as part of a
// UTF-8 sequence is replaced by the three byte encoding of a lone surrogate
// in U+DC80..U+DCFF.
std::string s3fs_surrogateescape(const std::string &s);
// Reverse operation: returns a copy of s in which every three byte sequence
// encoding U+DC80..U+DCFF is replaced by the single byte it stands for.
std::string s3fs_surrogatedecode(const std::string &s);
#endif // S3FS_STRING_UTIL_H_
/*

View File

@ -87,10 +87,35 @@ void test_strtoofft()
ASSERT_EQUALS(s3fs_strtoofft("deadbeef", /*is_base_16=*/ true), static_cast<off_t>(3735928559L));
}
// Checks that escaping leaves ASCII and well-formed UTF-8 alone, changes
// anything containing stray high bytes, and that decoding restores the
// original bytes in every case.
void test_surrogateescape()
{
    // Fixtures: the same Icelandic sentence as UTF-8 and as CP1252, plus a
    // UTF-8 copy with one lead byte overwritten and a concatenation of all.
    const std::string ascii("normal string");
    const std::string utf8("Hyld\xc3\xbdpi \xc3\xbej\xc3\xb3\xc3\xb0""f\xc3\xa9lagsins vex \xc3\xbar k\xc3\xa6rkomnu b\xc3\xb6li \xc3\xad \xc3\xa1st");
    const std::string cp1252("Hyld\xfdpi \xfej\xf3\xf0""f\xe9lagsins vex \xfar k\xe6rkomnu b\xf6li \xed \xe1st");
    std::string broken(utf8);
    broken[14] = 0x97;
    const std::string mixed = ascii + utf8 + cp1252;

    // Valid input must survive both directions unchanged.
    ASSERT_EQUALS(s3fs_surrogateescape(ascii), ascii);
    ASSERT_EQUALS(s3fs_surrogatedecode(ascii), ascii);
    ASSERT_EQUALS(s3fs_surrogateescape(utf8), utf8);
    ASSERT_EQUALS(s3fs_surrogatedecode(utf8), utf8);

    // Invalid input must be altered by escaping and restored by decoding.
    const std::string cp1252_escaped = s3fs_surrogateescape(cp1252);
    ASSERT_NEQUALS(cp1252_escaped, cp1252);
    ASSERT_EQUALS(s3fs_surrogatedecode(cp1252_escaped), cp1252);

    const std::string broken_escaped = s3fs_surrogateescape(broken);
    ASSERT_NEQUALS(broken_escaped, broken);
    ASSERT_EQUALS(s3fs_surrogatedecode(broken_escaped), broken);

    const std::string mixed_escaped = s3fs_surrogateescape(mixed);
    ASSERT_NEQUALS(mixed_escaped, mixed);
    ASSERT_EQUALS(s3fs_surrogatedecode(mixed_escaped), mixed);
}
// Test driver: runs every test function in sequence and returns 0 once all
// of them have returned.
int main(int argc, char *argv[])
{
    // The command line is not consulted.
    (void)argc;
    (void)argv;

    test_trim();
    test_base64();
    test_strtoofft();
    test_surrogateescape();

    return 0;
}

View File

@ -20,11 +20,50 @@
#include <cstdlib>
#include <iostream>
#include <stdio.h>
// Terminates the process with exit status 1 unless x == y.
// On mismatch both values and the given file/line are written to stderr.
// T must support operator!= and streaming to std::ostream.
template <typename T> void assert_equals(const T &x, const T &y, const char *file, int line)
{
    if (x != y) {
        std::cerr << x << " != " << y << " at " << file << ":" << line << std::endl;
        std::cerr << std::endl;
        std::exit(1);
    }
}

// std::string variant: additionally dumps both strings as hex bytes, one
// line per string, so differences in non-printable bytes are visible.
// Declared inline because an explicit specialization is an ordinary function
// for the one-definition rule; without it, including this header from two
// translation units of the same binary fails to link.
template <> inline void assert_equals(const std::string &x, const std::string &y, const char *file, int line)
{
    if (x != y) {
        std::cerr << x << " != " << y << " at " << file << ":" << line << std::endl;
        for (std::string::size_type i = 0; i < x.length(); i++) {
            fprintf(stderr, "%02x ", static_cast<unsigned char>(x[i]));
        }
        std::cerr << std::endl;
        for (std::string::size_type i = 0; i < y.length(); i++) {
            fprintf(stderr, "%02x ", static_cast<unsigned char>(y[i]));
        }
        std::cerr << std::endl;
        std::exit(1);
    }
}
// Terminates the process with exit status 1 if x == y.
// On failure both values and the given file/line are written to stderr.
// T must support operator== and streaming to std::ostream.
template <typename T> void assert_nequals(const T &x, const T &y, const char *file, int line)
{
    if (x == y) {
        std::cerr << x << " == " << y << " at " << file << ":" << line << std::endl;
        std::exit(1);
    }
}

// std::string variant: additionally dumps both strings as hex bytes, one
// line per string.
// Declared inline because an explicit specialization is an ordinary function
// for the one-definition rule; without it, including this header from two
// translation units of the same binary fails to link.
template <> inline void assert_nequals(const std::string &x, const std::string &y, const char *file, int line)
{
    if (x == y) {
        std::cerr << x << " == " << y << " at " << file << ":" << line << std::endl;
        for (std::string::size_type i = 0; i < x.length(); i++) {
            fprintf(stderr, "%02x ", static_cast<unsigned char>(x[i]));
        }
        std::cerr << std::endl;
        for (std::string::size_type i = 0; i < y.length(); i++) {
            fprintf(stderr, "%02x ", static_cast<unsigned char>(y[i]));
        }
        std::cerr << std::endl;
        std::exit(1);
    }
}
@ -43,5 +82,8 @@ void assert_strequals(const char *x, const char *y, const char *file, int line)
// Forwards to assert_equals(), adding the call site's file and line.
#define ASSERT_EQUALS(x, y) \
assert_equals((x), (y), __FILE__, __LINE__)
// Forwards to assert_nequals(), adding the call site's file and line.
#define ASSERT_NEQUALS(x, y) \
assert_nequals((x), (y), __FILE__, __LINE__)
// Forwards to assert_strequals(), adding the call site's file and line.
#define ASSERT_STREQUALS(x, y) \
assert_strequals((x), (y), __FILE__, __LINE__)