mirror of
https://github.com/s3fs-fuse/s3fs-fuse.git
synced 2025-01-03 05:00:15 +00:00
Add utility functions to convert invalid UTF-8 to the WTF-8 encoding.
This deals with Windows clients that pass in CP1252-encoded names as if they were UTF-8.
This commit is contained in:
parent
951761ee2c
commit
ca2d1d873d
@ -451,6 +451,96 @@ unsigned char* s3fs_decode64(const char* input, size_t* plength)
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
 * Helper for s3fs_surrogateescape().
 *
 * Returns the length in bytes (1-4) of the well-formed UTF-8 sequence that
 * starts at s[pos], or 0 when the bytes at that position do not form one.
 * The caller must guarantee pos < s.length().
 *
 * Rejected (return 0): stray continuation bytes, invalid lead bytes,
 * truncated sequences, overlong encodings, encoded surrogates
 * (U+D800..U+DFFF) and code points above U+10FFFF.
 */
static string::size_type s3fs_utf8_sequence_length(const string &s, string::size_type pos)
{
    const unsigned char lead = static_cast<unsigned char>(s[pos]);

    // single byte encoding (plain ASCII)
    if (lead <= 0x7f) {
        return 1;
    }

    string::size_type len;      // total sequence length implied by the lead byte
    unsigned          code;     // code point accumulated so far
    unsigned          min_code; // smallest code point that needs `len` bytes
    if ((lead & 0xe0) == 0xc0) {
        len = 2; code = lead & 0x1f; min_code = 0x80;
    } else if ((lead & 0xf0) == 0xe0) {
        len = 3; code = lead & 0x0f; min_code = 0x800;
    } else if ((lead & 0xf8) == 0xf0) {
        len = 4; code = lead & 0x07; min_code = 0x10000;
    } else {
        // continuation byte without a lead byte, or 0xf8..0xff
        return 0;
    }

    // the whole sequence must be present
    if (s.length() - pos < len) {
        return 0;
    }
    for (string::size_type k = 1; k < len; ++k) {
        const unsigned char cont = static_cast<unsigned char>(s[pos + k]);
        if ((cont & 0xc0) != 0x80) {
            return 0;
        }
        code = (code << 6) | (cont & 0x3f);
    }

    // overlong encoding: the value would have fit into a shorter sequence
    if (code < min_code) {
        return 0;
    }
    // Encoded surrogates are not valid UTF-8.  They must not be passed
    // through: s3fs_surrogatedecode() would mistake a pre-existing
    // U+DC80..U+DCFF for an escape and the round trip would lose data.
    if (code >= 0xd800 && code <= 0xdfff) {
        return 0;
    }
    // beyond the Unicode code space
    if (code > 0x10ffff) {
        return 0;
    }
    return len;
}

/*
 * Handle invalid UTF-8 by escaping each offending byte as a lone surrogate
 * (the so-called WTF-8 encoding).
 *
 * This is necessary if we are given data that isn't proper UTF-8, because
 * the AWS API requires proper UTF-8 for object names.
 *
 * Well-formed UTF-8 sequences are copied through unchanged.  Every byte that
 * is not part of a well-formed sequence (always a byte >= 0x80) is replaced
 * by the three byte encoding of the lone surrogate U+DC00 + byte, i.e. a
 * code point in U+DC80..U+DCFF.  s3fs_surrogatedecode() is the reverse
 * operation.
 */
string s3fs_surrogateescape(const string &s)
{
    string result;
    result.reserve(s.length());

    string::size_type pos = 0;
    while (pos < s.length()) {
        const string::size_type len = s3fs_utf8_sequence_length(s, pos);
        if (len != 0) {
            // valid UTF-8: pass the whole sequence through
            result.append(s, pos, len);
            pos += len;
            continue;
        }

        // Invalid byte.  Output the lone surrogate as a three byte encoding.
        const unsigned surr = 0xdc00 + static_cast<unsigned char>(s[pos]);
        result += static_cast<char>(0xe0 | ((surr >> 12) & 0x0f));
        result += static_cast<char>(0x80 | ((surr >> 6) & 0x3f));
        result += static_cast<char>(0x80 | (surr & 0x3f));
        ++pos;
    }
    return result;
}
|
||||
|
||||
/*
 * Reverse operation of the surrogate escaping: every three byte sequence
 * that encodes a lone surrogate in the range U+DC80..U+DCFF is replaced by
 * the single byte held in its low eight bits.  All other bytes are copied
 * through unchanged.
 */
string s3fs_surrogatedecode(const string &s)
{
    string decoded;
    unsigned pos = 0;
    while (pos < s.length()) {
        const unsigned char lead = s[pos];

        // candidate: a three byte lead followed by two continuation bytes
        const bool three_bytes = (lead & 0xf0) == 0xe0
                              && (pos + 2) < s.length()
                              && (s[pos + 1] & 0xc0) == 0x80
                              && (s[pos + 2] & 0xc0) == 0x80;
        if (three_bytes) {
            unsigned code = (lead & 0x0f) << 12;
            code |= (s[pos + 1] & 0x3f) << 6;
            code |= (s[pos + 2] & 0x3f);
            if (0xdc80 <= code && code <= 0xdcff) {
                // an escaped byte: restore it and skip the whole sequence
                decoded += static_cast<char>(code & 0xff);
                pos += 3;
                continue;
            }
        }

        // anything else is passed through one byte at a time
        decoded += static_cast<char>(lead);
        ++pos;
    }
    return decoded;
}
|
||||
|
||||
/*
|
||||
* Local variables:
|
||||
* tab-width: 4
|
||||
|
@ -58,6 +58,9 @@ std::string s3fs_hex(const unsigned char* input, size_t length);
|
||||
char* s3fs_base64(const unsigned char* input, size_t length);
|
||||
unsigned char* s3fs_decode64(const char* input, size_t* plength);
|
||||
|
||||
std::string s3fs_surrogateescape(const std::string &s);
|
||||
std::string s3fs_surrogatedecode(const std::string &s);
|
||||
|
||||
#endif // S3FS_STRING_UTIL_H_
|
||||
|
||||
/*
|
||||
|
@ -87,10 +87,35 @@ void test_strtoofft()
|
||||
ASSERT_EQUALS(s3fs_strtoofft("deadbeef", /*is_base_16=*/ true), static_cast<off_t>(3735928559L));
|
||||
}
|
||||
|
||||
void test_surrogateescape()
|
||||
{
|
||||
std::string ascii("normal string");
|
||||
std::string utf8("Hyld\xc3\xbdpi \xc3\xbej\xc3\xb3\xc3\xb0""f\xc3\xa9lagsins vex \xc3\xbar k\xc3\xa6rkomnu b\xc3\xb6li \xc3\xad \xc3\xa1st");
|
||||
std::string cp1252("Hyld\xfdpi \xfej\xf3\xf0""f\xe9lagsins vex \xfar k\xe6rkomnu b\xf6li \xed \xe1st");
|
||||
std::string broken = utf8;
|
||||
broken[14] = 0x97;
|
||||
std::string mixed = ascii + utf8 + cp1252;
|
||||
|
||||
ASSERT_EQUALS(s3fs_surrogateescape(ascii), ascii);
|
||||
ASSERT_EQUALS(s3fs_surrogatedecode(ascii), ascii);
|
||||
ASSERT_EQUALS(s3fs_surrogateescape(utf8), utf8);
|
||||
ASSERT_EQUALS(s3fs_surrogatedecode(utf8), utf8);
|
||||
|
||||
ASSERT_NEQUALS(s3fs_surrogateescape(cp1252), cp1252);
|
||||
ASSERT_EQUALS(s3fs_surrogatedecode(s3fs_surrogateescape(cp1252)), cp1252);
|
||||
|
||||
ASSERT_NEQUALS(s3fs_surrogateescape(broken), broken);
|
||||
ASSERT_EQUALS(s3fs_surrogatedecode(s3fs_surrogateescape(broken)), broken);
|
||||
|
||||
ASSERT_NEQUALS(s3fs_surrogateescape(mixed), mixed);
|
||||
ASSERT_EQUALS(s3fs_surrogatedecode(s3fs_surrogateescape(mixed)), mixed);
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
test_trim();
|
||||
test_base64();
|
||||
test_strtoofft();
|
||||
test_surrogateescape();
|
||||
return 0;
|
||||
}
|
||||
|
@ -20,11 +20,50 @@
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <stdio.h>
|
||||
|
||||
// Checks that x and y compare equal.  When `x != y` holds, both values and
// the call site (file:line) are written to std::cerr and the process exits
// with status 1.  Normally invoked through the ASSERT_EQUALS macro.
template <typename T> void assert_equals(const T &x, const T &y, const char *file, int line)
{
    if (x != y) {
        std::cerr << x << " != " << y << " at " << file << ":" << line << std::endl;
        std::cerr << std::endl;
        std::exit(1);
    }
}

// std::string specialization: in addition to the plain message, both strings
// are dumped byte by byte in hex so differences in non-printable or non-ASCII
// bytes are visible.
// Declared inline because an explicit specialization is an ordinary function:
// without it, including this definition from more than one translation unit
// would produce multiple definitions.
template <> inline void assert_equals(const std::string &x, const std::string &y, const char *file, int line)
{
    if (x != y) {
        std::cerr << x << " != " << y << " at " << file << ":" << line << std::endl;
        for (std::string::size_type i = 0; i < x.length(); i++) {
            fprintf(stderr, "%02x ", (unsigned char)x[i]);
        }
        std::cerr << std::endl;
        for (std::string::size_type i = 0; i < y.length(); i++) {
            fprintf(stderr, "%02x ", (unsigned char)y[i]);
        }
        std::cerr << std::endl;
        std::exit(1);
    }
}
|
||||
|
||||
|
||||
// Checks that x and y differ.  When `x == y` holds, both values and the call
// site (file:line) are written to std::cerr and the process exits with
// status 1.  Normally invoked through the ASSERT_NEQUALS macro.
template <typename T> void assert_nequals(const T &x, const T &y, const char *file, int line)
{
    if (x == y) {
        std::cerr << x << " == " << y << " at " << file << ":" << line << std::endl;
        std::exit(1);
    }
}

// std::string specialization: in addition to the plain message, both strings
// are dumped byte by byte in hex so non-printable or non-ASCII bytes are
// visible.
// Declared inline because an explicit specialization is an ordinary function:
// without it, including this definition from more than one translation unit
// would produce multiple definitions.
template <> inline void assert_nequals(const std::string &x, const std::string &y, const char *file, int line)
{
    if (x == y) {
        std::cerr << x << " == " << y << " at " << file << ":" << line << std::endl;
        for (std::string::size_type i = 0; i < x.length(); i++) {
            fprintf(stderr, "%02x ", (unsigned char)x[i]);
        }
        std::cerr << std::endl;
        for (std::string::size_type i = 0; i < y.length(); i++) {
            fprintf(stderr, "%02x ", (unsigned char)y[i]);
        }
        std::cerr << std::endl;
        std::exit(1);
    }
}
|
||||
@ -43,5 +82,8 @@ void assert_strequals(const char *x, const char *y, const char *file, int line)
|
||||
// Convenience wrappers around the assert_* helpers: each one forwards its two
// operands together with the file name and line number of the call site.

// Passes x and y to assert_equals().
#define ASSERT_EQUALS(x, y) \
    assert_equals((x), (y), __FILE__, __LINE__)

// Passes x and y to assert_nequals().
#define ASSERT_NEQUALS(x, y) \
    assert_nequals((x), (y), __FILE__, __LINE__)

// Passes x and y to assert_strequals().
#define ASSERT_STREQUALS(x, y) \
    assert_strequals((x), (y), __FILE__, __LINE__)
|
||||
|
Loading…
Reference in New Issue
Block a user