Merge pull request #960 from kristjanvalur/wtf8

Wtf8
This commit is contained in:
Takeshi Nakatani 2019-02-27 21:21:38 +09:00 committed by GitHub
commit 0791fdca2a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 293 additions and 28 deletions

View File

@ -306,6 +306,14 @@ If you specify no argument as an option, objects older than 24 hours(24H) will b
You can specify an optional date format.
It can be specified as year, month, day, hour, minute, second, and it is expressed as "Y", "M", "D", "h", "m", "s" respectively.
For example, "1Y6M10D12h30m30s".
.TP
\fB\-o\fR use_wtf8 - support arbitrary file system encoding.
S3 requires all object names to be valid utf-8. But some
clients, notably Windows NFS clients, use their own encoding.
This option re-encodes invalid utf-8 object names into valid
utf-8 by mapping offending codes into a 'private' codepage of the
Unicode set.
Useful on clients not using utf-8 as their file system encoding.
.SH FUSE/MOUNT OPTIONS
.TP
Most of the generic mount options described in 'man mount' are supported (ro, rw, suid, nosuid, dev, nodev, exec, noexec, atime, noatime, sync async, dirsync). Filesystems are mounted with '\-onodev,nosuid' by default, which can only be overridden by a privileged user.

View File

@ -147,6 +147,7 @@ static bool is_specified_endpoint = false;
static int s3fs_init_deferred_exit_status = 0;
static bool support_compat_dir = true;// default supports compatibility directory type
static int max_keys_list_object = 1000;// default is 1000
static bool use_wtf8 = false;
static const std::string allbucket_fields_type; // special key for mapping(This name is absolutely not used as a bucket name)
static const std::string keyval_fields_type = "\t"; // special key for mapping(This name is absolutely not used as a bucket name)
@ -251,6 +252,17 @@ static int s3fs_getxattr(const char* path, const char* name, char* value, size_t
static int s3fs_listxattr(const char* path, char* list, size_t size);
static int s3fs_removexattr(const char* path, const char* name);
//-------------------------------------------------------------------
// WTF8 macros
//-------------------------------------------------------------------
// WTF8_ENCODE(ARG) expects a `const char* _ARG` to be in scope and declares
// two locals: `ARG` (const char*) and `ARG_buf` (std::string).
// `ARG` starts out aliasing `_ARG`. When the use_wtf8 flag is set and
// s3fs_wtf8_encode() reports that `_ARG` needs the transform, the encoded
// text is built in `ARG_buf` and `ARG` is pointed at that buffer instead.
// `ARG` is therefore only valid while `ARG_buf` is alive (same scope).
#define WTF8_ENCODE(ARG) \
    const char * ARG = _##ARG; \
    std::string ARG##_buf; \
    if (use_wtf8) { \
        if (s3fs_wtf8_encode( _##ARG, 0 )) { \
            s3fs_wtf8_encode( _##ARG, &ARG##_buf); \
            ARG = ARG##_buf.c_str(); \
        } \
    }
//-------------------------------------------------------------------
// Functions
//-------------------------------------------------------------------
@ -847,8 +859,9 @@ static int put_headers(const char* path, headers_t& meta, bool is_copy)
return 0;
}
static int s3fs_getattr(const char* path, struct stat* stbuf)
static int s3fs_getattr(const char* _path, struct stat* stbuf)
{
WTF8_ENCODE(path)
int result;
S3FS_PRN_INFO("[path=%s]", path);
@ -881,11 +894,12 @@ static int s3fs_getattr(const char* path, struct stat* stbuf)
return result;
}
static int s3fs_readlink(const char* path, char* buf, size_t size)
static int s3fs_readlink(const char* _path, char* buf, size_t size)
{
if(!path || !buf || 0 == size){
if(!_path || !buf || 0 == size){
return 0;
}
WTF8_ENCODE(path)
// Open
FdEntity* ent;
if(NULL == (ent = get_local_fent(path))){
@ -913,6 +927,9 @@ static int s3fs_readlink(const char* path, char* buf, size_t size)
// check buf if it has space words.
string strTmp = trim(string(buf));
// decode wtf8. This will always be shorter
if (use_wtf8)
strTmp = s3fs_wtf8_decode(strTmp);
strcpy(buf, strTmp.c_str());
FdManager::get()->Close(ent);
@ -985,8 +1002,9 @@ static int create_file_object(const char* path, mode_t mode, uid_t uid, gid_t gi
return s3fscurl.PutRequest(path, meta, -1); // fd=-1 means for creating zero byte object.
}
static int s3fs_mknod(const char *path, mode_t mode, dev_t rdev)
static int s3fs_mknod(const char *_path, mode_t mode, dev_t rdev)
{
WTF8_ENCODE(path)
int result;
struct fuse_context* pcxt;
@ -1006,8 +1024,9 @@ static int s3fs_mknod(const char *path, mode_t mode, dev_t rdev)
return result;
}
static int s3fs_create(const char* path, mode_t mode, struct fuse_file_info* fi)
static int s3fs_create(const char* _path, mode_t mode, struct fuse_file_info* fi)
{
WTF8_ENCODE(path)
int result;
struct fuse_context* pcxt;
@ -1073,8 +1092,9 @@ static int create_directory_object(const char* path, mode_t mode, time_t time, u
return s3fscurl.PutRequest(tpath.c_str(), meta, -1); // fd=-1 means for creating zero byte object.
}
static int s3fs_mkdir(const char* path, mode_t mode)
static int s3fs_mkdir(const char* _path, mode_t mode)
{
WTF8_ENCODE(path)
int result;
struct fuse_context* pcxt;
@ -1102,8 +1122,9 @@ static int s3fs_mkdir(const char* path, mode_t mode)
return result;
}
static int s3fs_unlink(const char* path)
static int s3fs_unlink(const char* _path)
{
WTF8_ENCODE(path)
int result;
S3FS_PRN_INFO("[path=%s]", path);
@ -1135,8 +1156,9 @@ static int directory_empty(const char* path)
return 0;
}
static int s3fs_rmdir(const char* path)
static int s3fs_rmdir(const char* _path)
{
WTF8_ENCODE(path)
int result;
string strpath;
struct stat stbuf;
@ -1191,8 +1213,10 @@ static int s3fs_rmdir(const char* path)
return result;
}
static int s3fs_symlink(const char* from, const char* to)
static int s3fs_symlink(const char* _from, const char* _to)
{
WTF8_ENCODE(from)
WTF8_ENCODE(to)
int result;
struct fuse_context* pcxt;
@ -1514,8 +1538,10 @@ static int rename_directory(const char* from, const char* to)
return 0;
}
static int s3fs_rename(const char* from, const char* to)
static int s3fs_rename(const char* _from, const char* _to)
{
WTF8_ENCODE(from)
WTF8_ENCODE(to)
struct stat buf;
int result;
@ -1556,14 +1582,17 @@ static int s3fs_rename(const char* from, const char* to)
return result;
}
static int s3fs_link(const char* from, const char* to)
// Handler for link requests: nothing is created. The request is logged and
// -ENOTSUP is returned unconditionally.
//
// _from, _to - paths as received from the caller; WTF8_ENCODE derives the
//              locals `from` / `to` from them (re-encoded when use_wtf8 is
//              set and the text needs it), which are used only for logging.
static int s3fs_link(const char* _from, const char* _to)
{
WTF8_ENCODE(from)
WTF8_ENCODE(to)
S3FS_PRN_INFO("[from=%s][to=%s]", from, to);
return -ENOTSUP;
}
static int s3fs_chmod(const char* path, mode_t mode)
static int s3fs_chmod(const char* _path, mode_t mode)
{
WTF8_ENCODE(path)
int result;
string strpath;
string newpath;
@ -1639,8 +1668,9 @@ static int s3fs_chmod(const char* path, mode_t mode)
return 0;
}
static int s3fs_chmod_nocopy(const char* path, mode_t mode)
static int s3fs_chmod_nocopy(const char* _path, mode_t mode)
{
WTF8_ENCODE(path)
int result;
string strpath;
string newpath;
@ -1715,8 +1745,9 @@ static int s3fs_chmod_nocopy(const char* path, mode_t mode)
return result;
}
static int s3fs_chown(const char* path, uid_t uid, gid_t gid)
static int s3fs_chown(const char* _path, uid_t uid, gid_t gid)
{
WTF8_ENCODE(path)
int result;
string strpath;
string newpath;
@ -1786,8 +1817,9 @@ static int s3fs_chown(const char* path, uid_t uid, gid_t gid)
return 0;
}
static int s3fs_chown_nocopy(const char* path, uid_t uid, gid_t gid)
static int s3fs_chown_nocopy(const char* _path, uid_t uid, gid_t gid)
{
WTF8_ENCODE(path)
int result;
string strpath;
string newpath;
@ -1870,8 +1902,9 @@ static int s3fs_chown_nocopy(const char* path, uid_t uid, gid_t gid)
return result;
}
static int s3fs_utimens(const char* path, const struct timespec ts[2])
static int s3fs_utimens(const char* _path, const struct timespec ts[2])
{
WTF8_ENCODE(path)
int result;
string strpath;
string newpath;
@ -1935,8 +1968,9 @@ static int s3fs_utimens(const char* path, const struct timespec ts[2])
return 0;
}
static int s3fs_utimens_nocopy(const char* path, const struct timespec ts[2])
static int s3fs_utimens_nocopy(const char* _path, const struct timespec ts[2])
{
WTF8_ENCODE(path)
int result;
string strpath;
string newpath;
@ -2017,8 +2051,9 @@ static int s3fs_utimens_nocopy(const char* path, const struct timespec ts[2])
return result;
}
static int s3fs_truncate(const char* path, off_t size)
static int s3fs_truncate(const char* _path, off_t size)
{
WTF8_ENCODE(path)
int result;
headers_t meta;
FdEntity* ent = NULL;
@ -2084,8 +2119,9 @@ static int s3fs_truncate(const char* path, off_t size)
return result;
}
static int s3fs_open(const char* path, struct fuse_file_info* fi)
static int s3fs_open(const char* _path, struct fuse_file_info* fi)
{
WTF8_ENCODE(path)
int result;
struct stat st;
bool needs_flush = false;
@ -2144,8 +2180,9 @@ static int s3fs_open(const char* path, struct fuse_file_info* fi)
return 0;
}
static int s3fs_read(const char* path, char* buf, size_t size, off_t offset, struct fuse_file_info* fi)
static int s3fs_read(const char* _path, char* buf, size_t size, off_t offset, struct fuse_file_info* fi)
{
WTF8_ENCODE(path)
ssize_t res;
S3FS_PRN_DBG("[path=%s][size=%zu][offset=%jd][fd=%llu]", path, size, (intmax_t)offset, (unsigned long long)(fi->fh));
@ -2175,8 +2212,9 @@ static int s3fs_read(const char* path, char* buf, size_t size, off_t offset, str
return static_cast<int>(res);
}
static int s3fs_write(const char* path, const char* buf, size_t size, off_t offset, struct fuse_file_info* fi)
static int s3fs_write(const char* _path, const char* buf, size_t size, off_t offset, struct fuse_file_info* fi)
{
WTF8_ENCODE(path)
ssize_t res;
S3FS_PRN_DBG("[path=%s][size=%zu][offset=%jd][fd=%llu]", path, size, (intmax_t)offset, (unsigned long long)(fi->fh));
@ -2197,8 +2235,9 @@ static int s3fs_write(const char* path, const char* buf, size_t size, off_t offs
return static_cast<int>(res);
}
static int s3fs_statfs(const char* path, struct statvfs* stbuf)
static int s3fs_statfs(const char* _path, struct statvfs* stbuf)
{
// WTF8_ENCODE(path)
// 256T
stbuf->f_bsize = 0X1000000;
stbuf->f_blocks = 0X1000000;
@ -2208,8 +2247,9 @@ static int s3fs_statfs(const char* path, struct statvfs* stbuf)
return 0;
}
static int s3fs_flush(const char* path, struct fuse_file_info* fi)
static int s3fs_flush(const char* _path, struct fuse_file_info* fi)
{
WTF8_ENCODE(path)
int result;
S3FS_PRN_INFO("[path=%s][fd=%llu]", path, (unsigned long long)(fi->fh));
@ -2241,8 +2281,9 @@ static int s3fs_flush(const char* path, struct fuse_file_info* fi)
// [NOTICE]
// Assumption is a valid fd.
//
static int s3fs_fsync(const char* path, int datasync, struct fuse_file_info* fi)
static int s3fs_fsync(const char* _path, int datasync, struct fuse_file_info* fi)
{
WTF8_ENCODE(path)
int result = 0;
S3FS_PRN_INFO("[path=%s][fd=%llu]", path, (unsigned long long)(fi->fh));
@ -2263,8 +2304,9 @@ static int s3fs_fsync(const char* path, int datasync, struct fuse_file_info* fi)
return result;
}
static int s3fs_release(const char* path, struct fuse_file_info* fi)
static int s3fs_release(const char* _path, struct fuse_file_info* fi)
{
WTF8_ENCODE(path)
S3FS_PRN_INFO("[path=%s][fd=%llu]", path, (unsigned long long)(fi->fh));
// [NOTE]
@ -2303,8 +2345,9 @@ static int s3fs_release(const char* path, struct fuse_file_info* fi)
return 0;
}
static int s3fs_opendir(const char* path, struct fuse_file_info* fi)
static int s3fs_opendir(const char* _path, struct fuse_file_info* fi)
{
WTF8_ENCODE(path)
int result;
int mask = (O_RDONLY != (fi->flags & O_ACCMODE) ? W_OK : R_OK) | X_OK;
@ -2435,8 +2478,11 @@ static int readdir_multi_head(const char* path, S3ObjList& head, void* buf, fuse
//
for(iter = fillerlist.begin(); fillerlist.end() != iter; ++iter){
struct stat st;
bool in_cache = StatCache::getStatCacheData()->GetStat((*iter), &st);
string bpath = mybasename((*iter));
if(StatCache::getStatCacheData()->GetStat((*iter), &st)){
if (use_wtf8)
bpath = s3fs_wtf8_decode(bpath);
if(in_cache){
filler(buf, bpath.c_str(), &st, 0);
}else{
S3FS_PRN_INFO2("Could not find %s file in stat cache.", (*iter).c_str());
@ -2447,8 +2493,9 @@ static int readdir_multi_head(const char* path, S3ObjList& head, void* buf, fuse
return result;
}
static int s3fs_readdir(const char* path, void* buf, fuse_fill_dir_t filler, off_t offset, struct fuse_file_info* fi)
static int s3fs_readdir(const char* _path, void* buf, fuse_fill_dir_t filler, off_t offset, struct fuse_file_info* fi)
{
WTF8_ENCODE(path)
S3ObjList head;
int result;
@ -4944,6 +4991,10 @@ static int my_fuse_opt_proc(void* data, const char* arg, int key, struct fuse_ar
S3FS_PRN_EXIT("option secretAccessKey is no longer supported.");
return -1;
}
if(0 == strcmp(arg, "use_wtf8")){
use_wtf8 = true;
return 0;
}
}
return 1;
}

View File

@ -1338,6 +1338,14 @@ void show_help ()
" Please use this option when the directory in the bucket is\n"
" only \"dir/\" object.\n"
"\n"
" use_wtf8 - support arbitrary file system encoding.\n"
" S3 requires all object names to be valid utf-8. But some\n"
" clients, notably Windows NFS clients, use their own encoding.\n"
" This option re-encodes invalid utf-8 object names into valid\n"
" utf-8 by mapping offending codes into a 'private' codepage of the\n"
" Unicode set.\n"
" Useful on clients not using utf-8 as their file system encoding.\n"
"\n"
"FUSE/mount Options:\n"
"\n"
" Most of the generic mount options described in 'man mount' are\n"

View File

@ -451,6 +451,132 @@ unsigned char* s3fs_decode64(const char* input, size_t* plength)
return result;
}
/*
* Detect and rewrite invalid UTF-8. We take the invalid bytes
* and encode them into a private region of the Unicode
* space. This is sometimes known as wtf8, wobbly transformation format.
* It is necessary because S3 validates the UTF-8 used for identifiers for
* correctness, while some clients may provide invalid UTF-8, notably
* Windows clients using cp1252.
*/
// Base location for transform. The range 0xE000 - 0xF8FF
// is a private use range, so we use the start of this range.
static const unsigned int escape_base = 0xe000;

// Encode bytes into wobbly utf8.
//
// Well-formed UTF-8 sequences in 's' are copied through unchanged. Every
// other non-zero byte is replaced by the three byte UTF-8 encoding of the
// code point (escape_base + byte).
//
// s      - zero terminated input string.
// result - optional output. When non-null the encoded text is appended to
//          it. Pass null to only test whether 's' needs the transform.
//
// Returns true if at least one byte had to be escaped. The answer is the
// same whether or not 'result' is supplied.
//
// NOTE: a well-formed sequence that already encodes a code point inside the
// escape range is passed through untouched, so it cannot be told apart from
// an escaped byte afterwards.
bool s3fs_wtf8_encode(const char *s, std::string *result)
{
    bool invalid = false;

    for (; *s; s++) {
        const unsigned char c = static_cast<unsigned char>(*s);

        // 'trail' is the number of continuation bytes that follow 'c' when
        // it starts a well-formed sequence.
        unsigned int trail = 0;
        bool wellformed = false;

        if (c <= 0x7f) {
            // single byte encoding
            wellformed = true;
        } else if (c >= 0xc2 && c <= 0xf5) {
            // Candidate start byte. No bounds checks are needed below: the
            // string is zero terminated and a zero byte is never a
            // continuation byte, so && stops before reading past the end.
            if ((c & 0xe0) == 0xc0) {
                // two byte encoding: every lead byte above 0xc1 is valid
                if ((s[1] & 0xc0) == 0x80) {
                    trail = 1;
                    wellformed = true;
                }
            } else if ((c & 0xf0) == 0xe0) {
                // three byte encoding
                if ((s[1] & 0xc0) == 0x80 && (s[2] & 0xc0) == 0x80) {
                    const unsigned int code = ((c & 0x0f) << 12) | ((s[1] & 0x3f) << 6) | (s[2] & 0x3f);
                    // reject overlong forms and the whole surrogate block
                    // U+D800 - U+DFFF (high and low surrogates)
                    if (code >= 0x800 && !(code >= 0xd800 && code <= 0xdfff)) {
                        trail = 2;
                        wellformed = true;
                    }
                }
            } else if ((c & 0xf8) == 0xf0) {
                // four byte encoding
                if ((s[1] & 0xc0) == 0x80 && (s[2] & 0xc0) == 0x80 && (s[3] & 0xc0) == 0x80) {
                    const unsigned int code = ((c & 0x07) << 18) | ((s[1] & 0x3f) << 12) | ((s[2] & 0x3f) << 6) | (s[3] & 0x3f);
                    // reject overlong forms and values beyond U+10FFFF
                    if (code >= 0x10000 && code <= 0x10ffff) {
                        trail = 3;
                        wellformed = true;
                    }
                }
            }
        }

        if (wellformed) {
            if (result) {
                result->append(s, trail + 1);
            }
            // Skip the continuation bytes even when no output is requested;
            // otherwise they would be re-examined and flagged as invalid.
            s += trail;
            continue;
        }

        // Invalid utf8 byte. Map it into the private area starting at
        // escape_base; the resulting code point needs a three byte encoding.
        invalid = true;
        if (result) {
            const unsigned int escape = escape_base + c;
            *result += static_cast<char>(0xe0 | ((escape >> 12) & 0x0f));
            *result += static_cast<char>(0x80 | ((escape >> 6) & 0x3f));
            *result += static_cast<char>(0x80 | (escape & 0x3f));
        }
    }
    return invalid;
}
string s3fs_wtf8_encode(const string &s)
{
string result;
s3fs_wtf8_encode(s.c_str(), &result);
return result;
}
// The reverse operation, turn encoded bytes back into their original values.
// The code assumes that we map to a three-byte code point.
//
// s      - zero terminated input string.
// result - optional output. When non-null the decoded bytes are appended to
//          it. Pass null to only test whether 's' contains escaped bytes.
//
// Returns true if at least one escaped byte was found.
//
// Only code points escape_base + 0x80 .. escape_base + 0xff are treated as
// escapes, because the encoder only ever escapes bytes above 0x7f. Anything
// below that offset would decode to a NUL or plain ASCII byte (for example
// '/'), so such code points are copied through as ordinary text.
bool s3fs_wtf8_decode(const char *s, std::string *result)
{
    bool encoded = false;

    for (; *s; s++) {
        const unsigned char c = static_cast<unsigned char>(*s);

        // look for a three byte tuple matching our encoding code
        if ((c & 0xf0) == 0xe0 && (s[1] & 0xc0) == 0x80 && (s[2] & 0xc0) == 0x80) {
            unsigned int code = (c & 0x0f) << 12;
            code |= (s[1] & 0x3f) << 6;
            code |= (s[2] & 0x3f);
            if (code >= escape_base + 0x80 && code <= escape_base + 0xff) {
                // convert back to the original single byte
                encoded = true;
                if (result) {
                    *result += static_cast<char>(code - escape_base);
                }
                // consume the two continuation bytes as well
                s += 2;
                continue;
            }
        }
        // not an escape: copy the byte through unchanged
        if (result) {
            *result += static_cast<char>(c);
        }
    }
    return encoded;
}
string s3fs_wtf8_decode(const string &s)
{
string result;
s3fs_wtf8_decode(s.c_str(), &result);
return result;
}
/*
* Local variables:
* tab-width: 4

View File

@ -58,6 +58,11 @@ std::string s3fs_hex(const unsigned char* input, size_t length);
char* s3fs_base64(const unsigned char* input, size_t length);
unsigned char* s3fs_decode64(const char* input, size_t* plength);
// Appends the wobbly-utf8 encoding of the zero terminated string 's' to
// '*result' ('result' may be null). Returns true if invalid bytes were found.
bool s3fs_wtf8_encode(const char *s, std::string *result);
// Returns the wobbly-utf8 encoding of 's' as a new string.
std::string s3fs_wtf8_encode(const std::string &s);
// Appends 's' with escaped bytes restored to '*result' ('result' may be
// null). Returns true if at least one escaped byte was found.
bool s3fs_wtf8_decode(const char *s, std::string *result);
// Returns 's' with escaped bytes restored as a new string.
std::string s3fs_wtf8_decode(const std::string &s);
#endif // S3FS_STRING_UTIL_H_
/*

View File

@ -87,10 +87,35 @@ void test_strtoofft()
ASSERT_EQUALS(s3fs_strtoofft("deadbeef", /*is_base_16=*/ true), static_cast<off_t>(3735928559L));
}
// Exercises s3fs_wtf8_encode / s3fs_wtf8_decode (std::string overloads):
// valid input must pass through unchanged, input with invalid bytes must be
// altered by encode and restored exactly by decode.
void test_wtf8_encoding()
{
// pure ASCII text
std::string ascii("normal string");
// the same sentence twice: once as valid UTF-8 (two byte sequences) ...
std::string utf8("Hyld\xc3\xbdpi \xc3\xbej\xc3\xb3\xc3\xb0""f\xc3\xa9lagsins vex \xc3\xbar k\xc3\xa6rkomnu b\xc3\xb6li \xc3\xad \xc3\xa1st");
// ... and once with single bytes above 0x7f, which is not valid UTF-8
std::string cp1252("Hyld\xfdpi \xfej\xf3\xf0""f\xe9lagsins vex \xfar k\xe6rkomnu b\xf6li \xed \xe1st");
// valid UTF-8 with the lead byte at index 14 overwritten by 0x97, leaving
// stray continuation bytes in the middle of the string
std::string broken = utf8;
broken[14] = 0x97;
// valid and invalid runs concatenated
std::string mixed = ascii + utf8 + cp1252;
// valid input: both directions are the identity
ASSERT_EQUALS(s3fs_wtf8_encode(ascii), ascii);
ASSERT_EQUALS(s3fs_wtf8_decode(ascii), ascii);
ASSERT_EQUALS(s3fs_wtf8_encode(utf8), utf8);
ASSERT_EQUALS(s3fs_wtf8_decode(utf8), utf8);
// invalid input: encode must change it, and decode must round-trip it
ASSERT_NEQUALS(s3fs_wtf8_encode(cp1252), cp1252);
ASSERT_EQUALS(s3fs_wtf8_decode(s3fs_wtf8_encode(cp1252)), cp1252);
ASSERT_NEQUALS(s3fs_wtf8_encode(broken), broken);
ASSERT_EQUALS(s3fs_wtf8_decode(s3fs_wtf8_encode(broken)), broken);
ASSERT_NEQUALS(s3fs_wtf8_encode(mixed), mixed);
ASSERT_EQUALS(s3fs_wtf8_decode(s3fs_wtf8_encode(mixed)), mixed);
}
// Test driver: runs each test function in turn and returns 0 once all of
// them have returned. argc / argv are not used.
int main(int argc, char *argv[])
{
test_trim();
test_base64();
test_strtoofft();
test_wtf8_encoding();
return 0;
}

View File

@ -20,11 +20,50 @@
#include <cstdlib>
#include <iostream>
#include <stdio.h>
// Terminates the process with exit status 1 when x != y, after printing both
// values and the call site (file:line) to stderr. Returns normally otherwise.
template <typename T> void assert_equals(const T &x, const T &y, const char *file, int line)
{
    if (x != y) {
        std::cerr << x << " != " << y << " at " << file << ":" << line << std::endl;
        std::cerr << std::endl;
        std::exit(1);
    }
}

// std::string version: additionally dumps the bytes of both strings in hex,
// one line per string, so differences in non-printable bytes are visible.
// Declared inline because an explicit specialization is an ordinary function
// definition; without it, including this header from more than one
// translation unit produces duplicate definitions.
template <> inline void assert_equals(const std::string &x, const std::string &y, const char *file, int line)
{
    if (x != y) {
        std::cerr << x << " != " << y << " at " << file << ":" << line << std::endl;
        for (std::string::size_type i = 0; i < x.length(); i++) {
            fprintf(stderr, "%02x ", (unsigned char)x[i]);
        }
        std::cerr << std::endl;
        for (std::string::size_type i = 0; i < y.length(); i++) {
            fprintf(stderr, "%02x ", (unsigned char)y[i]);
        }
        std::cerr << std::endl;
        std::exit(1);
    }
}
// Terminates the process with exit status 1 when x == y, after printing both
// values and the call site (file:line) to stderr. Returns normally otherwise.
template <typename T> void assert_nequals(const T &x, const T &y, const char *file, int line)
{
    if (x == y) {
        std::cerr << x << " == " << y << " at " << file << ":" << line << std::endl;
        std::exit(1);
    }
}

// std::string version: additionally dumps the bytes of both strings in hex,
// one line per string.
// Declared inline because an explicit specialization is an ordinary function
// definition; without it, including this header from more than one
// translation unit produces duplicate definitions.
template <> inline void assert_nequals(const std::string &x, const std::string &y, const char *file, int line)
{
    if (x == y) {
        std::cerr << x << " == " << y << " at " << file << ":" << line << std::endl;
        for (std::string::size_type i = 0; i < x.length(); i++) {
            fprintf(stderr, "%02x ", (unsigned char)x[i]);
        }
        std::cerr << std::endl;
        for (std::string::size_type i = 0; i < y.length(); i++) {
            fprintf(stderr, "%02x ", (unsigned char)y[i]);
        }
        std::cerr << std::endl;
        std::exit(1);
    }
}
@ -43,5 +82,8 @@ void assert_strequals(const char *x, const char *y, const char *file, int line)
// Forwards to assert_equals(), adding the call site's file and line.
#define ASSERT_EQUALS(x, y) \
assert_equals((x), (y), __FILE__, __LINE__)
// Forwards to assert_nequals(), adding the call site's file and line.
#define ASSERT_NEQUALS(x, y) \
assert_nequals((x), (y), __FILE__, __LINE__)
// Forwards to assert_strequals(), adding the call site's file and line.
#define ASSERT_STREQUALS(x, y) \
assert_strequals((x), (y), __FILE__, __LINE__)