2010-11-13 23:59:23 +00:00
|
|
|
/*
|
|
|
|
* s3fs - FUSE-based file system backed by Amazon S3
|
|
|
|
*
|
2017-05-07 11:24:17 +00:00
|
|
|
* Copyright(C) 2007 Randy Rizun <rrizun@gmail.com>
|
2010-11-13 23:59:23 +00:00
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU General Public License
|
|
|
|
* as published by the Free Software Foundation; either version 2
|
|
|
|
* of the License, or (at your option) any later version.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public License
|
|
|
|
* along with this program; if not, write to the Free Software
|
|
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
|
|
|
*/
|
2019-07-13 01:52:35 +00:00
|
|
|
#include <cerrno>
|
2019-07-12 10:33:53 +00:00
|
|
|
#include <climits>
|
|
|
|
#include <cstdio>
|
|
|
|
#include <cstdlib>
|
|
|
|
#include <cstring>
|
2011-06-26 00:37:52 +00:00
|
|
|
#include <syslog.h>
|
2019-07-12 10:33:53 +00:00
|
|
|
#include <ctime>
|
2011-06-26 00:37:52 +00:00
|
|
|
|
2019-07-13 01:52:35 +00:00
|
|
|
#include <stdexcept>
|
2011-06-26 00:37:52 +00:00
|
|
|
#include <sstream>
|
|
|
|
#include <string>
|
2013-03-30 13:37:14 +00:00
|
|
|
#include <map>
|
2010-11-13 23:59:23 +00:00
|
|
|
|
2013-03-30 13:37:14 +00:00
|
|
|
#include "common.h"
|
2010-11-13 23:59:23 +00:00
|
|
|
#include "string_util.h"
|
|
|
|
|
2011-06-26 00:37:52 +00:00
|
|
|
using namespace std;
|
|
|
|
|
2017-11-18 18:10:29 +00:00
|
|
|
//
// Converts a value to its textual representation by inserting it
// into a std::ostringstream with the stream's default formatting.
//
template <class T> std::string str(T value)
{
    std::ostringstream formatter;
    formatter << value;
    return formatter.str();
}

//
// Explicit instantiations for the built-in integer types, so that
// the template body is emitted in this translation unit.
//
template std::string str<short>(short value);
template std::string str<unsigned short>(unsigned short value);
template std::string str<int>(int value);
template std::string str<unsigned int>(unsigned int value);
template std::string str<long>(long value);
template std::string str<unsigned long>(unsigned long value);
template std::string str<long long>(long long value);
template std::string str<unsigned long long>(unsigned long long value);
|
|
|
|
|
2011-06-26 00:37:52 +00:00
|
|
|
// Upper-case hexadecimal digits indexed by nibble value (0-15);
// urlEncode() and urlEncode2() use it to build their "%XX" escapes.
static const char hexAlphabet[] = "0123456789ABCDEF";
|
|
|
|
|
2019-07-13 01:52:35 +00:00
|
|
|
//
// Parses the whole of str as a signed integer (replacement for C++11 std::stoll).
//
// str        : NUL-terminated text handed to strtoll()
// is_base_16 : true to parse as hexadecimal, false to parse as decimal
//
// Returns the parsed value (the long long result of strtoll() converted to off_t).
// Throws std::invalid_argument if str is NULL, if no characters could be
// converted, or if anything is left over after the number.
// Throws std::out_of_range if strtoll() reports ERANGE.
// errno is reset to 0 before parsing.
//
off_t s3fs_strtoofft(const char* str, bool is_base_16)
{
    // strtoll() must not be handed a null pointer.
    if(!str){
        throw std::invalid_argument("s3fs_strtoofft");
    }

    errno = 0;
    char* endptr = NULL;
    const long long parsed = strtoll(str, &endptr, is_base_16 ? 16 : 10);

    // nothing consumed, or trailing characters after the number
    if(endptr == str || *endptr != '\0'){
        throw std::invalid_argument("s3fs_strtoofft");
    }
    // strtoll() clamps to LLONG_MIN/LLONG_MAX and sets ERANGE on overflow
    if((parsed == LLONG_MIN || parsed == LLONG_MAX) && errno == ERANGE){
        throw std::out_of_range("s3fs_strtoofft");
    }
    return parsed;
}
|
|
|
|
|
2013-03-30 13:37:14 +00:00
|
|
|
//
// Returns a copy of s in which every character has been passed
// through tolower().
//
std::string lower(std::string s)
{
    for(size_t i = 0; i < s.length(); i++){
        // tolower() requires a value representable as unsigned char (or EOF);
        // handing it a negative plain char (any byte >= 0x80 where char is
        // signed) is undefined behavior, so convert through unsigned char.
        s[i] = static_cast<char>(tolower(static_cast<unsigned char>(s[i])));
    }
    return s;
}
|
|
|
|
|
2013-03-30 13:37:14 +00:00
|
|
|
//
// Returns s without its leading run of characters that appear in t.
// If s consists only of such characters the result is empty.
//
std::string trim_left(const std::string &s, const std::string &t /* = SPACES */)
{
    const std::string::size_type first_kept = s.find_first_not_of(t);
    if(std::string::npos == first_kept){
        return std::string();
    }
    return s.substr(first_kept);
}
|
|
|
|
|
2013-03-30 13:37:14 +00:00
|
|
|
//
// Returns s without its trailing run of characters that appear in t.
// If s consists only of such characters the result is empty.
//
std::string trim_right(const std::string &s, const std::string &t /* = SPACES */)
{
    const std::string::size_type last_kept = s.find_last_not_of(t);
    return (std::string::npos == last_kept) ? std::string() : s.substr(0, last_kept + 1);
}
|
|
|
|
|
2013-03-30 13:37:14 +00:00
|
|
|
string trim(const string &s, const string &t /* = SPACES */)
|
|
|
|
{
|
2019-01-23 07:04:47 +00:00
|
|
|
return trim_left(trim_right(s, t), t);
|
2011-06-26 00:37:52 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* urlEncode a fuse path,
|
|
|
|
* taking into special consideration "/",
|
|
|
|
* otherwise regular urlEncode.
|
|
|
|
*/
|
2013-03-30 13:37:14 +00:00
|
|
|
string urlEncode(const string &s)
|
|
|
|
{
|
2011-06-26 00:37:52 +00:00
|
|
|
string result;
|
2019-02-12 02:41:01 +00:00
|
|
|
for (size_t i = 0; i < s.length(); ++i) {
|
2015-05-26 01:27:01 +00:00
|
|
|
char c = s[i];
|
|
|
|
if (c == '/' // Note- special case for fuse paths...
|
|
|
|
|| c == '.'
|
|
|
|
|| c == '-'
|
|
|
|
|| c == '_'
|
|
|
|
|| c == '~'
|
|
|
|
|| (c >= 'a' && c <= 'z')
|
|
|
|
|| (c >= 'A' && c <= 'Z')
|
|
|
|
|| (c >= '0' && c <= '9')) {
|
|
|
|
result += c;
|
2013-03-30 13:37:14 +00:00
|
|
|
} else {
|
2011-06-26 00:37:52 +00:00
|
|
|
result += "%";
|
2015-05-26 01:27:01 +00:00
|
|
|
result += hexAlphabet[static_cast<unsigned char>(c) / 16];
|
|
|
|
result += hexAlphabet[static_cast<unsigned char>(c) % 16];
|
2011-06-26 00:37:52 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2015-01-20 16:31:36 +00:00
|
|
|
/**
|
|
|
|
* urlEncode a fuse path,
|
|
|
|
* taking into special consideration "/",
|
|
|
|
* otherwise regular urlEncode.
|
|
|
|
*/
|
|
|
|
string urlEncode2(const string &s)
|
|
|
|
{
|
|
|
|
string result;
|
2019-02-12 02:41:01 +00:00
|
|
|
for (size_t i = 0; i < s.length(); ++i) {
|
2015-05-26 01:27:01 +00:00
|
|
|
char c = s[i];
|
|
|
|
if (c == '=' // Note- special case for fuse paths...
|
|
|
|
|| c == '&' // Note- special case for s3...
|
|
|
|
|| c == '%'
|
|
|
|
|| c == '.'
|
|
|
|
|| c == '-'
|
|
|
|
|| c == '_'
|
|
|
|
|| c == '~'
|
|
|
|
|| (c >= 'a' && c <= 'z')
|
|
|
|
|| (c >= 'A' && c <= 'Z')
|
|
|
|
|| (c >= '0' && c <= '9')) {
|
|
|
|
result += c;
|
2015-01-20 16:31:36 +00:00
|
|
|
} else {
|
|
|
|
result += "%";
|
2015-05-26 01:27:01 +00:00
|
|
|
result += hexAlphabet[static_cast<unsigned char>(c) / 16];
|
|
|
|
result += hexAlphabet[static_cast<unsigned char>(c) % 16];
|
2015-01-20 16:31:36 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2015-04-20 17:24:57 +00:00
|
|
|
//
// Value of one hexadecimal digit for urlDecode().
// Characters that are not hexadecimal digits count as 0.
//
static int urldecode_hex_value(char c)
{
    if('0' <= c && c <= '9'){
        return c - '0';
    }
    if('A' <= c && c <= 'F'){
        return c - 'A' + 0x0a;
    }
    if('a' <= c && c <= 'f'){
        return c - 'a' + 0x0a;
    }
    return 0x00;
}

//
// Reverses percent-encoding: each "%XY" becomes the byte with
// hexadecimal value XY and all other characters are copied as-is.
// A '%' followed by fewer than two characters is a format error:
// decoding stops there and the text decoded so far is returned.
//
std::string urlDecode(const std::string& s)
{
    std::string decoded;
    const size_t total = s.length();
    size_t pos = 0;

    while(pos < total){
        const char cur = s[pos];
        if('%' != cur){
            decoded += cur;
            ++pos;
            continue;
        }
        if(total - pos < 3){
            break;              // wrong format.
        }
        const int value = urldecode_hex_value(s[pos + 1]) * 16 + urldecode_hex_value(s[pos + 2]);
        decoded += static_cast<char>(value);
        pos += 3;
    }
    return decoded;
}
|
|
|
|
|
|
|
|
//
// Strips one pair of enclosing double quotation marks from str, in place.
//
// - No '"' in str: str is left untouched and true is returned.
// - Otherwise everything up to and including the first '"' is removed,
//   then everything from the last remaining '"' onward is removed.
//   false is returned if there is no closing '"' or if another '"'
//   is still left between the two; str keeps whatever was cut so far.
//
bool takeout_str_dquart(std::string& str)
{
    const size_t open_pos = str.find('\"');
    if(std::string::npos == open_pos){
        return true;
    }
    // drop the opening quotation mark and anything before it
    str.erase(0, open_pos + 1);

    const size_t close_pos = str.rfind('\"');
    if(std::string::npos == close_pos){
        return false;
    }
    // drop the closing quotation mark and anything after it
    str.erase(close_pos);

    return std::string::npos == str.find('\"');
}
|
|
|
|
|
2013-07-10 06:24:06 +00:00
|
|
|
//
// Extracts the value that follows "keyword=" in target.
//
// ex. target="http://......?keyword=value&..."
//
// target  : text to search; it is not modified
// keyword : key to look for; NULL yields false
// value   : receives the characters between '=' and the next '&'
//           (or the end of target); left untouched on failure
//
// Only the first occurrence of keyword is examined. Returns false when
// keyword is not found or is not immediately followed by '='.
//
bool get_keyword_value(std::string& target, const char* keyword, std::string& value)
{
    if(!keyword){
        return false;
    }
    const size_t found = target.find(keyword);
    if(std::string::npos == found){
        return false;
    }

    // keyword may be the very last thing in target, so check the bounds
    // before looking at the next character instead of letting at() throw.
    const size_t eqpos = found + strlen(keyword);
    if(target.length() <= eqpos || '=' != target[eqpos]){
        return false;
    }

    const size_t spos = eqpos + 1;
    const size_t epos = target.find('&', spos);
    if(std::string::npos == epos){
        value = target.substr(spos);
    }else{
        value = target.substr(spos, (epos - spos));
    }
    return true;
}
|
|
|
|
|
2011-06-26 00:37:52 +00:00
|
|
|
/**
 * Formats the current time, broken down by gmtime_r(), with the
 * pattern "%a, %d %b %Y %H:%M:%S GMT" for use as the date in a
 * HTTP request header.
 */
std::string get_date_rfc850()
{
    const time_t now = time(NULL);

    struct tm utc;
    const struct tm* broken_down = gmtime_r(&now, &utc);

    char formatted[100];
    strftime(formatted, sizeof(formatted), "%a, %d %b %Y %H:%M:%S GMT", broken_down);
    return formatted;
}
|
2014-09-07 15:08:27 +00:00
|
|
|
|
2015-01-28 17:13:11 +00:00
|
|
|
void get_date_sigv3(string& date, string& date8601)
|
|
|
|
{
|
|
|
|
time_t tm = time(NULL);
|
|
|
|
date = get_date_string(tm);
|
|
|
|
date8601 = get_date_iso8601(tm);
|
|
|
|
}
|
|
|
|
|
|
|
|
//
// Formats tm (seconds since the epoch) as the UTC calendar date "YYYYMMDD".
// Returns an empty string if gmtime_r() cannot convert the time
// or strftime() cannot format it.
//
std::string get_date_string(time_t tm)
{
    struct tm res;
    // gmtime_r() returns NULL when the time cannot be represented;
    // strftime() must not be called with a null pointer.
    if(NULL == gmtime_r(&tm, &res)){
        return "";
    }

    char buf[100];
    // a return of 0 means buf holds no usable result
    if(0 == strftime(buf, sizeof(buf), "%Y%m%d", &res)){
        return "";
    }
    return buf;
}
|
|
|
|
|
2015-01-28 17:13:11 +00:00
|
|
|
//
// Formats tm (seconds since the epoch) as the UTC timestamp
// "YYYYMMDDThhmmssZ" (ISO 8601 basic format).
// Returns an empty string if gmtime_r() cannot convert the time
// or strftime() cannot format it.
//
std::string get_date_iso8601(time_t tm)
{
    struct tm res;
    // gmtime_r() returns NULL when the time cannot be represented;
    // strftime() must not be called with a null pointer.
    if(NULL == gmtime_r(&tm, &res)){
        return "";
    }

    char buf[100];
    // a return of 0 means buf holds no usable result
    if(0 == strftime(buf, sizeof(buf), "%Y%m%dT%H%M%SZ", &res)){
        return "";
    }
    return buf;
}
|
|
|
|
|
2019-02-03 14:22:16 +00:00
|
|
|
//
// Parses a timestamp of the form "YYYY-MM-DDThh:mm:ss" and converts
// it to a time_t with mktime().
//
// pdate    : text to parse; NULL yields false. Characters after the
//            seconds field are ignored.
// unixtime : receives the result; left untouched on failure.
//
// Returns false when pdate is NULL or does not match the format.
//
// NOTE(review): mktime() interprets the fields as local time. If the
// input is meant to be UTC the result is shifted by the local offset -
// confirm against the callers.
//
bool get_unixtime_from_iso8601(const char* pdate, time_t& unixtime)
{
    if(!pdate){
        return false;
    }

    // strptime() only fills the fields named in the format, so start from
    // a fully defined struct; tm_isdst = -1 lets mktime() decide about DST.
    struct tm parsed;
    memset(&parsed, 0, sizeof(parsed));
    parsed.tm_isdst = -1;

    // strptime() returns NULL (not pdate) when the input does not match.
    const char* prest = strptime(pdate, "%Y-%m-%dT%T", &parsed);
    if(NULL == prest || prest == pdate){
        // wrong format
        return false;
    }
    unixtime = mktime(&parsed);
    return true;
}
|
|
|
|
|
|
|
|
//
// Converts a duration written as numbers followed by unit letters
// into a number of seconds, ex.
//      "12Y12M12D12h12m12s", "86400s", "9h30m", etc
//
// Units: Y(365 days) M(30 days) D(day) h(hour) m(minute) s(second).
// Each unit may appear at most once and only in that order.
//
// argv     : text to parse; NULL yields false
// unixtime : reset to 0, then receives the accumulated seconds
//            (it may hold a partial sum when false is returned)
//
// Returns false for NULL or empty input, an unknown character, a unit
// that is repeated or out of order, two units in a row, or input that
// ends with digits which have no unit.
//
bool convert_unixtime_from_option_arg(const char* argv, time_t& unixtime)
{
    // unit letter, seconds per unit, position in the required ordering
    static const struct {
        char   symbol;
        time_t seconds;
        int    rank;
    } units[] = {
        {'Y', 60 * 60 * 24 * 365, 1},   // average 365 day / year
        {'M', 60 * 60 * 24 * 30,  2},   // average 30 day / month
        {'D', 60 * 60 * 24,       3},
        {'h', 60 * 60,            4},
        {'m', 60,                 5},
        {'s', 1,                  6}
    };
    static const size_t unit_count = sizeof(units) / sizeof(units[0]);

    if(!argv){
        return false;
    }
    unixtime = 0;

    time_t pending      = 0;        // number collected since the previous unit
    bool   unit_allowed = true;     // false right after a unit letter
    int    last_rank    = 0;        // rank of the most recent unit, 0 = none yet

    for(const char* cur = argv; *cur; ++cur){
        if('0' <= *cur && *cur <= '9'){
            pending      = pending * 10 + static_cast<time_t>(*cur - '0');
            unit_allowed = true;
            continue;
        }
        if(!unit_allowed){
            // two non-digit characters in a row
            return false;
        }

        size_t idx = 0;
        while(idx < unit_count && units[idx].symbol != *cur){
            ++idx;
        }
        if(unit_count == idx || units[idx].rank <= last_rank){
            // unknown letter, or unit repeated / out of order
            return false;
        }

        unixtime    += pending * units[idx].seconds;
        last_rank    = units[idx].rank;
        pending      = 0;
        unit_allowed = false;
    }
    // the text must end with a unit letter (also rejects the empty string)
    return !unit_allowed;
}
|
|
|
|
|
2015-08-19 18:22:30 +00:00
|
|
|
//
// Returns the lower-case hexadecimal representation of the first
// length bytes of input, two digits per byte.
//
std::string s3fs_hex(const unsigned char* input, size_t length)
{
    static const char digits[] = "0123456789abcdef";

    std::string hex;
    for(size_t pos = 0; pos < length; ++pos){
        const unsigned char octet = input[pos];
        hex += digits[octet >> 4];
        hex += digits[octet & 0x0f];
    }
    return hex;
}
|
|
|
|
|
|
|
|
//
// Encodes length bytes of input as a NUL-terminated base64 string
// ('+' and '/' alphabet, padded with '=').
//
// Returns NULL when input is NULL or length is 0. Otherwise the result
// is a buffer allocated with new[]; this function does not release it.
//
char* s3fs_base64(const unsigned char* input, size_t length)
{
    static const char alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
    static const char padding    = '=';

    if(!input || 0 == length){
        return NULL;
    }

    // four output characters per (started) group of three bytes, plus terminator
    char* encoded = new char[((length / 3) + 1) * 4 + 1];

    size_t out = 0;
    for(size_t in = 0; in < length; in += 3){
        const size_t       remain = length - in;    // bytes available in this group (>= 1)
        const unsigned int b0     = input[in];
        const unsigned int b1     = (1 < remain) ? input[in + 1] : 0x00;
        const unsigned int b2     = (2 < remain) ? input[in + 2] : 0x00;

        encoded[out++] = alphabet[b0 >> 2];
        encoded[out++] = alphabet[((b0 & 0x03) << 4) | (b1 >> 4)];
        encoded[out++] = (1 < remain) ? alphabet[((b1 & 0x0f) << 2) | (b2 >> 6)] : padding;
        encoded[out++] = (2 < remain) ? alphabet[b2 & 0x3f] : padding;
    }
    encoded[out] = '\0';

    return encoded;
}
|
|
|
|
|
|
|
|
//
// Maps one base64 character to its 6-bit value (0-63).
// The padding character '=' maps to 64 and any character outside
// the base64 alphabet maps to UCHAR_MAX.
//
inline unsigned char char_decode64(const char ch)
{
    if('A' <= ch && ch <= 'Z'){                         // A - Z
        return static_cast<unsigned char>(ch - 'A');
    }
    if('a' <= ch && ch <= 'z'){                         // a - z
        return static_cast<unsigned char>(ch - 'a' + 26);
    }
    if('0' <= ch && ch <= '9'){                         // 0 - 9
        return static_cast<unsigned char>(ch - '0' + 52);
    }
    switch(ch){
        case '+':
            return 62;
        case '/':
            return 63;
        case '=':
            return 64;
        default:                                        // something wrong
            return UCHAR_MAX;
    }
}
|
|
|
|
|
|
|
|
unsigned char* s3fs_decode64(const char* input, size_t* plength)
|
|
|
|
{
|
|
|
|
unsigned char* result;
|
|
|
|
if(!input || 0 == strlen(input) || !plength){
|
|
|
|
return NULL;
|
|
|
|
}
|
2019-04-07 06:07:34 +00:00
|
|
|
result = new unsigned char[strlen(input) + 1];
|
2015-08-19 18:22:30 +00:00
|
|
|
|
|
|
|
unsigned char parts[4];
|
|
|
|
size_t input_len = strlen(input);
|
|
|
|
size_t rpos;
|
|
|
|
size_t wpos;
|
|
|
|
for(rpos = 0, wpos = 0; rpos < input_len; rpos += 4){
|
|
|
|
parts[0] = char_decode64(input[rpos]);
|
|
|
|
parts[1] = (rpos + 1) < input_len ? char_decode64(input[rpos + 1]) : 64;
|
|
|
|
parts[2] = (rpos + 2) < input_len ? char_decode64(input[rpos + 2]) : 64;
|
|
|
|
parts[3] = (rpos + 3) < input_len ? char_decode64(input[rpos + 3]) : 64;
|
|
|
|
|
|
|
|
result[wpos++] = ((parts[0] << 2) & 0xfc) | ((parts[1] >> 4) & 0x03);
|
|
|
|
if(64 == parts[2]){
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
result[wpos++] = ((parts[1] << 4) & 0xf0) | ((parts[2] >> 2) & 0x0f);
|
|
|
|
if(64 == parts[3]){
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
result[wpos++] = ((parts[2] << 6) & 0xc0) | (parts[3] & 0x3f);
|
|
|
|
}
|
2015-08-19 20:40:22 +00:00
|
|
|
result[wpos] = '\0';
|
2015-08-19 18:22:30 +00:00
|
|
|
*plength = wpos;
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2019-02-18 12:27:44 +00:00
|
|
|
/*
|
|
|
|
* detect and rewrite invalid utf8. We take invalid bytes
|
|
|
|
* and encode them into a private region of the unicode
|
|
|
|
* space. This is sometimes known as wtf8, wobbly transformation format.
|
|
|
|
* it is necessary because S3 validates the utf8 used for identifiers for
|
|
|
|
* correctness, while some clients may provide invalid utf, notably
|
|
|
|
* windows using cp1252.
|
2019-02-15 15:57:03 +00:00
|
|
|
*/
|
2019-02-18 12:27:44 +00:00
|
|
|
|
2019-02-19 10:32:10 +00:00
|
|
|
// Base location for transform. The range 0xE000 - 0xF8FF
// is a private use range, so we use the start of this range.
static const unsigned int escape_base = 0xe000;

//
// Encode bytes into wobbly utf8.
//
// s      : NUL-terminated input bytes
// result : if not NULL, the transformed text is appended to it;
//          may be NULL when only the return value is wanted
//
// Well-formed UTF-8 sequences are passed through unchanged. Every byte
// that is not part of one (stray continuation byte, overlong form,
// surrogate U+D800..U+DFFF, value above U+10FFFF, truncated sequence)
// is replaced by the three byte encoding of code point escape_base + byte.
//
// Returns true if at least one byte had to be transformed.
//
bool s3fs_wtf8_encode(const char *s, std::string *result)
{
    bool invalid = false;

    for(; *s; s++){
        const unsigned char c = static_cast<unsigned char>(*s);

        // Length of the valid sequence starting at s, 0 if there is none.
        size_t seqlen = 0;

        if(c <= 0x7f){
            // single byte encoding
            seqlen = 1;
        }else if(c >= 0xc2 && c <= 0xf5){
            // One of the possible start bytes. The continuation bytes need no
            // bounds check: the string is zero terminated and && stops at the
            // first byte that is not a continuation byte.
            if((c & 0xe0) == 0xc0 && (s[1] & 0xc0) == 0x80){
                // all two byte encodings starting higher than c1 are valid
                seqlen = 2;
            }else if((c & 0xf0) == 0xe0 && (s[1] & 0xc0) == 0x80 && (s[2] & 0xc0) == 0x80){
                const unsigned code = ((c & 0x0f) << 12) | ((s[1] & 0x3f) << 6) | (s[2] & 0x3f);
                // not overlong and not a surrogate (U+D800..U+DFFF)
                if(code >= 0x800 && !(code >= 0xd800 && code <= 0xdfff)){
                    seqlen = 3;
                }
            }else if((c & 0xf8) == 0xf0 && (s[1] & 0xc0) == 0x80 && (s[2] & 0xc0) == 0x80 && (s[3] & 0xc0) == 0x80){
                const unsigned code = ((c & 0x07) << 18) | ((s[1] & 0x3f) << 12) | ((s[2] & 0x3f) << 6) | (s[3] & 0x3f);
                // not overlong and in defined unicode space
                if(code >= 0x10000 && code <= 0x10ffff){
                    seqlen = 4;
                }
            }
        }

        if(0 < seqlen){
            if(result){
                result->append(s, seqlen);
            }
            // Skip the continuation bytes whether or not output is collected,
            // otherwise they would be examined again and reported as invalid.
            s += (seqlen - 1);
            continue;
        }

        // Invalid utf8 code. Convert it to a private two byte area of unicode
        // e.g. the e000 - f8ff area. This will be a three byte encoding
        invalid = true;
        if(result){
            const unsigned escape = escape_base + c;
            *result += static_cast<char>(0xe0 | ((escape >> 12) & 0x0f));
            *result += static_cast<char>(0x80 | ((escape >> 6) & 0x3f));
            *result += static_cast<char>(0x80 | (escape & 0x3f));
        }
    }
    return invalid;
}
|
|
|
|
|
|
|
|
string s3fs_wtf8_encode(const string &s)
|
|
|
|
{
|
|
|
|
string result;
|
|
|
|
s3fs_wtf8_encode(s.c_str(), &result);
|
2019-02-15 15:57:03 +00:00
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2019-02-18 12:27:44 +00:00
|
|
|
// The reverse operation, turn encoded bytes back into their original values
|
2019-02-19 10:32:10 +00:00
|
|
|
// The code assumes that we map to a three-byte code point.
|
2019-02-18 12:27:44 +00:00
|
|
|
bool s3fs_wtf8_decode(const char *s, string *result)
|
2019-02-15 15:57:03 +00:00
|
|
|
{
|
2019-02-18 12:27:44 +00:00
|
|
|
bool encoded = false;
|
|
|
|
for (; *s; s++) {
|
|
|
|
unsigned char c = *s;
|
|
|
|
// look for a three byte tuple matching our encoding code
|
|
|
|
if ((c & 0xf0) == 0xe0 && (s[1] & 0xc0) == 0x80 && (s[2] & 0xc0) == 0x80) {
|
|
|
|
unsigned code = (c & 0x0f) << 12;
|
|
|
|
code |= (s[1] & 0x3f) << 6;
|
|
|
|
code |= (s[2] & 0x3f) << 0;
|
2019-02-19 10:32:10 +00:00
|
|
|
if (code >= escape_base && code <= escape_base + 0xff) {
|
2019-02-18 12:27:44 +00:00
|
|
|
// convert back
|
|
|
|
encoded = true;
|
2019-04-07 14:06:06 +00:00
|
|
|
if(result){
|
2019-02-12 02:41:01 +00:00
|
|
|
*result += static_cast<char>(code - escape_base);
|
2019-04-07 14:06:06 +00:00
|
|
|
}
|
2019-02-18 12:27:44 +00:00
|
|
|
s+=2;
|
|
|
|
continue;
|
2019-02-15 15:57:03 +00:00
|
|
|
}
|
|
|
|
}
|
2019-04-07 14:06:06 +00:00
|
|
|
if (result) {
|
2019-02-18 12:27:44 +00:00
|
|
|
*result += c;
|
2019-04-07 14:06:06 +00:00
|
|
|
}
|
2019-02-15 15:57:03 +00:00
|
|
|
}
|
2019-02-18 12:27:44 +00:00
|
|
|
return encoded;
|
|
|
|
}
|
|
|
|
|
|
|
|
string s3fs_wtf8_decode(const string &s)
|
|
|
|
{
|
|
|
|
string result;
|
|
|
|
s3fs_wtf8_decode(s.c_str(), &result);
|
2019-02-15 15:57:03 +00:00
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2014-09-07 15:08:27 +00:00
|
|
|
/*
|
|
|
|
* Local variables:
|
|
|
|
* tab-width: 4
|
|
|
|
* c-basic-offset: 4
|
|
|
|
* End:
|
|
|
|
* vim600: noet sw=4 ts=4 fdm=marker
|
|
|
|
* vim<600: noet sw=4 ts=4
|
|
|
|
*/
|