more robust wtf8 encoding

2024-12-22 16:58:55 +00:00 · 2019-02-18 12:27:44 +00:00 · 2019-02-18 12:27:44 +00:00 · 11b385820d
commit 11b385820d
parent ca2d1d873d
3 changed files with 123 additions and 89 deletions
--- a/src/string_util.cpp
+++ b/src/string_util.cpp
@ -451,93 +451,125 @@ unsigned char* s3fs_decode64(const char* input, size_t* plength)
  return result;
 }

-/* handle invalid utf8 by creating surrogate escape pairs.
- * this converts the data into the so-called wtf-8 encoding.
- * It is necessary if we are given data that isn't proper utf8
- * but the aws api requires proper utf8 for object names
+/*
+ * Detect and rewrite invalid UTF-8.  Invalid bytes are taken one at a
+ * time and encoded into a private-use region of the Unicode code space.
+ * This is sometimes known as wtf8 (wobbly transformation format).
+ * It is necessary because S3 validates the UTF-8 used for identifiers
+ * for correctness, while some clients may provide invalid UTF-8, notably
+ * Windows clients using cp1252.
 */
-string s3fs_surrogateescape(const string &s)
+static const unsigned int escape_base = 0xe000;  // first code point of the private-use block that escaped bytes map into
+
+// Validate s as utf8 and escape every byte that is not part of a valid
+// sequence ("wobbly" utf8).
+//
+// s      - zero terminated input bytes; NULL is treated as an empty string.
+// result - if non-NULL, the converted text is appended to *result.  Pass
+//          NULL to only test whether s needs the transform.
+//
+// Valid sequences are copied unchanged.  Each invalid byte c is replaced by
+// the three byte utf8 encoding of the code point escape_base + c.
+// Returns true if at least one byte had to be escaped.
+//
+// NOTE(review): valid input that already contains code points inside the
+// escape range starting at escape_base is passed through unchanged, so
+// s3fs_wtf8_decode will turn those code points into raw bytes.
+bool s3fs_wtf8_encode(const char *s, string *result)
 {
+  bool invalid = false;
+
+  if (!s)
+    return invalid;
+
  // Pass valid utf8 code through
-  string result;
-  for (unsigned i = 0; i < s.length(); i++) {
-    unsigned char c = s[i];
+  for (; *s; s++) {
+    const unsigned char c = *s;
+
    // single byte encoding
    if (c <= 0x7f) {
-      result += c;
+      if (result)
+        *result += c;
      continue;
    }
-    // two byte encoding
-    if ((c & 0xe0) == 0xc0) {
-      if ((i + 1) < s.length() && (s[i+1] & 0xc0) == 0x80) {
-        // printf("two bytes %02x at %d\n", c, i);
-        result += c;
-        result += s[++i];
-        continue;
-      }
-    } 
-    // three byte encoding
-    if ((c & 0xf0) == 0xe0) {
-      if ((i + 2) < s.length() && (s[i+1] & 0xc0) == 0x80 && (s[i+2] & 0xc0) == 0x80) {
-        // printf("three bytes %02x at %d\n", c, i);
-        result += c;
-        result += s[++i];
-        result += s[++i];
-        continue;
-      }
-    }
-    // four byte encoding
-    if ((c & 0xf8) == 0xf0) {
-      if ((i + 3) < s.length() && (s[i+1] & 0xc0) == 0x80 && (s[i+2] & 0xc0) == 0x80 && (s[i+3] & 0xc0) == 0x80) {
-        // printf("four bytes %02x at %d\n", c, i);
-        result += c;
-        result += s[++i];
-        result += s[++i];
-        result += s[++i];
-        continue;
-      }
-    }
-    // printf("invalid %02x at %d\n", c, i);
-    // Invalid utf8 code.  Convert to the surrogate pair (also known as wtf-8 encoding)
-    // we use lone surrogates, UDC80-UDCFF for this.
-    // if the byte is below 128, we cannot do this so we just pass the byte through and hope
-    // for the best, but really, this should be an error
-    if (c < 128) {
-      result += c;
-      continue;
-    }
-    // output the lone surrogate as utf8 encoded.  This is a three byte utf8 encoding:
-    unsigned surr = 0xdc00 + c;
-    result += 0xe0 | ((surr >> 12) & 0x0f);
-    result += 0x80 | ((surr >> 06) & 0x3f);
-    result += 0x80 | ((surr >> 00) & 0x3f);
-  }
-  return result;
-}

-string s3fs_surrogatedecode(const string &s)
-{
-  // the reverse operation.  Look for lone surrogates and replace them
-  string result;
-  for (unsigned i = 0; i < s.length(); i++) {
-    unsigned char c = s[i];
-    // look for a three byte encoding matching a lone surrogate
-    // three byte encoding
-    if ((c & 0xf0) == 0xe0) {
-      if ((i + 2) < s.length() && (s[i+1] & 0xc0) == 0x80 && (s[i+2] & 0xc0) == 0x80) {
-        unsigned surr = (c & 0x0f) << 12;
-        surr |= (s[i+1] & 0x3f) << 6;
-        surr |= (s[i+2] & 0x3f) << 0;
-        if (surr >= 0xdc80 && surr <= 0xdcff) {
-           // convert back
-           result += surr & 0xff;
-	   i+=2;
-           continue;
+    // otherwise, it must be one of the valid start bytes.  0xc0 and 0xc1 can
+    // only start overlong two byte forms; code points beyond 0x10ffff are
+    // rejected by the range check in the four byte branch.
+    if ( c >= 0xc2 && c <= 0xf5 ) {
+
+      // two byte encoding
+      // don't need bounds check, string is zero terminated
+      if ((c & 0xe0) == 0xc0 && (s[1] & 0xc0) == 0x80) {
+        // all two byte encodings starting higher than c1 are valid
+        if (result)
+          result->append(s, 2);
+        // step over the continuation byte even when result is NULL, so it
+        // is not rescanned and misreported as an invalid byte
+        s += 1;
+        continue;
+      }
+      // three byte encoding
+      if ((c & 0xf0) == 0xe0 && (s[1] & 0xc0) == 0x80 && (s[2] & 0xc0) == 0x80) {
+        const unsigned code = ((c & 0x0f) << 12) | ((s[1] & 0x3f) << 6) | (s[2] & 0x3f);
+        if (code >= 0x800 && ! (code >= 0xd800 && code <= 0xdfff)) {
+          // not overlong and not a utf16 surrogate (0xd800 - 0xdfff)
+          if (result)
+            result->append(s, 3);
+          s += 2;
+          continue;
+        }
+      }
+      // four byte encoding: lead byte is 11110xxx and carries three payload bits
+      if ((c & 0xf8) == 0xf0 && (s[1] & 0xc0) == 0x80 && (s[2] & 0xc0) == 0x80 && (s[3] & 0xc0) == 0x80) {
+        const unsigned code = ((c & 0x07) << 18) | ((s[1] & 0x3f) << 12) | ((s[2] & 0x3f) << 6) | (s[3] & 0x3f);
+        if (code >= 0x10000 && code <= 0x10ffff) {
+          // not overlong and in defined unicode space
+          if (result)
+            result->append(s, 4);
+          s += 3;
+          continue;
        }
      }
    }
-    result += c;
+    // Invalid utf8 byte.  Map it into the private use area starting at
+    // escape_base; the escape is emitted as a three byte utf8 encoding.
+    invalid = true;
+    if (result) {
+      const unsigned escape = escape_base + c;
+      *result += static_cast<char>(0xe0 | ((escape >> 12) & 0x0f));
+      *result += static_cast<char>(0x80 | ((escape >> 6) & 0x3f));
+      *result += static_cast<char>(0x80 | (escape & 0x3f));
+    }
  }
+  return invalid;
+}
+
+// Convenience overload: returns the wtf8 encoding of s as a new string.
+// Only the bytes before the first embedded zero byte are converted, because
+// the input is handed over as a C string.
+string s3fs_wtf8_encode(const string &s)
+{
+  string encoded;
+  s3fs_wtf8_encode(s.c_str(), &encoded);
+  return encoded;
+}
+
+// Reverse of s3fs_wtf8_encode: turn escaped bytes back into their original values.
+//
+// s      - zero terminated input; NULL is treated as an empty string.
+// result - if non-NULL, the decoded bytes are appended to *result.  Pass
+//          NULL to only test whether s contains escaped bytes.
+//
+// Returns true if at least one escaped byte was found.
+bool s3fs_wtf8_decode(const char *s, string *result)
+{
+  bool encoded = false;
+
+  if (!s)
+    return encoded;
+
+  for (; *s; s++) {
+    const unsigned char c = *s;
+    // look for a three byte tuple matching our encoding code
+    // don't need bounds checks: && stops at the zero terminator
+    if ((c & 0xf0) == 0xe0 && (s[1] & 0xc0) == 0x80 && (s[2] & 0xc0) == 0x80) {
+      const unsigned code = ((c & 0x0f) << 12) | ((s[1] & 0x3f) << 6) | (s[2] & 0x3f);
+      // escaped bytes occupy escape_base .. escape_base + 0xff inclusive,
+      // so the upper bound must include the escape for byte 0xff
+      if (code >= escape_base && code <= escape_base + 0xff) {
+        // convert back
+        encoded = true;
+        if (result)
+          *result += static_cast<char>(code - escape_base);
+        s += 2;  // skip the two continuation bytes; the loop steps over the lead byte
+        continue;
+      }
+    }
+    if (result)
+      *result += c;
+  }
+  return encoded;
+}
+ 
+// Convenience overload: decodes s and returns the result as a new string.
+// The input is read through s.c_str(), so decoding stops at the first
+// embedded zero byte.
+string s3fs_wtf8_decode(const string &s)
+{
+  string result;
+  s3fs_wtf8_decode(s.c_str(), &result);
  return result;
 }

--- a/src/string_util.h
+++ b/src/string_util.h
@ -58,8 +58,10 @@ std::string s3fs_hex(const unsigned char* input, size_t length);
 char* s3fs_base64(const unsigned char* input, size_t length);
 unsigned char* s3fs_decode64(const char* input, size_t* plength);

-std::string s3fs_surrogateescape(const std::string &s);
-std::string s3fs_surrogatedecode(const std::string &s);
+// Escape the bytes of s that are not valid utf8; the converted text is
+// appended to *result when result is non-NULL.  Returns true if any byte
+// had to be escaped.
+bool s3fs_wtf8_encode(const char *s, std::string *result);
+// Same conversion, returning the converted text as a new string.
+std::string s3fs_wtf8_encode(const std::string &s);
+// Reverse of s3fs_wtf8_encode; the decoded bytes are appended to *result
+// when result is non-NULL.  Returns true if any escaped byte was found.
+bool s3fs_wtf8_decode(const char *s, std::string *result);
+// Same conversion, returning the decoded bytes as a new string.
+std::string s3fs_wtf8_decode(const std::string &s);

 #endif // S3FS_STRING_UTIL_H_

--- a/src/test_string_util.cpp
+++ b/src/test_string_util.cpp
@ -87,7 +87,7 @@ void test_strtoofft()
  ASSERT_EQUALS(s3fs_strtoofft("deadbeef", /*is_base_16=*/ true), static_cast<off_t>(3735928559L));
 }

-void test_surrogateescape()
+void test_wtf8_encoding()
 {
  std::string ascii("normal string");
  std::string utf8("Hyld\xc3\xbdpi \xc3\xbej\xc3\xb3\xc3\xb0""f\xc3\xa9lagsins vex \xc3\xbar k\xc3\xa6rkomnu b\xc3\xb6li \xc3\xad \xc3\xa1st");
@ -96,19 +96,19 @@ void test_surrogateescape()
  broken[14] = 0x97;
  std::string mixed = ascii + utf8 + cp1252;

-  ASSERT_EQUALS(s3fs_surrogateescape(ascii), ascii);
-  ASSERT_EQUALS(s3fs_surrogatedecode(ascii), ascii);
-  ASSERT_EQUALS(s3fs_surrogateescape(utf8), utf8);
-  ASSERT_EQUALS(s3fs_surrogatedecode(utf8), utf8);
+  ASSERT_EQUALS(s3fs_wtf8_encode(ascii), ascii);
+  ASSERT_EQUALS(s3fs_wtf8_decode(ascii), ascii);
+  ASSERT_EQUALS(s3fs_wtf8_encode(utf8), utf8);
+  ASSERT_EQUALS(s3fs_wtf8_decode(utf8), utf8);

-  ASSERT_NEQUALS(s3fs_surrogateescape(cp1252), cp1252);
-  ASSERT_EQUALS(s3fs_surrogatedecode(s3fs_surrogateescape(cp1252)), cp1252);
+  ASSERT_NEQUALS(s3fs_wtf8_encode(cp1252), cp1252);
+  ASSERT_EQUALS(s3fs_wtf8_decode(s3fs_wtf8_encode(cp1252)), cp1252);

-  ASSERT_NEQUALS(s3fs_surrogateescape(broken), broken);
-  ASSERT_EQUALS(s3fs_surrogatedecode(s3fs_surrogateescape(broken)), broken);
+  ASSERT_NEQUALS(s3fs_wtf8_encode(broken), broken);
+  ASSERT_EQUALS(s3fs_wtf8_decode(s3fs_wtf8_encode(broken)), broken);

-  ASSERT_NEQUALS(s3fs_surrogateescape(mixed), mixed);
-  ASSERT_EQUALS(s3fs_surrogatedecode(s3fs_surrogateescape(mixed)), mixed);
+  ASSERT_NEQUALS(s3fs_wtf8_encode(mixed), mixed);
+  ASSERT_EQUALS(s3fs_wtf8_decode(s3fs_wtf8_encode(mixed)), mixed);
 }

 int main(int argc, char *argv[])
@ -116,6 +116,6 @@ int main(int argc, char *argv[])
  test_trim();
  test_base64();
  test_strtoofft();
-  test_surrogateescape();
+  test_wtf8_encoding();
  return 0;
 }