JSON fix: correctly parse UTF-16 surrogate pairs

2025-01-03 07:12:28 +00:00 · 2022-05-19 20:28:13 -04:00 · 2022-05-19 20:28:13 -04:00 · 6c7326b290
commit 6c7326b290
parent 1ec561daa4
12 changed files with 115 additions and 15 deletions
--- a/libqpdf/JSON.cc
+++ b/libqpdf/JSON.cc
@ -574,7 +574,15 @@ namespace
      private:
        void getToken();
        void handleToken();
-        static std::string decode_string(std::string const& json);
+        static std::string
+        decode_string(std::string const& json, size_t offset);
+        static void handle_u_code(
+            char const* s,
+            size_t offset,
+            size_t i,
+            unsigned long& high_surrogate,
+            size_t& high_offset,
+            std::string& result);

        enum parser_state_e {
            ps_top,
@ -620,8 +628,54 @@ namespace
    };
 } // namespace

+// Decode the JSON backslash-u escape whose 'u' is s[i]; append UTF-8 to result.
+// high_surrogate/high_offset hold a pending UTF-16 high surrogate between
+// calls (high_offset == 0 means none); bad pairing throws std::runtime_error.
+void
+JSONParser::handle_u_code(
+    char const* s,
+    size_t offset,
+    size_t i,
+    unsigned long& high_surrogate,
+    size_t& high_offset,
+    std::string& result)
+{
+    std::string hex = QUtil::hex_decode(std::string(s + i + 1, s + i + 5));
+    unsigned long codepoint = static_cast<unsigned char>(hex.at(0));
+    codepoint = (codepoint << 8) + static_cast<unsigned char>(hex.at(1));
+    if ((codepoint & 0xFC00) == 0xD800) {
+        // High surrogate: hold it until the low half arrives.
+        size_t new_high_offset = offset + i;
+        if (high_offset) {
+            QTC::TC("libtests", "JSON 16 high high");
+            throw std::runtime_error(
+                "JSON: offset " + QUtil::uint_to_string(new_high_offset) +
+                ": UTF-16 high surrogate found after previous high surrogate"
+                " at offset " + QUtil::uint_to_string(high_offset));
+        }
+        high_offset = new_high_offset;
+        high_surrogate = codepoint;
+    } else if ((codepoint & 0xFC00) == 0xDC00) {
+        // Low surrogate: valid only six bytes after a pending high surrogate.
+        // Test the sentinel too, or a stray one at offset 6 passes (0 + 6).
+        if ((high_offset == 0) || (offset + i != (high_offset + 6))) {
+            QTC::TC("libtests", "JSON 16 low not after high");
+            throw std::runtime_error(
+                "JSON: offset " + QUtil::uint_to_string(offset + i) +
+                ": UTF-16 low surrogate found not immediately after high"
+                " surrogate");
+        }
+        high_offset = 0;
+        codepoint =
+            0x10000U + ((high_surrogate & 0x3FFU) << 10U) + (codepoint & 0x3FF);
+        result += QUtil::toUTF8(codepoint);
+    } else {
+        result += QUtil::toUTF8(codepoint);
+    }
+}
+
 std::string
-JSONParser::decode_string(std::string const& str)
+JSONParser::decode_string(std::string const& str, size_t offset)
 {
    // The string has already been validated when this private method
    // is called, so errors are logic errors instead of runtime
@ -635,6 +689,9 @@ JSONParser::decode_string(std::string const& str)
    // Move inside the quotation marks
    ++s;
    len -= 2;
+    // Keep track of UTF-16 surrogate pairs.
+    unsigned long high_surrogate = 0;
+    size_t high_offset = 0;
    std::string result;
    for (size_t i = 0; i < len; ++i) {
        if (s[i] == '\\') {
@ -670,17 +727,9 @@ JSONParser::decode_string(std::string const& str)
                    throw std::logic_error(
                        "JSON parse: not enough characters after \\u");
                }
-                {
-                    std::string hex =
-                        QUtil::hex_decode(std::string(s + i + 1, s + i + 5));
-                    i += 4;
-                    unsigned char high = static_cast<unsigned char>(hex.at(0));
-                    unsigned char low = static_cast<unsigned char>(hex.at(1));
-                    unsigned long codepoint = high;
-                    codepoint <<= 8;
-                    codepoint += low;
-                    result += QUtil::toUTF8(codepoint);
-                }
+                handle_u_code(
+                    s, offset, i, high_surrogate, high_offset, result);
+                i += 4;
                break;
            default:
                throw std::logic_error("JSON parse: bad character after \\");
@ -690,6 +739,12 @@ JSONParser::decode_string(std::string const& str)
            result.append(1, s[i]);
        }
    }
+    if (high_offset) {
+        QTC::TC("libtests", "JSON 16 dangling high");
+        throw std::runtime_error(
+            "JSON: offset " + QUtil::uint_to_string(high_offset) +
+            ": UTF-16 high surrogate not followed by low surrogate");
+    }
    return result;
 }

@ -933,7 +988,7 @@ JSONParser::handleToken()
        if (token.length() < 2) {
            throw std::logic_error("JSON string length < 2");
        }
-        s_value = decode_string(token);
+        s_value = decode_string(token, offset - token.length());
    }
    // Based on the lexical state and value, figure out whether we are
    // looking at an item or a delimiter. It will always be exactly
--- a/libtests/libtests.testcov
+++ b/libtests/libtests.testcov
@ -89,3 +89,6 @@ JSONHandler unhandled value 0
 JSONHandler unexpected key 0
 JSON schema other type 0
 JSON optional key 0
+JSON 16 high high 0
+JSON 16 low not after high 0
+JSON 16 dangling high 0
--- a/libtests/qtest/json_parse.test
+++ b/libtests/qtest/json_parse.test
@ -32,7 +32,7 @@ if ($^O ne 'msys')

 cleanup();

-my $good = 10;
+my $good = 11;

 for (my $i = 1; $i <= $good; ++$i)
 {
@ -117,6 +117,9 @@ my @bad = (
    "premature end after u",    # 34
    "bad hex digit",            # 35
    "parser depth exceeded",    # 36
+    "stray low surrogate",      # 37
+    "high high surrogate",      # 38
+    "dangling high surrogate",  # 39
    );

 my $i = 0;
--- a/libtests/qtest/json_parse/bad-37.json
+++ b/libtests/qtest/json_parse/bad-37.json
@ -0,0 +1 @@
+[1, "u:potato: \udd54", 2]
--- a/libtests/qtest/json_parse/bad-37.out
+++ b/libtests/qtest/json_parse/bad-37.out
@ -0,0 +1 @@
+exception: bad-37.json: JSON: offset 15: UTF-16 low surrogate found not immediately after high surrogate
--- a/libtests/qtest/json_parse/bad-38.json
+++ b/libtests/qtest/json_parse/bad-38.json
@ -0,0 +1 @@
+"u:\ud83ezz\ud83ezz"
--- a/libtests/qtest/json_parse/bad-38.out
+++ b/libtests/qtest/json_parse/bad-38.out
@ -0,0 +1 @@
+exception: bad-38.json: JSON: offset 11: UTF-16 high surrogate found after previous high surrogate at offset 3
--- a/libtests/qtest/json_parse/bad-39.json
+++ b/libtests/qtest/json_parse/bad-39.json
@ -0,0 +1 @@
+"u:\ud83e all alone"
--- a/libtests/qtest/json_parse/bad-39.out
+++ b/libtests/qtest/json_parse/bad-39.out
@ -0,0 +1 @@
+exception: bad-39.json: JSON: offset 3: UTF-16 high surrogate not followed by low surrogate
--- a/libtests/qtest/json_parse/good-11-react.out
+++ b/libtests/qtest/json_parse/good-11-react.out
@ -0,0 +1,16 @@
+array start
+array item: [4, 0): []
+array start
+array item: [5, 11): "u:π"
+array item: [13, 23): "u:π"
+array item: [25, 39): "b:EFBBBFCF80"
+array item: [41, 53): "b:feff03c0"
+container end: [4, 54): []
+array item: [58, 0): []
+array start
+array item: [59, 67): "u:🥔"
+array item: [69, 85): "u:🥔"
+array item: [87, 103): "b:feffd83eDD54"
+container end: [58, 104): []
+container end: [0, 106): []
+[]
--- a/libtests/qtest/json_parse/good-11.json
+++ b/libtests/qtest/json_parse/good-11.json
@ -0,0 +1,4 @@
+[
+  ["u:π", "u:\u03c0", "b:EFBBBFCF80", "b:feff03c0"],
+  ["u:🥔", "u:\ud83e\udd54", "b:feffd83eDD54"]
+]
--- a/libtests/qtest/json_parse/save-11.json
+++ b/libtests/qtest/json_parse/save-11.json
@ -0,0 +1,13 @@
+[
+  [
+    "u:π",
+    "u:π",
+    "b:EFBBBFCF80",
+    "b:feff03c0"
+  ],
+  [
+    "u:🥔",
+    "u:🥔",
+    "b:feffd83eDD54"
+  ]
+]
				`@ -0,0 +1 @@`
				`exception: bad-37.json: JSON: offset 15: UTF-16 low surrogate found not immediately after high surrogate`
				`@ -0,0 +1 @@`
				`exception: bad-38.json: JSON: offset 11: UTF-16 high surrogate found after previous high surrogate at offset 3`
				`@ -0,0 +1 @@`
				`exception: bad-39.json: JSON: offset 3: UTF-16 high surrogate not followed by low surrogate`