2
1
mirror of https://github.com/qpdf/qpdf.git synced 2025-01-03 07:12:28 +00:00

JSON fix: correctly parse UTF-16 surrogate pairs

This commit is contained in:
Jay Berkenbilt 2022-05-19 20:28:13 -04:00
parent 1ec561daa4
commit 6c7326b290
12 changed files with 115 additions and 15 deletions

View File

@ -574,7 +574,15 @@ namespace
private:
void getToken();
void handleToken();
static std::string decode_string(std::string const& json);
static std::string
decode_string(std::string const& json, size_t offset);
static void handle_u_code(
char const* s,
size_t offset,
size_t i,
unsigned long& high_surrogate,
size_t& high_offset,
std::string& result);
enum parser_state_e {
ps_top,
@ -620,8 +628,54 @@ namespace
};
} // namespace
// Decode one \uXXXX escape and append the resulting character to
// result as UTF-8, combining UTF-16 surrogate pairs into a single
// code point.
//
// s: the string being decoded; the four hex digits of the escape
//    are read from s[i + 1] through s[i + 4].
// offset: added to i to produce the position used in error messages
//    and for tracking where a high surrogate was seen.
// i: index into s for the escape being handled.
// high_surrogate: in/out; holds the most recently seen high surrogate
//    until the matching low surrogate arrives.
// high_offset: in/out; position (offset + i) of a pending high
//    surrogate, or 0 when no high surrogate is pending.
// result: the decoded output to which the character is appended.
//
// Throws std::runtime_error if a high surrogate follows another
// pending high surrogate, or if a low surrogate does not appear
// immediately after a high surrogate.
void
JSONParser::handle_u_code(
    char const* s,
    size_t offset,
    size_t i,
    unsigned long& high_surrogate,
    size_t& high_offset,
    std::string& result)
{
    // hex_decode turns the four hex digits into two bytes: the high
    // and low bytes of a 16-bit UTF-16 code unit.
    std::string hex = QUtil::hex_decode(std::string(s + i + 1, s + i + 5));
    unsigned char high = static_cast<unsigned char>(hex.at(0));
    unsigned char low = static_cast<unsigned char>(hex.at(1));
    unsigned long codepoint = high;
    codepoint <<= 8;
    codepoint += low;
    if ((codepoint & 0xFC00) == 0xD800) {
        // high surrogate
        size_t new_high_offset = offset + i;
        if (high_offset) {
            QTC::TC("libtests", "JSON 16 high high");
            throw std::runtime_error(
                "JSON: offset " + QUtil::uint_to_string(new_high_offset) +
                ": UTF-16 high surrogate found after previous high surrogate"
                " at offset " +
                QUtil::uint_to_string(high_offset));
        }
        // Nothing is appended yet; the character is emitted when the
        // low surrogate arrives.
        high_offset = new_high_offset;
        high_surrogate = codepoint;
    } else if ((codepoint & 0xFC00) == 0xDC00) {
        // low surrogate
        //
        // A \uXXXX escape is six characters long, so a valid low
        // surrogate sits exactly six positions after its high
        // surrogate. high_offset == 0 means no high surrogate is
        // pending; it must be tested explicitly because otherwise a
        // stray low surrogate at position 6 would satisfy 0 + 6 == 6
        // and be silently combined with a zero high surrogate.
        if ((high_offset == 0) || (offset + i != (high_offset + 6))) {
            QTC::TC("libtests", "JSON 16 low not after high");
            throw std::runtime_error(
                "JSON: offset " + QUtil::uint_to_string(offset + i) +
                ": UTF-16 low surrogate found not immediately after high"
                " surrogate");
        }
        high_offset = 0;
        // Combine the low 10 bits of each surrogate into a code point
        // at or above U+10000.
        codepoint =
            0x10000U + ((high_surrogate & 0x3FFU) << 10U) + (codepoint & 0x3FF);
        result += QUtil::toUTF8(codepoint);
    } else {
        // Not a surrogate: the code unit is the code point.
        result += QUtil::toUTF8(codepoint);
    }
}
std::string
JSONParser::decode_string(std::string const& str)
JSONParser::decode_string(std::string const& str, size_t offset)
{
// The string has already been validated when this private method
// is called, so errors are logic errors instead of runtime
@ -635,6 +689,9 @@ JSONParser::decode_string(std::string const& str)
// Move inside the quotation marks
++s;
len -= 2;
// Keep track of UTF-16 surrogate pairs.
unsigned long high_surrogate = 0;
size_t high_offset = 0;
std::string result;
for (size_t i = 0; i < len; ++i) {
if (s[i] == '\\') {
@ -670,17 +727,9 @@ JSONParser::decode_string(std::string const& str)
throw std::logic_error(
"JSON parse: not enough characters after \\u");
}
{
std::string hex =
QUtil::hex_decode(std::string(s + i + 1, s + i + 5));
i += 4;
unsigned char high = static_cast<unsigned char>(hex.at(0));
unsigned char low = static_cast<unsigned char>(hex.at(1));
unsigned long codepoint = high;
codepoint <<= 8;
codepoint += low;
result += QUtil::toUTF8(codepoint);
}
handle_u_code(
s, offset, i, high_surrogate, high_offset, result);
i += 4;
break;
default:
throw std::logic_error("JSON parse: bad character after \\");
@ -690,6 +739,12 @@ JSONParser::decode_string(std::string const& str)
result.append(1, s[i]);
}
}
if (high_offset) {
QTC::TC("libtests", "JSON 16 dangling high");
throw std::runtime_error(
"JSON: offset " + QUtil::uint_to_string(high_offset) +
": UTF-16 high surrogate not followed by low surrogate");
}
return result;
}
@ -933,7 +988,7 @@ JSONParser::handleToken()
if (token.length() < 2) {
throw std::logic_error("JSON string length < 2");
}
s_value = decode_string(token);
s_value = decode_string(token, offset - token.length());
}
// Based on the lexical state and value, figure out whether we are
// looking at an item or a delimiter. It will always be exactly

View File

@ -89,3 +89,6 @@ JSONHandler unhandled value 0
JSONHandler unexpected key 0
JSON schema other type 0
JSON optional key 0
JSON 16 high high 0
JSON 16 low not after high 0
JSON 16 dangling high 0

View File

@ -32,7 +32,7 @@ if ($^O ne 'msys')
cleanup();
my $good = 10;
my $good = 11;
for (my $i = 1; $i <= $good; ++$i)
{
@ -117,6 +117,9 @@ my @bad = (
"premature end after u", # 34
"bad hex digit", # 35
"parser depth exceeded", # 36
"stray low surrogate", # 37
"high high surrogate", # 38
"dangling high surrogate", # 39
);
my $i = 0;

View File

@ -0,0 +1 @@
[1, "u:potato: \udd54", 2]

View File

@ -0,0 +1 @@
exception: bad-37.json: JSON: offset 15: UTF-16 low surrogate found not immediately after high surrogate

View File

@ -0,0 +1 @@
"u:\ud83ezz\ud83ezz"

View File

@ -0,0 +1 @@
exception: bad-38.json: JSON: offset 11: UTF-16 high surrogate found after previous high surrogate at offset 3

View File

@ -0,0 +1 @@
"u:\ud83e all alone"

View File

@ -0,0 +1 @@
exception: bad-39.json: JSON: offset 3: UTF-16 high surrogate not followed by low surrogate

View File

@ -0,0 +1,16 @@
array start
array item: [4, 0): []
array start
array item: [5, 11): "u:π"
array item: [13, 23): "u:π"
array item: [25, 39): "b:EFBBBFCF80"
array item: [41, 53): "b:feff03c0"
container end: [4, 54): []
array item: [58, 0): []
array start
array item: [59, 67): "u:🥔"
array item: [69, 85): "u:🥔"
array item: [87, 103): "b:feffd83eDD54"
container end: [58, 104): []
container end: [0, 106): []
[]

View File

@ -0,0 +1,4 @@
[
["u:π", "u:\u03c0", "b:EFBBBFCF80", "b:feff03c0"],
["u:🥔", "u:\ud83e\udd54", "b:feffd83eDD54"]
]

View File

@ -0,0 +1,13 @@
[
[
"u:π",
"u:π",
"b:EFBBBFCF80",
"b:feff03c0"
],
[
"u:🥔",
"u:🥔",
"b:feffd83eDD54"
]
]