Detect overlong UTF-8 strings

This commit is contained in:
Jay Berkenbilt 2023-12-25 10:51:40 -05:00
parent 986d248578
commit 6d4115b7c5
3 changed files with 56 additions and 3 deletions

View File

@ -1,3 +1,8 @@
2023-12-25 Jay Berkenbilt <ejb@ql.org>
* Detect overlong UTF-8 in the UTF-8 decoder, and fix detection of
8-bit characters in erroneous UTF-8 strings.
2023-12-24 Jay Berkenbilt <ejb@ql.org>
* 11.7.0: release

View File

@ -1485,6 +1485,7 @@ encode_pdfdoc(unsigned long codepoint)
unsigned long
QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error)
{
auto o_pos = pos;
size_t len = utf8_val.length();
unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos++));
error = false;
@ -1505,7 +1506,7 @@ QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& e
return 0xfffd;
}
unsigned long codepoint = static_cast<unsigned long>(ch & ~to_clear);
auto codepoint = static_cast<unsigned long>(ch & ~to_clear);
while (bytes_needed > 0) {
--bytes_needed;
ch = static_cast<unsigned char>(utf8_val.at(pos++));
@ -1517,6 +1518,31 @@ QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& e
codepoint <<= 6;
codepoint += (ch & 0x3f);
}
// Overlong detection: pos - o_pos is the number of bytes consumed for this
// character. For each sequence length, lower_bound is the smallest code
// point that cannot be represented with one byte fewer:
//   1 byte  carries  7 payload bits -> 2-byte sequences need >= 1 << 7
//   2 bytes carry   11 payload bits -> 3-byte sequences need >= 1 << 11
//   3 bytes carry   16 payload bits -> 4-byte sequences need >= 1 << 16
//   4 bytes carry   21 payload bits -> 5-byte sequences need >= 1 << 21
//   5 bytes carry   26 payload bits -> 6-byte sequences need >= 1 << 26
// A lower_bound of 0 means no overlong check applies (single-byte input).
unsigned long lower_bound = 0;
switch (pos - o_pos) {
case 2:
    lower_bound = 1UL << 7;
    break;
case 3:
    lower_bound = 1UL << 11;
    break;
case 4:
    lower_bound = 1UL << 16;
    break;
case 5:
    // Was 1 << 12 (transposed digits), which let overlong 5-byte sequences
    // encoding values in [1 << 12, 1 << 21) go undetected.
    lower_bound = 1UL << 21;
    break;
case 6:
    lower_bound = 1UL << 26;
    break;
default:
    lower_bound = 0;
}
if (lower_bound > 0 && codepoint < lower_bound) {
    // Too many bytes were used, but return whatever character was encoded.
    error = true;
}
return codepoint;
}
@ -1799,11 +1825,16 @@ QUtil::analyze_encoding(
bool any_errors = false;
while (pos < len) {
bool error = false;
auto old_pos = pos;
unsigned long codepoint = get_next_utf8_codepoint(val, pos, error);
if (error) {
any_errors = true;
}
if (codepoint >= 128) {
for (auto p = old_pos; p < pos; p++) {
if (static_cast<unsigned char>(val.at(p)) >= 128) {
has_8bit_chars = true;
}
}
} else if (codepoint >= 128) {
has_8bit_chars = true;
}
}

View File

@ -266,6 +266,23 @@ to_utf8_test()
} catch (std::runtime_error& e) {
std::cout << "0x80000000: " << e.what() << std::endl;
}
// Overlong characters: characters represented by more bytes than necessary.
size_t pos = 0;
std::string utf8 = "\xC0\x80" // 1 << 7
"\xE0\x80\x80" // 1 << 11
"\xF0\x80\x80\x80" // 1 << 16
"\xF8\x80\x80\x80\x80" // 1 << 21
"\xFC\x80\x80\x80\x80\x80"; // 1 << 26
auto check = [&pos, &utf8](unsigned long wanted_pos) {
bool error = false;
assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0 && error && pos == wanted_pos);
};
check(2);
check(5);
check(9);
check(14);
check(20);
}
static void