mirror of
https://github.com/qpdf/qpdf.git
synced 2024-12-22 10:58:58 +00:00
Detect overlong UTF-8 strings
This commit is contained in:
parent
986d248578
commit
6d4115b7c5
@ -1,3 +1,8 @@
|
||||
2023-12-25 Jay Berkenbilt <ejb@ql.org>
|
||||
|
||||
* Detect overlong UTF-8 in the UTF-8 decoder, and fix detection of
|
||||
8-bit characters in erroneous UTF-8 strings.
|
||||
|
||||
2023-12-24 Jay Berkenbilt <ejb@ql.org>
|
||||
|
||||
* 11.7.0: release
|
||||
|
@ -1485,6 +1485,7 @@ encode_pdfdoc(unsigned long codepoint)
|
||||
unsigned long
|
||||
QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error)
|
||||
{
|
||||
auto o_pos = pos;
|
||||
size_t len = utf8_val.length();
|
||||
unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos++));
|
||||
error = false;
|
||||
@ -1505,7 +1506,7 @@ QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& e
|
||||
return 0xfffd;
|
||||
}
|
||||
|
||||
unsigned long codepoint = static_cast<unsigned long>(ch & ~to_clear);
|
||||
auto codepoint = static_cast<unsigned long>(ch & ~to_clear);
|
||||
while (bytes_needed > 0) {
|
||||
--bytes_needed;
|
||||
ch = static_cast<unsigned char>(utf8_val.at(pos++));
|
||||
@ -1517,6 +1518,31 @@ QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& e
|
||||
codepoint <<= 6;
|
||||
codepoint += (ch & 0x3f);
|
||||
}
|
||||
unsigned long lower_bound = 0;
|
||||
switch (pos - o_pos) {
|
||||
case 2:
|
||||
lower_bound = 1 << 7;
|
||||
break;
|
||||
case 3:
|
||||
lower_bound = 1 << 11;
|
||||
break;
|
||||
case 4:
|
||||
lower_bound = 1 << 16;
|
||||
break;
|
||||
case 5:
|
||||
lower_bound = 1 << 21;
|
||||
break;
|
||||
case 6:
|
||||
lower_bound = 1 << 26;
|
||||
break;
|
||||
default:
|
||||
lower_bound = 0;
|
||||
}
|
||||
|
||||
if (lower_bound > 0 && codepoint < lower_bound) {
|
||||
// Too many bytes were used, but return whatever character was encoded.
|
||||
error = true;
|
||||
}
|
||||
return codepoint;
|
||||
}
|
||||
|
||||
@ -1799,11 +1825,16 @@ QUtil::analyze_encoding(
|
||||
bool any_errors = false;
|
||||
while (pos < len) {
|
||||
bool error = false;
|
||||
auto old_pos = pos;
|
||||
unsigned long codepoint = get_next_utf8_codepoint(val, pos, error);
|
||||
if (error) {
|
||||
any_errors = true;
|
||||
}
|
||||
if (codepoint >= 128) {
|
||||
for (auto p = old_pos; p < pos; p++) {
|
||||
if (static_cast<unsigned char>(val.at(p)) >= 128) {
|
||||
has_8bit_chars = true;
|
||||
}
|
||||
}
|
||||
} else if (codepoint >= 128) {
|
||||
has_8bit_chars = true;
|
||||
}
|
||||
}
|
||||
|
@ -266,6 +266,23 @@ to_utf8_test()
|
||||
} catch (std::runtime_error& e) {
|
||||
std::cout << "0x80000000: " << e.what() << std::endl;
|
||||
}
|
||||
|
||||
// Overlong characters: characters represented by more bytes than necessary.
|
||||
size_t pos = 0;
|
||||
std::string utf8 = "\xC0\x80" // 1 << 7
|
||||
"\xE0\x80\x80" // 1 << 11
|
||||
"\xF0\x80\x80\x80" // 1 << 16
|
||||
"\xF8\x80\x80\x80\x80" // 1 << 21
|
||||
"\xFC\x80\x80\x80\x80\x80"; // 1 << 26
|
||||
auto check = [&pos, &utf8](unsigned long wanted_pos) {
|
||||
bool error = false;
|
||||
assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0 && error && pos == wanted_pos);
|
||||
};
|
||||
check(2);
|
||||
check(5);
|
||||
check(9);
|
||||
check(14);
|
||||
check(20);
|
||||
}
|
||||
|
||||
static void
|
||||
|
Loading…
Reference in New Issue
Block a user