From 6d4115b7c565b6750ba4649d120446a1bd2b5af2 Mon Sep 17 00:00:00 2001
From: Jay Berkenbilt
Date: Mon, 25 Dec 2023 10:51:40 -0500
Subject: [PATCH] Detect overlong UTF-8 strings

---
Notes (ignored by git am):
  * QUtil::get_next_utf8_codepoint now remembers the starting position and,
    after decoding, compares the code point with the smallest value that
    needs the number of bytes actually consumed. An overlong sequence sets
    error = true but the decoded code point is still returned.
  * Minimum code point per encoded length: 2 bytes -> 1 << 7,
    3 -> 1 << 11, 4 -> 1 << 16, 5 -> 1 << 21, 6 -> 1 << 26.
  * QUtil::analyze_encoding no longer trusts the returned code point when a
    sequence is erroneous; it scans the raw bytes of that sequence for
    values >= 128 instead.

 ChangeLog         |  5 +++++
 libqpdf/QUtil.cc  | 37 ++++++++++++++++++++++++++++++++++---
 libtests/qutil.cc | 17 +++++++++++++++++
 3 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 1ed0dcde..f313fab4 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2023-12-25  Jay Berkenbilt
+
+	* Detect overlong UTF-8 in the UTF-8 decoder, and fix detection of
+	8-bit characters in erroneous UTF-8 strings.
+
 2023-12-24  Jay Berkenbilt
 
 	* 11.7.0: release
diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc
index fcba203f..25c7281f 100644
--- a/libqpdf/QUtil.cc
+++ b/libqpdf/QUtil.cc
@@ -1485,6 +1485,7 @@ encode_pdfdoc(unsigned long codepoint)
 unsigned long
 QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error)
 {
+    auto o_pos = pos;
     size_t len = utf8_val.length();
     unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos++));
     error = false;
@@ -1505,7 +1506,7 @@ QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& e
         return 0xfffd;
     }
 
-    unsigned long codepoint = static_cast<unsigned long>(ch & ~to_clear);
+    auto codepoint = static_cast<unsigned long>(ch & ~to_clear);
     while (bytes_needed > 0) {
         --bytes_needed;
         ch = static_cast<unsigned char>(utf8_val.at(pos++));
@@ -1517,6 +1518,31 @@ QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& e
         codepoint <<= 6;
         codepoint += (ch & 0x3f);
     }
+    unsigned long lower_bound = 0;
+    switch (pos - o_pos) {
+    case 2:
+        lower_bound = 1 << 7;
+        break;
+    case 3:
+        lower_bound = 1 << 11;
+        break;
+    case 4:
+        lower_bound = 1 << 16;
+        break;
+    case 5:
+        lower_bound = 1 << 21;
+        break;
+    case 6:
+        lower_bound = 1 << 26;
+        break;
+    default:
+        lower_bound = 0;
+    }
+
+    if (lower_bound > 0 && codepoint < lower_bound) {
+        // Too many bytes were used, but return whatever character was encoded.
+        error = true;
+    }
     return codepoint;
 }
 
@@ -1799,11 +1825,16 @@ QUtil::analyze_encoding(
     bool any_errors = false;
     while (pos < len) {
         bool error = false;
+        auto old_pos = pos;
         unsigned long codepoint = get_next_utf8_codepoint(val, pos, error);
         if (error) {
             any_errors = true;
-        }
-        if (codepoint >= 128) {
+            for (auto p = old_pos; p < pos; p++) {
+                if (static_cast<unsigned char>(val.at(p)) >= 128) {
+                    has_8bit_chars = true;
+                }
+            }
+        } else if (codepoint >= 128) {
             has_8bit_chars = true;
         }
     }
diff --git a/libtests/qutil.cc b/libtests/qutil.cc
index e882a33a..ca6ee314 100644
--- a/libtests/qutil.cc
+++ b/libtests/qutil.cc
@@ -266,6 +266,23 @@ to_utf8_test()
     } catch (std::runtime_error& e) {
         std::cout << "0x80000000: " << e.what() << std::endl;
     }
+
+    // Overlong characters: characters represented by more bytes than necessary.
+    size_t pos = 0;
+    std::string utf8 = "\xC0\x80"                  // 1 << 7
+                       "\xE0\x80\x80"              // 1 << 11
+                       "\xF0\x80\x80\x80"          // 1 << 16
+                       "\xF8\x80\x80\x80\x80"      // 1 << 21
+                       "\xFC\x80\x80\x80\x80\x80"; // 1 << 26
+    auto check = [&pos, &utf8](unsigned long wanted_pos) {
+        bool error = false;
+        assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0 && error && pos == wanted_pos);
+    };
+    check(2);
+    check(5);
+    check(9);
+    check(14);
+    check(20);
 }
 
 static void