Detect overlong UTF-8 strings

2024-12-22 02:49:00 +00:00 · 2023-12-25 10:51:40 -05:00 · 2023-12-25 10:51:40 -05:00 · 6d4115b7c5
commit 6d4115b7c5
parent 986d248578
3 changed files with 56 additions and 3 deletions
--- a/5
+++ b/5
@ -1,3 +1,8 @@
+2023-12-25  Jay Berkenbilt  <ejb@ql.org>
+
+	* Detect overlong UTF-8 in the UTF-8 decoder, and fix detection of
+	8-bit characters in erroneous UTF-8 strings.
+
 2023-12-24  Jay Berkenbilt  <ejb@ql.org>

 	* 11.7.0: release
--- a/libqpdf/QUtil.cc
+++ b/libqpdf/QUtil.cc
@ -1485,6 +1485,7 @@ encode_pdfdoc(unsigned long codepoint)
 unsigned long
 QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error)
 {
+    auto o_pos = pos;
    size_t len = utf8_val.length();
    unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos++));
    error = false;
@ -1505,7 +1506,7 @@ QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& e
        return 0xfffd;
    }

-    unsigned long codepoint = static_cast<unsigned long>(ch & ~to_clear);
+    auto codepoint = static_cast<unsigned long>(ch & ~to_clear);
    while (bytes_needed > 0) {
        --bytes_needed;
        ch = static_cast<unsigned char>(utf8_val.at(pos++));
@ -1517,6 +1518,31 @@ QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& e
        codepoint <<= 6;
        codepoint += (ch & 0x3f);
    }
+    unsigned long lower_bound = 0; // smallest codepoint that needs (pos - o_pos) bytes
+    switch (pos - o_pos) {
+    case 2:
+        lower_bound = 1 << 7; // 1-byte forms hold 7 payload bits
+        break;
+    case 3:
+        lower_bound = 1 << 11; // 2-byte forms hold 5 + 6 = 11 payload bits
+        break;
+    case 4:
+        lower_bound = 1 << 16; // 3-byte forms hold 4 + 12 = 16 payload bits
+        break;
+    case 5:
+        lower_bound = 1 << 21; // 4-byte forms hold 3 + 18 = 21 payload bits
+        break;
+    case 6:
+        lower_bound = 1 << 26; // 5-byte forms hold 2 + 24 = 26 payload bits
+        break;
+    default:
+        lower_bound = 0; // no minimum to enforce; the check below is skipped
+    }
+
+    if (lower_bound > 0 && codepoint < lower_bound) {
+        // Too many bytes were used, but return whatever character was encoded.
+        error = true;
+    }
    return codepoint;
 }

@ -1799,11 +1825,16 @@ QUtil::analyze_encoding(
    bool any_errors = false;
    while (pos < len) {
        bool error = false;
+        auto old_pos = pos;
        unsigned long codepoint = get_next_utf8_codepoint(val, pos, error);
        if (error) {
            any_errors = true;
-        }
-        if (codepoint >= 128) {
+            for (auto p = old_pos; p < pos; p++) {
+                if (static_cast<unsigned char>(val.at(p)) >= 128) {
+                    has_8bit_chars = true;
+                }
+            }
+        } else if (codepoint >= 128) {
            has_8bit_chars = true;
        }
    }
--- a/libtests/qutil.cc
+++ b/libtests/qutil.cc
@ -266,6 +266,23 @@ to_utf8_test()
    } catch (std::runtime_error& e) {
        std::cout << "0x80000000: " << e.what() << std::endl;
    }
+
+    // Overlong characters: characters represented by more bytes than necessary.
+    size_t pos = 0;
+    std::string utf8 = "\xC0\x80"                  // 1 << 7
+                       "\xE0\x80\x80"              // 1 << 11
+                       "\xF0\x80\x80\x80"          // 1 << 16
+                       "\xF8\x80\x80\x80\x80"      // 1 << 21
+                       "\xFC\x80\x80\x80\x80\x80"; // 1 << 26
+    auto check = [&pos, &utf8](unsigned long wanted_pos) {
+        bool error = false;
+        assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0 && error && pos == wanted_pos);
+    };
+    check(2);
+    check(5);
+    check(9);
+    check(14);
+    check(20);
 }

 static void