From 6d4115b7c565b6750ba4649d120446a1bd2b5af2 Mon Sep 17 00:00:00 2001
From: Jay Berkenbilt <ejb@ql.org>
Date: Mon, 25 Dec 2023 10:51:40 -0500
Subject: [PATCH] Detect overlong UTF-8 strings

---
 ChangeLog         |  5 +++++
 libqpdf/QUtil.cc  | 37 ++++++++++++++++++++++++++++++++++---
 libtests/qutil.cc | 17 +++++++++++++++++
 3 files changed, 56 insertions(+), 3 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 1ed0dcde..f313fab4 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2023-12-25  Jay Berkenbilt  <ejb@ql.org>
+
+	* Detect overlong UTF-8 in the UTF-8 decoder, and fix detection of
+	8-bit characters in erroneous UTF-8 strings.
+
 2023-12-24  Jay Berkenbilt  <ejb@ql.org>
 
 	* 11.7.0: release
diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc
index fcba203f..25c7281f 100644
--- a/libqpdf/QUtil.cc
+++ b/libqpdf/QUtil.cc
@@ -1485,6 +1485,7 @@ encode_pdfdoc(unsigned long codepoint)
 unsigned long
 QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error)
 {
+    auto o_pos = pos;
     size_t len = utf8_val.length();
     unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos++));
     error = false;
@@ -1505,7 +1506,7 @@ QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& e
         return 0xfffd;
     }
 
-    unsigned long codepoint = static_cast<unsigned long>(ch & ~to_clear);
+    auto codepoint = static_cast<unsigned long>(ch & ~to_clear);
     while (bytes_needed > 0) {
         --bytes_needed;
         ch = static_cast<unsigned char>(utf8_val.at(pos++));
@@ -1517,6 +1518,31 @@ QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& e
         codepoint <<= 6;
         codepoint += (ch & 0x3f);
     }
+    unsigned long lower_bound = 0; // smallest value that legitimately needs this many bytes
+    switch (pos - o_pos) {
+    case 2:
+        lower_bound = 1 << 7; // one byte reaches up to 2^7 - 1
+        break;
+    case 3:
+        lower_bound = 1 << 11; // two bytes reach up to 2^11 - 1
+        break;
+    case 4:
+        lower_bound = 1 << 16; // three bytes reach up to 2^16 - 1
+        break;
+    case 5:
+        lower_bound = 1 << 21; // four bytes reach up to 2^21 - 1
+        break;
+    case 6:
+        lower_bound = 1 << 26; // five bytes reach up to 2^26 - 1
+        break;
+    default:
+        lower_bound = 0; // any other length: no minimum to enforce
+    }
+
+    if (lower_bound > 0 && codepoint < lower_bound) {
+        // Too many bytes were used, but return whatever character was encoded.
+        error = true;
+    }
     return codepoint;
 }
 
@@ -1799,11 +1825,16 @@ QUtil::analyze_encoding(
     bool any_errors = false;
     while (pos < len) {
         bool error = false;
+        auto old_pos = pos; // where the sequence decoded below begins
         unsigned long codepoint = get_next_utf8_codepoint(val, pos, error);
         if (error) {
             any_errors = true;
-        }
-        if (codepoint >= 128) {
+            for (auto p = old_pos; p < pos; p++) { // on error, examine the raw bytes consumed
+                if (static_cast<unsigned char>(val.at(p)) >= 128) {
+                    has_8bit_chars = true;
+                }
+            }
+        } else if (codepoint >= 128) {
             has_8bit_chars = true;
         }
     }
diff --git a/libtests/qutil.cc b/libtests/qutil.cc
index e882a33a..ca6ee314 100644
--- a/libtests/qutil.cc
+++ b/libtests/qutil.cc
@@ -266,6 +266,23 @@ to_utf8_test()
     } catch (std::runtime_error& e) {
         std::cout << "0x80000000: " << e.what() << std::endl;
     }
+
+    // Overlong characters: characters represented by more bytes than necessary.
+    size_t pos = 0;
+    std::string utf8 = "\xC0\x80"                  // 1 << 7
+                       "\xE0\x80\x80"              // 1 << 11
+                       "\xF0\x80\x80\x80"          // 1 << 16
+                       "\xF8\x80\x80\x80\x80"      // 1 << 21
+                       "\xFC\x80\x80\x80\x80\x80"; // 1 << 26
+    auto check = [&pos, &utf8](unsigned long wanted_pos) {
+        bool error = false;
+        assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0 && error && pos == wanted_pos);
+    };
+    check(2);
+    check(5);
+    check(9);
+    check(14);
+    check(20);
 }
 
 static void