Expose QUtil::get_next_utf8_codepoint

2024-12-22 10:58:58 +00:00 · 2022-04-23 16:39:27 -04:00 · 2022-04-23 16:39:27 -04:00 · 22b35c4928
commit 22b35c4928
parent 5bbb0d4c30
8 changed files with 85 additions and 20 deletions
--- a/.dir-locals.el
+++ b/.dir-locals.el
@ -1,4 +1,4 @@
-((nil . ((indent-tabs-mode . t)
+((nil . ((indent-tabs-mode . nil)
         (qpdf-cc-style
          .
          ("qpdf"
--- a/11
+++ b/11
@ -1,3 +1,14 @@
+2022-04-23  Jay Berkenbilt  <ejb@ql.org>
+
+	* Add new method QUtil::is_explicit_utf8 that tests whether a
+	string is explicitly marked as being UTF-8 encoded, as allowed by
+	the PDF 2.0 spec. Such a string starts with the bytes 0xEF 0xBB
+	0xBF, which is the UTF-8 encoding of U+FEFF.
+
+	* Add new method QUtil::get_next_utf8_codepoint as a low-level
+	helper for iterating through the UTF-8 characters in a byte
+	string.
+
 2022-04-16  Jay Berkenbilt  <ejb@ql.org>

 	* Breaking CLI change: the default value for --json is now
--- a/3
+++ b/3
@ -11,9 +11,6 @@ In order:
 Other (do in any order):

 Misc
-* Consider exposing get_next_utf8_codepoint in QUtil
-* Add QUtil::is_explicit_utf8 that does what QPDF_String::getUTF8Val
-  does to detect UTF-8 encoded strings per PDF 2.0 spec.
 * Add an option --ignore-encryption to ignore encryption information
  and treat encrypted files as if they weren't encrypted. This should
  make it possible to solve #598 (--show-encryption without a
--- a/include/qpdf/QUtil.hh
+++ b/include/qpdf/QUtil.hh
@ -268,14 +268,33 @@ namespace QUtil
    QPDF_DLL
    std::string toUTF16(unsigned long uval);

+    // If utf8_val.at(pos) points to the beginning of a valid
+    // UTF-8-encoded character, return the codepoint of the character
+    // and set error to false. Otherwise, return 0xfffd and set error
+    // to true. In all cases, pos is advanced to the next position
+    // that may begin a valid character. When the string has been
+    // consumed, pos will be set to the string length. It is an error
+    // to pass a value of pos that is greater than or equal to the
+    // length of the string.
+    QPDF_DLL
+    unsigned long get_next_utf8_codepoint(
+        std::string const& utf8_val, size_t& pos, bool& error);
+
    // Test whether this is a UTF-16 string. This is indicated by
    // first two bytes being 0xFE 0xFF (big-endian) or 0xFF 0xFE
-    // (little-endian). Starting in qpdf 10.6.2, this detects
+    // (little-endian), each of which is the encoding of U+FEFF, the
+    // Unicode byte order mark. Starting in qpdf 10.6.2, this detects
    // little-endian as well as big-endian. Even though the PDF spec
    // doesn't allow little-endian, most readers seem to accept it.
    QPDF_DLL
    bool is_utf16(std::string const&);

+    // Test whether this is an explicit UTF-8 string as allowed by the
+    // PDF 2.0 spec. This is indicated by the first three bytes being 0xEF
+    // 0xBB 0xBF, which is the UTF-8 encoding of U+FEFF.
+    QPDF_DLL
+    bool is_explicit_utf8(std::string const&);
+
    // Convert a UTF-8 encoded string to UTF-16 big-endian.
    // Unrepresentable code points are converted to U+FFFD.
    QPDF_DLL
--- a/libqpdf/QPDF_String.cc
+++ b/libqpdf/QPDF_String.cc
@ -166,11 +166,9 @@ QPDF_String::getUTF8Val() const
 {
    if (QUtil::is_utf16(this->val)) {
        return QUtil::utf16_to_utf8(this->val);
-    } else if (
-        (val.length() >= 3) && (val.at(0) == '\xEF') && (val.at(1) == '\xBB') &&
-        (val.at(2) == '\xBF')) {
+    } else if (QUtil::is_explicit_utf8(this->val)) {
        // PDF 2.0 allows UTF-8 strings when explicitly prefixed with
-        // the above bytes, which is just UTF-8 encoding of U+FEFF.
+        // the three-byte UTF-8 encoding of U+FEFF.
        return this->val.substr(3);
    } else {
        return QUtil::pdf_doc_to_utf8(this->val);
--- a/libqpdf/QUtil.cc
+++ b/libqpdf/QUtil.cc
@ -1529,10 +1529,11 @@ encode_pdfdoc(unsigned long codepoint)
 }

 unsigned long
-get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error)
+QUtil::get_next_utf8_codepoint(
+    std::string const& utf8_val, size_t& pos, bool& error)
 {
    size_t len = utf8_val.length();
-    unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos));
+    unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos++));
    error = false;
    if (ch < 128) {
        return static_cast<unsigned long>(ch);
@ -1547,7 +1548,7 @@ get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error)
        bit_check >>= 1;
    }
    if (((bytes_needed > 5) || (bytes_needed < 1)) ||
-        ((pos + bytes_needed) >= len)) {
+        ((pos + bytes_needed) > len)) {
        error = true;
        return 0xfffd;
    }
@ -1555,11 +1556,11 @@ get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error)
    unsigned long codepoint = static_cast<unsigned long>(ch & ~to_clear);
    while (bytes_needed > 0) {
        --bytes_needed;
-        ch = static_cast<unsigned char>(utf8_val.at(++pos));
+        ch = static_cast<unsigned char>(utf8_val.at(pos++));
        if ((ch & 0xc0) != 0x80) {
            --pos;
-            codepoint = 0xfffd;
-            break;
+            error = true;
+            return 0xfffd;
        }
        codepoint <<= 6;
        codepoint += (ch & 0x3f);
@ -1580,9 +1581,11 @@ transcode_utf8(
        result += "\xfe\xff";
    }
    size_t len = utf8_val.length();
-    for (size_t i = 0; i < len; ++i) {
+    size_t pos = 0;
+    while (pos < len) {
        bool error = false;
-        unsigned long codepoint = get_next_utf8_codepoint(utf8_val, i, error);
+        unsigned long codepoint =
+            QUtil::get_next_utf8_codepoint(utf8_val, pos, error);
        if (error) {
            okay = false;
            if (encoding == e_utf16) {
@ -1710,6 +1713,15 @@ QUtil::is_utf16(std::string const& val)
         ((val.at(0) == '\xff') && (val.at(1) == '\xfe'))));
 }

+bool
+QUtil::is_explicit_utf8(std::string const& val)
+{
+    // QPDF_String.cc relies on this marker being exactly 3 bytes long.
+    return (
+        (val.length() >= 3) && (val.at(0) == '\xef') && (val.at(1) == '\xbb') &&
+        (val.at(2) == '\xbf'));
+}
+
 std::string
 QUtil::utf16_to_utf8(std::string const& val)
 {
@ -1826,10 +1838,11 @@ QUtil::analyze_encoding(
        return;
    }
    size_t len = val.length();
+    size_t pos = 0;
    bool any_errors = false;
-    for (size_t i = 0; i < len; ++i) {
+    while (pos < len) {
        bool error = false;
-        unsigned long codepoint = get_next_utf8_codepoint(val, i, error);
+        unsigned long codepoint = get_next_utf8_codepoint(val, pos, error);
        if (error) {
            any_errors = true;
        }
--- a/libtests/qutil.cc
+++ b/libtests/qutil.cc
@ -240,6 +240,33 @@ print_utf8(unsigned long val)
        }
    }
    std::cout << std::endl;
+
+    // Boundary conditions for QUtil::get_next_utf8_codepoint, which is
+    // also tested indirectly through test_pdf_unicode.cc.
+    std::string utf8 = "\xcf\x80\xcf\x30\xEF\xBF\x30\x31\xcf";
+    size_t pos = 0;
+    bool error = false;
+    assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x3c0);
+    assert(pos == 2);
+    assert(!error);
+    assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0xfffd);
+    assert(pos == 3);
+    assert(error);
+    assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x30);
+    assert(pos == 4);
+    assert(!error);
+    assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0xfffd);
+    assert(pos == 6);
+    assert(error);
+    assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x30);
+    assert(pos == 7);
+    assert(!error);
+    assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x31);
+    assert(pos == 8);
+    assert(!error);
+    assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0xfffd);
+    assert(pos == 9);
+    assert(error);
 }

 void
--- a/qpdf/qtest/qpdf/unicode-errors.out
+++ b/qpdf/qtest/qpdf/unicode-errors.out
@ -3,5 +3,5 @@ This file has utf-8 encoding errors and should be edited as a binary file. // <5
 0: too many bytes: <20>after // <feff0030003a00200074006f006f0020006d0061006e0079002000620079007400650073003a0020fffd00610066007400650072>
 1: too few bytes: <20>after // <feff0031003a00200074006f006f0020006600650077002000620079007400650073003a0020fffd00610066007400650072>
 2: invalid codepoint (U+DEAD): <20>after // <feff0032003a00200069006e00760061006c0069006400200063006f006400650070006f0069006e0074002000280055002b00440045004100440029003a0020fffd00610066007400650072>
-3: not enough bytes for character: <20>!after (! included) // <333a206e6f7420656e6f75676820627974657320666f72206368617261637465723a209f21616674657220282120696e636c7564656429>
+3: not enough bytes for character: <20>!after (! included) // <feff0033003a0020006e006f007400200065006e006f00750067006800200062007900740065007300200066006f00720020006300680061007200610063007400650072003a0020fffd00210061006600740065007200200028002100200069006e0063006c00750064006500640029>
 4: not enough bytes left in file <20> // <feff0034003a0020006e006f007400200065006e006f0075006700680020006200790074006500730020006c00650066007400200069006e002000660069006c00650020fffd>