From 22b35c49289157204b35a851f3cb9cade9e98559 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Sat, 23 Apr 2022 16:39:27 -0400 Subject: [PATCH] Expose QUtil::get_next_utf8_codepoint --- .dir-locals.el | 2 +- ChangeLog | 11 ++++++++++ TODO | 3 --- include/qpdf/QUtil.hh | 21 ++++++++++++++++++- libqpdf/QPDF_String.cc | 6 ++---- libqpdf/QUtil.cc | 33 +++++++++++++++++++++--------- libtests/qutil.cc | 27 ++++++++++++++++++++++++ qpdf/qtest/qpdf/unicode-errors.out | 2 +- 8 files changed, 85 insertions(+), 20 deletions(-) diff --git a/.dir-locals.el b/.dir-locals.el index 18e38e8d..052a2d96 100644 --- a/.dir-locals.el +++ b/.dir-locals.el @@ -1,4 +1,4 @@ -((nil . ((indent-tabs-mode . t) +((nil . ((indent-tabs-mode . nil) (qpdf-cc-style . ("qpdf" diff --git a/ChangeLog b/ChangeLog index a2e19b9d..aa8842ce 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,14 @@ +2022-04-23 Jay Berkenbilt + + * Add new method QUtil::is_explicit_utf8 that tests whether a + string is explicitly marked as being UTF-8 encoded, as allowed by + the PDF 2.0 spec. Such a string starts with the bytes 0xEF 0xBB + 0xBF, which is the UTF-8 encoding of U+FEFF. + + * Add new method QUtil::get_next_utf8_codepoint as a low-level + helper for iterating through the UTF-8 characters in a byte + string. + 2022-04-16 Jay Berkenbilt * Breaking CLI change: the default value for --json is now diff --git a/TODO b/TODO index 1cbf977f..00b2e3c7 100644 --- a/TODO +++ b/TODO @@ -11,9 +11,6 @@ In order: Other (do in any order): Misc -* Consider exposing get_next_utf8_codepoint in QUtil -* Add QUtil::is_explicit_utf8 that does what QPDF_String::getUTF8Val - does to detect UTF-8 encoded strings per PDF 2.0 spec. * Add an option --ignore-encryption to ignore encryption information and treat encrypted files as if they weren't encrypted. 
This should make it possible to solve #598 (--show-encryption without a diff --git a/include/qpdf/QUtil.hh b/include/qpdf/QUtil.hh index 2d0b7b56..8b2b5ff8 100644 --- a/include/qpdf/QUtil.hh +++ b/include/qpdf/QUtil.hh @@ -268,14 +268,33 @@ namespace QUtil QPDF_DLL std::string toUTF16(unsigned long uval); + // If utf8_val.at(pos) points to the beginning of a valid + // UTF-8-encoded character, return the codepoint of the character + // and set error to false. Otherwise, return 0xfffd and set error + // to true. In all cases, pos is advanced to the next position + // that may begin a valid character. When the string has been + // consumed, pos will be set to the string length. It is an error + // to pass a value of pos that is greater than or equal to the + // length of the string. + QPDF_DLL + unsigned long get_next_utf8_codepoint( + std::string const& utf8_val, size_t& pos, bool& error); + // Test whether this is a UTF-16 string. This is indicated by // first two bytes being 0xFE 0xFF (big-endian) or 0xFF 0xFE - // (little-endian). Starting in qpdf 10.6.2, this detects + // (little-endian), each of which is the encoding of U+FEFF, the + // Unicode marker. Starting in qpdf 10.6.2, this detects // little-endian as well as big-endian. Even though the PDF spec // doesn't allow little-endian, most readers seem to accept it. QPDF_DLL bool is_utf16(std::string const&); + // Test whether this is an explicit UTF-8 string as allowed by the + // PDF 2.0 spec. This is indicated by first three bytes being 0xEF + // 0xBB 0xBF, which is the UTF-8 encoding of U+FEFF. + QPDF_DLL + bool is_explicit_utf8(std::string const&); + // Convert a UTF-8 encoded string to UTF-16 big-endian. // Unrepresentable code points are converted to U+FFFD. 
QPDF_DLL diff --git a/libqpdf/QPDF_String.cc b/libqpdf/QPDF_String.cc index 89ddc498..30d6708b 100644 --- a/libqpdf/QPDF_String.cc +++ b/libqpdf/QPDF_String.cc @@ -166,11 +166,9 @@ QPDF_String::getUTF8Val() const { if (QUtil::is_utf16(this->val)) { return QUtil::utf16_to_utf8(this->val); - } else if ( - (val.length() >= 3) && (val.at(0) == '\xEF') && (val.at(1) == '\xBB') && - (val.at(2) == '\xBF')) { + } else if (QUtil::is_explicit_utf8(this->val)) { // PDF 2.0 allows UTF-8 strings when explicitly prefixed with - // the above bytes, which is just UTF-8 encoding of U+FEFF. + // the three-byte representation of U+FEFF. return this->val.substr(3); } else { return QUtil::pdf_doc_to_utf8(this->val); diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc index a9e77777..5fa6c4b9 100644 --- a/libqpdf/QUtil.cc +++ b/libqpdf/QUtil.cc @@ -1529,10 +1529,11 @@ encode_pdfdoc(unsigned long codepoint) } unsigned long -get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error) +QUtil::get_next_utf8_codepoint( + std::string const& utf8_val, size_t& pos, bool& error) { size_t len = utf8_val.length(); - unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos)); + unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos++)); error = false; if (ch < 128) { return static_cast<unsigned long>(ch); @@ -1547,7 +1548,7 @@ get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error) bit_check >>= 1; } if (((bytes_needed > 5) || (bytes_needed < 1)) || - ((pos + bytes_needed) >= len)) { + ((pos + bytes_needed) > len)) { error = true; return 0xfffd; } @@ -1555,11 +1556,11 @@ get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error) unsigned long codepoint = static_cast<unsigned long>(ch & ~to_clear); while (bytes_needed > 0) { --bytes_needed; - ch = static_cast<unsigned char>(utf8_val.at(++pos)); + ch = static_cast<unsigned char>(utf8_val.at(pos++)); if ((ch & 0xc0) != 0x80) { --pos; - codepoint = 0xfffd; - break; + error = true; + return 0xfffd; } codepoint <<= 6; codepoint += (ch & 0x3f); @@ -1580,9 +1581,11 @@
transcode_utf8( result += "\xfe\xff"; } size_t len = utf8_val.length(); - for (size_t i = 0; i < len; ++i) { + size_t pos = 0; + while (pos < len) { bool error = false; - unsigned long codepoint = get_next_utf8_codepoint(utf8_val, i, error); + unsigned long codepoint = + QUtil::get_next_utf8_codepoint(utf8_val, pos, error); if (error) { okay = false; if (encoding == e_utf16) { @@ -1710,6 +1713,15 @@ QUtil::is_utf16(std::string const& val) ((val.at(0) == '\xff') && (val.at(1) == '\xfe')))); } +bool +QUtil::is_explicit_utf8(std::string const& val) +{ + // QPDF_String.cc knows that this is a 3-byte sequence. + return ( + (val.length() >= 3) && (val.at(0) == '\xef') && (val.at(1) == '\xbb') && + (val.at(2) == '\xbf')); +} + std::string QUtil::utf16_to_utf8(std::string const& val) { @@ -1826,10 +1838,11 @@ QUtil::analyze_encoding( return; } size_t len = val.length(); + size_t pos = 0; bool any_errors = false; - for (size_t i = 0; i < len; ++i) { + while (pos < len) { bool error = false; - unsigned long codepoint = get_next_utf8_codepoint(val, i, error); + unsigned long codepoint = get_next_utf8_codepoint(val, pos, error); if (error) { any_errors = true; } diff --git a/libtests/qutil.cc b/libtests/qutil.cc index 324dd84e..eb16bf0b 100644 --- a/libtests/qutil.cc +++ b/libtests/qutil.cc @@ -240,6 +240,33 @@ print_utf8(unsigned long val) } } std::cout << std::endl; + + // Boundary conditions for QUtil::get_next_utf8_codepoint, which is + // also tested indirectly through test_pdf_unicode.cc. 
+ std::string utf8 = "\xcf\x80\xcf\x30\xEF\xBF\x30\x31\xcf"; + size_t pos = 0; + bool error = false; + assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x3c0); + assert(pos == 2); + assert(!error); + assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0xfffd); + assert(pos == 3); + assert(error); + assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x30); + assert(pos == 4); + assert(!error); + assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0xfffd); + assert(pos == 6); + assert(error); + assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x30); + assert(pos == 7); + assert(!error); + assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x31); + assert(pos == 8); + assert(!error); + assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0xfffd); + assert(pos == 9); + assert(error); } void diff --git a/qpdf/qtest/qpdf/unicode-errors.out b/qpdf/qtest/qpdf/unicode-errors.out index 403bb503..4fd7c276 100644 --- a/qpdf/qtest/qpdf/unicode-errors.out +++ b/qpdf/qtest/qpdf/unicode-errors.out @@ -3,5 +3,5 @@ This file has utf-8 encoding errors and should be edited as a binary file. // <5 0: too many bytes: �after // 1: too few bytes: �after // 2: invalid codepoint (U+DEAD): �after // -3: not enough bytes for character: �!after (! included) // <333a206e6f7420656e6f75676820627974657320666f72206368617261637465723a209f21616674657220282120696e636c7564656429> +3: not enough bytes for character: �!after (! included) // 4: not enough bytes left in file � //