Expose QUtil::get_next_utf8_codepoint

This commit is contained in:
Jay Berkenbilt 2022-04-23 16:39:27 -04:00
parent 5bbb0d4c30
commit 22b35c4928
8 changed files with 85 additions and 20 deletions

View File

@ -1,4 +1,4 @@
((nil . ((indent-tabs-mode . t) ((nil . ((indent-tabs-mode . nil)
(qpdf-cc-style (qpdf-cc-style
. .
("qpdf" ("qpdf"

View File

@ -1,3 +1,14 @@
2022-04-23 Jay Berkenbilt <ejb@ql.org>
* Add new method QUtil::is_explicit_utf8 that tests whether a
string is explicitly marked as being UTF-8 encoded, as allowed by
the PDF 2.0 spec. Such a string starts with the bytes 0xEF 0xBB
0xBF, which is the UTF-8 encoding of U+FEFF.
* Add new method QUtil::get_next_utf8_codepoint as a low-level
helper for iterating through the UTF-8 characters in a byte
string.
2022-04-16 Jay Berkenbilt <ejb@ql.org> 2022-04-16 Jay Berkenbilt <ejb@ql.org>
* Breaking CLI change: the default value for --json is now * Breaking CLI change: the default value for --json is now

3
TODO
View File

@ -11,9 +11,6 @@ In order:
Other (do in any order): Other (do in any order):
Misc Misc
* Consider exposing get_next_utf8_codepoint in QUtil
* Add QUtil::is_explicit_utf8 that does what QPDF_String::getUTF8Val
does to detect UTF-8 encoded strings per PDF 2.0 spec.
* Add an option --ignore-encryption to ignore encryption information * Add an option --ignore-encryption to ignore encryption information
and treat encrypted files as if they weren't encrypted. This should and treat encrypted files as if they weren't encrypted. This should
make it possible to solve #598 (--show-encryption without a make it possible to solve #598 (--show-encryption without a

View File

@ -268,14 +268,33 @@ namespace QUtil
QPDF_DLL QPDF_DLL
std::string toUTF16(unsigned long uval); std::string toUTF16(unsigned long uval);
// If utf8_val.at(pos) points to the beginning of a valid
// UTF-8-encoded character, return the codepoint of the character
// and set error to false. Otherwise, return 0xfffd and set error
// to true. In all cases, pos is advanced to the next position
// that may begin a valid character. When the string has been
// consumed, pos will be set to the string length. It is an error
// to pass a value of pos that is greater than or equal to the
// length of the string.
QPDF_DLL
unsigned long get_next_utf8_codepoint(
std::string const& utf8_val, size_t& pos, bool& error);
// Test whether this is a UTF-16 string. This is indicated by // Test whether this is a UTF-16 string. This is indicated by
// first two bytes being 0xFE 0xFF (big-endian) or 0xFF 0xFE // first two bytes being 0xFE 0xFF (big-endian) or 0xFF 0xFE
// (little-endian). Starting in qpdf 10.6.2, this detects // (little-endian), each of which is the encoding of U+FEFF, the
// Unicode marker. Starting in qpdf 10.6.2, this detects
// little-endian as well as big-endian. Even though the PDF spec // little-endian as well as big-endian. Even though the PDF spec
// doesn't allow little-endian, most readers seem to accept it. // doesn't allow little-endian, most readers seem to accept it.
QPDF_DLL QPDF_DLL
bool is_utf16(std::string const&); bool is_utf16(std::string const&);
// Test whether this is an explicit UTF-8 string as allowed by the
// PDF 2.0 spec. This is indicated by the first three bytes being 0xEF
// 0xBB 0xBF, which is the UTF-8 encoding of U+FEFF.
QPDF_DLL
bool is_explicit_utf8(std::string const&);
// Convert a UTF-8 encoded string to UTF-16 big-endian. // Convert a UTF-8 encoded string to UTF-16 big-endian.
// Unrepresentable code points are converted to U+FFFD. // Unrepresentable code points are converted to U+FFFD.
QPDF_DLL QPDF_DLL

View File

@ -166,11 +166,9 @@ QPDF_String::getUTF8Val() const
{ {
if (QUtil::is_utf16(this->val)) { if (QUtil::is_utf16(this->val)) {
return QUtil::utf16_to_utf8(this->val); return QUtil::utf16_to_utf8(this->val);
} else if ( } else if (QUtil::is_explicit_utf8(this->val)) {
(val.length() >= 3) && (val.at(0) == '\xEF') && (val.at(1) == '\xBB') &&
(val.at(2) == '\xBF')) {
// PDF 2.0 allows UTF-8 strings when explicitly prefixed with // PDF 2.0 allows UTF-8 strings when explicitly prefixed with
// the above bytes, which is just UTF-8 encoding of U+FEFF. // the three-byte representation of U+FEFF.
return this->val.substr(3); return this->val.substr(3);
} else { } else {
return QUtil::pdf_doc_to_utf8(this->val); return QUtil::pdf_doc_to_utf8(this->val);

View File

@ -1529,10 +1529,11 @@ encode_pdfdoc(unsigned long codepoint)
} }
unsigned long unsigned long
get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error) QUtil::get_next_utf8_codepoint(
std::string const& utf8_val, size_t& pos, bool& error)
{ {
size_t len = utf8_val.length(); size_t len = utf8_val.length();
unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos)); unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos++));
error = false; error = false;
if (ch < 128) { if (ch < 128) {
return static_cast<unsigned long>(ch); return static_cast<unsigned long>(ch);
@ -1547,7 +1548,7 @@ get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error)
bit_check >>= 1; bit_check >>= 1;
} }
if (((bytes_needed > 5) || (bytes_needed < 1)) || if (((bytes_needed > 5) || (bytes_needed < 1)) ||
((pos + bytes_needed) >= len)) { ((pos + bytes_needed) > len)) {
error = true; error = true;
return 0xfffd; return 0xfffd;
} }
@ -1555,11 +1556,11 @@ get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error)
unsigned long codepoint = static_cast<unsigned long>(ch & ~to_clear); unsigned long codepoint = static_cast<unsigned long>(ch & ~to_clear);
while (bytes_needed > 0) { while (bytes_needed > 0) {
--bytes_needed; --bytes_needed;
ch = static_cast<unsigned char>(utf8_val.at(++pos)); ch = static_cast<unsigned char>(utf8_val.at(pos++));
if ((ch & 0xc0) != 0x80) { if ((ch & 0xc0) != 0x80) {
--pos; --pos;
codepoint = 0xfffd; error = true;
break; return 0xfffd;
} }
codepoint <<= 6; codepoint <<= 6;
codepoint += (ch & 0x3f); codepoint += (ch & 0x3f);
@ -1580,9 +1581,11 @@ transcode_utf8(
result += "\xfe\xff"; result += "\xfe\xff";
} }
size_t len = utf8_val.length(); size_t len = utf8_val.length();
for (size_t i = 0; i < len; ++i) { size_t pos = 0;
while (pos < len) {
bool error = false; bool error = false;
unsigned long codepoint = get_next_utf8_codepoint(utf8_val, i, error); unsigned long codepoint =
QUtil::get_next_utf8_codepoint(utf8_val, pos, error);
if (error) { if (error) {
okay = false; okay = false;
if (encoding == e_utf16) { if (encoding == e_utf16) {
@ -1710,6 +1713,15 @@ QUtil::is_utf16(std::string const& val)
((val.at(0) == '\xff') && (val.at(1) == '\xfe')))); ((val.at(0) == '\xff') && (val.at(1) == '\xfe'))));
} }
bool
QUtil::is_explicit_utf8(std::string const& val)
{
    // A string is explicitly marked as UTF-8 when it begins with the
    // bytes 0xEF 0xBB 0xBF. QPDF_String.cc relies on this marker
    // being exactly three bytes long.
    static char const marker[] = "\xef\xbb\xbf";
    // compare() clamps the examined range to the string's length, so
    // a string shorter than the marker simply compares unequal.
    return val.compare(0, 3, marker) == 0;
}
std::string std::string
QUtil::utf16_to_utf8(std::string const& val) QUtil::utf16_to_utf8(std::string const& val)
{ {
@ -1826,10 +1838,11 @@ QUtil::analyze_encoding(
return; return;
} }
size_t len = val.length(); size_t len = val.length();
size_t pos = 0;
bool any_errors = false; bool any_errors = false;
for (size_t i = 0; i < len; ++i) { while (pos < len) {
bool error = false; bool error = false;
unsigned long codepoint = get_next_utf8_codepoint(val, i, error); unsigned long codepoint = get_next_utf8_codepoint(val, pos, error);
if (error) { if (error) {
any_errors = true; any_errors = true;
} }

View File

@ -240,6 +240,33 @@ print_utf8(unsigned long val)
} }
} }
std::cout << std::endl; std::cout << std::endl;
// Boundary conditions for QUtil::get_next_utf8_codepoint, which is
// also tested indirectly through test_pdf_unicode.cc.
std::string utf8 = "\xcf\x80\xcf\x30\xEF\xBF\x30\x31\xcf";
size_t pos = 0;
bool error = false;
assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x3c0);
assert(pos == 2);
assert(!error);
assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0xfffd);
assert(pos == 3);
assert(error);
assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x30);
assert(pos == 4);
assert(!error);
assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0xfffd);
assert(pos == 6);
assert(error);
assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x30);
assert(pos == 7);
assert(!error);
assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x31);
assert(pos == 8);
assert(!error);
assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0xfffd);
assert(pos == 9);
assert(error);
} }
void void

View File

@ -3,5 +3,5 @@ This file has utf-8 encoding errors and should be edited as a binary file. // <5
0: too many bytes: <20>after // <feff0030003a00200074006f006f0020006d0061006e0079002000620079007400650073003a0020fffd00610066007400650072> 0: too many bytes: <20>after // <feff0030003a00200074006f006f0020006d0061006e0079002000620079007400650073003a0020fffd00610066007400650072>
1: too few bytes: <20>after // <feff0031003a00200074006f006f0020006600650077002000620079007400650073003a0020fffd00610066007400650072> 1: too few bytes: <20>after // <feff0031003a00200074006f006f0020006600650077002000620079007400650073003a0020fffd00610066007400650072>
2: invalid codepoint (U+DEAD): <20>after // <feff0032003a00200069006e00760061006c0069006400200063006f006400650070006f0069006e0074002000280055002b00440045004100440029003a0020fffd00610066007400650072> 2: invalid codepoint (U+DEAD): <20>after // <feff0032003a00200069006e00760061006c0069006400200063006f006400650070006f0069006e0074002000280055002b00440045004100440029003a0020fffd00610066007400650072>
3: not enough bytes for character: <20>!after (! included) // <333a206e6f7420656e6f75676820627974657320666f72206368617261637465723a209f21616674657220282120696e636c7564656429> 3: not enough bytes for character: <20>!after (! included) // <feff0033003a0020006e006f007400200065006e006f00750067006800200062007900740065007300200066006f00720020006300680061007200610063007400650072003a0020fffd00210061006600740065007200200028002100200069006e0063006c00750064006500640029>
4: not enough bytes left in file <20> // <feff0034003a0020006e006f007400200065006e006f0075006700680020006200790074006500730020006c00650066007400200069006e002000660069006c00650020fffd> 4: not enough bytes left in file <20> // <feff0034003a0020006e006f007400200065006e006f0075006700680020006200790074006500730020006c00650066007400200069006e002000660069006c00650020fffd>