Expose QUtil::get_next_utf8_codepoint

This commit is contained in:
Jay Berkenbilt 2022-04-23 16:39:27 -04:00
parent 5bbb0d4c30
commit 22b35c4928
8 changed files with 85 additions and 20 deletions

View File

@ -1,4 +1,4 @@
((nil . ((indent-tabs-mode . t)
((nil . ((indent-tabs-mode . nil)
(qpdf-cc-style
.
("qpdf"

View File

@ -1,3 +1,14 @@
2022-04-23 Jay Berkenbilt <ejb@ql.org>
* Add new method QUtil::is_explicit_utf8 that tests whether a
string is explicitly marked as being UTF-8 encoded, as allowed by
the PDF 2.0 spec. Such a string starts with the bytes 0xEF 0xBB
0xBF, which is the UTF-8 encoding of U+FEFF.
* Add new method QUtil::get_next_utf8_codepoint as a low-level
helper for iterating through the UTF-8 characters in a byte
string.
2022-04-16 Jay Berkenbilt <ejb@ql.org>
* Breaking CLI change: the default value for --json is now

3
TODO
View File

@ -11,9 +11,6 @@ In order:
Other (do in any order):
Misc
* Consider exposing get_next_utf8_codepoint in QUtil
* Add QUtil::is_explicit_utf8 that does what QPDF_String::getUTF8Val
does to detect UTF-8 encoded strings per PDF 2.0 spec.
* Add an option --ignore-encryption to ignore encryption information
and treat encrypted files as if they weren't encrypted. This should
make it possible to solve #598 (--show-encryption without a

View File

@ -268,14 +268,33 @@ namespace QUtil
QPDF_DLL
std::string toUTF16(unsigned long uval);
// If utf8_val.at(pos) points to the beginning of a valid
// UTF-8-encoded character, return the codepoint of the character
// and set error to false. Otherwise, return 0xfffd and set error
// to true. In all cases, pos is advanced to the next position
// that may begin a valid character. When the string has been
// consumed, pos will be set to the string length. It is an error
// to pass a value of pos that is greater than or equal to the
// length of the string.
QPDF_DLL
unsigned long get_next_utf8_codepoint(
std::string const& utf8_val, size_t& pos, bool& error);
// Test whether this is a UTF-16 string. This is indicated by
// first two bytes being 0xFE 0xFF (big-endian) or 0xFF 0xFE
// (little-endian). Starting in qpdf 10.6.2, this detects
// (little-endian), each of which is the encoding of U+FEFF, the
// Unicode marker. Starting in qpdf 10.6.2, this detects
// little-endian as well as big-endian. Even though the PDF spec
// doesn't allow little-endian, most readers seem to accept it.
QPDF_DLL
bool is_utf16(std::string const&);
// Test whether this is an explicit UTF-8 string as allowed by the
// PDF 2.0 spec. This is indicated by first three bytes being 0xEF
// 0xBB 0xBF, which is the UTF-8 encoding of U+FEFF.
QPDF_DLL
bool is_explicit_utf8(std::string const&);
// Convert a UTF-8 encoded string to UTF-16 big-endian.
// Unrepresentable code points are converted to U+FFFD.
QPDF_DLL

View File

@ -166,11 +166,9 @@ QPDF_String::getUTF8Val() const
{
if (QUtil::is_utf16(this->val)) {
return QUtil::utf16_to_utf8(this->val);
} else if (
(val.length() >= 3) && (val.at(0) == '\xEF') && (val.at(1) == '\xBB') &&
(val.at(2) == '\xBF')) {
} else if (QUtil::is_explicit_utf8(this->val)) {
// PDF 2.0 allows UTF-8 strings when explicitly prefixed with
// the above bytes, which is just UTF-8 encoding of U+FEFF.
// the three-byte representation of U+FEFF.
return this->val.substr(3);
} else {
return QUtil::pdf_doc_to_utf8(this->val);

View File

@ -1529,10 +1529,11 @@ encode_pdfdoc(unsigned long codepoint)
}
unsigned long
get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error)
QUtil::get_next_utf8_codepoint(
std::string const& utf8_val, size_t& pos, bool& error)
{
size_t len = utf8_val.length();
unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos));
unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos++));
error = false;
if (ch < 128) {
return static_cast<unsigned long>(ch);
@ -1547,7 +1548,7 @@ get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error)
bit_check >>= 1;
}
if (((bytes_needed > 5) || (bytes_needed < 1)) ||
((pos + bytes_needed) >= len)) {
((pos + bytes_needed) > len)) {
error = true;
return 0xfffd;
}
@ -1555,11 +1556,11 @@ get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error)
unsigned long codepoint = static_cast<unsigned long>(ch & ~to_clear);
while (bytes_needed > 0) {
--bytes_needed;
ch = static_cast<unsigned char>(utf8_val.at(++pos));
ch = static_cast<unsigned char>(utf8_val.at(pos++));
if ((ch & 0xc0) != 0x80) {
--pos;
codepoint = 0xfffd;
break;
error = true;
return 0xfffd;
}
codepoint <<= 6;
codepoint += (ch & 0x3f);
@ -1580,9 +1581,11 @@ transcode_utf8(
result += "\xfe\xff";
}
size_t len = utf8_val.length();
for (size_t i = 0; i < len; ++i) {
size_t pos = 0;
while (pos < len) {
bool error = false;
unsigned long codepoint = get_next_utf8_codepoint(utf8_val, i, error);
unsigned long codepoint =
QUtil::get_next_utf8_codepoint(utf8_val, pos, error);
if (error) {
okay = false;
if (encoding == e_utf16) {
@ -1710,6 +1713,15 @@ QUtil::is_utf16(std::string const& val)
((val.at(0) == '\xff') && (val.at(1) == '\xfe'))));
}
bool
QUtil::is_explicit_utf8(std::string const& val)
{
    // An explicitly marked UTF-8 string begins with the bytes
    // 0xEF 0xBB 0xBF, i.e. U+FEFF encoded as UTF-8. The marker is
    // exactly three bytes long; QPDF_String.cc depends on that
    // length when it strips the marker. A string shorter than the
    // marker can never compare equal to it, so no separate length
    // test is required.
    static char const utf8_marker[] = "\xef\xbb\xbf";
    return val.compare(0, 3, utf8_marker) == 0;
}
std::string
QUtil::utf16_to_utf8(std::string const& val)
{
@ -1826,10 +1838,11 @@ QUtil::analyze_encoding(
return;
}
size_t len = val.length();
size_t pos = 0;
bool any_errors = false;
for (size_t i = 0; i < len; ++i) {
while (pos < len) {
bool error = false;
unsigned long codepoint = get_next_utf8_codepoint(val, i, error);
unsigned long codepoint = get_next_utf8_codepoint(val, pos, error);
if (error) {
any_errors = true;
}

View File

@ -240,6 +240,33 @@ print_utf8(unsigned long val)
}
}
std::cout << std::endl;
// Boundary conditions for QUtil::get_next_utf8_codepoint, which is
// also tested indirectly through test_pdf_unicode.cc.
std::string utf8 = "\xcf\x80\xcf\x30\xEF\xBF\x30\x31\xcf";
size_t pos = 0;
bool error = false;
assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x3c0);
assert(pos == 2);
assert(!error);
assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0xfffd);
assert(pos == 3);
assert(error);
assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x30);
assert(pos == 4);
assert(!error);
assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0xfffd);
assert(pos == 6);
assert(error);
assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x30);
assert(pos == 7);
assert(!error);
assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x31);
assert(pos == 8);
assert(!error);
assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0xfffd);
assert(pos == 9);
assert(error);
}
void

View File

@ -3,5 +3,5 @@ This file has utf-8 encoding errors and should be edited as a binary file. // <5
0: too many bytes: <20>after // <feff0030003a00200074006f006f0020006d0061006e0079002000620079007400650073003a0020fffd00610066007400650072>
1: too few bytes: <20>after // <feff0031003a00200074006f006f0020006600650077002000620079007400650073003a0020fffd00610066007400650072>
2: invalid codepoint (U+DEAD): <20>after // <feff0032003a00200069006e00760061006c0069006400200063006f006400650070006f0069006e0074002000280055002b00440045004100440029003a0020fffd00610066007400650072>
3: not enough bytes for character: <20>!after (! included) // <333a206e6f7420656e6f75676820627974657320666f72206368617261637465723a209f21616674657220282120696e636c7564656429>
3: not enough bytes for character: <20>!after (! included) // <feff0033003a0020006e006f007400200065006e006f00750067006800200062007900740065007300200066006f00720020006300680061007200610063007400650072003a0020fffd00210061006600740065007200200028002100200069006e0063006c00750064006500640029>
4: not enough bytes left in file <20> // <feff0034003a0020006e006f007400200065006e006f0075006700680020006200790074006500730020006c00650066007400200069006e002000660069006c00650020fffd>