mirror of
https://github.com/qpdf/qpdf.git
synced 2024-12-22 10:58:58 +00:00
Expose QUtil::get_next_utf8_codepoint
This commit is contained in:
parent
5bbb0d4c30
commit
22b35c4928
@ -1,4 +1,4 @@
|
||||
((nil . ((indent-tabs-mode . t)
|
||||
((nil . ((indent-tabs-mode . nil)
|
||||
(qpdf-cc-style
|
||||
.
|
||||
("qpdf"
|
||||
|
11
ChangeLog
11
ChangeLog
@ -1,3 +1,14 @@
|
||||
2022-04-23 Jay Berkenbilt <ejb@ql.org>
|
||||
|
||||
* Add new method QUtil::is_explicit_utf8 that tests whether a
|
||||
string is explicitly marked as being UTF-8 encoded, as allowed by
|
||||
the PDF 2.0 spec. Such a string starts with the bytes 0xEF 0xBB
|
||||
0xBF, which is the UTF-8 encoding of U+FEFF.
|
||||
|
||||
* Add new method QUtil::get_next_utf8_codepoint as a low-level
|
||||
helper for iterating through the UTF-8 characters in a byte
|
||||
string.
|
||||
|
||||
2022-04-16 Jay Berkenbilt <ejb@ql.org>
|
||||
|
||||
* Breaking CLI change: the default value for --json is now
|
||||
|
3
TODO
3
TODO
@ -11,9 +11,6 @@ In order:
|
||||
Other (do in any order):
|
||||
|
||||
Misc
|
||||
* Consider exposing get_next_utf8_codepoint in QUtil
|
||||
* Add QUtil::is_explicit_utf8 that does what QPDF_String::getUTF8Val
|
||||
does to detect UTF-8 encoded strings per PDF 2.0 spec.
|
||||
* Add an option --ignore-encryption to ignore encryption information
|
||||
and treat encrypted files as if they weren't encrypted. This should
|
||||
make it possible to solve #598 (--show-encryption without a
|
||||
|
@ -268,14 +268,33 @@ namespace QUtil
|
||||
QPDF_DLL
|
||||
std::string toUTF16(unsigned long uval);
|
||||
|
||||
// If utf8_val.at(pos) points to the beginning of a valid
|
||||
// UTF-8-encoded character, return the codepoint of the character
|
||||
// and set error to false. Otherwise, return 0xfffd and set error
|
||||
// to true. In all cases, pos is advanced to the next position
|
||||
// that may begin a valid character. When the string has been
|
||||
// consumed, pos will be set to the string length. It is an error
|
||||
// to pass a value of pos that is greater than or equal to the
|
||||
// length of the string.
|
||||
QPDF_DLL
|
||||
unsigned long get_next_utf8_codepoint(
|
||||
std::string const& utf8_val, size_t& pos, bool& error);
|
||||
|
||||
// Test whether this is a UTF-16 string. This is indicated by
|
||||
// first two bytes being 0xFE 0xFF (big-endian) or 0xFF 0xFE
|
||||
// (little-endian). Starting in qpdf 10.6.2, this detects
|
||||
// (little-endian), each of which is the encoding of U+FEFF, the
|
||||
// Unicode marker. Starting in qpdf 10.6.2, this detects
|
||||
// little-endian as well as big-endian. Even though the PDF spec
|
||||
// doesn't allow little-endian, most readers seem to accept it.
|
||||
QPDF_DLL
|
||||
bool is_utf16(std::string const&);
|
||||
|
||||
// Test whether this is an explicit UTF-8 string as allowed by the
|
||||
// PDF 2.0 spec. This is indicated by first three bytes being 0xEF
|
||||
// 0xBB 0xBF, which is the UTF-8 encoding of U+FEFF.
|
||||
QPDF_DLL
|
||||
bool is_explicit_utf8(std::string const&);
|
||||
|
||||
// Convert a UTF-8 encoded string to UTF-16 big-endian.
|
||||
// Unrepresentable code points are converted to U+FFFD.
|
||||
QPDF_DLL
|
||||
|
@ -166,11 +166,9 @@ QPDF_String::getUTF8Val() const
|
||||
{
|
||||
if (QUtil::is_utf16(this->val)) {
|
||||
return QUtil::utf16_to_utf8(this->val);
|
||||
} else if (
|
||||
(val.length() >= 3) && (val.at(0) == '\xEF') && (val.at(1) == '\xBB') &&
|
||||
(val.at(2) == '\xBF')) {
|
||||
} else if (QUtil::is_explicit_utf8(this->val)) {
|
||||
// PDF 2.0 allows UTF-8 strings when explicitly prefixed with
|
||||
// the above bytes, which is just UTF-8 encoding of U+FEFF.
|
||||
// the three-byte representation of U+FEFF.
|
||||
return this->val.substr(3);
|
||||
} else {
|
||||
return QUtil::pdf_doc_to_utf8(this->val);
|
||||
|
@ -1529,10 +1529,11 @@ encode_pdfdoc(unsigned long codepoint)
|
||||
}
|
||||
|
||||
unsigned long
|
||||
get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error)
|
||||
QUtil::get_next_utf8_codepoint(
|
||||
std::string const& utf8_val, size_t& pos, bool& error)
|
||||
{
|
||||
size_t len = utf8_val.length();
|
||||
unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos));
|
||||
unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos++));
|
||||
error = false;
|
||||
if (ch < 128) {
|
||||
return static_cast<unsigned long>(ch);
|
||||
@ -1547,7 +1548,7 @@ get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error)
|
||||
bit_check >>= 1;
|
||||
}
|
||||
if (((bytes_needed > 5) || (bytes_needed < 1)) ||
|
||||
((pos + bytes_needed) >= len)) {
|
||||
((pos + bytes_needed) > len)) {
|
||||
error = true;
|
||||
return 0xfffd;
|
||||
}
|
||||
@ -1555,11 +1556,11 @@ get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error)
|
||||
unsigned long codepoint = static_cast<unsigned long>(ch & ~to_clear);
|
||||
while (bytes_needed > 0) {
|
||||
--bytes_needed;
|
||||
ch = static_cast<unsigned char>(utf8_val.at(++pos));
|
||||
ch = static_cast<unsigned char>(utf8_val.at(pos++));
|
||||
if ((ch & 0xc0) != 0x80) {
|
||||
--pos;
|
||||
codepoint = 0xfffd;
|
||||
break;
|
||||
error = true;
|
||||
return 0xfffd;
|
||||
}
|
||||
codepoint <<= 6;
|
||||
codepoint += (ch & 0x3f);
|
||||
@ -1580,9 +1581,11 @@ transcode_utf8(
|
||||
result += "\xfe\xff";
|
||||
}
|
||||
size_t len = utf8_val.length();
|
||||
for (size_t i = 0; i < len; ++i) {
|
||||
size_t pos = 0;
|
||||
while (pos < len) {
|
||||
bool error = false;
|
||||
unsigned long codepoint = get_next_utf8_codepoint(utf8_val, i, error);
|
||||
unsigned long codepoint =
|
||||
QUtil::get_next_utf8_codepoint(utf8_val, pos, error);
|
||||
if (error) {
|
||||
okay = false;
|
||||
if (encoding == e_utf16) {
|
||||
@ -1710,6 +1713,15 @@ QUtil::is_utf16(std::string const& val)
|
||||
((val.at(0) == '\xff') && (val.at(1) == '\xfe'))));
|
||||
}
|
||||
|
||||
// Report whether val starts with the three bytes 0xEF 0xBB 0xBF, the
// UTF-8 encoding of U+FEFF that marks an explicitly UTF-8 encoded
// string.
bool
|
||||
QUtil::is_explicit_utf8(std::string const& val)
|
||||
{
|
||||
// QPDF_String.cc knows that this is a 3-byte sequence: its
// getUTF8Val strips the marker with substr(3), so the marker length
// checked here must stay at exactly three bytes.
// The length test comes first so the at() calls below are only
// evaluated when all three indexes are in range.
|
||||
return (
|
||||
(val.length() >= 3) && (val.at(0) == '\xef') && (val.at(1) == '\xbb') &&
|
||||
(val.at(2) == '\xbf'));
|
||||
}
|
||||
|
||||
std::string
|
||||
QUtil::utf16_to_utf8(std::string const& val)
|
||||
{
|
||||
@ -1826,10 +1838,11 @@ QUtil::analyze_encoding(
|
||||
return;
|
||||
}
|
||||
size_t len = val.length();
|
||||
size_t pos = 0;
|
||||
bool any_errors = false;
|
||||
for (size_t i = 0; i < len; ++i) {
|
||||
while (pos < len) {
|
||||
bool error = false;
|
||||
unsigned long codepoint = get_next_utf8_codepoint(val, i, error);
|
||||
unsigned long codepoint = get_next_utf8_codepoint(val, pos, error);
|
||||
if (error) {
|
||||
any_errors = true;
|
||||
}
|
||||
|
@ -240,6 +240,33 @@ print_utf8(unsigned long val)
|
||||
}
|
||||
}
|
||||
std::cout << std::endl;
|
||||
|
||||
// Boundary conditions for QUtil::get_next_utf8_codepoint, which is
|
||||
// also tested indirectly through test_pdf_unicode.cc.
|
||||
std::string utf8 = "\xcf\x80\xcf\x30\xEF\xBF\x30\x31\xcf";
|
||||
size_t pos = 0;
|
||||
bool error = false;
|
||||
assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x3c0);
|
||||
assert(pos == 2);
|
||||
assert(!error);
|
||||
assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0xfffd);
|
||||
assert(pos == 3);
|
||||
assert(error);
|
||||
assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x30);
|
||||
assert(pos == 4);
|
||||
assert(!error);
|
||||
assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0xfffd);
|
||||
assert(pos == 6);
|
||||
assert(error);
|
||||
assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x30);
|
||||
assert(pos == 7);
|
||||
assert(!error);
|
||||
assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x31);
|
||||
assert(pos == 8);
|
||||
assert(!error);
|
||||
assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0xfffd);
|
||||
assert(pos == 9);
|
||||
assert(error);
|
||||
}
|
||||
|
||||
void
|
||||
|
@ -3,5 +3,5 @@ This file has utf-8 encoding errors and should be edited as a binary file. // <5
|
||||
0: too many bytes: <20>after // <feff0030003a00200074006f006f0020006d0061006e0079002000620079007400650073003a0020fffd00610066007400650072>
|
||||
1: too few bytes: <20>after // <feff0031003a00200074006f006f0020006600650077002000620079007400650073003a0020fffd00610066007400650072>
|
||||
2: invalid codepoint (U+DEAD): <20>after // <feff0032003a00200069006e00760061006c0069006400200063006f006400650070006f0069006e0074002000280055002b00440045004100440029003a0020fffd00610066007400650072>
|
||||
3: not enough bytes for character: <20>!after (! included) // <333a206e6f7420656e6f75676820627974657320666f72206368617261637465723a209f21616674657220282120696e636c7564656429>
|
||||
3: not enough bytes for character: <20>!after (! included) // <feff0033003a0020006e006f007400200065006e006f00750067006800200062007900740065007300200066006f00720020006300680061007200610063007400650072003a0020fffd00210061006600740065007200200028002100200069006e0063006c00750064006500640029>
|
||||
4: not enough bytes left in file <20> // <feff0034003a0020006e006f007400200065006e006f0075006700680020006200790074006500730020006c00650066007400200069006e002000660069006c00650020fffd>
|
||||
|
Loading…
Reference in New Issue
Block a user