From 952a665a4ed51400b5925e7cd69f08f0aeb374fe Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Thu, 21 Jun 2018 14:03:45 -0400 Subject: [PATCH] Better support for creating Unicode strings --- ChangeLog | 8 +++++ include/qpdf/QPDFObjectHandle.hh | 10 ++++++ libqpdf/QPDFObjectHandle.cc | 20 ++++++++++++ libqpdf/QPDF_String.cc | 52 ++++++++++++++++++++++++++++++ libqpdf/qpdf/QPDF_String.hh | 1 + qpdf/build.mk | 1 + qpdf/qtest/qpdf.test | 12 +++++-- qpdf/qtest/qpdf/unicode-errors.in | 7 ++++ qpdf/qtest/qpdf/unicode-errors.out | 7 ++++ qpdf/qtest/qpdf/unicode.in | 5 +++ qpdf/qtest/qpdf/unicode.out | 5 +++ qpdf/test_pdf_unicode.cc | 46 ++++++++++++++++++++++++++ 12 files changed, 172 insertions(+), 2 deletions(-) create mode 100644 qpdf/qtest/qpdf/unicode-errors.in create mode 100644 qpdf/qtest/qpdf/unicode-errors.out create mode 100644 qpdf/qtest/qpdf/unicode.in create mode 100644 qpdf/qtest/qpdf/unicode.out create mode 100644 qpdf/test_pdf_unicode.cc diff --git a/ChangeLog b/ChangeLog index cabf7efe..e27b680d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +2018-06-21 Jay Berkenbilt + + * Added QPDFObjectHandle::newUnicodeString and QPDFObjectHandle::unparseBinary + to allow for more convenient creation of strings that are + explicitly encoded in UTF-16 BE. This is useful for creating + Unicode strings that appear outside of content streams, such as in + page labels, outlines, form field values, etc. + 2018-06-20 Jay Berkenbilt * Added new classes QPDFAcroFormDocumentHelper, diff --git a/include/qpdf/QPDFObjectHandle.hh b/include/qpdf/QPDFObjectHandle.hh index 967e786c..868b5c07 100644 --- a/include/qpdf/QPDFObjectHandle.hh +++ b/include/qpdf/QPDFObjectHandle.hh @@ -344,6 +344,12 @@ class QPDFObjectHandle static QPDFObjectHandle newName(std::string const& name); QPDF_DLL static QPDFObjectHandle newString(std::string const& str); + // Create a string encoded in UTF-16 from the given utf8-encoded + // string. 
Such strings are appropriately encoded to appear in PDF + // files outside of content streams, such as in document metadata, + // form field values, page labels, outlines, and similar locations. + QPDF_DLL + static QPDFObjectHandle newUnicodeString(std::string const& utf8_str); QPDF_DLL static QPDFObjectHandle newOperator(std::string const&); QPDF_DLL @@ -715,6 +721,10 @@ class QPDFObjectHandle std::string unparse(); QPDF_DLL std::string unparseResolved(); + // For strings only, force binary representation. Otherwise, same + // as unparse. + QPDF_DLL + std::string unparseBinary(); // Legacy helper methods for commonly performed operations on // pages. Newer code should use QPDFPageObjectHelper instead. The diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc index 5c111cc8..da609cc2 100644 --- a/libqpdf/QPDFObjectHandle.cc +++ b/libqpdf/QPDFObjectHandle.cc @@ -1221,6 +1221,20 @@ QPDFObjectHandle::unparseResolved() return this->m->obj->unparse(); } +std::string +QPDFObjectHandle::unparseBinary() +{ + if (this->isString()) + { + return dynamic_cast<QPDF_String*>( + this->m->obj.getPointer())->unparse(true); + } + else + { + return unparse(); + } +} + QPDFObjectHandle QPDFObjectHandle::parse(std::string const& object_str, std::string const& object_description) @@ -1845,6 +1859,12 @@ QPDFObjectHandle::newString(std::string const& str) return QPDFObjectHandle(new QPDF_String(str)); } +QPDFObjectHandle +QPDFObjectHandle::newUnicodeString(std::string const& utf8_str) +{ + return QPDFObjectHandle(QPDF_String::new_utf16(utf8_str)); +} + QPDFObjectHandle QPDFObjectHandle::newOperator(std::string const& value) { diff --git a/libqpdf/QPDF_String.cc b/libqpdf/QPDF_String.cc index 60a3e0df..eb31a808 100644 --- a/libqpdf/QPDF_String.cc +++ b/libqpdf/QPDF_String.cc @@ -64,6 +64,58 @@ QPDF_String::~QPDF_String() { } +QPDF_String* +QPDF_String::new_utf16(std::string const& utf8_val) +{ + std::string result = "\xfe\xff"; + size_t len = utf8_val.length(); + for (size_t i = 0; i < 
len; ++i) + { + unsigned char ch = static_cast<unsigned char>(utf8_val.at(i)); + if (ch < 128) + { + result += QUtil::toUTF16(ch); + } + else + { + size_t bytes_needed = 0; + unsigned bit_check = 0x40; + unsigned char to_clear = 0x80; + while (ch & bit_check) + { + ++bytes_needed; + to_clear |= bit_check; + bit_check >>= 1; + } + + if (((bytes_needed > 5) || (bytes_needed < 1)) || + ((i + bytes_needed) >= len)) + { + result += "\xff\xfd"; + } + else + { + unsigned long codepoint = (ch & ~to_clear); + while (bytes_needed > 0) + { + --bytes_needed; + ch = utf8_val.at(++i); + if ((ch & 0xc0) != 0x80) + { + --i; + codepoint = 0xfffd; + break; + } + codepoint <<= 6; + codepoint += (ch & 0x3f); + } + result += QUtil::toUTF16(codepoint); + } + } + } + return new QPDF_String(result); +} + std::string QPDF_String::unparse() { diff --git a/libqpdf/qpdf/QPDF_String.hh b/libqpdf/qpdf/QPDF_String.hh index abf8291a..b4858c49 100644 --- a/libqpdf/qpdf/QPDF_String.hh +++ b/libqpdf/qpdf/QPDF_String.hh @@ -9,6 +9,7 @@ class QPDF_String: public QPDFObject { public: QPDF_String(std::string const& val); + static QPDF_String* new_utf16(std::string const& utf8_val); virtual ~QPDF_String(); virtual std::string unparse(); virtual QPDFObject::object_type_e getTypeCode() const; diff --git a/qpdf/build.mk b/qpdf/build.mk index 1692fc92..21e7bb17 100644 --- a/qpdf/build.mk +++ b/qpdf/build.mk @@ -4,6 +4,7 @@ BINS_qpdf = \ test_driver \ test_large_file \ test_pdf_doc_encoding \ + test_pdf_unicode \ test_tokenizer CBINS_qpdf = qpdf-ctest diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test index a23e20e8..f80da1c9 100644 --- a/qpdf/qtest/qpdf.test +++ b/qpdf/qtest/qpdf.test @@ -84,13 +84,21 @@ flush_tiff_cache(); show_ntests(); # ---------- -$td->notify("--- PDF Doc Encoding ---"); -$n_tests += 1; +$td->notify("--- Character Encoding ---"); +$n_tests += 3; $td->runtest("PDF doc encoding to Unicode", {$td->COMMAND => "test_pdf_doc_encoding pdf-doc-to-utf8.in"}, {$td->FILE => "pdf-doc-to-utf8.out", 
$td->EXIT_STATUS => 0}, $td->NORMALIZE_NEWLINES); +$td->runtest("UTF-16 encoding", + {$td->COMMAND => "test_pdf_unicode unicode.in"}, + {$td->FILE => "unicode.out", $td->EXIT_STATUS => 0}, + $td->NORMALIZE_NEWLINES); +$td->runtest("UTF-16 encoding errors", + {$td->COMMAND => "test_pdf_unicode unicode-errors.in"}, + {$td->FILE => "unicode-errors.out", $td->EXIT_STATUS => 0}, + $td->NORMALIZE_NEWLINES); show_ntests(); # ---------- diff --git a/qpdf/qtest/qpdf/unicode-errors.in b/qpdf/qtest/qpdf/unicode-errors.in new file mode 100644 index 00000000..484928c3 --- /dev/null +++ b/qpdf/qtest/qpdf/unicode-errors.in @@ -0,0 +1,7 @@ +This file has utf-8 encoding errors and should be edited as a binary file. + +0: too many bytes: after +1: too few bytes: after +2: invalid codepoint (U+DEAD): after +3: not enough bytes for character: !after (! included) +4: not enough bytes left in file diff --git a/qpdf/qtest/qpdf/unicode-errors.out b/qpdf/qtest/qpdf/unicode-errors.out new file mode 100644 index 00000000..43a06511 --- /dev/null +++ b/qpdf/qtest/qpdf/unicode-errors.out @@ -0,0 +1,7 @@ +This file has utf-8 encoding errors and should be edited as a binary file. // + // +0: too many bytes: �after // +1: too few bytes: �after // +2: invalid codepoint (U+DEAD): �after // +3: not enough bytes for character: �!after (! included) // +4: not enough bytes left in file � // diff --git a/qpdf/qtest/qpdf/unicode.in b/qpdf/qtest/qpdf/unicode.in new file mode 100644 index 00000000..f686f9d6 --- /dev/null +++ b/qpdf/qtest/qpdf/unicode.in @@ -0,0 +1,5 @@ +This is a potato: 🥔 (u+01f954). +If you wanted to, you could cook some sweet 🥔 π. +If you think wwwwww is good, you should try ʬʬʬʬʬʬ. +బంగాళాదుంప సలాడ్ +𝄞 𝄢 𝄪 𝅂 diff --git a/qpdf/qtest/qpdf/unicode.out b/qpdf/qtest/qpdf/unicode.out new file mode 100644 index 00000000..bedec447 --- /dev/null +++ b/qpdf/qtest/qpdf/unicode.out @@ -0,0 +1,5 @@ +This is a potato: 🥔 (u+01f954). // +If you wanted to, you could cook some sweet 🥔 π. 
// +If you think wwwwww is good, you should try ʬʬʬʬʬʬ. // +బంగాళాదుంప సలాడ్ // +𝄞 𝄢 𝄪 𝅂 // diff --git a/qpdf/test_pdf_unicode.cc b/qpdf/test_pdf_unicode.cc new file mode 100644 index 00000000..07073424 --- /dev/null +++ b/qpdf/test_pdf_unicode.cc @@ -0,0 +1,46 @@ +#include <qpdf/QUtil.hh> +#include <qpdf/QPDFObjectHandle.hh> +#include <iostream> +#include <stdlib.h> +#include <string.h> + +static char const* whoami = 0; + +void usage() +{ + std::cerr << "Usage: " << whoami << " infile" << std::endl; + exit(2); +} + +int main(int argc, char* argv[]) +{ + if ((whoami = strrchr(argv[0], '/')) == NULL) + { + whoami = argv[0]; + } + else + { + ++whoami; + } + // For libtool's sake.... + if (strncmp(whoami, "lt-", 3) == 0) + { + whoami += 3; + } + + if (argc != 2) + { + usage(); + } + char const* infilename = argv[1]; + std::list<std::string> lines = + QUtil::read_lines_from_file(infilename); + for (std::list<std::string>::iterator iter = lines.begin(); + iter != lines.end(); ++iter) + { + QPDFObjectHandle str = QPDFObjectHandle::newUnicodeString(*iter); + std::cout << str.getUTF8Value() << " // " + << str.unparseBinary() << std::endl; + } + return 0; +}