diff --git a/ChangeLog b/ChangeLog
index b4b10f81..a6d7bcb2 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2019-01-05 Jay Berkenbilt
+
+	* Add method QUtil::utf8_to_utf16.
+
 2019-01-04 Jay Berkenbilt
 
 	* Add new option --optimize-images, which recompresses every image
diff --git a/include/qpdf/QUtil.hh b/include/qpdf/QUtil.hh
index 5532149c..c7473bf3 100644
--- a/include/qpdf/QUtil.hh
+++ b/include/qpdf/QUtil.hh
@@ -152,8 +152,14 @@ namespace QUtil
     QPDF_DLL
     std::string toUTF16(unsigned long uval);
 
-    // Convert a UTF-8 encoded string to ASCII by replacing all
-    // characters outside of ascii with the given unknown_char.
+    // Convert a UTF-8 encoded string to UTF-16. Unrepresentable code
+    // points are converted to U+FFFD.
+    QPDF_DLL
+    std::string utf8_to_utf16(std::string const& utf8);
+
+    // Convert a UTF-8 encoded string to the specified single-byte
+    // encoding system by replacing all unsupported characters with
+    // the given unknown_char.
     QPDF_DLL
     std::string utf8_to_ascii(
         std::string const& utf8, char unknown_char = '?');
diff --git a/libqpdf/QPDF_String.cc b/libqpdf/QPDF_String.cc
index 633f1699..7cfb6bcc 100644
--- a/libqpdf/QPDF_String.cc
+++ b/libqpdf/QPDF_String.cc
@@ -64,65 +64,10 @@ QPDF_String::~QPDF_String()
 {
 }
 
-enum encoding_e { e_utf16 };
-
-static
-std::string
-transcode_utf8(std::string const& utf8_val, encoding_e encoding)
-{
-    std::string result = "\xfe\xff";
-    size_t len = utf8_val.length();
-    for (size_t i = 0; i < len; ++i)
-    {
-        unsigned char ch = static_cast<unsigned char>(utf8_val.at(i));
-        if (ch < 128)
-        {
-            result += QUtil::toUTF16(ch);
-        }
-        else
-        {
-            size_t bytes_needed = 0;
-            unsigned bit_check = 0x40;
-            unsigned char to_clear = 0x80;
-            while (ch & bit_check)
-            {
-                ++bytes_needed;
-                to_clear |= bit_check;
-                bit_check >>= 1;
-            }
-
-            if (((bytes_needed > 5) || (bytes_needed < 1)) ||
-                ((i + bytes_needed) >= len))
-            {
-                result += "\xff\xfd";
-            }
-            else
-            {
-                unsigned long codepoint = (ch & ~to_clear);
-                while (bytes_needed > 0)
-                {
-                    --bytes_needed;
-                    ch = utf8_val.at(++i);
-                    if ((ch & 0xc0) != 0x80)
-                    {
-                        --i;
-                        codepoint = 0xfffd;
-                        break;
-                    }
-                    codepoint <<= 6;
-                    codepoint += (ch & 0x3f);
-                }
-                result += QUtil::toUTF16(codepoint);
-            }
-        }
-    }
-    return result;
-}
-
 QPDF_String*
 QPDF_String::new_utf16(std::string const& utf8_val)
 {
-    return new QPDF_String(transcode_utf8(utf8_val, e_utf16));
+    return new QPDF_String(QUtil::utf8_to_utf16(utf8_val));
 }
 
 std::string
diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc
index 7c2d9bc9..ba4aea2c 100644
--- a/libqpdf/QUtil.cc
+++ b/libqpdf/QUtil.cc
@@ -893,6 +893,67 @@ QUtil::parse_numrange(char const* range, int max)
     return result;
 }
 
+enum encoding_e { e_utf16 };
+
+static
+std::string
+transcode_utf8(std::string const& utf8_val, encoding_e encoding)
+{
+    std::string result = "\xfe\xff";
+    size_t len = utf8_val.length();
+    for (size_t i = 0; i < len; ++i)
+    {
+        unsigned char ch = static_cast<unsigned char>(utf8_val.at(i));
+        if (ch < 128)
+        {
+            result += QUtil::toUTF16(ch);
+        }
+        else
+        {
+            size_t bytes_needed = 0;
+            unsigned bit_check = 0x40;
+            unsigned char to_clear = 0x80;
+            while (ch & bit_check)
+            {
+                ++bytes_needed;
+                to_clear |= bit_check;
+                bit_check >>= 1;
+            }
+
+            if (((bytes_needed > 5) || (bytes_needed < 1)) ||
+                ((i + bytes_needed) >= len))
+            {
+                result += "\xff\xfd";
+            }
+            else
+            {
+                unsigned long codepoint = (ch & ~to_clear);
+                while (bytes_needed > 0)
+                {
+                    --bytes_needed;
+                    ch = utf8_val.at(++i);
+                    if ((ch & 0xc0) != 0x80)
+                    {
+                        --i;
+                        codepoint = 0xfffd;
+                        break;
+                    }
+                    codepoint <<= 6;
+                    codepoint += (ch & 0x3f);
+                }
+                result += QUtil::toUTF16(codepoint);
+            }
+        }
+    }
+    return result;
+}
+
+std::string
+QUtil::utf8_to_utf16(std::string const& utf8)
+{
+    return transcode_utf8(utf8, e_utf16);
+}
+
 std::string
 QUtil::utf8_to_ascii(std::string const& utf8, char unknown_char)
 {