From 698485468a8b7d0f38d817d6055898932f46cc26 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Sun, 13 Jan 2019 08:00:14 -0500 Subject: [PATCH] Move remaining existing transcoding to QUtil --- include/qpdf/QUtil.hh | 29 ++++- libqpdf/QPDF_String.cc | 93 +------------- libqpdf/QUtil.cc | 267 +++++++++++++++++++++++++++++++++++++++-- qpdf/qpdf.testcov | 2 +- 4 files changed, 288 insertions(+), 103 deletions(-) diff --git a/include/qpdf/QUtil.hh b/include/qpdf/QUtil.hh index 8edef6e0..ea3f5da8 100644 --- a/include/qpdf/QUtil.hh +++ b/include/qpdf/QUtil.hh @@ -147,13 +147,18 @@ namespace QUtil std::string toUTF8(unsigned long uval); // Return a string containing the byte representation of the - // UTF-16 BE encoding for the unicode value passed in. + // UTF-16 big-endian encoding for the unicode value passed in. // Unrepresentable code points are converted to U+FFFD. QPDF_DLL std::string toUTF16(unsigned long uval); - // Convert a UTF-8 encoded string to UTF-16. Unrepresentable code - // points are converted to U+FFFD. + // Test whether this is a UTF-16 big-endian string. This is + // indicated by first two bytes being 0xFE 0xFF. + QPDF_DLL + bool is_utf16(std::string const&); + + // Convert a UTF-8 encoded string to UTF-16 big-endian. + // Unrepresentable code points are converted to U+FFFD. QPDF_DLL std::string utf8_to_utf16(std::string const& utf8); @@ -169,6 +174,24 @@ namespace QUtil QPDF_DLL std::string utf8_to_mac_roman( std::string const& utf8, char unknown_char = '?'); + QPDF_DLL + std::string utf8_to_pdf_doc( + std::string const& utf8, char unknown_char = '?'); + + // Convert a UTF-16 big-endian encoded string to UTF-8. + // Unrepresentable code points are converted to U+FFFD. + QPDF_DLL + std::string utf16_to_utf8(std::string const& utf16); + + // Convert from the specified single-byte encoding system to + // UTF-8. There is no ascii_to_utf8 because all ASCII strings are + // already valid UTF-8. + QPDF_DLL + std::string win_ansi_to_utf8(std::string const& win); + QPDF_DLL + std::string mac_roman_to_utf8(std::string const& mac); + QPDF_DLL + std::string pdf_doc_to_utf8(std::string const& pdfdoc); // If secure random number generation is supported on your // platform and qpdf was not compiled with insecure random number diff --git a/libqpdf/QPDF_String.cc b/libqpdf/QPDF_String.cc index 7cfb6bcc..bf1141d1 100644 --- a/libqpdf/QPDF_String.cc +++ b/libqpdf/QPDF_String.cc @@ -8,43 +8,6 @@ // be used. #include -// First element is 128 -static unsigned short pdf_doc_to_unicode[] = { - 0x2022, // 0x80 BULLET - 0x2020, // 0x81 DAGGER - 0x2021, // 0x82 DOUBLE DAGGER - 0x2026, // 0x83 HORIZONTAL ELLIPSIS - 0x2014, // 0x84 EM DASH - 0x2013, // 0x85 EN DASH - 0x0192, // 0x86 SMALL LETTER F WITH HOOK - 0x2044, // 0x87 FRACTION SLASH (solidus) - 0x2039, // 0x88 SINGLE LEFT-POINTING ANGLE QUOTATION MARK - 0x203a, // 0x89 SINGLE RIGHT-POINTING ANGLE QUOTATION MARK - 0x2212, // 0x8a MINUS SIGN - 0x2030, // 0x8b PER MILLE SIGN - 0x201e, // 0x8c DOUBLE LOW-9 QUOTATION MARK (quotedblbase) - 0x201c, // 0x8d LEFT DOUBLE QUOTATION MARK (double quote left) - 0x201d, // 0x8e RIGHT DOUBLE QUOTATION MARK (quotedblright) - 0x2018, // 0x8f LEFT SINGLE QUOTATION MARK (quoteleft) - 0x2019, // 0x90 RIGHT SINGLE QUOTATION MARK (quoteright) - 0x201a, // 0x91 SINGLE LOW-9 QUOTATION MARK (quotesinglbase) - 0x2122, // 0x92 TRADE MARK SIGN - 0xfb01, // 0x93 LATIN SMALL LIGATURE FI - 0xfb02, // 0x94 LATIN SMALL LIGATURE FL - 0x0141, // 0x95 LATIN CAPITAL LETTER L WITH STROKE - 0x0152, // 0x96 LATIN CAPITAL LIGATURE OE - 0x0160, // 0x97 LATIN CAPITAL LETTER S WITH CARON - 0x0178, // 0x98 LATIN CAPITAL LETTER Y WITH DIAERESIS - 0x017d, // 0x99 LATIN CAPITAL LETTER Z WITH CARON - 0x0131, // 0x9a LATIN SMALL LETTER DOTLESS I - 0x0142, // 0x9b LATIN SMALL LETTER L WITH STROKE - 0x0153, // 0x9c LATIN SMALL LIGATURE OE - 0x0161, // 0x9d LATIN SMALL LETTER S WITH CARON - 0x017e, // 0x9e LATIN SMALL LETTER Z WITH CARON - 0xfffd, // 0x9f UNDEFINED - 0x20ac, // 0xa0 EURO SIGN -}; - // See above about ctype. static bool is_ascii_printable(unsigned char ch) { @@ -210,62 +173,12 @@ QPDF_String::getVal() const std::string QPDF_String::getUTF8Val() const { - std::string result; - size_t len = this->val.length(); - if ((len >= 2) && (len % 2 == 0) && - (this->val.at(0) == '\xfe') && (this->val.at(1) == '\xff')) + if (QUtil::is_utf16(this->val)) { - // This is a Unicode string using big-endian UTF-16. This - // code uses unsigned long and unsigned short to hold - // codepoint values. It requires unsigned long to be at least - // 32 bits and unsigned short to be at least 16 bits, but it - // will work fine if they are larger. - unsigned long codepoint = 0L; - for (unsigned int i = 2; i < len; i += 2) - { - // Convert from UTF16-BE. If we get a malformed - // codepoint, this code will generate incorrect output - // without giving a warning. Specifically, a high - // codepoint not followed by a low codepoint will be - // discarded, and a low codepoint not preceded by a high - // codepoint will just get its low 10 bits output. - unsigned short bits = - (static_cast(this->val.at(i)) << 8) + - static_cast(this->val.at(i+1)); - if ((bits & 0xFC00) == 0xD800) - { - codepoint = 0x10000 + ((bits & 0x3FF) << 10); - continue; - } - else if ((bits & 0xFC00) == 0xDC00) - { - if (codepoint != 0) - { - QTC::TC("qpdf", "QPDF_String non-trivial UTF-16"); - } - codepoint += bits & 0x3FF; - } - else - { - codepoint = bits; - } - - result += QUtil::toUTF8(codepoint); - codepoint = 0; - } + return QUtil::utf16_to_utf8(this->val); } else { - for (unsigned int i = 0; i < len; ++i) - { - unsigned char ch = static_cast(this->val.at(i)); - unsigned short val = ch; - if ((ch >= 128) && (ch <= 160)) - { - val = pdf_doc_to_unicode[ch - 128]; - } - result += QUtil::toUTF8(val); - } + return QUtil::pdf_doc_to_utf8(this->val); } - return result; } diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc index 8424854b..ac501676 100644 --- a/libqpdf/QUtil.cc +++ b/libqpdf/QUtil.cc @@ -8,6 +8,7 @@ #endif #include #include +#include #include #include @@ -29,6 +30,43 @@ #include #endif +// First element is 128 +static unsigned short pdf_doc_to_unicode[] = { + 0x2022, // 0x80 BULLET + 0x2020, // 0x81 DAGGER + 0x2021, // 0x82 DOUBLE DAGGER + 0x2026, // 0x83 HORIZONTAL ELLIPSIS + 0x2014, // 0x84 EM DASH + 0x2013, // 0x85 EN DASH + 0x0192, // 0x86 SMALL LETTER F WITH HOOK + 0x2044, // 0x87 FRACTION SLASH (solidus) + 0x2039, // 0x88 SINGLE LEFT-POINTING ANGLE QUOTATION MARK + 0x203a, // 0x89 SINGLE RIGHT-POINTING ANGLE QUOTATION MARK + 0x2212, // 0x8a MINUS SIGN + 0x2030, // 0x8b PER MILLE SIGN + 0x201e, // 0x8c DOUBLE LOW-9 QUOTATION MARK (quotedblbase) + 0x201c, // 0x8d LEFT DOUBLE QUOTATION MARK (double quote left) + 0x201d, // 0x8e RIGHT DOUBLE QUOTATION MARK (quotedblright) + 0x2018, // 0x8f LEFT SINGLE QUOTATION MARK (quoteleft) + 0x2019, // 0x90 RIGHT SINGLE QUOTATION MARK (quoteright) + 0x201a, // 0x91 SINGLE LOW-9 QUOTATION MARK (quotesinglbase) + 0x2122, // 0x92 TRADE MARK SIGN + 0xfb01, // 0x93 LATIN SMALL LIGATURE FI + 0xfb02, // 0x94 LATIN SMALL LIGATURE FL + 0x0141, // 0x95 LATIN CAPITAL LETTER L WITH STROKE + 0x0152, // 0x96 LATIN CAPITAL LIGATURE OE + 0x0160, // 0x97 LATIN CAPITAL LETTER S WITH CARON + 0x0178, // 0x98 LATIN CAPITAL LETTER Y WITH DIAERESIS + 0x017d, // 0x99 LATIN CAPITAL LETTER Z WITH CARON + 0x0131, // 0x9a LATIN SMALL LETTER DOTLESS I + 0x0142, // 0x9b LATIN SMALL LETTER L WITH STROKE + 0x0153, // 0x9c LATIN SMALL LIGATURE OE + 0x0161, // 0x9d LATIN SMALL LETTER S WITH CARON + 0x017e, // 0x9e LATIN SMALL LETTER Z WITH CARON + 0xfffd, // 0x9f UNDEFINED + 0x20ac, // 0xa0 EURO SIGN +}; + std::string QUtil::int_to_string(long long num, int length) { @@ -895,7 +933,7 @@ QUtil::parse_numrange(char const* range, int max) return result; } -enum encoding_e { e_utf16, e_ascii, e_winansi, e_macroman }; +enum encoding_e { e_utf16, e_ascii, e_winansi, e_macroman, e_pdfdoc }; static unsigned char encode_winansi(unsigned long codepoint) @@ -1342,6 +1380,119 @@ encode_macroman(unsigned long codepoint) return ch; } +static unsigned char +encode_pdfdoc(unsigned long codepoint) +{ + // Use this ugly switch statement to avoid a static, which is not + // thread-safe. + unsigned char ch = '\0'; + switch (codepoint) + { + case 0x2022: + ch = 0x80; + break; + case 0x2020: + ch = 0x81; + break; + case 0x2021: + ch = 0x82; + break; + case 0x2026: + ch = 0x83; + break; + case 0x2014: + ch = 0x84; + break; + case 0x2013: + ch = 0x85; + break; + case 0x0192: + ch = 0x86; + break; + case 0x2044: + ch = 0x87; + break; + case 0x2039: + ch = 0x88; + break; + case 0x203a: + ch = 0x89; + break; + case 0x2212: + ch = 0x8a; + break; + case 0x2030: + ch = 0x8b; + break; + case 0x201e: + ch = 0x8c; + break; + case 0x201c: + ch = 0x8d; + break; + case 0x201d: + ch = 0x8e; + break; + case 0x2018: + ch = 0x8f; + break; + case 0x2019: + ch = 0x90; + break; + case 0x201a: + ch = 0x91; + break; + case 0x2122: + ch = 0x92; + break; + case 0xfb01: + ch = 0x93; + break; + case 0xfb02: + ch = 0x94; + break; + case 0x0141: + ch = 0x95; + break; + case 0x0152: + ch = 0x96; + break; + case 0x0160: + ch = 0x97; + break; + case 0x0178: + ch = 0x98; + break; + case 0x017d: + ch = 0x99; + break; + case 0x0131: + ch = 0x9a; + break; + case 0x0142: + ch = 0x9b; + break; + case 0x0153: + ch = 0x9c; + break; + case 0x0161: + ch = 0x9d; + break; + case 0x017e: + ch = 0x9e; + break; + case 0xfffd: + ch = 0x9f; + break; + case 0x20ac: + ch = 0xa0; + break; + default: + break; + } + return ch; +} + static std::string transcode_utf8(std::string const& utf8_val, encoding_e encoding, char unknown) @@ -1410,24 +1561,27 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding, { result += QUtil::toUTF16(codepoint); } + else if ((codepoint >= 160) && (codepoint < 256) && + ((encoding == e_winansi) || (encoding == e_pdfdoc))) + { + ch = static_cast(codepoint & 0xff); + result.append(1, ch); + } else { ch = '\0'; if (encoding == e_winansi) { - if ((codepoint >= 160) && (codepoint < 256)) - { - ch = static_cast(codepoint & 0xff); - } - else - { - ch = encode_winansi(codepoint); - } + ch = encode_winansi(codepoint); } else if (encoding == e_macroman) { ch = encode_macroman(codepoint); } + else if (encoding == e_pdfdoc) + { + ch = encode_pdfdoc(codepoint); + } if (ch == '\0') { ch = static_cast(unknown); @@ -1463,3 +1617,98 @@ QUtil::utf8_to_mac_roman(std::string const& utf8, char unknown_char) { return transcode_utf8(utf8, e_macroman, unknown_char); } + +std::string +QUtil::utf8_to_pdf_doc(std::string const& utf8, char unknown_char) +{ + return transcode_utf8(utf8, e_pdfdoc, unknown_char); +} + +bool +QUtil::is_utf16(std::string const& val) +{ + return ((val.length() >= 2) && + (val.at(0) == '\xfe') && (val.at(1) == '\xff')); +} + +std::string +QUtil::utf16_to_utf8(std::string const& val) +{ + std::string result; + // This code uses unsigned long and unsigned short to hold + // codepoint values. It requires unsigned long to be at least + // 32 bits and unsigned short to be at least 16 bits, but it + // will work fine if they are larger. + unsigned long codepoint = 0L; + size_t len = val.length(); + size_t start = 0; + if (is_utf16(val)) + { + start += 2; + } + // If the string has an odd number of bytes, the last byte is + // ignored. + for (unsigned int i = start; i < len; i += 2) + { + // Convert from UTF16-BE. If we get a malformed + // codepoint, this code will generate incorrect output + // without giving a warning. Specifically, a high + // codepoint not followed by a low codepoint will be + // discarded, and a low codepoint not preceded by a high + // codepoint will just get its low 10 bits output. + unsigned short bits = + (static_cast(val.at(i)) << 8) + + static_cast(val.at(i+1)); + if ((bits & 0xFC00) == 0xD800) + { + codepoint = 0x10000 + ((bits & 0x3FF) << 10); + continue; + } + else if ((bits & 0xFC00) == 0xDC00) + { + if (codepoint != 0) + { + QTC::TC("qpdf", "QUtil non-trivial UTF-16"); + } + codepoint += bits & 0x3FF; + } + else + { + codepoint = bits; + } + + result += QUtil::toUTF8(codepoint); + codepoint = 0; + } + return result; +} + +std::string +QUtil::win_ansi_to_utf8(std::string const& val) +{ + return "QXXXQ"; +} + +std::string +QUtil::mac_roman_to_utf8(std::string const& val) +{ + return "QXXXQ"; +} + +std::string +QUtil::pdf_doc_to_utf8(std::string const& val) +{ + std::string result; + size_t len = val.length(); + for (unsigned int i = 0; i < len; ++i) + { + unsigned char ch = static_cast(val.at(i)); + unsigned short val = ch; + if ((ch >= 128) && (ch <= 160)) + { + val = pdf_doc_to_unicode[ch - 128]; + } + result += QUtil::toUTF8(val); + } + return result; +} diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov index 08f82592..5d14a0dd 100644 --- a/qpdf/qpdf.testcov +++ b/qpdf/qpdf.testcov @@ -108,7 +108,7 @@ QPDF_Stream pipeStreamData with null pipeline 0 QPDFWriter not recompressing /FlateDecode 0 QPDF_encryption xref stream from encrypted file 0 qpdf unable to filter 0 -QPDF_String non-trivial UTF-16 0 +QUtil non-trivial UTF-16 0 QPDF xref overwrite object 0 QPDF decoding error warning 0 qpdf-c called qpdf_init 0