diff --git a/ChangeLog b/ChangeLog index e1087f20..9a75f4ad 100644 --- a/ChangeLog +++ b/ChangeLog @@ -14,6 +14,13 @@ the first bug in qpdf's history that could result in silent loss of data when processing a correct input file. Fixes #276. +2019-01-13 Jay Berkenbilt + + * Add several more string transcoding and analysis methods to + QUtil for bidirectional conversion between PDF Doc, Win Ansi, Mac + Roman, UTF-8, and UTF-16 along with detection of valid UTF-8 and + UTF-16. + 2019-01-12 Jay Berkenbilt * In the --pages option, allow the same page to be specified more diff --git a/include/qpdf/QUtil.hh b/include/qpdf/QUtil.hh index ea3f5da8..d9b0783e 100644 --- a/include/qpdf/QUtil.hh +++ b/include/qpdf/QUtil.hh @@ -193,6 +193,20 @@ namespace QUtil QPDF_DLL std::string pdf_doc_to_utf8(std::string const& pdfdoc); + // Analyze a string for encoding. We can't tell the difference + // between any single-byte encodings, and we can't tell for sure + // whether a string that happens to be valid UTF-8 isn't a + // different encoding, but we can at least tell a few things to + // help us guess. If there are no characters with the high bit + // set, has_8bit_chars is false, and the other values are also + // false, even though ASCII strings are valid UTF-8. is_valid_utf8 + // means that the string is non-trivially valid UTF-8. 
+ QPDF_DLL + void analyze_encoding(std::string const& str, + bool& has_8bit_chars, + bool& is_valid_utf8, + bool& is_utf16); + // If secure random number generation is supported on your // platform and qpdf was not compiled with insecure random number // generation, this returns a cryptographically secure random diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc index 9dbce98e..19b6fdab 100644 --- a/libqpdf/QUtil.cc +++ b/libqpdf/QUtil.cc @@ -1661,6 +1661,56 @@ encode_pdfdoc(unsigned long codepoint) return ch; } +// Decode the UTF-8 sequence that starts at utf8_val[pos]. On return, +// pos is the index of the last byte consumed. If the lead byte is +// invalid, the sequence is truncated, or a continuation byte is +// malformed, error is set to true and 0xfffd is returned. +unsigned long get_next_utf8_codepoint( + std::string const& utf8_val, size_t& pos, bool& error) +{ + size_t len = utf8_val.length(); + unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos)); + error = false; + if (ch < 128) + { + return static_cast<unsigned long>(ch); + } + + size_t bytes_needed = 0; + unsigned bit_check = 0x40; + unsigned char to_clear = 0x80; + while (ch & bit_check) + { + ++bytes_needed; + to_clear |= bit_check; + bit_check >>= 1; + } + if (((bytes_needed > 5) || (bytes_needed < 1)) || + ((pos + bytes_needed) >= len)) + { + error = true; + return 0xfffd; + } + + unsigned long codepoint = (ch & ~to_clear); + while (bytes_needed > 0) + { + --bytes_needed; + ch = utf8_val.at(++pos); + if ((ch & 0xc0) != 0x80) + { + // A bad continuation byte is an error too; report it to the caller. + --pos; + error = true; + codepoint = 0xfffd; + break; + } + codepoint <<= 6; + codepoint += (ch & 0x3f); + } + return codepoint; +} + static std::string transcode_utf8(std::string const& utf8_val, encoding_e encoding, char unknown) @@ -1673,9 +1717,22 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding, size_t len = utf8_val.length(); for (size_t i = 0; i < len; ++i) { - unsigned char ch = static_cast(utf8_val.at(i)); - if (ch < 128) + bool error = false; + unsigned long codepoint = get_next_utf8_codepoint(utf8_val, i, error); + if (error) { + if (encoding == e_utf16) + { + result += "\xff\xfd"; + } + else + { + result.append(1, unknown); + } + } + else if (codepoint < 128) + { + char ch = static_cast(codepoint); if (encoding == e_utf16) { result += 
QUtil::toUTF16(ch); @@ -1685,78 +1742,35 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding, result.append(1, ch); } } + else if (encoding == e_utf16) + { + result += QUtil::toUTF16(codepoint); + } + else if ((codepoint > 160) && (codepoint < 256) && + ((encoding == e_winansi) || (encoding == e_pdfdoc))) + { + result.append(1, static_cast(codepoint & 0xff)); + } else { - size_t bytes_needed = 0; - unsigned bit_check = 0x40; - unsigned char to_clear = 0x80; - while (ch & bit_check) + unsigned char ch = '\0'; + if (encoding == e_winansi) { - ++bytes_needed; - to_clear |= bit_check; - bit_check >>= 1; + ch = encode_winansi(codepoint); } - - if (((bytes_needed > 5) || (bytes_needed < 1)) || - ((i + bytes_needed) >= len)) + else if (encoding == e_macroman) { - if (encoding == e_utf16) - { - result += "\xff\xfd"; - } - else - { - result.append(1, unknown); - } + ch = encode_macroman(codepoint); } - else + else if (encoding == e_pdfdoc) { - unsigned long codepoint = (ch & ~to_clear); - while (bytes_needed > 0) - { - --bytes_needed; - ch = utf8_val.at(++i); - if ((ch & 0xc0) != 0x80) - { - --i; - codepoint = 0xfffd; - break; - } - codepoint <<= 6; - codepoint += (ch & 0x3f); - } - if (encoding == e_utf16) - { - result += QUtil::toUTF16(codepoint); - } - else if ((codepoint > 160) && (codepoint < 256) && - ((encoding == e_winansi) || (encoding == e_pdfdoc))) - { - ch = static_cast(codepoint & 0xff); - result.append(1, ch); - } - else - { - ch = '\0'; - if (encoding == e_winansi) - { - ch = encode_winansi(codepoint); - } - else if (encoding == e_macroman) - { - ch = encode_macroman(codepoint); - } - else if (encoding == e_pdfdoc) - { - ch = encode_pdfdoc(codepoint); - } - if (ch == '\0') - { - ch = static_cast(unknown); - } - result.append(1, ch); - } + ch = encode_pdfdoc(codepoint); } + if (ch == '\0') + { + ch = static_cast(unknown); + } + result.append(1, ch); } } return result; @@ -1904,3 +1918,37 @@ QUtil::pdf_doc_to_utf8(std::string const& val) } return 
result; } + +void +QUtil::analyze_encoding(std::string const& val, + bool& has_8bit_chars, + bool& is_valid_utf8, + bool& is_utf16) +{ + has_8bit_chars = is_utf16 = is_valid_utf8 = false; + if (QUtil::is_utf16(val)) + { + has_8bit_chars = true; + is_utf16 = true; + return; + } + size_t len = val.length(); + bool any_errors = false; + for (size_t i = 0; i < len; ++i) + { + bool error = false; + unsigned long codepoint = get_next_utf8_codepoint(val, i, error); + if (error) + { + any_errors = true; + } + if (codepoint >= 128) + { + has_8bit_chars = true; + } + } + if (has_8bit_chars && (! any_errors)) + { + is_valid_utf8 = true; + } +} diff --git a/libtests/qtest/qutil/qutil.out b/libtests/qtest/qutil/qutil.out index 50ec26f9..c0789a36 100644 --- a/libtests/qtest/qutil/qutil.out +++ b/libtests/qtest/qutil/qutil.out @@ -57,6 +57,7 @@ HAGOOGAMAGOOGLE: 0 bidirectional pdf doc done bidirectional win ansi done bidirectional mac roman done +analysis done ---- whoami quack1 quack2 diff --git a/libtests/qutil.cc b/libtests/qutil.cc index 355bb9a2..91a656be 100644 --- a/libtests/qutil.cc +++ b/libtests/qutil.cc @@ -262,6 +262,20 @@ void transcoding_test(std::string (*to_utf8)(std::string const&), } } +void check_analyze(std::string const& str, bool has8bit, bool utf8, bool utf16) +{ + bool has_8bit_chars = false; + bool is_valid_utf8 = false; + bool is_utf16 = false; + QUtil::analyze_encoding(str, has_8bit_chars, is_valid_utf8, is_utf16); + if (! 
((has_8bit_chars == has8bit) && + (is_valid_utf8 == utf8) && + (is_utf16 == utf16))) + { + std::cout << "analysis failed: " << str << std::endl; + } +} + void transcoding_test() { transcoding_test(&QUtil::pdf_doc_to_utf8, @@ -273,6 +287,11 @@ void transcoding_test() transcoding_test(&QUtil::mac_roman_to_utf8, &QUtil::utf8_to_mac_roman, 255, "?"); std::cout << "bidirectional mac roman done" << std::endl; + check_analyze("pi = \317\200", true, true, false); + check_analyze("pi != \317", true, false, false); + check_analyze("pi != 22/7", false, false, false); + check_analyze(std::string("\xfe\xff\00\x51", 4), true, false, true); + std::cout << "analysis done" << std::endl; } void print_whoami(char const* str)