From e87d149918ed6ed211f733f932df3b62ab445c12 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Tue, 15 Jan 2019 21:06:38 -0500 Subject: [PATCH] Add QUtil::possible_repaired_encodings --- ChangeLog | 8 +++ include/qpdf/QUtil.hh | 22 ++++++++ libqpdf/QUtil.cc | 93 ++++++++++++++++++++++++++++++++++ libtests/qtest/qutil/qutil.out | 13 +++++ libtests/qutil.cc | 22 ++++++++ 5 files changed, 158 insertions(+) diff --git a/ChangeLog b/ChangeLog index 8f1ed679..992cf507 100644 --- a/ChangeLog +++ b/ChangeLog @@ -14,6 +14,14 @@ the first bug in qpdf's history that could result in silent loss of data when processing a correct input file. Fixes #276. +2019-01-15 Jay Berkenbilt + + * Add QUtil::possible_repaired_encodings which, given a string, + generates other strings that represent re-interpretation of the + bytes in a different coding system. This is used to help recover + passwords if the password string was improperly encoded on a + different system due to user error or a software bug. + 2019-01-14 Jay Berkenbilt * Add new CLI flags to 128-bit and 256-bit encryption: --assemble, diff --git a/include/qpdf/QUtil.hh b/include/qpdf/QUtil.hh index 5fe8e97c..02dec5ad 100644 --- a/include/qpdf/QUtil.hh +++ b/include/qpdf/QUtil.hh @@ -223,6 +223,28 @@ namespace QUtil bool& is_valid_utf8, bool& is_utf16); + // Try to compensate for previously incorrectly encoded strings. + // We want to compensate for the following errors: + // + // * The string was supposed to be UTF-8 but was one of the + // single-byte encodings + // * The string was supposed to be PDF Doc but was either UTF-8 or + // one of the other single-byte encodings + // + // The returned vector always contains the original string first, + // and then it contains what the correct string would be in the + // event that the original string was the result of any of the + // above errors. + // + // This method is useful for attempting to recover a password that + // may have been previously incorrectly encoded. 
For example, if the + // password was supposed to be UTF-8 but the previous application + // used a password encoded in WinAnsi, or if the previous password + // was supposed to be PDFDoc but was actually given as UTF-8 or + // WinAnsi, this method would find the correct password. + QPDF_DLL + std::vector possible_repaired_encodings(std::string); + // If secure random number generation is supported on your // platform and qpdf was not compiled with insecure random number // generation, this returns a cryptographically secure random diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc index e645c4fc..58646ade 100644 --- a/libqpdf/QUtil.cc +++ b/libqpdf/QUtil.cc @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -1992,3 +1993,95 @@ QUtil::analyze_encoding(std::string const& val, is_valid_utf8 = true; } } + +std::vector +QUtil::possible_repaired_encodings(std::string supplied) +{ + std::vector result; + // Always include the original string + result.push_back(supplied); + bool has_8bit_chars = false; + bool is_valid_utf8 = false; + bool is_utf16 = false; + analyze_encoding(supplied, has_8bit_chars, is_valid_utf8, is_utf16); + if (! has_8bit_chars) + { + return result; + } + if (is_utf16) + { + // Convert to UTF-8 and pretend we got a UTF-8 string. + is_utf16 = false; + is_valid_utf8 = true; + supplied = utf16_to_utf8(supplied); + } + std::string output; + if (is_valid_utf8) + { + // Maybe we were given UTF-8 but wanted one of the single-byte + // encodings. + if (utf8_to_pdf_doc(supplied, output)) + { + result.push_back(output); + } + if (utf8_to_win_ansi(supplied, output)) + { + result.push_back(output); + } + if (utf8_to_mac_roman(supplied, output)) + { + result.push_back(output); + } + } + else + { + // Maybe we were given one of the single-byte encodings but + // wanted UTF-8. 
+ std::string from_pdf_doc(pdf_doc_to_utf8(supplied)); + result.push_back(from_pdf_doc); + std::string from_win_ansi(win_ansi_to_utf8(supplied)); + result.push_back(from_win_ansi); + std::string from_mac_roman(mac_roman_to_utf8(supplied)); + result.push_back(from_mac_roman); + + // Maybe we were given one of the single-byte encodings but + // wanted a different single-byte encoding. + if (utf8_to_win_ansi(from_pdf_doc, output)) + { + result.push_back(output); + } + if (utf8_to_mac_roman(from_pdf_doc, output)) + { + result.push_back(output); + } + if (utf8_to_pdf_doc(from_win_ansi, output)) + { + result.push_back(output); + } + if (utf8_to_mac_roman(from_win_ansi, output)) + { + result.push_back(output); + } + if (utf8_to_pdf_doc(from_mac_roman, output)) + { + result.push_back(output); + } + if (utf8_to_win_ansi(from_mac_roman, output)) + { + result.push_back(output); + } + } + // De-duplicate + std::vector t; + std::set seen; + for (std::vector::iterator iter = result.begin(); + iter != result.end(); ++iter) + { + if (! 
seen.count(*iter)) + { + seen.insert(*iter); + t.push_back(*iter); + } + } + return t; +} diff --git a/libtests/qtest/qutil/qutil.out b/libtests/qtest/qutil/qutil.out index c0789a36..c35f22e3 100644 --- a/libtests/qtest/qutil/qutil.out +++ b/libtests/qtest/qutil/qutil.out @@ -58,6 +58,19 @@ bidirectional pdf doc done bidirectional win ansi done bidirectional mac roman done analysis done +alternatives +0: 86a9e99e +1: c692c2a9c3a9c5be +2: e280a0c2a9c3a9c5be +3: c39cc2a9c388c3bb +4: 83a9e99e +5: 81a9e99e +6: dca9c8fb +0: c692c2a9c3a9c5be +1: 86a9e99e +2: 83a9e99e +0: 717561636b +done alternatives ---- whoami quack1 quack2 diff --git a/libtests/qutil.cc b/libtests/qutil.cc index 35877b9c..27881c6e 100644 --- a/libtests/qutil.cc +++ b/libtests/qutil.cc @@ -276,6 +276,16 @@ void check_analyze(std::string const& str, bool has8bit, bool utf8, bool utf16) } } +void print_alternatives(std::string const& str) +{ + std::vector result = QUtil::possible_repaired_encodings(str); + size_t n = result.size(); + for (size_t i = 0; i < n; ++i) + { + std::cout << i << ": " << QUtil::hex_encode(result.at(i)) << std::endl; + } +} + void transcoding_test() { transcoding_test(&QUtil::pdf_doc_to_utf8, @@ -308,6 +318,18 @@ void transcoding_test() assert(QUtil::utf8_to_pdf_doc(input1, output)); assert(! QUtil::utf8_to_pdf_doc(input2, output)); assert(QUtil::utf8_to_pdf_doc(input3, output)); + std::cout << "alternatives" << std::endl; + // char name mac win pdf-doc + // U+0192 florin 304 203 206 + // U+00A9 copyright 251 251 251 + // U+00E9 eacute 216 351 351 + // U+017E zcaron - 236 236 + std::string pdfdoc = "\206\251\351\236"; + std::string utf8 = QUtil::pdf_doc_to_utf8(pdfdoc); + print_alternatives(pdfdoc); + print_alternatives(utf8); + print_alternatives("quack"); + std::cout << "done alternatives" << std::endl; } void print_whoami(char const* str)