Add QUtil::possible_repaired_encodings

This commit is contained in:
Jay Berkenbilt 2019-01-15 21:06:38 -05:00
parent 997f4ab6cb
commit e87d149918
5 changed files with 158 additions and 0 deletions

View File

@ -14,6 +14,14 @@
the first bug in qpdf's history that could result in silent loss
of data when processing a correct input file. Fixes #276.
2019-01-15 Jay Berkenbilt <ejb@ql.org>
* Add QUtil::possible_repaired_encodings which, given a string,
generates other strings that represent re-interpretation of the
bytes in a different coding system. This is used to help recover
passwords if the password string was improperly encoded on a
different system due to user error or a software bug.
2019-01-14 Jay Berkenbilt <ejb@ql.org>
* Add new CLI flags to 128-bit and 256-bit encryption: --assemble,

View File

@ -223,6 +223,28 @@ namespace QUtil
bool& is_valid_utf8,
bool& is_utf16);
// Try to compensate for previously incorrectly encoded strings.
// We want to compensate for the following errors:
//
// * The string was supposed to be UTF-8 but was one of the
// single-byte encodings
// * The string was supposed to be PDF Doc but was either UTF-8 or
// one of the other single-byte encodings
//
// The returned vector always contains the original string first,
// followed by what the correct string would be in the event that
// the original string was the result of any of the above errors.
// Duplicate candidates are removed, keeping the first occurrence,
// so every element of the result is distinct. If the string
// contains no 8-bit characters, the result contains only the
// original string.
//
// This method is useful for attempting to recover a password that
// may have been previously incorrectly encoded. For example, if
// the password was supposed to be UTF-8 but the previous
// application used a password encoded in WinAnsi, or if the
// password was supposed to be PDFDoc but was actually given as
// UTF-8 or WinAnsi, the correct password would be among the
// strings returned by this method.
QPDF_DLL
std::vector<std::string> possible_repaired_encodings(std::string);
// If secure random number generation is supported on your
// platform and qpdf was not compiled with insecure random number
// generation, this returns a cryptographically secure random

View File

@ -15,6 +15,7 @@
#include <sstream>
#include <fstream>
#include <stdexcept>
#include <set>
#include <stdio.h>
#include <errno.h>
#include <ctype.h>
@ -1992,3 +1993,95 @@ QUtil::analyze_encoding(std::string const& val,
is_valid_utf8 = true;
}
}
// Generate candidate re-encodings of "supplied" for the case where
// the bytes were produced using the wrong character encoding. The
// first element of the result is always the string as given; any
// further elements are alternative encodings, in a fixed order, with
// duplicates dropped (first occurrence wins).
std::vector<std::string>
QUtil::possible_repaired_encodings(std::string supplied)
{
    // The string exactly as given is always the first candidate.
    std::vector<std::string> candidates(1, supplied);

    bool has_8bit_chars = false;
    bool is_valid_utf8 = false;
    bool is_utf16 = false;
    analyze_encoding(supplied, has_8bit_chars, is_valid_utf8, is_utf16);
    if (! has_8bit_chars)
    {
        // No alternatives are generated for strings without 8-bit
        // characters.
        return candidates;
    }

    // UTF-16 input is first converted to UTF-8 and from then on
    // handled exactly as if UTF-8 had been supplied.
    bool treat_as_utf8 = (is_utf16 || is_valid_utf8);
    if (is_utf16)
    {
        supplied = utf16_to_utf8(supplied);
    }

    // Scratch buffer shared by all utf8_to_* conversions below; a
    // conversion result is only kept when the call reports success.
    std::string converted;
    if (! treat_as_utf8)
    {
        // The input may be in one of the single-byte encodings when
        // UTF-8 was wanted: offer each single-byte interpretation
        // converted to UTF-8.
        std::string via_pdf_doc(pdf_doc_to_utf8(supplied));
        candidates.push_back(via_pdf_doc);
        std::string via_win_ansi(win_ansi_to_utf8(supplied));
        candidates.push_back(via_win_ansi);
        std::string via_mac_roman(mac_roman_to_utf8(supplied));
        candidates.push_back(via_mac_roman);

        // The input may also be in one single-byte encoding when a
        // different single-byte encoding was wanted: go through
        // UTF-8 to each of the other two encodings.
        if (utf8_to_win_ansi(via_pdf_doc, converted))
        {
            candidates.push_back(converted);
        }
        if (utf8_to_mac_roman(via_pdf_doc, converted))
        {
            candidates.push_back(converted);
        }
        if (utf8_to_pdf_doc(via_win_ansi, converted))
        {
            candidates.push_back(converted);
        }
        if (utf8_to_mac_roman(via_win_ansi, converted))
        {
            candidates.push_back(converted);
        }
        if (utf8_to_pdf_doc(via_mac_roman, converted))
        {
            candidates.push_back(converted);
        }
        if (utf8_to_win_ansi(via_mac_roman, converted))
        {
            candidates.push_back(converted);
        }
    }
    else
    {
        // The input is UTF-8 but one of the single-byte encodings
        // may have been wanted.
        if (utf8_to_pdf_doc(supplied, converted))
        {
            candidates.push_back(converted);
        }
        if (utf8_to_win_ansi(supplied, converted))
        {
            candidates.push_back(converted);
        }
        if (utf8_to_mac_roman(supplied, converted))
        {
            candidates.push_back(converted);
        }
    }

    // Drop repeated candidates while keeping the order in which they
    // were first generated.
    std::vector<std::string> distinct;
    std::set<std::string> seen;
    for (std::vector<std::string>::const_iterator candidate =
             candidates.begin();
         candidate != candidates.end(); ++candidate)
    {
        // insert() reports via .second whether the value was new.
        if (seen.insert(*candidate).second)
        {
            distinct.push_back(*candidate);
        }
    }
    return distinct;
}

View File

@ -58,6 +58,19 @@ bidirectional pdf doc done
bidirectional win ansi done
bidirectional mac roman done
analysis done
alternatives
0: 86a9e99e
1: c692c2a9c3a9c5be
2: e280a0c2a9c3a9c5be
3: c39cc2a9c388c3bb
4: 83a9e99e
5: 81a9e99e
6: dca9c8fb
0: c692c2a9c3a9c5be
1: 86a9e99e
2: 83a9e99e
0: 717561636b
done alternatives
---- whoami
quack1
quack2

View File

@ -276,6 +276,16 @@ void check_analyze(std::string const& str, bool has8bit, bool utf8, bool utf16)
}
}
// Print every string returned by QUtil::possible_repaired_encodings
// for "str", one per line, as "<index>: <hex-encoded bytes>" with
// indexes starting at 0.
void print_alternatives(std::string const& str)
{
    std::vector<std::string> alternatives =
        QUtil::possible_repaired_encodings(str);
    size_t index = 0;
    for (std::vector<std::string>::const_iterator alt = alternatives.begin();
         alt != alternatives.end(); ++alt, ++index)
    {
        std::cout << index << ": " << QUtil::hex_encode(*alt) << std::endl;
    }
}
void transcoding_test()
{
transcoding_test(&QUtil::pdf_doc_to_utf8,
@ -308,6 +318,18 @@ void transcoding_test()
assert(QUtil::utf8_to_pdf_doc(input1, output));
assert(! QUtil::utf8_to_pdf_doc(input2, output));
assert(QUtil::utf8_to_pdf_doc(input3, output));
std::cout << "alternatives" << std::endl;
// char name mac win pdf-doc
// U+0192 florin 304 203 206
// U+00A9 copyright 251 251 251
// U+00E9 eacute 216 351 351
// U+017E zcaron - 236 236
std::string pdfdoc = "\206\251\351\236";
std::string utf8 = QUtil::pdf_doc_to_utf8(pdfdoc);
print_alternatives(pdfdoc);
print_alternatives(utf8);
print_alternatives("quack");
std::cout << "done alternatives" << std::endl;
}
void print_whoami(char const* str)