mirror of
https://github.com/qpdf/qpdf.git
synced 2024-12-22 02:49:00 +00:00
Add QUtil::possible_repaired_encodings
This commit is contained in:
parent
997f4ab6cb
commit
e87d149918
@ -14,6 +14,14 @@
|
||||
the first bug in qpdf's history that could result in silent loss
|
||||
of data when processing a correct input file. Fixes #276.
|
||||
|
||||
2019-01-15 Jay Berkenbilt <ejb@ql.org>
|
||||
|
||||
* Add QUtil::possible_repaired_encodings which, given a string,
|
||||
generates other strings that represent re-interpretation of the
|
||||
bytes in a different coding system. This is used to help recover
|
||||
passwords if the password string was improperly encoded on a
|
||||
different system due to user error or a software bug.
|
||||
|
||||
2019-01-14 Jay Berkenbilt <ejb@ql.org>
|
||||
|
||||
* Add new CLI flags to 128-bit and 256-bit encryption: --assemble,
|
||||
|
@ -223,6 +223,28 @@ namespace QUtil
|
||||
bool& is_valid_utf8,
|
||||
bool& is_utf16);
|
||||
|
||||
// Try to compensate for previously incorrectly encoded strings.
|
||||
// We want to compensate for the following errors:
|
||||
//
|
||||
// * The string was supposed to be UTF-8 but was one of the
|
||||
// single-byte encodings
|
||||
// * The string was supposed to be PDF Doc but was either UTF-8 or
|
||||
// one of the other single-byte encodings
|
||||
//
|
||||
// The returned vector always contains the original string first,
|
||||
// and then it contains what the correct string would be in the
|
||||
// event that the original string was the result of any of the
|
||||
// above errors.
|
||||
//
|
||||
// This method is useful for attempting to recover a password that
|
||||
// may have been previously incorrectly encoded. For example, if the
|
||||
// password was supposed to be UTF-8 but the previous application
|
||||
// used a password encoded in WinAnsi, or if the previous password
|
||||
// was supposed to be PDFDoc but was actually given as UTF-8 or
|
||||
// WinAnsi, this method would find the correct password.
|
||||
QPDF_DLL
|
||||
std::vector<std::string> possible_repaired_encodings(std::string);
|
||||
|
||||
// If secure random number generation is supported on your
|
||||
// platform and qpdf was not compiled with insecure random number
|
||||
// generation, this returns a cryptographically secure random
|
||||
|
@ -15,6 +15,7 @@
|
||||
#include <sstream>
|
||||
#include <fstream>
|
||||
#include <stdexcept>
|
||||
#include <set>
|
||||
#include <stdio.h>
|
||||
#include <errno.h>
|
||||
#include <ctype.h>
|
||||
@ -1992,3 +1993,95 @@ QUtil::analyze_encoding(std::string const& val,
|
||||
is_valid_utf8 = true;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::string>
|
||||
QUtil::possible_repaired_encodings(std::string supplied)
|
||||
{
|
||||
std::vector<std::string> result;
|
||||
// Always include the original string
|
||||
result.push_back(supplied);
|
||||
bool has_8bit_chars = false;
|
||||
bool is_valid_utf8 = false;
|
||||
bool is_utf16 = false;
|
||||
analyze_encoding(supplied, has_8bit_chars, is_valid_utf8, is_utf16);
|
||||
if (! has_8bit_chars)
|
||||
{
|
||||
return result;
|
||||
}
|
||||
if (is_utf16)
|
||||
{
|
||||
// Convert to UTF-8 and pretend we got a UTF-8 string.
|
||||
is_utf16 = false;
|
||||
is_valid_utf8 = true;
|
||||
supplied = utf16_to_utf8(supplied);
|
||||
}
|
||||
std::string output;
|
||||
if (is_valid_utf8)
|
||||
{
|
||||
// Maybe we were given UTF-8 but wanted one of the single-byte
|
||||
// encodings.
|
||||
if (utf8_to_pdf_doc(supplied, output))
|
||||
{
|
||||
result.push_back(output);
|
||||
}
|
||||
if (utf8_to_win_ansi(supplied, output))
|
||||
{
|
||||
result.push_back(output);
|
||||
}
|
||||
if (utf8_to_mac_roman(supplied, output))
|
||||
{
|
||||
result.push_back(output);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Maybe we were given one of the single-byte encodings but
|
||||
// wanted UTF-8.
|
||||
std::string from_pdf_doc(pdf_doc_to_utf8(supplied));
|
||||
result.push_back(from_pdf_doc);
|
||||
std::string from_win_ansi(win_ansi_to_utf8(supplied));
|
||||
result.push_back(from_win_ansi);
|
||||
std::string from_mac_roman(mac_roman_to_utf8(supplied));
|
||||
result.push_back(from_mac_roman);
|
||||
|
||||
// Maybe we were given one of the other single-byte encodings
|
||||
// but wanted one of the other ones.
|
||||
if (utf8_to_win_ansi(from_pdf_doc, output))
|
||||
{
|
||||
result.push_back(output);
|
||||
}
|
||||
if (utf8_to_mac_roman(from_pdf_doc, output))
|
||||
{
|
||||
result.push_back(output);
|
||||
}
|
||||
if (utf8_to_pdf_doc(from_win_ansi, output))
|
||||
{
|
||||
result.push_back(output);
|
||||
}
|
||||
if (utf8_to_mac_roman(from_win_ansi, output))
|
||||
{
|
||||
result.push_back(output);
|
||||
}
|
||||
if (utf8_to_pdf_doc(from_mac_roman, output))
|
||||
{
|
||||
result.push_back(output);
|
||||
}
|
||||
if (utf8_to_win_ansi(from_mac_roman, output))
|
||||
{
|
||||
result.push_back(output);
|
||||
}
|
||||
}
|
||||
// De-duplicate
|
||||
std::vector<std::string> t;
|
||||
std::set<std::string> seen;
|
||||
for (std::vector<std::string>::iterator iter = result.begin();
|
||||
iter != result.end(); ++iter)
|
||||
{
|
||||
if (! seen.count(*iter))
|
||||
{
|
||||
seen.insert(*iter);
|
||||
t.push_back(*iter);
|
||||
}
|
||||
}
|
||||
return t;
|
||||
}
|
||||
|
@ -58,6 +58,19 @@ bidirectional pdf doc done
|
||||
bidirectional win ansi done
|
||||
bidirectional mac roman done
|
||||
analysis done
|
||||
alternatives
|
||||
0: 86a9e99e
|
||||
1: c692c2a9c3a9c5be
|
||||
2: e280a0c2a9c3a9c5be
|
||||
3: c39cc2a9c388c3bb
|
||||
4: 83a9e99e
|
||||
5: 81a9e99e
|
||||
6: dca9c8fb
|
||||
0: c692c2a9c3a9c5be
|
||||
1: 86a9e99e
|
||||
2: 83a9e99e
|
||||
0: 717561636b
|
||||
done alternatives
|
||||
---- whoami
|
||||
quack1
|
||||
quack2
|
||||
|
@ -276,6 +276,16 @@ void check_analyze(std::string const& str, bool has8bit, bool utf8, bool utf16)
|
||||
}
|
||||
}
|
||||
|
||||
void print_alternatives(std::string const& str)
|
||||
{
|
||||
std::vector<std::string> result = QUtil::possible_repaired_encodings(str);
|
||||
size_t n = result.size();
|
||||
for (size_t i = 0; i < n; ++i)
|
||||
{
|
||||
std::cout << i << ": " << QUtil::hex_encode(result.at(i)) << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
void transcoding_test()
|
||||
{
|
||||
transcoding_test(&QUtil::pdf_doc_to_utf8,
|
||||
@ -308,6 +318,18 @@ void transcoding_test()
|
||||
assert(QUtil::utf8_to_pdf_doc(input1, output));
|
||||
assert(! QUtil::utf8_to_pdf_doc(input2, output));
|
||||
assert(QUtil::utf8_to_pdf_doc(input3, output));
|
||||
std::cout << "alternatives" << std::endl;
|
||||
// char name mac win pdf-doc
|
||||
// U+0192 florin 304 203 206
|
||||
// U+00A9 copyright 251 251 251
|
||||
// U+00E9 eacute 216 351 351
|
||||
// U+017E zcaron - 236 236
|
||||
std::string pdfdoc = "\206\251\351\236";
|
||||
std::string utf8 = QUtil::pdf_doc_to_utf8(pdfdoc);
|
||||
print_alternatives(pdfdoc);
|
||||
print_alternatives(utf8);
|
||||
print_alternatives("quack");
|
||||
std::cout << "done alternatives" << std::endl;
|
||||
}
|
||||
|
||||
void print_whoami(char const* str)
|
||||
|
Loading…
Reference in New Issue
Block a user