QUtil::analyze_encoding

This commit is contained in:
Jay Berkenbilt 2019-01-13 09:41:13 -05:00
parent 6817ca585a
commit 8f389f14c0
5 changed files with 155 additions and 66 deletions

View File

@ -14,6 +14,13 @@
the first bug in qpdf's history that could result in silent loss
of data when processing a correct input file. Fixes #276.
2019-01-13 Jay Berkenbilt <ejb@ql.org>
* Add several more string transcoding and analysis methods to
QUtil for bidirectional conversion between PDF Doc, Win Ansi, Mac
Roman, UTF-8, and UTF-16 along with detection of valid UTF-8 and
UTF-16.
2019-01-12 Jay Berkenbilt <ejb@ql.org>
* In the --pages option, allow the same page to be specified more

View File

@ -193,6 +193,20 @@ namespace QUtil
QPDF_DLL
std::string pdf_doc_to_utf8(std::string const& pdfdoc);
// Analyze a string for encoding. We can't tell the difference
// between any single-byte encodings, and we can't tell for sure
// whether a string that happens to be valid UTF-8 isn't a
// different encoding, but we can at least tell a few things to
// help us guess.
//
// All three output parameters are overwritten on every call:
//
// * has_8bit_chars is true if the string contains at least one
//   character with the high bit set. If there are no such
//   characters, has_8bit_chars is false, and the other values are
//   also false, even though ASCII strings are valid UTF-8.
//
// * is_valid_utf8 means that the string is non-trivially valid
//   UTF-8: it has 8-bit characters and no decoding error was
//   reported while scanning it as UTF-8.
//
// * is_utf16 is true if QUtil::is_utf16 recognizes the string. In
//   that case has_8bit_chars is also true, is_valid_utf8 is false,
//   and the string is not scanned as UTF-8 at all.
QPDF_DLL
void analyze_encoding(std::string const& str,
bool& has_8bit_chars,
bool& is_valid_utf8,
bool& is_utf16);
// If secure random number generation is supported on your
// platform and qpdf was not compiled with insecure random number
// generation, this returns a cryptographically secure random

View File

@ -1661,6 +1661,50 @@ encode_pdfdoc(unsigned long codepoint)
return ch;
}
// Decode the UTF-8 sequence that begins at utf8_val[pos] and return
// its code point.
//
// utf8_val: the string being decoded.
// pos:      on entry, the index of the first byte of the sequence;
//           it must be a valid index or std::string::at throws
//           std::out_of_range. On return, the index of the last
//           byte that was consumed, so a caller that increments pos
//           after each call walks the string one sequence at a time.
// error:    set to false on success and to true whenever the
//           sequence is malformed: a stray continuation byte, an
//           impossible lead byte, a sequence truncated by the end of
//           the string, or a lead byte followed by something that
//           is not a continuation byte.
//
// Returns the decoded code point, or 0xfffd (the Unicode replacement
// character) whenever error is set. Overlong encodings are not
// detected by this function.
unsigned long get_next_utf8_codepoint(
    std::string const& utf8_val, size_t& pos, bool& error)
{
    size_t const len = utf8_val.length();
    unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos));
    error = false;
    if (ch < 128)
    {
        // Plain ASCII: a single byte that is its own code point.
        return static_cast<unsigned long>(ch);
    }

    // Each 1 bit following the top bit of the lead byte announces
    // one continuation byte. Collect those marker bits in to_clear
    // so that they can be stripped from the payload below.
    size_t bytes_needed = 0;
    unsigned bit_check = 0x40;
    unsigned char to_clear = 0x80;
    while (ch & bit_check)
    {
        ++bytes_needed;
        to_clear = static_cast<unsigned char>(to_clear | bit_check);
        bit_check >>= 1;
    }
    // bytes_needed == 0 means a continuation byte appeared where a
    // lead byte was expected; more than 5 is not a possible lead
    // byte; and the final continuation byte must lie inside the
    // string.
    if ((bytes_needed < 1) || (bytes_needed > 5) ||
        ((pos + bytes_needed) >= len))
    {
        error = true;
        return 0xfffd;
    }

    unsigned long codepoint = static_cast<unsigned long>(ch & ~to_clear);
    while (bytes_needed > 0)
    {
        --bytes_needed;
        ch = static_cast<unsigned char>(utf8_val.at(++pos));
        if ((ch & 0xc0) != 0x80)
        {
            // Not a continuation byte. Step back so that the caller
            // examines this byte again as the start of the next
            // sequence, and report the failure: without setting
            // error, malformed input would be indistinguishable
            // from a genuine U+FFFD and would be accepted as valid.
            --pos;
            error = true;
            return 0xfffd;
        }
        codepoint <<= 6;
        codepoint += (ch & 0x3f);
    }
    return codepoint;
}
static std::string
transcode_utf8(std::string const& utf8_val, encoding_e encoding,
char unknown)
@ -1673,9 +1717,22 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding,
size_t len = utf8_val.length();
for (size_t i = 0; i < len; ++i)
{
unsigned char ch = static_cast<unsigned char>(utf8_val.at(i));
if (ch < 128)
bool error = false;
unsigned long codepoint = get_next_utf8_codepoint(utf8_val, i, error);
if (error)
{
if (encoding == e_utf16)
{
result += "\xff\xfd";
}
else
{
result.append(1, unknown);
}
}
else if (codepoint < 128)
{
char ch = static_cast<char>(codepoint);
if (encoding == e_utf16)
{
result += QUtil::toUTF16(ch);
@ -1685,78 +1742,35 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding,
result.append(1, ch);
}
}
else if (encoding == e_utf16)
{
result += QUtil::toUTF16(codepoint);
}
else if ((codepoint > 160) && (codepoint < 256) &&
((encoding == e_winansi) || (encoding == e_pdfdoc)))
{
result.append(1, static_cast<unsigned char>(codepoint & 0xff));
}
else
{
size_t bytes_needed = 0;
unsigned bit_check = 0x40;
unsigned char to_clear = 0x80;
while (ch & bit_check)
unsigned char ch = '\0';
if (encoding == e_winansi)
{
++bytes_needed;
to_clear |= bit_check;
bit_check >>= 1;
ch = encode_winansi(codepoint);
}
if (((bytes_needed > 5) || (bytes_needed < 1)) ||
((i + bytes_needed) >= len))
else if (encoding == e_macroman)
{
if (encoding == e_utf16)
{
result += "\xff\xfd";
}
else
{
result.append(1, unknown);
}
ch = encode_macroman(codepoint);
}
else
else if (encoding == e_pdfdoc)
{
unsigned long codepoint = (ch & ~to_clear);
while (bytes_needed > 0)
{
--bytes_needed;
ch = utf8_val.at(++i);
if ((ch & 0xc0) != 0x80)
{
--i;
codepoint = 0xfffd;
break;
}
codepoint <<= 6;
codepoint += (ch & 0x3f);
}
if (encoding == e_utf16)
{
result += QUtil::toUTF16(codepoint);
}
else if ((codepoint > 160) && (codepoint < 256) &&
((encoding == e_winansi) || (encoding == e_pdfdoc)))
{
ch = static_cast<unsigned char>(codepoint & 0xff);
result.append(1, ch);
}
else
{
ch = '\0';
if (encoding == e_winansi)
{
ch = encode_winansi(codepoint);
}
else if (encoding == e_macroman)
{
ch = encode_macroman(codepoint);
}
else if (encoding == e_pdfdoc)
{
ch = encode_pdfdoc(codepoint);
}
if (ch == '\0')
{
ch = static_cast<unsigned char>(unknown);
}
result.append(1, ch);
}
ch = encode_pdfdoc(codepoint);
}
if (ch == '\0')
{
ch = static_cast<unsigned char>(unknown);
}
result.append(1, ch);
}
}
return result;
@ -1904,3 +1918,37 @@ QUtil::pdf_doc_to_utf8(std::string const& val)
}
return result;
}
// Classify val for encoding-guessing purposes. All three outputs are
// reset to false first. A string recognized by QUtil::is_utf16 is
// reported as UTF-16 with 8-bit characters and is not examined any
// further. Otherwise the string is walked one UTF-8 sequence at a
// time: has_8bit_chars records whether any decoded value was 128 or
// above, and is_valid_utf8 is set only when such a value was seen and
// the decoder reported no error anywhere in the string.
void
QUtil::analyze_encoding(std::string const& val,
                        bool& has_8bit_chars,
                        bool& is_valid_utf8,
                        bool& is_utf16)
{
    has_8bit_chars = false;
    is_valid_utf8 = false;
    is_utf16 = false;

    if (QUtil::is_utf16(val))
    {
        is_utf16 = true;
        has_8bit_chars = true;
        return;
    }

    bool saw_decode_error = false;
    size_t const n_bytes = val.length();
    size_t offset = 0;
    while (offset < n_bytes)
    {
        // The decoder leaves offset on the last byte it consumed;
        // the increment at the bottom moves to the next sequence.
        bool bad_sequence = false;
        unsigned long const value =
            get_next_utf8_codepoint(val, offset, bad_sequence);
        saw_decode_error = saw_decode_error || bad_sequence;
        if (value >= 128)
        {
            has_8bit_chars = true;
        }
        ++offset;
    }

    if (! saw_decode_error)
    {
        if (has_8bit_chars)
        {
            is_valid_utf8 = true;
        }
    }
}

View File

@ -57,6 +57,7 @@ HAGOOGAMAGOOGLE: 0
bidirectional pdf doc done
bidirectional win ansi done
bidirectional mac roman done
analysis done
---- whoami
quack1
quack2

View File

@ -262,6 +262,20 @@ void transcoding_test(std::string (*to_utf8)(std::string const&),
}
}
// Run QUtil::analyze_encoding on str and compare each resulting flag
// with the expected value supplied by the caller. Nothing is printed
// when all three agree; otherwise a single "analysis failed" line
// naming the offending string is written to standard output.
void check_analyze(std::string const& str, bool has8bit, bool utf8, bool utf16)
{
    bool got_8bit = false;
    bool got_utf8 = false;
    bool got_utf16 = false;
    QUtil::analyze_encoding(str, got_8bit, got_utf8, got_utf16);

    bool const all_match =
        (got_8bit == has8bit) &&
        (got_utf8 == utf8) &&
        (got_utf16 == utf16);
    if (all_match)
    {
        return;
    }
    std::cout << "analysis failed: " << str << std::endl;
}
void transcoding_test()
{
transcoding_test(&QUtil::pdf_doc_to_utf8,
@ -273,6 +287,11 @@ void transcoding_test()
transcoding_test(&QUtil::mac_roman_to_utf8,
&QUtil::utf8_to_mac_roman, 255, "?");
std::cout << "bidirectional mac roman done" << std::endl;
check_analyze("pi = \317\200", true, true, false);
check_analyze("pi != \317", true, false, false);
check_analyze("pi != 22/7", false, false, false);
check_analyze(std::string("\xfe\xff\00\x51", 4), true, false, true);
std::cout << "analysis done" << std::endl;
}
void print_whoami(char const* str)