mirror of
https://github.com/qpdf/qpdf.git
synced 2024-12-22 10:58:58 +00:00
QUtil::analyze_encoding
This commit is contained in:
parent
6817ca585a
commit
8f389f14c0
@ -14,6 +14,13 @@
|
||||
the first bug in qpdf's history that could result in silent loss
|
||||
of data when processing a correct input file. Fixes #276.
|
||||
|
||||
2019-01-13 Jay Berkenbilt <ejb@ql.org>
|
||||
|
||||
* Add several more string transcoding and analysis methods to
|
||||
QUtil for bidirectional conversion between PDF Doc, Win Ansi, Mac
|
||||
Roman, UTF-8, and UTF-16 along with detection of valid UTF-8 and
|
||||
UTF-16.
|
||||
|
||||
2019-01-12 Jay Berkenbilt <ejb@ql.org>
|
||||
|
||||
* In the --pages option, allow the same page to be specified more
|
||||
|
@ -193,6 +193,20 @@ namespace QUtil
|
||||
QPDF_DLL
|
||||
std::string pdf_doc_to_utf8(std::string const& pdfdoc);
|
||||
|
||||
// Analyze a string for encoding. We can't tell the difference
|
||||
// between any single-byte encodings, and we can't tell for sure
|
||||
// whether a string that happens to be valid UTF-8 isn't a
|
||||
// different encoding, but we can at least tell a few things to
|
||||
// help us guess. If there are no characters with the high bit
|
||||
// set, has_8bit_chars is false, and the other values are also
|
||||
// false, even though ASCII strings are valid UTF-8. is_valid_utf8
|
||||
// means that the string is non-trivially valid UTF-8.
|
||||
QPDF_DLL
|
||||
void analyze_encoding(std::string const& str,
|
||||
bool& has_8bit_chars,
|
||||
bool& is_valid_utf8,
|
||||
bool& is_utf16);
|
||||
|
||||
// If secure random number generation is supported on your
|
||||
// platform and qpdf was not compiled with insecure random number
|
||||
// generation, this returns a cryptographically secure random
|
||||
|
180
libqpdf/QUtil.cc
180
libqpdf/QUtil.cc
@ -1661,6 +1661,50 @@ encode_pdfdoc(unsigned long codepoint)
|
||||
return ch;
|
||||
}
|
||||
|
||||
// Decode a single UTF-8 encoded codepoint from utf8_val.
//
// Parameters:
//   utf8_val - the bytes to decode.
//   pos      - on entry, the index of the first byte of the sequence;
//              utf8_val.at() throws std::out_of_range if it is not a
//              valid index. On successful return it is the index of
//              the last byte consumed, so a caller looping with
//              "++pos" advances to the start of the next sequence.
//              On a malformed continuation byte it is left on the
//              last byte that was accepted, so the offending byte is
//              examined again by the caller's next iteration. For
//              the other error cases it is left unchanged.
//   error    - set to false for a successfully decoded sequence and
//              to true when the lead byte is not a valid lead byte,
//              when the sequence would run past the end of the
//              string, or when a continuation byte does not have the
//              form 10xxxxxx.
//
// Returns the decoded codepoint, or 0xfffd (the Unicode replacement
// character) whenever error is set to true.
unsigned long get_next_utf8_codepoint(
    std::string const& utf8_val, size_t& pos, bool& error)
{
    size_t const len = utf8_val.length();
    unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos));
    error = false;
    if (ch < 128)
    {
        // Plain ASCII: a one-byte sequence.
        return static_cast<unsigned long>(ch);
    }

    // Count the 1 bits that follow the high bit of the lead byte.
    // Each one announces a continuation byte. to_clear accumulates
    // those marker bits so they can be masked out of the payload.
    size_t bytes_needed = 0;
    unsigned bit_check = 0x40;
    unsigned char to_clear = 0x80;
    while (ch & bit_check)
    {
        ++bytes_needed;
        to_clear = static_cast<unsigned char>(to_clear | bit_check);
        bit_check >>= 1;
    }
    // bytes_needed == 0 means the byte is itself a continuation byte
    // (10xxxxxx), which cannot start a sequence. The last comparison
    // rejects sequences whose final byte would lie beyond the string.
    if ((bytes_needed < 1) || (bytes_needed > 5) ||
        ((pos + bytes_needed) >= len))
    {
        error = true;
        return 0xfffd;
    }

    unsigned long codepoint = static_cast<unsigned long>(ch & ~to_clear);
    while (bytes_needed > 0)
    {
        --bytes_needed;
        ch = static_cast<unsigned char>(utf8_val.at(++pos));
        if ((ch & 0xc0) != 0x80)
        {
            // Not a continuation byte. Step back so the caller
            // re-reads this byte as the start of a new sequence, and
            // report the failure: returning 0xfffd without setting
            // error would make malformed input indistinguishable
            // from a genuine U+FFFD in the data.
            --pos;
            error = true;
            return 0xfffd;
        }
        codepoint <<= 6;
        codepoint += (ch & 0x3f);
    }
    return codepoint;
}
|
||||
|
||||
static std::string
|
||||
transcode_utf8(std::string const& utf8_val, encoding_e encoding,
|
||||
char unknown)
|
||||
@ -1673,9 +1717,22 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding,
|
||||
size_t len = utf8_val.length();
|
||||
for (size_t i = 0; i < len; ++i)
|
||||
{
|
||||
unsigned char ch = static_cast<unsigned char>(utf8_val.at(i));
|
||||
if (ch < 128)
|
||||
bool error = false;
|
||||
unsigned long codepoint = get_next_utf8_codepoint(utf8_val, i, error);
|
||||
if (error)
|
||||
{
|
||||
if (encoding == e_utf16)
|
||||
{
|
||||
result += "\xff\xfd";
|
||||
}
|
||||
else
|
||||
{
|
||||
result.append(1, unknown);
|
||||
}
|
||||
}
|
||||
else if (codepoint < 128)
|
||||
{
|
||||
char ch = static_cast<char>(codepoint);
|
||||
if (encoding == e_utf16)
|
||||
{
|
||||
result += QUtil::toUTF16(ch);
|
||||
@ -1685,78 +1742,35 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding,
|
||||
result.append(1, ch);
|
||||
}
|
||||
}
|
||||
else if (encoding == e_utf16)
|
||||
{
|
||||
result += QUtil::toUTF16(codepoint);
|
||||
}
|
||||
else if ((codepoint > 160) && (codepoint < 256) &&
|
||||
((encoding == e_winansi) || (encoding == e_pdfdoc)))
|
||||
{
|
||||
result.append(1, static_cast<unsigned char>(codepoint & 0xff));
|
||||
}
|
||||
else
|
||||
{
|
||||
size_t bytes_needed = 0;
|
||||
unsigned bit_check = 0x40;
|
||||
unsigned char to_clear = 0x80;
|
||||
while (ch & bit_check)
|
||||
unsigned char ch = '\0';
|
||||
if (encoding == e_winansi)
|
||||
{
|
||||
++bytes_needed;
|
||||
to_clear |= bit_check;
|
||||
bit_check >>= 1;
|
||||
ch = encode_winansi(codepoint);
|
||||
}
|
||||
|
||||
if (((bytes_needed > 5) || (bytes_needed < 1)) ||
|
||||
((i + bytes_needed) >= len))
|
||||
else if (encoding == e_macroman)
|
||||
{
|
||||
if (encoding == e_utf16)
|
||||
{
|
||||
result += "\xff\xfd";
|
||||
}
|
||||
else
|
||||
{
|
||||
result.append(1, unknown);
|
||||
}
|
||||
ch = encode_macroman(codepoint);
|
||||
}
|
||||
else
|
||||
else if (encoding == e_pdfdoc)
|
||||
{
|
||||
unsigned long codepoint = (ch & ~to_clear);
|
||||
while (bytes_needed > 0)
|
||||
{
|
||||
--bytes_needed;
|
||||
ch = utf8_val.at(++i);
|
||||
if ((ch & 0xc0) != 0x80)
|
||||
{
|
||||
--i;
|
||||
codepoint = 0xfffd;
|
||||
break;
|
||||
}
|
||||
codepoint <<= 6;
|
||||
codepoint += (ch & 0x3f);
|
||||
}
|
||||
if (encoding == e_utf16)
|
||||
{
|
||||
result += QUtil::toUTF16(codepoint);
|
||||
}
|
||||
else if ((codepoint > 160) && (codepoint < 256) &&
|
||||
((encoding == e_winansi) || (encoding == e_pdfdoc)))
|
||||
{
|
||||
ch = static_cast<unsigned char>(codepoint & 0xff);
|
||||
result.append(1, ch);
|
||||
}
|
||||
else
|
||||
{
|
||||
ch = '\0';
|
||||
if (encoding == e_winansi)
|
||||
{
|
||||
ch = encode_winansi(codepoint);
|
||||
}
|
||||
else if (encoding == e_macroman)
|
||||
{
|
||||
ch = encode_macroman(codepoint);
|
||||
}
|
||||
else if (encoding == e_pdfdoc)
|
||||
{
|
||||
ch = encode_pdfdoc(codepoint);
|
||||
}
|
||||
if (ch == '\0')
|
||||
{
|
||||
ch = static_cast<unsigned char>(unknown);
|
||||
}
|
||||
result.append(1, ch);
|
||||
}
|
||||
ch = encode_pdfdoc(codepoint);
|
||||
}
|
||||
if (ch == '\0')
|
||||
{
|
||||
ch = static_cast<unsigned char>(unknown);
|
||||
}
|
||||
result.append(1, ch);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
@ -1904,3 +1918,37 @@ QUtil::pdf_doc_to_utf8(std::string const& val)
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
void
|
||||
QUtil::analyze_encoding(std::string const& val,
|
||||
bool& has_8bit_chars,
|
||||
bool& is_valid_utf8,
|
||||
bool& is_utf16)
|
||||
{
|
||||
has_8bit_chars = is_utf16 = is_valid_utf8 = false;
|
||||
if (QUtil::is_utf16(val))
|
||||
{
|
||||
has_8bit_chars = true;
|
||||
is_utf16 = true;
|
||||
return;
|
||||
}
|
||||
size_t len = val.length();
|
||||
bool any_errors = false;
|
||||
for (size_t i = 0; i < len; ++i)
|
||||
{
|
||||
bool error = false;
|
||||
unsigned long codepoint = get_next_utf8_codepoint(val, i, error);
|
||||
if (error)
|
||||
{
|
||||
any_errors = true;
|
||||
}
|
||||
if (codepoint >= 128)
|
||||
{
|
||||
has_8bit_chars = true;
|
||||
}
|
||||
}
|
||||
if (has_8bit_chars && (! any_errors))
|
||||
{
|
||||
is_valid_utf8 = true;
|
||||
}
|
||||
}
|
||||
|
@ -57,6 +57,7 @@ HAGOOGAMAGOOGLE: 0
|
||||
bidirectional pdf doc done
|
||||
bidirectional win ansi done
|
||||
bidirectional mac roman done
|
||||
analysis done
|
||||
---- whoami
|
||||
quack1
|
||||
quack2
|
||||
|
@ -262,6 +262,20 @@ void transcoding_test(std::string (*to_utf8)(std::string const&),
|
||||
}
|
||||
}
|
||||
|
||||
void check_analyze(std::string const& str, bool has8bit, bool utf8, bool utf16)
|
||||
{
|
||||
bool has_8bit_chars = false;
|
||||
bool is_valid_utf8 = false;
|
||||
bool is_utf16 = false;
|
||||
QUtil::analyze_encoding(str, has_8bit_chars, is_valid_utf8, is_utf16);
|
||||
if (! ((has_8bit_chars == has8bit) &&
|
||||
(is_valid_utf8 == utf8) &&
|
||||
(is_utf16 == utf16)))
|
||||
{
|
||||
std::cout << "analysis failed: " << str << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
void transcoding_test()
|
||||
{
|
||||
transcoding_test(&QUtil::pdf_doc_to_utf8,
|
||||
@ -273,6 +287,11 @@ void transcoding_test()
|
||||
transcoding_test(&QUtil::mac_roman_to_utf8,
|
||||
&QUtil::utf8_to_mac_roman, 255, "?");
|
||||
std::cout << "bidirectional mac roman done" << std::endl;
|
||||
check_analyze("pi = \317\200", true, true, false);
|
||||
check_analyze("pi != \317", true, false, false);
|
||||
check_analyze("pi != 22/7", false, false, false);
|
||||
check_analyze(std::string("\xfe\xff\00\x51", 4), true, false, true);
|
||||
std::cout << "analysis done" << std::endl;
|
||||
}
|
||||
|
||||
void print_whoami(char const* str)
|
||||
|
Loading…
Reference in New Issue
Block a user