mirror of
https://github.com/qpdf/qpdf.git
synced 2024-11-10 15:20:54 +00:00
QUtil::analyze_encoding
This commit is contained in:
parent
6817ca585a
commit
8f389f14c0
@ -14,6 +14,13 @@
|
|||||||
the first bug in qpdf's history that could result in silent loss
|
the first bug in qpdf's history that could result in silent loss
|
||||||
of data when processing a correct input file. Fixes #276.
|
of data when processing a correct input file. Fixes #276.
|
||||||
|
|
||||||
|
2019-01-13 Jay Berkenbilt <ejb@ql.org>
|
||||||
|
|
||||||
|
* Add several more string transcoding and analysis methods to
|
||||||
|
QUtil for bidirectional conversion between PDF Doc, Win Ansi, Mac
|
||||||
|
Roman, UTF-8, and UTF-16 along with detection of valid UTF-8 and
|
||||||
|
UTF-16.
|
||||||
|
|
||||||
2019-01-12 Jay Berkenbilt <ejb@ql.org>
|
2019-01-12 Jay Berkenbilt <ejb@ql.org>
|
||||||
|
|
||||||
* In the --pages option, allow the same page to be specified more
|
* In the --pages option, allow the same page to be specified more
|
||||||
|
@ -193,6 +193,20 @@ namespace QUtil
|
|||||||
QPDF_DLL
|
QPDF_DLL
|
||||||
std::string pdf_doc_to_utf8(std::string const& pdfdoc);
|
std::string pdf_doc_to_utf8(std::string const& pdfdoc);
|
||||||
|
|
||||||
|
// Analyze a string for encoding. We can't tell the difference
|
||||||
|
// between any single-byte encodings, and we can't tell for sure
|
||||||
|
// whether a string that happens to be valid UTF-8 isn't a
|
||||||
|
// different encoding, but we can at least tell a few things to
|
||||||
|
// help us guess. If there are no characters with the high bit
|
||||||
|
// set, has_8bit_chars is false, and the other values are also
|
||||||
|
// false, even though ASCII strings are valid UTF-8. is_valid_utf8
|
||||||
|
// means that the string is non-trivially valid UTF-8.
|
||||||
|
QPDF_DLL
|
||||||
|
void analyze_encoding(std::string const& str,
|
||||||
|
bool& has_8bit_chars,
|
||||||
|
bool& is_valid_utf8,
|
||||||
|
bool& is_utf16);
|
||||||
|
|
||||||
// If secure random number generation is supported on your
|
// If secure random number generation is supported on your
|
||||||
// platform and qpdf was not compiled with insecure random number
|
// platform and qpdf was not compiled with insecure random number
|
||||||
// generation, this returns a cryptographically secure random
|
// generation, this returns a cryptographically secure random
|
||||||
|
136
libqpdf/QUtil.cc
136
libqpdf/QUtil.cc
@ -1661,6 +1661,50 @@ encode_pdfdoc(unsigned long codepoint)
|
|||||||
return ch;
|
return ch;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Decode one UTF-8 encoded code point from utf8_val starting at index pos.
//
// Parameters:
//   utf8_val - the byte string being decoded
//   pos      - in/out: on entry, the index of the first byte of the
//              sequence; on successful return, the index of the LAST byte
//              consumed, so a caller's "++pos" moves to the next sequence.
//              On failure pos is left on the last byte that was accepted
//              (the lead byte if no continuation byte was accepted), so the
//              offending byte is examined again on the caller's next step.
//   error    - out: set to true when the bytes at pos do not form a
//              complete sequence (bad lead byte, truncated input, or a
//              byte that is not a continuation byte); false otherwise.
//
// Returns the decoded code point, or 0xfffd (the Unicode replacement
// character) whenever error is set.
//
// utf8_val.at() throws std::out_of_range if pos is not a valid index.
unsigned long get_next_utf8_codepoint(
    std::string const& utf8_val, size_t& pos, bool& error)
{
    size_t len = utf8_val.length();
    unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos));
    error = false;
    if (ch < 128)
    {
        // Single-byte (ASCII) sequence.
        return static_cast<unsigned long>(ch);
    }

    // Count the 1 bits that follow the high bit of the lead byte; each
    // one announces a continuation byte. to_clear accumulates the mask
    // of marker bits to strip from the lead byte's payload.
    size_t bytes_needed = 0;
    unsigned bit_check = 0x40;
    unsigned char to_clear = 0x80;
    while (ch & bit_check)
    {
        ++bytes_needed;
        to_clear |= bit_check;
        bit_check >>= 1;
    }
    // bytes_needed == 0 means a stray continuation byte (10xxxxxx) in
    // lead position; more than 5 is not a lead byte pattern this decoder
    // accepts. The last clause rejects a sequence that would run past
    // the end of the string.
    if (((bytes_needed > 5) || (bytes_needed < 1)) ||
        ((pos + bytes_needed) >= len))
    {
        error = true;
        return 0xfffd;
    }

    unsigned long codepoint = (ch & ~to_clear);
    while (bytes_needed > 0)
    {
        --bytes_needed;
        ch = static_cast<unsigned char>(utf8_val.at(++pos));
        if ((ch & 0xc0) != 0x80)
        {
            // Not a continuation byte: step back so the caller
            // re-examines this byte as the start of a new sequence, and
            // report the malformed sequence instead of silently passing
            // off the replacement character as a successful decode.
            --pos;
            error = true;
            return 0xfffd;
        }
        // Each continuation byte contributes its low six bits.
        codepoint <<= 6;
        codepoint += (ch & 0x3f);
    }
    return codepoint;
}
|
||||||
|
|
||||||
static std::string
|
static std::string
|
||||||
transcode_utf8(std::string const& utf8_val, encoding_e encoding,
|
transcode_utf8(std::string const& utf8_val, encoding_e encoding,
|
||||||
char unknown)
|
char unknown)
|
||||||
@ -1673,32 +1717,9 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding,
|
|||||||
size_t len = utf8_val.length();
|
size_t len = utf8_val.length();
|
||||||
for (size_t i = 0; i < len; ++i)
|
for (size_t i = 0; i < len; ++i)
|
||||||
{
|
{
|
||||||
unsigned char ch = static_cast<unsigned char>(utf8_val.at(i));
|
bool error = false;
|
||||||
if (ch < 128)
|
unsigned long codepoint = get_next_utf8_codepoint(utf8_val, i, error);
|
||||||
{
|
if (error)
|
||||||
if (encoding == e_utf16)
|
|
||||||
{
|
|
||||||
result += QUtil::toUTF16(ch);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
result.append(1, ch);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
size_t bytes_needed = 0;
|
|
||||||
unsigned bit_check = 0x40;
|
|
||||||
unsigned char to_clear = 0x80;
|
|
||||||
while (ch & bit_check)
|
|
||||||
{
|
|
||||||
++bytes_needed;
|
|
||||||
to_clear |= bit_check;
|
|
||||||
bit_check >>= 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (((bytes_needed > 5) || (bytes_needed < 1)) ||
|
|
||||||
((i + bytes_needed) >= len))
|
|
||||||
{
|
{
|
||||||
if (encoding == e_utf16)
|
if (encoding == e_utf16)
|
||||||
{
|
{
|
||||||
@ -1709,35 +1730,30 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding,
|
|||||||
result.append(1, unknown);
|
result.append(1, unknown);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
else if (codepoint < 128)
|
||||||
|
{
|
||||||
|
char ch = static_cast<char>(codepoint);
|
||||||
|
if (encoding == e_utf16)
|
||||||
|
{
|
||||||
|
result += QUtil::toUTF16(ch);
|
||||||
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
unsigned long codepoint = (ch & ~to_clear);
|
result.append(1, ch);
|
||||||
while (bytes_needed > 0)
|
|
||||||
{
|
|
||||||
--bytes_needed;
|
|
||||||
ch = utf8_val.at(++i);
|
|
||||||
if ((ch & 0xc0) != 0x80)
|
|
||||||
{
|
|
||||||
--i;
|
|
||||||
codepoint = 0xfffd;
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
codepoint <<= 6;
|
|
||||||
codepoint += (ch & 0x3f);
|
|
||||||
}
|
}
|
||||||
if (encoding == e_utf16)
|
else if (encoding == e_utf16)
|
||||||
{
|
{
|
||||||
result += QUtil::toUTF16(codepoint);
|
result += QUtil::toUTF16(codepoint);
|
||||||
}
|
}
|
||||||
else if ((codepoint > 160) && (codepoint < 256) &&
|
else if ((codepoint > 160) && (codepoint < 256) &&
|
||||||
((encoding == e_winansi) || (encoding == e_pdfdoc)))
|
((encoding == e_winansi) || (encoding == e_pdfdoc)))
|
||||||
{
|
{
|
||||||
ch = static_cast<unsigned char>(codepoint & 0xff);
|
result.append(1, static_cast<unsigned char>(codepoint & 0xff));
|
||||||
result.append(1, ch);
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
ch = '\0';
|
unsigned char ch = '\0';
|
||||||
if (encoding == e_winansi)
|
if (encoding == e_winansi)
|
||||||
{
|
{
|
||||||
ch = encode_winansi(codepoint);
|
ch = encode_winansi(codepoint);
|
||||||
@ -1757,8 +1773,6 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding,
|
|||||||
result.append(1, ch);
|
result.append(1, ch);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1904,3 +1918,37 @@ QUtil::pdf_doc_to_utf8(std::string const& val)
|
|||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
QUtil::analyze_encoding(std::string const& val,
|
||||||
|
bool& has_8bit_chars,
|
||||||
|
bool& is_valid_utf8,
|
||||||
|
bool& is_utf16)
|
||||||
|
{
|
||||||
|
has_8bit_chars = is_utf16 = is_valid_utf8 = false;
|
||||||
|
if (QUtil::is_utf16(val))
|
||||||
|
{
|
||||||
|
has_8bit_chars = true;
|
||||||
|
is_utf16 = true;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
size_t len = val.length();
|
||||||
|
bool any_errors = false;
|
||||||
|
for (size_t i = 0; i < len; ++i)
|
||||||
|
{
|
||||||
|
bool error = false;
|
||||||
|
unsigned long codepoint = get_next_utf8_codepoint(val, i, error);
|
||||||
|
if (error)
|
||||||
|
{
|
||||||
|
any_errors = true;
|
||||||
|
}
|
||||||
|
if (codepoint >= 128)
|
||||||
|
{
|
||||||
|
has_8bit_chars = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (has_8bit_chars && (! any_errors))
|
||||||
|
{
|
||||||
|
is_valid_utf8 = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -57,6 +57,7 @@ HAGOOGAMAGOOGLE: 0
|
|||||||
bidirectional pdf doc done
|
bidirectional pdf doc done
|
||||||
bidirectional win ansi done
|
bidirectional win ansi done
|
||||||
bidirectional mac roman done
|
bidirectional mac roman done
|
||||||
|
analysis done
|
||||||
---- whoami
|
---- whoami
|
||||||
quack1
|
quack1
|
||||||
quack2
|
quack2
|
||||||
|
@ -262,6 +262,20 @@ void transcoding_test(std::string (*to_utf8)(std::string const&),
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void check_analyze(std::string const& str, bool has8bit, bool utf8, bool utf16)
|
||||||
|
{
|
||||||
|
bool has_8bit_chars = false;
|
||||||
|
bool is_valid_utf8 = false;
|
||||||
|
bool is_utf16 = false;
|
||||||
|
QUtil::analyze_encoding(str, has_8bit_chars, is_valid_utf8, is_utf16);
|
||||||
|
if (! ((has_8bit_chars == has8bit) &&
|
||||||
|
(is_valid_utf8 == utf8) &&
|
||||||
|
(is_utf16 == utf16)))
|
||||||
|
{
|
||||||
|
std::cout << "analysis failed: " << str << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void transcoding_test()
|
void transcoding_test()
|
||||||
{
|
{
|
||||||
transcoding_test(&QUtil::pdf_doc_to_utf8,
|
transcoding_test(&QUtil::pdf_doc_to_utf8,
|
||||||
@ -273,6 +287,11 @@ void transcoding_test()
|
|||||||
transcoding_test(&QUtil::mac_roman_to_utf8,
|
transcoding_test(&QUtil::mac_roman_to_utf8,
|
||||||
&QUtil::utf8_to_mac_roman, 255, "?");
|
&QUtil::utf8_to_mac_roman, 255, "?");
|
||||||
std::cout << "bidirectional mac roman done" << std::endl;
|
std::cout << "bidirectional mac roman done" << std::endl;
|
||||||
|
check_analyze("pi = \317\200", true, true, false);
|
||||||
|
check_analyze("pi != \317", true, false, false);
|
||||||
|
check_analyze("pi != 22/7", false, false, false);
|
||||||
|
check_analyze(std::string("\xfe\xff\00\x51", 4), true, false, true);
|
||||||
|
std::cout << "analysis done" << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
void print_whoami(char const* str)
|
void print_whoami(char const* str)
|
||||||
|
Loading…
Reference in New Issue
Block a user