mirror of
https://github.com/qpdf/qpdf.git
synced 2024-05-29 08:20:53 +00:00
Silently/transparently recognize UTF-16LE as UTF-16 (fixes #649)
The PDF spec only allows UTF-16BE, but most readers seem to accept UTF-16LE as well, so now qpdf does too.
This commit is contained in:
parent
fbd3e56da7
commit
a478cbb6dc
|
@ -1,5 +1,9 @@
|
||||||
2022-02-15 Jay Berkenbilt <ejb@ql.org>
|
2022-02-15 Jay Berkenbilt <ejb@ql.org>
|
||||||
|
|
||||||
|
* When analyzing PDF strings, recognize UTF-16LE as UTF-16. The
|
||||||
|
PDF spec only allows UTF-16BE, but most readers seem to allow
|
||||||
|
both. Fixes #649.
|
||||||
|
|
||||||
* Bug fix: 10.6.0 inadvertently removed an unknown/undocumented
|
* Bug fix: 10.6.0 inadvertently removed an unknown/undocumented
|
||||||
CLI parsing feature, which has been restored in 10.6.2. Fixes #652.
|
CLI parsing feature, which has been restored in 10.6.2. Fixes #652.
|
||||||
|
|
||||||
|
|
|
@ -267,8 +267,11 @@ namespace QUtil
|
||||||
QPDF_DLL
|
QPDF_DLL
|
||||||
std::string toUTF16(unsigned long uval);
|
std::string toUTF16(unsigned long uval);
|
||||||
|
|
||||||
// Test whether this is a UTF-16 big-endian string. This is
|
// Test whether this is a UTF-16 string. This is indicated by
|
||||||
// indicated by first two bytes being 0xFE 0xFF.
|
// first two bytes being 0xFE 0xFF (big-endian) or 0xFF 0xFE
|
||||||
|
// (little-endian). Starting in qpdf 10.6.2, this detects
|
||||||
|
// little-endian as well as big-endian. Even though the PDF spec
|
||||||
|
// doesn't allow little-endian, most readers seem to accept it.
|
||||||
QPDF_DLL
|
QPDF_DLL
|
||||||
bool is_utf16(std::string const&);
|
bool is_utf16(std::string const&);
|
||||||
|
|
||||||
|
@ -309,8 +312,8 @@ namespace QUtil
|
||||||
bool utf8_to_pdf_doc(
|
bool utf8_to_pdf_doc(
|
||||||
std::string const& utf8, std::string& pdfdoc, char unknown_char = '?');
|
std::string const& utf8, std::string& pdfdoc, char unknown_char = '?');
|
||||||
|
|
||||||
// Convert a UTF-16 big-endian encoded string to UTF-8.
|
// Convert a UTF-16 encoded string to UTF-8. Unrepresentable code
|
||||||
// Unrepresentable code points are converted to U+FFFD.
|
// points are converted to U+FFFD.
|
||||||
QPDF_DLL
|
QPDF_DLL
|
||||||
std::string utf16_to_utf8(std::string const& utf16);
|
std::string utf16_to_utf8(std::string const& utf16);
|
||||||
|
|
||||||
|
@ -331,7 +334,9 @@ namespace QUtil
|
||||||
// help us guess. If there are no characters with the high bit
|
// help us guess. If there are no characters with the high bit
|
||||||
// set, has_8bit_chars is false, and the other values are also
|
// set, has_8bit_chars is false, and the other values are also
|
||||||
// false, even though ASCII strings are valid UTF-8. is_valid_utf8
|
// false, even though ASCII strings are valid UTF-8. is_valid_utf8
|
||||||
// means that the string is non-trivially valid UTF-8.
|
// means that the string is non-trivially valid UTF-8. Although
|
||||||
|
// the PDF spec requires UTF-16 to be UTF-16BE, qpdf (and just
|
||||||
|
// about everything else) accepts UTF-16LE (as of 10.6.2).
|
||||||
QPDF_DLL
|
QPDF_DLL
|
||||||
void analyze_encoding(std::string const& str,
|
void analyze_encoding(std::string const& str,
|
||||||
bool& has_8bit_chars,
|
bool& has_8bit_chars,
|
||||||
|
|
|
@ -2400,7 +2400,8 @@ bool
|
||||||
QUtil::is_utf16(std::string const& val)
|
QUtil::is_utf16(std::string const& val)
|
||||||
{
|
{
|
||||||
return ((val.length() >= 2) &&
|
return ((val.length() >= 2) &&
|
||||||
(val.at(0) == '\xfe') && (val.at(1) == '\xff'));
|
(((val.at(0) == '\xfe') && (val.at(1) == '\xff')) ||
|
||||||
|
((val.at(0) == '\xff') && (val.at(1) == '\xfe'))));
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string
|
std::string
|
||||||
|
@ -2414,8 +2415,13 @@ QUtil::utf16_to_utf8(std::string const& val)
|
||||||
unsigned long codepoint = 0L;
|
unsigned long codepoint = 0L;
|
||||||
size_t len = val.length();
|
size_t len = val.length();
|
||||||
size_t start = 0;
|
size_t start = 0;
|
||||||
|
bool is_le = false;
|
||||||
if (is_utf16(val))
|
if (is_utf16(val))
|
||||||
{
|
{
|
||||||
|
if (static_cast<unsigned char>(val.at(0)) == 0xff)
|
||||||
|
{
|
||||||
|
is_le = true;
|
||||||
|
}
|
||||||
start += 2;
|
start += 2;
|
||||||
}
|
}
|
||||||
// If the string has an odd number of bytes, the last byte is
|
// If the string has an odd number of bytes, the last byte is
|
||||||
|
@ -2428,10 +2434,12 @@ QUtil::utf16_to_utf8(std::string const& val)
|
||||||
// codepoint not followed by a low codepoint will be
|
// codepoint not followed by a low codepoint will be
|
||||||
// discarded, and a low codepoint not preceded by a high
|
// discarded, and a low codepoint not preceded by a high
|
||||||
// codepoint will just get its low 10 bits output.
|
// codepoint will just get its low 10 bits output.
|
||||||
|
auto msb = is_le ? i+1 : i;
|
||||||
|
auto lsb = is_le ? i : i+1;
|
||||||
unsigned short bits =
|
unsigned short bits =
|
||||||
QIntC::to_ushort(
|
QIntC::to_ushort(
|
||||||
(static_cast<unsigned char>(val.at(i)) << 8) +
|
(static_cast<unsigned char>(val.at(msb)) << 8) +
|
||||||
static_cast<unsigned char>(val.at(i+1)));
|
static_cast<unsigned char>(val.at(lsb)));
|
||||||
if ((bits & 0xFC00) == 0xD800)
|
if ((bits & 0xFC00) == 0xD800)
|
||||||
{
|
{
|
||||||
codepoint = 0x10000U + ((bits & 0x3FFU) << 10U);
|
codepoint = 0x10000U + ((bits & 0x3FFU) << 10U);
|
||||||
|
|
|
@ -63,6 +63,7 @@ HAGOOGAMAGOOGLE: 0
|
||||||
0x80000000 -> ff fd
|
0x80000000 -> ff fd
|
||||||
π
|
π
|
||||||
π
|
π
|
||||||
|
LE: π
|
||||||
---- utf8_to_ascii
|
---- utf8_to_ascii
|
||||||
¿Does π have fingers?
|
¿Does π have fingers?
|
||||||
?Does ? have fingers?
|
?Does ? have fingers?
|
||||||
|
|
|
@ -303,6 +303,7 @@ void to_utf16_test()
|
||||||
std::string s(QUtil::utf8_to_utf16("\xcf\x80"));
|
std::string s(QUtil::utf8_to_utf16("\xcf\x80"));
|
||||||
std::cout << QUtil::utf16_to_utf8(s) << std::endl;
|
std::cout << QUtil::utf16_to_utf8(s) << std::endl;
|
||||||
std::cout << QUtil::utf16_to_utf8(s + ".") << std::endl;
|
std::cout << QUtil::utf16_to_utf8(s + ".") << std::endl;
|
||||||
|
std::cout << "LE: " << QUtil::utf16_to_utf8("\xff\xfe\xc0\x03") << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
void utf8_to_ascii_test()
|
void utf8_to_ascii_test()
|
||||||
|
@ -388,7 +389,8 @@ void transcoding_test()
|
||||||
check_analyze("pi = \317\200", true, true, false);
|
check_analyze("pi = \317\200", true, true, false);
|
||||||
check_analyze("pi != \317", true, false, false);
|
check_analyze("pi != \317", true, false, false);
|
||||||
check_analyze("pi != 22/7", false, false, false);
|
check_analyze("pi != 22/7", false, false, false);
|
||||||
check_analyze(std::string("\xfe\xff\00\x51", 4), true, false, true);
|
check_analyze(std::string("\xfe\xff\x00\x51", 4), true, false, true);
|
||||||
|
check_analyze(std::string("\xff\xfe\x51\x00", 4), true, false, true);
|
||||||
std::cout << "analysis done" << std::endl;
|
std::cout << "analysis done" << std::endl;
|
||||||
std::string input1("a\302\277b");
|
std::string input1("a\302\277b");
|
||||||
std::string input2("a\317\200b");
|
std::string input2("a\317\200b");
|
||||||
|
|
|
@ -73,7 +73,7 @@ flush_tiff_cache();
|
||||||
show_ntests();
|
show_ntests();
|
||||||
# ----------
|
# ----------
|
||||||
$td->notify("--- Character Encoding ---");
|
$td->notify("--- Character Encoding ---");
|
||||||
$n_tests += 3;
|
$n_tests += 4;
|
||||||
|
|
||||||
$td->runtest("PDF doc encoding to Unicode",
|
$td->runtest("PDF doc encoding to Unicode",
|
||||||
{$td->COMMAND => "test_pdf_doc_encoding pdf-doc-to-utf8.in"},
|
{$td->COMMAND => "test_pdf_doc_encoding pdf-doc-to-utf8.in"},
|
||||||
|
@ -88,6 +88,13 @@ $td->runtest("UTF-16 encoding errors",
|
||||||
{$td->FILE => "unicode-errors.out", $td->EXIT_STATUS => 0},
|
{$td->FILE => "unicode-errors.out", $td->EXIT_STATUS => 0},
|
||||||
$td->NORMALIZE_NEWLINES);
|
$td->NORMALIZE_NEWLINES);
|
||||||
|
|
||||||
|
# UTF-16LE is not allowed by the PDF spec, but it seems that most
|
||||||
|
# readers accept it.
|
||||||
|
$td->runtest("UTF-16LE strings",
|
||||||
|
{$td->COMMAND => "qpdf --list-attachments --verbose utf16le.pdf"},
|
||||||
|
{$td->FILE => "utf16le-attachments.out", $td->EXIT_STATUS => 0},
|
||||||
|
$td->NORMALIZE_NEWLINES);
|
||||||
|
|
||||||
# Tests to exercise QPDFArgParser belong in arg_parser.test in
|
# Tests to exercise QPDFArgParser belong in arg_parser.test in
|
||||||
# libtests. These tests are supposed to be specific to the qpdf cli.
|
# libtests. These tests are supposed to be specific to the qpdf cli.
|
||||||
# Since they were written prior to moving QPDFArgParser into the
|
# Since they were written prior to moving QPDFArgParser into the
|
||||||
|
|
8
qpdf/qtest/qpdf/utf16le-attachments.out
Normal file
8
qpdf/qtest/qpdf/utf16le-attachments.out
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
potato.png -> 6,0
|
||||||
|
preferred name: π.png
|
||||||
|
all names:
|
||||||
|
/F -> π.png
|
||||||
|
/UF -> π.png
|
||||||
|
all data streams:
|
||||||
|
/F -> 6,0
|
||||||
|
/UF -> 6,0
|
BIN
qpdf/qtest/qpdf/utf16le.pdf
Normal file
BIN
qpdf/qtest/qpdf/utf16le.pdf
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user