mirror of
https://github.com/qpdf/qpdf.git
synced 2024-12-22 10:58:58 +00:00
Fix edge case in character encoding (fixes #778)
Avoid representing as PDF Doc encoding any string whose PDF Doc encoding representation starts with a UTF-16 or UTF-8 marker.
This commit is contained in:
parent
4fb7d1335a
commit
f4ca04cec1
@@ -1,3 +1,9 @@
|
||||
2022-09-26 Jay Berkenbilt <ejb@ql.org>
|
||||
|
||||
* Bug fix: avoid using PDF Doc encoding for strings whose PDF Doc
|
||||
encoding representation starts with UTF-16 or UTF-8 markers. Fixes
|
||||
#778.
|
||||
|
||||
2022-09-14 Jay Berkenbilt <ejb@ql.org>
|
||||
|
||||
* 11.1.0: release
|
||||
|
@@ -1565,10 +1565,38 @@ transcode_utf8(
|
||||
{
|
||||
bool okay = true;
|
||||
result.clear();
|
||||
if (encoding == e_utf16) {
|
||||
result += "\xfe\xff";
|
||||
}
|
||||
size_t len = utf8_val.length();
|
||||
switch (encoding) {
|
||||
case e_utf16:
|
||||
result += "\xfe\xff";
|
||||
break;
|
||||
case e_pdfdoc:
|
||||
// We need to avoid having the result start with something
|
||||
// that will be interpreted as UTF-16 or UTF-8, meaning we
|
||||
// can't end up with a string that starts with "fe ff"
|
||||
// (UTF-16-BE), "ff fe" (UTF-16-LE, not officially part of the
|
||||
// PDF spec, but recognized by most readers including qpdf),
|
||||
// or "ef bb bf" (UTF-8). It's more efficient to check the
|
||||
// input string to see if it will map to one of those
|
||||
// sequences than to check the output string since all cases
|
||||
// start with the same starting character.
|
||||
if ((len >= 4) && (utf8_val[0] == '\xc3')) {
|
||||
static std::string fe_ff("\xbe\xc3\xbf");
|
||||
static std::string ff_fe("\xbf\xc3\xbe");
|
||||
static std::string ef_bb_bf("\xaf\xc2\xbb\xc2\xbf");
|
||||
// C++-20 has starts_with, but when this was written, qpdf
|
||||
// had a minimum supported version of C++-17.
|
||||
if ((utf8_val.compare(1, 3, fe_ff) == 0) ||
|
||||
(utf8_val.compare(1, 3, ff_fe) == 0) ||
|
||||
(utf8_val.compare(1, 5, ef_bb_bf) == 0)) {
|
||||
result += unknown;
|
||||
okay = false;
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
size_t pos = 0;
|
||||
while (pos < len) {
|
||||
bool error = false;
|
||||
|
@@ -436,6 +436,21 @@ transcoding_test()
|
||||
assert(!QUtil::utf8_to_pdf_doc(other_utf8, other_to_utf8));
|
||||
std::cout << other_to_utf8 << std::endl;
|
||||
std::cout << "done other characters" << std::endl;
|
||||
// These valid UTF8 strings when converted to PDFDoc would end up
|
||||
// with a byte sequence that would be recognized as UTF-8 or
|
||||
// UTF-16 rather than PDFDoc. A special case is required to store
|
||||
// them as UTF-16 rather than PDFDoc.
|
||||
static std::string fe_ff("\xc3\xbe\xc3\xbf potato");
|
||||
static std::string ff_fe("\xc3\xbf\xc3\xbe potato");
|
||||
static std::string ef_bb_bf("\xc3\xaf\xc2\xbb\xc2\xbf potato");
|
||||
assert(!QUtil::utf8_to_pdf_doc(fe_ff, pdfdoc));
|
||||
assert(pdfdoc == "?\xfe\xff potato");
|
||||
assert(!QUtil::utf8_to_pdf_doc(ff_fe, pdfdoc));
|
||||
assert(pdfdoc == "?\xff\xfe potato");
|
||||
assert(!QUtil::utf8_to_pdf_doc(ef_bb_bf, pdfdoc));
|
||||
assert(pdfdoc == "?\xef\xbb\xbf potato");
|
||||
assert(QUtil::utf8_to_pdf_doc("\xc3\xbe\xc3\xbe", pdfdoc));
|
||||
assert(QUtil::utf8_to_pdf_doc("\xc3\xaf\xc2\xbb\xc2\xbe", pdfdoc));
|
||||
}
|
||||
|
||||
void
|
||||
|
@@ -5,3 +5,8 @@ If you think wwwwww is good, you should try ʬʬʬʬʬʬ.
|
||||
𝄞 𝄢 𝄪 𝅂
|
||||
This can be encoded in ASCII.
|
||||
This can be encoded in PDFDocEncoding (€).
|
||||
þÿ -- PDFDoc would look like UTF-16-BE
|
||||
ÿþ -- PDFDoc would look like UTF-16-LE
|
||||
ï»¿ -- PDFDoc would look like UTF-8
|
||||
ï»» -- PDFDoc okay
|
||||
þþ -- PDFDoc okay
|
||||
|
@@ -5,3 +5,8 @@ If you think wwwwww is good, you should try ʬʬʬʬʬʬ. // <feff00490066002000
|
||||
𝄞 𝄢 𝄪 𝅂 // <feffd834dd1e0020d834dd220020d834dd2a0020d834dd42>
|
||||
This can be encoded in ASCII. // <546869732063616e20626520656e636f64656420696e2041534349492e>
|
||||
This can be encoded in PDFDocEncoding (€). // <546869732063616e20626520656e636f64656420696e20504446446f63456e636f64696e672028a0292e>
|
||||
þÿ -- PDFDoc would look like UTF-16-BE // <feff00fe00ff0020002d002d00200050004400460044006f006300200077006f0075006c00640020006c006f006f006b0020006c0069006b00650020005500540046002d00310036002d00420045>
|
||||
ÿþ -- PDFDoc would look like UTF-16-LE // <feff00ff00fe0020002d002d00200050004400460044006f006300200077006f0075006c00640020006c006f006f006b0020006c0069006b00650020005500540046002d00310036002d004c0045>
|
||||
ï»¿ -- PDFDoc would look like UTF-8 // <feff00ef00bb00bf0020002d002d00200050004400460044006f006300200077006f0075006c00640020006c006f006f006b0020006c0069006b00650020005500540046002d0038>
|
||||
ï»» -- PDFDoc okay // <efbbbb202d2d20504446446f63206f6b6179>
|
||||
þþ -- PDFDoc okay // <fefe202d2d20504446446f63206f6b6179>
|
||||
|
Loading…
Reference in New Issue
Block a user