From f4ca04cec1a0c4a3c8341ff15f68c06bed89c0d7 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Mon, 26 Sep 2022 08:05:28 -0400 Subject: [PATCH] Fix edge case in character encoding (fixes #778) Avoid representing as PDF Doc encoding any string whose PDF Doc encoding representation starts with a UTF-16 or UTF-8 marker. --- ChangeLog | 6 ++++++ libqpdf/QUtil.cc | 34 +++++++++++++++++++++++++++++++--- libtests/qutil.cc | 15 +++++++++++++++ qpdf/qtest/qpdf/unicode.in | 5 +++++ qpdf/qtest/qpdf/unicode.out | 5 +++++ 5 files changed, 62 insertions(+), 3 deletions(-) diff --git a/ChangeLog b/ChangeLog index 525adb1d..4c1840e4 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +2022-09-26 Jay Berkenbilt + + * Bug fix: avoid using PDF Doc encoding for strings whose PDF Doc + encoding representation starts with UTF-16 or UTF-8 markers. Fixes + #778. + 2022-09-14 Jay Berkenbilt * 11.1.0: release diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc index bcf4aa4e..7f23bd03 100644 --- a/libqpdf/QUtil.cc +++ b/libqpdf/QUtil.cc @@ -1565,10 +1565,38 @@ transcode_utf8( { bool okay = true; result.clear(); - if (encoding == e_utf16) { - result += "\xfe\xff"; - } size_t len = utf8_val.length(); + switch (encoding) { + case e_utf16: + result += "\xfe\xff"; + break; + case e_pdfdoc: + // We need to avoid having the result start with something + // that will be interpreted as UTF-16 or UTF-8, meaning we + // can't end up with a string that starts with "fe ff", + // (UTF-16-BE) "ff fe" (UTF-16-LE, not officially part of the + // PDF spec, but recognized by most readers including qpdf), + // or "ef bb bf" (UTF-8). It's more efficient to check the + // input string to see if it will map to one of those + // sequences than to check the output string since all cases + // start with the same starting character. 
+ if ((len >= 4) && (utf8_val[0] == '\xc3')) { + static std::string fe_ff("\xbe\xc3\xbf"); + static std::string ff_fe("\xbf\xc3\xbe"); + static std::string ef_bb_bf("\xaf\xc2\xbb\xc2\xbf"); + // C++-20 has starts_with, but when this was written, qpdf + // had a minimum supported version of C++-17. + if ((utf8_val.compare(1, 3, fe_ff) == 0) || + (utf8_val.compare(1, 3, ff_fe) == 0) || + (utf8_val.compare(1, 5, ef_bb_bf) == 0)) { + result += unknown; + okay = false; + } + } + break; + default: + break; + } size_t pos = 0; while (pos < len) { bool error = false; diff --git a/libtests/qutil.cc b/libtests/qutil.cc index 82c2dd1a..972046b9 100644 --- a/libtests/qutil.cc +++ b/libtests/qutil.cc @@ -436,6 +436,21 @@ transcoding_test() assert(!QUtil::utf8_to_pdf_doc(other_utf8, other_to_utf8)); std::cout << other_to_utf8 << std::endl; std::cout << "done other characters" << std::endl; + // These valid UTF8 strings when converted to PDFDoc would end up + // with a byte sequence that would be recognized as UTF-8 or + // UTF-16 rather than PDFDoc. A special case is required to store + // them as UTF-16 rather than PDFDoc. + static std::string fe_ff("\xc3\xbe\xc3\xbf potato"); + static std::string ff_fe("\xc3\xbf\xc3\xbe potato"); + static std::string ef_bb_bf("\xc3\xaf\xc2\xbb\xc2\xbf potato"); + assert(!QUtil::utf8_to_pdf_doc(fe_ff, pdfdoc)); + assert(pdfdoc == "?\xfe\xff potato"); + assert(!QUtil::utf8_to_pdf_doc(ff_fe, pdfdoc)); + assert(pdfdoc == "?\xff\xfe potato"); + assert(!QUtil::utf8_to_pdf_doc(ef_bb_bf, pdfdoc)); + assert(pdfdoc == "?\xef\xbb\xbf potato"); + assert(QUtil::utf8_to_pdf_doc("\xc3\xbe\xc3\xbe", pdfdoc)); + assert(QUtil::utf8_to_pdf_doc("\xc3\xaf\xc2\xbb\xc2\xbe", pdfdoc)); } void diff --git a/qpdf/qtest/qpdf/unicode.in b/qpdf/qtest/qpdf/unicode.in index 2984b5f3..1ddf1178 100644 --- a/qpdf/qtest/qpdf/unicode.in +++ b/qpdf/qtest/qpdf/unicode.in @@ -5,3 +5,8 @@ If you think wwwwww is good, you should try ʬʬʬʬʬʬ. 𝄞 𝄢 𝄪 𝅂 This can be encoded in ASCII. 
This can be encoded in PDFDocEncoding (€). +þÿ -- PDFDoc would look like UTF-16-BE +ÿþ -- PDFDoc would look like UTF-16-LE +ï»¿ -- PDFDoc would look like UTF-8 +ï»» -- PDFDoc okay +þþ -- PDFDoc okay diff --git a/qpdf/qtest/qpdf/unicode.out b/qpdf/qtest/qpdf/unicode.out index c1901585..4f8ee322 100644 --- a/qpdf/qtest/qpdf/unicode.out +++ b/qpdf/qtest/qpdf/unicode.out @@ -5,3 +5,8 @@ If you think wwwwww is good, you should try ʬʬʬʬʬʬ. // This can be encoded in ASCII. // <546869732063616e20626520656e636f64656420696e2041534349492e> This can be encoded in PDFDocEncoding (€). // <546869732063616e20626520656e636f64656420696e20504446446f63456e636f64696e672028a0292e> +þÿ -- PDFDoc would look like UTF-16-BE // +ÿþ -- PDFDoc would look like UTF-16-LE // +ï»¿ -- PDFDoc would look like UTF-8 // +ï»» -- PDFDoc okay // +þþ -- PDFDoc okay //