From f4ca04cec1a0c4a3c8341ff15f68c06bed89c0d7 Mon Sep 17 00:00:00 2001
From: Jay Berkenbilt <ejb@ql.org>
Date: Mon, 26 Sep 2022 08:05:28 -0400
Subject: [PATCH] Fix edge case in character encoding (fixes #778)

Avoid representing as PDF Doc encoding any string whose PDF Doc
encoding representation starts with a UTF-16 or UTF-8 marker.
---
 ChangeLog                   |  6 ++++++
 libqpdf/QUtil.cc            | 34 +++++++++++++++++++++++++++++++---
 libtests/qutil.cc           | 15 +++++++++++++++
 qpdf/qtest/qpdf/unicode.in  |  5 +++++
 qpdf/qtest/qpdf/unicode.out |  5 +++++
 5 files changed, 62 insertions(+), 3 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 525adb1d..4c1840e4 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2022-09-26  Jay Berkenbilt  <ejb@ql.org>
+
+        * Bug fix: avoid using PDF Doc encoding for strings whose PDF Doc
+        encoding representation starts with UTF-16 or UTF-8 markers. Fixes
+        #778.
+
 2022-09-14  Jay Berkenbilt  <ejb@ql.org>
 
         * 11.1.0: release
diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc
index bcf4aa4e..7f23bd03 100644
--- a/libqpdf/QUtil.cc
+++ b/libqpdf/QUtil.cc
@@ -1565,10 +1565,38 @@ transcode_utf8(
 {
     bool okay = true;
     result.clear();
-    if (encoding == e_utf16) {
-        result += "\xfe\xff";
-    }
     size_t len = utf8_val.length();
+    switch (encoding) {
+    case e_utf16:
+        result += "\xfe\xff";
+        break;
+    case e_pdfdoc:
+        // We need to avoid having the result start with something
+        // that will be interpreted as UTF-16 or UTF-8, meaning we
+        // can't end up with a string that starts with "fe ff"
+        // (UTF-16-BE), "ff fe" (UTF-16-LE, not officially part of the
+        // PDF spec, but recognized by most readers including qpdf),
+        // or "ef bb bf" (UTF-8). It's more efficient to check the
+        // input string to see if it will map to one of those
+        // sequences than to check the output string, since all three
+        // cases start with the same UTF-8 input byte (0xc3).
+        if ((len >= 4) && (utf8_val[0] == '\xc3')) {
+            static std::string fe_ff("\xbe\xc3\xbf");
+            static std::string ff_fe("\xbf\xc3\xbe");
+            static std::string ef_bb_bf("\xaf\xc2\xbb\xc2\xbf");
+            // C++20 has starts_with, but when this was written, qpdf
+            // had a minimum supported version of C++17.
+            if ((utf8_val.compare(1, 3, fe_ff) == 0) ||
+                (utf8_val.compare(1, 3, ff_fe) == 0) ||
+                (utf8_val.compare(1, 5, ef_bb_bf) == 0)) {
+                result += unknown;
+                okay = false;
+            }
+        }
+        break;
+    default:
+        break;
+    }
     size_t pos = 0;
     while (pos < len) {
         bool error = false;
diff --git a/libtests/qutil.cc b/libtests/qutil.cc
index 82c2dd1a..972046b9 100644
--- a/libtests/qutil.cc
+++ b/libtests/qutil.cc
@@ -436,6 +436,21 @@ transcoding_test()
     assert(!QUtil::utf8_to_pdf_doc(other_utf8, other_to_utf8));
     std::cout << other_to_utf8 << std::endl;
     std::cout << "done other characters" << std::endl;
+    // These valid UTF-8 strings, when converted to PDFDoc, would end
+    // up with a byte sequence that would be recognized as UTF-8 or
+    // UTF-16 rather than PDFDoc. A special case is required to store
+    // them as UTF-16 rather than PDFDoc.
+    static std::string fe_ff("\xc3\xbe\xc3\xbf potato");
+    static std::string ff_fe("\xc3\xbf\xc3\xbe potato");
+    static std::string ef_bb_bf("\xc3\xaf\xc2\xbb\xc2\xbf potato");
+    assert(!QUtil::utf8_to_pdf_doc(fe_ff, pdfdoc));
+    assert(pdfdoc == "?\xfe\xff potato");
+    assert(!QUtil::utf8_to_pdf_doc(ff_fe, pdfdoc));
+    assert(pdfdoc == "?\xff\xfe potato");
+    assert(!QUtil::utf8_to_pdf_doc(ef_bb_bf, pdfdoc));
+    assert(pdfdoc == "?\xef\xbb\xbf potato");
+    assert(QUtil::utf8_to_pdf_doc("\xc3\xbe\xc3\xbe", pdfdoc));
+    assert(QUtil::utf8_to_pdf_doc("\xc3\xaf\xc2\xbb\xc2\xbe", pdfdoc));
 }
 
 void
diff --git a/qpdf/qtest/qpdf/unicode.in b/qpdf/qtest/qpdf/unicode.in
index 2984b5f3..1ddf1178 100644
--- a/qpdf/qtest/qpdf/unicode.in
+++ b/qpdf/qtest/qpdf/unicode.in
@@ -5,3 +5,8 @@ If you think wwwwww is good, you should try ʬʬʬʬʬʬ.
 𝄞 𝄢 𝄪 𝅂
 This can be encoded in ASCII.
 This can be encoded in PDFDocEncoding (€).
+þÿ -- PDFDoc would look like UTF-16-BE
+ÿþ -- PDFDoc would look like UTF-16-LE
+ï»¿ -- PDFDoc would look like UTF-8
+ï»» -- PDFDoc okay
+þþ -- PDFDoc okay
diff --git a/qpdf/qtest/qpdf/unicode.out b/qpdf/qtest/qpdf/unicode.out
index c1901585..4f8ee322 100644
--- a/qpdf/qtest/qpdf/unicode.out
+++ b/qpdf/qtest/qpdf/unicode.out
@@ -5,3 +5,8 @@ If you think wwwwww is good, you should try ʬʬʬʬʬʬ. // <feff00490066002000
 𝄞 𝄢 𝄪 𝅂 // <feffd834dd1e0020d834dd220020d834dd2a0020d834dd42>
 This can be encoded in ASCII. // <546869732063616e20626520656e636f64656420696e2041534349492e>
 This can be encoded in PDFDocEncoding (€). // <546869732063616e20626520656e636f64656420696e20504446446f63456e636f64696e672028a0292e>
+þÿ -- PDFDoc would look like UTF-16-BE // <feff00fe00ff0020002d002d00200050004400460044006f006300200077006f0075006c00640020006c006f006f006b0020006c0069006b00650020005500540046002d00310036002d00420045>
+ÿþ -- PDFDoc would look like UTF-16-LE // <feff00ff00fe0020002d002d00200050004400460044006f006300200077006f0075006c00640020006c006f006f006b0020006c0069006b00650020005500540046002d00310036002d004c0045>
+ï»¿ -- PDFDoc would look like UTF-8 // <feff00ef00bb00bf0020002d002d00200050004400460044006f006300200077006f0075006c00640020006c006f006f006b0020006c0069006b00650020005500540046002d0038>
+ï»» -- PDFDoc okay // <efbbbb202d2d20504446446f63206f6b6179>
+þþ -- PDFDoc okay // <fefe202d2d20504446446f63206f6b6179>