From 1065bbb0165b4608bd715866332751be9213cd51 Mon Sep 17 00:00:00 2001
From: Jay Berkenbilt <ejb@ql.org>
Date: Tue, 15 Feb 2022 08:29:29 -0500
Subject: [PATCH] Handle odd PDFDoc codepoints in UTF-8 during transcoding
 (fixes #650)

There are codepoints in PDFDoc that are not valid UTF-8 but map to
valid UTF-8. We were handling those correctly with bidirectional
mapping.

However, if those same code points appeared in UTF-8, where they have
no meaning, they were left as fixed points when converting to PDFDoc,
where they do have meaning. This change recognizes them as errors.
---
 ChangeLog                      |  6 ++++++
 libqpdf/QUtil.cc               | 17 +++++++++++++++++
 libtests/qtest/qutil/qutil.out |  3 ++-
 libtests/qutil.cc              | 13 ++++++++++---
 4 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index f6879a4d..b63a785d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2022-02-15  Jay Berkenbilt  <ejb@ql.org>
+
+	* Don't map 0x18 through 0x1f, 0x7f, 0x9f, or 0xad as fixed points
+	when transcoding UTF-8 to PDFDoc. These codepoints have different
+	meanings in those two encoding systems. Fixes #650.
+
 2022-02-11  Jay Berkenbilt  <ejb@ql.org>
 
 	* 10.6.1: release
diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc
index c4aa3afb..f01746b6 100644
--- a/libqpdf/QUtil.cc
+++ b/libqpdf/QUtil.cc
@@ -2272,6 +2272,16 @@ transcode_utf8(std::string const& utf8_val, std::string& result,
             {
                 result += QUtil::toUTF16(QIntC::to_ulong(ch));
             }
+            else if ((encoding == e_pdfdoc) &&
+                     (((ch >= 0x18) && (ch <= 0x1f)) || (ch == 127)))
+            {
+                // PDFDocEncoding maps some low characters to Unicode,
+                // so if we encounter those invalid UTF-8 code points,
+                // map them to unknown so reversing the mapping
+                // doesn't change them into other characters.
+                okay = false;
+                result.append(1, unknown);
+            }
             else
             {
                 result.append(1, ch);
@@ -2281,6 +2291,13 @@ transcode_utf8(std::string const& utf8_val, std::string& result,
         {
             result += QUtil::toUTF16(codepoint);
         }
+        else if ((codepoint == 0xad) && (encoding == e_pdfdoc))
+        {
+            // PDFDocEncoding omits 0x00ad (soft hyphen), but rather
+            // than treating it as undefined, map it to a regular
+            // hyphen.
+            result.append(1, '-');
+        }
         else if ((codepoint > 160) && (codepoint < 256) &&
                  ((encoding == e_winansi) || (encoding == e_pdfdoc)))
         {
diff --git a/libtests/qtest/qutil/qutil.out b/libtests/qtest/qutil/qutil.out
index aedf49e1..fa284237 100644
--- a/libtests/qtest/qutil/qutil.out
+++ b/libtests/qtest/qutil/qutil.out
@@ -88,7 +88,8 @@ alternatives
 2: 83a9e99e
 0: 717561636b
 done alternatives
-w˘wˇwˆw˙w˝w˛w˚w˜w�w�w
+w˘wˇwˆw˙w˝w˛w˚w˜w�w�w�w
+w?w?w?w?w?w?w?w?w?w?w-w
 done other characters
 ---- whoami
 quack1
diff --git a/libtests/qutil.cc b/libtests/qutil.cc
index 2142346e..2e4d9cdd 100644
--- a/libtests/qutil.cc
+++ b/libtests/qutil.cc
@@ -418,9 +418,16 @@ void transcoding_test()
     print_alternatives(utf8);
     print_alternatives("quack");
     std::cout << "done alternatives" << std::endl;
-    std::string other = QUtil::pdf_doc_to_utf8(
-        "w\030w\031w\032w\033w\034w\035w\036w\037w\177w\255w");
-    std::cout << other << std::endl;
+    // These characters are either valid in PDFDoc and invalid in
+    // UTF-8 or the other way around.
+    std::string other("w\x18w\x19w\x1aw\x1bw\x1cw\x1dw\x1ew\x1fw\x7fw");
+    std::string other_doc = other + "\x9fw\xadw";
+    std::cout << QUtil::pdf_doc_to_utf8(other_doc) << std::endl;
+    std::string other_utf8 =
+        other + QUtil::toUTF8(0x9f) + "w" + QUtil::toUTF8(0xad) + "w";
+    std::string other_to_utf8;
+    assert(! QUtil::utf8_to_pdf_doc(other_utf8, other_to_utf8));
+    std::cout << other_to_utf8 << std::endl;
     std::cout << "done other characters" << std::endl;
 }