From 1065bbb0165b4608bd715866332751be9213cd51 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Tue, 15 Feb 2022 08:29:29 -0500 Subject: [PATCH] Handle odd PDFDoc codepoints in UTF-8 during transcoding (fixes #650) There are codepoints in PDFDoc that are not valid UTF-8 but map to valid UTF-8. We were handling those correctly with bidirectional mapping. However, if those same code points appeared in UTF-8, where they have no meaning, they were left as fixed points when converting to PDFDoc, where they do have meaning. This change recognizes them as errors. --- ChangeLog | 6 ++++++ libqpdf/QUtil.cc | 17 +++++++++++++++++ libtests/qtest/qutil/qutil.out | 3 ++- libtests/qutil.cc | 13 ++++++++++--- 4 files changed, 35 insertions(+), 4 deletions(-) diff --git a/ChangeLog b/ChangeLog index f6879a4d..b63a785d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +2022-02-15 Jay Berkenbilt + + * Don't map 0x18 through 0x1f, 0x7f, 0x9f, or 0xad as fixed points + when transcoding UTF-8 to PDFDoc. These codepoints have different + meanings in those two encoding systems. Fixes #650. + 2022-02-11 Jay Berkenbilt * 10.6.1: release diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc index c4aa3afb..f01746b6 100644 --- a/libqpdf/QUtil.cc +++ b/libqpdf/QUtil.cc @@ -2272,6 +2272,16 @@ transcode_utf8(std::string const& utf8_val, std::string& result, { result += QUtil::toUTF16(QIntC::to_ulong(ch)); } + else if ((encoding == e_pdfdoc) && + (((ch >= 0x18) && (ch <= 0x1f)) || (ch == 127))) + { + // PDFDocEncoding maps some low characters to Unicode, + // so if we encounter those invalid UTF-8 code points, + // map them to unknown so reversing the mapping + // doesn't change them into other characters. 
+ okay = false; + result.append(1, unknown); + } else { result.append(1, ch); @@ -2281,6 +2291,13 @@ transcode_utf8(std::string const& utf8_val, std::string& result, { result += QUtil::toUTF16(codepoint); } + else if ((codepoint == 0xad) && (encoding == e_pdfdoc)) + { + // PDFDocEncoding omits 0x00ad (soft hyphen), but rather + // than treating it as undefined, map it to a regular + // hyphen. + result.append(1, '-'); + } else if ((codepoint > 160) && (codepoint < 256) && ((encoding == e_winansi) || (encoding == e_pdfdoc))) { diff --git a/libtests/qtest/qutil/qutil.out b/libtests/qtest/qutil/qutil.out index aedf49e1..fa284237 100644 --- a/libtests/qtest/qutil/qutil.out +++ b/libtests/qtest/qutil/qutil.out @@ -88,7 +88,8 @@ alternatives 2: 83a9e99e 0: 717561636b done alternatives -w˘wˇwˆw˙w˝w˛w˚w˜w�w�w +w˘wˇwˆw˙w˝w˛w˚w˜w�w�w�w +w?w?w?w?w?w?w?w?w?w?w-w done other characters ---- whoami quack1 diff --git a/libtests/qutil.cc b/libtests/qutil.cc index 2142346e..2e4d9cdd 100644 --- a/libtests/qutil.cc +++ b/libtests/qutil.cc @@ -418,9 +418,16 @@ void transcoding_test() print_alternatives(utf8); print_alternatives("quack"); std::cout << "done alternatives" << std::endl; - std::string other = QUtil::pdf_doc_to_utf8( - "w\030w\031w\032w\033w\034w\035w\036w\037w\177w\255w"); - std::cout << other << std::endl; + // These characters are either valid in PDFDoc and invalid in + // UTF-8 or the other way around. + std::string other("w\x18w\x19w\x1aw\x1bw\x1cw\x1dw\x1ew\x1fw\x7fw"); + std::string other_doc = other + "\x9fw\xadw"; + std::cout << QUtil::pdf_doc_to_utf8(other_doc) << std::endl; + std::string other_utf8 = + other + QUtil::toUTF8(0x9f) + "w" + QUtil::toUTF8(0xad) + "w"; + std::string other_to_utf8; + assert(! QUtil::utf8_to_pdf_doc(other_utf8, other_to_utf8)); + std::cout << other_to_utf8 << std::endl; std::cout << "done other characters" << std::endl; }