From f7ac5915909c7197acf84265f8d8ad41b95a36a8 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Tue, 22 Feb 2022 08:04:11 -0500 Subject: [PATCH] Recognize explicit UTF-8 strings (fixes #654) --- ChangeLog | 5 +++++ TODO | 8 ++++++++ libqpdf/QPDF_String.cc | 9 +++++++++ manual/release-notes.rst | 3 +++ qpdf/qtest/qpdf/numeric-and-string-3.out | 3 ++- qpdf/qtest/qpdf/numeric-and-string-3.pdf | 21 +++++++++++---------- 6 files changed, 38 insertions(+), 11 deletions(-) diff --git a/ChangeLog b/ChangeLog index 0622b834..da642862 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2022-02-22 Jay Berkenbilt + + * Recognize PDF strings explicitly marked as UTF-8 as allowed by + the PDF 2.0 spec. Fixes #654. + 2022-02-18 Jay Berkenbilt * Bug fix: when generating appearance streams, the font size was diff --git a/TODO b/TODO index c1b1b440..6de30079 100644 --- a/TODO +++ b/TODO @@ -10,6 +10,14 @@ Priorities for 11: * PointerHolder -> shared_ptr * ABI +Misc +* Get rid of "ugly switch statements" in QUtil.cc -- replace with + static map initializers. (Search for "ugly switch statements" below + as well.) +* Consider exposing get_next_utf8_codepoint in QUtil +* Add QUtil::is_explicit_utf8 that does what QPDF_String::getUTF8Val + does to detect UTF-8 encoded strings per PDF 2.0 spec. + Soon: Break ground on "Document-level work" Code Formatting diff --git a/libqpdf/QPDF_String.cc b/libqpdf/QPDF_String.cc index f0153b1c..931ccd61 100644 --- a/libqpdf/QPDF_String.cc +++ b/libqpdf/QPDF_String.cc @@ -183,6 +183,15 @@ QPDF_String::getUTF8Val() const { return QUtil::utf16_to_utf8(this->val); } + else if ((val.length() >= 3) && + (val[0] == '\xEF') && + (val[1] == '\xBB') && + (val[2] == '\xBF')) + { + // PDF 2.0 allows UTF-8 strings when explicitly prefixed with + // the above bytes, which is just UTF-8 encoding of U+FEFF. + return this->val.substr(3); + } else { return QUtil::pdf_doc_to_utf8(this->val); diff --git a/manual/release-notes.rst b/manual/release-notes.rst index daec3b25..d7959b96 100644 --- a/manual/release-notes.rst +++ b/manual/release-notes.rst @@ -9,6 +9,9 @@ For a detailed list of changes, please see the file 10.6.3: XXX - Bug fixes: + - Recognize strings explicitly encoded as UTF-8 as allowed by the + PDF 2.0 spec. + - Fix edge cases with appearance stream generation for form fields whose ``/DA`` field lacks proper font size specification or that specifies auto sizing. At this time, qpdf does not support auto diff --git a/qpdf/qtest/qpdf/numeric-and-string-3.out b/qpdf/qtest/qpdf/numeric-and-string-3.out index 390e9a61..0774b228 100644 --- a/qpdf/qtest/qpdf/numeric-and-string-3.out +++ b/qpdf/qtest/qpdf/numeric-and-string-3.out @@ -7,8 +7,9 @@ end page 1 QStrings: No Special Characters These: ¿÷¢þ and no more +Explicit utf-8 with π πωτατω -treble clef: 𝄠; sixteenth note: 𝅘𝅥𝅮 +treble clef: 𝄠; sixteenth note: 𝅘𝅥𝅯 QNumbers: 1.000 3.142 diff --git a/qpdf/qtest/qpdf/numeric-and-string-3.pdf b/qpdf/qtest/qpdf/numeric-and-string-3.pdf index 4225d239..b6073704 100644 --- a/qpdf/qtest/qpdf/numeric-and-string-3.pdf +++ b/qpdf/qtest/qpdf/numeric-and-string-3.pdf @@ -12,8 +12,9 @@ /QStrings [ (No Special Characters) (These: and no more) + (\357\273\277Explicit utf-8 with \317\200) - + ] /Type /Catalog >> @@ -110,19 +111,19 @@ xref 0 10 0000000000 65535 f 0000000025 00000 n -0000000377 00000 n -0000000459 00000 n -0000000694 00000 n -0000000793 00000 n -0000000835 00000 n -0000000933 00000 n -0000000952 00000 n -0000001070 00000 n +0000000424 00000 n +0000000506 00000 n +0000000741 00000 n +0000000840 00000 n +0000000882 00000 n +0000000980 00000 n +0000000999 00000 n +0000001117 00000 n trailer << /Root 1 0 R /Size 10 /ID [<76269ee0b6579446b731e060af8ef436>] >> startxref -1105 +1152 %%EOF