From 337b9007088670363ff6444b2bffa7e8aa6498dc Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Sun, 23 Nov 2008 18:49:13 +0000 Subject: [PATCH] handle UTF-16BE fully git-svn-id: svn+q:///qpdf/trunk@639 71b93d88-0707-0410-a8cf-f5a4172ac649 --- ChangeLog | 6 ++++++ libqpdf/QPDF_String.cc | 40 ++++++++++++++++++++++++++++++++++---- qpdf/qpdf.testcov | 1 + qpdf/qtest/qpdf/misc-3.out | 1 + qpdf/qtest/qpdf/misc-3.pdf | 19 +++++++++--------- 5 files changed, 54 insertions(+), 13 deletions(-) diff --git a/ChangeLog b/ChangeLog index deb9ee43..8f143c15 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +2008-11-23 Jay Berkenbilt + + * libqpdf/QPDF_String.cc (QPDF_String::getUTF8Val): handle + UTF-16BE properly rather than just treating the string as a string + of 16-bit characters. + 2008-06-30 Jay Berkenbilt * 2.0.2: release diff --git a/libqpdf/QPDF_String.cc b/libqpdf/QPDF_String.cc index cc8ca042..739006b4 100644 --- a/libqpdf/QPDF_String.cc +++ b/libqpdf/QPDF_String.cc @@ -2,6 +2,8 @@ #include #include +#include + // DO NOT USE ctype -- it is locale dependent for some things, and // it's not worth the risk of including it in case it may accidentally // be used. @@ -159,12 +161,42 @@ QPDF_String::getUTF8Val() const (this->val[0] == '\xfe') && (this->val[1] == '\xff')) { // This is a Unicode string using big-endian UTF-16. This - // code is not actually correct as it doesn't properly handle - // characters past 0xffff. + // code uses unsigned long and unsigned short to hold + // codepoint values. It requires unsigned long to be at least + // 32 bits and unsigned short to be at least 16 bits, but it + // will work fine if they are larger. + unsigned long codepoint = 0L; for (unsigned int i = 2; i < len; i += 2) { - result += QUtil::toUTF8(((unsigned char) this->val[i] << 8) + - ((unsigned char) this->val[i+1])); + // Convert from UTF16-BE. If we get a malformed + // codepoint, this code will generate incorrect output + // without giving a warning. Specifically, a high + // codepoint not followed by a low codepoint will be + // discarded, and a low codepoint not preceded by a high + // codepoint will just get its low 10 bits output. + unsigned short bits = + (((unsigned char) this->val[i]) << 8) + + ((unsigned char) this->val[i+1]); + if ((bits & 0xFC00) == 0xD800) + { + codepoint = 0x10000 + ((bits & 0x3FF) << 10); + continue; + } + else if ((bits & 0xFC00) == 0xDC00) + { + if (codepoint != 0) + { + QTC::TC("qpdf", "QPDF_String non-trivial UTF-16"); + } + codepoint += bits & 0x3FF; + } + else + { + codepoint = bits; + } + + result += QUtil::toUTF8(codepoint); + codepoint = 0; } } else diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov index e6323600..0c2c0416 100644 --- a/qpdf/qpdf.testcov +++ b/qpdf/qpdf.testcov @@ -115,3 +115,4 @@ QPDF_Stream pipeStreamData with null pipeline 0 QPDFWriter not recompressing /FlateDecode 0 QPDF piping xref stream from encrypted file 0 unable to filter 0 +QPDF_String non-trivial UTF-16 0 diff --git a/qpdf/qtest/qpdf/misc-3.out b/qpdf/qtest/qpdf/misc-3.out index f9c89df2..390e9a61 100644 --- a/qpdf/qtest/qpdf/misc-3.out +++ b/qpdf/qtest/qpdf/misc-3.out @@ -8,6 +8,7 @@ QStrings: No Special Characters These: ¿÷¢þ and no more πωτατω +treble clef: 𝄠; sixteenth note: 𝅘𝅥𝅮 QNumbers: 1.000 3.142 diff --git a/qpdf/qtest/qpdf/misc-3.pdf b/qpdf/qtest/qpdf/misc-3.pdf index 6b9aa3c7..4225d239 100644 --- a/qpdf/qtest/qpdf/misc-3.pdf +++ b/qpdf/qtest/qpdf/misc-3.pdf @@ -13,6 +13,7 @@ (No Special Characters) (These: and no more) + ] /Type /Catalog >> @@ -109,19 +110,19 @@ xref 0 10 0000000000 65535 f 0000000025 00000 n -0000000226 00000 n -0000000308 00000 n -0000000543 00000 n -0000000642 00000 n -0000000684 00000 n -0000000782 00000 n -0000000801 00000 n -0000000919 00000 n +0000000377 00000 n +0000000459 00000 n +0000000694 00000 n +0000000793 00000 n +0000000835 00000 n +0000000933 00000 n +0000000952 00000 n +0000001070 00000 n trailer << /Root 1 0 R /Size 10 /ID [<76269ee0b6579446b731e060af8ef436>] >> startxref -954 +1105 %%EOF