handle UTF-16BE fully

git-svn-id: svn+q:///qpdf/trunk@639 71b93d88-0707-0410-a8cf-f5a4172ac649
This commit is contained in:
Jay Berkenbilt 2008-11-23 18:49:13 +00:00
parent 6e07eb1aae
commit 337b900708
5 changed files with 54 additions and 13 deletions

View File

@ -1,3 +1,9 @@
2008-11-23 Jay Berkenbilt <ejb@ql.org>
* libqpdf/QPDF_String.cc (QPDF_String::getUTF8Val): handle
UTF-16BE properly rather than just treating the string as a string
of 16-bit characters.
2008-06-30 Jay Berkenbilt <ejb@ql.org>
* 2.0.2: release

View File

@ -2,6 +2,8 @@
#include <qpdf/QPDF_String.hh>
#include <qpdf/QUtil.hh>
#include <qpdf/QTC.hh>
// DO NOT USE ctype -- it is locale dependent for some things, and
// it's not worth the risk of including it in case it may accidentally
// be used.
@ -159,12 +161,42 @@ QPDF_String::getUTF8Val() const
(this->val[0] == '\xfe') && (this->val[1] == '\xff'))
{
// This is a Unicode string using big-endian UTF-16. This
// code is not actually correct as it doesn't properly handle
// characters past 0xffff.
// code uses unsigned long and unsigned short to hold
// codepoint values. It requires unsigned long to be at least
// 32 bits and unsigned short to be at least 16 bits, but it
// will work fine if they are larger.
unsigned long codepoint = 0L;
for (unsigned int i = 2; i < len; i += 2)
{
result += QUtil::toUTF8(((unsigned char) this->val[i] << 8) +
((unsigned char) this->val[i+1]));
// Convert from UTF-16BE. If we get a malformed
// surrogate pair, this code will generate incorrect output
// without giving a warning. Specifically, a high
// surrogate not followed by a low surrogate will be
// discarded, and a low surrogate not preceded by a high
// surrogate will just get its low 10 bits output.
unsigned short bits =
(((unsigned char) this->val[i]) << 8) +
((unsigned char) this->val[i+1]);
if ((bits & 0xFC00) == 0xD800)
{
codepoint = 0x10000 + ((bits & 0x3FF) << 10);
continue;
}
else if ((bits & 0xFC00) == 0xDC00)
{
if (codepoint != 0)
{
QTC::TC("qpdf", "QPDF_String non-trivial UTF-16");
}
codepoint += bits & 0x3FF;
}
else
{
codepoint = bits;
}
result += QUtil::toUTF8(codepoint);
codepoint = 0;
}
}
else

View File

@ -115,3 +115,4 @@ QPDF_Stream pipeStreamData with null pipeline 0
QPDFWriter not recompressing /FlateDecode 0
QPDF piping xref stream from encrypted file 0
unable to filter 0
QPDF_String non-trivial UTF-16 0

View File

@ -8,6 +8,7 @@ QStrings:
No Special Characters
These: ¿÷¢þ and no more
πωτατω
treble clef: 𝄠; sixteenth note: 𝅘𝅥𝅮
QNumbers:
1.000
3.142

View File

@ -13,6 +13,7 @@
(No Special Characters)
(These: ¿÷¢þ and no more)
<feff03c003c903c403b103c403c9>
<feff0074007200650062006c006500200063006c00650066003a0020d834dd20003b0020007300690078007400650065006e007400680020006e006f00740065003a0020d834dd60>
]
/Type /Catalog
>>
@ -109,19 +110,19 @@ xref
0 10
0000000000 65535 f
0000000025 00000 n
0000000226 00000 n
0000000308 00000 n
0000000543 00000 n
0000000642 00000 n
0000000684 00000 n
0000000782 00000 n
0000000801 00000 n
0000000919 00000 n
0000000377 00000 n
0000000459 00000 n
0000000694 00000 n
0000000793 00000 n
0000000835 00000 n
0000000933 00000 n
0000000952 00000 n
0000001070 00000 n
trailer <<
/Root 1 0 R
/Size 10
/ID [<e017d8dc1fe53a81e40aa79bcb43fdec><76269ee0b6579446b731e060af8ef436>]
>>
startxref
954
1105
%%EOF