mirror of
https://github.com/qpdf/qpdf.git
synced 2025-01-08 17:24:06 +00:00
Move remaining existing transcoding to QUtil
This commit is contained in:
parent
5cfcd4f361
commit
698485468a
@ -147,13 +147,18 @@ namespace QUtil
|
||||
std::string toUTF8(unsigned long uval);
|
||||
|
||||
// Return a string containing the byte representation of the
|
||||
// UTF-16 BE encoding for the unicode value passed in.
|
||||
// UTF-16 big-endian encoding for the unicode value passed in.
|
||||
// Unrepresentable code points are converted to U+FFFD.
|
||||
QPDF_DLL
|
||||
std::string toUTF16(unsigned long uval);
|
||||
|
||||
// Convert a UTF-8 encoded string to UTF-16. Unrepresentable code
|
||||
// points are converted to U+FFFD.
|
||||
// Test whether this is a UTF-16 big-endian string. This is
|
||||
// indicated by the first two bytes being 0xFE 0xFF.
|
||||
QPDF_DLL
|
||||
bool is_utf16(std::string const&);
|
||||
|
||||
// Convert a UTF-8 encoded string to UTF-16 big-endian.
|
||||
// Unrepresentable code points are converted to U+FFFD.
|
||||
QPDF_DLL
|
||||
std::string utf8_to_utf16(std::string const& utf8);
|
||||
|
||||
@ -169,6 +174,24 @@ namespace QUtil
|
||||
QPDF_DLL
|
||||
std::string utf8_to_mac_roman(
|
||||
std::string const& utf8, char unknown_char = '?');
|
||||
QPDF_DLL
|
||||
std::string utf8_to_pdf_doc(
|
||||
std::string const& utf8, char unknown_char = '?');
|
||||
|
||||
// Convert a UTF-16 big-endian encoded string to UTF-8.
|
||||
// Unrepresentable code points are converted to U+FFFD.
|
||||
QPDF_DLL
|
||||
std::string utf16_to_utf8(std::string const& utf16);
|
||||
|
||||
// Convert from the specified single-byte encoding system to
|
||||
// UTF-8. There is no ascii_to_utf8 because all ASCII strings are
|
||||
// already valid UTF-8.
|
||||
QPDF_DLL
|
||||
std::string win_ansi_to_utf8(std::string const& win);
|
||||
QPDF_DLL
|
||||
std::string mac_roman_to_utf8(std::string const& mac);
|
||||
QPDF_DLL
|
||||
std::string pdf_doc_to_utf8(std::string const& pdfdoc);
|
||||
|
||||
// If secure random number generation is supported on your
|
||||
// platform and qpdf was not compiled with insecure random number
|
||||
|
@ -8,43 +8,6 @@
|
||||
// be used.
|
||||
#include <string.h>
|
||||
|
||||
// First element is 128
|
||||
static unsigned short pdf_doc_to_unicode[] = {
|
||||
0x2022, // 0x80 BULLET
|
||||
0x2020, // 0x81 DAGGER
|
||||
0x2021, // 0x82 DOUBLE DAGGER
|
||||
0x2026, // 0x83 HORIZONTAL ELLIPSIS
|
||||
0x2014, // 0x84 EM DASH
|
||||
0x2013, // 0x85 EN DASH
|
||||
0x0192, // 0x86 SMALL LETTER F WITH HOOK
|
||||
0x2044, // 0x87 FRACTION SLASH (solidus)
|
||||
0x2039, // 0x88 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
|
||||
0x203a, // 0x89 SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
|
||||
0x2212, // 0x8a MINUS SIGN
|
||||
0x2030, // 0x8b PER MILLE SIGN
|
||||
0x201e, // 0x8c DOUBLE LOW-9 QUOTATION MARK (quotedblbase)
|
||||
0x201c, // 0x8d LEFT DOUBLE QUOTATION MARK (double quote left)
|
||||
0x201d, // 0x8e RIGHT DOUBLE QUOTATION MARK (quotedblright)
|
||||
0x2018, // 0x8f LEFT SINGLE QUOTATION MARK (quoteleft)
|
||||
0x2019, // 0x90 RIGHT SINGLE QUOTATION MARK (quoteright)
|
||||
0x201a, // 0x91 SINGLE LOW-9 QUOTATION MARK (quotesinglbase)
|
||||
0x2122, // 0x92 TRADE MARK SIGN
|
||||
0xfb01, // 0x93 LATIN SMALL LIGATURE FI
|
||||
0xfb02, // 0x94 LATIN SMALL LIGATURE FL
|
||||
0x0141, // 0x95 LATIN CAPITAL LETTER L WITH STROKE
|
||||
0x0152, // 0x96 LATIN CAPITAL LIGATURE OE
|
||||
0x0160, // 0x97 LATIN CAPITAL LETTER S WITH CARON
|
||||
0x0178, // 0x98 LATIN CAPITAL LETTER Y WITH DIAERESIS
|
||||
0x017d, // 0x99 LATIN CAPITAL LETTER Z WITH CARON
|
||||
0x0131, // 0x9a LATIN SMALL LETTER DOTLESS I
|
||||
0x0142, // 0x9b LATIN SMALL LETTER L WITH STROKE
|
||||
0x0153, // 0x9c LATIN SMALL LIGATURE OE
|
||||
0x0161, // 0x9d LATIN SMALL LETTER S WITH CARON
|
||||
0x017e, // 0x9e LATIN SMALL LETTER Z WITH CARON
|
||||
0xfffd, // 0x9f UNDEFINED
|
||||
0x20ac, // 0xa0 EURO SIGN
|
||||
};
|
||||
|
||||
// See above about ctype.
|
||||
static bool is_ascii_printable(unsigned char ch)
|
||||
{
|
||||
@ -210,62 +173,12 @@ QPDF_String::getVal() const
|
||||
std::string
|
||||
QPDF_String::getUTF8Val() const
|
||||
{
|
||||
std::string result;
|
||||
size_t len = this->val.length();
|
||||
if ((len >= 2) && (len % 2 == 0) &&
|
||||
(this->val.at(0) == '\xfe') && (this->val.at(1) == '\xff'))
|
||||
if (QUtil::is_utf16(this->val))
|
||||
{
|
||||
// This is a Unicode string using big-endian UTF-16. This
|
||||
// code uses unsigned long and unsigned short to hold
|
||||
// codepoint values. It requires unsigned long to be at least
|
||||
// 32 bits and unsigned short to be at least 16 bits, but it
|
||||
// will work fine if they are larger.
|
||||
unsigned long codepoint = 0L;
|
||||
for (unsigned int i = 2; i < len; i += 2)
|
||||
{
|
||||
// Convert from UTF16-BE. If we get a malformed
|
||||
// codepoint, this code will generate incorrect output
|
||||
// without giving a warning. Specifically, a high
|
||||
// codepoint not followed by a low codepoint will be
|
||||
// discarded, and a low codepoint not preceded by a high
|
||||
// codepoint will just get its low 10 bits output.
|
||||
unsigned short bits =
|
||||
(static_cast<unsigned char>(this->val.at(i)) << 8) +
|
||||
static_cast<unsigned char>(this->val.at(i+1));
|
||||
if ((bits & 0xFC00) == 0xD800)
|
||||
{
|
||||
codepoint = 0x10000 + ((bits & 0x3FF) << 10);
|
||||
continue;
|
||||
}
|
||||
else if ((bits & 0xFC00) == 0xDC00)
|
||||
{
|
||||
if (codepoint != 0)
|
||||
{
|
||||
QTC::TC("qpdf", "QPDF_String non-trivial UTF-16");
|
||||
}
|
||||
codepoint += bits & 0x3FF;
|
||||
}
|
||||
else
|
||||
{
|
||||
codepoint = bits;
|
||||
}
|
||||
|
||||
result += QUtil::toUTF8(codepoint);
|
||||
codepoint = 0;
|
||||
}
|
||||
return QUtil::utf16_to_utf8(this->val);
|
||||
}
|
||||
else
|
||||
{
|
||||
for (unsigned int i = 0; i < len; ++i)
|
||||
{
|
||||
unsigned char ch = static_cast<unsigned char>(this->val.at(i));
|
||||
unsigned short val = ch;
|
||||
if ((ch >= 128) && (ch <= 160))
|
||||
{
|
||||
val = pdf_doc_to_unicode[ch - 128];
|
||||
}
|
||||
result += QUtil::toUTF8(val);
|
||||
}
|
||||
return QUtil::pdf_doc_to_utf8(this->val);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
267
libqpdf/QUtil.cc
267
libqpdf/QUtil.cc
@ -8,6 +8,7 @@
|
||||
#endif
|
||||
#include <qpdf/SecureRandomDataProvider.hh>
|
||||
#include <qpdf/QPDFSystemError.hh>
|
||||
#include <qpdf/QTC.hh>
|
||||
|
||||
#include <cmath>
|
||||
#include <iomanip>
|
||||
@ -29,6 +30,43 @@
|
||||
#include <sys/stat.h>
|
||||
#endif
|
||||
|
||||
// Unicode code points for the byte values 0x80 through 0xa0, in
// order. The first element is for byte 128, so index with
// (byte - 128). The table is only ever read, so it is declared
// const.
static unsigned short const pdf_doc_to_unicode[] = {
    0x2022,    // 0x80    BULLET
    0x2020,    // 0x81    DAGGER
    0x2021,    // 0x82    DOUBLE DAGGER
    0x2026,    // 0x83    HORIZONTAL ELLIPSIS
    0x2014,    // 0x84    EM DASH
    0x2013,    // 0x85    EN DASH
    0x0192,    // 0x86    SMALL LETTER F WITH HOOK
    0x2044,    // 0x87    FRACTION SLASH (solidus)
    0x2039,    // 0x88    SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    0x203a,    // 0x89    SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    0x2212,    // 0x8a    MINUS SIGN
    0x2030,    // 0x8b    PER MILLE SIGN
    0x201e,    // 0x8c    DOUBLE LOW-9 QUOTATION MARK (quotedblbase)
    0x201c,    // 0x8d    LEFT DOUBLE QUOTATION MARK (double quote left)
    0x201d,    // 0x8e    RIGHT DOUBLE QUOTATION MARK (quotedblright)
    0x2018,    // 0x8f    LEFT SINGLE QUOTATION MARK (quoteleft)
    0x2019,    // 0x90    RIGHT SINGLE QUOTATION MARK (quoteright)
    0x201a,    // 0x91    SINGLE LOW-9 QUOTATION MARK (quotesinglbase)
    0x2122,    // 0x92    TRADE MARK SIGN
    0xfb01,    // 0x93    LATIN SMALL LIGATURE FI
    0xfb02,    // 0x94    LATIN SMALL LIGATURE FL
    0x0141,    // 0x95    LATIN CAPITAL LETTER L WITH STROKE
    0x0152,    // 0x96    LATIN CAPITAL LIGATURE OE
    0x0160,    // 0x97    LATIN CAPITAL LETTER S WITH CARON
    0x0178,    // 0x98    LATIN CAPITAL LETTER Y WITH DIAERESIS
    0x017d,    // 0x99    LATIN CAPITAL LETTER Z WITH CARON
    0x0131,    // 0x9a    LATIN SMALL LETTER DOTLESS I
    0x0142,    // 0x9b    LATIN SMALL LETTER L WITH STROKE
    0x0153,    // 0x9c    LATIN SMALL LIGATURE OE
    0x0161,    // 0x9d    LATIN SMALL LETTER S WITH CARON
    0x017e,    // 0x9e    LATIN SMALL LETTER Z WITH CARON
    0xfffd,    // 0x9f    UNDEFINED
    0x20ac,    // 0xa0    EURO SIGN
};
|
||||
|
||||
std::string
|
||||
QUtil::int_to_string(long long num, int length)
|
||||
{
|
||||
@ -895,7 +933,7 @@ QUtil::parse_numrange(char const* range, int max)
|
||||
return result;
|
||||
}
|
||||
|
||||
enum encoding_e { e_utf16, e_ascii, e_winansi, e_macroman };
|
||||
enum encoding_e { e_utf16, e_ascii, e_winansi, e_macroman, e_pdfdoc };
|
||||
|
||||
static unsigned char
|
||||
encode_winansi(unsigned long codepoint)
|
||||
@ -1342,6 +1380,119 @@ encode_macroman(unsigned long codepoint)
|
||||
return ch;
|
||||
}
|
||||
|
||||
// Map a Unicode code point to the single byte in the range 0x80
// through 0xa0 that this switch assigns to it. Any code point not
// listed yields '\0', which callers can treat as "no mapping".
static unsigned char
encode_pdfdoc(unsigned long codepoint)
{
    // A switch is used rather than a static lookup structure
    // because a static is not thread-safe.
    switch (codepoint)
    {
      case 0x2022: return 0x80;
      case 0x2020: return 0x81;
      case 0x2021: return 0x82;
      case 0x2026: return 0x83;
      case 0x2014: return 0x84;
      case 0x2013: return 0x85;
      case 0x0192: return 0x86;
      case 0x2044: return 0x87;
      case 0x2039: return 0x88;
      case 0x203a: return 0x89;
      case 0x2212: return 0x8a;
      case 0x2030: return 0x8b;
      case 0x201e: return 0x8c;
      case 0x201c: return 0x8d;
      case 0x201d: return 0x8e;
      case 0x2018: return 0x8f;
      case 0x2019: return 0x90;
      case 0x201a: return 0x91;
      case 0x2122: return 0x92;
      case 0xfb01: return 0x93;
      case 0xfb02: return 0x94;
      case 0x0141: return 0x95;
      case 0x0152: return 0x96;
      case 0x0160: return 0x97;
      case 0x0178: return 0x98;
      case 0x017d: return 0x99;
      case 0x0131: return 0x9a;
      case 0x0142: return 0x9b;
      case 0x0153: return 0x9c;
      case 0x0161: return 0x9d;
      case 0x017e: return 0x9e;
      case 0xfffd: return 0x9f;
      case 0x20ac: return 0xa0;
      default:     return '\0';
    }
}
|
||||
|
||||
static std::string
|
||||
transcode_utf8(std::string const& utf8_val, encoding_e encoding,
|
||||
char unknown)
|
||||
@ -1410,24 +1561,27 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding,
|
||||
{
|
||||
result += QUtil::toUTF16(codepoint);
|
||||
}
|
||||
else if ((codepoint >= 160) && (codepoint < 256) &&
|
||||
((encoding == e_winansi) || (encoding == e_pdfdoc)))
|
||||
{
|
||||
ch = static_cast<unsigned char>(codepoint & 0xff);
|
||||
result.append(1, ch);
|
||||
}
|
||||
else
|
||||
{
|
||||
ch = '\0';
|
||||
if (encoding == e_winansi)
|
||||
{
|
||||
if ((codepoint >= 160) && (codepoint < 256))
|
||||
{
|
||||
ch = static_cast<unsigned char>(codepoint & 0xff);
|
||||
}
|
||||
else
|
||||
{
|
||||
ch = encode_winansi(codepoint);
|
||||
}
|
||||
ch = encode_winansi(codepoint);
|
||||
}
|
||||
else if (encoding == e_macroman)
|
||||
{
|
||||
ch = encode_macroman(codepoint);
|
||||
}
|
||||
else if (encoding == e_pdfdoc)
|
||||
{
|
||||
ch = encode_pdfdoc(codepoint);
|
||||
}
|
||||
if (ch == '\0')
|
||||
{
|
||||
ch = static_cast<unsigned char>(unknown);
|
||||
@ -1463,3 +1617,98 @@ QUtil::utf8_to_mac_roman(std::string const& utf8, char unknown_char)
|
||||
{
|
||||
return transcode_utf8(utf8, e_macroman, unknown_char);
|
||||
}
|
||||
|
||||
std::string
|
||||
QUtil::utf8_to_pdf_doc(std::string const& utf8, char unknown_char)
|
||||
{
|
||||
return transcode_utf8(utf8, e_pdfdoc, unknown_char);
|
||||
}
|
||||
|
||||
bool
|
||||
QUtil::is_utf16(std::string const& val)
|
||||
{
|
||||
return ((val.length() >= 2) &&
|
||||
(val.at(0) == '\xfe') && (val.at(1) == '\xff'));
|
||||
}
|
||||
|
||||
std::string
|
||||
QUtil::utf16_to_utf8(std::string const& val)
|
||||
{
|
||||
std::string result;
|
||||
// This code uses unsigned long and unsigned short to hold
|
||||
// codepoint values. It requires unsigned long to be at least
|
||||
// 32 bits and unsigned short to be at least 16 bits, but it
|
||||
// will work fine if they are larger.
|
||||
unsigned long codepoint = 0L;
|
||||
size_t len = val.length();
|
||||
size_t start = 0;
|
||||
if (is_utf16(val))
|
||||
{
|
||||
start += 2;
|
||||
}
|
||||
// If the string has an odd number of bytes, the last byte is
|
||||
// ignored.
|
||||
for (unsigned int i = start; i < len; i += 2)
|
||||
{
|
||||
// Convert from UTF16-BE. If we get a malformed
|
||||
// codepoint, this code will generate incorrect output
|
||||
// without giving a warning. Specifically, a high
|
||||
// codepoint not followed by a low codepoint will be
|
||||
// discarded, and a low codepoint not preceded by a high
|
||||
// codepoint will just get its low 10 bits output.
|
||||
unsigned short bits =
|
||||
(static_cast<unsigned char>(val.at(i)) << 8) +
|
||||
static_cast<unsigned char>(val.at(i+1));
|
||||
if ((bits & 0xFC00) == 0xD800)
|
||||
{
|
||||
codepoint = 0x10000 + ((bits & 0x3FF) << 10);
|
||||
continue;
|
||||
}
|
||||
else if ((bits & 0xFC00) == 0xDC00)
|
||||
{
|
||||
if (codepoint != 0)
|
||||
{
|
||||
QTC::TC("qpdf", "QUtil non-trivial UTF-16");
|
||||
}
|
||||
codepoint += bits & 0x3FF;
|
||||
}
|
||||
else
|
||||
{
|
||||
codepoint = bits;
|
||||
}
|
||||
|
||||
result += QUtil::toUTF8(codepoint);
|
||||
codepoint = 0;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
std::string
|
||||
QUtil::win_ansi_to_utf8(std::string const& val)
|
||||
{
|
||||
return "QXXXQ";
|
||||
}
|
||||
|
||||
std::string
|
||||
QUtil::mac_roman_to_utf8(std::string const& val)
|
||||
{
|
||||
return "QXXXQ";
|
||||
}
|
||||
|
||||
std::string
|
||||
QUtil::pdf_doc_to_utf8(std::string const& val)
|
||||
{
|
||||
std::string result;
|
||||
size_t len = val.length();
|
||||
for (unsigned int i = 0; i < len; ++i)
|
||||
{
|
||||
unsigned char ch = static_cast<unsigned char>(val.at(i));
|
||||
unsigned short val = ch;
|
||||
if ((ch >= 128) && (ch <= 160))
|
||||
{
|
||||
val = pdf_doc_to_unicode[ch - 128];
|
||||
}
|
||||
result += QUtil::toUTF8(val);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
@ -108,7 +108,7 @@ QPDF_Stream pipeStreamData with null pipeline 0
|
||||
QPDFWriter not recompressing /FlateDecode 0
|
||||
QPDF_encryption xref stream from encrypted file 0
|
||||
qpdf unable to filter 0
|
||||
QPDF_String non-trivial UTF-16 0
|
||||
QUtil non-trivial UTF-16 0
|
||||
QPDF xref overwrite object 0
|
||||
QPDF decoding error warning 0
|
||||
qpdf-c called qpdf_init 0
|
||||
|
Loading…
Reference in New Issue
Block a user