2
1
mirror of https://github.com/qpdf/qpdf.git synced 2024-05-28 16:00:53 +00:00

Move remaining existing transcoding to QUtil

This commit is contained in:
Jay Berkenbilt 2019-01-13 08:00:14 -05:00
parent 5cfcd4f361
commit 698485468a
4 changed files with 288 additions and 103 deletions

View File

@ -147,13 +147,18 @@ namespace QUtil
std::string toUTF8(unsigned long uval);
// Return a string containing the byte representation of the
// UTF-16 BE encoding for the unicode value passed in.
// UTF-16 big-endian encoding for the unicode value passed in.
// Unrepresentable code points are converted to U+FFFD.
QPDF_DLL
std::string toUTF16(unsigned long uval);
// Convert a UTF-8 encoded string to UTF-16. Unrepresentable code
// points are converted to U+FFFD.
// Test whether this is a UTF-16 big-endian string. This is
// indicated by first two bytes being 0xFE 0xFF.
QPDF_DLL
bool is_utf16(std::string const&);
// Convert a UTF-8 encoded string to UTF-16 big-endian.
// Unrepresentable code points are converted to U+FFFD.
QPDF_DLL
std::string utf8_to_utf16(std::string const& utf8);
@ -169,6 +174,24 @@ namespace QUtil
QPDF_DLL
std::string utf8_to_mac_roman(
std::string const& utf8, char unknown_char = '?');
QPDF_DLL
std::string utf8_to_pdf_doc(
std::string const& utf8, char unknown_char = '?');
// Convert a UTF-16 big-endian encoded string to UTF-8.
// Unrepresentable code points are converted to U+FFFD.
QPDF_DLL
std::string utf16_to_utf8(std::string const& utf16);
// Convert from the specified single-byte encoding system to
// UTF-8. There is no ascii_to_utf8 because all ASCII strings are
// already valid UTF-8.
QPDF_DLL
std::string win_ansi_to_utf8(std::string const& win);
QPDF_DLL
std::string mac_roman_to_utf8(std::string const& mac);
QPDF_DLL
std::string pdf_doc_to_utf8(std::string const& pdfdoc);
// If secure random number generation is supported on your
// platform and qpdf was not compiled with insecure random number

View File

@ -8,43 +8,6 @@
// be used.
#include <string.h>
// Unicode code points for the single-byte PDF doc encoding values
// 0x80 through 0xa0. The table is indexed by (byte - 128), so the
// first element corresponds to byte value 128 (0x80). This is
// read-only lookup data, so it is declared const.
static unsigned short const pdf_doc_to_unicode[] = {
    0x2022,    // 0x80    BULLET
    0x2020,    // 0x81    DAGGER
    0x2021,    // 0x82    DOUBLE DAGGER
    0x2026,    // 0x83    HORIZONTAL ELLIPSIS
    0x2014,    // 0x84    EM DASH
    0x2013,    // 0x85    EN DASH
    0x0192,    // 0x86    SMALL LETTER F WITH HOOK
    0x2044,    // 0x87    FRACTION SLASH (solidus)
    0x2039,    // 0x88    SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    0x203a,    // 0x89    SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    0x2212,    // 0x8a    MINUS SIGN
    0x2030,    // 0x8b    PER MILLE SIGN
    0x201e,    // 0x8c    DOUBLE LOW-9 QUOTATION MARK (quotedblbase)
    0x201c,    // 0x8d    LEFT DOUBLE QUOTATION MARK (double quote left)
    0x201d,    // 0x8e    RIGHT DOUBLE QUOTATION MARK (quotedblright)
    0x2018,    // 0x8f    LEFT SINGLE QUOTATION MARK (quoteleft)
    0x2019,    // 0x90    RIGHT SINGLE QUOTATION MARK (quoteright)
    0x201a,    // 0x91    SINGLE LOW-9 QUOTATION MARK (quotesinglbase)
    0x2122,    // 0x92    TRADE MARK SIGN
    0xfb01,    // 0x93    LATIN SMALL LIGATURE FI
    0xfb02,    // 0x94    LATIN SMALL LIGATURE FL
    0x0141,    // 0x95    LATIN CAPITAL LETTER L WITH STROKE
    0x0152,    // 0x96    LATIN CAPITAL LIGATURE OE
    0x0160,    // 0x97    LATIN CAPITAL LETTER S WITH CARON
    0x0178,    // 0x98    LATIN CAPITAL LETTER Y WITH DIAERESIS
    0x017d,    // 0x99    LATIN CAPITAL LETTER Z WITH CARON
    0x0131,    // 0x9a    LATIN SMALL LETTER DOTLESS I
    0x0142,    // 0x9b    LATIN SMALL LETTER L WITH STROKE
    0x0153,    // 0x9c    LATIN SMALL LIGATURE OE
    0x0161,    // 0x9d    LATIN SMALL LETTER S WITH CARON
    0x017e,    // 0x9e    LATIN SMALL LETTER Z WITH CARON
    0xfffd,    // 0x9f    UNDEFINED
    0x20ac,    // 0xa0    EURO SIGN
};
// See above about ctype.
static bool is_ascii_printable(unsigned char ch)
{
@ -210,62 +173,12 @@ QPDF_String::getVal() const
std::string
QPDF_String::getUTF8Val() const
{
std::string result;
size_t len = this->val.length();
if ((len >= 2) && (len % 2 == 0) &&
(this->val.at(0) == '\xfe') && (this->val.at(1) == '\xff'))
if (QUtil::is_utf16(this->val))
{
// This is a Unicode string using big-endian UTF-16. This
// code uses unsigned long and unsigned short to hold
// codepoint values. It requires unsigned long to be at least
// 32 bits and unsigned short to be at least 16 bits, but it
// will work fine if they are larger.
unsigned long codepoint = 0L;
for (unsigned int i = 2; i < len; i += 2)
{
// Convert from UTF16-BE. If we get a malformed
// codepoint, this code will generate incorrect output
// without giving a warning. Specifically, a high
// codepoint not followed by a low codepoint will be
// discarded, and a low codepoint not preceded by a high
// codepoint will just get its low 10 bits output.
unsigned short bits =
(static_cast<unsigned char>(this->val.at(i)) << 8) +
static_cast<unsigned char>(this->val.at(i+1));
if ((bits & 0xFC00) == 0xD800)
{
codepoint = 0x10000 + ((bits & 0x3FF) << 10);
continue;
}
else if ((bits & 0xFC00) == 0xDC00)
{
if (codepoint != 0)
{
QTC::TC("qpdf", "QPDF_String non-trivial UTF-16");
}
codepoint += bits & 0x3FF;
}
else
{
codepoint = bits;
}
result += QUtil::toUTF8(codepoint);
codepoint = 0;
}
return QUtil::utf16_to_utf8(this->val);
}
else
{
for (unsigned int i = 0; i < len; ++i)
{
unsigned char ch = static_cast<unsigned char>(this->val.at(i));
unsigned short val = ch;
if ((ch >= 128) && (ch <= 160))
{
val = pdf_doc_to_unicode[ch - 128];
}
result += QUtil::toUTF8(val);
}
return QUtil::pdf_doc_to_utf8(this->val);
}
return result;
}

View File

@ -8,6 +8,7 @@
#endif
#include <qpdf/SecureRandomDataProvider.hh>
#include <qpdf/QPDFSystemError.hh>
#include <qpdf/QTC.hh>
#include <cmath>
#include <iomanip>
@ -29,6 +30,43 @@
#include <sys/stat.h>
#endif
// Unicode code points for the single-byte PDF doc encoding values
// 0x80 through 0xa0. The table is indexed by (byte - 128), so the
// first element corresponds to byte value 128 (0x80). This is
// read-only lookup data, so it is declared const.
static unsigned short const pdf_doc_to_unicode[] = {
    0x2022,    // 0x80    BULLET
    0x2020,    // 0x81    DAGGER
    0x2021,    // 0x82    DOUBLE DAGGER
    0x2026,    // 0x83    HORIZONTAL ELLIPSIS
    0x2014,    // 0x84    EM DASH
    0x2013,    // 0x85    EN DASH
    0x0192,    // 0x86    SMALL LETTER F WITH HOOK
    0x2044,    // 0x87    FRACTION SLASH (solidus)
    0x2039,    // 0x88    SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    0x203a,    // 0x89    SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    0x2212,    // 0x8a    MINUS SIGN
    0x2030,    // 0x8b    PER MILLE SIGN
    0x201e,    // 0x8c    DOUBLE LOW-9 QUOTATION MARK (quotedblbase)
    0x201c,    // 0x8d    LEFT DOUBLE QUOTATION MARK (double quote left)
    0x201d,    // 0x8e    RIGHT DOUBLE QUOTATION MARK (quotedblright)
    0x2018,    // 0x8f    LEFT SINGLE QUOTATION MARK (quoteleft)
    0x2019,    // 0x90    RIGHT SINGLE QUOTATION MARK (quoteright)
    0x201a,    // 0x91    SINGLE LOW-9 QUOTATION MARK (quotesinglbase)
    0x2122,    // 0x92    TRADE MARK SIGN
    0xfb01,    // 0x93    LATIN SMALL LIGATURE FI
    0xfb02,    // 0x94    LATIN SMALL LIGATURE FL
    0x0141,    // 0x95    LATIN CAPITAL LETTER L WITH STROKE
    0x0152,    // 0x96    LATIN CAPITAL LIGATURE OE
    0x0160,    // 0x97    LATIN CAPITAL LETTER S WITH CARON
    0x0178,    // 0x98    LATIN CAPITAL LETTER Y WITH DIAERESIS
    0x017d,    // 0x99    LATIN CAPITAL LETTER Z WITH CARON
    0x0131,    // 0x9a    LATIN SMALL LETTER DOTLESS I
    0x0142,    // 0x9b    LATIN SMALL LETTER L WITH STROKE
    0x0153,    // 0x9c    LATIN SMALL LIGATURE OE
    0x0161,    // 0x9d    LATIN SMALL LETTER S WITH CARON
    0x017e,    // 0x9e    LATIN SMALL LETTER Z WITH CARON
    0xfffd,    // 0x9f    UNDEFINED
    0x20ac,    // 0xa0    EURO SIGN
};
std::string
QUtil::int_to_string(long long num, int length)
{
@ -895,7 +933,7 @@ QUtil::parse_numrange(char const* range, int max)
return result;
}
enum encoding_e { e_utf16, e_ascii, e_winansi, e_macroman };
enum encoding_e { e_utf16, e_ascii, e_winansi, e_macroman, e_pdfdoc };
static unsigned char
encode_winansi(unsigned long codepoint)
@ -1342,6 +1380,119 @@ encode_macroman(unsigned long codepoint)
return ch;
}
static unsigned char
encode_pdfdoc(unsigned long codepoint)
{
    // Translate a Unicode code point into the matching single byte in
    // the 0x80-0xa0 range. A result of '\0' means the code point has
    // no entry here. This is spelled out as a switch on purpose: a
    // static lookup structure would not be thread-safe.
    switch (codepoint)
    {
      case 0x2022: return 0x80;
      case 0x2020: return 0x81;
      case 0x2021: return 0x82;
      case 0x2026: return 0x83;
      case 0x2014: return 0x84;
      case 0x2013: return 0x85;
      case 0x0192: return 0x86;
      case 0x2044: return 0x87;
      case 0x2039: return 0x88;
      case 0x203a: return 0x89;
      case 0x2212: return 0x8a;
      case 0x2030: return 0x8b;
      case 0x201e: return 0x8c;
      case 0x201c: return 0x8d;
      case 0x201d: return 0x8e;
      case 0x2018: return 0x8f;
      case 0x2019: return 0x90;
      case 0x201a: return 0x91;
      case 0x2122: return 0x92;
      case 0xfb01: return 0x93;
      case 0xfb02: return 0x94;
      case 0x0141: return 0x95;
      case 0x0152: return 0x96;
      case 0x0160: return 0x97;
      case 0x0178: return 0x98;
      case 0x017d: return 0x99;
      case 0x0131: return 0x9a;
      case 0x0142: return 0x9b;
      case 0x0153: return 0x9c;
      case 0x0161: return 0x9d;
      case 0x017e: return 0x9e;
      case 0xfffd: return 0x9f;
      case 0x20ac: return 0xa0;
      default:     return '\0';
    }
}
static std::string
transcode_utf8(std::string const& utf8_val, encoding_e encoding,
char unknown)
@ -1410,24 +1561,27 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding,
{
result += QUtil::toUTF16(codepoint);
}
else if ((codepoint >= 160) && (codepoint < 256) &&
((encoding == e_winansi) || (encoding == e_pdfdoc)))
{
ch = static_cast<unsigned char>(codepoint & 0xff);
result.append(1, ch);
}
else
{
ch = '\0';
if (encoding == e_winansi)
{
if ((codepoint >= 160) && (codepoint < 256))
{
ch = static_cast<unsigned char>(codepoint & 0xff);
}
else
{
ch = encode_winansi(codepoint);
}
ch = encode_winansi(codepoint);
}
else if (encoding == e_macroman)
{
ch = encode_macroman(codepoint);
}
else if (encoding == e_pdfdoc)
{
ch = encode_pdfdoc(codepoint);
}
if (ch == '\0')
{
ch = static_cast<unsigned char>(unknown);
@ -1463,3 +1617,98 @@ QUtil::utf8_to_mac_roman(std::string const& utf8, char unknown_char)
{
return transcode_utf8(utf8, e_macroman, unknown_char);
}
std::string
QUtil::utf8_to_pdf_doc(std::string const& utf8, char unknown_char)
{
return transcode_utf8(utf8, e_pdfdoc, unknown_char);
}
bool
QUtil::is_utf16(std::string const& val)
{
    // A string is treated as UTF-16 big-endian when it begins with
    // the two bytes 0xFE 0xFF. Anything shorter than two bytes cannot
    // carry that marker.
    if (val.length() < 2)
    {
        return false;
    }
    return (val[0] == '\xfe') && (val[1] == '\xff');
}
std::string
QUtil::utf16_to_utf8(std::string const& val)
{
    // Convert a UTF-16 big-endian byte string to UTF-8. If the string
    // starts with the 0xFE 0xFF marker recognized by is_utf16, those
    // two bytes are skipped and not converted.
    //
    // This code uses unsigned long and unsigned short to hold
    // codepoint values. It requires unsigned long to be at least
    // 32 bits and unsigned short to be at least 16 bits, but it
    // will work fine if they are larger.
    std::string result;
    unsigned long codepoint = 0L;
    size_t len = val.length();
    size_t start = 0;
    if (is_utf16(val))
    {
        start += 2;
    }
    // Only complete two-byte units are consumed. The loop condition
    // requires both i and i + 1 to be valid indexes, so if the string
    // has an odd number of bytes, the last byte is ignored instead of
    // making val.at(i + 1) throw std::out_of_range.
    for (size_t i = start; i + 1 < len; i += 2)
    {
        // Convert from UTF16-BE. If we get a malformed
        // codepoint, this code will generate incorrect output
        // without giving a warning. Specifically, a high
        // codepoint not followed by a low codepoint will be
        // discarded, and a low codepoint not preceded by a high
        // codepoint will just get its low 10 bits output.
        unsigned short bits =
            (static_cast<unsigned char>(val.at(i)) << 8) +
            static_cast<unsigned char>(val.at(i + 1));
        if ((bits & 0xFC00) == 0xD800)
        {
            // High surrogate: remember the upper 10 bits and wait
            // for the low surrogate before emitting anything.
            codepoint = 0x10000 + ((bits & 0x3FF) << 10);
            continue;
        }
        else if ((bits & 0xFC00) == 0xDC00)
        {
            // Low surrogate: combine with any pending high surrogate.
            if (codepoint != 0)
            {
                QTC::TC("qpdf", "QUtil non-trivial UTF-16");
            }
            codepoint += bits & 0x3FF;
        }
        else
        {
            codepoint = bits;
        }
        result += QUtil::toUTF8(codepoint);
        codepoint = 0;
    }
    return result;
}
std::string
QUtil::win_ansi_to_utf8(std::string const& val)
{
    // Placeholder implementation: the input is not examined and the
    // fixed marker string below is returned for every call.
    std::string const marker("QXXXQ");
    return marker;
}
std::string
QUtil::mac_roman_to_utf8(std::string const& val)
{
    // Placeholder implementation: the input is not examined and the
    // fixed marker string below is returned for every call.
    std::string const marker("QXXXQ");
    return marker;
}
std::string
QUtil::pdf_doc_to_utf8(std::string const& val)
{
    // Convert a string in the single-byte PDF doc encoding to UTF-8,
    // one input byte at a time.
    std::string result;
    size_t len = val.length();
    for (size_t i = 0; i < len; ++i)
    {
        unsigned char ch = static_cast<unsigned char>(val.at(i));
        // Bytes outside 128..160 are used as the code point directly.
        // Bytes inside that range are looked up in pdf_doc_to_unicode,
        // whose first element corresponds to byte 128.
        unsigned short codepoint = ch;
        if ((ch >= 128) && (ch <= 160))
        {
            codepoint = pdf_doc_to_unicode[ch - 128];
        }
        result += QUtil::toUTF8(codepoint);
    }
    return result;
}

View File

@ -108,7 +108,7 @@ QPDF_Stream pipeStreamData with null pipeline 0
QPDFWriter not recompressing /FlateDecode 0
QPDF_encryption xref stream from encrypted file 0
qpdf unable to filter 0
QPDF_String non-trivial UTF-16 0
QUtil non-trivial UTF-16 0
QPDF xref overwrite object 0
QPDF decoding error warning 0
qpdf-c called qpdf_init 0