qpdf/libqpdf/QPDF_String.cc

#include <qpdf/QPDF_String.hh>

#include <qpdf/QUtil.hh>
#include <qpdf/QTC.hh>

// DO NOT USE ctype -- it is locale dependent for some things, and
// it's not worth the risk of including it in case it may accidentally
// be used.
#include <string.h>

// Unicode code points for PDFDocEncoding bytes 0x80 through 0xa0
// inclusive.  The first element corresponds to byte 128, so the table
// must be indexed with (byte - 128).  Byte 0x9f is undefined in
// PDFDocEncoding and is represented by U+FFFD (REPLACEMENT
// CHARACTER).  The table is read-only lookup data, so it is const.
static const unsigned short pdf_doc_to_unicode[] = {
    0x2022,    // 0x80    BULLET
    0x2020,    // 0x81    DAGGER
    0x2021,    // 0x82    DOUBLE DAGGER
    0x2026,    // 0x83    HORIZONTAL ELLIPSIS
    0x2014,    // 0x84    EM DASH
    0x2013,    // 0x85    EN DASH
    0x0192,    // 0x86    SMALL LETTER F WITH HOOK
    0x2044,    // 0x87    FRACTION SLASH (solidus)
    0x2039,    // 0x88    SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    0x203a,    // 0x89    SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    0x2212,    // 0x8a    MINUS SIGN
    0x2030,    // 0x8b    PER MILLE SIGN
    0x201e,    // 0x8c    DOUBLE LOW-9 QUOTATION MARK (quotedblbase)
    0x201c,    // 0x8d    LEFT DOUBLE QUOTATION MARK (double quote left)
    0x201d,    // 0x8e    RIGHT DOUBLE QUOTATION MARK (quotedblright)
    0x2018,    // 0x8f    LEFT SINGLE QUOTATION MARK (quoteleft)
    0x2019,    // 0x90    RIGHT SINGLE QUOTATION MARK (quoteright)
    0x201a,    // 0x91    SINGLE LOW-9 QUOTATION MARK (quotesinglbase)
    0x2122,    // 0x92    TRADE MARK SIGN
    0xfb01,    // 0x93    LATIN SMALL LIGATURE FI
    0xfb02,    // 0x94    LATIN SMALL LIGATURE FL
    0x0141,    // 0x95    LATIN CAPITAL LETTER L WITH STROKE
    0x0152,    // 0x96    LATIN CAPITAL LIGATURE OE
    0x0160,    // 0x97    LATIN CAPITAL LETTER S WITH CARON
    0x0178,    // 0x98    LATIN CAPITAL LETTER Y WITH DIAERESIS
    0x017d,    // 0x99    LATIN CAPITAL LETTER Z WITH CARON
    0x0131,    // 0x9a    LATIN SMALL LETTER DOTLESS I
    0x0142,    // 0x9b    LATIN SMALL LETTER L WITH STROKE
    0x0153,    // 0x9c    LATIN SMALL LIGATURE OE
    0x0161,    // 0x9d    LATIN SMALL LETTER S WITH CARON
    0x017e,    // 0x9e    LATIN SMALL LETTER Z WITH CARON
    0xfffd,    // 0x9f    UNDEFINED
    0x20ac,    // 0xa0    EURO SIGN
};

// See above about ctype.
// Return true when ch lies in the 7-bit ASCII range 32 (space)
// through 126 ('~') inclusive.  Deliberately hand-coded so the answer
// never depends on the current locale.
static bool is_ascii_printable(unsigned char ch)
{
    if (ch < 32)
    {
        return false;
    }
    return (ch <= 126);
}
// Return true when ch is either in the range 32 through 126 inclusive
// or is 160 or greater.  Values 0-31 and 127-159 yield false.  Like
// the ASCII check, this is hand-coded so it is locale-independent.
static bool is_iso_latin1_printable(unsigned char ch)
{
    if (ch >= 160)
    {
        return true;
    }
    return ((ch >= 32) && (ch <= 126));
}

// Construct a string object whose stored value is a copy of val.  The
// bytes are stored as given; no transcoding is performed here.
QPDF_String::QPDF_String(std::string const& val)
    : val(val)
{
}

// Nothing to release explicitly: the body is intentionally empty.
QPDF_String::~QPDF_String()
{
}

// Create a new QPDF_String from UTF-8 text.  The input is converted
// with QUtil::utf8_to_utf16 and the converted bytes become the stored
// value.  The object is allocated with new and returned as a raw
// pointer; this function does not retain or free it.
QPDF_String*
QPDF_String::new_utf16(std::string const& utf8_val)
{
    std::string const utf16_bytes = QUtil::utf8_to_utf16(utf8_val);
    return new QPDF_String(utf16_bytes);
}

std::string
QPDF_String::unparse()
{
    return unparse(false);
}

// Return a JSON string built from the UTF-8 form of this string, as
// produced by getUTF8Val().
JSON
QPDF_String::getJSON()
{
    std::string const utf8 = this->getUTF8Val();
    return JSON::makeString(utf8);
}

// Report the object type code for strings (QPDFObject::ot_string).
QPDFObject::object_type_e
QPDF_String::getTypeCode() const
{
    QPDFObject::object_type_e const code = QPDFObject::ot_string;
    return code;
}

// Report the type name for this object type.  The returned pointer
// refers to a string literal.
char const*
QPDF_String::getTypeName() const
{
    char const* const type_name = "string";
    return type_name;
}

// Serialize this string in PDF syntax.
//
// force_binary: when true, the value is always written as a hex
// string (<...>).  When false, a heuristic picks between a hex string
// and a literal string ((...)): the value is scanned from the start,
// counting bytes that are neither 7-bit ASCII printable nor one of
// \n \r \t \b \f.  If more than five acceptable bytes occur in a row,
// the scan stops and the literal form is used.  Otherwise the hex
// form is used when more than 20% of the bytes were counted as
// nonprintable.
//
// Returns the serialized text including its delimiters.
//
// Counters and indices are size_t so they match the type of
// std::string::length(), and the 20% test is written as a division so
// that it cannot overflow for very large values.
std::string
QPDF_String::unparse(bool force_binary)
{
    size_t const len = this->val.length();
    bool use_hexstring = force_binary;
    if (! use_hexstring)
    {
        size_t nonprintable = 0;
        int consecutive_printable = 0;
        for (size_t i = 0; i < len; ++i)
        {
            char ch = this->val.at(i);
            // Note: do not use locale to determine printability.  The
            // PDF specification accepts arbitrary binary data.  Some
            // locales imply multibyte characters.  We'll consider
            // something printable if it is printable in 7-bit ASCII.
            // We'll code this manually rather than being rude and
            // setting locale.  The explicit test for 0 is required
            // because strchr also matches the terminating null of
            // its search string.
            if ((ch == 0) || (! (is_ascii_printable(ch) ||
                                 strchr("\n\r\t\b\f", ch))))
            {
                ++nonprintable;
                consecutive_printable = 0;
            }
            else
            {
                if (++consecutive_printable > 5)
                {
                    // If there are more than 5 consecutive printable
                    // characters, I want to see them as such.
                    nonprintable = 0;
                    break;
                }
            }
        }

        // Use hex notation if more than 20% of the characters are not
        // printable in plain ASCII.  For unsigned integers,
        // (nonprintable > len / 5) is equivalent to
        // (5 * nonprintable > len) but cannot wrap around.
        if (nonprintable > len / 5)
        {
            use_hexstring = true;
        }
    }
    std::string result;
    if (use_hexstring)
    {
        result += "<" + QUtil::hex_encode(this->val) + ">";
    }
    else
    {
        result += "(";
        for (size_t i = 0; i < len; ++i)
        {
            char ch = this->val.at(i);
            switch (ch)
            {
              case '\n':
                result += "\\n";
                break;

              case '\r':
                result += "\\r";
                break;

              case '\t':
                result += "\\t";
                break;

              case '\b':
                result += "\\b";
                break;

              case '\f':
                result += "\\f";
                break;

              case '(':
                result += "\\(";
                break;

              case ')':
                result += "\\)";
                break;

              case '\\':
                result += "\\\\";
                break;

              default:
                if (is_iso_latin1_printable(ch))
                {
                    result += ch;
                }
                else
                {
                    // Anything else is written as a backslash
                    // followed by a three-digit octal escape of the
                    // unsigned byte value.
                    result += "\\" + QUtil::int_to_string_base(
                        static_cast<int>(static_cast<unsigned char>(ch)),
                        8, 3);
                }
                break;
            }
        }
        result += ")";
    }

    return result;
}

// Return a copy of the stored bytes exactly as held, with no decoding
// or transcoding applied.
std::string
QPDF_String::getVal() const
{
    std::string copy(this->val);
    return copy;
}

std::string
QPDF_String::getUTF8Val() const
{
    std::string result;
    size_t len = this->val.length();
    if ((len >= 2) && (len % 2 == 0) &&
	(this->val.at(0) == '\xfe') && (this->val.at(1) == '\xff'))
    {
	// This is a Unicode string using big-endian UTF-16.  This
	// code uses unsigned long and unsigned short to hold
	// codepoint values.  It requires unsigned long to be at least
	// 32 bits and unsigned short to be at least 16 bits, but it
	// will work fine if they are larger.
	unsigned long codepoint = 0L;
	for (unsigned int i = 2; i < len; i += 2)
	{
	    // Convert from UTF16-BE.  If we get a malformed
	    // codepoint, this code will generate incorrect output
	    // without giving a warning.  Specifically, a high
	    // codepoint not followed by a low codepoint will be
	    // discarded, and a low codepoint not preceded by a high
	    // codepoint will just get its low 10 bits output.
	    unsigned short bits =
		(static_cast<unsigned char>(this->val.at(i)) << 8) +
		static_cast<unsigned char>(this->val.at(i+1));
	    if ((bits & 0xFC00) == 0xD800)
	    {
		codepoint = 0x10000 + ((bits & 0x3FF) << 10);
		continue;
	    }
	    else if ((bits & 0xFC00) == 0xDC00)
	    {
		if (codepoint != 0)
		{
		    QTC::TC("qpdf", "QPDF_String non-trivial UTF-16");
		}
		codepoint += bits & 0x3FF;
	    }
	    else
	    {
		codepoint = bits;
	    }

	    result += QUtil::toUTF8(codepoint);
	    codepoint = 0;
	}
    }
    else
    {
	for (unsigned int i = 0; i < len; ++i)
	{
            unsigned char ch = static_cast<unsigned char>(this->val.at(i));
            unsigned short val = ch;
            if ((ch >= 128) && (ch <= 160))
            {
                val = pdf_doc_to_unicode[ch - 128];
            }
	    result += QUtil::toUTF8(val);
	}
    }
    return result;
}
update release date to actual date git-svn-id: svn+q:///qpdf/trunk@599 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-04-29 12:55:25 +00:00			`#include <qpdf/QPDF_String.hh>`

			`#include <qpdf/QUtil.hh>`
handle UTF-16BE fully git-svn-id: svn+q:///qpdf/trunk@639 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-11-23 18:49:13 +00:00			`#include <qpdf/QTC.hh>`

update release date to actual date git-svn-id: svn+q:///qpdf/trunk@599 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-04-29 12:55:25 +00:00			`// DO NOT USE ctype -- it is locale dependent for some things, and`
			`// it's not worth the risk of including it in case it may accidentally`
			`// be used.`
			`#include <string.h>`

Properly handle strings with PDF Doc Encoding (fixes #179) The QPDF_String::getUTF8Val() method was not treating strings that weren't explicitly Unicode as PDF Doc Encoded. This only affects characters in the range 0x80 through 0xa0. 2018-02-17 23:47:57 +00:00			`// First element is 128`
			`static unsigned short pdf_doc_to_unicode[] = {`
			`0x2022, // 0x80 BULLET`
			`0x2020, // 0x81 DAGGER`
			`0x2021, // 0x82 DOUBLE DAGGER`
			`0x2026, // 0x83 HORIZONTAL ELLIPSIS`
			`0x2014, // 0x84 EM DASH`
			`0x2013, // 0x85 EN DASH`
			`0x0192, // 0x86 SMALL LETTER F WITH HOOK`
			`0x2044, // 0x87 FRACTION SLASH (solidus)`
			`0x2039, // 0x88 SINGLE LEFT-POINTING ANGLE QUOTATION MARK`
			`0x203a, // 0x89 SINGLE RIGHT-POINTING ANGLE QUOTATION MARK`
			`0x2212, // 0x8a MINUS SIGN`
			`0x2030, // 0x8b PER MILLE SIGN`
			`0x201e, // 0x8c DOUBLE LOW-9 QUOTATION MARK (quotedblbase)`
			`0x201c, // 0x8d LEFT DOUBLE QUOTATION MARK (double quote left)`
			`0x201d, // 0x8e RIGHT DOUBLE QUOTATION MARK (quotedblright)`
			`0x2018, // 0x8f LEFT SINGLE QUOTATION MARK (quoteleft)`
			`0x2019, // 0x90 RIGHT SINGLE QUOTATION MARK (quoteright)`
			`0x201a, // 0x91 SINGLE LOW-9 QUOTATION MARK (quotesinglbase)`
			`0x2122, // 0x92 TRADE MARK SIGN`
			`0xfb01, // 0x93 LATIN SMALL LIGATURE FI`
			`0xfb02, // 0x94 LATIN SMALL LIGATURE FL`
			`0x0141, // 0x95 LATIN CAPITAL LETTER L WITH STROKE`
			`0x0152, // 0x96 LATIN CAPITAL LIGATURE OE`
			`0x0160, // 0x97 LATIN CAPITAL LETTER S WITH CARON`
			`0x0178, // 0x98 LATIN CAPITAL LETTER Y WITH DIAERESIS`
			`0x017d, // 0x99 LATIN CAPITAL LETTER Z WITH CARON`
			`0x0131, // 0x9a LATIN SMALL LETTER DOTLESS I`
			`0x0142, // 0x9b LATIN SMALL LETTER L WITH STROKE`
			`0x0153, // 0x9c LATIN SMALL LIGATURE OE`
			`0x0161, // 0x9d LATIN SMALL LETTER S WITH CARON`
			`0x017e, // 0x9e LATIN SMALL LETTER Z WITH CARON`
			`0xfffd, // 0x9f UNDEFINED`
			`0x20ac, // 0xa0 EURO SIGN`
			`};`

update release date to actual date git-svn-id: svn+q:///qpdf/trunk@599 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-04-29 12:55:25 +00:00			`// See above about ctype.`
tweak when we decide to use hex strings vs literal strings git-svn-id: svn+q:///qpdf/trunk@810 71b93d88-0707-0410-a8cf-f5a4172ac649 2009-10-17 17:31:52 +00:00			`static bool is_ascii_printable(unsigned char ch)`
			`{`
			`return ((ch >= 32) && (ch <= 126));`
			`}`
update release date to actual date git-svn-id: svn+q:///qpdf/trunk@599 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-04-29 12:55:25 +00:00			`static bool is_iso_latin1_printable(unsigned char ch)`
			`{`
			`return (((ch >= 32) && (ch <= 126)) \|\| (ch >= 160));`
			`}`

			`QPDF_String::QPDF_String(std::string const& val) :`
			`val(val)`
			`{`
			`}`

			`QPDF_String::~QPDF_String()`
			`{`
			`}`

Refactor string transcoding in QPDF_String 2019-01-05 17:54:41 +00:00			`QPDF_String*`
			`QPDF_String::new_utf16(std::string const& utf8_val)`
			`{`
Move utf8_to_utf16 into QUtil 2019-01-05 18:00:18 +00:00			`return new QPDF_String(QUtil::utf8_to_utf16(utf8_val));`
Better support for creating Unicode strings 2018-06-21 18:03:45 +00:00			`}`

update release date to actual date git-svn-id: svn+q:///qpdf/trunk@599 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-04-29 12:55:25 +00:00			`std::string`
			`QPDF_String::unparse()`
			`{`
			`return unparse(false);`
			`}`

Add QPDFObjectHandle::getJSON() 2018-12-17 22:40:29 +00:00			`JSON`
			`QPDF_String::getJSON()`
			`{`
			`return JSON::makeString(getUTF8Val());`
			`}`

Add getTypeCode() and getTypeName() Add virtual methods to QPDFObject, wrappers to QPDFObjectHandle, and implementations to all the QPDF_Object types. 2013-01-22 14:57:07 +00:00			`QPDFObject::object_type_e`
			`QPDF_String::getTypeCode() const`
			`{`
			`return QPDFObject::ot_string;`
			`}`

			`char const*`
			`QPDF_String::getTypeName() const`
			`{`
			`return "string";`
			`}`

update release date to actual date git-svn-id: svn+q:///qpdf/trunk@599 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-04-29 12:55:25 +00:00			`std::string`
			`QPDF_String::unparse(bool force_binary)`
			`{`
			`bool use_hexstring = force_binary;`
			`if (! use_hexstring)`
			`{`
			`unsigned int nonprintable = 0;`
			`int consecutive_printable = 0;`
			`for (unsigned int i = 0; i < this->val.length(); ++i)`
			`{`
Security: replace operator[] with at For std::string and std::vector, replace operator[] with at. This was done using an automated process. See README.hardening for details. 2013-10-05 23:42:39 +00:00			`char ch = this->val.at(i);`
tweak when we decide to use hex strings vs literal strings git-svn-id: svn+q:///qpdf/trunk@810 71b93d88-0707-0410-a8cf-f5a4172ac649 2009-10-17 17:31:52 +00:00			`// Note: do not use locale to determine printability. The`
			`// PDF specification accepts arbitrary binary data. Some`
			`// locales imply multibyte characters. We'll consider`
			`// something printable if it is printable in 7-bit ASCII.`
			`// We'll code this manually rather than being rude and`
			`// setting locale.`
			`if ((ch == 0) \|\| (! (is_ascii_printable(ch) \|\|`
update release date to actual date git-svn-id: svn+q:///qpdf/trunk@599 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-04-29 12:55:25 +00:00			`strchr("\n\r\t\b\f", ch))))`
			`{`
			`++nonprintable;`
			`consecutive_printable = 0;`
			`}`
			`else`
			`{`
			`if (++consecutive_printable > 5)`
			`{`
			`// If there are more than 5 consecutive printable`
			`// characters, I want to see them as such.`
			`nonprintable = 0;`
			`break;`
			`}`
			`}`
			`}`

			`// Use hex notation if more than 20% of the characters are not`
tweak when we decide to use hex strings vs literal strings git-svn-id: svn+q:///qpdf/trunk@810 71b93d88-0707-0410-a8cf-f5a4172ac649 2009-10-17 17:31:52 +00:00			`// printable in plain ASCII.`
update release date to actual date git-svn-id: svn+q:///qpdf/trunk@599 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-04-29 12:55:25 +00:00			`if (5 * nonprintable > val.length())`
			`{`
			`use_hexstring = true;`
			`}`
			`}`
			`std::string result;`
			`if (use_hexstring)`
			`{`
Replace many calls to sprintf with QUtil::hex_encode Add QUtil::hex_encode to encode binary data has a hexadecimal string, and use it in place of sprintf where possible. 2013-01-25 13:59:55 +00:00			`result += "<" + QUtil::hex_encode(this->val) + ">";`
update release date to actual date git-svn-id: svn+q:///qpdf/trunk@599 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-04-29 12:55:25 +00:00			`}`
			`else`
			`{`
			`result += "(";`
			`for (unsigned int i = 0; i < this->val.length(); ++i)`
			`{`
Security: replace operator[] with at For std::string and std::vector, replace operator[] with at. This was done using an automated process. See README.hardening for details. 2013-10-05 23:42:39 +00:00			`char ch = this->val.at(i);`
update release date to actual date git-svn-id: svn+q:///qpdf/trunk@599 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-04-29 12:55:25 +00:00			`switch (ch)`
			`{`
			`case '\n':`
			`result += "\\n";`
			`break;`

			`case '\r':`
			`result += "\\r";`
			`break;`

			`case '\t':`
			`result += "\\t";`
			`break;`

			`case '\b':`
			`result += "\\b";`
			`break;`

			`case '\f':`
			`result += "\\f";`
			`break;`

			`case '(':`
			`result += "\\(";`
			`break;`

			`case ')':`
			`result += "\\)";`
			`break;`

			`case '\\':`
			`result += "\\\\";`
			`break;`

			`default:`
			`if (is_iso_latin1_printable(ch))`
			`{`
Security: replace operator[] with at For std::string and std::vector, replace operator[] with at. This was done using an automated process. See README.hardening for details. 2013-10-05 23:42:39 +00:00			`result += this->val.at(i);`
update release date to actual date git-svn-id: svn+q:///qpdf/trunk@599 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-04-29 12:55:25 +00:00			`}`
			`else`
			`{`
Remove all calls to sprintf 2013-02-28 21:20:45 +00:00			`result += "\\" + QUtil::int_to_string_base(`
			`static_cast<int>(static_cast<unsigned char>(ch)),`
			`8, 3);`
update release date to actual date git-svn-id: svn+q:///qpdf/trunk@599 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-04-29 12:55:25 +00:00			`}`
			`break;`
			`}`
			`}`
			`result += ")";`
			`}`

			`return result;`
			`}`

			`std::string`
			`QPDF_String::getVal() const`
			`{`
			`return this->val;`
			`}`

			`std::string`
			`QPDF_String::getUTF8Val() const`
			`{`
			`std::string result;`
ABI change: fix use of off_t, size_t, and integer types Significantly improve the code's use of off_t for file offsets, size_t for memory sizes, and integer types in cases where there has to be compatibility with external interfaces. Rework sections of the code that would have prevented qpdf from working on files larger than 2 (or maybe 4) GB in size. 2012-06-20 15:20:57 +00:00			`size_t len = this->val.length();`
update release date to actual date git-svn-id: svn+q:///qpdf/trunk@599 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-04-29 12:55:25 +00:00			`if ((len >= 2) && (len % 2 == 0) &&`
Security: replace operator[] with at For std::string and std::vector, replace operator[] with at. This was done using an automated process. See README.hardening for details. 2013-10-05 23:42:39 +00:00			`(this->val.at(0) == '\xfe') && (this->val.at(1) == '\xff'))`
update release date to actual date git-svn-id: svn+q:///qpdf/trunk@599 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-04-29 12:55:25 +00:00			`{`
			`// This is a Unicode string using big-endian UTF-16. This`
handle UTF-16BE fully git-svn-id: svn+q:///qpdf/trunk@639 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-11-23 18:49:13 +00:00			`// code uses unsigned long and unsigned short to hold`
			`// codepoint values. It requires unsigned long to be at least`
			`// 32 bits and unsigned short to be at least 16 bits, but it`
			`// will work fine if they are larger.`
			`unsigned long codepoint = 0L;`
update release date to actual date git-svn-id: svn+q:///qpdf/trunk@599 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-04-29 12:55:25 +00:00			`for (unsigned int i = 2; i < len; i += 2)`
			`{`
handle UTF-16BE fully git-svn-id: svn+q:///qpdf/trunk@639 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-11-23 18:49:13 +00:00			`// Convert from UTF16-BE. If we get a malformed`
			`// codepoint, this code will generate incorrect output`
			`// without giving a warning. Specifically, a high`
			`// codepoint not followed by a low codepoint will be`
			`// discarded, and a low codepoint not preceded by a high`
			`// codepoint will just get its low 10 bits output.`
			`unsigned short bits =`
Security: replace operator[] with at For std::string and std::vector, replace operator[] with at. This was done using an automated process. See README.hardening for details. 2013-10-05 23:42:39 +00:00			`(static_cast<unsigned char>(this->val.at(i)) << 8) +`
			`static_cast<unsigned char>(this->val.at(i+1));`
handle UTF-16BE fully git-svn-id: svn+q:///qpdf/trunk@639 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-11-23 18:49:13 +00:00			`if ((bits & 0xFC00) == 0xD800)`
			`{`
			`codepoint = 0x10000 + ((bits & 0x3FF) << 10);`
			`continue;`
			`}`
			`else if ((bits & 0xFC00) == 0xDC00)`
			`{`
			`if (codepoint != 0)`
			`{`
			`QTC::TC("qpdf", "QPDF_String non-trivial UTF-16");`
			`}`
			`codepoint += bits & 0x3FF;`
			`}`
			`else`
			`{`
			`codepoint = bits;`
			`}`

			`result += QUtil::toUTF8(codepoint);`
			`codepoint = 0;`
update release date to actual date git-svn-id: svn+q:///qpdf/trunk@599 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-04-29 12:55:25 +00:00			`}`
			`}`
			`else`
			`{`
			`for (unsigned int i = 0; i < len; ++i)`
			`{`
Properly handle strings with PDF Doc Encoding (fixes #179) The QPDF_String::getUTF8Val() method was not treating strings that weren't explicitly Unicode as PDF Doc Encoded. This only affects characters in the range 0x80 through 0xa0. 2018-02-17 23:47:57 +00:00			`unsigned char ch = static_cast<unsigned char>(this->val.at(i));`
			`unsigned short val = ch;`
			`if ((ch >= 128) && (ch <= 160))`
			`{`
			`val = pdf_doc_to_unicode[ch - 128];`
			`}`
			`result += QUtil::toUTF8(val);`
update release date to actual date git-svn-id: svn+q:///qpdf/trunk@599 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-04-29 12:55:25 +00:00			`}`
			`}`
			`return result;`
			`}`