mirror of
https://github.com/qpdf/qpdf.git
synced 2025-01-03 15:17:29 +00:00
f3d7c26de1
git-svn-id: svn+q:///qpdf/trunk@709 71b93d88-0707-0410-a8cf-f5a4172ac649
210 lines
4.4 KiB
C++
210 lines
4.4 KiB
C++
#include <qpdf/QPDF_String.hh>
|
|
|
|
#include <qpdf/QUtil.hh>
|
|
#include <qpdf/QTC.hh>
|
|
|
|
// DO NOT USE ctype -- it is locale dependent for some things, and
|
|
// it's not worth the risk of including it in case it may accidentally
|
|
// be used.
|
|
#include <string.h>
|
|
|
|
// See above about ctype.
|
|
// Locale-independent printability test.  Returns true when ch lies in
// one of the two ranges this file treats as printable ISO-Latin-1:
// 32 through 126, or 160 and above.  Deliberately avoids <ctype.h>;
// see the note near the includes.
static bool is_iso_latin1_printable(unsigned char ch)
{
    if (ch >= 160)
    {
        // Upper printable range.
        return true;
    }
    // Lower printable range: space through tilde.
    return (ch >= 32) && (ch <= 126);
}
|
|
|
|
// Construct a string object that stores its own copy of the given
// bytes.  The value is kept verbatim; nothing is validated or
// converted here.
QPDF_String::QPDF_String(std::string const& val)
    : val(val)
{
}
|
|
|
|
// Nothing to release explicitly; the destructor body is empty.
QPDF_String::~QPDF_String()
{
}
|
|
|
|
std::string
|
|
QPDF_String::unparse()
|
|
{
|
|
return unparse(false);
|
|
}
|
|
|
|
std::string
|
|
QPDF_String::unparse(bool force_binary)
|
|
{
|
|
bool use_hexstring = force_binary;
|
|
if (! use_hexstring)
|
|
{
|
|
unsigned int nonprintable = 0;
|
|
int consecutive_printable = 0;
|
|
for (unsigned int i = 0; i < this->val.length(); ++i)
|
|
{
|
|
char ch = this->val[i];
|
|
// Note: do not use locale to determine printability. The PDF
|
|
// specification accepts arbitrary binary data. Some locales
|
|
// imply multibyte characters. We'll consider something
|
|
// printable if it is printable in ISO-Latin-1. We'll code
|
|
// this manually rather than being rude and setting locale.
|
|
if ((ch == 0) || (! (is_iso_latin1_printable(ch) ||
|
|
strchr("\n\r\t\b\f", ch))))
|
|
{
|
|
++nonprintable;
|
|
consecutive_printable = 0;
|
|
}
|
|
else
|
|
{
|
|
if (++consecutive_printable > 5)
|
|
{
|
|
// If there are more than 5 consecutive printable
|
|
// characters, I want to see them as such.
|
|
nonprintable = 0;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Use hex notation if more than 20% of the characters are not
|
|
// printable in the current locale. Uniformly distributed random
|
|
// characters will not pass this test even with ISO-Latin-1 in
|
|
// which 76% are either printable or in the set of standard
|
|
// escaped characters.
|
|
if (5 * nonprintable > val.length())
|
|
{
|
|
use_hexstring = true;
|
|
}
|
|
}
|
|
std::string result;
|
|
if (use_hexstring)
|
|
{
|
|
result += "<";
|
|
char num[3];
|
|
for (unsigned int i = 0; i < this->val.length(); ++i)
|
|
{
|
|
sprintf(num, "%02x", (unsigned char) this->val[i]);
|
|
result += num;
|
|
}
|
|
result += ">";
|
|
}
|
|
else
|
|
{
|
|
result += "(";
|
|
char num[5];
|
|
for (unsigned int i = 0; i < this->val.length(); ++i)
|
|
{
|
|
char ch = this->val[i];
|
|
switch (ch)
|
|
{
|
|
case '\n':
|
|
result += "\\n";
|
|
break;
|
|
|
|
case '\r':
|
|
result += "\\r";
|
|
break;
|
|
|
|
case '\t':
|
|
result += "\\t";
|
|
break;
|
|
|
|
case '\b':
|
|
result += "\\b";
|
|
break;
|
|
|
|
case '\f':
|
|
result += "\\f";
|
|
break;
|
|
|
|
case '(':
|
|
result += "\\(";
|
|
break;
|
|
|
|
case ')':
|
|
result += "\\)";
|
|
break;
|
|
|
|
case '\\':
|
|
result += "\\\\";
|
|
break;
|
|
|
|
default:
|
|
if (is_iso_latin1_printable(ch))
|
|
{
|
|
result += this->val[i];
|
|
}
|
|
else
|
|
{
|
|
sprintf(num, "\\%03o", (unsigned char)ch);
|
|
result += num;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
result += ")";
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
std::string
|
|
QPDF_String::getVal() const
|
|
{
|
|
return this->val;
|
|
}
|
|
|
|
std::string
|
|
QPDF_String::getUTF8Val() const
|
|
{
|
|
std::string result;
|
|
unsigned int len = this->val.length();
|
|
if ((len >= 2) && (len % 2 == 0) &&
|
|
(this->val[0] == '\xfe') && (this->val[1] == '\xff'))
|
|
{
|
|
// This is a Unicode string using big-endian UTF-16. This
|
|
// code uses unsigned long and unsigned short to hold
|
|
// codepoint values. It requires unsigned long to be at least
|
|
// 32 bits and unsigned short to be at least 16 bits, but it
|
|
// will work fine if they are larger.
|
|
unsigned long codepoint = 0L;
|
|
for (unsigned int i = 2; i < len; i += 2)
|
|
{
|
|
// Convert from UTF16-BE. If we get a malformed
|
|
// codepoint, this code will generate incorrect output
|
|
// without giving a warning. Specifically, a high
|
|
// codepoint not followed by a low codepoint will be
|
|
// discarded, and a low codepoint not preceded by a high
|
|
// codepoint will just get its low 10 bits output.
|
|
unsigned short bits =
|
|
(((unsigned char) this->val[i]) << 8) +
|
|
((unsigned char) this->val[i+1]);
|
|
if ((bits & 0xFC00) == 0xD800)
|
|
{
|
|
codepoint = 0x10000 + ((bits & 0x3FF) << 10);
|
|
continue;
|
|
}
|
|
else if ((bits & 0xFC00) == 0xDC00)
|
|
{
|
|
if (codepoint != 0)
|
|
{
|
|
QTC::TC("qpdf", "QPDF_String non-trivial UTF-16");
|
|
}
|
|
codepoint += bits & 0x3FF;
|
|
}
|
|
else
|
|
{
|
|
codepoint = bits;
|
|
}
|
|
|
|
result += QUtil::toUTF8(codepoint);
|
|
codepoint = 0;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
for (unsigned int i = 0; i < len; ++i)
|
|
{
|
|
result += QUtil::toUTF8((unsigned char) this->val[i]);
|
|
}
|
|
}
|
|
return result;
|
|
}
|