2008-04-29 12:55:25 +00:00
|
|
|
#include <qpdf/QPDF_String.hh>
|
|
|
|
|
2022-04-02 17:14:10 -04:00
|
|
|
#include <qpdf/QUtil.hh>
|
2008-11-23 18:49:13 +00:00
|
|
|
|
2023-05-27 18:19:52 +01:00
|
|
|
// DO NOT USE ctype -- it is locale dependent for some things, and it's not worth the risk of
|
|
|
|
// including it in case it may accidentally be used.
|
2008-04-29 12:55:25 +00:00
|
|
|
|
2022-04-02 17:14:10 -04:00
|
|
|
// Return true when ch, viewed as an unsigned byte, is either printable ASCII (32 through 126) or
// lies in the upper range starting at 160. All other byte values, including 127 through 159,
// yield false. Working on the unsigned value keeps the result independent of whether plain char
// is signed on the target platform.
static bool
is_iso_latin1_printable(char ch)
{
    auto const code = static_cast<unsigned char>(ch);
    bool const ascii_printable = (code >= 32) && (code <= 126);
    bool const high_printable = (code >= 160);
    return ascii_printable || high_printable;
}
|
|
|
|
|
|
|
|
// Construct a string object holding a copy of the given bytes. The base QPDFValue is initialized
// with the ot_string type code and the type name "string".
QPDF_String::QPDF_String(std::string const& val) :
    QPDFValue(::ot_string, "string"),
    val(val)
{
    // Nothing further to do: all state is set up by the member initializers.
}
|
|
|
|
|
2022-09-08 11:29:23 -04:00
|
|
|
std::shared_ptr<QPDFObject>
|
2022-06-16 17:45:04 +01:00
|
|
|
QPDF_String::create(std::string const& val)
|
|
|
|
{
|
|
|
|
return do_create(new QPDF_String(val));
|
|
|
|
}
|
|
|
|
|
2022-09-08 11:29:23 -04:00
|
|
|
std::shared_ptr<QPDFObject>
|
2022-06-16 17:45:04 +01:00
|
|
|
QPDF_String::create_utf16(std::string const& utf8_val)
|
2019-01-05 12:54:41 -05:00
|
|
|
{
|
2021-01-23 17:58:23 -05:00
|
|
|
std::string result;
|
2022-04-02 17:14:10 -04:00
|
|
|
if (!QUtil::utf8_to_pdf_doc(utf8_val, result, '?')) {
|
2021-01-23 17:58:23 -05:00
|
|
|
result = QUtil::utf8_to_utf16(utf8_val);
|
|
|
|
}
|
2022-06-16 17:45:04 +01:00
|
|
|
return do_create(new QPDF_String(result));
|
|
|
|
}
|
|
|
|
|
2022-09-08 11:29:23 -04:00
|
|
|
std::shared_ptr<QPDFObject>
|
2022-11-14 17:54:12 +00:00
|
|
|
QPDF_String::copy(bool shallow)
|
2022-06-16 17:45:04 +01:00
|
|
|
{
|
|
|
|
return create(val);
|
2018-06-21 14:03:45 -04:00
|
|
|
}
|
|
|
|
|
2008-04-29 12:55:25 +00:00
|
|
|
std::string
|
|
|
|
QPDF_String::unparse()
|
|
|
|
{
|
|
|
|
return unparse(false);
|
|
|
|
}
|
|
|
|
|
2018-12-17 17:40:29 -05:00
|
|
|
// Build the JSON representation of this string.
//
// For json_version 1 the result is simply the UTF-8 value. For any other version the result is a
// JSON string with a prefix: "u:" followed by the UTF-8 value when the string can be represented
// unambiguously as Unicode, otherwise "b:" followed by QUtil::hex_encode of the raw bytes.
JSON
QPDF_String::getJSON(int json_version)
{
    if (json_version == 1) {
        return JSON::makeString(getUTF8Val());
    }

    std::string const utf8 = getUTF8Val();

    // Strings that are UTF-16 or explicitly marked UTF-8 are Unicode by construction.
    bool as_unicode = QUtil::is_utf16(this->val) || QUtil::is_explicit_utf8(this->val);

    if (!as_unicode && !useHexString()) {
        // Otherwise accept the string only if converting the UTF-8 form back to PDF doc encoding
        // reproduces the original bytes exactly, i.e. it is a PDF-doc string that can be
        // losslessly encoded as Unicode.
        std::string round_trip;
        as_unicode =
            QUtil::utf8_to_pdf_doc(utf8, round_trip, '?') && (round_trip == this->val);
    }

    std::string tagged;
    if (as_unicode) {
        tagged = "u:" + utf8;
    } else {
        tagged = "b:" + QUtil::hex_encode(this->val);
    }
    return JSON::makeString(tagged);
}
|
|
|
|
|
2022-05-07 08:20:09 -04:00
|
|
|
bool
|
|
|
|
QPDF_String::useHexString() const
|
2008-04-29 12:55:25 +00:00
|
|
|
{
|
2023-05-27 18:19:52 +01:00
|
|
|
// Heuristic: use the hexadecimal representation of a string if there are any non-printable (in
|
|
|
|
// PDF Doc encoding) characters or if too large of a proportion of the string consists of
|
2022-05-07 08:20:09 -04:00
|
|
|
// non-ASCII characters.
|
|
|
|
unsigned int non_ascii = 0;
|
2022-09-23 18:56:07 +01:00
|
|
|
for (auto const ch: this->val) {
|
|
|
|
if (ch > 126) {
|
|
|
|
++non_ascii;
|
|
|
|
} else if (ch >= 32) {
|
|
|
|
continue;
|
|
|
|
} else if (ch < 0 || ch >= 24) {
|
2022-05-07 08:20:09 -04:00
|
|
|
++non_ascii;
|
2023-05-21 13:35:09 -04:00
|
|
|
} else if (!(ch == '\n' || ch == '\r' || ch == '\t' || ch == '\b' || ch == '\f')) {
|
2022-09-23 18:56:07 +01:00
|
|
|
return true;
|
2022-02-08 09:18:08 -05:00
|
|
|
}
|
2008-04-29 12:55:25 +00:00
|
|
|
}
|
2022-09-23 18:56:07 +01:00
|
|
|
return 5 * non_ascii > val.length();
|
2022-05-07 08:20:09 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
std::string
|
|
|
|
QPDF_String::unparse(bool force_binary)
|
|
|
|
{
|
|
|
|
bool use_hexstring = force_binary || useHexString();
|
2008-04-29 12:55:25 +00:00
|
|
|
std::string result;
|
2022-04-02 17:14:10 -04:00
|
|
|
if (use_hexstring) {
|
2022-09-21 20:21:17 +01:00
|
|
|
static auto constexpr hexchars = "0123456789abcdef";
|
|
|
|
result.reserve(2 * this->val.length() + 2);
|
|
|
|
result += '<';
|
|
|
|
for (const char c: this->val) {
|
|
|
|
result += hexchars[static_cast<unsigned char>(c) >> 4];
|
|
|
|
result += hexchars[c & 0x0f];
|
|
|
|
}
|
|
|
|
result += '>';
|
2022-04-02 17:14:10 -04:00
|
|
|
} else {
|
2022-02-08 09:18:08 -05:00
|
|
|
result += "(";
|
2022-04-02 17:14:10 -04:00
|
|
|
for (unsigned int i = 0; i < this->val.length(); ++i) {
|
2022-02-08 09:18:08 -05:00
|
|
|
char ch = this->val.at(i);
|
2022-04-02 17:14:10 -04:00
|
|
|
switch (ch) {
|
|
|
|
case '\n':
|
2022-02-08 09:18:08 -05:00
|
|
|
result += "\\n";
|
|
|
|
break;
|
|
|
|
|
2022-04-02 17:14:10 -04:00
|
|
|
case '\r':
|
2022-02-08 09:18:08 -05:00
|
|
|
result += "\\r";
|
|
|
|
break;
|
|
|
|
|
2022-04-02 17:14:10 -04:00
|
|
|
case '\t':
|
2022-02-08 09:18:08 -05:00
|
|
|
result += "\\t";
|
|
|
|
break;
|
|
|
|
|
2022-04-02 17:14:10 -04:00
|
|
|
case '\b':
|
2022-02-08 09:18:08 -05:00
|
|
|
result += "\\b";
|
|
|
|
break;
|
|
|
|
|
2022-04-02 17:14:10 -04:00
|
|
|
case '\f':
|
2022-02-08 09:18:08 -05:00
|
|
|
result += "\\f";
|
|
|
|
break;
|
|
|
|
|
2022-04-02 17:14:10 -04:00
|
|
|
case '(':
|
2022-02-08 09:18:08 -05:00
|
|
|
result += "\\(";
|
|
|
|
break;
|
|
|
|
|
2022-04-02 17:14:10 -04:00
|
|
|
case ')':
|
2022-02-08 09:18:08 -05:00
|
|
|
result += "\\)";
|
|
|
|
break;
|
|
|
|
|
2022-04-02 17:14:10 -04:00
|
|
|
case '\\':
|
2022-02-08 09:18:08 -05:00
|
|
|
result += "\\\\";
|
|
|
|
break;
|
|
|
|
|
2022-04-02 17:14:10 -04:00
|
|
|
default:
|
|
|
|
if (is_iso_latin1_printable(ch)) {
|
2022-02-08 09:18:08 -05:00
|
|
|
result += this->val.at(i);
|
2022-04-02 17:14:10 -04:00
|
|
|
} else {
|
2023-05-21 13:35:09 -04:00
|
|
|
result += "\\" +
|
2022-04-02 17:14:10 -04:00
|
|
|
QUtil::int_to_string_base(
|
2023-05-21 13:35:09 -04:00
|
|
|
static_cast<int>(static_cast<unsigned char>(ch)), 8, 3);
|
2022-02-08 09:18:08 -05:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
result += ")";
|
2008-04-29 12:55:25 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string
|
|
|
|
QPDF_String::getUTF8Val() const
|
|
|
|
{
|
2022-04-02 17:14:10 -04:00
|
|
|
if (QUtil::is_utf16(this->val)) {
|
2019-01-13 08:00:14 -05:00
|
|
|
return QUtil::utf16_to_utf8(this->val);
|
2022-04-23 16:39:27 -04:00
|
|
|
} else if (QUtil::is_explicit_utf8(this->val)) {
|
2023-05-27 18:19:52 +01:00
|
|
|
// PDF 2.0 allows UTF-8 strings when explicitly prefixed with the three-byte representation
|
|
|
|
// of U+FEFF.
|
2022-02-22 08:04:11 -05:00
|
|
|
return this->val.substr(3);
|
2022-04-02 17:14:10 -04:00
|
|
|
} else {
|
2019-01-13 08:00:14 -05:00
|
|
|
return QUtil::pdf_doc_to_utf8(this->val);
|
2008-04-29 12:55:25 +00:00
|
|
|
}
|
|
|
|
}
|