2
1
mirror of https://github.com/qpdf/qpdf.git synced 2024-10-31 19:02:30 +00:00

Implement JSON v2 for String

Also refine the heuristic for deciding whether to use hexadecimal
notation for a string.
This commit is contained in:
Jay Berkenbilt 2022-05-07 08:20:09 -04:00
parent 16f4f94cd9
commit 3246923cf2
9 changed files with 69 additions and 53 deletions

View File

@ -45,8 +45,32 @@ QPDF_String::unparse()
// Build the JSON representation of this string object.
//
// When json_version is 1, the value returned by getUTF8Val() is emitted
// unchanged. For any other version the emitted value carries a prefix:
//   "u:" followed by the UTF-8 value, when the string is judged to be
//        representable as Unicode, or
//   "b:" followed by QUtil::hex_encode() of the raw bytes otherwise.
JSON
QPDF_String::getJSON(int json_version)
{
// NOTE(review): the marker comment and unconditional return just below
// look like the pre-change body that the diff view shows next to the new
// lines. Taken literally they would make everything after them
// unreachable -- confirm against the real file.
// QXXXQ
return JSON::makeString(getUTF8Val());
if (json_version == 1) {
return JSON::makeString(getUTF8Val());
}
// See if we can unambiguously represent as Unicode.
bool is_unicode = false;
std::string result;
std::string candidate = getUTF8Val();
// Presumably these helpers detect a string that is explicitly marked as
// UTF-16 or UTF-8 -- verify against QUtil.
if (QUtil::is_utf16(this->val) || QUtil::is_explicit_utf8(this->val)) {
is_unicode = true;
result = candidate;
} else if (!useHexString()) {
// Round-trip check: convert the UTF-8 candidate back with
// utf8_to_pdf_doc and accept it only if the conversion reports success
// and reproduces the stored bytes exactly. ('?' is presumably the
// substitute for unrepresentable characters -- confirm in QUtil.)
std::string test;
if (QUtil::utf8_to_pdf_doc(candidate, test, '?') &&
(test == this->val)) {
// This is a PDF-doc string that can be losslessly encoded
// as Unicode.
is_unicode = true;
result = candidate;
}
}
// Tag the result so a reader can tell Unicode text from hex-encoded bytes.
if (is_unicode) {
result = "u:" + result;
} else {
result = "b:" + QUtil::hex_encode(this->val);
}
return JSON::makeString(result);
}
QPDFObject::object_type_e
@ -61,41 +85,32 @@ QPDF_String::getTypeName() const
return "string";
}
// Decide whether this string should be written in hexadecimal notation.
//
// Heuristic: use the hexadecimal representation of a string if there
// are any non-printable (in PDF Doc encoding) characters or if too
// large of a proportion of the string consists of non-ASCII
// characters.
//
// Returns true when either
//   * some byte is in the range 0..23 and is not one of \n \r \t \b \f, or
//   * more than 20% of the bytes are neither printable ASCII nor one of
//     those whitespace escapes.
bool
QPDF_String::useHexString() const
{
    // Use size_t throughout: the length is a size_t, and a narrower
    // index or counter can wrap on very large strings.
    const size_t len = this->val.length();
    size_t non_ascii = 0;
    for (size_t i = 0; i < len; ++i) {
        char ch = this->val[i];
        // The explicit zero test comes first so that strchr is never
        // asked to find the terminating NUL of the literal.
        if ((ch != 0) &&
            (is_ascii_printable(ch) || strchr("\n\r\t\b\f", ch))) {
            continue;
        }
        // ch >= 0 keeps bytes >= 0x80 out of this test on platforms
        // where char is signed.
        if ((ch >= 0) && (ch < 24)) {
            // A single such byte settles the answer; no need to keep
            // counting.
            return true;
        }
        ++non_ascii;
    }
    // Equivalent to 5 * non_ascii > len for unsigned integers, but it
    // cannot overflow.
    return non_ascii > len / 5;
}
std::string
QPDF_String::unparse(bool force_binary)
{
bool use_hexstring = force_binary;
if (!use_hexstring) {
unsigned int nonprintable = 0;
int consecutive_printable = 0;
for (unsigned int i = 0; i < this->val.length(); ++i) {
char ch = this->val.at(i);
// Note: do not use locale to determine printability. The
// PDF specification accepts arbitrary binary data. Some
// locales imply multibyte characters. We'll consider
// something printable if it is printable in 7-bit ASCII.
// We'll code this manually rather than being rude and
// setting locale.
if ((ch == 0) ||
(!(is_ascii_printable(ch) || strchr("\n\r\t\b\f", ch)))) {
++nonprintable;
consecutive_printable = 0;
} else {
if (++consecutive_printable > 5) {
// If there are more than 5 consecutive printable
// characters, I want to see them as such.
nonprintable = 0;
break;
}
}
}
// Use hex notation if more than 20% of the characters are not
// printable in plain ASCII.
if (5 * nonprintable > val.length()) {
use_hexstring = true;
}
}
bool use_hexstring = force_binary || useHexString();
std::string result;
if (use_hexstring) {
result += "<" + QUtil::hex_encode(this->val) + ">";

View File

@ -20,6 +20,7 @@ class QPDF_String: public QPDFObject
std::string getUTF8Val() const;
private:
bool useHexString() const;
std::string val;
};

Binary file not shown.

View File

@ -65,8 +65,8 @@
],
"trailer": {
"/ID": [
"\u0013#¥fi|WzfsU…©6ŸÎ<",
"7,¿DöÛ«`Ù&<\u000f\u000bÒj"
"b:1323a5937c577a66735583a93698ce3c",
"b:372cbf44f6db88ab60d9263c0f0bd26a"
],
"/Root": "1 0 R",
"/Size": 7

View File

@ -89,8 +89,8 @@
},
"trailer": {
"/ID": [
"\u0013#¥fi|WzfsU…©6ŸÎ<",
"7,¿DöÛ«`Ù&<\u000f\u000bÒj"
"b:1323a5937c577a66735583a93698ce3c",
"b:372cbf44f6db88ab60d9263c0f0bd26a"
],
"/Root": "1 0 R",
"/Size": 7

View File

@ -9,7 +9,7 @@ three lines
(string with \nCRLF and\nCR and\nLF)
and another
indentation
(\001B%DEF)<01>
<014225444546><01>
<8a8b>
(ab)
<8c><dd> ) >

View File

@ -1,9 +1,9 @@
{
"/k1": "scalar1",
"/k1": "u:scalar1",
"/k2": 16059,
"/k3": {
"/a": "a",
"/b": "conflict: seen",
"/a": "u:a",
"/b": "u:conflict: seen",
"/c": [
2,
3
@ -12,7 +12,7 @@
"/y": 25,
"/z": 26
},
"/e": "e"
"/e": "u:e"
},
"/k4": {
"/A": 65,
@ -24,11 +24,11 @@
"/k5": [
"/one",
2,
"three",
"u:three",
[
"/four"
],
"two"
"u:two"
]
}
/A

View File

@ -9,8 +9,8 @@
"/Type": "/Catalog"
},
"2 0 R": {
"/CreationDate": "D:20120621124041",
"/Producer": "Apex PDFWriter"
"/CreationDate": "u:D:20120621124041",
"/Producer": "u:Apex PDFWriter"
},
"3 0 R": {
"/Count": 3,
@ -77,8 +77,8 @@
"10 0 R": 47,
"trailer": {
"/ID": [
"û˘·ƒÿ{5\u0005ÚS*ºo",
"÷\u0017ž³QY¿ÔÀ\u000f\u0012¼ý˜\u0002"
"b:fb18b786ff7b358705da8a532aba8f6f",
"b:f7179eb35159bfd4c00f128abcfd1f02"
],
"/Info": "2 0 R",
"/Root": "1 0 R",

View File

@ -41,8 +41,8 @@
"/Type": "/Catalog"
},
"2 0 R": {
"/CreationDate": "D:20120621124041",
"/Producer": "Apex PDFWriter"
"/CreationDate": "u:D:20120621124041",
"/Producer": "u:Apex PDFWriter"
},
"3 0 R": {
"/Count": 3,
@ -129,8 +129,8 @@
},
"trailer": {
"/ID": [
"û˘·ƒÿ{5\u0005ÚS*ºo",
"÷\u0017ž³QY¿ÔÀ\u000f\u0012¼ý˜\u0002"
"b:fb18b786ff7b358705da8a532aba8f6f",
"b:f7179eb35159bfd4c00f128abcfd1f02"
],
"/Info": "2 0 R",
"/Root": "1 0 R",