2
1
mirror of https://github.com/qpdf/qpdf.git synced 2024-11-01 03:12:29 +00:00

Implement JSON v2 for String

Also refine the heuristic for deciding whether to use hexadecimal
notation for a string.
This commit is contained in:
Jay Berkenbilt 2022-05-07 08:20:09 -04:00
parent 16f4f94cd9
commit 3246923cf2
9 changed files with 69 additions and 53 deletions

View File

@ -45,9 +45,33 @@ QPDF_String::unparse()
// Produce the JSON representation of this string object.
// When json_version is 1 the result is simply the UTF-8 value. For any
// other version the value is tagged: "u:" followed by Unicode text, or
// "b:" followed by the hex encoding of the raw bytes.
JSON JSON
QPDF_String::getJSON(int json_version) QPDF_String::getJSON(int json_version)
{ {
// QXXXQ if (json_version == 1) {
return JSON::makeString(getUTF8Val()); return JSON::makeString(getUTF8Val());
} }
// See if we can unambiguously represent as Unicode.
bool is_unicode = false;
std::string result;
std::string candidate = getUTF8Val();
// Values that QUtil reports as UTF-16 or explicit UTF-8 are taken as
// Unicode without further checks.
if (QUtil::is_utf16(this->val) || QUtil::is_explicit_utf8(this->val)) {
is_unicode = true;
result = candidate;
// Otherwise, and only when the hex-string heuristic does not fire,
// accept the UTF-8 form if converting it back with utf8_to_pdf_doc
// reproduces the stored bytes exactly.
// NOTE(review): '?' is presumably the substitute for unmappable
// characters -- confirm against QUtil::utf8_to_pdf_doc.
} else if (!useHexString()) {
std::string test;
if (QUtil::utf8_to_pdf_doc(candidate, test, '?') &&
(test == this->val)) {
// This is a PDF-doc string that can be losslessly encoded
// as Unicode.
is_unicode = true;
result = candidate;
}
}
// Tag the output so the two representations can be told apart.
if (is_unicode) {
result = "u:" + result;
} else {
result = "b:" + QUtil::hex_encode(this->val);
}
return JSON::makeString(result);
}
QPDFObject::object_type_e QPDFObject::object_type_e
QPDF_String::getTypeCode() const QPDF_String::getTypeCode() const
@ -61,41 +85,32 @@ QPDF_String::getTypeName() const
return "string"; return "string";
} }
bool
QPDF_String::useHexString() const
{
    // Heuristic: decide whether this string should be written in
    // hexadecimal notation. Hex is chosen if there are any
    // non-printable (in PDF Doc encoding) characters or if too large
    // a proportion of the string (more than one fifth of its bytes)
    // consists of non-ASCII characters.
    //
    // Returns true when hexadecimal notation should be used.
    //
    // The counter is a size_t so that it can never be narrower than
    // val.length().
    size_t non_ascii = 0;
    for (char ch: this->val) {
        // strchr also matches the terminating NUL of its search
        // string, so a zero byte has to be excluded explicitly.
        bool const plain = (ch != 0) &&
            (is_ascii_printable(ch) || strchr("\n\r\t\b\f", ch));
        if (plain) {
            continue;
        }
        if ((ch >= 0) && (ch < 24)) {
            // A byte in the range 0-23 that is not one of the
            // escapes above: a single one is enough to force hex,
            // so there is no need to look at the rest.
            return true;
        }
        ++non_ascii;
    }
    // Same test as (5 * non_ascii > length), written as a division
    // so that the arithmetic cannot overflow.
    return non_ascii > (this->val.length() / 5);
}
std::string std::string
QPDF_String::unparse(bool force_binary) QPDF_String::unparse(bool force_binary)
{ {
bool use_hexstring = force_binary; bool use_hexstring = force_binary || useHexString();
if (!use_hexstring) {
unsigned int nonprintable = 0;
int consecutive_printable = 0;
for (unsigned int i = 0; i < this->val.length(); ++i) {
char ch = this->val.at(i);
// Note: do not use locale to determine printability. The
// PDF specification accepts arbitrary binary data. Some
// locales imply multibyte characters. We'll consider
// something printable if it is printable in 7-bit ASCII.
// We'll code this manually rather than being rude and
// setting locale.
if ((ch == 0) ||
(!(is_ascii_printable(ch) || strchr("\n\r\t\b\f", ch)))) {
++nonprintable;
consecutive_printable = 0;
} else {
if (++consecutive_printable > 5) {
// If there are more than 5 consecutive printable
// characters, I want to see them as such.
nonprintable = 0;
break;
}
}
}
// Use hex notation if more than 20% of the characters are not
// printable in plain ASCII.
if (5 * nonprintable > val.length()) {
use_hexstring = true;
}
}
std::string result; std::string result;
if (use_hexstring) { if (use_hexstring) {
result += "<" + QUtil::hex_encode(this->val) + ">"; result += "<" + QUtil::hex_encode(this->val) + ">";

View File

@ -20,6 +20,7 @@ class QPDF_String: public QPDFObject
std::string getUTF8Val() const; std::string getUTF8Val() const;
private: private:
bool useHexString() const;
std::string val; std::string val;
}; };

Binary file not shown.

View File

@ -65,8 +65,8 @@
], ],
"trailer": { "trailer": {
"/ID": [ "/ID": [
"\u0013#¥fi|WzfsU…©6ŸÎ<", "b:1323a5937c577a66735583a93698ce3c",
"7,¿DöÛ«`Ù&<\u000f\u000bÒj" "b:372cbf44f6db88ab60d9263c0f0bd26a"
], ],
"/Root": "1 0 R", "/Root": "1 0 R",
"/Size": 7 "/Size": 7

View File

@ -89,8 +89,8 @@
}, },
"trailer": { "trailer": {
"/ID": [ "/ID": [
"\u0013#¥fi|WzfsU…©6ŸÎ<", "b:1323a5937c577a66735583a93698ce3c",
"7,¿DöÛ«`Ù&<\u000f\u000bÒj" "b:372cbf44f6db88ab60d9263c0f0bd26a"
], ],
"/Root": "1 0 R", "/Root": "1 0 R",
"/Size": 7 "/Size": 7

View File

@ -9,7 +9,7 @@ three lines
(string with \nCRLF and\nCR and\nLF) (string with \nCRLF and\nCR and\nLF)
and another and another
indentation indentation
(\001B%DEF)<01> <014225444546><01>
<8a8b> <8a8b>
(ab) (ab)
<8c><dd> ) > <8c><dd> ) >

View File

@ -1,9 +1,9 @@
{ {
"/k1": "scalar1", "/k1": "u:scalar1",
"/k2": 16059, "/k2": 16059,
"/k3": { "/k3": {
"/a": "a", "/a": "u:a",
"/b": "conflict: seen", "/b": "u:conflict: seen",
"/c": [ "/c": [
2, 2,
3 3
@ -12,7 +12,7 @@
"/y": 25, "/y": 25,
"/z": 26 "/z": 26
}, },
"/e": "e" "/e": "u:e"
}, },
"/k4": { "/k4": {
"/A": 65, "/A": 65,
@ -24,11 +24,11 @@
"/k5": [ "/k5": [
"/one", "/one",
2, 2,
"three", "u:three",
[ [
"/four" "/four"
], ],
"two" "u:two"
] ]
} }
/A /A

View File

@ -9,8 +9,8 @@
"/Type": "/Catalog" "/Type": "/Catalog"
}, },
"2 0 R": { "2 0 R": {
"/CreationDate": "D:20120621124041", "/CreationDate": "u:D:20120621124041",
"/Producer": "Apex PDFWriter" "/Producer": "u:Apex PDFWriter"
}, },
"3 0 R": { "3 0 R": {
"/Count": 3, "/Count": 3,
@ -77,8 +77,8 @@
"10 0 R": 47, "10 0 R": 47,
"trailer": { "trailer": {
"/ID": [ "/ID": [
"û˘·ƒÿ{5\u0005ÚS*ºo", "b:fb18b786ff7b358705da8a532aba8f6f",
"÷\u0017ž³QY¿ÔÀ\u000f\u0012¼ý˜\u0002" "b:f7179eb35159bfd4c00f128abcfd1f02"
], ],
"/Info": "2 0 R", "/Info": "2 0 R",
"/Root": "1 0 R", "/Root": "1 0 R",

View File

@ -41,8 +41,8 @@
"/Type": "/Catalog" "/Type": "/Catalog"
}, },
"2 0 R": { "2 0 R": {
"/CreationDate": "D:20120621124041", "/CreationDate": "u:D:20120621124041",
"/Producer": "Apex PDFWriter" "/Producer": "u:Apex PDFWriter"
}, },
"3 0 R": { "3 0 R": {
"/Count": 3, "/Count": 3,
@ -129,8 +129,8 @@
}, },
"trailer": { "trailer": {
"/ID": [ "/ID": [
"û˘·ƒÿ{5\u0005ÚS*ºo", "b:fb18b786ff7b358705da8a532aba8f6f",
"÷\u0017ž³QY¿ÔÀ\u000f\u0012¼ý˜\u0002" "b:f7179eb35159bfd4c00f128abcfd1f02"
], ],
"/Info": "2 0 R", "/Info": "2 0 R",
"/Root": "1 0 R", "/Root": "1 0 R",