mirror of
https://github.com/qpdf/qpdf.git
synced 2024-11-01 03:12:29 +00:00
Implement JSON v2 for String
Also refine the heuristic for deciding whether to use hexadecimal notation for a string.
This commit is contained in:
parent
16f4f94cd9
commit
3246923cf2
@ -45,9 +45,33 @@ QPDF_String::unparse()
|
|||||||
JSON
|
JSON
|
||||||
QPDF_String::getJSON(int json_version)
|
QPDF_String::getJSON(int json_version)
|
||||||
{
|
{
|
||||||
// QXXXQ
|
if (json_version == 1) {
|
||||||
return JSON::makeString(getUTF8Val());
|
return JSON::makeString(getUTF8Val());
|
||||||
}
|
}
|
||||||
|
// See if we can unambiguously represent as Unicode.
|
||||||
|
bool is_unicode = false;
|
||||||
|
std::string result;
|
||||||
|
std::string candidate = getUTF8Val();
|
||||||
|
if (QUtil::is_utf16(this->val) || QUtil::is_explicit_utf8(this->val)) {
|
||||||
|
is_unicode = true;
|
||||||
|
result = candidate;
|
||||||
|
} else if (!useHexString()) {
|
||||||
|
std::string test;
|
||||||
|
if (QUtil::utf8_to_pdf_doc(candidate, test, '?') &&
|
||||||
|
(test == this->val)) {
|
||||||
|
// This is a PDF-doc string that can be losslessly encoded
|
||||||
|
// as Unicode.
|
||||||
|
is_unicode = true;
|
||||||
|
result = candidate;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (is_unicode) {
|
||||||
|
result = "u:" + result;
|
||||||
|
} else {
|
||||||
|
result = "b:" + QUtil::hex_encode(this->val);
|
||||||
|
}
|
||||||
|
return JSON::makeString(result);
|
||||||
|
}
|
||||||
|
|
||||||
QPDFObject::object_type_e
|
QPDFObject::object_type_e
|
||||||
QPDF_String::getTypeCode() const
|
QPDF_String::getTypeCode() const
|
||||||
@ -61,41 +85,32 @@ QPDF_String::getTypeName() const
|
|||||||
return "string";
|
return "string";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
QPDF_String::useHexString() const
|
||||||
|
{
|
||||||
|
// Heuristic: use the hexadecimal representation of a string if
|
||||||
|
// there are any non-printable (in PDF Doc encoding) characters or
|
||||||
|
// if too large of a proportion of the string consists of
|
||||||
|
// non-ASCII characters.
|
||||||
|
bool nonprintable = false;
|
||||||
|
unsigned int non_ascii = 0;
|
||||||
|
for (unsigned int i = 0; i < this->val.length(); ++i) {
|
||||||
|
char ch = this->val.at(i);
|
||||||
|
if ((ch == 0) ||
|
||||||
|
(!(is_ascii_printable(ch) || strchr("\n\r\t\b\f", ch)))) {
|
||||||
|
if ((ch >= 0) && (ch < 24)) {
|
||||||
|
nonprintable = true;
|
||||||
|
}
|
||||||
|
++non_ascii;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return (nonprintable || (5 * non_ascii > val.length()));
|
||||||
|
}
|
||||||
|
|
||||||
std::string
|
std::string
|
||||||
QPDF_String::unparse(bool force_binary)
|
QPDF_String::unparse(bool force_binary)
|
||||||
{
|
{
|
||||||
bool use_hexstring = force_binary;
|
bool use_hexstring = force_binary || useHexString();
|
||||||
if (!use_hexstring) {
|
|
||||||
unsigned int nonprintable = 0;
|
|
||||||
int consecutive_printable = 0;
|
|
||||||
for (unsigned int i = 0; i < this->val.length(); ++i) {
|
|
||||||
char ch = this->val.at(i);
|
|
||||||
// Note: do not use locale to determine printability. The
|
|
||||||
// PDF specification accepts arbitrary binary data. Some
|
|
||||||
// locales imply multibyte characters. We'll consider
|
|
||||||
// something printable if it is printable in 7-bit ASCII.
|
|
||||||
// We'll code this manually rather than being rude and
|
|
||||||
// setting locale.
|
|
||||||
if ((ch == 0) ||
|
|
||||||
(!(is_ascii_printable(ch) || strchr("\n\r\t\b\f", ch)))) {
|
|
||||||
++nonprintable;
|
|
||||||
consecutive_printable = 0;
|
|
||||||
} else {
|
|
||||||
if (++consecutive_printable > 5) {
|
|
||||||
// If there are more than 5 consecutive printable
|
|
||||||
// characters, I want to see them as such.
|
|
||||||
nonprintable = 0;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Use hex notation if more than 20% of the characters are not
|
|
||||||
// printable in plain ASCII.
|
|
||||||
if (5 * nonprintable > val.length()) {
|
|
||||||
use_hexstring = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
std::string result;
|
std::string result;
|
||||||
if (use_hexstring) {
|
if (use_hexstring) {
|
||||||
result += "<" + QUtil::hex_encode(this->val) + ">";
|
result += "<" + QUtil::hex_encode(this->val) + ">";
|
||||||
|
@ -20,6 +20,7 @@ class QPDF_String: public QPDFObject
|
|||||||
std::string getUTF8Val() const;
|
std::string getUTF8Val() const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
bool useHexString() const;
|
||||||
std::string val;
|
std::string val;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
Binary file not shown.
@ -65,8 +65,8 @@
|
|||||||
],
|
],
|
||||||
"trailer": {
|
"trailer": {
|
||||||
"/ID": [
|
"/ID": [
|
||||||
"\u0013#¥fi|WzfsU…©6ŸÎ<",
|
"b:1323a5937c577a66735583a93698ce3c",
|
||||||
"7,¿DöÛ‹«`Ù&<\u000f\u000bÒj"
|
"b:372cbf44f6db88ab60d9263c0f0bd26a"
|
||||||
],
|
],
|
||||||
"/Root": "1 0 R",
|
"/Root": "1 0 R",
|
||||||
"/Size": 7
|
"/Size": 7
|
||||||
|
@ -89,8 +89,8 @@
|
|||||||
},
|
},
|
||||||
"trailer": {
|
"trailer": {
|
||||||
"/ID": [
|
"/ID": [
|
||||||
"\u0013#¥fi|WzfsU…©6ŸÎ<",
|
"b:1323a5937c577a66735583a93698ce3c",
|
||||||
"7,¿DöÛ‹«`Ù&<\u000f\u000bÒj"
|
"b:372cbf44f6db88ab60d9263c0f0bd26a"
|
||||||
],
|
],
|
||||||
"/Root": "1 0 R",
|
"/Root": "1 0 R",
|
||||||
"/Size": 7
|
"/Size": 7
|
||||||
|
@ -9,7 +9,7 @@ three lines
|
|||||||
(string with \nCRLF and\nCR and\nLF)
|
(string with \nCRLF and\nCR and\nLF)
|
||||||
and another
|
and another
|
||||||
indentation
|
indentation
|
||||||
(\001B%DEF)<01>
|
<014225444546><01>
|
||||||
<8a8b>
|
<8a8b>
|
||||||
(ab)
|
(ab)
|
||||||
<8c><dd> ) >
|
<8c><dd> ) >
|
||||||
|
@ -1,9 +1,9 @@
|
|||||||
{
|
{
|
||||||
"/k1": "scalar1",
|
"/k1": "u:scalar1",
|
||||||
"/k2": 16059,
|
"/k2": 16059,
|
||||||
"/k3": {
|
"/k3": {
|
||||||
"/a": "a",
|
"/a": "u:a",
|
||||||
"/b": "conflict: seen",
|
"/b": "u:conflict: seen",
|
||||||
"/c": [
|
"/c": [
|
||||||
2,
|
2,
|
||||||
3
|
3
|
||||||
@ -12,7 +12,7 @@
|
|||||||
"/y": 25,
|
"/y": 25,
|
||||||
"/z": 26
|
"/z": 26
|
||||||
},
|
},
|
||||||
"/e": "e"
|
"/e": "u:e"
|
||||||
},
|
},
|
||||||
"/k4": {
|
"/k4": {
|
||||||
"/A": 65,
|
"/A": 65,
|
||||||
@ -24,11 +24,11 @@
|
|||||||
"/k5": [
|
"/k5": [
|
||||||
"/one",
|
"/one",
|
||||||
2,
|
2,
|
||||||
"three",
|
"u:three",
|
||||||
[
|
[
|
||||||
"/four"
|
"/four"
|
||||||
],
|
],
|
||||||
"two"
|
"u:two"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
/A
|
/A
|
||||||
|
@ -9,8 +9,8 @@
|
|||||||
"/Type": "/Catalog"
|
"/Type": "/Catalog"
|
||||||
},
|
},
|
||||||
"2 0 R": {
|
"2 0 R": {
|
||||||
"/CreationDate": "D:20120621124041",
|
"/CreationDate": "u:D:20120621124041",
|
||||||
"/Producer": "Apex PDFWriter"
|
"/Producer": "u:Apex PDFWriter"
|
||||||
},
|
},
|
||||||
"3 0 R": {
|
"3 0 R": {
|
||||||
"/Count": 3,
|
"/Count": 3,
|
||||||
@ -77,8 +77,8 @@
|
|||||||
"10 0 R": 47,
|
"10 0 R": 47,
|
||||||
"trailer": {
|
"trailer": {
|
||||||
"/ID": [
|
"/ID": [
|
||||||
"û˘·ƒÿ{5⁄\u0005Ú−S*º‘o",
|
"b:fb18b786ff7b358705da8a532aba8f6f",
|
||||||
"÷\u0017ž³QY¿ÔÀ\u000f\u0012−¼ý˜\u0002"
|
"b:f7179eb35159bfd4c00f128abcfd1f02"
|
||||||
],
|
],
|
||||||
"/Info": "2 0 R",
|
"/Info": "2 0 R",
|
||||||
"/Root": "1 0 R",
|
"/Root": "1 0 R",
|
||||||
|
@ -41,8 +41,8 @@
|
|||||||
"/Type": "/Catalog"
|
"/Type": "/Catalog"
|
||||||
},
|
},
|
||||||
"2 0 R": {
|
"2 0 R": {
|
||||||
"/CreationDate": "D:20120621124041",
|
"/CreationDate": "u:D:20120621124041",
|
||||||
"/Producer": "Apex PDFWriter"
|
"/Producer": "u:Apex PDFWriter"
|
||||||
},
|
},
|
||||||
"3 0 R": {
|
"3 0 R": {
|
||||||
"/Count": 3,
|
"/Count": 3,
|
||||||
@ -129,8 +129,8 @@
|
|||||||
},
|
},
|
||||||
"trailer": {
|
"trailer": {
|
||||||
"/ID": [
|
"/ID": [
|
||||||
"û˘·ƒÿ{5⁄\u0005Ú−S*º‘o",
|
"b:fb18b786ff7b358705da8a532aba8f6f",
|
||||||
"÷\u0017ž³QY¿ÔÀ\u000f\u0012−¼ý˜\u0002"
|
"b:f7179eb35159bfd4c00f128abcfd1f02"
|
||||||
],
|
],
|
||||||
"/Info": "2 0 R",
|
"/Info": "2 0 R",
|
||||||
"/Root": "1 0 R",
|
"/Root": "1 0 R",
|
||||||
|
Loading…
Reference in New Issue
Block a user