From 3246923cf2189554f7c348ebf51c9774c09deec8 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Sat, 7 May 2022 08:20:09 -0400 Subject: [PATCH] Implement JSON v2 for String Also refine the heuristic for deciding whether to use hexadecimal notation for a string. --- libqpdf/QPDF_String.cc | 83 +++++++++++------- libqpdf/qpdf/QPDF_String.hh | 1 + qpdf/qtest/qpdf/V4-clearmeta.pdf | Bin 15225 -> 15240 bytes qpdf/qtest/qpdf/direct-pages-json-objects.out | 4 +- qpdf/qtest/qpdf/direct-pages-json-pages.out | 4 +- qpdf/qtest/qpdf/good14.out | 2 +- qpdf/qtest/qpdf/merge-dict.out | 12 +-- qpdf/qtest/qpdf/page_api_2-json-objects.out | 8 +- qpdf/qtest/qpdf/page_api_2-json-pages.out | 8 +- 9 files changed, 69 insertions(+), 53 deletions(-) diff --git a/libqpdf/QPDF_String.cc b/libqpdf/QPDF_String.cc index fd820998..4d45d851 100644 --- a/libqpdf/QPDF_String.cc +++ b/libqpdf/QPDF_String.cc @@ -45,8 +45,32 @@ QPDF_String::unparse() JSON QPDF_String::getJSON(int json_version) { - // QXXXQ - return JSON::makeString(getUTF8Val()); + if (json_version == 1) { + return JSON::makeString(getUTF8Val()); + } + // See if we can unambiguously represent as Unicode. + bool is_unicode = false; + std::string result; + std::string candidate = getUTF8Val(); + if (QUtil::is_utf16(this->val) || QUtil::is_explicit_utf8(this->val)) { + is_unicode = true; + result = candidate; + } else if (!useHexString()) { + std::string test; + if (QUtil::utf8_to_pdf_doc(candidate, test, '?') && + (test == this->val)) { + // This is a PDF-doc string that can be losslessly encoded + // as Unicode. 
+ is_unicode = true; + result = candidate; + } + } + if (is_unicode) { + result = "u:" + result; + } else { + result = "b:" + QUtil::hex_encode(this->val); + } + return JSON::makeString(result); } QPDFObject::object_type_e @@ -61,41 +85,32 @@ QPDF_String::getTypeName() const return "string"; } +bool +QPDF_String::useHexString() const +{ + // Heuristic: use the hexadecimal representation of a string if + // there are any non-printable (in PDF Doc encoding) characters or + // if too large of a proportion of the string consists of + // non-ASCII characters. + bool nonprintable = false; + unsigned int non_ascii = 0; + for (unsigned int i = 0; i < this->val.length(); ++i) { + char ch = this->val.at(i); + if ((ch == 0) || + (!(is_ascii_printable(ch) || strchr("\n\r\t\b\f", ch)))) { + if ((ch >= 0) && (ch < 24)) { + nonprintable = true; + } + ++non_ascii; + } + } + return (nonprintable || (5 * non_ascii > val.length())); +} + std::string QPDF_String::unparse(bool force_binary) { - bool use_hexstring = force_binary; - if (!use_hexstring) { - unsigned int nonprintable = 0; - int consecutive_printable = 0; - for (unsigned int i = 0; i < this->val.length(); ++i) { - char ch = this->val.at(i); - // Note: do not use locale to determine printability. The - // PDF specification accepts arbitrary binary data. Some - // locales imply multibyte characters. We'll consider - // something printable if it is printable in 7-bit ASCII. - // We'll code this manually rather than being rude and - // setting locale. - if ((ch == 0) || - (!(is_ascii_printable(ch) || strchr("\n\r\t\b\f", ch)))) { - ++nonprintable; - consecutive_printable = 0; - } else { - if (++consecutive_printable > 5) { - // If there are more than 5 consecutive printable - // characters, I want to see them as such. - nonprintable = 0; - break; - } - } - } - - // Use hex notation if more than 20% of the characters are not - // printable in plain ASCII. 
- if (5 * nonprintable > val.length()) { - use_hexstring = true; - } - } + bool use_hexstring = force_binary || useHexString(); std::string result; if (use_hexstring) { result += "<" + QUtil::hex_encode(this->val) + ">"; diff --git a/libqpdf/qpdf/QPDF_String.hh b/libqpdf/qpdf/QPDF_String.hh index df33138c..6fd1b0e9 100644 --- a/libqpdf/qpdf/QPDF_String.hh +++ b/libqpdf/qpdf/QPDF_String.hh @@ -20,6 +20,7 @@ class QPDF_String: public QPDFObject std::string getUTF8Val() const; private: + bool useHexString() const; std::string val; }; diff --git a/qpdf/qtest/qpdf/V4-clearmeta.pdf b/qpdf/qtest/qpdf/V4-clearmeta.pdf index 7d5786c0472e5cccda36169b3fdf224f3c0ce6f8..5e804faa0b2aedbdb18c33a1a06db43f6fde4299 100644 GIT binary patch delta 1522 zcmZwHJ(AQw3%%(Za| z8#=+$EQFXVAr?Z2WnGY^(XNM^{OM}ygtTUav<^aA3L}*?h@+F?Ha0@sl@JdQ;#q`r z2|~I$8NJy*V_f|A-j$G^!bl|{flf$Rqd~&$%J?uZe}v>ZAdy8M%qg1d=MqZqCeB0N zKhyKixF4HPv(r}Kce-CSHuseg6#Bf;SKa8V&@Kyi74XvD*8`Ll4OI?;s~a_4(P{Tq z=F+$X93ogF5ZD@lsIL)-q^MfAt8XSf8A7~OaVsc70~|2 wXAy9K0}ce8FF-Y#P7z$Q8#V6~dT7w@fEJwxFu0t5zxwt(Wb^p=_WirfUyc1MbpQYW delta 1537 zcmZvcziL%M5XQ@p`wFJGO&}2x=FiT|PP!D@2DB2+3)rMdNJw>BOCc|yU}Y7^0|ab@ zynxumT6_U51os{cxo5w-#o;jQ?r*-CZ|B|RpNpHFE!`W4>~s6bAgX`w?|(l(`}FPf z`>h^-c>VL&K&(H$9t?Z0-o6>m2O+uJzkYxC?DlHw%fVPPmb6;S)8%6NAt9>}6HSN} zA&5B%#G)36rTZ#2u7i~efK?SBY77wd1c=s{AnS}W7uWc06)6BIMF3Kq08*kNBprkp z8!${q1&A2~#5@6FH2~t|0C6e6@TPcb$+*^cZ>Re{c~pdC03_G|5~?61jAx9g{^6-T zxN+ga+I-&ctITk=ashA^2cuWbrhjt4ll!Y}xX^W$_3B6sgBMXZB%K?QPL}^w9iPA{ z#(grW0gAIEfYSx4kWoN!)|fL^_0P;6!o=J|n56U&CWW#xm7GI}n4FOjt^S>2rs@c3 zY4`h>xo;h_fZ>`uV3=_mGmqXHAEzEOx}P~<__Vqxs1tUK>4unDXUs!l33xgz3B!W5 TNx6PF`nYYj*xh|`@^bMH*rYQS diff --git a/qpdf/qtest/qpdf/direct-pages-json-objects.out b/qpdf/qtest/qpdf/direct-pages-json-objects.out index 91b69e8b..1e0fe469 100644 --- a/qpdf/qtest/qpdf/direct-pages-json-objects.out +++ b/qpdf/qtest/qpdf/direct-pages-json-objects.out @@ -65,8 +65,8 @@ ], "trailer": { "/ID": [ - "\u0013#¥fi|WzfsU…©6ŸÎ<", - "7,¿DöÛ‹«`Ù&<\u000f\u000bÒj" + 
"b:1323a5937c577a66735583a93698ce3c", + "b:372cbf44f6db88ab60d9263c0f0bd26a" ], "/Root": "1 0 R", "/Size": 7 diff --git a/qpdf/qtest/qpdf/direct-pages-json-pages.out b/qpdf/qtest/qpdf/direct-pages-json-pages.out index 57cc0cb7..d58aafb1 100644 --- a/qpdf/qtest/qpdf/direct-pages-json-pages.out +++ b/qpdf/qtest/qpdf/direct-pages-json-pages.out @@ -89,8 +89,8 @@ }, "trailer": { "/ID": [ - "\u0013#¥fi|WzfsU…©6ŸÎ<", - "7,¿DöÛ‹«`Ù&<\u000f\u000bÒj" + "b:1323a5937c577a66735583a93698ce3c", + "b:372cbf44f6db88ab60d9263c0f0bd26a" ], "/Root": "1 0 R", "/Size": 7 diff --git a/qpdf/qtest/qpdf/good14.out b/qpdf/qtest/qpdf/good14.out index 2ac91d53..5963b3a6 100644 --- a/qpdf/qtest/qpdf/good14.out +++ b/qpdf/qtest/qpdf/good14.out @@ -9,7 +9,7 @@ three lines (string with \nCRLF and\nCR and\nLF) and another indentation -(\001B%DEF)<01> +<014225444546><01> <8a8b> (ab) <8c>
) > diff --git a/qpdf/qtest/qpdf/merge-dict.out b/qpdf/qtest/qpdf/merge-dict.out index e0b6dc3e..0135f75d 100644 --- a/qpdf/qtest/qpdf/merge-dict.out +++ b/qpdf/qtest/qpdf/merge-dict.out @@ -1,9 +1,9 @@ { - "/k1": "scalar1", + "/k1": "u:scalar1", "/k2": 16059, "/k3": { - "/a": "a", - "/b": "conflict: seen", + "/a": "u:a", + "/b": "u:conflict: seen", "/c": [ 2, 3 @@ -12,7 +12,7 @@ "/y": 25, "/z": 26 }, - "/e": "e" + "/e": "u:e" }, "/k4": { "/A": 65, @@ -24,11 +24,11 @@ "/k5": [ "/one", 2, - "three", + "u:three", [ "/four" ], - "two" + "u:two" ] } /A diff --git a/qpdf/qtest/qpdf/page_api_2-json-objects.out b/qpdf/qtest/qpdf/page_api_2-json-objects.out index cc6d1630..995a00e4 100644 --- a/qpdf/qtest/qpdf/page_api_2-json-objects.out +++ b/qpdf/qtest/qpdf/page_api_2-json-objects.out @@ -9,8 +9,8 @@ "/Type": "/Catalog" }, "2 0 R": { - "/CreationDate": "D:20120621124041", - "/Producer": "Apex PDFWriter" + "/CreationDate": "u:D:20120621124041", + "/Producer": "u:Apex PDFWriter" }, "3 0 R": { "/Count": 3, @@ -77,8 +77,8 @@ "10 0 R": 47, "trailer": { "/ID": [ - "û˘·ƒÿ{5⁄\u0005Ú−S*º‘o", - "÷\u0017ž³QY¿ÔÀ\u000f\u0012−¼ý˜\u0002" + "b:fb18b786ff7b358705da8a532aba8f6f", + "b:f7179eb35159bfd4c00f128abcfd1f02" ], "/Info": "2 0 R", "/Root": "1 0 R", diff --git a/qpdf/qtest/qpdf/page_api_2-json-pages.out b/qpdf/qtest/qpdf/page_api_2-json-pages.out index bf6a2d25..caf27100 100644 --- a/qpdf/qtest/qpdf/page_api_2-json-pages.out +++ b/qpdf/qtest/qpdf/page_api_2-json-pages.out @@ -41,8 +41,8 @@ "/Type": "/Catalog" }, "2 0 R": { - "/CreationDate": "D:20120621124041", - "/Producer": "Apex PDFWriter" + "/CreationDate": "u:D:20120621124041", + "/Producer": "u:Apex PDFWriter" }, "3 0 R": { "/Count": 3, @@ -129,8 +129,8 @@ }, "trailer": { "/ID": [ - "û˘·ƒÿ{5⁄\u0005Ú−S*º‘o", - "÷\u0017ž³QY¿ÔÀ\u000f\u0012−¼ý˜\u0002" + "b:fb18b786ff7b358705da8a532aba8f6f", + "b:f7179eb35159bfd4c00f128abcfd1f02" ], "/Info": "2 0 R", "/Root": "1 0 R",