From 4400ce84eeb204cdcb35950dd8fde094fc249051 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Thu, 21 Dec 2023 17:14:28 -0500 Subject: [PATCH] Add "n:/pdf-name" to qpdf JSON for binary names (fixes #1072) --- ChangeLog | 7 +++ libqpdf/QPDF_Name.cc | 10 +++- libqpdf/QPDF_json.cc | 8 +++ manual/json.rst | 9 ++- manual/release-notes.rst | 7 +++ qpdf/qtest/qpdf-json.test | 17 ++++++ qpdf/qtest/qpdf/weird-tokens.json | 83 +++++++++++++++++++++++++++ qpdf/qtest/qpdf/weird-tokens.pdf | 95 +++++++++++++++++++++++++++++++ 8 files changed, 234 insertions(+), 2 deletions(-) create mode 100644 qpdf/qtest/qpdf/weird-tokens.json create mode 100644 qpdf/qtest/qpdf/weird-tokens.pdf diff --git a/ChangeLog b/ChangeLog index 414244be..7450ddbc 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,12 @@ 2023-12-21 Jay Berkenbilt + * Fix to QPDF JSON: the syntax "n:/pdf-syntax" is now accepted as + an alternative way to represent names. This can be used for any + name (e.g. "n:/text#2fplain"), but it is necessary when the name + contains binary characters. For example, /one#a0two must be + represented as "n:/one#a0two" since the single byte a0 is not + valid in JSON. Fixes #1072. + * From M. Holger: Refactor QPDFParser for performance. See #1059 for a discussion. 
diff --git a/libqpdf/QPDF_Name.cc b/libqpdf/QPDF_Name.cc index 4597372e..5fde9c65 100644 --- a/libqpdf/QPDF_Name.cc +++ b/libqpdf/QPDF_Name.cc @@ -57,6 +57,14 @@ QPDF_Name::getJSON(int json_version) if (json_version == 1) { return JSON::makeString(normalizeName(this->name)); } else { - return JSON::makeString(this->name); + bool has_8bit_chars; + bool is_valid_utf8; + bool is_utf16; + QUtil::analyze_encoding(this->name, has_8bit_chars, is_valid_utf8, is_utf16); + if (!has_8bit_chars || is_valid_utf8) { + return JSON::makeString(this->name); + } else { + return JSON::makeString("n:" + normalizeName(this->name)); + } } } diff --git a/libqpdf/QPDF_json.cc b/libqpdf/QPDF_json.cc index f8fd689a..864e1a56 100644 --- a/libqpdf/QPDF_json.cc +++ b/libqpdf/QPDF_json.cc @@ -144,6 +144,12 @@ is_name(std::string const& v) return ((v.length() > 1) && (v.at(0) == '/')); } +static bool +is_pdf_name(std::string const& v) +{ + return ((v.length() > 3) && (v.substr(0, 3) == "n:/")); +} + bool QPDF::test_json_validators() { @@ -740,6 +746,8 @@ QPDF::JSONReactor::makeObject(JSON const& value) result = QPDFObjectHandle::newString(QUtil::hex_decode(str)); } else if (is_name(str_v)) { result = QPDFObjectHandle::newName(str_v); + } else if (is_pdf_name(str_v)) { + result = QPDFObjectHandle::parse(str_v.substr(2)); } else { QTC::TC("qpdf", "QPDF_json unrecognized string value"); error(value.getStart(), "unrecognized string value"); diff --git a/manual/json.rst b/manual/json.rst index e848cc65..e07dde3b 100644 --- a/manual/json.rst +++ b/manual/json.rst @@ -258,6 +258,12 @@ Object Values syntax resolved. For example, the name whose canonical form (per the PDF specification) is ``text/plain`` would be represented in JSON as ``"/text/plain"`` and in PDF as ``"/text#2fplain"``. + Starting with qpdf 11.7.0, the syntax ``"n:/pdf-syntax"`` is + accepted as an alternative. This can be used for any name (e.g. + ``"n:/text#2fplain"``), but it is necessary when the name contains + binary characters. 
For example, ``/one#a0two`` must be represented + as ``"n:/one#a0two"`` since the single byte ``a0`` is not valid in + JSON. - Indirect object references are represented as JSON strings that look like a PDF indirect object reference and have the form @@ -824,7 +830,8 @@ version 2. - Names are shown in qpdf's canonical form rather than in PDF syntax. (Example: the PDF-syntax name ``/text#2fplain`` appeared as ``"/text#2fplain"`` in v1 but appears as ``"/text/plain"`` in - v2. + v2.) In qpdf 11.7.0, a fix was made to accept ``"n:/pdf-syntax"`` + for names containing binary characters. - The top-level representation of an object in ``"objects"`` is a dictionary containing either a ``"value"`` key or a ``"stream"`` diff --git a/manual/release-notes.rst b/manual/release-notes.rst index 5e41fef7..f720f99e 100644 --- a/manual/release-notes.rst +++ b/manual/release-notes.rst @@ -45,6 +45,13 @@ Planned changes for future 12.x (subject to change): reference streams, linearization hint streams, and object streams. This has been fixed. + - Fix to QPDF JSON: the syntax ``"n:/pdf-syntax"`` is now accepted + as an alternative way to represent names. This can be used for + any name (e.g. ``"n:/text#2fplain"``), but it is necessary when + the name contains binary characters. For example, ``/one#a0two`` + must be represented as ``"n:/one#a0two"`` since the single byte + ``a0`` is not valid in JSON. 
+ - Build Enhancements: - The qpdf test suite now passes when qpdf is linked with an diff --git a/qpdf/qtest/qpdf-json.test b/qpdf/qtest/qpdf-json.test index 961b507a..9691d995 100644 --- a/qpdf/qtest/qpdf-json.test +++ b/qpdf/qtest/qpdf-json.test @@ -61,6 +61,7 @@ my @goodfiles = ( 'form-fields-and-annotations.pdf', 'need-appearances.pdf', 'fxo-blue.pdf', + 'weird-tokens.pdf', ); $n_tests += 6 * scalar(@goodfiles); @@ -341,5 +342,21 @@ $td->runtest("check C API write to JSON stream", {$td->FILE => "auto-4"}, {$td->FILE => "qpdf-ctest-47-4"}); +# Bugs #1072 and #1079 illustrate cases that qpdf-json got wrong. In +# #1072, it was noticed that name tokens containing binary characters +# (using #xx) would generate invalid JSON, even though qpdf's own JSON +# parser would accept it. Also, the JSON spec allows real numbers in +# scientific notation, but the PDF spec does not. +$n_tests += 2; +$td->runtest("handle binary names", + {$td->COMMAND => + "qpdf --json-output weird-tokens.pdf a.json"}, + {$td->STRING => "", $td->EXIT_STATUS => 0}); +# Round-trip is tested above. 
+$td->runtest("check json", + {$td->FILE => "a.json"}, + {$td->FILE => "weird-tokens.json"}, + $td->NORMALIZE_NEWLINES); + cleanup(); $td->report($n_tests); diff --git a/qpdf/qtest/qpdf/weird-tokens.json b/qpdf/qtest/qpdf/weird-tokens.json new file mode 100644 index 00000000..66f0ff06 --- /dev/null +++ b/qpdf/qtest/qpdf/weird-tokens.json @@ -0,0 +1,83 @@ +{ + "qpdf": [ + { + "jsonversion": 2, + "pdfversion": "2.0", + "pushedinheritedpageresources": false, + "calledgetallpages": false, + "maxobjectid": 6 + }, + { + "obj:1 0 R": { + "value": { + "/Extra": [ + "u:Names with binary data", + "n:/ABCDEF+#ba#da#cc#e5", + "/ABCEDEF+π", + "n:/one+#a0two", + "/text/plain", + "u:Very small/large reals", + 0.00001, + 1000000000000 + ], + "/Pages": "2 0 R", + "/Type": "/Catalog" + } + }, + "obj:2 0 R": { + "value": { + "/Count": 1, + "/Kids": [ + "3 0 R" + ], + "/Type": "/Pages" + } + }, + "obj:3 0 R": { + "value": { + "/Contents": "4 0 R", + "/MediaBox": [ + 0, + 0, + 612, + 792 + ], + "/Parent": "2 0 R", + "/Resources": { + "/Font": { + "/F1": "6 0 R" + } + }, + "/Type": "/Page" + } + }, + "obj:4 0 R": { + "stream": { + "data": "QlQKICAvRjEgMjQgVGYKICA3MiA3MjAgVGQKICAoUG90YXRvKSBUagpFVAo=", + "dict": {} + } + }, + "obj:5 0 R": { + "value": 44 + }, + "obj:6 0 R": { + "value": { + "/BaseFont": "/Helvetica", + "/Encoding": "/WinAnsiEncoding", + "/Subtype": "/Type1", + "/Type": "/Font" + } + }, + "trailer": { + "value": { + "/ID": [ + "b:42841c13bbf709d79a200fa1691836f8", + "b:728c020f464c3cf7e02c12605fa7d88b" + ], + "/Root": "1 0 R", + "/Size": 7 + } + } + } + ] +} diff --git a/qpdf/qtest/qpdf/weird-tokens.pdf b/qpdf/qtest/qpdf/weird-tokens.pdf new file mode 100644 index 00000000..7c645df3 --- /dev/null +++ b/qpdf/qtest/qpdf/weird-tokens.pdf @@ -0,0 +1,95 @@ +%PDF-2.0 +%¿÷¢þ +%QDF-1.0 + +1 0 obj +<< + /Extra [ + (Names with binary data) + /ABCDEF+#ba#da#cc#e5 + /ABCEDEF+#cf#80 + /one+#a0two + /text#2fplain + (Very small/large reals) + 0.00001 + 1000000000000 + ] + /Pages 2 0 R 
+ /Type /Catalog +>> +endobj + +2 0 obj +<< + /Count 1 + /Kids [ + 3 0 R + ] + /Type /Pages +>> +endobj + +%% Page 1 +3 0 obj +<< + /Contents 4 0 R + /MediaBox [ + 0 + 0 + 612 + 792 + ] + /Parent 2 0 R + /Resources << + /Font << + /F1 6 0 R + >> + >> + /Type /Page +>> +endobj + +%% Contents for page 1 +4 0 obj +<< + /Length 5 0 R +>> +stream +BT + /F1 24 Tf + 72 720 Td + (Potato) Tj +ET +endstream +endobj + +5 0 obj +44 +endobj + +6 0 obj +<< + /BaseFont /Helvetica + /Encoding /WinAnsiEncoding + /Subtype /Type1 + /Type /Font +>> +endobj + +xref +0 7 +0000000000 65535 f +0000000025 00000 n +0000000261 00000 n +0000000343 00000 n +0000000539 00000 n +0000000638 00000 n +0000000657 00000 n +trailer << + /Root 1 0 R + /Size 7 + /ID [<42841c13bbf709d79a200fa1691836f8><728c020f464c3cf7e02c12605fa7d88b>] +>> +startxref +763 +%%EOF