From 8b25de24c9b1e6acba042ea9ecdee783839e20a6 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Wed, 4 May 2022 07:32:30 -0400 Subject: [PATCH] Make "objects" and "pages" consistent in JSON output --- ChangeLog | 11 ++ cSpell.json | 1 + libqpdf/QPDFJob.cc | 21 ++- manual/json.rst | 10 ++ manual/release-notes.rst | 7 + qpdf/qtest/qpdf.test | 22 ++- ...json.out => direct-pages-json-objects.out} | 57 ------- qpdf/qtest/qpdf/direct-pages-json-pages.out | 157 +++++++++++++++++ qpdf/qtest/qpdf/page_api_2-json-objects.out | 160 ++++++++++++++++++ ...i_2-json.out => page_api_2-json-pages.out} | 64 +++---- 10 files changed, 401 insertions(+), 109 deletions(-) rename qpdf/qtest/qpdf/{direct-pages-json.out => direct-pages-json-objects.out} (65%) create mode 100644 qpdf/qtest/qpdf/direct-pages-json-pages.out create mode 100644 qpdf/qtest/qpdf/page_api_2-json-objects.out rename qpdf/qtest/qpdf/{page_api_2-json.out => page_api_2-json-pages.out} (81%) diff --git a/ChangeLog b/ChangeLog index dfcadf49..be196fff 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,14 @@ +2022-05-04 Jay Berkenbilt + + * json v1 output: make "pages" and "objects" consistent. + Previously, "objects" always reflected the objects exactly as they + appeared in the original file, while "pages" reflected objects + after repair of the pages tree. This could be misleading. Now, if + "pages" is specified, "objects" shows the effects of repairing the + page tree, and if not, it doesn't. This makes no difference for + correct PDF files that don't have problems in the pages tree. JSON + v2 will behave in a similar way. 
+ 2022-05-03 Jay Berkenbilt * Add new Pipeline class Pl_String which appends to a std::string& diff --git a/cSpell.json b/cSpell.json index f757e511..2a5a4db4 100644 --- a/cSpell.json +++ b/cSpell.json @@ -511,6 +511,7 @@ "unfilterable", "unparse", "unpickling", + "unrepaired", "unretrieved", "unversioned", "upages", diff --git a/libqpdf/QPDFJob.cc b/libqpdf/QPDFJob.cc index 0c9b1583..ca56b8d5 100644 --- a/libqpdf/QPDFJob.cc +++ b/libqpdf/QPDFJob.cc @@ -1618,15 +1618,7 @@ QPDFJob::doJSON(QPDF& pdf) bool all_keys = m->json_keys.empty(); // The list of selectable top-level keys id duplicated in the // following places: job.yml, QPDFJob::json_schema, and - // QPDFJob::doJSON. We do objects and objectinfo first so they - // reflect the original file without any side effects caused by - // other operations, such as repairing the pages tree. - if (all_keys || m->json_keys.count("objects")) { - doJSONObjects(pdf, j); - } - if (all_keys || m->json_keys.count("objectinfo")) { - doJSONObjectinfo(pdf, j); - } + // QPDFJob::doJSON. if (all_keys || m->json_keys.count("pages")) { doJSONPages(pdf, j); } @@ -1646,6 +1638,17 @@ QPDFJob::doJSON(QPDF& pdf) doJSONAttachments(pdf, j); } + // We do objects and objectinfo last so their information is + // consistent with repairing the page tree. To see the original + // file with any page tree problems and the page tree not + // flattened, select objects/objectinfo without other keys. 
+ if (all_keys || m->json_keys.count("objects")) { + doJSONObjects(pdf, j); + } + if (all_keys || m->json_keys.count("objectinfo")) { + doJSONObjectinfo(pdf, j); + } + // Check against schema JSON schema = json_schema(&m->json_keys); diff --git a/manual/json.rst b/manual/json.rst index 358cac90..ef6bed96 100644 --- a/manual/json.rst +++ b/manual/json.rst @@ -147,6 +147,16 @@ For the most part, the built-in JSON help tells you everything you need to know about the JSON format, but there are a few non-obvious things to be aware of: +- If a PDF file has certain types of errors in its pages tree (such as + page objects that are direct or multiple pages sharing the same + object ID), qpdf will automatically repair the pages tree. If you + specify ``"objects"`` and/or ``"objectinfo"`` without any other + keys, you will see the original pages tree without any corrections. + If you specify any of the keys that require page tree traversal (for + example, ``"pages"``, ``"outlines"``, or ``"pagelabels"``), then + ``"objects"`` and ``"objectinfo"`` will show the repaired page tree + so that object references will be consistent throughout the file. + - While qpdf guarantees that keys present in the help will be present in the output, those fields may be null or empty if the information is not known or absent in the file. Also, if you specify diff --git a/manual/release-notes.rst b/manual/release-notes.rst index 08e2fd52..f313cd82 100644 --- a/manual/release-notes.rst +++ b/manual/release-notes.rst @@ -125,6 +125,13 @@ For a detailed list of changes, please see the file - Other changes + - In JSON v1 mode, the ``"objects"`` key now reflects the repaired + pages tree if ``"pages"`` (or any other key that has the side + effect of repairing the page tree) is specified. To see the + original objects with any unrepaired page tree errors, specify + ``"objects"`` and/or ``"objectinfo"`` by themselves. This is + consistent with how JSON v2 behaves. 
+ - A new chapter on contributing to qpdf has been added to the documentation. See :ref:`contributing`. diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test index d8359f75..3b26c9c8 100644 --- a/qpdf/qtest/qpdf.test +++ b/qpdf/qtest/qpdf.test @@ -2829,7 +2829,7 @@ $td->runtest("check output", show_ntests(); # ---------- $td->notify("--- Page Tree Issues ---"); -$n_tests += 9; +$n_tests += 11; $td->runtest("linearize duplicated pages", {$td->COMMAND => @@ -2864,14 +2864,22 @@ $td->runtest("show direct pages", $td->NORMALIZE_NEWLINES); # Json mode for direct and duplicated pages illustrates that the -# "objects" section still shows the original objects before correction -# but the "pages" section shows the pages with their new object -# numbers. +# "objects" section shows the original objects before correction when +# "pages" is not output but after correction when it is. foreach my $f (qw(page_api_2 direct-pages)) { - $td->runtest("json for $f", - {$td->COMMAND => "qpdf --json=latest $f.pdf"}, - {$td->FILE => "$f-json.out", $td->EXIT_STATUS => 0}, + $td->runtest("json for $f (objects only)", + {$td->COMMAND => + "qpdf --json=latest $f.pdf" . + " --json-key=objects --json-key=objectinfo"}, + {$td->FILE => "$f-json-objects.out", $td->EXIT_STATUS => 0}, + $td->NORMALIZE_NEWLINES); + $td->runtest("json for $f (with pages)", + {$td->COMMAND => + "qpdf --json=latest $f.pdf" . + " --json-key=objects --json-key=objectinfo" . 
+ " --json-key=pages"}, + {$td->FILE => "$f-json-pages.out", $td->EXIT_STATUS => 0}, $td->NORMALIZE_NEWLINES); } diff --git a/qpdf/qtest/qpdf/direct-pages-json.out b/qpdf/qtest/qpdf/direct-pages-json-objects.out similarity index 65% rename from qpdf/qtest/qpdf/direct-pages-json.out rename to qpdf/qtest/qpdf/direct-pages-json-objects.out index 52e5e2dd..1c1c50a3 100644 --- a/qpdf/qtest/qpdf/direct-pages-json.out +++ b/qpdf/qtest/qpdf/direct-pages-json-objects.out @@ -1,37 +1,4 @@ { - "acroform": { - "fields": [], - "hasacroform": false, - "needappearances": false - }, - "attachments": {}, - "encrypt": { - "capabilities": { - "accessibility": true, - "extract": true, - "moddifyannotations": true, - "modify": true, - "modifyassembly": true, - "modifyforms": true, - "modifyother": true, - "printhigh": true, - "printlow": true - }, - "encrypted": false, - "ownerpasswordmatched": false, - "parameters": { - "P": 0, - "R": 0, - "V": 0, - "bits": 0, - "filemethod": "none", - "key": null, - "method": "none", - "streammethod": "none", - "stringmethod": "none" - }, - "userpasswordmatched": false - }, "objectinfo": { "1 0 R": { "stream": { @@ -145,30 +112,6 @@ "/Size": 7 } }, - "outlines": [], - "pagelabels": [], - "pages": [ - { - "contents": [ - "3 0 R" - ], - "images": [], - "label": null, - "object": "7 0 R", - "outlines": [], - "pageposfrom1": 1 - }, - { - "contents": [ - "3 0 R" - ], - "images": [], - "label": null, - "object": "8 0 R", - "outlines": [], - "pageposfrom1": 2 - } - ], "parameters": { "decodelevel": "generalized" }, diff --git a/qpdf/qtest/qpdf/direct-pages-json-pages.out b/qpdf/qtest/qpdf/direct-pages-json-pages.out new file mode 100644 index 00000000..ee2c03d4 --- /dev/null +++ b/qpdf/qtest/qpdf/direct-pages-json-pages.out @@ -0,0 +1,157 @@ +{ + "objectinfo": { + "1 0 R": { + "stream": { + "filter": null, + "is": false, + "length": null + } + }, + "2 0 R": { + "stream": { + "filter": null, + "is": false, + "length": null + } + }, + "3 0 R": { + "stream": { 
+ "filter": null, + "is": true, + "length": 44 + } + }, + "4 0 R": { + "stream": { + "filter": null, + "is": false, + "length": null + } + }, + "5 0 R": { + "stream": { + "filter": null, + "is": false, + "length": null + } + }, + "6 0 R": { + "stream": { + "filter": null, + "is": false, + "length": null + } + }, + "7 0 R": { + "stream": { + "filter": null, + "is": false, + "length": null + } + }, + "8 0 R": { + "stream": { + "filter": null, + "is": false, + "length": null + } + } + }, + "objects": { + "1 0 R": { + "/Pages": "2 0 R", + "/Type": "/Catalog" + }, + "2 0 R": { + "/Count": 2, + "/Kids": [ + "7 0 R", + "8 0 R" + ], + "/Type": "/Pages" + }, + "3 0 R": { + "/Length": "4 0 R" + }, + "4 0 R": 44, + "5 0 R": { + "/BaseFont": "/Helvetica", + "/Encoding": "/WinAnsiEncoding", + "/Name": "/F1", + "/Subtype": "/Type1", + "/Type": "/Font" + }, + "6 0 R": [ + "/PDF", + "/Text" + ], + "7 0 R": { + "/Contents": "3 0 R", + "/MediaBox": [ + 0, + 0, + 612, + 792 + ], + "/Parent": "2 0 R", + "/Resources": { + "/Font": { + "/F1": "5 0 R" + }, + "/ProcSet": "6 0 R" + }, + "/Type": "/Page" + }, + "8 0 R": { + "/Contents": "3 0 R", + "/MediaBox": [ + 0, + 0, + 612, + 792 + ], + "/Parent": "2 0 R", + "/Resources": { + "/Font": { + "/F1": "5 0 R" + }, + "/ProcSet": "6 0 R" + }, + "/Type": "/Page" + }, + "trailer": { + "/ID": [ + "\u0013#¥fi|WzfsU…©6ŸÎ<", + "7,¿DöÛ‹«`Ù&<\u000f\u000bÒj" + ], + "/Root": "1 0 R", + "/Size": 7 + } + }, + "pages": [ + { + "contents": [ + "3 0 R" + ], + "images": [], + "label": null, + "object": "7 0 R", + "outlines": [], + "pageposfrom1": 1 + }, + { + "contents": [ + "3 0 R" + ], + "images": [], + "label": null, + "object": "8 0 R", + "outlines": [], + "pageposfrom1": 2 + } + ], + "parameters": { + "decodelevel": "generalized" + }, + "version": 1 +} diff --git a/qpdf/qtest/qpdf/page_api_2-json-objects.out b/qpdf/qtest/qpdf/page_api_2-json-objects.out new file mode 100644 index 00000000..76feb0d6 --- /dev/null +++ 
b/qpdf/qtest/qpdf/page_api_2-json-objects.out @@ -0,0 +1,160 @@ +{ + "objectinfo": { + "1 0 R": { + "stream": { + "filter": null, + "is": false, + "length": null + } + }, + "10 0 R": { + "stream": { + "filter": null, + "is": false, + "length": null + } + }, + "2 0 R": { + "stream": { + "filter": null, + "is": false, + "length": null + } + }, + "3 0 R": { + "stream": { + "filter": null, + "is": false, + "length": null + } + }, + "4 0 R": { + "stream": { + "filter": null, + "is": false, + "length": null + } + }, + "5 0 R": { + "stream": { + "filter": null, + "is": false, + "length": null + } + }, + "6 0 R": { + "stream": { + "filter": null, + "is": true, + "length": 47 + } + }, + "7 0 R": { + "stream": { + "filter": null, + "is": false, + "length": null + } + }, + "8 0 R": { + "stream": { + "filter": null, + "is": false, + "length": null + } + }, + "9 0 R": { + "stream": { + "filter": null, + "is": true, + "length": 47 + } + } + }, + "objects": { + "1 0 R": { + "/Pages": "3 0 R", + "/Type": "/Catalog" + }, + "10 0 R": 47, + "2 0 R": { + "/CreationDate": "D:20120621124041", + "/Producer": "Apex PDFWriter" + }, + "3 0 R": { + "/Count": 3, + "/Kids": [ + "4 0 R", + "4 0 R", + "5 0 R" + ], + "/Type": "/Pages" + }, + "4 0 R": { + "/Contents": "6 0 R", + "/MediaBox": [ + 0, + 0, + 612, + 792 + ], + "/Parent": "3 0 R", + "/Resources": { + "/Font": { + "/F1": "8 0 R" + }, + "/ProcSet": [ + "/PDF", + "/Text" + ] + }, + "/Type": "/Page" + }, + "5 0 R": { + "/Contents": "9 0 R", + "/MediaBox": [ + 0, + 0, + 612, + 792 + ], + "/Parent": "3 0 R", + "/Resources": { + "/Font": { + "/F1": "8 0 R" + }, + "/ProcSet": [ + "/PDF", + "/Text" + ] + }, + "/Type": "/Page" + }, + "6 0 R": { + "/Length": "7 0 R" + }, + "7 0 R": 47, + "8 0 R": { + "/BaseFont": "/Times-Roman", + "/Encoding": "/WinAnsiEncoding", + "/Subtype": "/Type1", + "/Type": "/Font" + }, + "9 0 R": { + "/Length": "10 0 R" + }, + "trailer": { + "/ID": [ + "û˘·ƒÿ{5⁄\u0005Ú−S*º‘o", + "÷\u0017ž³QY¿ÔÀ\u000f\u0012−¼ý˜\u0002" + ], 
+ "/Info": "2 0 R", + "/Root": "1 0 R", + "/Size": 11 + } + }, + "parameters": { + "decodelevel": "generalized" + }, + "version": 1 +} diff --git a/qpdf/qtest/qpdf/page_api_2-json.out b/qpdf/qtest/qpdf/page_api_2-json-pages.out similarity index 81% rename from qpdf/qtest/qpdf/page_api_2-json.out rename to qpdf/qtest/qpdf/page_api_2-json-pages.out index bef00d02..d08e18d6 100644 --- a/qpdf/qtest/qpdf/page_api_2-json.out +++ b/qpdf/qtest/qpdf/page_api_2-json-pages.out @@ -1,37 +1,4 @@ { - "acroform": { - "fields": [], - "hasacroform": false, - "needappearances": false - }, - "attachments": {}, - "encrypt": { - "capabilities": { - "accessibility": true, - "extract": true, - "moddifyannotations": true, - "modify": true, - "modifyassembly": true, - "modifyforms": true, - "modifyother": true, - "printhigh": true, - "printlow": true - }, - "encrypted": false, - "ownerpasswordmatched": false, - "parameters": { - "P": 0, - "R": 0, - "V": 0, - "bits": 0, - "filemethod": "none", - "key": null, - "method": "none", - "streammethod": "none", - "stringmethod": "none" - }, - "userpasswordmatched": false - }, "objectinfo": { "1 0 R": { "stream": { @@ -47,6 +14,13 @@ "length": null } }, + "11 0 R": { + "stream": { + "filter": null, + "is": false, + "length": null + } + }, "2 0 R": { "stream": { "filter": null, @@ -110,6 +84,26 @@ "/Type": "/Catalog" }, "10 0 R": 47, + "11 0 R": { + "/Contents": "6 0 R", + "/MediaBox": [ + 0, + 0, + 612, + 792 + ], + "/Parent": "3 0 R", + "/Resources": { + "/Font": { + "/F1": "8 0 R" + }, + "/ProcSet": [ + "/PDF", + "/Text" + ] + }, + "/Type": "/Page" + }, "2 0 R": { "/CreationDate": "D:20120621124041", "/Producer": "Apex PDFWriter" @@ -118,7 +112,7 @@ "/Count": 3, "/Kids": [ "4 0 R", - "4 0 R", + "11 0 R", "5 0 R" ], "/Type": "/Pages" @@ -186,8 +180,6 @@ "/Size": 11 } }, - "outlines": [], - "pagelabels": [], "pages": [ { "contents": [