2
1
mirror of https://github.com/qpdf/qpdf.git synced 2024-12-31 14:01:59 +00:00

Make "objects" and "pages" consistent in JSON output

This commit is contained in:
Jay Berkenbilt 2022-05-04 07:32:30 -04:00
parent 6b576797cd
commit 8b25de24c9
10 changed files with 401 additions and 109 deletions

View File

@ -1,3 +1,14 @@
2022-05-04 Jay Berkenbilt <ejb@ql.org>
* json v1 output: make "pages" and "objects" consistent.
Previously, "objects" always reflected the objects exactly as they
appeared in the original file, while "pages" reflected objects
after repair of the pages tree. This could be misleading. Now, if
"pages" is specified, "objects" shows the effects of repairing the
page tree, and if not, it doesn't. This makes no difference for
correct PDF files that don't have problems in the pages tree. JSON
v2 will behave in a similar way.
2022-05-03 Jay Berkenbilt <ejb@ql.org>
* Add new Pipeline class Pl_String which appends to a std::string&

View File

@ -511,6 +511,7 @@
"unfilterable",
"unparse",
"unpickling",
"unrepaired",
"unretrieved",
"unversioned",
"upages",

View File

@ -1618,15 +1618,7 @@ QPDFJob::doJSON(QPDF& pdf)
bool all_keys = m->json_keys.empty();
// The list of selectable top-level keys is duplicated in the
// following places: job.yml, QPDFJob::json_schema, and
// QPDFJob::doJSON. We do objects and objectinfo first so they
// reflect the original file without any side effects caused by
// other operations, such as repairing the pages tree.
if (all_keys || m->json_keys.count("objects")) {
doJSONObjects(pdf, j);
}
if (all_keys || m->json_keys.count("objectinfo")) {
doJSONObjectinfo(pdf, j);
}
// QPDFJob::doJSON.
if (all_keys || m->json_keys.count("pages")) {
doJSONPages(pdf, j);
}
@ -1646,6 +1638,17 @@ QPDFJob::doJSON(QPDF& pdf)
doJSONAttachments(pdf, j);
}
// We do objects and objectinfo last so their information is
// consistent with repairing the page tree. To see the original
// file with any page tree problems and the page tree not
// flattened, select objects/objectinfo without other keys.
if (all_keys || m->json_keys.count("objects")) {
doJSONObjects(pdf, j);
}
if (all_keys || m->json_keys.count("objectinfo")) {
doJSONObjectinfo(pdf, j);
}
// Check against schema
JSON schema = json_schema(&m->json_keys);

View File

@ -147,6 +147,16 @@ For the most part, the built-in JSON help tells you everything you need
to know about the JSON format, but there are a few non-obvious things to
be aware of:
- If a PDF file has certain types of errors in its pages tree (such as
page objects that are direct or multiple pages sharing the same
object ID), qpdf will automatically repair the pages tree. If you
specify ``"objects"`` and/or ``"objectinfo"`` without any other
keys, you will see the original pages tree without any corrections.
If you specify any of the keys that require page tree traversal (for
example, ``"pages"``, ``"outlines"``, or ``"pagelabels"``), then
``"objects"`` and ``"objectinfo"`` will show the repaired page tree
so that object references will be consistent throughout the file.
- While qpdf guarantees that keys present in the help will be present
in the output, those fields may be null or empty if the information
is not known or absent in the file. Also, if you specify

View File

@ -125,6 +125,13 @@ For a detailed list of changes, please see the file
- Other changes
- In JSON v1 mode, the ``"objects"`` key now reflects the repaired
pages tree if ``"pages"`` (or any other key that has the side
effect of repairing the page tree) is specified. To see the
original objects with any unrepaired page tree errors, specify
``"objects"`` and/or ``"objectinfo"`` by themselves. This is
consistent with how JSON v2 behaves.
- A new chapter on contributing to qpdf has been added to the
documentation. See :ref:`contributing`.

View File

@ -2829,7 +2829,7 @@ $td->runtest("check output",
show_ntests();
# ----------
$td->notify("--- Page Tree Issues ---");
$n_tests += 9;
$n_tests += 11;
$td->runtest("linearize duplicated pages",
{$td->COMMAND =>
@ -2864,14 +2864,22 @@ $td->runtest("show direct pages",
$td->NORMALIZE_NEWLINES);
# Json mode for direct and duplicated pages illustrates that the
# "objects" section still shows the original objects before correction
# but the "pages" section shows the pages with their new object
# numbers.
# "objects" section shows the original objects before correction when
# "pages" is not output but after correction when it is.
foreach my $f (qw(page_api_2 direct-pages))
{
$td->runtest("json for $f",
{$td->COMMAND => "qpdf --json=latest $f.pdf"},
{$td->FILE => "$f-json.out", $td->EXIT_STATUS => 0},
$td->runtest("json for $f (objects only)",
{$td->COMMAND =>
"qpdf --json=latest $f.pdf" .
" --json-key=objects --json-key=objectinfo"},
{$td->FILE => "$f-json-objects.out", $td->EXIT_STATUS => 0},
$td->NORMALIZE_NEWLINES);
$td->runtest("json for $f (with pages)",
{$td->COMMAND =>
"qpdf --json=latest $f.pdf" .
" --json-key=objects --json-key=objectinfo" .
" --json-key=pages"},
{$td->FILE => "$f-json-pages.out", $td->EXIT_STATUS => 0},
$td->NORMALIZE_NEWLINES);
}

View File

@ -1,37 +1,4 @@
{
"acroform": {
"fields": [],
"hasacroform": false,
"needappearances": false
},
"attachments": {},
"encrypt": {
"capabilities": {
"accessibility": true,
"extract": true,
"moddifyannotations": true,
"modify": true,
"modifyassembly": true,
"modifyforms": true,
"modifyother": true,
"printhigh": true,
"printlow": true
},
"encrypted": false,
"ownerpasswordmatched": false,
"parameters": {
"P": 0,
"R": 0,
"V": 0,
"bits": 0,
"filemethod": "none",
"key": null,
"method": "none",
"streammethod": "none",
"stringmethod": "none"
},
"userpasswordmatched": false
},
"objectinfo": {
"1 0 R": {
"stream": {
@ -145,30 +112,6 @@
"/Size": 7
}
},
"outlines": [],
"pagelabels": [],
"pages": [
{
"contents": [
"3 0 R"
],
"images": [],
"label": null,
"object": "7 0 R",
"outlines": [],
"pageposfrom1": 1
},
{
"contents": [
"3 0 R"
],
"images": [],
"label": null,
"object": "8 0 R",
"outlines": [],
"pageposfrom1": 2
}
],
"parameters": {
"decodelevel": "generalized"
},

View File

@ -0,0 +1,157 @@
{
"objectinfo": {
"1 0 R": {
"stream": {
"filter": null,
"is": false,
"length": null
}
},
"2 0 R": {
"stream": {
"filter": null,
"is": false,
"length": null
}
},
"3 0 R": {
"stream": {
"filter": null,
"is": true,
"length": 44
}
},
"4 0 R": {
"stream": {
"filter": null,
"is": false,
"length": null
}
},
"5 0 R": {
"stream": {
"filter": null,
"is": false,
"length": null
}
},
"6 0 R": {
"stream": {
"filter": null,
"is": false,
"length": null
}
},
"7 0 R": {
"stream": {
"filter": null,
"is": false,
"length": null
}
},
"8 0 R": {
"stream": {
"filter": null,
"is": false,
"length": null
}
}
},
"objects": {
"1 0 R": {
"/Pages": "2 0 R",
"/Type": "/Catalog"
},
"2 0 R": {
"/Count": 2,
"/Kids": [
"7 0 R",
"8 0 R"
],
"/Type": "/Pages"
},
"3 0 R": {
"/Length": "4 0 R"
},
"4 0 R": 44,
"5 0 R": {
"/BaseFont": "/Helvetica",
"/Encoding": "/WinAnsiEncoding",
"/Name": "/F1",
"/Subtype": "/Type1",
"/Type": "/Font"
},
"6 0 R": [
"/PDF",
"/Text"
],
"7 0 R": {
"/Contents": "3 0 R",
"/MediaBox": [
0,
0,
612,
792
],
"/Parent": "2 0 R",
"/Resources": {
"/Font": {
"/F1": "5 0 R"
},
"/ProcSet": "6 0 R"
},
"/Type": "/Page"
},
"8 0 R": {
"/Contents": "3 0 R",
"/MediaBox": [
0,
0,
612,
792
],
"/Parent": "2 0 R",
"/Resources": {
"/Font": {
"/F1": "5 0 R"
},
"/ProcSet": "6 0 R"
},
"/Type": "/Page"
},
"trailer": {
"/ID": [
"\u0013#¥fi|WzfsU…©6ŸÎ<",
"7,¿DöÛ«`Ù&<\u000f\u000bÒj"
],
"/Root": "1 0 R",
"/Size": 7
}
},
"pages": [
{
"contents": [
"3 0 R"
],
"images": [],
"label": null,
"object": "7 0 R",
"outlines": [],
"pageposfrom1": 1
},
{
"contents": [
"3 0 R"
],
"images": [],
"label": null,
"object": "8 0 R",
"outlines": [],
"pageposfrom1": 2
}
],
"parameters": {
"decodelevel": "generalized"
},
"version": 1
}

View File

@ -0,0 +1,160 @@
{
"objectinfo": {
"1 0 R": {
"stream": {
"filter": null,
"is": false,
"length": null
}
},
"10 0 R": {
"stream": {
"filter": null,
"is": false,
"length": null
}
},
"2 0 R": {
"stream": {
"filter": null,
"is": false,
"length": null
}
},
"3 0 R": {
"stream": {
"filter": null,
"is": false,
"length": null
}
},
"4 0 R": {
"stream": {
"filter": null,
"is": false,
"length": null
}
},
"5 0 R": {
"stream": {
"filter": null,
"is": false,
"length": null
}
},
"6 0 R": {
"stream": {
"filter": null,
"is": true,
"length": 47
}
},
"7 0 R": {
"stream": {
"filter": null,
"is": false,
"length": null
}
},
"8 0 R": {
"stream": {
"filter": null,
"is": false,
"length": null
}
},
"9 0 R": {
"stream": {
"filter": null,
"is": true,
"length": 47
}
}
},
"objects": {
"1 0 R": {
"/Pages": "3 0 R",
"/Type": "/Catalog"
},
"10 0 R": 47,
"2 0 R": {
"/CreationDate": "D:20120621124041",
"/Producer": "Apex PDFWriter"
},
"3 0 R": {
"/Count": 3,
"/Kids": [
"4 0 R",
"4 0 R",
"5 0 R"
],
"/Type": "/Pages"
},
"4 0 R": {
"/Contents": "6 0 R",
"/MediaBox": [
0,
0,
612,
792
],
"/Parent": "3 0 R",
"/Resources": {
"/Font": {
"/F1": "8 0 R"
},
"/ProcSet": [
"/PDF",
"/Text"
]
},
"/Type": "/Page"
},
"5 0 R": {
"/Contents": "9 0 R",
"/MediaBox": [
0,
0,
612,
792
],
"/Parent": "3 0 R",
"/Resources": {
"/Font": {
"/F1": "8 0 R"
},
"/ProcSet": [
"/PDF",
"/Text"
]
},
"/Type": "/Page"
},
"6 0 R": {
"/Length": "7 0 R"
},
"7 0 R": 47,
"8 0 R": {
"/BaseFont": "/Times-Roman",
"/Encoding": "/WinAnsiEncoding",
"/Subtype": "/Type1",
"/Type": "/Font"
},
"9 0 R": {
"/Length": "10 0 R"
},
"trailer": {
"/ID": [
"û˘·ƒÿ{5\u0005ÚS*ºo",
"÷\u0017ž³QY¿ÔÀ\u000f\u0012¼ý˜\u0002"
],
"/Info": "2 0 R",
"/Root": "1 0 R",
"/Size": 11
}
},
"parameters": {
"decodelevel": "generalized"
},
"version": 1
}

View File

@ -1,37 +1,4 @@
{
"acroform": {
"fields": [],
"hasacroform": false,
"needappearances": false
},
"attachments": {},
"encrypt": {
"capabilities": {
"accessibility": true,
"extract": true,
"moddifyannotations": true,
"modify": true,
"modifyassembly": true,
"modifyforms": true,
"modifyother": true,
"printhigh": true,
"printlow": true
},
"encrypted": false,
"ownerpasswordmatched": false,
"parameters": {
"P": 0,
"R": 0,
"V": 0,
"bits": 0,
"filemethod": "none",
"key": null,
"method": "none",
"streammethod": "none",
"stringmethod": "none"
},
"userpasswordmatched": false
},
"objectinfo": {
"1 0 R": {
"stream": {
@ -47,6 +14,13 @@
"length": null
}
},
"11 0 R": {
"stream": {
"filter": null,
"is": false,
"length": null
}
},
"2 0 R": {
"stream": {
"filter": null,
@ -110,6 +84,26 @@
"/Type": "/Catalog"
},
"10 0 R": 47,
"11 0 R": {
"/Contents": "6 0 R",
"/MediaBox": [
0,
0,
612,
792
],
"/Parent": "3 0 R",
"/Resources": {
"/Font": {
"/F1": "8 0 R"
},
"/ProcSet": [
"/PDF",
"/Text"
]
},
"/Type": "/Page"
},
"2 0 R": {
"/CreationDate": "D:20120621124041",
"/Producer": "Apex PDFWriter"
@ -118,7 +112,7 @@
"/Count": 3,
"/Kids": [
"4 0 R",
"4 0 R",
"11 0 R",
"5 0 R"
],
"/Type": "/Pages"
@ -186,8 +180,6 @@
"/Size": 11
}
},
"outlines": [],
"pagelabels": [],
"pages": [
{
"contents": [