2
1
mirror of https://github.com/qpdf/qpdf.git synced 2024-05-30 00:40:52 +00:00

Simplify --json-output

Now --json-output just changes defaults. Allow output file with --json.
This commit is contained in:
Jay Berkenbilt 2022-07-31 10:34:05 -04:00
parent 80acfc3826
commit 5f4224f31a
10 changed files with 261 additions and 297 deletions

43
TODO
View File

@ -69,46 +69,11 @@ Soon: Break ground on "Document-level work"
JSON v2 fixes JSON v2 fixes
============= =============
* Unify code between QPDFJob::doJSONObjects and QPDF::writeJSON. Make * Rethink QPDF::writeJSON. Maybe provide a simpler overload?
sure that the "qpdf" key is always present when json-output is
specified.
* Change the name of the "qpdf-v2" key to "qpdf". Use that in place of * When reading back in, we'll have to call
"objects" and change its content to a two-element array whose first pushInheritedAttributesToPage or getAllPages based on the values
element is metadata required (or useful) for parsing and whose of the metadata.
second element contains the actual data. Use of an array is the only
way to ensure that the metadata is guaranteed to be parsed before we
start parsing the objects. Example:
{
"qpdf": [
{
"jsonversion": 2,
"pdfversion": "1.3",
"pushedinheritedpageresources": false,
"calledgetallpages": false,
"maxobjectid": 10
},
{
... objects ...
}
]
}
This implies a few things:
* Still need to test pushedinheritedpageresources and
calledgetallpages and check/use their values when reading
* Fix --json-help
* When reading back in, we'll have to call
pushInheritedAttributesToPage or getAllPages based on the values
of the metadata.
* Test --json with --json-stream-data and --json-output with
--json-stream-data=none. Recheck writeJSON's handling of the
pipeline argument.
* Support json v2 in the C API. At a minimum, write_json, * Support json v2 in the C API. At a minimum, write_json,
create_from_json, and update_from_json need to be there and should create_from_json, and update_from_json need to be there and should

View File

@ -133,7 +133,7 @@ class QPDF
QPDF_DLL QPDF_DLL
void updateFromJSON(std::shared_ptr<InputSource>); void updateFromJSON(std::shared_ptr<InputSource>);
// Write qpdf json format to the pipeline "p". The only supported // Write qpdf JSON format to the pipeline "p". The only supported
// version is 2. // version is 2.
// //
// If the value of "complete" is true, a complete JSON object // If the value of "complete" is true, a complete JSON object

View File

@ -554,7 +554,7 @@ class QPDFJob
void setEncryptionOptions(QPDF&, QPDFWriter&); void setEncryptionOptions(QPDF&, QPDFWriter&);
void maybeFixWritePassword(int R, std::string& password); void maybeFixWritePassword(int R, std::string& password);
void writeOutfile(QPDF& pdf); void writeOutfile(QPDF& pdf);
void writeJSON(Pipeline* p, QPDF& pdf, bool complete, bool& first_key); void writeJSON(QPDF& pdf);
// JSON // JSON
void doJSON(QPDF& pdf, Pipeline*); void doJSON(QPDF& pdf, Pipeline*);

View File

@ -8,10 +8,10 @@ include/qpdf/auto_job_c_pages.hh b3cc0f21029f6d89efa043dcdbfa183cb59325b6506001c
include/qpdf/auto_job_c_uo.hh ae21b69a1efa9333050f4833d465f6daff87e5b38e5106e49bbef5d4132e4ed1 include/qpdf/auto_job_c_uo.hh ae21b69a1efa9333050f4833d465f6daff87e5b38e5106e49bbef5d4132e4ed1
job.yml f9564f18b08a45d17328af43652645771d3498471820c858b8c9013a193e1412 job.yml f9564f18b08a45d17328af43652645771d3498471820c858b8c9013a193e1412
libqpdf/qpdf/auto_job_decl.hh 7844eba58edffb9494b19e8eca6fd59a24d6e152ca606c3b07da569f753df2da libqpdf/qpdf/auto_job_decl.hh 7844eba58edffb9494b19e8eca6fd59a24d6e152ca606c3b07da569f753df2da
libqpdf/qpdf/auto_job_help.hh db2e4350c700e064b204e3e20d4fee4eddfe312b28092afcf608b4b6863d30e5 libqpdf/qpdf/auto_job_help.hh 700d7600b34588169c80f3e325e39e592e2f5c1af1cdac16614150ff38424b40
libqpdf/qpdf/auto_job_init.hh fd1635a5ad6ba16b7ae008467145560a59a5ecfd10d29c5ef7cd0d8347747cd2 libqpdf/qpdf/auto_job_init.hh fd1635a5ad6ba16b7ae008467145560a59a5ecfd10d29c5ef7cd0d8347747cd2
libqpdf/qpdf/auto_job_json_decl.hh 06caa46eaf71db8a50c046f91866baa8087745a9474319fb7c86d92634cc8297 libqpdf/qpdf/auto_job_json_decl.hh 06caa46eaf71db8a50c046f91866baa8087745a9474319fb7c86d92634cc8297
libqpdf/qpdf/auto_job_json_init.hh 59545578a2e47c660ff98516ed53f06638be75eb4658e2a09d32cc08e0cb7268 libqpdf/qpdf/auto_job_json_init.hh 59545578a2e47c660ff98516ed53f06638be75eb4658e2a09d32cc08e0cb7268
libqpdf/qpdf/auto_job_schema.hh 9d543cd4a43eafffc2c4b8a6fee29e399c271c52cb6f7d417ae5497b3c1127dc libqpdf/qpdf/auto_job_schema.hh 5352ef1be1ad7cc6f4f36dab88f2937d278e6bd3a0e2d46259794dc226c8ba6b
manual/_ext/qpdf.py 6add6321666031d55ed4aedf7c00e5662bba856dfcd66ccb526563bffefbb580 manual/_ext/qpdf.py 6add6321666031d55ed4aedf7c00e5662bba856dfcd66ccb526563bffefbb580
manual/cli.rst 8e1f443c6fa000e023e516c318df4d04d58233d4d8648907c4a71f0ea5722bca manual/cli.rst bbce4cfb662a96c8df0c8563f8065844b77aca7b4ec6385955546b9a455d9953

View File

@ -680,8 +680,15 @@ QPDFJob::checkConfiguration()
" an output file is specified"); " an output file is specified");
} else if (m->split_pages) { } else if (m->split_pages) {
usage("--split-pages may not be used with --replace-input"); usage("--split-pages may not be used with --replace-input");
} else if (m->json_version) {
usage("--json may not be used with --replace-input");
} }
} }
if (m->json_version && (m->outfilename == nullptr)) {
// The output file is optional with --json for backward
// compatibility and defaults to standard output.
m->outfilename = QUtil::make_shared_cstr("-");
}
if (m->infilename == nullptr) { if (m->infilename == nullptr) {
usage("an input file name is required"); usage("an input file name is required");
} else if ( } else if (
@ -1116,25 +1123,47 @@ QPDFJob::doJSONObject(
void void
QPDFJob::doJSONObjects(Pipeline* p, bool& first, QPDF& pdf) QPDFJob::doJSONObjects(Pipeline* p, bool& first, QPDF& pdf)
{ {
JSON::writeDictionaryKey(p, first, "objects", 1); if (m->json_version == 1) {
bool first_object = true; JSON::writeDictionaryKey(p, first, "objects", 1);
JSON::writeDictionaryOpen(p, first_object, 1); bool first_object = true;
bool all_objects = m->json_objects.empty(); JSON::writeDictionaryOpen(p, first_object, 1);
std::set<QPDFObjGen> wanted_og = getWantedJSONObjects(); bool all_objects = m->json_objects.empty();
for (auto& obj: pdf.getAllObjects()) { std::set<QPDFObjGen> wanted_og = getWantedJSONObjects();
std::string key = obj.unparse(); for (auto& obj: pdf.getAllObjects()) {
if (this->m->json_version > 1) { std::string key = obj.unparse();
key = "obj:" + key; if (this->m->json_version > 1) {
key = "obj:" + key;
}
if (all_objects || wanted_og.count(obj.getObjGen())) {
doJSONObject(p, first_object, key, obj);
}
} }
if (all_objects || wanted_og.count(obj.getObjGen())) { if (all_objects || m->json_objects.count("trailer")) {
doJSONObject(p, first_object, key, obj); auto trailer = pdf.getTrailer();
doJSONObject(p, first_object, "trailer", trailer);
} }
JSON::writeDictionaryClose(p, first_object, 1);
} else {
std::set<std::string> json_objects;
if (this->m->json_objects.count("trailer")) {
json_objects.insert("trailer");
}
auto wanted = getWantedJSONObjects();
for (auto const& og: wanted) {
std::ostringstream s;
s << "obj:" << og.unparse(' ') << " R";
json_objects.insert(s.str());
}
pdf.writeJSON(
this->m->json_version,
p,
false,
first,
this->m->decode_level,
this->m->json_stream_data,
this->m->json_stream_prefix,
json_objects);
} }
if (all_objects || m->json_objects.count("trailer")) {
auto trailer = pdf.getTrailer();
doJSONObject(p, first_object, "trailer", trailer);
}
JSON::writeDictionaryClose(p, first_object, 1);
} }
void void
@ -1777,7 +1806,7 @@ void
QPDFJob::doJSON(QPDF& pdf, Pipeline* p) QPDFJob::doJSON(QPDF& pdf, Pipeline* p)
{ {
// qpdf guarantees that no new top-level keys whose names start // qpdf guarantees that no new top-level keys whose names start
// with "xdata" will be added. These are reserved for users. // with "x-" will be added. These are reserved for users.
std::string captured_json; std::string captured_json;
std::shared_ptr<Pl_String> pl_str; std::shared_ptr<Pl_String> pl_str;
@ -1788,32 +1817,38 @@ QPDFJob::doJSON(QPDF& pdf, Pipeline* p)
bool first = true; bool first = true;
JSON::writeDictionaryOpen(p, first, 0); JSON::writeDictionaryOpen(p, first, 0);
// This version is updated every time a non-backward-compatible
// change is made to the JSON format. Clients of the JSON are to
// ignore unrecognized keys, so we only update the version if a
// key disappears or if its value changes meaning.
JSON::writeDictionaryItem(
p, first, "version", JSON::makeInt(this->m->json_version), 1);
JSON j_params = JSON::makeDictionary();
std::string decode_level_str;
switch (m->decode_level) {
case qpdf_dl_none:
decode_level_str = "none";
break;
case qpdf_dl_generalized:
decode_level_str = "generalized";
break;
case qpdf_dl_specialized:
decode_level_str = "specialized";
break;
case qpdf_dl_all:
decode_level_str = "all";
break;
}
j_params.addDictionaryMember(
"decodelevel", JSON::makeString(decode_level_str));
JSON::writeDictionaryItem(p, first, "parameters", j_params, 1);
if (m->json_output) {
// Exclude version and parameters to keep the output file
// minimal. The JSON version is inside the "qpdf" key for
// version 2.
} else {
// This version is updated every time a non-backward-compatible
// change is made to the JSON format. Clients of the JSON are to
// ignore unrecognized keys, so we only update the version if a
// key disappears or if its value changes meaning.
JSON::writeDictionaryItem(
p, first, "version", JSON::makeInt(this->m->json_version), 1);
JSON j_params = JSON::makeDictionary();
std::string decode_level_str;
switch (m->decode_level) {
case qpdf_dl_none:
decode_level_str = "none";
break;
case qpdf_dl_generalized:
decode_level_str = "generalized";
break;
case qpdf_dl_specialized:
decode_level_str = "specialized";
break;
case qpdf_dl_all:
decode_level_str = "all";
break;
}
j_params.addDictionaryMember(
"decodelevel", JSON::makeString(decode_level_str));
JSON::writeDictionaryItem(p, first, "parameters", j_params, 1);
}
bool all_keys = m->json_keys.empty(); bool all_keys = m->json_keys.empty();
// The list of selectable top-level keys is duplicated in the // The list of selectable top-level keys is duplicated in the
// following places: job.yml, QPDFJob::json_schema, and // following places: job.yml, QPDFJob::json_schema, and
@ -1850,11 +1885,7 @@ QPDFJob::doJSON(QPDF& pdf, Pipeline* p)
// qpdf/objects/objectinfo without other keys. // qpdf/objects/objectinfo without other keys.
if (all_keys || m->json_keys.count("objects") || if (all_keys || m->json_keys.count("objects") ||
m->json_keys.count("qpdf")) { m->json_keys.count("qpdf")) {
if (this->m->json_version == 1) { doJSONObjects(p, first, pdf);
doJSONObjects(p, first, pdf);
} else {
writeJSON(p, pdf, false, first);
}
} }
if (this->m->json_version == 1) { if (this->m->json_version == 1) {
// "objectinfo" is not needed for version >1 since you can // "objectinfo" is not needed for version >1 since you can
@ -1889,9 +1920,6 @@ QPDFJob::doInspection(QPDF& pdf)
if (m->check) { if (m->check) {
doCheck(pdf); doCheck(pdf);
} }
if (m->json_version) {
doJSON(pdf, &cout);
}
if (m->show_npages) { if (m->show_npages) {
QTC::TC("qpdf", "QPDFJob npages"); QTC::TC("qpdf", "QPDFJob npages");
cout << pdf.getRoot().getKey("/Pages").getKey("/Count").getIntValue() cout << pdf.getRoot().getKey("/Pages").getKey("/Count").getIntValue()
@ -3337,9 +3365,8 @@ QPDFJob::writeOutfile(QPDF& pdf)
} else if (strcmp(m->outfilename.get(), "-") == 0) { } else if (strcmp(m->outfilename.get(), "-") == 0) {
m->outfilename = nullptr; m->outfilename = nullptr;
} }
if (this->m->json_output) { if (this->m->json_version) {
bool unused = true; writeJSON(pdf);
writeJSON(nullptr, pdf, true, unused);
} else { } else {
// QPDFWriter must have block scope so the output file will be // QPDFWriter must have block scope so the output file will be
// closed after write() finishes. // closed after write() finishes.
@ -3393,52 +3420,30 @@ QPDFJob::writeOutfile(QPDF& pdf)
} }
void void
QPDFJob::writeJSON(Pipeline* p, QPDF& pdf, bool complete, bool& first_key) QPDFJob::writeJSON(QPDF& pdf)
{ {
// File pipeline must have block scope so it will be closed // File pipeline must have block scope so it will be closed
// after write. // after write.
std::shared_ptr<QUtil::FileCloser> fc; std::shared_ptr<QUtil::FileCloser> fc;
std::shared_ptr<Pipeline> fp; std::shared_ptr<Pipeline> fp;
std::string file_prefix = this->m->json_stream_prefix;
if (m->outfilename.get()) { if (m->outfilename.get()) {
QTC::TC("qpdf", "QPDFJob write json to file"); QTC::TC("qpdf", "QPDFJob write json to file");
if (file_prefix.empty()) { if (this->m->json_stream_prefix.empty()) {
file_prefix = this->m->outfilename.get(); this->m->json_stream_prefix = this->m->outfilename.get();
} }
fc = std::make_shared<QUtil::FileCloser>( fc = std::make_shared<QUtil::FileCloser>(
QUtil::safe_fopen(this->m->outfilename.get(), "w")); QUtil::safe_fopen(this->m->outfilename.get(), "w"));
fp = std::make_shared<Pl_StdioFile>("json output", fc->f); fp = std::make_shared<Pl_StdioFile>("json output", fc->f);
} else if ( } else if (
(this->m->json_stream_data == qpdf_sj_file) && file_prefix.empty()) { (this->m->json_stream_data == qpdf_sj_file) &&
this->m->json_stream_prefix.empty()) {
QTC::TC("qpdf", "QPDFJob need json-stream-prefix for stdout"); QTC::TC("qpdf", "QPDFJob need json-stream-prefix for stdout");
usage("please specify --json-stream-prefix since the input file " usage("please specify --json-stream-prefix since the input file "
"name is unknown"); "name is unknown");
} else { } else {
QTC::TC("qpdf", "QPDFJob write json to stdout"); QTC::TC("qpdf", "QPDFJob write json to stdout");
if (p == nullptr) { this->m->log->saveToStandardOutput(true);
fp = this->m->log->getInfo(); fp = this->m->log->getSave();
}
} }
if (p == nullptr) { doJSON(pdf, fp.get());
p = fp.get();
}
std::set<std::string> json_objects;
if (this->m->json_objects.count("trailer")) {
json_objects.insert("trailer");
}
auto wanted = getWantedJSONObjects();
for (auto const& og: wanted) {
std::ostringstream s;
s << "obj:" << og.unparse(' ') << " R";
json_objects.insert(s.str());
}
pdf.writeJSON(
this->m->json_version,
p,
complete,
first_key,
this->m->decode_level,
this->m->json_stream_data,
file_prefix,
json_objects);
} }

View File

@ -244,7 +244,6 @@ QPDFJob::Config::json(std::string const& parameter)
if ((o.m->json_version < 1) || (o.m->json_version > JSON::LATEST)) { if ((o.m->json_version < 1) || (o.m->json_version > JSON::LATEST)) {
usage(std::string("unsupported json version ") + parameter); usage(std::string("unsupported json version ") + parameter);
} }
o.m->require_outfile = false;
return this; return this;
} }
@ -297,14 +296,7 @@ QPDFJob::Config*
QPDFJob::Config::jsonOutput(std::string const& parameter) QPDFJob::Config::jsonOutput(std::string const& parameter)
{ {
o.m->json_output = true; o.m->json_output = true;
if (parameter.empty() || (parameter == "latest")) { json(parameter);
o.m->json_version = JSON::LATEST;
} else {
o.m->json_version = QUtil::string_to_int(parameter.c_str());
}
if ((o.m->json_version < 2) || (o.m->json_version > JSON::LATEST)) {
usage(std::string("unsupported json output version ") + parameter);
}
if (!o.m->json_stream_data_set) { if (!o.m->json_stream_data_set) {
// No need to set json_stream_data_set -- that indicates // No need to set json_stream_data_set -- that indicates
// explicit use of --json-stream-data. // explicit use of --json-stream-data.
@ -313,9 +305,7 @@ QPDFJob::Config::jsonOutput(std::string const& parameter)
if (!o.m->decode_level_set) { if (!o.m->decode_level_set) {
o.m->decode_level = qpdf_dl_none; o.m->decode_level = qpdf_dl_none;
} }
if (o.m->json_keys.empty()) { o.m->json_keys.insert("qpdf");
o.m->json_keys.insert("qpdf");
}
return this; return this;
} }

View File

@ -803,7 +803,9 @@ depth in the JSON section of the manual. "version" may be a
specific version or "latest" (the default). Run qpdf --json-help specific version or "latest" (the default). Run qpdf --json-help
for a description of the generated JSON object. for a description of the generated JSON object.
)"); )");
ap.addOptionHelp("--json-help", "json", "show format of JSON output", R"(Describe the format of the JSON output by writing to standard ap.addOptionHelp("--json-help", "json", "show format of JSON output", R"(--json-help[=version]
Describe the format of the JSON output by writing to standard
output a JSON object with the same keys and with values output a JSON object with the same keys and with values
containing descriptive text. containing descriptive text.
)"); )");
@ -838,17 +840,17 @@ which is to use the output file name. Whatever is given here
will be appended with -nnn to create the name of the file that will be appended with -nnn to create the name of the file that
will contain the data for the stream stream in object nnn. will contain the data for the stream stream in object nnn.
)"); )");
ap.addOptionHelp("--json-output", "json", "serialize to JSON", R"(--json-output[=version] ap.addOptionHelp("--json-output", "json", "apply defaults for JSON serialization", R"(--json-output[=version]
The output file will be qpdf JSON format at the given version. Implies --json=version. Changes default values for certain
"version" may be a specific version or "latest" (the default). options so that the JSON output written is the most faithful
The only supported version is 2. See also --json-stream-data, representation of the original PDF and contains no additional
--json-stream-prefix, and --decode-level. JSON keys. See also --json-stream-data, --json-stream-prefix,
and --decode-level.
)"); )");
ap.addOptionHelp("--json-input", "json", "input file is qpdf JSON", R"(Treat the input file as a JSON file in qpdf JSON format as ap.addOptionHelp("--json-input", "json", "input file is qpdf JSON", R"(Treat the input file as a JSON file in qpdf JSON format. See the
written by qpdf --json-output. See the "qpdf JSON Format" "qpdf JSON Format" section of the manual for information about
section of the manual for information about how to use this how to use this option.
option.
)"); )");
ap.addOptionHelp("--update-from-json", "json", "update a PDF from qpdf JSON", R"(--update-from-json=qpdf-json-file ap.addOptionHelp("--update-from-json", "json", "update a PDF from qpdf JSON", R"(--update-from-json=qpdf-json-file

View File

@ -28,7 +28,7 @@ static constexpr char const* JOB_SCHEMA_DATA = R"({
"forceVersion": "set output PDF version", "forceVersion": "set output PDF version",
"progress": "show progress when writing", "progress": "show progress when writing",
"splitPages": "write pages to separate files", "splitPages": "write pages to separate files",
"jsonOutput": "serialize to JSON", "jsonOutput": "apply defaults for JSON serialization",
"encrypt": { "encrypt": {
"userPassword": "user password", "userPassword": "user password",
"ownerPassword": "owner password", "ownerPassword": "owner password",

View File

@ -3194,7 +3194,16 @@ Related Options
:qpdf:ref:`--json-help` option to get a description of the JSON :qpdf:ref:`--json-help` option to get a description of the JSON
object. object.
.. qpdf:option:: --json-help Starting with qpdf 11, when this option is specified, an output
file is optional (for backward compatibility) and defaults to
standard output. You may specify an output file to write the JSON
to a file rather than standard output.
Stream data is only included if :qpdf:ref:`--json-output` is
specified or if a value other than ``none`` is passed to
:qpdf:ref:`--json-stream-data`.
.. qpdf:option:: --json-help[=version]
.. help: show format of JSON output .. help: show format of JSON output
@ -3202,12 +3211,13 @@ Related Options
output a JSON object with the same keys and with values output a JSON object with the same keys and with values
containing descriptive text. containing descriptive text.
Describe the format of the JSON output by writing to standard Describe the format of the corresponding version of JSON output by
output a JSON object with the same structure as the JSON generated writing to standard output a JSON object with the same structure as
by qpdf. In the output written by ``--json-help``, each key's value the JSON generated by qpdf. In the output written by
is a description of the key. The specific contract guaranteed by ``--json-help``, each key's value is a description of the key. The
qpdf in its JSON representation is explained in more detail in the specific contract guaranteed by qpdf in its JSON representation is
:ref:`json`. explained in more detail in the :ref:`json`. The default version of
help is version ``2``, as with the :qpdf:ref:`--json` flag.
.. qpdf:option:: --json-key=key .. qpdf:option:: --json-key=key
@ -3233,11 +3243,9 @@ Related Options
objects will be shown. objects will be shown.
This option is repeatable. If given, only specified objects will be This option is repeatable. If given, only specified objects will be
shown in the ``"objects"`` key of the JSON output. Otherwise, all shown in the objects dictionary in the JSON output. Otherwise, all
objects will be shown. For qpdf JSON version 1, this also affects objects will be shown. See :ref:`json` for details about the qpdf
the ``"objectinfo"`` key, which is not present in version 2. This JSON format.
option may be used with :qpdf:ref:`--json` and also with
:qpdf:ref:`--json-output`.
.. qpdf:option:: --json-stream-data={none|inline|file} .. qpdf:option:: --json-stream-data={none|inline|file}
@ -3281,28 +3289,30 @@ Related Options
.. qpdf:option:: --json-output[=version] .. qpdf:option:: --json-output[=version]
.. help: serialize to JSON .. help: apply defaults for JSON serialization
The output file will be qpdf JSON format at the given version. Implies --json=version. Changes default values for certain
"version" may be a specific version or "latest" (the default). options so that the JSON output written is the most faithful
The only supported version is 2. See also --json-stream-data, representation of the original PDF and contains no additional
--json-stream-prefix, and --decode-level. JSON keys. See also --json-stream-data, --json-stream-prefix,
and --decode-level.
The output file, instead of being a PDF file, will be a JSON file Implies :qpdf:ref:`--json` at the specified version. This option
in qpdf JSON format at the given version. ``version`` may be a changes several default values, all of which can be overridden by
specific version or ``latest`` (the default). The only supported specifying the stated option:
version is 2. See also :qpdf:ref:`--json-stream-data` and
:qpdf:ref:`--json-stream-prefix`. This option also changes the
following defaults:
- The default value for :qpdf:ref:`--json-stream-data` changes from - The default value for :qpdf:ref:`--json-stream-data` changes from
``none`` to ``inline``. ``none`` to ``inline``.
- The default decode level for stream data becomes ``none``, but you can - The default value for :qpdf:ref:`--decode-level` changes from
override it with :qpdf:ref:`--decode-level`. ``generalized`` to ``none``.
- Only the ``"qpdf"`` key is included in the JSON output, but you - By default, only the ``"qpdf"`` key is included in the JSON
can add additional keys with :qpdf:ref:`--json-key`. output, but you can add additional keys with
:qpdf:ref:`--json-key`.
- Excludes the ``"version"`` and ``"parameters"`` keys from the
JSON output.
If you want to look at the contents of streams easily as you would If you want to look at the contents of streams easily as you would
in QDF mode (see :ref:`qdf`), you can use in QDF mode (see :ref:`qdf`), you can use
@ -3313,15 +3323,15 @@ Related Options
.. help: input file is qpdf JSON .. help: input file is qpdf JSON
Treat the input file as a JSON file in qpdf JSON format as Treat the input file as a JSON file in qpdf JSON format. See the
written by qpdf --json-output. See the "qpdf JSON Format" "qpdf JSON Format" section of the manual for information about
section of the manual for information about how to use this how to use this option.
option.
Treat the input file as a JSON file in qpdf JSON format as written Treat the input file as a JSON file in qpdf JSON format. The input
by ``qpdf --json-output``. The input file must be complete and file must be complete and include all stream data. The JSON version
include all stream data. For information about converting between must be at least 2. All top-level keys are ignored except for
PDF and JSON, please see :ref:`json`. ``"qpdf"``. For information about converting between PDF and JSON,
please see :ref:`json`.
.. qpdf:option:: --update-from-json=qpdf-json-file .. qpdf:option:: --update-from-json=qpdf-json-file

View File

@ -24,27 +24,28 @@ represents the contents of a PDF file. This is distinct from the
interacting with qpdf the way the command-line tool does. For interacting with qpdf the way the command-line tool does. For
information about that, see :ref:`qpdf-job`. information about that, see :ref:`qpdf-job`.
The qpdf JSON format is specific to qpdf. There are two ways to use The qpdf JSON format is specific to qpdf. With JSON version 2, the
qpdf JSON: :qpdf:ref:`--json` command-line flag causes creation of a JSON
representation of all the objects in a PDF file. This includes an
unambiguous representation of the PDF object structure and also
provides JSON-formatted summaries of other information about the file.
This functionality is built into ``QPDFJob`` and can be accessed from
the ``qpdf`` command-line tool or from the ``QPDFJob`` C or C++ API.
- The :qpdf:ref:`--json` command-line flag causes creation of a JSON By default, stream data is omitted, but it can be included by
representation of all the objects in a PDF file, excluding stream specifying the :qpdf:ref:`--json-stream-data` option. With stream data
data. This includes an unambiguous representation of the PDF object included, the generated JSON file completely represents a PDF file.
structure and also provides JSON-formatted summaries of other You can think of this as using JSON as an *alternative syntax* for
information about the file. This functionality is built into representing a PDF file. Using qpdf JSON, it is possible to convert a
``QPDFJob`` and can be accessed from the ``qpdf`` command-line tool PDF file to JSON, manipulate the structure or contents of the objects
or from the ``QPDFJob`` C or C++ API. at a low level, and convert the results back to a PDF file. This
functionality can be accessed from the command-line with the
- qpdf can create a JSON file that completely represents a PDF file. :qpdf:ref:`--json-input` and :qpdf:ref:`--update-from-json` flags, or
You can think of this as using JSON as an *alternative syntax* for from the API using the ``QPDF::writeJSON``, ``QPDF::createFromJSON``,
representing a PDF file. Using qpdf JSON, it is possible to and ``QPDF::updateFromJSON`` methods. The :qpdf:ref:`--json-output`
convert a PDF file to JSON, manipulate the structure or contents of flag changes a handful of defaults so that the resulting JSON is as
the objects at a low level, and convert the results back to a PDF close as possible to the original input and is ready for being
file. This functionality can be accessed from the command-line with converted back to PDF.
the :qpdf:ref:`--json-output`, :qpdf:ref:`--json-input`, and
:qpdf:ref:`--update-from-json` flags, or from the API using the
``QPDF::writeJSON``, ``QPDF::createFromJSON``, and
``QPDF::updateFromJSON`` methods.
.. _json-terminology: .. _json-terminology:
@ -120,18 +121,53 @@ qpdf JSON Object Representation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This section describes the representation of PDF objects in qpdf JSON This section describes the representation of PDF objects in qpdf JSON
version 2. PDF objects are represented within the ``"objects"`` version 2. PDF objects are represented within the ``"qpdf"`` entry of
dictionary of a qpdf JSON file. This is true both for PDF serialized a qpdf JSON file. The ``"qpdf"`` entry is a two-element array. The
to JSON (:qpdf:ref:`--json-output`, ``QPDF::writeJSON``) or objects as first element is a dictionary containing header-like information about
they appear in the output of ``qpdf`` with the :qpdf:ref:`--json` the file such as the PDF version. The second element is a dictionary
option. containing all the objects in the PDF file. We refer to this as the
*objects dictionary*.
Each key in the ``"objects"`` dictionary is either ``"trailer"`` or a The first element contains the following keys:
string of the form ``"obj:O G R"`` where ``O`` and ``G`` are the
object and generation numbers and ``R`` is the literal string ``R``. - ``"jsonversion"`` -- a number indicating the JSON version used for
This is the PDF syntax for the indirect object reference prepended by writing. This will always be ``2``.
``obj:``. The value, representing the object itself, is a JSON object
whose structure is described below. - ``"pdfversion"`` -- a string containing PDF version as indicated in
the PDF header (e.g. ``"1.7"``, ``"2.0"``)
- ``"pushedinheritedpageresources"`` -- a boolean indicating whether
the library pushed inherited resources down to the page level.
Certain library calls cause this to happen, and qpdf needs to know
when reading a JSON file back in whether it should do this as it may
cause certain objects to be renumbered.
- ``"calledgetallpages"`` -- a boolean indicating whether
``getAllPages`` was called prior to writing the JSON output. This
method causes page tree repair to occur, which may renumber some
objects (in very rare cases of corrupted page trees), so qpdf needs
to know this information when reading a JSON file back in.
- ``"maxobjectid"`` -- a number indicating the object ID of the
highest numbered object in the file. This is provided to make it
easier for software that wants to add new objects to the file as you
can safely start with one above that number when creating new
objects. Note that the value of ``"maxobjectid"`` may be higher than
the actual maximum object that appears in the input PDF since it
takes into consideration any dangling indirect object references
from the original file. This prevents you from unwittingly creating
an object that doesn't exist but that is referenced, which may have
unintended side effects. (The PDF specification explicitly allows
dangling references and says to treat them as nulls. This can happen
if objects are removed from a PDF file.)
The second element is the objects dictionary. Each key in the objects
dictionary is either ``"trailer"`` or a string of the form ``"obj:O G
R"`` where ``O`` and ``G`` are the object and generation numbers and
``R`` is the literal string ``R``. This is the PDF syntax for the
indirect object reference prepended by ``obj:``. The value,
representing the object itself, is a JSON object whose structure is
described below.
Top-level Stream Objects Top-level Stream Objects
Stream objects are represented as a JSON object with the single key Stream objects are represented as a JSON object with the single key
@ -143,6 +179,7 @@ Top-level Stream Objects
- ``none``: stream data is not represented; no other keys are - ``none``: stream data is not represented; no other keys are
present present
specified.
- ``inline``: the stream data appears as a base64-encoded string as - ``inline``: the stream data appears as a base64-encoded string as
the value of the ``"data"`` key the value of the ``"data"`` key
@ -249,57 +286,6 @@ Object Values
the string representations of names and whose values are the string representations of names and whose values are
representations of PDF objects. representations of PDF objects.
.. _json.output:
qpdf JSON Output
~~~~~~~~~~~~~~~~
The format of the JSON written by qpdf's :qpdf:ref:`--json-output`
flag or the ``QPDF::writeJSON`` API call is a JSON object consisting
of a single key: ``"qpdf"``. This may be the only key, or it may be
embedded in the output of ``qpdf --json``. Unknown keys are ignored
for future compatibility. It is guaranteed that qpdf will never add
any keys whose names start with ``xdata``, so users are free to add
their own metadata using keys whose names start with ``xdata`` without
fear of clashing with a future version of qpdf.
The ``"qpdf"`` key points to a two-element JSON array. The first element is
a JSON object with the following keys:
- ``"jsonversion"`` -- a number indicating the JSON version used for
writing. This will always be ``2``.
- ``"pdfversion"`` -- a string containing the PDF version as indicated in
the PDF header (e.g. ``"1.7"``, ``"2.0"``)
- ``pushedinheritedpageresources`` -- a boolean indicating whether
the library pushed inherited resources down to the page level.
Certain library calls cause this to happen, and qpdf needs to know
when reading a JSON file back in whether it should do this as it may
cause certain objects to be renumbered.
- ``calledgetallpages`` -- a boolean indicating whether
``getAllPages`` was called prior to writing the JSON output. This
method causes page tree repair to occur, which may renumber some
objects (in very rare cases of corrupted page trees), so qpdf needs
to know this information when reading a JSON file back in.
- ``"maxobjectid"`` -- a number indicating the object ID of the
highest numbered object in the file. This is provided to make it
easier for software that wants to add new objects to the file as you
can safely start with one above that number when creating new
objects. Note that the value of ``"maxobjectid"`` may be higher than
the actual maximum object that appears in the input PDF since it
takes into consideration any dangling indirect object references
from the original file. This prevents you from unwittingly creating
an object that doesn't exist but that is referenced, which may have
unintended side effects. (The PDF specification explicitly allows
dangling references and says to treat them as nulls. This can happen
if objects are removed from a PDF file.)
The second element is a JSON object containing the actual PDF objects
as described in :ref:`json.objects`.
Note that writing JSON output is done by ``QPDF``, not ``QPDFWriter``. Note that writing JSON output is done by ``QPDF``, not ``QPDFWriter``.
As such, none of the things ``QPDFWriter`` does apply. This includes As such, none of the things ``QPDFWriter`` does apply. This includes
recompression of streams, renumbering of objects, anything to do with recompression of streams, renumbering of objects, anything to do with
@ -325,7 +311,7 @@ qpdf JSON format.
"pdfversion": "1.3", "pdfversion": "1.3",
"pushedinheritedpageresources": false, "pushedinheritedpageresources": false,
"calledgetallpages": false, "calledgetallpages": false,
"maxobjectid": 5, "maxobjectid": 5
}, },
{ {
"obj:1 0 R": { "obj:1 0 R": {
@ -389,8 +375,7 @@ qpdf JSON format.
qpdf JSON Input qpdf JSON Input
~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~
Output in the JSON output format described in :ref:`json.output` can The qpdf JSON output can be used in two different ways:
be used in two different ways:
- By using the :qpdf:ref:`--json-input` flag or calling - By using the :qpdf:ref:`--json-input` flag or calling
``QPDF::createFromJSON`` in place of ``QPDF::processFile``, a qpdf ``QPDF::createFromJSON`` in place of ``QPDF::processFile``, a qpdf
@ -408,8 +393,11 @@ Here are some important things to know about qpdf JSON input.
- When a qpdf JSON file is used as the primary input file, it must be - When a qpdf JSON file is used as the primary input file, it must be
complete. This means complete. This means
- A JSON version number must be specified with the ``"jsonversion"``
key in the first array element
- A PDF version number must be specified with the ``"pdfversion"`` - A PDF version number must be specified with the ``"pdfversion"``
key key in the first array element
- Stream data must be present for all streams - Stream data must be present for all streams
@ -422,6 +410,9 @@ Here are some important things to know about qpdf JSON input.
- ``"maxobjectid"`` is ignored, so it is not necessary to update it - ``"maxobjectid"`` is ignored, so it is not necessary to update it
when adding new objects. when adding new objects.
- ``"calledgetallpages"`` and ``"pushedinheritedpageresources"`` are
treated as false if omitted.
- ``"/Length"`` is ignored in all stream dictionaries. qpdf doesn't - ``"/Length"`` is ignored in all stream dictionaries. qpdf doesn't
put it there when it creates JSON output, and it is not necessary put it there when it creates JSON output, and it is not necessary
to add it. to add it.
@ -432,14 +423,13 @@ Here are some important things to know about qpdf JSON input.
- Unknown keys at the top level of the file, within ``objects``, - Unknown keys at the top level of the file, within ``objects``,
at the top level of each individual object (inside the object that at the top level of each individual object (inside the object that
has the ``"value"`` or ``"stream"`` key) and directly within has the ``"value"`` or ``"stream"`` key) and directly within
``"stream"`` are ignored for future compatibility. You should ``"stream"`` are ignored for future compatibility. This includes
avoid putting your own values in those places if you wish to avoid other top-level keys generated by ``qpdf`` itself (such as
risking that your JSON files will not work in future versions of ``"pages"``). As such, those keys don't have to be consistent with
qpdf. The exception to this advice is at the top level of the the ``"qpdf"`` key if modifying a JSON file for conversion back to
overall file where it is explicitly supported for you to add your PDF. If you wish to store application-specific metadata, you can
own keys. For example, you could add your own metadata at the top do so by adding a key whose name starts with ``x-``. qpdf is
level, and qpdf will ignore it. Note that extra top-level keys are guaranteed not to add any of its own keys that start with ``x-``.
not preserved when qpdf reads your JSON file.
- When qpdf reads a PDF file, the internal object numbers are always - When qpdf reads a PDF file, the internal object numbers are always
preserved. However, when qpdf writes a file using ``QPDFWriter``, preserved. However, when qpdf writes a file using ``QPDFWriter``,
@ -458,9 +448,9 @@ Here are some important things to know about qpdf JSON input.
# edit pdf.json # edit pdf.json
qpdf in.pdf out.pdf --update-from-json=pdf.json qpdf in.pdf out.pdf --update-from-json=pdf.json
The following will not produce predictable results because The following will produce unpredictable and probably incorrect
``out.pdf`` won't have the same object numbers as ``pdf.json`` and results because ``out.pdf`` won't have the same object numbers as
``in.pdf``. ``pdf.json`` and ``in.pdf``.
:: ::
@ -658,15 +648,16 @@ be aware of:
- If a PDF file has certain types of errors in its pages tree (such as - If a PDF file has certain types of errors in its pages tree (such as
page objects that are direct or multiple pages sharing the same page objects that are direct or multiple pages sharing the same
object ID), qpdf will automatically repair the pages tree. If you object ID), qpdf will automatically repair the pages tree. If you
specify ``"objects"`` (and, with qpdf JSON version 1, also specify ``"qpdf"`` (or, with qpdf JSON version 1, ``"objects"`` or
``"objectinfo"``) without any other keys, you will see the original ``"objectinfo"``) without any other keys, you will see the original
pages tree without any corrections. If you specify any of the keys that pages tree without any corrections. If you specify any of the keys that
require page tree traversal (for example, ``"pages"``, require page tree traversal (for example, ``"pages"``,
``"outlines"``, or ``"pagelabel"``), then ``"objects"`` (and ``"outlines"``, or ``"pagelabel"``), then ``"qpdf"`` (and
``"objectinfo"``) will show the repaired page tree so that object ``"objects"`` and ``"objectinfo"``) will show the repaired page
references will be consistent throughout the file. This is not an tree so that object references will be consistent throughout the
issue with :qpdf:ref:`--json-output`, which doesn't repair the pages file. You can tell if this has happened by looking at the
tree. ``"calledgetallpages"`` and ``"pushedinheritedpageresources"``
fields in the first element of the ``"qpdf"`` array.
- While qpdf guarantees that keys present in the help will be present - While qpdf guarantees that keys present in the help will be present
in the output, those fields may be null or empty if the information in the output, those fields may be null or empty if the information
@ -743,16 +734,17 @@ version 2.
dictionary containing either a ``"value"`` key or a ``"stream"`` dictionary containing either a ``"value"`` key or a ``"stream"``
key, making it possible to distinguish streams from other objects. key, making it possible to distinguish streams from other objects.
- The ``"objectinfo"`` key has been removed in favor of a - The ``"objectinfo"`` and ``"objects"`` keys have been removed in
representation in ``"objects"`` that differentiates between a stream favor of a representation in ``"qpdf"`` that includes header
and other kinds of objects. In v1, it was not possible to tell a information and differentiates between a stream and other kinds of
stream from a dictionary within ``"objects"``. objects. In v1, it was not possible to tell a stream from a
dictionary within ``"objects"``, and the PDF version was not
captured at all.
- Within the ``"objects"`` dictionary, keys are now ``"obj:O G R"`` - Within the objects dictionary, keys are now ``"obj:O G R"`` where
where ``O`` and ``G`` are the object and generation number. ``O`` and ``G`` are the object and generation number. ``"trailer"``
``"trailer"`` remains the key for the trailer dictionary. In v1, the remains the key for the trailer dictionary. In v1, the ``obj:``
``obj:`` prefix was not present. The rationale for this change is as prefix was not present. The rationale for this change is as follows:
follows:
- Having a unique prefix (``obj:``) makes it much easier to search - Having a unique prefix (``obj:``) makes it much easier to search
in the JSON file for the definition of an object in the JSON file for the definition of an object