diff --git a/TODO b/TODO index 97ac3d1c..9fdfff90 100644 --- a/TODO +++ b/TODO @@ -69,46 +69,11 @@ Soon: Break ground on "Document-level work" JSON v2 fixes ============= -* Unify code between QPDFJob::doJSONObjects and QPDF::writeJSON. Make - sure that the "qpdf" key is always present when json-output is - specified. +* Rethink QPDF::writeJSON. Maybe provide a simpler overload? -* Change the name of the "qpdf-v2" key to "qpdf". Use that in place of - "objects" and change its content to a two-element array whose first - element is metadata required (or useful) for parsing and whose - second element contains the actual data. Use of an array is the only - way to ensure that the metadata is guaranteed to be parsed before we - start parsing the objects. Example: - - { - "qpdf": [ - { - "jsonversion": 2, - "pdfversion": "1.3", - "pushedinheritedpageresources": false, - "calledgetallpages": false, - "maxobjectid": 10 - }, - { - ... objects ... - } - ] - } - - This implies a few things: - - * Still need to test pushedinheritedpageresources and - calledgetallpages and check/use their values when reading - - * Fix --json-help - - * When reading back in, we'll have to call - pushInheritedAttributesToPage or getAllPages based on the values - of the metadata. - - * Test --json with --json-stream-data and --json-output with - --json-stream-data=none. Recheck writeJSON's handling of the - pipeline argument. +* When reading back in, we'll have to call + pushInheritedAttributesToPage or getAllPages based on the values + of the metadata. * Support json v2 in the C API. At a minimum, write_json, create_from_json, and update_from_json need to be there and should diff --git a/include/qpdf/QPDF.hh b/include/qpdf/QPDF.hh index f218f87f..f0a5f31c 100644 --- a/include/qpdf/QPDF.hh +++ b/include/qpdf/QPDF.hh @@ -133,7 +133,7 @@ class QPDF QPDF_DLL void updateFromJSON(std::shared_ptr); - // Write qpdf json format to the pipeline "p". 
The only supported + // Write qpdf JSON format to the pipeline "p". The only supported // version is 2. // // If the value of "complete" is true, a complete JSON object diff --git a/include/qpdf/QPDFJob.hh b/include/qpdf/QPDFJob.hh index f2696327..2106a62c 100644 --- a/include/qpdf/QPDFJob.hh +++ b/include/qpdf/QPDFJob.hh @@ -554,7 +554,7 @@ class QPDFJob void setEncryptionOptions(QPDF&, QPDFWriter&); void maybeFixWritePassword(int R, std::string& password); void writeOutfile(QPDF& pdf); - void writeJSON(Pipeline* p, QPDF& pdf, bool complete, bool& first_key); + void writeJSON(QPDF& pdf); // JSON void doJSON(QPDF& pdf, Pipeline*); diff --git a/job.sums b/job.sums index fb7d62a9..d41bd4ce 100644 --- a/job.sums +++ b/job.sums @@ -8,10 +8,10 @@ include/qpdf/auto_job_c_pages.hh b3cc0f21029f6d89efa043dcdbfa183cb59325b6506001c include/qpdf/auto_job_c_uo.hh ae21b69a1efa9333050f4833d465f6daff87e5b38e5106e49bbef5d4132e4ed1 job.yml f9564f18b08a45d17328af43652645771d3498471820c858b8c9013a193e1412 libqpdf/qpdf/auto_job_decl.hh 7844eba58edffb9494b19e8eca6fd59a24d6e152ca606c3b07da569f753df2da -libqpdf/qpdf/auto_job_help.hh db2e4350c700e064b204e3e20d4fee4eddfe312b28092afcf608b4b6863d30e5 +libqpdf/qpdf/auto_job_help.hh 700d7600b34588169c80f3e325e39e592e2f5c1af1cdac16614150ff38424b40 libqpdf/qpdf/auto_job_init.hh fd1635a5ad6ba16b7ae008467145560a59a5ecfd10d29c5ef7cd0d8347747cd2 libqpdf/qpdf/auto_job_json_decl.hh 06caa46eaf71db8a50c046f91866baa8087745a9474319fb7c86d92634cc8297 libqpdf/qpdf/auto_job_json_init.hh 59545578a2e47c660ff98516ed53f06638be75eb4658e2a09d32cc08e0cb7268 -libqpdf/qpdf/auto_job_schema.hh 9d543cd4a43eafffc2c4b8a6fee29e399c271c52cb6f7d417ae5497b3c1127dc +libqpdf/qpdf/auto_job_schema.hh 5352ef1be1ad7cc6f4f36dab88f2937d278e6bd3a0e2d46259794dc226c8ba6b manual/_ext/qpdf.py 6add6321666031d55ed4aedf7c00e5662bba856dfcd66ccb526563bffefbb580 -manual/cli.rst 8e1f443c6fa000e023e516c318df4d04d58233d4d8648907c4a71f0ea5722bca +manual/cli.rst 
bbce4cfb662a96c8df0c8563f8065844b77aca7b4ec6385955546b9a455d9953 diff --git a/libqpdf/QPDFJob.cc b/libqpdf/QPDFJob.cc index bc8f64f3..f1b35f56 100644 --- a/libqpdf/QPDFJob.cc +++ b/libqpdf/QPDFJob.cc @@ -680,8 +680,15 @@ QPDFJob::checkConfiguration() " an output file is specified"); } else if (m->split_pages) { usage("--split-pages may not be used with --replace-input"); + } else if (m->json_version) { + usage("--json may not be used with --replace-input"); } } + if (m->json_version && (m->outfilename == nullptr)) { + // The output file is optional with --json for backward + // compatibility and defaults to standard output. + m->outfilename = QUtil::make_shared_cstr("-"); + } if (m->infilename == nullptr) { usage("an input file name is required"); } else if ( @@ -1116,25 +1123,47 @@ QPDFJob::doJSONObject( void QPDFJob::doJSONObjects(Pipeline* p, bool& first, QPDF& pdf) { - JSON::writeDictionaryKey(p, first, "objects", 1); - bool first_object = true; - JSON::writeDictionaryOpen(p, first_object, 1); - bool all_objects = m->json_objects.empty(); - std::set wanted_og = getWantedJSONObjects(); - for (auto& obj: pdf.getAllObjects()) { - std::string key = obj.unparse(); - if (this->m->json_version > 1) { - key = "obj:" + key; + if (m->json_version == 1) { + JSON::writeDictionaryKey(p, first, "objects", 1); + bool first_object = true; + JSON::writeDictionaryOpen(p, first_object, 1); + bool all_objects = m->json_objects.empty(); + std::set wanted_og = getWantedJSONObjects(); + for (auto& obj: pdf.getAllObjects()) { + std::string key = obj.unparse(); + if (this->m->json_version > 1) { + key = "obj:" + key; + } + if (all_objects || wanted_og.count(obj.getObjGen())) { + doJSONObject(p, first_object, key, obj); + } } - if (all_objects || wanted_og.count(obj.getObjGen())) { - doJSONObject(p, first_object, key, obj); + if (all_objects || m->json_objects.count("trailer")) { + auto trailer = pdf.getTrailer(); + doJSONObject(p, first_object, "trailer", trailer); } + 
JSON::writeDictionaryClose(p, first_object, 1); + } else { + std::set json_objects; + if (this->m->json_objects.count("trailer")) { + json_objects.insert("trailer"); + } + auto wanted = getWantedJSONObjects(); + for (auto const& og: wanted) { + std::ostringstream s; + s << "obj:" << og.unparse(' ') << " R"; + json_objects.insert(s.str()); + } + pdf.writeJSON( + this->m->json_version, + p, + false, + first, + this->m->decode_level, + this->m->json_stream_data, + this->m->json_stream_prefix, + json_objects); } - if (all_objects || m->json_objects.count("trailer")) { - auto trailer = pdf.getTrailer(); - doJSONObject(p, first_object, "trailer", trailer); - } - JSON::writeDictionaryClose(p, first_object, 1); } void @@ -1777,7 +1806,7 @@ void QPDFJob::doJSON(QPDF& pdf, Pipeline* p) { // qpdf guarantees that no new top-level keys whose names start - // with "xdata" will be added. These are reserved for users. + // with "x-" will be added. These are reserved for users. std::string captured_json; std::shared_ptr pl_str; @@ -1788,32 +1817,38 @@ QPDFJob::doJSON(QPDF& pdf, Pipeline* p) bool first = true; JSON::writeDictionaryOpen(p, first, 0); - // This version is updated every time a non-backward-compatible - // change is made to the JSON format. Clients of the JSON are to - // ignore unrecognized keys, so we only update the version of a - // key disappears or if its value changes meaning. 
- JSON::writeDictionaryItem( - p, first, "version", JSON::makeInt(this->m->json_version), 1); - JSON j_params = JSON::makeDictionary(); - std::string decode_level_str; - switch (m->decode_level) { - case qpdf_dl_none: - decode_level_str = "none"; - break; - case qpdf_dl_generalized: - decode_level_str = "generalized"; - break; - case qpdf_dl_specialized: - decode_level_str = "specialized"; - break; - case qpdf_dl_all: - decode_level_str = "all"; - break; - } - j_params.addDictionaryMember( - "decodelevel", JSON::makeString(decode_level_str)); - JSON::writeDictionaryItem(p, first, "parameters", j_params, 1); + if (m->json_output) { + // Exclude version and parameters to keep the output file + // minimal. The JSON version is inside the "qpdf" key for + // version 2. + } else { + // This version is updated every time a non-backward-compatible + // change is made to the JSON format. Clients of the JSON are to + // ignore unrecognized keys, so we only update the version if a + // key disappears or if its value changes meaning. + JSON::writeDictionaryItem( + p, first, "version", JSON::makeInt(this->m->json_version), 1); + JSON j_params = JSON::makeDictionary(); + std::string decode_level_str; + switch (m->decode_level) { + case qpdf_dl_none: + decode_level_str = "none"; + break; + case qpdf_dl_generalized: + decode_level_str = "generalized"; + break; + case qpdf_dl_specialized: + decode_level_str = "specialized"; + break; + case qpdf_dl_all: + decode_level_str = "all"; + break; + } + j_params.addDictionaryMember( + "decodelevel", JSON::makeString(decode_level_str)); + JSON::writeDictionaryItem(p, first, "parameters", j_params, 1); + } bool all_keys = m->json_keys.empty(); // The list of selectable top-level keys id duplicated in the // following places: job.yml, QPDFJob::json_schema, and @@ -1850,11 +1885,7 @@ QPDFJob::doJSON(QPDF& pdf, Pipeline* p) // qpdf/objects/objectinfo without other keys.
if (all_keys || m->json_keys.count("objects") || m->json_keys.count("qpdf")) { - if (this->m->json_version == 1) { - doJSONObjects(p, first, pdf); - } else { - writeJSON(p, pdf, false, first); - } + doJSONObjects(p, first, pdf); } if (this->m->json_version == 1) { // "objectinfo" is not needed for version >1 since you can @@ -1889,9 +1920,6 @@ QPDFJob::doInspection(QPDF& pdf) if (m->check) { doCheck(pdf); } - if (m->json_version) { - doJSON(pdf, &cout); - } if (m->show_npages) { QTC::TC("qpdf", "QPDFJob npages"); cout << pdf.getRoot().getKey("/Pages").getKey("/Count").getIntValue() @@ -3337,9 +3365,8 @@ QPDFJob::writeOutfile(QPDF& pdf) } else if (strcmp(m->outfilename.get(), "-") == 0) { m->outfilename = nullptr; } - if (this->m->json_output) { - bool unused = true; - writeJSON(nullptr, pdf, true, unused); + if (this->m->json_version) { + writeJSON(pdf); } else { // QPDFWriter must have block scope so the output file will be // closed after write() finishes. @@ -3393,52 +3420,30 @@ QPDFJob::writeOutfile(QPDF& pdf) } void -QPDFJob::writeJSON(Pipeline* p, QPDF& pdf, bool complete, bool& first_key) +QPDFJob::writeJSON(QPDF& pdf) { // File pipeline must have block scope so it will be closed // after write. 
std::shared_ptr fc; std::shared_ptr fp; - std::string file_prefix = this->m->json_stream_prefix; if (m->outfilename.get()) { QTC::TC("qpdf", "QPDFJob write json to file"); - if (file_prefix.empty()) { - file_prefix = this->m->outfilename.get(); + if (this->m->json_stream_prefix.empty()) { + this->m->json_stream_prefix = this->m->outfilename.get(); } fc = std::make_shared( QUtil::safe_fopen(this->m->outfilename.get(), "w")); fp = std::make_shared("json output", fc->f); } else if ( - (this->m->json_stream_data == qpdf_sj_file) && file_prefix.empty()) { + (this->m->json_stream_data == qpdf_sj_file) && + this->m->json_stream_prefix.empty()) { QTC::TC("qpdf", "QPDFJob need json-stream-prefix for stdout"); usage("please specify --json-stream-prefix since the input file " "name is unknown"); } else { QTC::TC("qpdf", "QPDFJob write json to stdout"); - if (p == nullptr) { - fp = this->m->log->getInfo(); - } + this->m->log->saveToStandardOutput(true); + fp = this->m->log->getSave(); } - if (p == nullptr) { - p = fp.get(); - } - std::set json_objects; - if (this->m->json_objects.count("trailer")) { - json_objects.insert("trailer"); - } - auto wanted = getWantedJSONObjects(); - for (auto const& og: wanted) { - std::ostringstream s; - s << "obj:" << og.unparse(' ') << " R"; - json_objects.insert(s.str()); - } - pdf.writeJSON( - this->m->json_version, - p, - complete, - first_key, - this->m->decode_level, - this->m->json_stream_data, - file_prefix, - json_objects); + doJSON(pdf, fp.get()); } diff --git a/libqpdf/QPDFJob_config.cc b/libqpdf/QPDFJob_config.cc index 9a2b3a84..8a9c1470 100644 --- a/libqpdf/QPDFJob_config.cc +++ b/libqpdf/QPDFJob_config.cc @@ -244,7 +244,6 @@ QPDFJob::Config::json(std::string const& parameter) if ((o.m->json_version < 1) || (o.m->json_version > JSON::LATEST)) { usage(std::string("unsupported json version ") + parameter); } - o.m->require_outfile = false; return this; } @@ -297,14 +296,7 @@ QPDFJob::Config* QPDFJob::Config::jsonOutput(std::string 
const& parameter) { o.m->json_output = true; - if (parameter.empty() || (parameter == "latest")) { - o.m->json_version = JSON::LATEST; - } else { - o.m->json_version = QUtil::string_to_int(parameter.c_str()); - } - if ((o.m->json_version < 2) || (o.m->json_version > JSON::LATEST)) { - usage(std::string("unsupported json output version ") + parameter); - } + json(parameter); if (!o.m->json_stream_data_set) { // No need to set json_stream_data_set -- that indicates // explicit use of --json-stream-data. @@ -313,9 +305,7 @@ QPDFJob::Config::jsonOutput(std::string const& parameter) if (!o.m->decode_level_set) { o.m->decode_level = qpdf_dl_none; } - if (o.m->json_keys.empty()) { - o.m->json_keys.insert("qpdf"); - } + o.m->json_keys.insert("qpdf"); return this; } diff --git a/libqpdf/qpdf/auto_job_help.hh b/libqpdf/qpdf/auto_job_help.hh index 6ade99f5..3551cf3d 100644 --- a/libqpdf/qpdf/auto_job_help.hh +++ b/libqpdf/qpdf/auto_job_help.hh @@ -803,7 +803,9 @@ depth in the JSON section of the manual. "version" may be a specific version or "latest" (the default). Run qpdf --json-help for a description of the generated JSON object. )"); -ap.addOptionHelp("--json-help", "json", "show format of JSON output", R"(Describe the format of the JSON output by writing to standard +ap.addOptionHelp("--json-help", "json", "show format of JSON output", R"(--json-help[=version] + +Describe the format of the JSON output by writing to standard output a JSON object with the same keys and with values containing descriptive text. )"); @@ -838,17 +840,17 @@ which is to use the output file name. Whatever is given here will be appended with -nnn to create the name of the file that will contain the data for the stream stream in object nnn. 
)"); -ap.addOptionHelp("--json-output", "json", "serialize to JSON", R"(--json-output[=version] +ap.addOptionHelp("--json-output", "json", "apply defaults for JSON serialization", R"(--json-output[=version] -The output file will be qpdf JSON format at the given version. -"version" may be a specific version or "latest" (the default). -The only supported version is 2. See also --json-stream-data, ---json-stream-prefix, and --decode-level. +Implies --json=version. Changes default values for certain +options so that the JSON output written is the most faithful +representation of the original PDF and contains no additional +JSON keys. See also --json-stream-data, --json-stream-prefix, +and --decode-level. )"); -ap.addOptionHelp("--json-input", "json", "input file is qpdf JSON", R"(Treat the input file as a JSON file in qpdf JSON format as -written by qpdf --json-output. See the "qpdf JSON Format" -section of the manual for information about how to use this -option. +ap.addOptionHelp("--json-input", "json", "input file is qpdf JSON", R"(Treat the input file as a JSON file in qpdf JSON format. See the +"qpdf JSON Format" section of the manual for information about +how to use this option. 
)"); ap.addOptionHelp("--update-from-json", "json", "update a PDF from qpdf JSON", R"(--update-from-json=qpdf-json-file diff --git a/libqpdf/qpdf/auto_job_schema.hh b/libqpdf/qpdf/auto_job_schema.hh index 0fc187fe..aa69c192 100644 --- a/libqpdf/qpdf/auto_job_schema.hh +++ b/libqpdf/qpdf/auto_job_schema.hh @@ -28,7 +28,7 @@ static constexpr char const* JOB_SCHEMA_DATA = R"({ "forceVersion": "set output PDF version", "progress": "show progress when writing", "splitPages": "write pages to separate files", - "jsonOutput": "serialize to JSON", + "jsonOutput": "apply defaults for JSON serialization", "encrypt": { "userPassword": "user password", "ownerPassword": "owner password", diff --git a/manual/cli.rst b/manual/cli.rst index 809437b7..8383b87f 100644 --- a/manual/cli.rst +++ b/manual/cli.rst @@ -3194,7 +3194,16 @@ Related Options :qpdf:ref:`--json-help` option to get a description of the JSON object. -.. qpdf:option:: --json-help + Starting with qpdf 11, when this option is specified, an output + file is optional (for backward compatibility) and defaults to + standard output. You may specify an output file to write the JSON + to a file rather than standard output. + + Stream data is only included if :qpdf:ref:`--json-output` is + specified or if a value other than ``none`` is passed to + :qpdf:ref:`--json-stream-data`. + +.. qpdf:option:: --json-help[=version] .. help: show format of JSON output @@ -3202,12 +3211,13 @@ Related Options output a JSON object with the same keys and with values containing descriptive text. - Describe the format of the JSON output by writing to standard - output a JSON object with the same structure as the JSON generated - by qpdf. In the output written by ``--json-help``, each key's value - is a description of the key. The specific contract guaranteed by - qpdf in its JSON representation is explained in more detail in the - :ref:`json`. 
+ Describe the format of the corresponding version of JSON output by + writing to standard output a JSON object with the same structure as + the JSON generated by qpdf. In the output written by + ``--json-help``, each key's value is a description of the key. The + specific contract guaranteed by qpdf in its JSON representation is + explained in more detail in the :ref:`json`. The default version of + help is version ``2``, as with the :qpdf:ref:`--json` flag. .. qpdf:option:: --json-key=key @@ -3233,11 +3243,9 @@ Related Options objects will be shown. This option is repeatable. If given, only specified objects will be - shown in the ``"objects"`` key of the JSON output. Otherwise, all - objects will be shown. For qpdf JSON version 1, this also affects - the ``"objectinfo"`` key, which is not present in version 2. This - option may be used with :qpdf:ref:`--json` and also with - :qpdf:ref:`--json-output`. + shown in the objects dictionary in the JSON output. Otherwise, all + objects will be shown. See :ref:`json` for details about the qpdf + JSON format. .. qpdf:option:: --json-stream-data={none|inline|file} @@ -3281,28 +3289,30 @@ Related Options .. qpdf:option:: --json-output[=version] - .. help: serialize to JSON + .. help: apply defaults for JSON serialization - The output file will be qpdf JSON format at the given version. - "version" may be a specific version or "latest" (the default). - The only supported version is 2. See also --json-stream-data, - --json-stream-prefix, and --decode-level. + Implies --json=version. Changes default values for certain + options so that the JSON output written is the most faithful + representation of the original PDF and contains no additional + JSON keys. See also --json-stream-data, --json-stream-prefix, + and --decode-level. - The output file, instead of being a PDF file, will be a JSON file - in qpdf JSON format at the given version. ``version`` may be a - specific version or ``latest`` (the default). 
The only supported - version is 2. See also :qpdf:ref:`--json-stream-data` and - :qpdf:ref:`--json-stream-prefix`. This option also changes the - following defaults: + Implies :qpdf:ref:`--json` at the specified version. This option + changes several default values, all of which can be overridden by + specifying the stated option: - The default value for :qpdf:ref:`--json-stream-data` changes from ``none`` to ``inline``. - - The default decode level for stream data becomes ``none``, but you can - override it with :qpdf:ref:`--decode-level`. + - The default value for :qpdf:ref:`--decode-level` changes from + ``generalized`` to ``none``. - - Only the ``"qpdf"`` key is included in the JSON output, but you - can add additional keys with :qpdf:ref:`--json-key`. + - By default, only the ``"qpdf"`` key is included in the JSON + output, but you can add additional keys with + :qpdf:ref:`--json-key`. + + - Excludes the ``"version"`` and ``"parameters"`` keys from the + JSON output. If you want to look at the contents of streams easily as you would in QDF mode (see :ref:`qdf`), you can use @@ -3313,15 +3323,15 @@ Related Options .. help: input file is qpdf JSON - Treat the input file as a JSON file in qpdf JSON format as - written by qpdf --json-output. See the "qpdf JSON Format" - section of the manual for information about how to use this - option. + Treat the input file as a JSON file in qpdf JSON format. See the + "qpdf JSON Format" section of the manual for information about + how to use this option. - Treat the input file as a JSON file in qpdf JSON format as written - by ``qpdf --json-output``. The input file must be complete and - include all stream data. For information about converting between - PDF and JSON, please see :ref:`json`. + Treat the input file as a JSON file in qpdf JSON format. The input + file must be complete and include all stream data. The JSON version + must be at least 2. All top-level keys are ignored except for + ``"qpdf"``. 
For information about converting between PDF and JSON, + please see :ref:`json`. .. qpdf:option:: --update-from-json=qpdf-json-file diff --git a/manual/json.rst b/manual/json.rst index 0becd405..9f1dc489 100644 --- a/manual/json.rst +++ b/manual/json.rst @@ -24,27 +24,28 @@ represents the contents of a PDF file. This is distinct from the interacting with qpdf the way the command-line tool does. For information about that, see :ref:`qpdf-job`. -The qpdf JSON format is specific to qpdf. There are two ways to use -qpdf JSON: +The qpdf JSON format is specific to qpdf. With JSON version 2, the +:qpdf:ref:`--json` command-line flag causes creation of a JSON +representation of all the objects in a PDF file. This includes an +unambiguous representation of the PDF object structure and also +provides JSON-formatted summaries of other information about the file. +This functionality is built into ``QPDFJob`` and can be accessed from +the ``qpdf`` command-line tool or from the ``QPDFJob`` C or C++ API. -- The :qpdf:ref:`--json` command-line flag causes creation of a JSON - representation of all the objects in a PDF file, excluding stream - data. This includes an unambiguous representation of the PDF object - structure and also provides JSON-formatted summaries of other - information about the file. This functionality is built into - ``QPDFJob`` and can be accessed from the ``qpdf`` command-line tool - or from the ``QPDFJob`` C or C++ API. - -- qpdf can create a JSON file that completely represents a PDF file. - You can think of this as using JSON as an *alternative syntax* for - representing a PDF file. Using qpdf JSON, it is possible to - convert a PDF file to JSON, manipulate the structure or contents of - the objects at a low level, and convert the results back to a PDF - file. 
This functionality can be accessed from the command-line with - the :qpdf:ref:`--json-output`, :qpdf:ref:`--json-input`, and - :qpdf:ref:`--update-from-json` flags, or from the API using the - ``QPDF::writeJSON``, ``QPDF::createFromJSON``, and - ``QPDF::updateFromJSON`` methods. +By default, stream data is omitted, but it can be included by +specifying the :qpdf:ref:`--json-stream-data` option. With stream data +included, the generated JSON file completely represents a PDF file. +You can think of this as using JSON as an *alternative syntax* for +representing a PDF file. Using qpdf JSON, it is possible to convert a +PDF file to JSON, manipulate the structure or contents of the objects +at a low level, and convert the results back to a PDF file. This +functionality can be accessed from the command-line with the +:qpdf:ref:`--json-input` and :qpdf:ref:`--update-from-json` flags, or +from the API using the ``QPDF::writeJSON``, ``QPDF::createFromJSON``, +and ``QPDF::updateFromJSON`` methods. The :qpdf:ref:`--json-output` +flag changes a handful of defaults so that the resulting JSON is as +close as possible to the original input and is ready to be +converted back to PDF. .. _json-terminology: @@ -120,18 +121,53 @@ qpdf JSON Object Representation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This section describes the representation of PDF objects in qpdf JSON -version 2. PDF objects are represented within the ``"objects"`` -dictionary of a qpdf JSON file. This is true both for PDF serialized -to JSON (:qpdf:ref:`--json-output`, ``QPDF::writeJSON``) or objects as -they appear in the output of ``qpdf`` with the :qpdf:ref:`--json` -option. +version 2. PDF objects are represented within the ``"qpdf"`` entry of +a qpdf JSON file. The ``"qpdf"`` entry is a two-element array. The +first element is a dictionary containing header-like information about +the file such as the PDF version. The second element is a dictionary +containing all the objects in the PDF file.
We refer to this as the +*objects dictionary*. -Each key in the ``"objects"`` dictionary is either ``"trailer"`` or a -string of the form ``"obj:O G R"`` where ``O`` and ``G`` are the -object and generation numbers and ``R`` is the literal string ``R``. -This is the PDF syntax for the indirect object reference prepended by -``obj:``. The value, representing the object itself, is a JSON object -whose structure is described below. +The first element contains the following keys: + +- ``"jsonversion"`` -- a number indicating the JSON version used for + writing. This will always be ``2``. + +- ``"pdfversion"`` -- a string containing the PDF version as indicated in + the PDF header (e.g. ``"1.7"``, ``"2.0"``) + +- ``"pushedinheritedpageresources"`` -- a boolean indicating whether + the library pushed inherited resources down to the page level. + Certain library calls cause this to happen, and qpdf needs to know + when reading a JSON file back in whether it should do this as it may + cause certain objects to be renumbered. + +- ``"calledgetallpages"`` -- a boolean indicating whether + ``getAllPages`` was called prior to writing the JSON output. This + method causes page tree repair to occur, which may renumber some + objects (in very rare cases of corrupted page trees), so qpdf needs + to know this information when reading a JSON file back in. + +- ``"maxobjectid"`` -- a number indicating the object ID of the + highest numbered object in the file. This is provided to make it + easier for software that wants to add new objects to the file as you + can safely start with one above that number when creating new + objects. Note that the value of ``"maxobjectid"`` may be higher than + the actual maximum object that appears in the input PDF since it + takes into consideration any dangling indirect object references + from the original file. This prevents you from unwittingly creating + an object that doesn't exist but that is referenced, which may have + unintended side effects.
(The PDF specification explicitly allows + dangling references and says to treat them as nulls. This can happen + if objects are removed from a PDF file.) + +The second element is the objects dictionary. Each key in the objects +dictionary is either ``"trailer"`` or a string of the form ``"obj:O G +R"`` where ``O`` and ``G`` are the object and generation numbers and +``R`` is the literal string ``R``. This is the PDF syntax for the +indirect object reference prepended by ``obj:``. The value, +representing the object itself, is a JSON object whose structure is +described below. Top-level Stream Objects Stream objects are represented as a JSON object with the single key @@ -143,6 +179,7 @@ Top-level Stream Objects - ``none``: stream data is not represented; no other keys are present + specified. - ``inline``: the stream data appears as a base64-encoded string as the value of the ``"data"`` key @@ -249,57 +286,6 @@ Object Values the string representations of names and whose values are representations of PDF objects. -.. _json.output: - -qpdf JSON Output -~~~~~~~~~~~~~~~~ - -The format of the JSON written by qpdf's :qpdf:ref:`--json-output` -flag or the ``QPDF::writeJSON`` API call is a JSON object consisting -of a single key: ``"qpdf"``. This may be the only key, or it may be -embedded in the output of ``qpdf --json``. Unknown keys are ignored -for future compatibility. It is guaranteed that qpdf will never add -any keys whose names start with ``xdata``, so users are free to add -their own metadata using keys whose names start with ``xdata`` without -fear of clashing with a future version of qpdf. - -The ``"qpdf"`` key points to a two-element JSON array. The first element is -a JSON object with the following keys: - -- ``"jsonversion"`` -- a number indicating the JSON version used for - writing. This will always be ``2``. - -- ``"pdfversion"`` -- a string containing PDF version as indicated in - the PDF header (e.g. 
``"1.7"``, ``"2.0"``) - -- ``pushedinheritedpageresources`` -- a boolean indicating whether - the library pushed inherited resources down to the page level. - Certain library calls cause this to happen, and qpdf needs to know - when reading a JSON file back in whether it should do this as it may - cause certain objects to be renumbered. - -- ``calledgetallpages`` -- a boolean indicating whether - ``getAllPages`` was called prior to writing the JSON output. This - method causes page tree repair to occur, which may renumber some - objects (in very rare cases of corrupted page trees), so qpdf needs - to know this information when reading a JSON file back in. - -- ``"maxobjectid"`` -- a number indicating the object ID of the - highest numbered object in the file. This is provided to make it - easier for software that wants to add new objects to the file as you - can safely start with one above that number when creating new - objects. Note that the value of ``"maxobjectid"`` may be higher than - the actual maximum object that appears in the input PDF since it - takes into consideration any dangling indirect object references - from the original file. This prevents you from unwittingly creating - an object that doesn't exist but that is referenced, which may have - unintended side effects. (The PDF specification explicitly allows - dangling references and says to treat them as nulls. This can happen - if objects are removed from a PDF file.) - -The second element is a JSON object containing the actual PDF objects -as described in :ref:`json.objects`. - Note that writing JSON output is done by ``QPDF``, not ``QPDFWriter``. As such, none of the things ``QPDFWriter`` does apply. This includes recompression of streams, renumbering of objects, anything to do with @@ -325,7 +311,7 @@ qpdf JSON format. 
"pdfversion": "1.3", "pushedinheritedpageresources": false, "calledgetallpages": false, - "maxobjectid": 5, + "maxobjectid": 5 }, { "obj:1 0 R": { @@ -389,8 +375,7 @@ qpdf JSON format. qpdf JSON Input ~~~~~~~~~~~~~~~ -Output in the JSON output format described in :ref:`json.output` can -be used in two different ways: +The qpdf JSON output can be used in two different ways: - By using the :qpdf:ref:`--json-input` flag or calling ``QPDF::createFromJSON`` in place of ``QPDF::processFile``, a qpdf @@ -408,8 +393,11 @@ Here are some important things to know about qpdf JSON input. - When a qpdf JSON file is used as the primary input file, it must be complete. This means + - A JSON version number must be specified with the ``"jsonversion"`` + key in the first array element + - A PDF version number must be specified with the ``"pdfversion"`` - key + key in the first array element - Stream data must be present for all streams @@ -422,6 +410,9 @@ Here are some important things to know about qpdf JSON input. - ``"maxobjectid"`` is ignored, so it is not necessary to update it when adding new objects. + - ``"calledgetallpages"`` and ``"pushedinheritedpageresources"`` are + treated as false if omitted. + - ``"/Length"`` is ignored in all stream dictionaries. qpdf doesn't put it there when it creates JSON output, and it is not necessary to add it. @@ -432,14 +423,13 @@ Here are some important things to know about qpdf JSON input. - Unknown keys at the to top level of the file, within ``objects``, at the top level of each individual object (inside the object that has the ``"value"`` or ``"stream"`` key) and directly within - ``"stream"`` are ignored for future compatibility. You should - avoid putting your own values in those places if you wish to avoid - risking that your JSON files will not work in future versions of - qpdf. The exception to this advice is at the top level of the - overall file where it is explicitly supported for you to add your - own keys. 
For example, you could add your own metadata at the top - level, and qpdf will ignore it. Note that extra top-level keys are - not preserved when qpdf reads your JSON file. + ``"stream"`` are ignored for future compatibility. This includes + other top-level keys generated by ``qpdf`` itself (such as + ``"pages"``). As such, those keys don't have to be consistent with + the ``"qpdf"`` key if modifying a JSON file for conversion back to + PDF. If you wish to store application-specific metadata, you can + do so by adding a key whose name starts with ``x-``. qpdf is + guaranteed not to add any of its own keys that start with ``x-``. - When qpdf reads a PDF file, the internal object numbers are always preserved. However, when qpdf writes a file using ``QPDFWriter``, @@ -458,9 +448,9 @@ Here are some important things to know about qpdf JSON input. # edit pdf.json qpdf in.pdf out.pdf --update-from-json=pdf.json - The following will not produce predictable results because - ``out.pdf`` won't have the same object numbers as ``pdf.json`` and - ``in.pdf``. + The following will produce unpredictable and probably incorrect + results because ``out.pdf`` won't have the same object numbers as + ``pdf.json`` and ``in.pdf``. :: @@ -658,15 +648,16 @@ be aware of: - If a PDF file has certain types of errors in its pages tree (such as page objects that are direct or multiple pages sharing the same object ID), qpdf will automatically repair the pages tree. If you - specify ``"objects"`` (and, with qpdf JSON version 1, also + specify ``"qpdf"`` (or, with qpdf JSON version 1, ``"objects"`` or ``"objectinfo"``) without any other keys, you will see the original pages tree without any corrections. If you specify any of the keys that require page tree traversal (for example, ``"pages"``, - ``"outlines"``, or ``"pagelabel"``), then ``"objects"`` (and - ``"objectinfo"``) will show the repaired page tree so that object - references will be consistent throughout the file.
This is not an - issue with :qpdf:ref:`--json-output`, which doesn't repair the pages - tree. + ``"outlines"``, or ``"pagelabel"``), then ``"qpdf"`` (and + ``"objects"`` and ``"objectinfo"``) will show the repaired page + tree so that object references will be consistent throughout the + file. You can tell if this has happened by looking at the + ``"calledgetallpages"`` and ``"pushedinheritedpageresources"`` + fields in the first element of the ``"qpdf"`` array. - While qpdf guarantees that keys present in the help will be present in the output, those fields may be null or empty if the information @@ -743,16 +734,17 @@ version 2. dictionary containing either a ``"value"`` key or a ``"stream"`` key, making it possible to distinguish streams from other objects. -- The ``"objectinfo"`` key has been removed in favor of a - representation in ``"objects"`` that differentiates between a stream - and other kinds of objects. In v1, it was not possible to tell a - stream from a dictionary within ``"objects"``. +- The ``"objectinfo"`` and ``"objects"`` keys have been removed in + favor of a representation in ``"qpdf"`` that includes header + information and differentiates between a stream and other kinds of + objects. In v1, it was not possible to tell a stream from a + dictionary within ``"objects"``, and the PDF version was not + captured at all. -- Within the ``"objects"`` dictionary, keys are now ``"obj:O G R"`` - where ``O`` and ``G`` are the object and generation number. - ``"trailer"`` remains the key for the trailer dictionary. In v1, the - ``obj:`` prefix was not present. The rationale for this change is as - follows: +- Within the objects dictionary, keys are now ``"obj:O G R"`` where + ``O`` and ``G`` are the object and generation number. ``"trailer"`` + remains the key for the trailer dictionary. In v1, the ``obj:`` + prefix was not present. 
The rationale for this change is as follows: - Having a unique prefix (``obj:``) makes it much easier to search in the JSON file for the definition of an object