From 4674c04cb80e0a50cde1b97464642e2778f9522f Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Sun, 24 Jul 2022 16:44:51 -0400 Subject: [PATCH] JSON schema: support multi-element array validation --- ChangeLog | 6 +++- TODO | 5 ---- include/qpdf/JSON.hh | 43 ++++++++++++++------------ libqpdf/JSON.cc | 58 ++++++++++++++++++++++++++---------- libtests/json.cc | 29 ++++++++++++++---- libtests/libtests.testcov | 2 +- libtests/qtest/json/json.out | 11 +++---- manual/json.rst | 16 ++++++---- 8 files changed, 112 insertions(+), 58 deletions(-) diff --git a/ChangeLog b/ChangeLog index e0c20f9d..1b0e3fc4 100644 --- a/ChangeLog +++ b/ChangeLog @@ -8,7 +8,11 @@ allow QPDFJob JSON to start accepting multiple items where a single item used to be expected without breaking backward compatibility. Without this change, the earlier fix to - removeAttachment would be a breaking change. + removeAttachment would be a breaking change. Also allow the schema + to contain a multi-element array, which means that the output has + to have an array of the same length in the corresponding location, + and each element is validated against the corresponding schema + element. * QPDFObjectHandle: for the methods insertItem, appendItem, eraseItem, replaceKey, and removeKey, add a corresponding diff --git a/TODO b/TODO index 80c9ef0d..d46a2821 100644 --- a/TODO +++ b/TODO @@ -124,11 +124,6 @@ JSON v2 fixes compatibility, but at least this gives people a namespace they can know will never conflict with future keys. - * Change schema validation so that if the schema contains an array - with more than one element, the output has to have an array with - the same number of elements whose individual elements are - validated according to the regular rules. - * When reading back in, we'll have to call pushInheritedAttributesToPage or getAllPages based on the values of the metadata. diff --git a/include/qpdf/JSON.hh b/include/qpdf/JSON.hh index ebeaaec8..63c858b1 100644 --- a/include/qpdf/JSON.hh +++ b/include/qpdf/JSON.hh @@ -183,25 +183,30 @@ class JSON // // * The schema is a nested structure containing dictionaries, // single-element arrays, and strings only. - // * Recursively walk the schema. - // * Whenever the schema has an array of length 1 and the object - // does not have an array in the corresponding location, - // validate the object against the array's single element. - // This effectively enables a single element to appear in - // place of an array and be treated as if it were an array of - // one element. This makes it possible to decide later that - // something that used to contain a single element now allows - // an array without invalidating any old data. - // * If the current value is a dictionary, this object must have - // a dictionary in the same place with the same keys. If flags - // contains f_optional, a key in the schema does not have to - // be present in the object. Otherwise, all keys have to be - // present. Any key in the object must be present in the - // schema. - // * If the current value is an array, this object must have an - // array in the same place. The schema's array must contain a - // single element, which is used as a schema to validate each - // element of this object's corresponding array. + // * Recursively walk the schema. In the items below, "schema + // object" refers to an object in the schema, and "checked + // object" refers to the correspondingi part of the object + // being checked. + // * If the schema object is a dictionary, the checked object + // must have a dictionary in the same place with the same + // keys. If flags contains f_optional, a key in the schema + // does not have to be present in the object. Otherwise, all + // keys have to be present. Any key in the object must be + // present in the schema. + // * If the schema object is an array of length 1, the checked + // object may either be a single item or an array of items. + // The single item or each element of the checked object's + // array is validated against the single element of the + // schema's array. The rationale behind this logic is that a + // single element may appear wherever the schema allows a + // variable-length array. This makes it possible to start + // allowing an array in the future where a single element was + // previously required without breaking backward + // compatibility. + // * If the schema object is an array of length > 1, the checked + // object must be an array of the same length. In this case, + // each element of the checked object array is validated + // against the corresponding element of the schema array. // * Otherwise, the value must be a string whose value is a // description of the object's corresponding value, which may // have any type. diff --git a/libqpdf/JSON.cc b/libqpdf/JSON.cc index 7bd2337f..9b9da656 100644 --- a/libqpdf/JSON.cc +++ b/libqpdf/JSON.cc @@ -538,27 +538,55 @@ JSON::checkSchemaInternal( } } } else if (sch_arr) { - if (sch_arr->elements.size() != 1) { - QTC::TC("libtests", "JSON schema array error"); - errors.push_back( - err_prefix + " schema array contains other than one item"); - return false; - } - if (this_arr) { - int i = 0; - for (auto const& element: this_arr->elements) { + auto n_elements = sch_arr->elements.size(); + if (n_elements == 1) { + // A single-element array in the schema allows a single + // element in the object or a variable-length array, each + // of whose items must conform to the single element of + // the schema array. This doesn't apply to arrays of + // arrays -- we fall back to the behavior of allowing a + // single item only when the object is not an array. + if (this_arr) { + int i = 0; + for (auto const& element: this_arr->elements) { + checkSchemaInternal( + element.get(), + sch_arr->elements.at(0).get(), + flags, + errors, + prefix + "." + QUtil::int_to_string(i)); + ++i; + } + } else { + QTC::TC("libtests", "JSON schema array for single item"); checkSchemaInternal( - element.get(), + this_v, sch_arr->elements.at(0).get(), flags, errors, - prefix + "." + QUtil::int_to_string(i)); + prefix); + } + } else if (!this_arr || (this_arr->elements.size() != n_elements)) { + QTC::TC("libtests", "JSON schema array length mismatch"); + errors.push_back( + err_prefix + " is supposed to be an array of length " + + QUtil::uint_to_string(n_elements)); + return false; + } else { + // A multi-element array in the schema must correspond to + // an element of the same length in the object. Each + // element in the object is validated against the + // corresponding element in the schema. + size_t i = 0; + for (auto const& element: this_arr->elements) { + checkSchemaInternal( + element.get(), + sch_arr->elements.at(i).get(), + flags, + errors, + prefix + "." + QUtil::uint_to_string(i)); ++i; } - } else { - QTC::TC("libtests", "JSON schema array for single item"); - checkSchemaInternal( - this_v, sch_arr->elements.at(0).get(), flags, errors, prefix); } } else if (!sch_str) { QTC::TC("libtests", "JSON schema other type"); diff --git a/libtests/json.cc b/libtests/json.cc index a509c2be..ef9188c5 100644 --- a/libtests/json.cc +++ b/libtests/json.cc @@ -179,7 +179,11 @@ test_schema() "z": "ebra", "o": "ptional" } - } + }, + "four": [ + { "first": "first element" }, + { "second": "second element" } + ] } )"); @@ -227,17 +231,28 @@ test_schema() "else": { "z": "okay" } - } + }, + "four": [ + {"first": "missing second"} + ] } )"); check_schema(b, schema, 0, false, "missing items"); - check_schema(a, a, 0, false, "top-level schema array error"); - check_schema(b, b, 0, false, "lower-level schema array error"); JSON bad_schema = JSON::parse(R"({"a": true, "b": "potato?"})"); check_schema(bad_schema, bad_schema, 0, false, "bad schema field type"); + JSON c = JSON::parse(R"( +{ + "four": [ + { "first": 1 }, + { "oops": [2] } + ] +} +)"); + check_schema(c, schema, JSON::f_optional, false, "array element mismatch"); + // "two" exercises the case of the JSON containing a single // element where the schema has an array. JSON good = JSON::parse(R"( @@ -262,7 +277,11 @@ test_schema() "": { "z": "ebra" } - } + }, + "four": [ + { "first": 1 }, + { "second": [2] } + ] } )"); check_schema(good, schema, 0, false, "not optional"); diff --git a/libtests/libtests.testcov b/libtests/libtests.testcov index 7f71d8e0..2ceef541 100644 --- a/libtests/libtests.testcov +++ b/libtests/libtests.testcov @@ -36,7 +36,6 @@ Pl_PNGFilter decodePaeth 0 Pl_TIFFPredictor processRow 1 JSON wanted dictionary 0 JSON key missing in object 0 -JSON schema array error 0 JSON key extra in object 0 QPDFArgParser read args from stdin 0 QPDFArgParser read args from file 0 @@ -93,3 +92,4 @@ JSON 16 low not after high 0 JSON 16 dangling high 0 JSON parse duplicate key 0 JSON schema array for single item 0 +JSON schema array length mismatch 0 diff --git a/libtests/qtest/json/json.out b/libtests/qtest/json/json.out index 811f8aed..f346daa6 100644 --- a/libtests/qtest/json/json.out +++ b/libtests/qtest/json/json.out @@ -2,6 +2,7 @@ top-level object is supposed to be a dictionary --- --- missing items +json key ".four" is supposed to be an array of length 2 json key ".one.a": key "q" is present in schema but missing in object json key ".one.a.r" is supposed to be a dictionary json key ".one.a.s": key "ss" is present in schema but missing in object @@ -15,16 +16,12 @@ json key ".two.1": key "flarp" is not present in schema but appears in object json key ".two.2" is supposed to be a dictionary json key ".two.3" is supposed to be a dictionary --- ---- top-level schema array error -top-level object schema array contains other than one item ---- ---- lower-level schema array error -json key ".one.a.r" schema array contains other than one item -json key ".two" schema array contains other than one item ---- --- bad schema field type json key ".a" schema value is not dictionary, array, or string --- +--- array element mismatch +json key ".four.1": key "oops" is not present in schema but appears in object +--- --- not optional json key ".three.": key "o" is present in schema but missing in object --- diff --git a/manual/json.rst b/manual/json.rst index 2e71bdab..92a89b6b 100644 --- a/manual/json.rst +++ b/manual/json.rst @@ -595,11 +595,17 @@ Documentation appears in the corresponding location of the actual output. The corresponding output can have any value including ``null``. - - An array in the help output always contains a single element. It - indicates that the corresponding location in the actual output is - an array of any length, and that each element of the array has - whatever format is implied by the single element of the help - output's array. + - A single-element array in the help output indicates that the + corresponding location in the actual output is either a single + item or is an array of any length. The single item or each + element of the array has whatever format is implied by the single + element of the help output's array. + + - A multi-element array in the help output indicates that the + corresponding location in the actual output is an array of the + same length. Each element of the output array has whatever format + is implied by the corresponding element of the help output's + array. For example, the help output indicates includes a ``"pagelabels"`` key whose value is an array of one element. That element is a