Add reactors to the JSON parser

This commit is contained in:
Jay Berkenbilt 2022-05-01 14:06:31 -04:00
parent f5dd63819d
commit 8d2a0eda5a
17 changed files with 401 additions and 14 deletions

View File

@ -1,3 +1,10 @@
2022-05-01 Jay Berkenbilt <ejb@ql.org>
* JSON: add reactors to the JSON parser, making it possible to
react to JSON parsing events as they occur and to block the
results from being stored. This makes it possible to incrementally
parse arbitrarily large JSON inputs.
2022-04-30 Jay Berkenbilt <ejb@ql.org>
* QPDFWriter: change encryption API calls

View File

@ -141,9 +141,86 @@ class JSON
QPDF_DLL
bool checkSchema(JSON schema, std::list<std::string>& errors);
// Create a JSON object from a string.
// A pointer to a Reactor class can be passed to parse, which
// will enable the caller to react to incremental events in the
// construction of the JSON object. This makes it possible to
// implement SAX-like handling of very large JSON objects.
class QPDF_DLL_CLASS Reactor
{
public:
QPDF_DLL
virtual ~Reactor() = default;
// The start/end methods are called when parsing of a
// dictionary or array is started or ended. The item methods
// are called when an item is added to a dictionary or array.
// When a container is added to another container, the inner
// container's start method is called first, and the outer
// container's item method is called immediately afterwards,
// before any of the inner container's own items are reported.
// See important notes in "Item methods" below.
// During parsing of a JSON string, the parser is operating on
// a single object at a time. When a dictionary or array is
// started, a new context begins, and when that dictionary or
// array is ended, the previous context is resumed. So, for
// example, if you have `{"a": [1]}`, you will receive the
// following method calls
//
// dictionaryStart -- current object is the top-level dictionary
// arrayStart -- current object is the array
// dictionaryItem -- called with "a" and the still-empty array
// arrayItem -- called with the "1" object
// containerEnd -- now current object is the dictionary again
// containerEnd -- current object is undefined
//
// containerEnd is passed the dictionary or array that has just
// been completed, after its end offset has been set. Items
// that were consumed by an item method are not present in it.
//
// If the top-level item in a JSON string is a scalar, the
// topLevelScalar() method will be called. No argument is
// passed since the object is the same as what is returned by
// parse().
QPDF_DLL
virtual void dictionaryStart() = 0;
QPDF_DLL
virtual void arrayStart() = 0;
QPDF_DLL
virtual void containerEnd(JSON const& value) = 0;
QPDF_DLL
virtual void topLevelScalar() = 0;
// Item methods:
//
// The return value of the item methods indicates whether the
// item has been "consumed". If the item method returns true,
// then the item will not be added to the containing JSON
// object. This is what allows arbitrarily large JSON objects
// to be parsed and not have to be kept in memory.
//
// NOTE: When a dictionary or an array is added to a
// container, the dictionaryItem or arrayItem method is called
// when the child item's start delimiter is encountered, so
// the JSON object passed in at that time will always be
// in its initial, empty state. Its start offset has been set
// at that point, but its end offset is still 0.
QPDF_DLL
virtual bool
dictionaryItem(std::string const& key, JSON const& value) = 0;
QPDF_DLL
virtual bool arrayItem(JSON const& value) = 0;
};
// Create a JSON object from a string. See above for information
// about how to use the Reactor.
QPDF_DLL
static JSON parse(std::string const&);
static JSON parse(std::string const&, Reactor* reactor = nullptr);
// parse calls setStart and setEnd to set the inclusive start and
// non-inclusive end offsets of an object relative to its input
// string. Otherwise, both values are 0.
QPDF_DLL
void setStart(size_t);
QPDF_DLL
void setEnd(size_t);
QPDF_DLL
size_t getStart() const;
QPDF_DLL
size_t getEnd() const;
private:
static std::string encode_string(std::string const& utf8);
@ -217,6 +294,9 @@ class JSON
Members(Members const&) = delete;
std::shared_ptr<JSON_value> value;
// start and end are only populated for objects created by parse
size_t start;
size_t end;
};
std::shared_ptr<Members> m;

View File

@ -1,12 +1,15 @@
#include <qpdf/JSON.hh>
#include <qpdf/QIntC.hh>
#include <qpdf/QTC.hh>
#include <qpdf/QUtil.hh>
#include <cstring>
#include <stdexcept>
JSON::Members::Members(std::shared_ptr<JSON_value> value) :
value(value)
value(value),
start(0),
end(0)
{
}
@ -455,7 +458,8 @@ namespace
class JSONParser
{
public:
JSONParser() :
JSONParser(JSON::Reactor* reactor) :
reactor(reactor),
lex_state(ls_top),
number_before_point(0),
number_after_point(0),
@ -499,6 +503,7 @@ namespace
ls_backslash,
};
JSON::Reactor* reactor;
lex_state_e lex_state;
size_t number_before_point;
size_t number_after_point;
@ -828,10 +833,18 @@ JSONParser::handleToken()
switch (*tok_start) {
case '{':
item = std::make_shared<JSON>(JSON::makeDictionary());
item->setStart(QIntC::to_size(tok_start - cstr));
if (reactor) {
reactor->dictionaryStart();
}
break;
case '[':
item = std::make_shared<JSON>(JSON::makeArray());
item->setStart(QIntC::to_size(tok_start - cstr));
if (reactor) {
reactor->arrayStart();
}
break;
default:
@ -997,6 +1010,11 @@ JSONParser::handleToken()
} else if ((delimiter == '}') || (delimiter == ']')) {
next_state = ps_stack.back();
ps_stack.pop_back();
auto tos = stack.back();
tos->setEnd(QIntC::to_size(tok_end - cstr));
if (reactor) {
reactor->containerEnd(*tos);
}
if (next_state != ps_done) {
stack.pop_back();
}
@ -1004,6 +1022,11 @@ JSONParser::handleToken()
throw std::logic_error(
"JSONParser::handleToken: unexpected delimiter in transition");
} else if (item.get()) {
if (!(item->isArray() || item->isDictionary())) {
item->setStart(QIntC::to_size(tok_start - cstr));
item->setEnd(QIntC::to_size(tok_end - cstr));
}
std::shared_ptr<JSON> tos;
if (!stack.empty()) {
tos = stack.back();
@ -1017,14 +1040,18 @@ JSONParser::handleToken()
break;
case ps_dict_after_colon:
tos->addDictionaryMember(dict_key, *item);
if (!reactor || !reactor->dictionaryItem(dict_key, *item)) {
tos->addDictionaryMember(dict_key, *item);
}
next_state = ps_dict_after_item;
break;
case ps_array_begin:
case ps_array_after_comma:
if (!reactor || !reactor->arrayItem(*item)) {
tos->addArrayElement(*item);
}
next_state = ps_array_after_item;
tos->addArrayElement(*item);
break;
case ps_top:
@ -1083,12 +1110,40 @@ JSONParser::parse(std::string const& s)
QTC::TC("libtests", "JSON parse premature EOF");
throw std::runtime_error("JSON: premature end of input");
}
return stack.back();
auto const& tos = stack.back();
if (reactor && tos.get() && !(tos->isArray() || tos->isDictionary())) {
reactor->topLevelScalar();
}
return tos;
}
JSON
JSON::parse(std::string const& s)
JSON::parse(std::string const& s, Reactor* reactor)
{
JSONParser jp;
JSONParser jp(reactor);
return *jp.parse(s);
}
void
JSON::setStart(size_t start)
{
this->m->start = start;
}
void
JSON::setEnd(size_t end)
{
this->m->end = end;
}
size_t
JSON::getStart() const
{
return this->m->start;
}
size_t
JSON::getEnd() const
{
return this->m->end;
}

View File

@ -1,21 +1,113 @@
#include <qpdf/JSON.hh>
#include <qpdf/QUtil.hh>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <memory>
namespace
{
    // Reactor used by this test driver: each callback writes a
    // description of the parsing event it received to standard
    // output. The item callbacks additionally decide whether the
    // item is consumed or kept in the parsed result.
    class Reactor: public JSON::Reactor
    {
      public:
        ~Reactor() override = default;

        // Container and top-level events.
        void dictionaryStart() override;
        void arrayStart() override;
        void containerEnd(JSON const& value) override;
        void topLevelScalar() override;

        // Item events; a true return value consumes the item.
        bool
        dictionaryItem(std::string const& key, JSON const& value) override;
        bool arrayItem(JSON const& value) override;

      private:
        // Print the offsets and the unparsed form of a JSON value.
        void printItem(JSON const&);
    };
} // namespace
// Report that the parser has started a dictionary.
void
Reactor::dictionaryStart()
{
    std::cout << "dictionary start";
    std::cout << std::endl;
}
// Report that the parser has started an array.
void
Reactor::arrayStart()
{
    std::cout << "array start";
    std::cout << std::endl;
}
// Report that a dictionary or array has ended, followed by the
// offsets and contents of the container that was passed in.
void
Reactor::containerEnd(JSON const& value)
{
    std::cout << "container end: ";
    this->printItem(value);
}
// Report that the top-level item of the input is a scalar.
void
Reactor::topLevelScalar()
{
    std::cout << "top-level scalar";
    std::cout << std::endl;
}
// Report a dictionary item (key plus value details) and tell the
// parser whether it has been consumed. Only the key "keep" is left
// for the parser to store; every other key is consumed.
bool
Reactor::dictionaryItem(std::string const& key, JSON const& value)
{
    std::cout << "dictionary item: " << key << " -> ";
    printItem(value);
    // true means "consumed", so anything other than "keep" is dropped.
    return key != "keep";
}
// Report an array item and tell the parser whether it has been
// consumed. Only an item for which getString succeeds with the
// value "keep" is left for the parser to store; everything else is
// consumed.
bool
Reactor::arrayItem(JSON const& value)
{
    std::cout << "array item: ";
    printItem(value);
    std::string text;
    bool const retained = value.getString(text) && (text == "keep");
    // true means "consumed", i.e. the item is not retained.
    return !retained;
}
// Print a value as "[start, end): <unparsed JSON>" followed by a
// newline, using the offsets stored on the value.
void
Reactor::printItem(JSON const& j)
{
    size_t const first = j.getStart();
    size_t const last = j.getEnd();
    std::cout << "[" << first << ", " << last << "): ";
    std::cout << j.unparse() << std::endl;
}
// Print the command-line synopsis to standard error and terminate
// the process with exit status 2. This function never returns;
// main() relies on that when it calls usage() for an unrecognized
// second argument, so the contract is stated with [[noreturn]].
[[noreturn]] static void
usage()
{
    std::cerr << "Usage: json_parse file [--react]" << std::endl;
    std::exit(2);
}
int
main(int argc, char* argv[])
{
if (argc != 2) {
std::cerr << "Usage: json_parse file" << std::endl;
if ((argc < 2) || (argc > 3)) {
usage();
return 2;
}
char const* filename = argv[1];
std::shared_ptr<Reactor> reactor;
if (argc == 3) {
if (strcmp(argv[2], "--react") == 0) {
reactor = std::make_shared<Reactor>();
} else {
usage();
}
}
try {
std::shared_ptr<char> buf;
size_t size;
QUtil::read_file_into_memory(filename, buf, size);
std::string s(buf.get(), size);
std::cout << JSON::parse(s).unparse() << std::endl;
std::cout << JSON::parse(s, reactor.get()).unparse() << std::endl;
} catch (std::exception& e) {
std::cerr << "exception: " << filename << ": " << e.what() << std::endl;
return 2;

View File

@ -32,7 +32,7 @@ if ($^O ne 'msys')
cleanup();
my $good = 9;
my $good = 10;
for (my $i = 1; $i <= $good; ++$i)
{
@ -73,6 +73,11 @@ for (my $i = 1; $i <= $good; ++$i)
{$td->FILE => "out.json"},
{$td->STRING => ""});
}
$td->runtest("good $n reactor",
{$td->COMMAND => "json_parse good-$n.json --react"},
{$td->FILE => "good-$n-react.out", $td->EXIT_STATUS => 0},
$td->NORMALIZE_NEWLINES);
}
my @bad = (
@ -127,7 +132,7 @@ foreach my $d (@bad)
cleanup();
$td->report((2 * $good) + scalar(@bad));
$td->report((3 * $good) + scalar(@bad));
sub cleanup
{

View File

@ -0,0 +1,21 @@
dictionary start
dictionary item: a -> [6, 11): "bcd"
array start
dictionary item: e -> [18, 0): []
array item: [19, 20): 1
array item: [41, 42): 2
array item: [44, 45): 3
array item: [46, 47): 4
array item: [48, 54): "five"
dictionary start
array item: [56, 0): {}
dictionary item: six -> [64, 65): 7
dictionary item: 8 -> [72, 73): 9
container end: [56, 74): {}
array item: [76, 80): null
array item: [82, 86): true
array item: [107, 112): false
array item: [114, 134): "a\b\f\n\r\t\\\"/z"
container end: [18, 135): []
container end: [0, 136): {}
{}

View File

@ -0,0 +1,3 @@
dictionary start
container end: [0, 2): {}
{}

View File

@ -0,0 +1,3 @@
array start
container end: [0, 2): []
[]

View File

@ -0,0 +1,18 @@
array start
array start
array item: [1, 0): []
array start
array item: [2, 0): []
dictionary start
array item: [3, 0): {}
container end: [3, 5): {}
container end: [2, 6): []
dictionary start
array item: [8, 0): {}
dictionary start
dictionary item: -> [13, 0): {}
container end: [13, 15): {}
container end: [8, 16): {}
container end: [1, 17): []
container end: [0, 18): []
[]

View File

@ -0,0 +1,2 @@
top-level scalar
"x"

View File

@ -0,0 +1,2 @@
top-level scalar
123

View File

@ -0,0 +1,2 @@
top-level scalar
-123

View File

@ -0,0 +1,11 @@
array start
array item: [1, 2): 1
array item: [4, 6): -2
array item: [8, 11): 3.4
array item: [13, 17): -5.6
array item: [19, 23): -9e1
array item: [25, 29): 10e2
array item: [31, 37): 12.3e5
array item: [39, 46): 12.6e-7
container end: [0, 47): []
[]

View File

@ -0,0 +1,8 @@
array start
array item: [1, 7): "aπb"
array item: [9, 23): "a\b\f\n\r\tc"
array item: [25, 42): "aπbπc"
array item: [44, 52): "π"
array item: [54, 71): "a\u0018bʬc"
container end: [0, 72): []
[]

View File

@ -0,0 +1,47 @@
dictionary start
array start
dictionary item: a -> [9, 0): []
array item: [10, 11): 1
array item: [13, 14): 2
dictionary start
array item: [16, 0): {}
dictionary item: x -> [22, 25): "y"
container end: [16, 26): {}
array item: [28, 29): 3
dictionary start
array item: [31, 0): {}
dictionary item: keep -> [40, 61): "not in final output"
container end: [31, 62): {
"keep": "not in final output"
}
container end: [9, 63): []
array start
dictionary item: keep -> [75, 0): []
array item: [76, 77): 1
array item: [79, 83): null
array item: [85, 86): 2
array item: [88, 93): false
array item: [95, 101): "keep"
array item: [103, 104): 3
array start
array item: [106, 0): []
array item: [107, 113): "this"
array item: [115, 121): "keep"
array item: [123, 128): "not"
array item: [130, 137): "final"
container end: [106, 138): [
"keep"
]
container end: [75, 139): [
"keep"
]
container end: [0, 141): {
"keep": [
"keep"
]
}
{
"keep": [
"keep"
]
}

View File

@ -0,0 +1,4 @@
{
"a": [1, 2, {"x": "y"}, 3, {"keep": "not in final output"}],
"keep": [1, null, 2, false, "keep", 3, ["this", "keep", "not", "final"]]
}

View File

@ -0,0 +1,27 @@
{
"a": [
1,
2,
{
"x": "y"
},
3,
{
"keep": "not in final output"
}
],
"keep": [
1,
null,
2,
false,
"keep",
3,
[
"this",
"keep",
"not",
"final"
]
]
}