From 05fda4afa289ef248804865d7648c9ac3ae75fbd Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Wed, 4 May 2022 09:12:57 -0400 Subject: [PATCH] Change JSON parser to parse from an InputSource --- TODO | 3 - include/qpdf/JSON.hh | 10 +- libqpdf/JSON.cc | 255 ++++++++++++++++++++++------------------- libtests/json_parse.cc | 8 +- 4 files changed, 150 insertions(+), 126 deletions(-) diff --git a/TODO b/TODO index d1bf0117..b53d175f 100644 --- a/TODO +++ b/TODO @@ -51,9 +51,6 @@ library, when context is available, to have a pipeline rather than a FILE* or std::ostream. This makes it possible for people to capture output more flexibly. -JSON::parse should work from an InputSource. BufferInputSource can -already start with a std::string. - Have a json blob defined by a function that takes a pipeline and writes data to the pipeline. It's writer should create a Pl_Base64 -> Pl_Concatenate in front of the pipeline passed to write and call the diff --git a/include/qpdf/JSON.hh b/include/qpdf/JSON.hh index 24d09cc2..435c2362 100644 --- a/include/qpdf/JSON.hh +++ b/include/qpdf/JSON.hh @@ -46,6 +46,7 @@ #include class Pipeline; +class InputSource; class JSON { @@ -249,10 +250,13 @@ class JSON virtual bool arrayItem(JSON const& value) = 0; }; - // Create a JSON object from a string. See above for information - // about how to use the Reactor. + // Create a JSON object from a string. QPDF_DLL - static JSON parse(std::string const&, Reactor* reactor = nullptr); + static JSON parse(std::string const&); + // Create a JSON object from an input source. See above for + // information about how to use the Reactor. + QPDF_DLL + static JSON parse(InputSource&, Reactor* reactor = nullptr); // parse calls setOffsets to set the inclusive start and // non-inclusive end offsets of an object relative to its input diff --git a/libqpdf/JSON.cc b/libqpdf/JSON.cc index 3c5ddafd..71ea33d7 100644 --- a/libqpdf/JSON.cc +++ b/libqpdf/JSON.cc @@ -1,8 +1,7 @@ #include -#include +#include #include -#include #include #include #include @@ -521,7 +520,8 @@ namespace class JSONParser { public: - JSONParser(JSON::Reactor* reactor) : + JSONParser(InputSource& is, JSON::Reactor* reactor) : + is(is), reactor(reactor), lex_state(ls_top), number_before_point(0), @@ -529,16 +529,16 @@ namespace number_after_e(0), number_saw_point(false), number_saw_e(false), - cstr(nullptr), - end(nullptr), - tok_start(nullptr), - tok_end(nullptr), - p(nullptr), + bytes(0), + p(buf), + u_count(0), + offset(0), + done(false), parser_state(ps_top) { } - std::shared_ptr parse(std::string const& s); + std::shared_ptr parse(); private: void getToken(); @@ -564,8 +564,10 @@ namespace ls_alpha, ls_string, ls_backslash, + ls_u4, }; + InputSource& is; JSON::Reactor* reactor; lex_state_e lex_state; size_t number_before_point; @@ -573,11 +575,13 @@ namespace size_t number_after_e; bool number_saw_point; bool number_saw_e; - char const* cstr; - char const* end; - char const* tok_start; - char const* tok_end; + char buf[16384]; + size_t bytes; char const* p; + size_t u_count; + size_t offset; + bool done; + std::string token; parser_state_e parser_state; std::vector> stack; std::vector ps_stack; @@ -661,28 +665,35 @@ JSONParser::decode_string(std::string const& str) void JSONParser::getToken() { - while (p < end) { + enum { append, ignore, reread } action = append; + bool ready = false; + token.clear(); + while (!done) { + if (p == (buf + bytes)) { + p = buf; + bytes = is.read(buf, sizeof(buf)); + if (bytes == 0) { + done = true; + break; + } + } + if (*p == 0) { QTC::TC("libtests", "JSON parse null character"); throw std::runtime_error( "JSON: null character at offset " + - QUtil::int_to_string(p - cstr)); + QUtil::uint_to_string(offset)); } + action = append; switch (lex_state) { case ls_top: if (*p == '"') { - tok_start = p; - tok_end = nullptr; lex_state = ls_string; } else if (QUtil::is_space(*p)) { - // ignore + action = ignore; } else if ((*p >= 'a') && (*p <= 'z')) { - tok_start = p; - tok_end = nullptr; lex_state = ls_alpha; } else if (*p == '-') { - tok_start = p; - tok_end = nullptr; lex_state = ls_number; number_before_point = 0; number_after_point = 0; @@ -690,8 +701,6 @@ JSONParser::getToken() number_saw_point = false; number_saw_e = false; } else if ((*p >= '0') && (*p <= '9')) { - tok_start = p; - tok_end = nullptr; lex_state = ls_number; number_before_point = 1; number_after_point = 0; @@ -699,8 +708,6 @@ JSONParser::getToken() number_saw_point = false; number_saw_e = false; } else if (*p == '.') { - tok_start = p; - tok_end = nullptr; lex_state = ls_number; number_before_point = 0; number_after_point = 0; @@ -708,12 +715,11 @@ JSONParser::getToken() number_saw_point = true; number_saw_e = false; } else if (strchr("{}[]:,", *p)) { - tok_start = p; - tok_end = p + 1; + ready = true; } else { QTC::TC("libtests", "JSON parse bad character"); throw std::runtime_error( - "JSON: offset " + QUtil::int_to_string(p - cstr) + + "JSON: offset " + QUtil::uint_to_string(offset) + ": unexpected character " + std::string(p, 1)); } break; @@ -731,12 +737,12 @@ JSONParser::getToken() if (number_saw_e) { QTC::TC("libtests", "JSON parse point after e"); throw std::runtime_error( - "JSON: offset " + QUtil::int_to_string(p - cstr) + + "JSON: offset " + QUtil::uint_to_string(offset) + ": numeric literal: decimal point after e"); } else if (number_saw_point) { QTC::TC("libtests", "JSON parse duplicate point"); throw std::runtime_error( - "JSON: offset " + QUtil::int_to_string(p - cstr) + + "JSON: offset " + QUtil::uint_to_string(offset) + ": numeric literal: decimal point already seen"); } else { number_saw_point = true; @@ -745,7 +751,7 @@ JSONParser::getToken() if (number_saw_e) { QTC::TC("libtests", "JSON parse duplicate e"); throw std::runtime_error( - "JSON: offset " + QUtil::int_to_string(p - cstr) + + "JSON: offset " + QUtil::uint_to_string(offset) + ": numeric literal: e already seen"); } else { number_saw_e = true; @@ -756,18 +762,19 @@ JSONParser::getToken() } else { QTC::TC("libtests", "JSON parse unexpected sign"); throw std::runtime_error( - "JSON: offset " + QUtil::int_to_string(p - cstr) + + "JSON: offset " + QUtil::uint_to_string(offset) + ": numeric literal: unexpected sign"); } } else if (QUtil::is_space(*p)) { - tok_end = p; + action = ignore; + ready = true; } else if (strchr("{}[]:,", *p)) { - tok_end = p; - --p; + action = reread; + ready = true; } else { QTC::TC("libtests", "JSON parse numeric bad character"); throw std::runtime_error( - "JSON: offset " + QUtil::int_to_string(p - cstr) + + "JSON: offset " + QUtil::uint_to_string(offset) + ": numeric literal: unexpected character " + std::string(p, 1)); } @@ -777,21 +784,22 @@ JSONParser::getToken() if ((*p >= 'a') && (*p <= 'z')) { // okay } else if (QUtil::is_space(*p)) { - tok_end = p; + action = ignore; + ready = true; } else if (strchr("{}[]:,", *p)) { - tok_end = p; - --p; + action = reread; + ready = true; } else { QTC::TC("libtests", "JSON parse keyword bad character"); throw std::runtime_error( - "JSON: offset " + QUtil::int_to_string(p - cstr) + + "JSON: offset " + QUtil::uint_to_string(offset) + ": keyword: unexpected character " + std::string(p, 1)); } break; case ls_string: if (*p == '"') { - tok_end = p + 1; + ready = true; } else if (*p == '\\') { lex_state = ls_backslash; } @@ -802,56 +810,70 @@ JSONParser::getToken() if (strchr("\\\"/bfnrt", *p)) { lex_state = ls_string; } else if (*p == 'u') { - if (p + 4 >= end) { - QTC::TC("libtests", "JSON parse premature end of u"); - throw std::runtime_error( - "JSON: offset " + QUtil::int_to_string(p - cstr) + - ": \\u must be followed by four characters"); - } - for (size_t i = 1; i <= 4; ++i) { - if (!QUtil::is_hex_digit(p[i])) { - QTC::TC("libtests", "JSON parse bad hex after u"); - throw std::runtime_error( - "JSON: offset " + QUtil::int_to_string(p - cstr) + - ": \\u must be followed by four hex digits"); - } - } - p += 4; - lex_state = ls_string; + lex_state = ls_u4; + u_count = 0; } else { QTC::TC("libtests", "JSON parse backslash bad character"); throw std::runtime_error( - "JSON: offset " + QUtil::int_to_string(p - cstr) + + "JSON: offset " + QUtil::uint_to_string(offset) + ": invalid character after backslash: " + std::string(p, 1)); } break; + + case ls_u4: + if (!QUtil::is_hex_digit(*p)) { + QTC::TC("libtests", "JSON parse bad hex after u"); + throw std::runtime_error( + "JSON: offset " + + QUtil::uint_to_string(offset - u_count - 1) + + ": \\u must be followed by four hex digits"); + } + if (++u_count == 4) { + lex_state = ls_string; + } + break; } - ++p; - if (tok_start && tok_end) { + switch (action) { + case reread: + break; + case append: + token.append(1, *p); + // fall through + case ignore: + ++p; + ++offset; + break; + } + if (ready) { break; } } - if (p == end) { - if (tok_start && (!tok_end)) { + if (done) { + if ((!token.empty()) && (!ready)) { switch (lex_state) { case ls_top: // Can't happen - throw std::logic_error( - "tok_start set in ls_top while parsing " + - std::string(cstr)); + throw std::logic_error("tok_start set in ls_top while parsing"); break; case ls_number: case ls_alpha: - tok_end = p; + // okay break; + case ls_u4: + QTC::TC("libtests", "JSON parse premature end of u"); + throw std::runtime_error( + "JSON: offset " + + QUtil::uint_to_string(offset - u_count - 1) + + ": \\u must be followed by four characters"); + case ls_string: case ls_backslash: QTC::TC("libtests", "JSON parse unterminated string"); throw std::runtime_error( - "JSON: offset " + QUtil::int_to_string(p - cstr) + + "JSON: offset " + QUtil::uint_to_string(offset) + ": unterminated string"); break; } @@ -862,28 +884,25 @@ JSONParser::getToken() void JSONParser::handleToken() { - if (!(tok_start && tok_end)) { + if (token.empty()) { return; } - // Get token value. - std::string value(tok_start, tok_end); - if (parser_state == ps_done) { QTC::TC("libtests", "JSON parse junk after object"); throw std::runtime_error( - "JSON: offset " + QUtil::int_to_string(p - cstr) + - ": material follows end of object: " + value); + "JSON: offset " + QUtil::uint_to_string(offset) + + ": material follows end of object: " + token); } // Git string value std::string s_value; if (lex_state == ls_string) { // Token includes the quotation marks - if (tok_end - tok_start < 2) { + if (token.length() < 2) { throw std::logic_error("JSON string length < 2"); } - s_value = decode_string(value); + s_value = decode_string(token); } // Based on the lexical state and value, figure out whether we are // looking at an item or a delimiter. It will always be exactly @@ -891,12 +910,14 @@ JSONParser::handleToken() std::shared_ptr item; char delimiter = '\0'; + // Already verified that token is not empty + char first_char = token.at(0); switch (lex_state) { case ls_top: - switch (*tok_start) { + switch (first_char) { case '{': item = std::make_shared(JSON::makeDictionary()); - item->setStart(QIntC::to_size(tok_start - cstr)); + item->setStart(offset - token.length()); if (reactor) { reactor->dictionaryStart(); } @@ -904,14 +925,14 @@ JSONParser::handleToken() case '[': item = std::make_shared(JSON::makeArray()); - item->setStart(QIntC::to_size(tok_start - cstr)); + item->setStart(offset - token.length()); if (reactor) { reactor->arrayStart(); } break; default: - delimiter = *tok_start; + delimiter = first_char; break; } break; @@ -920,38 +941,38 @@ JSONParser::handleToken() if (number_saw_point && (number_after_point == 0)) { QTC::TC("libtests", "JSON parse decimal with no digits"); throw std::runtime_error( - "JSON: offset " + QUtil::int_to_string(p - cstr) + + "JSON: offset " + QUtil::uint_to_string(offset) + ": decimal point with no digits"); } if ((number_before_point > 1) && - ((tok_start[0] == '0') || - ((tok_start[0] == '-') && (tok_start[1] == '0')))) { + ((first_char == '0') || + ((first_char == '-') && (token.at(1) == '0')))) { QTC::TC("libtests", "JSON parse leading zero"); throw std::runtime_error( - "JSON: offset " + QUtil::int_to_string(p - cstr) + + "JSON: offset " + QUtil::uint_to_string(offset) + ": number with leading zero"); } if ((number_before_point == 0) && (number_after_point == 0)) { QTC::TC("libtests", "JSON parse number no digits"); throw std::runtime_error( - "JSON: offset " + QUtil::int_to_string(p - cstr) + + "JSON: offset " + QUtil::uint_to_string(offset) + ": number with no digits"); } - item = std::make_shared(JSON::makeNumber(value)); + item = std::make_shared(JSON::makeNumber(token)); break; case ls_alpha: - if (value == "true") { + if (token == "true") { item = std::make_shared(JSON::makeBool(true)); - } else if (value == "false") { + } else if (token == "false") { item = std::make_shared(JSON::makeBool(false)); - } else if (value == "null") { + } else if (token == "null") { item = std::make_shared(JSON::makeNull()); } else { QTC::TC("libtests", "JSON parse invalid keyword"); throw std::runtime_error( - "JSON: offset " + QUtil::int_to_string(p - cstr) + - ": invalid keyword " + value); + "JSON: offset " + QUtil::uint_to_string(offset) + + ": invalid keyword " + token); } break; @@ -960,7 +981,9 @@ JSONParser::handleToken() break; case ls_backslash: - throw std::logic_error("tok_end is set while state = ls_backslash"); + case ls_u4: + throw std::logic_error( + "tok_end is set while state = ls_backslash or ls_u4"); break; } @@ -981,21 +1004,21 @@ JSONParser::handleToken() case ps_dict_after_key: QTC::TC("libtests", "JSON parse expected colon"); throw std::runtime_error( - "JSON: offset " + QUtil::int_to_string(p - cstr) + + "JSON: offset " + QUtil::uint_to_string(offset) + ": expected ':'"); break; case ps_dict_after_item: QTC::TC("libtests", "JSON parse expected , or }"); throw std::runtime_error( - "JSON: offset " + QUtil::int_to_string(p - cstr) + + "JSON: offset " + QUtil::uint_to_string(offset) + ": expected ',' or '}'"); break; case ps_array_after_item: QTC::TC("libtests", "JSON parse expected, or ]"); throw std::runtime_error( - "JSON: offset " + QUtil::int_to_string(p - cstr) + + "JSON: offset " + QUtil::uint_to_string(offset) + ": expected ',' or ']'"); break; @@ -1004,7 +1027,7 @@ JSONParser::handleToken() if (lex_state != ls_string) { QTC::TC("libtests", "JSON parse string as dict key"); throw std::runtime_error( - "JSON: offset " + QUtil::int_to_string(p - cstr) + + "JSON: offset " + QUtil::uint_to_string(offset) + ": expect string as dictionary key"); } break; @@ -1023,7 +1046,7 @@ JSONParser::handleToken() { QTC::TC("libtests", "JSON parse unexpected }"); throw std::runtime_error( - "JSON: offset " + QUtil::int_to_string(p - cstr) + + "JSON: offset " + QUtil::uint_to_string(offset) + ": unexpected dictionary end delimiter"); } } else if (delimiter == ']') { @@ -1033,14 +1056,14 @@ JSONParser::handleToken() { QTC::TC("libtests", "JSON parse unexpected ]"); throw std::runtime_error( - "JSON: offset " + QUtil::int_to_string(p - cstr) + + "JSON: offset " + QUtil::uint_to_string(offset) + ": unexpected array end delimiter"); } } else if (delimiter == ':') { if (parser_state != ps_dict_after_key) { QTC::TC("libtests", "JSON parse unexpected :"); throw std::runtime_error( - "JSON: offset " + QUtil::int_to_string(p - cstr) + + "JSON: offset " + QUtil::uint_to_string(offset) + ": unexpected colon"); } } else if (delimiter == ',') { @@ -1048,7 +1071,7 @@ JSONParser::handleToken() (parser_state == ps_array_after_item))) { QTC::TC("libtests", "JSON parse unexpected ,"); throw std::runtime_error( - "JSON: offset " + QUtil::int_to_string(p - cstr) + + "JSON: offset " + QUtil::uint_to_string(offset) + ": unexpected comma"); } } else if (delimiter != '\0') { @@ -1074,7 +1097,7 @@ JSONParser::handleToken() next_state = ps_stack.back(); ps_stack.pop_back(); auto tos = stack.back(); - tos->setEnd(QIntC::to_size(tok_end - cstr)); + tos->setEnd(offset); if (reactor) { reactor->containerEnd(*tos); } @@ -1086,8 +1109,8 @@ JSONParser::handleToken() "JSONParser::handleToken: unexpected delimiter in transition"); } else if (item.get()) { if (!(item->isArray() || item->isDictionary())) { - item->setStart(QIntC::to_size(tok_start - cstr)); - item->setEnd(QIntC::to_size(tok_end - cstr)); + item->setStart(offset - token.length()); + item->setEnd(offset); } std::shared_ptr tos; @@ -1149,23 +1172,17 @@ JSONParser::handleToken() } if (ps_stack.size() > 500) { throw std::runtime_error( - "JSON: offset " + QUtil::int_to_string(p - cstr) + + "JSON: offset " + QUtil::uint_to_string(offset) + ": maximum object depth exceeded"); } parser_state = next_state; - tok_start = nullptr; - tok_end = nullptr; lex_state = ls_top; } std::shared_ptr -JSONParser::parse(std::string const& s) +JSONParser::parse() { - cstr = s.c_str(); - end = cstr + s.length(); - p = cstr; - - while (p < end) { + while (!done) { getToken(); handleToken(); } @@ -1181,10 +1198,18 @@ JSONParser::parse(std::string const& s) } JSON -JSON::parse(std::string const& s, Reactor* reactor) +JSON::parse(InputSource& is, Reactor* reactor) { - JSONParser jp(reactor); - return *jp.parse(s); + JSONParser jp(is, reactor); + return *jp.parse(); +} + +JSON +JSON::parse(std::string const& s) +{ + BufferInputSource bis("json input", s); + JSONParser jp(bis, nullptr); + return *jp.parse(); } void diff --git a/libtests/json_parse.cc b/libtests/json_parse.cc index 7f894c8d..721b17de 100644 --- a/libtests/json_parse.cc +++ b/libtests/json_parse.cc @@ -1,3 +1,4 @@ +#include #include #include #include @@ -103,11 +104,8 @@ main(int argc, char* argv[]) } } try { - std::shared_ptr buf; - size_t size; - QUtil::read_file_into_memory(filename, buf, size); - std::string s(buf.get(), size); - std::cout << JSON::parse(s, reactor.get()).unparse() << std::endl; + FileInputSource is(filename); + std::cout << JSON::parse(is, reactor.get()).unparse() << std::endl; } catch (std::exception& e) { std::cerr << "exception: " << filename << ": " << e.what() << std::endl; return 2;