From d3152869b666a725d303e0667a69f973fc5a96ed Mon Sep 17 00:00:00 2001 From: m-holger Date: Mon, 30 Jan 2023 13:17:09 +0000 Subject: [PATCH] In JSONParser::getToken handle structural and space chars early --- libqpdf/JSON.cc | 164 +++++++++++++++------------ libtests/qtest/json_parse/bad-09.out | 2 +- libtests/qtest/json_parse/bad-31.out | 2 +- 3 files changed, 95 insertions(+), 73 deletions(-) diff --git a/libqpdf/JSON.cc b/libqpdf/JSON.cc index e9637e86..59843c05 100644 --- a/libqpdf/JSON.cc +++ b/libqpdf/JSON.cc @@ -791,7 +791,7 @@ JSONParser::tokenError() void JSONParser::getToken() { - enum { append, ignore, reread } action = append; + enum { append, ignore } action = append; bool ready = false; token.clear(); @@ -820,13 +820,103 @@ JSONParser::getToken() } else { break; } - } else { QTC::TC("libtests", "JSON parse null character"); throw std::runtime_error( "JSON: control or null character at offset " + std::to_string(offset)); } + } else if (*p == ',') { + if (lex_state == ls_top) { + ++p; + ++offset; + lex_state = ls_comma; + return; + } else if (lex_state == ls_string) { + token += *p; + ++p; + ++offset; + } else { + break; + } + } else if (*p == ':') { + if (lex_state == ls_top) { + ++p; + ++offset; + lex_state = ls_colon; + return; + } else if (lex_state == ls_string) { + token += *p; + ++p; + ++offset; + } else { + break; + } + } else if (*p == ' ') { + if (lex_state == ls_top) { + ++p; + ++offset; + } else if (lex_state == ls_string) { + token += *p; + ++p; + ++offset; + } else { + break; + } + } else if (*p == '{') { + if (lex_state == ls_top) { + token_start = offset; + ++p; + ++offset; + lex_state = ls_begin_dict; + return; + } else if (lex_state == ls_string) { + token += *p; + ++p; + ++offset; + } else { + break; + } + } else if (*p == '}') { + if (lex_state == ls_top) { + ++p; + ++offset; + lex_state = ls_end_dict; + return; + } else if (lex_state == ls_string) { + token += *p; + ++p; + ++offset; + } else { + break; + } + } else if (*p == '[') { + if (lex_state == ls_top) { + token_start = offset; + ++p; + ++offset; + lex_state = ls_begin_array; + return; + } else if (lex_state == ls_string) { + token += *p; + ++p; + ++offset; + } else { + break; + } + } else if (*p == ']') { + if (lex_state == ls_top) { + ++p; + ++offset; + lex_state = ls_end_array; + return; + } else if (lex_state == ls_string) { + token += *p; + ++p; + ++offset; + } else { + break; + } } else { action = append; switch (lex_state) { @@ -835,36 +925,6 @@ JSONParser::getToken() if (*p == '"') { lex_state = ls_string; action = ignore; - } else if (*p == ' ') { - action = ignore; - } else if (*p == ',') { - lex_state = ls_comma; - action = ignore; - ready = true; - } else if (*p == ',') { - lex_state = ls_comma; - action = ignore; - ready = true; - } else if (*p == ':') { - lex_state = ls_colon; - action = ignore; - ready = true; - } else if (*p == '{') { - lex_state = ls_begin_dict; - action = ignore; - ready = true; - } else if (*p == '}') { - lex_state = ls_end_dict; - action = ignore; - ready = true; - } else if (*p == '[') { - lex_state = ls_begin_array; - action = ignore; - ready = true; - } else if (*p == ']') { - lex_state = ls_end_array; - action = ignore; - ready = true; } else if ((*p >= 'a') && (*p <= 'z')) { lex_state = ls_alpha; } else if (*p == '-') { @@ -897,14 +957,6 @@ JSONParser::getToken() case ls_number_leading_zero: if (*p == '.') { lex_state = ls_number_point; - } else if (*p == ' ') { - lex_state = ls_number; - action = ignore; - ready = true; - } else if (strchr("{}[]:,", *p)) { - lex_state = ls_number; - action = reread; - ready = true; } else if (*p == 'e' || *p == 'E') { lex_state = ls_number_e; } else { @@ -920,14 +972,6 @@ JSONParser::getToken() // continue } else if (*p == '.') { lex_state = ls_number_point; - } else if (*p == ' ') { - lex_state = ls_number; - action = ignore; - ready = true; - } else if (strchr("{}[]:,", *p)) { - lex_state = ls_number; - action = reread; - ready = true; } else if (*p == 'e' || *p == 'E') { lex_state = ls_number_e; } else { @@ -946,14 +990,6 @@ JSONParser::getToken() case ls_number_after_point: if ((*p >= '0') && (*p <= '9')) { // continue - } else if (*p == ' ') { - lex_state = ls_number; - action = ignore; - ready = true; - } else if (strchr("{}[]:,", *p)) { - lex_state = ls_number; - action = reread; - ready = true; } else if (*p == 'e' || *p == 'E') { lex_state = ls_number_e; } else { @@ -983,12 +1019,6 @@ JSONParser::getToken() // We only get here after we have seen an exponent. if ((*p >= '0') && (*p <= '9')) { // continue - } else if (*p == ' ') { - action = ignore; - ready = true; - } else if (strchr("{}[]:,", *p)) { - action = reread; - ready = true; } else { tokenError(); } @@ -997,12 +1027,6 @@ JSONParser::getToken() case ls_alpha: if ((*p >= 'a') && (*p <= 'z')) { // okay - } else if (*p == ' ') { - action = ignore; - ready = true; - } else if (strchr("{}[]:,", *p)) { - action = reread; - ready = true; } else { tokenError(); } @@ -1090,8 +1114,6 @@ JSONParser::getToken() "JSONParser::getToken : trying to handle delimiter state"); } switch (action) { - case reread: - break; case append: token.append(1, *p); // fall through @@ -1107,7 +1129,7 @@ JSONParser::getToken() } // We only get here if on end of input or if the last character was a - // control character. + // control character or other delimiter. if (!token.empty()) { switch (lex_state) { diff --git a/libtests/qtest/json_parse/bad-09.out b/libtests/qtest/json_parse/bad-09.out index 21d2f1c1..979d53d0 100644 --- a/libtests/qtest/json_parse/bad-09.out +++ b/libtests/qtest/json_parse/bad-09.out @@ -1 +1 @@ -exception: bad-09.json: JSON: offset 3: expect string as dictionary key +exception: bad-09.json: JSON: offset 2: expect string as dictionary key diff --git a/libtests/qtest/json_parse/bad-31.out b/libtests/qtest/json_parse/bad-31.out index 2228d08d..af177726 100644 --- a/libtests/qtest/json_parse/bad-31.out +++ b/libtests/qtest/json_parse/bad-31.out @@ -1 +1 @@ -exception: bad-31.json: JSON: offset 1: numeric literal: no digit after minus sign +exception: bad-31.json: JSON: offset 1: numeric literal: incomplete number