From 708ea4ef43c2f7d6a88f215f1b932c5118aceafb Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Mon, 1 Jan 2024 09:51:05 -0500 Subject: [PATCH] Completely rewrite QUtil::parse_numrange --- libqpdf/QUtil.cc | 196 ++++++++++++++--------------------- libtests/qtest/numrange.test | 30 +++--- 2 files changed, 95 insertions(+), 131 deletions(-) diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc index 7b4b119b..5e88ff88 100644 --- a/libqpdf/QUtil.cc +++ b/libqpdf/QUtil.cc @@ -9,15 +9,12 @@ #include #include -#include #include -#include #include #include #include #include #include -#include #include #include #include @@ -1303,93 +1300,52 @@ QUtil::str_compare_nocase(char const* s1, char const* s2) #endif } -static int -maybe_from_end(int num, bool from_end, int max) -{ - if (from_end) { - if (num > max) { - num = 0; - } else { - num = max + 1 - num; - } - } - return num; -} - std::vector QUtil::parse_numrange(char const* range, int max) { - std::vector result; - char const* p = range; + static std::regex group_re(R"((x)?(z|r?\d+)(?:-(z|r?\d+))?)"); + auto parse_num = [&max](std::string const& s) -> int { + if (s == "z") { + return max; + } + int num; + if (s.at(0) == 'r') { + num = max + 1 - string_to_int(s.substr(1).c_str()); + } else { + num = string_to_int(s.c_str()); + } + // max == 0 means we don't know the max and are just testing for valid syntax. + if ((max > 0) && ((num < 1) || (num > max))) { + throw std::runtime_error("number " + std::to_string(num) + " out of range"); + } + return num; + }; + + auto populate = [](std::vector& group, int first_num, bool is_span, int last_num) { + group.clear(); + group.emplace_back(first_num); + if (is_span) { + if (first_num > last_num) { + for (auto i = first_num - 1; i >= last_num; --i) { + group.push_back(i); + } + } else { + for (auto i = first_num + 1; i <= last_num; ++i) { + group.push_back(i); + } + } + } + }; + + char const* p; try { - std::vector work; - static int const comma = -1; - static int const dash = -2; + char const* range_end = range + strlen(range); + std::vector result; + std::vector last_group; + // See if range ends with :even or :odd. size_t start_idx = 0; size_t skip = 1; - - enum { st_top, st_in_number, st_after_number } state = st_top; - bool last_separator_was_dash = false; - int cur_number = 0; - bool from_end = false; - while (*p) { - char ch = *p; - if (isdigit(ch)) { - if (!((state == st_top) || (state == st_in_number))) { - throw std::runtime_error("digit not expected"); - } - state = st_in_number; - cur_number *= 10; - cur_number += (ch - '0'); - } else if (ch == 'z') { - // z represents max - if (!(state == st_top)) { - throw std::runtime_error("z not expected"); - } - state = st_after_number; - cur_number = max; - } else if (ch == 'r') { - if (!(state == st_top)) { - throw std::runtime_error("r not expected"); - } - state = st_in_number; - from_end = true; - } else if ((ch == ',') || (ch == '-')) { - if (!((state == st_in_number) || (state == st_after_number))) { - throw std::runtime_error("unexpected separator"); - } - cur_number = maybe_from_end(cur_number, from_end, max); - work.push_back(cur_number); - cur_number = 0; - from_end = false; - if (ch == ',') { - state = st_top; - last_separator_was_dash = false; - work.push_back(comma); - } else if (ch == '-') { - if (last_separator_was_dash) { - throw std::runtime_error("unexpected dash"); - } - state = st_top; - last_separator_was_dash = true; - work.push_back(dash); - } - } else if (ch == ':') { - if (!((state == st_in_number) || (state == st_after_number))) { - throw std::runtime_error("unexpected colon"); - } - break; - } else { - throw std::runtime_error("unexpected character"); - } - ++p; - } - if ((state == st_in_number) || (state == st_after_number)) { - cur_number = maybe_from_end(cur_number, from_end, max); - work.push_back(cur_number); - } else { - throw std::runtime_error("number expected"); - } + p = std::find(range, range_end, ':'); if (*p == ':') { if (strcmp(p, ":odd") == 0) { skip = 2; @@ -1397,46 +1353,55 @@ QUtil::parse_numrange(char const* range, int max) skip = 2; start_idx = 1; } else { - throw std::runtime_error("unexpected even/odd modifier"); + throw std::runtime_error("expected :even or :odd"); } + range_end = p; } - p = nullptr; - for (size_t i = 0; i < work.size(); i += 2) { - int num = work.at(i); - // max == 0 means we don't know the max and are just testing for valid syntax. - if ((max > 0) && ((num < 1) || (num > max))) { - throw std::runtime_error("number " + QUtil::int_to_string(num) + " out of range"); + // Divide the range into groups + p = range; + char const* group_end; + bool first = true; + while (p != range_end) { + group_end = std::find(p, range_end, ','); + std::cmatch m; + if (!std::regex_match(p, group_end, m, group_re)) { + throw std::runtime_error("invalid range syntax"); } - if (i == 0) { - result.push_back(work.at(i)); + auto is_exclude = m[1].matched; + if (first && is_exclude) { + throw std::runtime_error("first range group may not be an exclusion"); + } + first = false; + auto first_num = parse_num(m[2].str()); + auto is_span = m[3].matched; + int last_num; + if (is_span) { + last_num = parse_num(m[3].str()); + } + if (is_exclude) { + // XXX } else { - int separator = work.at(i - 1); - if (separator == comma) { - result.push_back(num); - } else if (separator == dash) { - int lastnum = result.back(); - if (num > lastnum) { - for (int j = lastnum + 1; j <= num; ++j) { - result.push_back(j); - } - } else { - for (int j = lastnum - 1; j >= num; --j) { - result.push_back(j); - } - } - } else { - throw std::logic_error("INTERNAL ERROR parsing numeric range"); + result.insert(result.end(), last_group.begin(), last_group.end()); + populate(last_group, first_num, is_span, last_num); + } + p = group_end; + if (*p == ',') { + ++p; + if (p == range_end) { + throw std::runtime_error("trailing comma"); } } } - if ((start_idx > 0) || (skip != 1)) { - auto t = result; - result.clear(); - for (size_t i = start_idx; i < t.size(); i += skip) { - result.push_back(t.at(i)); - } + result.insert(result.end(), last_group.begin(), last_group.end()); + if (skip == 1) { + return result; } + std::vector filtered; + for (auto i = start_idx; i < result.size(); i += skip) { + filtered.emplace_back(result.at(i)); + } + return filtered; } catch (std::runtime_error const& e) { std::string message; if (p) { @@ -1447,7 +1412,6 @@ QUtil::parse_numrange(char const* range, int max) } throw std::runtime_error(message); } - return result; } enum encoding_e { e_utf16, e_ascii, e_winansi, e_macroman, e_pdfdoc }; diff --git a/libtests/qtest/numrange.test b/libtests/qtest/numrange.test index 896c44d2..b1a04c98 100644 --- a/libtests/qtest/numrange.test +++ b/libtests/qtest/numrange.test @@ -9,37 +9,37 @@ my $td = new TestDriver('numrange'); my @nrange_tests = ( [",5", - "error at * in numeric range *,5: unexpected separator", + "error at * in numeric range *,5: invalid range syntax", 2], ["4,,5", - "error at * in numeric range 4,*,5: unexpected separator", + "error at * in numeric range 4,*,5: invalid range syntax", 2], ["4,5,", - "error at * in numeric range 4,5,*: number expected", + "error at * in numeric range 4,5,*: trailing comma", 2], ["z1,", - "error at * in numeric range z*1,: digit not expected", + "error at * in numeric range *z1,: invalid range syntax", 2], ["1z,", - "error at * in numeric range 1*z,: z not expected", + "error at * in numeric range *1z,: invalid range syntax", 2], ["1-5?", - "error at * in numeric range 1-5*?: unexpected character", + "error at * in numeric range *1-5?: invalid range syntax", 2], ["1-30", - "error in numeric range 1-30: number 30 out of range", + "error at * in numeric range *1-30: number 30 out of range", 2], ["1-10,0,5", - "error in numeric range 1-10,0,5: number 0 out of range", + "error at * in numeric range 1-10,*0,5: number 0 out of range", 2], ["1-10,1234,5", - "error in numeric range 1-10,1234,5: number 1234 out of range", + "error at * in numeric range 1-10,*1234,5: number 1234 out of range", 2], ["1,r,3", - "error in numeric range 1,r,3: number 16 out of range", + "error at * in numeric range 1,*r,3: invalid range syntax", 2], ["1,r16,3", - "error in numeric range 1,r16,3: number 0 out of range", + "error at * in numeric range 1,*r16,3: number 0 out of range", 2], ["1,3,5-10,z-13,13,9,z,2,r2-r4", "numeric range 1,3,5-10,z-13,13,9,z,2,r2-r4" . @@ -50,16 +50,16 @@ my @nrange_tests = ( " -> 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1", 0], ["1-10:quack", - "error at * in numeric range 1-10*:quack: unexpected even/odd modifier", + "error at * in numeric range 1-10*:quack: expected :even or :odd", 2], ["1-10:", - "error at * in numeric range 1-10*:: unexpected even/odd modifier", + "error at * in numeric range 1-10*:: expected :even or :odd", 2], ["1-10,r:", - "error at * in numeric range 1-10,r*:: unexpected even/odd modifier", + "error at * in numeric range 1-10,r*:: expected :even or :odd", 2], ["1-10,:", - "error at * in numeric range 1-10,*:: unexpected colon", + "error at * in numeric range 1-10,*:: expected :even or :odd", 2], ["1-6,8-12:odd", "numeric range 1-6,8-12:odd -> 1 3 5 8 10 12",