From 137dc7acb9f46dfe40b73dd0079bf130eb6981e0 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Sat, 11 Aug 2012 09:22:59 -0400 Subject: [PATCH] Refactor: move resolution of literal to its own method --- include/qpdf/QPDFTokenizer.hh | 1 + libqpdf/QPDFTokenizer.cc | 158 ++++++++++++++++++---------------- 2 files changed, 83 insertions(+), 76 deletions(-) diff --git a/include/qpdf/QPDFTokenizer.hh b/include/qpdf/QPDFTokenizer.hh index e888c113..6b385b4d 100644 --- a/include/qpdf/QPDFTokenizer.hh +++ b/include/qpdf/QPDFTokenizer.hh @@ -133,6 +133,7 @@ class QPDFTokenizer private: void reset(); + void resolveLiteral(); // Lexer state enum { st_top, st_in_comment, st_in_string, st_lt, st_gt, diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc index 979a79bf..78ab1551 100644 --- a/libqpdf/QPDFTokenizer.cc +++ b/libqpdf/QPDFTokenizer.cc @@ -50,10 +50,90 @@ QPDFTokenizer::reset() } void -QPDFTokenizer::presentCharacter(char ch) +QPDFTokenizer::resolveLiteral() { PCRE num_re("^[\\+\\-]?(?:\\.\\d+|\\d+(?:\\.\\d+)?)$"); + if ((val.length() > 0) && (val[0] == '/')) + { + type = tt_name; + // Deal with # in name token. Note: '/' by itself is a + // valid name, so don't strip leading /. That way we + // don't have to deal with the empty string as a name. + std::string nval = "/"; + char const* valstr = val.c_str() + 1; + for (char const* p = valstr; *p; ++p) + { + if ((*p == '#') && this->pound_special_in_name) + { + if (p[1] && p[2] && + is_hex_digit(p[1]) && is_hex_digit(p[2])) + { + char num[3]; + num[0] = p[1]; + num[1] = p[2]; + num[2] = '\0'; + char ch = (char)(strtol(num, 0, 16)); + if (ch == '\0') + { + type = tt_bad; + QTC::TC("qpdf", "QPDF_Tokenizer null in name"); + error_message = + "null character not allowed in name token"; + nval += "#00"; + } + else + { + nval += ch; + } + p += 2; + } + else + { + QTC::TC("qpdf", "QPDF_Tokenizer bad name"); + type = tt_bad; + error_message = "invalid name token"; + nval += *p; + } + } + else + { + nval += *p; + } + } + val = nval; + } + else if (num_re.match(val.c_str())) + { + if (val.find('.') != std::string::npos) + { + type = tt_real; + } + else + { + type = tt_integer; + } + } + else if ((val == "true") || (val == "false")) + { + type = tt_bool; + } + else if (val == "null") + { + type = tt_null; + } + else + { + // I don't really know what it is, so leave it as tt_word. + // Lots of cases ($, #, etc.) other than actual words fall + // into this category, but that's okay at least for now. + type = tt_word; + } +} + +void +QPDFTokenizer::presentCharacter(char ch) +{ if (state == st_token_ready) { throw std::logic_error( @@ -342,81 +422,7 @@ QPDFTokenizer::presentCharacter(char ch) if ((state == st_token_ready) && (type == tt_word)) { - if ((val.length() > 0) && (val[0] == '/')) - { - type = tt_name; - // Deal with # in name token. Note: '/' by itself is a - // valid name, so don't strip leading /. That way we - // don't have to deal with the empty string as a name. - std::string nval = "/"; - char const* valstr = val.c_str() + 1; - for (char const* p = valstr; *p; ++p) - { - if ((*p == '#') && this->pound_special_in_name) - { - if (p[1] && p[2] && - is_hex_digit(p[1]) && is_hex_digit(p[2])) - { - char num[3]; - num[0] = p[1]; - num[1] = p[2]; - num[2] = '\0'; - char ch = (char)(strtol(num, 0, 16)); - if (ch == '\0') - { - type = tt_bad; - QTC::TC("qpdf", "QPDF_Tokenizer null in name"); - error_message = - "null character not allowed in name token"; - nval += "#00"; - } - else - { - nval += ch; - } - p += 2; - } - else - { - QTC::TC("qpdf", "QPDF_Tokenizer bad name"); - type = tt_bad; - error_message = "invalid name token"; - nval += *p; - } - } - else - { - nval += *p; - } - } - val = nval; - } - else if (num_re.match(val.c_str())) - { - if (val.find('.') != std::string::npos) - { - type = tt_real; - } - else - { - type = tt_integer; - } - } - else if ((val == "true") || (val == "false")) - { - type = tt_bool; - } - else if (val == "null") - { - type = tt_null; - } - else - { - // I don't really know what it is, so leave it as tt_word. - // Lots of cases ($, #, etc.) other than actual words fall - // into this category, but that's okay at least for now. - type = tt_word; - } + resolveLiteral(); } if (! (betweenTokens() || ((state == st_token_ready) && unread_char)))