diff --git a/include/qpdf/QPDFTokenizer.hh b/include/qpdf/QPDFTokenizer.hh index cd727613..7d7f6132 100644 --- a/include/qpdf/QPDFTokenizer.hh +++ b/include/qpdf/QPDFTokenizer.hh @@ -165,31 +165,53 @@ class QPDFTokenizer size_t max_len = 0); private: - void reset(); + // Do not implement copy or assignment + QPDFTokenizer(QPDFTokenizer const&); + QPDFTokenizer& operator=(QPDFTokenizer const&); + void resolveLiteral(); bool isSpace(char); - // Lexer state - enum { st_top, st_in_space, st_in_comment, st_in_string, st_lt, st_gt, - st_literal, st_in_hexstring, st_token_ready } state; + enum state_e { + st_top, st_in_space, st_in_comment, st_in_string, st_lt, st_gt, + st_literal, st_in_hexstring, st_token_ready + }; - bool pound_special_in_name; - bool allow_eof; - bool include_ignorable; + class Members + { + friend class QPDFTokenizer; - // Current token accumulation - token_type_e type; - std::string val; - std::string raw_val; - std::string error_message; - bool unread_char; - char char_to_unread; + public: + QPDF_DLL + ~Members(); - // State for strings - int string_depth; - bool string_ignoring_newline; - char bs_num_register[4]; - bool last_char_was_bs; + private: + Members(); + Members(Members const&); + void reset(); + + // Lexer state + state_e state; + + bool pound_special_in_name; + bool allow_eof; + bool include_ignorable; + + // Current token accumulation + token_type_e type; + std::string val; + std::string raw_val; + std::string error_message; + bool unread_char; + char char_to_unread; + + // State for strings + int string_depth; + bool string_ignoring_newline; + char bs_num_register[4]; + bool last_char_was_bs; + }; + PointerHolder m; }; #endif // __QPDFTOKENIZER_HH__ diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc index 2a45a0b5..776019c8 100644 --- a/libqpdf/QPDFTokenizer.cc +++ b/libqpdf/QPDFTokenizer.cc @@ -12,7 +12,7 @@ #include #include -QPDFTokenizer::QPDFTokenizer() : +QPDFTokenizer::Members::Members() : pound_special_in_name(true), allow_eof(false), include_ignorable(false) @@ -21,32 +21,7 @@ QPDFTokenizer::QPDFTokenizer() : } void -QPDFTokenizer::allowPoundAnywhereInName() -{ - QTC::TC("qpdf", "QPDFTokenizer allow pound anywhere in name"); - this->pound_special_in_name = false; -} - -void -QPDFTokenizer::allowEOF() -{ - this->allow_eof = true; -} - -void -QPDFTokenizer::includeIgnorable() -{ - this->include_ignorable = true; -} - -bool -QPDFTokenizer::isSpace(char ch) -{ - return ((ch == '\0') || QUtil::is_space(ch)); -} - -void -QPDFTokenizer::reset() +QPDFTokenizer::Members::reset() { state = st_top; type = tt_bad; @@ -60,20 +35,54 @@ QPDFTokenizer::reset() last_char_was_bs = false; } +QPDFTokenizer::Members::~Members() +{ +} + +QPDFTokenizer::QPDFTokenizer() : + m(new Members()) +{ +} + +void +QPDFTokenizer::allowPoundAnywhereInName() +{ + QTC::TC("qpdf", "QPDFTokenizer allow pound anywhere in name"); + this->m->pound_special_in_name = false; +} + +void +QPDFTokenizer::allowEOF() +{ + this->m->allow_eof = true; +} + +void +QPDFTokenizer::includeIgnorable() +{ + this->m->include_ignorable = true; +} + +bool +QPDFTokenizer::isSpace(char ch) +{ + return ((ch == '\0') || QUtil::is_space(ch)); +} + void QPDFTokenizer::resolveLiteral() { - if ((val.length() > 0) && (val.at(0) == '/')) + if ((this->m->val.length() > 0) && (this->m->val.at(0) == '/')) { - type = tt_name; + this->m->type = tt_name; // Deal with # in name token. Note: '/' by itself is a // valid name, so don't strip leading /. That way we // don't have to deal with the empty string as a name. std::string nval = "/"; - char const* valstr = val.c_str() + 1; + char const* valstr = this->m->val.c_str() + 1; for (char const* p = valstr; *p; ++p) { - if ((*p == '#') && this->pound_special_in_name) + if ((*p == '#') && this->m->pound_special_in_name) { if (p[1] && p[2] && QUtil::is_hex_digit(p[1]) && QUtil::is_hex_digit(p[2])) @@ -85,9 +94,9 @@ QPDFTokenizer::resolveLiteral() char ch = static_cast(strtol(num, 0, 16)); if (ch == '\0') { - type = tt_bad; + this->m->type = tt_bad; QTC::TC("qpdf", "QPDF_Tokenizer null in name"); - error_message = + this->m->error_message = "null character not allowed in name token"; nval += "#00"; } @@ -100,8 +109,8 @@ QPDFTokenizer::resolveLiteral() else { QTC::TC("qpdf", "QPDF_Tokenizer bad name"); - type = tt_bad; - error_message = "invalid name token"; + this->m->type = tt_bad; + this->m->error_message = "invalid name token"; nval += *p; } } @@ -110,40 +119,40 @@ QPDFTokenizer::resolveLiteral() nval += *p; } } - val = nval; + this->m->val = nval; } - else if (QUtil::is_number(val.c_str())) + else if (QUtil::is_number(this->m->val.c_str())) { - if (val.find('.') != std::string::npos) + if (this->m->val.find('.') != std::string::npos) { - type = tt_real; + this->m->type = tt_real; } else { - type = tt_integer; + this->m->type = tt_integer; } } - else if ((val == "true") || (val == "false")) + else if ((this->m->val == "true") || (this->m->val == "false")) { - type = tt_bool; + this->m->type = tt_bool; } - else if (val == "null") + else if (this->m->val == "null") { - type = tt_null; + this->m->type = tt_null; } else { // I don't really know what it is, so leave it as tt_word. // Lots of cases ($, #, etc.) other than actual words fall // into this category, but that's okay at least for now. - type = tt_word; + this->m->type = tt_word; } } void QPDFTokenizer::presentCharacter(char ch) { - if (state == st_token_ready) + if (this->m->state == st_token_ready) { throw std::logic_error( "INTERNAL ERROR: QPDF tokenizer presented character " @@ -157,205 +166,210 @@ QPDFTokenizer::presentCharacter(char ch) // the character that caused a state change in the new state. bool handled = true; - if (state == st_top) + if (this->m->state == st_top) { // Note: we specifically do not use ctype here. It is // locale-dependent. if (isSpace(ch)) { - if (this->include_ignorable) + if (this->m->include_ignorable) { - state = st_in_space; - val += ch; + this->m->state = st_in_space; + this->m->val += ch; } } else if (ch == '%') { - state = st_in_comment; - if (this->include_ignorable) + this->m->state = st_in_comment; + if (this->m->include_ignorable) { - val += ch; + this->m->val += ch; } } else if (ch == '(') { - string_depth = 1; - string_ignoring_newline = false; - memset(bs_num_register, '\0', sizeof(bs_num_register)); - last_char_was_bs = false; - state = st_in_string; + this->m->string_depth = 1; + this->m->string_ignoring_newline = false; + memset(this->m->bs_num_register, '\0', + sizeof(this->m->bs_num_register)); + this->m->last_char_was_bs = false; + this->m->state = st_in_string; } else if (ch == '<') { - state = st_lt; + this->m->state = st_lt; } else if (ch == '>') { - state = st_gt; + this->m->state = st_gt; } else { - val += ch; + this->m->val += ch; if (ch == ')') { - type = tt_bad; + this->m->type = tt_bad; QTC::TC("qpdf", "QPDF_Tokenizer bad )"); - error_message = "unexpected )"; - state = st_token_ready; + this->m->error_message = "unexpected )"; + this->m->state = st_token_ready; } else if (ch == '[') { - type = tt_array_open; - state = st_token_ready; + this->m->type = tt_array_open; + this->m->state = st_token_ready; } else if (ch == ']') { - type = tt_array_close; - state = st_token_ready; + this->m->type = tt_array_close; + this->m->state = st_token_ready; } else if (ch == '{') { - type = tt_brace_open; - state = st_token_ready; + this->m->type = tt_brace_open; + this->m->state = st_token_ready; } else if (ch == '}') { - type = tt_brace_close; - state = st_token_ready; + this->m->type = tt_brace_close; + this->m->state = st_token_ready; } else { - state = st_literal; + this->m->state = st_literal; } } } - else if (state == st_in_space) + else if (this->m->state == st_in_space) { // We only enter this state if include_ignorable is true. if (! isSpace(ch)) { - type = tt_space; - unread_char = true; - char_to_unread = ch; - state = st_token_ready; + this->m->type = tt_space; + this->m->unread_char = true; + this->m->char_to_unread = ch; + this->m->state = st_token_ready; } else { - val += ch; + this->m->val += ch; } } - else if (state == st_in_comment) + else if (this->m->state == st_in_comment) { if ((ch == '\r') || (ch == '\n')) { - if (this->include_ignorable) + if (this->m->include_ignorable) { - type = tt_comment; - unread_char = true; - char_to_unread = ch; - state = st_token_ready; + this->m->type = tt_comment; + this->m->unread_char = true; + this->m->char_to_unread = ch; + this->m->state = st_token_ready; } else { - state = st_top; + this->m->state = st_top; } } - else if (this->include_ignorable) + else if (this->m->include_ignorable) { - val += ch; + this->m->val += ch; } } - else if (state == st_lt) + else if (this->m->state == st_lt) { if (ch == '<') { - val = "<<"; - type = tt_dict_open; - state = st_token_ready; + this->m->val = "<<"; + this->m->type = tt_dict_open; + this->m->state = st_token_ready; } else { handled = false; - state = st_in_hexstring; + this->m->state = st_in_hexstring; } } - else if (state == st_gt) + else if (this->m->state == st_gt) { if (ch == '>') { - val = ">>"; - type = tt_dict_close; - state = st_token_ready; + this->m->val = ">>"; + this->m->type = tt_dict_close; + this->m->state = st_token_ready; } else { - val = ">"; - type = tt_bad; + this->m->val = ">"; + this->m->type = tt_bad; QTC::TC("qpdf", "QPDF_Tokenizer bad >"); - error_message = "unexpected >"; - unread_char = true; - char_to_unread = ch; - state = st_token_ready; + this->m->error_message = "unexpected >"; + this->m->unread_char = true; + this->m->char_to_unread = ch; + this->m->state = st_token_ready; } } - else if (state == st_in_string) + else if (this->m->state == st_in_string) { - if (string_ignoring_newline && (! ((ch == '\r') || (ch == '\n')))) + if (this->m->string_ignoring_newline && + (! ((ch == '\r') || (ch == '\n')))) { - string_ignoring_newline = false; + this->m->string_ignoring_newline = false; } - size_t bs_num_count = strlen(bs_num_register); + size_t bs_num_count = strlen(this->m->bs_num_register); bool ch_is_octal = ((ch >= '0') && (ch <= '7')); if ((bs_num_count == 3) || ((bs_num_count > 0) && (! ch_is_octal))) { // We've accumulated \ddd. PDF Spec says to ignore // high-order overflow. - val += static_cast(strtol(bs_num_register, 0, 8)); - memset(bs_num_register, '\0', sizeof(bs_num_register)); + this->m->val += static_cast( + strtol(this->m->bs_num_register, 0, 8)); + memset(this->m->bs_num_register, '\0', + sizeof(this->m->bs_num_register)); bs_num_count = 0; } - if (string_ignoring_newline && ((ch == '\r') || (ch == '\n'))) + if (this->m->string_ignoring_newline && ((ch == '\r') || (ch == '\n'))) { // ignore } - else if (ch_is_octal && (last_char_was_bs || (bs_num_count > 0))) + else if (ch_is_octal && + (this->m->last_char_was_bs || (bs_num_count > 0))) { - bs_num_register[bs_num_count++] = ch; + this->m->bs_num_register[bs_num_count++] = ch; } - else if (last_char_was_bs) + else if (this->m->last_char_was_bs) { switch (ch) { case 'n': - val += '\n'; + this->m->val += '\n'; break; case 'r': - val += '\r'; + this->m->val += '\r'; break; case 't': - val += '\t'; + this->m->val += '\t'; break; case 'b': - val += '\b'; + this->m->val += '\b'; break; case 'f': - val += '\f'; + this->m->val += '\f'; break; case '\r': case '\n': - string_ignoring_newline = true; + this->m->string_ignoring_newline = true; break; default: // PDF spec says backslash is ignored before anything else - val += ch; + this->m->val += ch; break; } } @@ -371,22 +385,23 @@ QPDFTokenizer::presentCharacter(char ch) } else if (ch == '(') { - val += ch; - ++string_depth; + this->m->val += ch; + ++this->m->string_depth; } - else if ((ch == ')') && (--string_depth == 0)) + else if ((ch == ')') && (--this->m->string_depth == 0)) { - type = tt_string; - state = st_token_ready; + this->m->type = tt_string; + this->m->state = st_token_ready; } else { - val += ch; + this->m->val += ch; } - last_char_was_bs = ((! last_char_was_bs) && (ch == '\\')); + this->m->last_char_was_bs = + ((! this->m->last_char_was_bs) && (ch == '\\')); } - else if (state == st_literal) + else if (this->m->state == st_literal) { if (strchr(" \t\n\v\f\r()<>[]{}/%", ch) != 0) { @@ -398,14 +413,14 @@ QPDFTokenizer::presentCharacter(char ch) // though not on any files in the test suite as of this // writing. - type = tt_word; - unread_char = true; - char_to_unread = ch; - state = st_token_ready; + this->m->type = tt_word; + this->m->unread_char = true; + this->m->char_to_unread = ch; + this->m->state = st_token_ready; } else { - val += ch; + this->m->val += ch; } } else @@ -418,33 +433,33 @@ QPDFTokenizer::presentCharacter(char ch) { // okay } - else if (state == st_in_hexstring) + else if (this->m->state == st_in_hexstring) { if (ch == '>') { - type = tt_string; - state = st_token_ready; - if (val.length() % 2) + this->m->type = tt_string; + this->m->state = st_token_ready; + if (this->m->val.length() % 2) { // PDF spec says odd hexstrings have implicit // trailing 0. - val += '0'; + this->m->val += '0'; } char num[3]; num[2] = '\0'; std::string nval; - for (unsigned int i = 0; i < val.length(); i += 2) + for (unsigned int i = 0; i < this->m->val.length(); i += 2) { - num[0] = val.at(i); - num[1] = val.at(i+1); + num[0] = this->m->val.at(i); + num[1] = this->m->val.at(i+1); char nch = static_cast(strtol(num, 0, 16)); nval += nch; } - val = nval; + this->m->val = nval; } else if (QUtil::is_hex_digit(ch)) { - val += ch; + this->m->val += ch; } else if (isSpace(ch)) { @@ -452,11 +467,11 @@ QPDFTokenizer::presentCharacter(char ch) } else { - type = tt_bad; + this->m->type = tt_bad; QTC::TC("qpdf", "QPDF_Tokenizer bad hexstring character"); - error_message = std::string("invalid character (") + + this->m->error_message = std::string("invalid character (") + ch + ") in hexstring"; - state = st_token_ready; + this->m->state = st_token_ready; } } else @@ -465,61 +480,63 @@ QPDFTokenizer::presentCharacter(char ch) "INTERNAL ERROR: invalid state while reading token"); } - if ((state == st_token_ready) && (type == tt_word)) + if ((this->m->state == st_token_ready) && (this->m->type == tt_word)) { resolveLiteral(); } - if (! (betweenTokens() || ((state == st_token_ready) && unread_char))) + if (! (betweenTokens() || + ((this->m->state == st_token_ready) && this->m->unread_char))) { - this->raw_val += orig_ch; + this->m->raw_val += orig_ch; } } void QPDFTokenizer::presentEOF() { - if (state == st_literal) + if (this->m->state == st_literal) { QTC::TC("qpdf", "QPDF_Tokenizer EOF reading appendable token"); resolveLiteral(); } - else if ((this->include_ignorable) && (state == st_in_space)) + else if ((this->m->include_ignorable) && (this->m->state == st_in_space)) { - type = tt_space; + this->m->type = tt_space; } - else if ((this->include_ignorable) && (state == st_in_comment)) + else if ((this->m->include_ignorable) && (this->m->state == st_in_comment)) { - type = tt_comment; + this->m->type = tt_comment; } else if (betweenTokens()) { - type = tt_eof; + this->m->type = tt_eof; } - else if (state != st_token_ready) + else if (this->m->state != st_token_ready) { QTC::TC("qpdf", "QPDF_Tokenizer EOF reading token"); - type = tt_bad; - error_message = "EOF while reading token"; + this->m->type = tt_bad; + this->m->error_message = "EOF while reading token"; } - state = st_token_ready; + this->m->state = st_token_ready; } bool QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch) { - bool ready = (this->state == st_token_ready); - unread_char = this->unread_char; - ch = this->char_to_unread; + bool ready = (this->m->state == st_token_ready); + unread_char = this->m->unread_char; + ch = this->m->char_to_unread; if (ready) { - if (type == tt_bad) + if (this->m->type == tt_bad) { - val = raw_val; + this->m->val = this->m->raw_val; } - token = Token(type, val, raw_val, error_message); - reset(); + token = Token(this->m->type, this->m->val, + this->m->raw_val, this->m->error_message); + this->m->reset(); } return ready; } @@ -527,10 +544,10 @@ QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch) bool QPDFTokenizer::betweenTokens() { - return ((state == st_top) || - ((! this->include_ignorable) && - ((state == st_in_comment) || - (state == st_in_space)))); + return ((this->m->state == st_top) || + ((! this->m->include_ignorable) && + ((this->m->state == st_in_comment) || + (this->m->state == st_in_space)))); } QPDFTokenizer::Token @@ -553,11 +570,11 @@ QPDFTokenizer::readToken(PointerHolder input, { presentEOF(); presented_eof = true; - if ((type == tt_eof) && (! this->allow_eof)) + if ((this->m->type == tt_eof) && (! this->m->allow_eof)) { QTC::TC("qpdf", "QPDF_Tokenizer EOF when not allowed"); - type = tt_bad; - error_message = "unexpected EOF"; + this->m->type = tt_bad; + this->m->error_message = "unexpected EOF"; offset = input->getLastOffset(); } } @@ -574,14 +591,14 @@ QPDFTokenizer::readToken(PointerHolder input, { ++offset; } - if (max_len && (raw_val.length() >= max_len) && - (this->state != st_token_ready)) + if (max_len && (this->m->raw_val.length() >= max_len) && + (this->m->state != st_token_ready)) { // terminate this token now QTC::TC("qpdf", "QPDFTokenizer block long token"); - this->type = tt_bad; - this->state = st_token_ready; - error_message = + this->m->type = tt_bad; + this->m->state = st_token_ready; + this->m->error_message = "exceeded allowable length while reading token"; } }