#include <qpdf/QPDFTokenizer.hh>

// DO NOT USE ctype -- it is locale dependent for some things, and
// it's not worth the risk of including it in case it may accidentally
// be used.

#include <qpdf/QTC.hh>
#include <qpdf/QPDFExc.hh>
#include <qpdf/QUtil.hh>
#include <qpdf/QPDFObjectHandle.hh>

#include <stdexcept>
#include <string.h>
#include <cstdlib>

// Construct tokenizer state with default options: '#' is special in
// names, EOF is not allowed, and ignorable tokens (space/comment)
// are not reported. Per-token state is initialized through reset().
QPDFTokenizer::Members::Members() :
    pound_special_in_name(true),
    allow_eof(false),
    include_ignorable(false)
{
    reset();
}

// Return all per-token state to its initial values. The option
// flags set by the constructor and the allow*/include* methods are
// not touched, and bs_num_register is not cleared here -- it is
// zeroed when a literal string is opened in presentCharacter().
void
QPDFTokenizer::Members::reset()
{
    state = st_top;
    type = tt_bad;
    val = "";
    raw_val = "";
    error_message = "";
    unread_char = false;
    char_to_unread = '\0';
    string_depth = 0;
    string_ignoring_newline = false;
    last_char_was_bs = false;
    last_char_was_cr = false;
}

QPDFTokenizer::Members::~Members()
{
}

// Build a token from a type and a value. raw_value starts out equal
// to value; for string and name tokens it is replaced with the
// result of unparse() on a QPDFObjectHandle created from the value.
QPDFTokenizer::Token::Token(token_type_e type, std::string const& value) :
    type(type),
    value(value),
    raw_value(value)
{
    if (type == tt_string)
    {
        raw_value = QPDFObjectHandle::newString(value).unparse();
    }
    else if (type == tt_name)
    {
        raw_value = QPDFObjectHandle::newName(value).unparse();
    }
}

QPDFTokenizer::QPDFTokenizer() :
    m(new Members())
{
}

// Stop treating '#' in name tokens as the start of a two-digit hex
// escape; see resolveLiteral().
void
QPDFTokenizer::allowPoundAnywhereInName()
{
    QTC::TC("qpdf", "QPDFTokenizer allow pound anywhere in name");
    this->m->pound_special_in_name = false;
}

// Make readToken() return a tt_eof token at end of input instead of
// converting it to a bad token.
void
QPDFTokenizer::allowEOF()
{
    this->m->allow_eof = true;
}

// Report runs of whitespace and comments as tt_space and tt_comment
// tokens rather than silently skipping them.
void
QPDFTokenizer::includeIgnorable()
{
    this->m->include_ignorable = true;
}

// True for the null character and for anything QUtil::is_space()
// accepts.
bool
QPDFTokenizer::isSpace(char ch)
{
    return ((ch == '\0') || QUtil::is_space(ch));
}

// True if ch is one of the listed whitespace or delimiter
// characters. Note that strchr() treats the terminating null as
// part of the string, so ch == '\0' also yields true here, which
// agrees with isSpace().
bool
QPDFTokenizer::isDelimiter(char ch)
{
    return (strchr(" \t\n\v\f\r()<>[]{}/%", ch) != 0);
}

// Classify the accumulated literal in val and set type accordingly:
// a name (leading '/'), a real or integer, a boolean, null, or a
// plain word. For names, "#xx" hex escapes are decoded into val
// unless pound_special_in_name has been turned off; a malformed
// escape or an escape that decodes to the null character turns the
// token into tt_bad with an error message.
void
QPDFTokenizer::resolveLiteral()
{
    if ((this->m->val.length() > 0) && (this->m->val.at(0) == '/'))
    {
        this->m->type = tt_name;
        // Deal with # in name token. Note: '/' by itself is a
        // valid name, so don't strip leading /. That way we
        // don't have to deal with the empty string as a name.
        std::string nval = "/";
        char const* valstr = this->m->val.c_str() + 1;
        for (char const* p = valstr; *p; ++p)
        {
            if ((*p == '#') && this->m->pound_special_in_name)
            {
                // The short-circuit on p[1] and p[2] keeps us from
                // reading past the terminating null.
                if (p[1] && p[2] &&
                    QUtil::is_hex_digit(p[1]) && QUtil::is_hex_digit(p[2]))
                {
                    char num[3];
                    num[0] = p[1];
                    num[1] = p[2];
                    num[2] = '\0';
                    char ch = static_cast<char>(strtol(num, 0, 16));
                    if (ch == '\0')
                    {
                        this->m->type = tt_bad;
                        QTC::TC("qpdf", "QPDFTokenizer null in name");
                        this->m->error_message =
                            "null character not allowed in name token";
                        // Keep the escape in its textual form.
                        nval += "#00";
                    }
                    else
                    {
                        nval += ch;
                    }
                    // Skip the two hex digits just consumed.
                    p += 2;
                }
                else
                {
                    QTC::TC("qpdf", "QPDFTokenizer bad name");
                    this->m->type = tt_bad;
                    this->m->error_message = "invalid name token";
                    nval += *p;
                }
            }
            else
            {
                nval += *p;
            }
        }
        this->m->val = nval;
    }
    else if (QUtil::is_number(this->m->val.c_str()))
    {
        if (this->m->val.find('.') != std::string::npos)
        {
            this->m->type = tt_real;
        }
        else
        {
            this->m->type = tt_integer;
        }
    }
    else if ((this->m->val == "true") || (this->m->val == "false"))
    {
        this->m->type = tt_bool;
    }
    else if (this->m->val == "null")
    {
        this->m->type = tt_null;
    }
    else
    {
        // I don't really know what it is, so leave it as tt_word.
        // Lots of cases ($, #, etc.) other than actual words fall
        // into this category, but that's okay at least for now.
        this->m->type = tt_word;
    }
}

// Feed one input character to the tokenizer state machine. When a
// token is completed, state becomes st_token_ready and the token
// must be collected with getToken() before another character is
// presented; presenting a character while a token is waiting throws
// std::logic_error. If the character that ended the token is not
// part of it, it is recorded in char_to_unread with unread_char set.
void
QPDFTokenizer::presentCharacter(char ch)
{
    if (this->m->state == st_token_ready)
    {
        throw std::logic_error(
            "INTERNAL ERROR: QPDF tokenizer presented character "
            "while token is waiting");
    }

    char orig_ch = ch;

    // State machine is implemented such that some characters may be
    // handled more than once. This happens whenever you have to use
    // the character that caused a state change in the new state.

    bool handled = true;
    if (this->m->state == st_top)
    {
        // Note: we specifically do not use ctype here. It is
        // locale-dependent.
        if (isSpace(ch))
        {
            if (this->m->include_ignorable)
            {
                this->m->state = st_in_space;
                this->m->val += ch;
            }
        }
        else if (ch == '%')
        {
            this->m->state = st_in_comment;
            if (this->m->include_ignorable)
            {
                this->m->val += ch;
            }
        }
        else if (ch == '(')
        {
            // Start of a literal string: initialize all of the
            // string-specific bookkeeping.
            this->m->string_depth = 1;
            this->m->string_ignoring_newline = false;
            memset(this->m->bs_num_register, '\0',
                   sizeof(this->m->bs_num_register));
            this->m->last_char_was_bs = false;
            this->m->last_char_was_cr = false;
            this->m->state = st_in_string;
        }
        else if (ch == '<')
        {
            this->m->state = st_lt;
        }
        else if (ch == '>')
        {
            this->m->state = st_gt;
        }
        else
        {
            this->m->val += ch;
            if (ch == ')')
            {
                this->m->type = tt_bad;
                QTC::TC("qpdf", "QPDFTokenizer bad )");
                this->m->error_message = "unexpected )";
                this->m->state = st_token_ready;
            }
            else if (ch == '[')
            {
                this->m->type = tt_array_open;
                this->m->state = st_token_ready;
            }
            else if (ch == ']')
            {
                this->m->type = tt_array_close;
                this->m->state = st_token_ready;
            }
            else if (ch == '{')
            {
                this->m->type = tt_brace_open;
                this->m->state = st_token_ready;
            }
            else if (ch == '}')
            {
                this->m->type = tt_brace_close;
                this->m->state = st_token_ready;
            }
            else
            {
                this->m->state = st_literal;
            }
        }
    }
    else if (this->m->state == st_in_space)
    {
        // We only enter this state if include_ignorable is true.
        if (! isSpace(ch))
        {
            this->m->type = tt_space;
            this->m->unread_char = true;
            this->m->char_to_unread = ch;
            this->m->state = st_token_ready;
        }
        else
        {
            this->m->val += ch;
        }
    }
    else if (this->m->state == st_in_comment)
    {
        if ((ch == '\r') || (ch == '\n'))
        {
            if (this->m->include_ignorable)
            {
                // The line terminator is not part of the comment
                // token, so hand it back.
                this->m->type = tt_comment;
                this->m->unread_char = true;
                this->m->char_to_unread = ch;
                this->m->state = st_token_ready;
            }
            else
            {
                this->m->state = st_top;
            }
        }
        else if (this->m->include_ignorable)
        {
            this->m->val += ch;
        }
    }
    else if (this->m->state == st_lt)
    {
        if (ch == '<')
        {
            this->m->val = "<<";
            this->m->type = tt_dict_open;
            this->m->state = st_token_ready;
        }
        else
        {
            // Not "<<", so this is a hex string. Let the hex string
            // handling below process this same character.
            handled = false;
            this->m->state = st_in_hexstring;
        }
    }
    else if (this->m->state == st_gt)
    {
        if (ch == '>')
        {
            this->m->val = ">>";
            this->m->type = tt_dict_close;
            this->m->state = st_token_ready;
        }
        else
        {
            this->m->val = ">";
            this->m->type = tt_bad;
            QTC::TC("qpdf", "QPDFTokenizer bad >");
            this->m->error_message = "unexpected >";
            this->m->unread_char = true;
            this->m->char_to_unread = ch;
            this->m->state = st_token_ready;
        }
    }
    else if (this->m->state == st_in_string)
    {
        // string_ignoring_newline is set after backslash-CR so that
        // an immediately following LF is swallowed too. Any other
        // character cancels it.
        if (this->m->string_ignoring_newline && (ch != '\n'))
        {
            this->m->string_ignoring_newline = false;
        }

        // bs_num_register holds the digits of a pending backslash
        // octal escape as a null-terminated string.
        size_t bs_num_count = strlen(this->m->bs_num_register);
        bool ch_is_octal = ((ch >= '0') && (ch <= '7'));
        if ((bs_num_count == 3) || ((bs_num_count > 0) && (! ch_is_octal)))
        {
            // We've accumulated \ddd. PDF Spec says to ignore
            // high-order overflow.
            this->m->val += static_cast<char>(
                strtol(this->m->bs_num_register, 0, 8));
            memset(this->m->bs_num_register, '\0',
                   sizeof(this->m->bs_num_register));
            bs_num_count = 0;
        }

        if (this->m->string_ignoring_newline && (ch == '\n'))
        {
            // ignore
            this->m->string_ignoring_newline = false;
        }
        else if (ch_is_octal &&
                 (this->m->last_char_was_bs || (bs_num_count > 0)))
        {
            this->m->bs_num_register[bs_num_count++] = ch;
        }
        else if (this->m->last_char_was_bs)
        {
            switch (ch)
            {
              case 'n':
                this->m->val += '\n';
                break;

              case 'r':
                this->m->val += '\r';
                break;

              case 't':
                this->m->val += '\t';
                break;

              case 'b':
                this->m->val += '\b';
                break;

              case 'f':
                this->m->val += '\f';
                break;

              case '\n':
                // Backslash-newline contributes nothing.
                break;

              case '\r':
                // Backslash-CR contributes nothing; also swallow a
                // directly following LF.
                this->m->string_ignoring_newline = true;
                break;

              default:
                // PDF spec says backslash is ignored before anything else
                this->m->val += ch;
                break;
            }
        }
        else if (ch == '\\')
        {
            // last_char_was_bs is set/cleared below as appropriate
            if (bs_num_count)
            {
                throw std::logic_error(
                    "INTERNAL ERROR: QPDFTokenizer: bs_num_count != 0 "
                    "when ch == '\\'");
            }
        }
        else if (ch == '(')
        {
            this->m->val += ch;
            ++this->m->string_depth;
        }
        else if ((ch == ')') && (--this->m->string_depth == 0))
        {
            // Closing parenthesis of the outermost level. A ')' that
            // only closes a nested level falls through to the final
            // else and is appended to val.
            this->m->type = tt_string;
            this->m->state = st_token_ready;
        }
        else if (ch == '\r')
        {
            // CR by itself is converted to LF
            this->m->val += '\n';
        }
        else if (ch == '\n')
        {
            // CR LF is converted to LF
            if (! this->m->last_char_was_cr)
            {
                this->m->val += ch;
            }
        }
        else
        {
            this->m->val += ch;
        }

        this->m->last_char_was_cr =
            ((! this->m->string_ignoring_newline) && (ch == '\r'));
        // A backslash that was itself escaped does not start a new
        // escape sequence.
        this->m->last_char_was_bs =
            ((! this->m->last_char_was_bs) && (ch == '\\'));
    }
    else if (this->m->state == st_literal)
    {
        if (isDelimiter(ch))
        {
            // A C-locale whitespace character or delimiter terminates
            // token. It is important to unread the whitespace
            // character even though it is ignored since it may be the
            // newline after a stream keyword. Removing it here could
            // make the stream-reading code break on some files,
            // though not on any files in the test suite as of this
            // writing.

            this->m->type = tt_word;
            this->m->unread_char = true;
            this->m->char_to_unread = ch;
            this->m->state = st_token_ready;
        }
        else
        {
            this->m->val += ch;
        }
    }
    else if (this->m->state == st_inline_image)
    {
        // The image data ends once the accumulated value ends with
        // <delimiter>EI<delimiter>. The character presented now
        // comes after that and is handed back.
        size_t len = this->m->val.length();
        if ((len >= 4) &&
            isDelimiter(this->m->val.at(len-4)) &&
            (this->m->val.at(len-3) == 'E') &&
            (this->m->val.at(len-2) == 'I') &&
            isDelimiter(this->m->val.at(len-1)))
        {
            this->m->type = tt_inline_image;
            this->m->unread_char = true;
            this->m->char_to_unread = ch;
            this->m->state = st_token_ready;
        }
        else
        {
            this->m->val += ch;
        }
    }
    else
    {
        handled = false;
    }

    if (handled)
    {
        // okay
    }
    else if (this->m->state == st_in_hexstring)
    {
        if (ch == '>')
        {
            this->m->type = tt_string;
            this->m->state = st_token_ready;
            if (this->m->val.length() % 2)
            {
                // PDF spec says odd hexstrings have implicit
                // trailing 0.
                this->m->val += '0';
            }
            char num[3];
            num[2] = '\0';
            std::string nval;
            // Index with size_t to match std::string::length(). A
            // narrower unsigned index would wrap around before
            // reaching the end of a sufficiently long value and the
            // loop would never finish.
            for (size_t i = 0; i < this->m->val.length(); i += 2)
            {
                num[0] = this->m->val.at(i);
                num[1] = this->m->val.at(i+1);
                char nch = static_cast<char>(strtol(num, 0, 16));
                nval += nch;
            }
            this->m->val = nval;
        }
        else if (QUtil::is_hex_digit(ch))
        {
            this->m->val += ch;
        }
        else if (isSpace(ch))
        {
            // ignore
        }
        else
        {
            this->m->type = tt_bad;
            QTC::TC("qpdf", "QPDFTokenizer bad hexstring character");
            this->m->error_message = std::string("invalid character (") +
                ch + ") in hexstring";
            this->m->state = st_token_ready;
        }
    }
    else
    {
        throw std::logic_error(
            "INTERNAL ERROR: invalid state while reading token");
    }

    if ((this->m->state == st_token_ready) && (this->m->type == tt_word))
    {
        resolveLiteral();
    }

    // Record the character in the raw token text unless we are
    // between tokens or the character is being handed back.
    if (! (betweenTokens() ||
           ((this->m->state == st_token_ready) && this->m->unread_char)))
    {
        this->m->raw_val += orig_ch;
    }
}

// Tell the tokenizer that there is no more input. This always
// leaves the tokenizer in st_token_ready: a pending literal is
// resolved, pending whitespace or a comment becomes a token when
// ignorable tokens are being reported, being between tokens yields
// tt_eof, and any other partially read token becomes tt_bad.
void
QPDFTokenizer::presentEOF()
{
    if (this->m->state == st_inline_image)
    {
        // At EOF there is no trailing delimiter to look for, so
        // accept image data that ends with <delimiter>EI.
        size_t len = this->m->val.length();
        if ((len >= 3) &&
            isDelimiter(this->m->val.at(len-3)) &&
            (this->m->val.at(len-2) == 'E') &&
            (this->m->val.at(len-1) == 'I'))
        {
            QTC::TC("qpdf", "QPDFTokenizer inline image at EOF");
            this->m->type = tt_inline_image;
            this->m->state = st_token_ready;
        }
    }

    if (this->m->state == st_literal)
    {
        QTC::TC("qpdf", "QPDFTokenizer EOF reading appendable token");
        resolveLiteral();
    }
    else if ((this->m->include_ignorable) &&
             (this->m->state == st_in_space))
    {
        this->m->type = tt_space;
    }
    else if ((this->m->include_ignorable) &&
             (this->m->state == st_in_comment))
    {
        this->m->type = tt_comment;
    }
    else if (betweenTokens())
    {
        this->m->type = tt_eof;
    }
    else if (this->m->state != st_token_ready)
    {
        QTC::TC("qpdf", "QPDFTokenizer EOF reading token");
        this->m->type = tt_bad;
        this->m->error_message = "EOF while reading token";
    }

    this->m->state = st_token_ready;
}

// Switch the tokenizer into inline image mode, in which characters
// are accumulated until the EI marker is seen. May only be called
// in the initial (st_top) state; otherwise throws std::logic_error.
void
QPDFTokenizer::expectInlineImage()
{
    if (this->m->state != st_top)
    {
        throw std::logic_error("QPDFTokenizer::expectInlineImage called"
                               " when tokenizer is in improper state");
    }
    this->m->state = st_inline_image;
}

// If a token is ready, store it in token, reset the tokenizer for
// the next token, and return true; otherwise return false and leave
// token alone. unread_char and ch are always filled in from the
// current state, before any reset. For a bad token, the value is
// replaced by the raw text that was read.
bool
QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch)
{
    bool ready = (this->m->state == st_token_ready);
    unread_char = this->m->unread_char;
    ch = this->m->char_to_unread;
    if (ready)
    {
        if (this->m->type == tt_bad)
        {
            this->m->val = this->m->raw_val;
        }
        token = Token(this->m->type, this->m->val,
                      this->m->raw_val, this->m->error_message);
        this->m->reset();
    }
    return ready;
}

// True when no token is in progress: either the tokenizer is in its
// initial state or, when ignorable tokens are not being reported,
// it is inside whitespace or a comment.
bool
QPDFTokenizer::betweenTokens()
{
    return ((this->m->state == st_top) ||
            ((! this->m->include_ignorable) &&
             ((this->m->state == st_in_comment) ||
              (this->m->state == st_in_space))));
}

// Read characters from input until one token is complete and return
// it.
//
// input:     source to read from; a character that ended the token
//            without belonging to it is pushed back with unreadCh().
// context:   passed through to QPDFExc when a bad token is rejected.
// allow_bad: if true, a tt_bad token is returned to the caller; if
//            false, a QPDFExc (qpdf_e_damaged_pdf) carrying the
//            token's error message is thrown instead.
// max_len:   if non-zero, a token whose raw text reaches this length
//            without completing is terminated as tt_bad.
//
// Unless the token is tt_eof, the input's last offset is set to the
// offset recorded for the token.
QPDFTokenizer::Token
QPDFTokenizer::readToken(PointerHolder<InputSource> input,
                         std::string const& context,
                         bool allow_bad,
                         size_t max_len)
{
    qpdf_offset_t offset = input->tell();
    Token token;
    bool unread_char;
    char char_to_unread;
    bool presented_eof = false;
    while (! getToken(token, unread_char, char_to_unread))
    {
        char ch;
        if (input->read(&ch, 1) == 0)
        {
            if (! presented_eof)
            {
                presentEOF();
                presented_eof = true;
                if ((this->m->type == tt_eof) && (! this->m->allow_eof))
                {
                    // Nothing in the qpdf library calls readToken
                    // without allowEOF anymore, so this case is not
                    // exercised.
                    this->m->type = tt_bad;
                    this->m->error_message = "unexpected EOF";
                    offset = input->getLastOffset();
                }
            }
            else
            {
                // presentEOF() always leaves a token ready, so
                // getting here means something is badly wrong.
                throw std::logic_error(
                    "getToken returned false after presenting EOF");
            }
        }
        else
        {
            presentCharacter(ch);
            // Move the recorded start offset forward while the
            // characters read so far are still between tokens.
            // NOTE(review): this presumably relies on
            // getLastOffset() being the offset of the character
            // just read -- confirm against InputSource.
            if (betweenTokens() && (input->getLastOffset() == offset))
            {
                ++offset;
            }
            if (max_len && (this->m->raw_val.length() >= max_len) &&
                (this->m->state != st_token_ready))
            {
                // terminate this token now
                QTC::TC("qpdf", "QPDFTokenizer block long token");
                this->m->type = tt_bad;
                this->m->state = st_token_ready;
                this->m->error_message =
                    "exceeded allowable length while reading token";
            }
        }
    }

    if (unread_char)
    {
        input->unreadCh(char_to_unread);
    }

    if (token.getType() != tt_eof)
    {
        input->setLastOffset(offset);
    }

    if (token.getType() == tt_bad)
    {
        if (allow_bad)
        {
            QTC::TC("qpdf", "QPDFTokenizer allowing bad token");
        }
        else
        {
            throw QPDFExc(qpdf_e_damaged_pdf, input->getName(),
                          context, offset, token.getErrorMessage());
        }
    }

    return token;
}