// Copyright (c) 2005-2017 Jay Berkenbilt // // This file is part of qpdf. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // // Versions of qpdf prior to version 7 were released under the terms // of version 2.0 of the Artistic License. At your option, you may // continue to consider qpdf to be licensed under those terms. Please // see the manual for additional information. #ifndef __QPDFTOKENIZER_HH__ #define __QPDFTOKENIZER_HH__ #include #include #include #include #include class QPDFTokenizer { public: // Token type tt_eof is only returned of allowEOF() is called on // the tokenizer. tt_eof was introduced in QPDF version 4.1. enum token_type_e { tt_bad, tt_array_close, tt_array_open, tt_brace_close, tt_brace_open, tt_dict_close, tt_dict_open, tt_integer, tt_name, tt_real, tt_string, tt_null, tt_bool, tt_word, tt_eof, }; class Token { public: Token() : type(tt_bad) {} Token(token_type_e type, std::string const& value) : type(type), value(value) { } Token(token_type_e type, std::string const& value, std::string raw_value, std::string error_message) : type(type), value(value), raw_value(raw_value), error_message(error_message) { } token_type_e getType() const { return this->type; } std::string const& getValue() const { return this->value; } std::string const& getRawValue() const { return this->raw_value; } std::string const& getErrorMessage() const { return this->error_message; } bool operator==(Token const& rhs) { // Ignore fields other than type and value return ((this->type != tt_bad) && (this->type == rhs.type) && (this->value == rhs.value)); } private: token_type_e type; std::string value; std::string raw_value; std::string error_message; }; QPDF_DLL QPDFTokenizer(); // PDF files with version < 1.2 allowed the pound character // anywhere in a name. Starting with version 1.2, the pound // character was allowed only when followed by two hexadecimal // digits. This method should be called when parsing a PDF file // whose version is older than 1.2. QPDF_DLL void allowPoundAnywhereInName(); // If called, treat EOF as a separate token type instead of an // error. This was introduced in QPDF 4.1 to facilitate // tokenizing content streams. QPDF_DLL void allowEOF(); // Mode of operation: // Keep presenting characters and calling getToken() until // getToken() returns true. When it does, be sure to check // unread_ch and to unread ch if it is true. // It these are called when a token is available, an exception // will be thrown. QPDF_DLL void presentCharacter(char ch); QPDF_DLL void presentEOF(); // If a token is available, return true and initialize token with // the token, unread_char with whether or not we have to unread // the last character, and if unread_char, ch with the character // to unread. QPDF_DLL bool getToken(Token& token, bool& unread_char, char& ch); // This function returns true of the current character is between // tokens (i.e., white space that is not part of a string) or is // part of a comment. A tokenizing filter can call this to // determine whether to output the character. QPDF_DLL bool betweenTokens(); // Read a token from an input source. Context describes the // context in which the token is being read and is used in the // exception thrown if there is an error. QPDF_DLL Token readToken(PointerHolder input, std::string const& context, bool allow_bad = false, size_t max_len = 0); private: void reset(); void resolveLiteral(); // Lexer state enum { st_top, st_in_comment, st_in_string, st_lt, st_gt, st_literal, st_in_hexstring, st_token_ready } state; bool pound_special_in_name; bool allow_eof; // Current token accumulation token_type_e type; std::string val; std::string raw_val; std::string error_message; bool unread_char; char char_to_unread; // State for strings int string_depth; bool string_ignoring_newline; char bs_num_register[4]; bool last_char_was_bs; }; #endif // __QPDFTOKENIZER_HH__