// Copyright (c) 2005-2024 Jay Berkenbilt // // This file is part of qpdf. // // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except // in compliance with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software distributed under the License // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express // or implied. See the License for the specific language governing permissions and limitations under // the License. // // Versions of qpdf prior to version 7 were released under the terms of version 2.0 of the Artistic // License. At your option, you may continue to consider qpdf to be licensed under those terms. // Please see the manual for additional information. #ifndef QPDFTOKENIZER_HH #define QPDFTOKENIZER_HH #include #include #include // unused -- remove in qpdf 12 (see #785) #include #include #include class QPDFTokenizer { public: // Token type tt_eof is only returned of allowEOF() is called on the tokenizer. tt_eof was // introduced in QPDF version 4.1. tt_space, tt_comment, and tt_inline_image were added in QPDF // version 8. enum token_type_e { tt_bad, tt_array_close, tt_array_open, tt_brace_close, tt_brace_open, tt_dict_close, tt_dict_open, tt_integer, tt_name, tt_real, tt_string, tt_null, tt_bool, tt_word, tt_eof, tt_space, tt_comment, tt_inline_image, }; class Token { public: Token() : type(tt_bad) { } QPDF_DLL Token(token_type_e type, std::string const& value); Token( token_type_e type, std::string const& value, std::string raw_value, std::string error_message) : type(type), value(value), raw_value(raw_value), error_message(error_message) { } token_type_e getType() const { return this->type; } std::string const& getValue() const { return this->value; } std::string const& getRawValue() const { return this->raw_value; } std::string const& getErrorMessage() const { return this->error_message; } bool operator==(Token const& rhs) const { // Ignore fields other than type and value return ( (this->type != tt_bad) && (this->type == rhs.type) && (this->value == rhs.value)); } bool isInteger() const { return this->type == tt_integer; } bool isWord() const { return this->type == tt_word; } bool isWord(std::string const& value) const { return this->type == tt_word && this->value == value; } private: token_type_e type; std::string value; std::string raw_value; std::string error_message; }; QPDF_DLL QPDFTokenizer(); // If called, treat EOF as a separate token type instead of an error. This was introduced in // QPDF 4.1 to facilitate tokenizing content streams. QPDF_DLL void allowEOF(); // If called, readToken will return "ignorable" tokens for space and comments. This was added in // QPDF 8. QPDF_DLL void includeIgnorable(); // There are two modes of operation: push and pull. The pull method is easier but requires an // input source. The push method is more complicated but can be used to tokenize a stream of // incoming characters in a pipeline. // Push mode: // Keep presenting characters with presentCharacter() and presentEOF() and calling getToken() // until getToken() returns true. When it does, be sure to check unread_ch and to unread ch if // it is true. // It these are called when a token is available, an exception will be thrown. QPDF_DLL void presentCharacter(char ch); QPDF_DLL void presentEOF(); // If a token is available, return true and initialize token with the token, unread_char with // whether or not we have to unread the last character, and if unread_char, ch with the // character to unread. QPDF_DLL bool getToken(Token& token, bool& unread_char, char& ch); // This function returns true of the current character is between tokens (i.e., white space that // is not part of a string) or is part of a comment. A tokenizing filter can call this to // determine whether to output the character. QPDF_DLL bool betweenTokens(); // Pull mode: // Read a token from an input source. Context describes the context in which the token is being // read and is used in the exception thrown if there is an error. After a token is read, the // position of the input source returned by input->tell() points to just after the token, and // the input source's "last offset" as returned by input->getLastOffset() points to the // beginning of the token. QPDF_DLL Token readToken( InputSource& input, std::string const& context, bool allow_bad = false, size_t max_len = 0); QPDF_DLL Token readToken( std::shared_ptr input, std::string const& context, bool allow_bad = false, size_t max_len = 0); // Calling this method puts the tokenizer in a state for reading inline images. You should call // this method after reading the character following the ID operator. In that state, it will // return all data up to BUT NOT INCLUDING the next EI token. After you call this method, the // next call to readToken (or the token created next time getToken returns true) will either be // tt_inline_image or tt_bad. This is the only way readToken // returns a tt_inline_image token. QPDF_DLL void expectInlineImage(std::shared_ptr input); private: friend class QPDFParser; // Read a token from an input source. Context describes the context in which the token is being // read and is used in the exception thrown if there is an error. After a token is read, the // position of the input source returned by input->tell() points to just after the token, and // the input source's "last offset" as returned by input->getLastOffset() points to the // beginning of the token. Returns false if the token is bad or if scanning produced an error // message for any reason. bool nextToken(InputSource& input, std::string const& context, size_t max_len = 0); // The following methods are only valid after nextToken has been called and until another // QPDFTokenizer method is called. They allow the results of calling nextToken to be accessed // without creating a Token, thus avoiding copying information that may not be needed. inline token_type_e getType() const noexcept; inline std::string const& getValue() const noexcept; inline std::string const& getRawValue() const noexcept; inline std::string const& getErrorMessage() const noexcept; QPDFTokenizer(QPDFTokenizer const&) = delete; QPDFTokenizer& operator=(QPDFTokenizer const&) = delete; bool isSpace(char); bool isDelimiter(char); void findEI(std::shared_ptr input); enum state_e { st_top, st_in_hexstring, st_in_string, st_in_hexstring_2nd, st_name, st_literal, st_in_space, st_in_comment, st_string_escape, st_char_code, st_string_after_cr, st_lt, st_gt, st_inline_image, st_sign, st_number, st_real, st_decimal, st_name_hex1, st_name_hex2, st_before_token, st_token_ready }; void handleCharacter(char); void inBeforeToken(char); void inTop(char); void inSpace(char); void inComment(char); void inString(char); void inName(char); void inLt(char); void inGt(char); void inStringAfterCR(char); void inStringEscape(char); void inLiteral(char); void inCharCode(char); void inHexstring(char); void inHexstring2nd(char); void inInlineImage(char); void inTokenReady(char); void inNameHex1(char); void inNameHex2(char); void inSign(char); void inDecimal(char); void inNumber(char); void inReal(char); void reset(); // Lexer state state_e state; bool allow_eof; bool include_ignorable; // Current token accumulation token_type_e type; std::string val; std::string raw_val; std::string error_message; bool before_token; bool in_token; char char_to_unread; size_t inline_image_bytes; bool bad; // State for strings int string_depth; int char_code; char hex_char; int digit_count; }; inline QPDFTokenizer::token_type_e QPDFTokenizer::getType() const noexcept { return this->type; } inline std::string const& QPDFTokenizer::getValue() const noexcept { return (this->type == tt_name || this->type == tt_string) ? this->val : this->raw_val; } inline std::string const& QPDFTokenizer::getRawValue() const noexcept { return this->raw_val; } inline std::string const& QPDFTokenizer::getErrorMessage() const noexcept { return this->error_message; } #endif // QPDFTOKENIZER_HH