mirror of
https://github.com/qpdf/qpdf.git
synced 2025-01-05 08:02:11 +00:00
318 lines
9.6 KiB
C++
318 lines
9.6 KiB
C++
// Copyright (c) 2005-2023 Jay Berkenbilt
|
|
//
|
|
// This file is part of qpdf.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
|
|
// in compliance with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software distributed under the License
|
|
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
|
// or implied. See the License for the specific language governing permissions and limitations under
|
|
// the License.
|
|
//
|
|
// Versions of qpdf prior to version 7 were released under the terms of version 2.0 of the Artistic
|
|
// License. At your option, you may continue to consider qpdf to be licensed under those terms.
|
|
// Please see the manual for additional information.
|
|
|
|
#ifndef QPDFTOKENIZER_HH
|
|
#define QPDFTOKENIZER_HH
|
|
|
|
#include <qpdf/DLL.h>

#include <qpdf/InputSource.hh>
#include <qpdf/PointerHolder.hh> // unused -- remove in qpdf 12 (see #785)

#include <cstdio>
#include <memory>
#include <string>
#include <utility>
|
|
|
|
class QPDFTokenizer
|
|
{
|
|
public:
|
|
// Token type tt_eof is only returned of allowEOF() is called on the tokenizer. tt_eof was
|
|
// introduced in QPDF version 4.1. tt_space, tt_comment, and tt_inline_image were added in QPDF
|
|
// version 8.
|
|
enum token_type_e {
|
|
tt_bad,
|
|
tt_array_close,
|
|
tt_array_open,
|
|
tt_brace_close,
|
|
tt_brace_open,
|
|
tt_dict_close,
|
|
tt_dict_open,
|
|
tt_integer,
|
|
tt_name,
|
|
tt_real,
|
|
tt_string,
|
|
tt_null,
|
|
tt_bool,
|
|
tt_word,
|
|
tt_eof,
|
|
tt_space,
|
|
tt_comment,
|
|
tt_inline_image,
|
|
};
|
|
|
|
class Token
|
|
{
|
|
public:
|
|
Token() :
|
|
type(tt_bad)
|
|
{
|
|
}
|
|
QPDF_DLL
|
|
Token(token_type_e type, std::string const& value);
|
|
Token(
|
|
token_type_e type,
|
|
std::string const& value,
|
|
std::string raw_value,
|
|
std::string error_message) :
|
|
type(type),
|
|
value(value),
|
|
raw_value(raw_value),
|
|
error_message(error_message)
|
|
{
|
|
}
|
|
token_type_e
|
|
getType() const
|
|
{
|
|
return this->type;
|
|
}
|
|
std::string const&
|
|
getValue() const
|
|
{
|
|
return this->value;
|
|
}
|
|
std::string const&
|
|
getRawValue() const
|
|
{
|
|
return this->raw_value;
|
|
}
|
|
std::string const&
|
|
getErrorMessage() const
|
|
{
|
|
return this->error_message;
|
|
}
|
|
bool
|
|
operator==(Token const& rhs) const
|
|
{
|
|
// Ignore fields other than type and value
|
|
return (
|
|
(this->type != tt_bad) && (this->type == rhs.type) && (this->value == rhs.value));
|
|
}
|
|
bool
|
|
isInteger() const
|
|
{
|
|
return this->type == tt_integer;
|
|
}
|
|
bool
|
|
isWord() const
|
|
{
|
|
return this->type == tt_word;
|
|
}
|
|
bool
|
|
isWord(std::string const& value) const
|
|
{
|
|
return this->type == tt_word && this->value == value;
|
|
}
|
|
|
|
private:
|
|
token_type_e type;
|
|
std::string value;
|
|
std::string raw_value;
|
|
std::string error_message;
|
|
};
|
|
|
|
QPDF_DLL
|
|
QPDFTokenizer();
|
|
|
|
// If called, treat EOF as a separate token type instead of an error. This was introduced in
|
|
// QPDF 4.1 to facilitate tokenizing content streams.
|
|
QPDF_DLL
|
|
void allowEOF();
|
|
|
|
// If called, readToken will return "ignorable" tokens for space and comments. This was added in
|
|
// QPDF 8.
|
|
QPDF_DLL
|
|
void includeIgnorable();
|
|
|
|
// There are two modes of operation: push and pull. The pull method is easier but requires an
|
|
// input source. The push method is more complicated but can be used to tokenize a stream of
|
|
// incoming characters in a pipeline.
|
|
|
|
// Push mode:
|
|
|
|
// Keep presenting characters with presentCharacter() and presentEOF() and calling getToken()
|
|
// until getToken() returns true. When it does, be sure to check unread_ch and to unread ch if
|
|
// it is true.
|
|
|
|
// It these are called when a token is available, an exception will be thrown.
|
|
QPDF_DLL
|
|
void presentCharacter(char ch);
|
|
QPDF_DLL
|
|
void presentEOF();
|
|
|
|
// If a token is available, return true and initialize token with the token, unread_char with
|
|
// whether or not we have to unread the last character, and if unread_char, ch with the
|
|
// character to unread.
|
|
QPDF_DLL
|
|
bool getToken(Token& token, bool& unread_char, char& ch);
|
|
|
|
// This function returns true of the current character is between tokens (i.e., white space that
|
|
// is not part of a string) or is part of a comment. A tokenizing filter can call this to
|
|
// determine whether to output the character.
|
|
QPDF_DLL
|
|
bool betweenTokens();
|
|
|
|
// Pull mode:
|
|
|
|
// Read a token from an input source. Context describes the context in which the token is being
|
|
// read and is used in the exception thrown if there is an error. After a token is read, the
|
|
// position of the input source returned by input->tell() points to just after the token, and
|
|
// the input source's "last offset" as returned by input->getLastOffset() points to the
|
|
// beginning of the token.
|
|
QPDF_DLL
|
|
Token readToken(
|
|
InputSource& input, std::string const& context, bool allow_bad = false, size_t max_len = 0);
|
|
QPDF_DLL
|
|
Token readToken(
|
|
std::shared_ptr<InputSource> input,
|
|
std::string const& context,
|
|
bool allow_bad = false,
|
|
size_t max_len = 0);
|
|
|
|
// Calling this method puts the tokenizer in a state for reading inline images. You should call
|
|
// this method after reading the character following the ID operator. In that state, it will
|
|
// return all data up to BUT NOT INCLUDING the next EI token. After you call this method, the
|
|
// next call to readToken (or the token created next time getToken returns true) will either be
|
|
// tt_inline_image or tt_bad. This is the only way readToken
|
|
// returns a tt_inline_image token.
|
|
QPDF_DLL
|
|
void expectInlineImage(std::shared_ptr<InputSource> input);
|
|
|
|
private:
|
|
friend class QPDFParser;
|
|
|
|
// Read a token from an input source. Context describes the context in which the token is being
|
|
// read and is used in the exception thrown if there is an error. After a token is read, the
|
|
// position of the input source returned by input->tell() points to just after the token, and
|
|
// the input source's "last offset" as returned by input->getLastOffset() points to the
|
|
// beginning of the token. Returns false if the token is bad or if scanning produced an error
|
|
// message for any reason.
|
|
|
|
bool nextToken(InputSource& input, std::string const& context, size_t max_len = 0);
|
|
|
|
// The following methods are only valid after nextToken has been called and until another
|
|
// QPDFTokenizer method is called. They allow the results of calling nextToken to be accessed
|
|
// without creating a Token, thus avoiding copying information that may not be needed.
|
|
inline token_type_e getType() const noexcept;
|
|
inline std::string const& getValue() const noexcept;
|
|
inline std::string const& getRawValue() const noexcept;
|
|
inline std::string const& getErrorMessage() const noexcept;
|
|
|
|
QPDFTokenizer(QPDFTokenizer const&) = delete;
|
|
QPDFTokenizer& operator=(QPDFTokenizer const&) = delete;
|
|
|
|
bool isSpace(char);
|
|
bool isDelimiter(char);
|
|
void findEI(std::shared_ptr<InputSource> input);
|
|
|
|
enum state_e {
|
|
st_top,
|
|
st_in_hexstring,
|
|
st_in_string,
|
|
st_in_hexstring_2nd,
|
|
st_name,
|
|
st_literal,
|
|
st_in_space,
|
|
st_in_comment,
|
|
st_string_escape,
|
|
st_char_code,
|
|
st_string_after_cr,
|
|
st_lt,
|
|
st_gt,
|
|
st_inline_image,
|
|
st_sign,
|
|
st_number,
|
|
st_real,
|
|
st_decimal,
|
|
st_name_hex1,
|
|
st_name_hex2,
|
|
st_before_token,
|
|
st_token_ready
|
|
};
|
|
|
|
void handleCharacter(char);
|
|
void inBeforeToken(char);
|
|
void inTop(char);
|
|
void inSpace(char);
|
|
void inComment(char);
|
|
void inString(char);
|
|
void inName(char);
|
|
void inLt(char);
|
|
void inGt(char);
|
|
void inStringAfterCR(char);
|
|
void inStringEscape(char);
|
|
void inLiteral(char);
|
|
void inCharCode(char);
|
|
void inHexstring(char);
|
|
void inHexstring2nd(char);
|
|
void inInlineImage(char);
|
|
void inTokenReady(char);
|
|
void inNameHex1(char);
|
|
void inNameHex2(char);
|
|
void inSign(char);
|
|
void inDecimal(char);
|
|
void inNumber(char);
|
|
void inReal(char);
|
|
void reset();
|
|
|
|
// Lexer state
|
|
state_e state;
|
|
|
|
bool allow_eof;
|
|
bool include_ignorable;
|
|
|
|
// Current token accumulation
|
|
token_type_e type;
|
|
std::string val;
|
|
std::string raw_val;
|
|
std::string error_message;
|
|
bool before_token;
|
|
bool in_token;
|
|
char char_to_unread;
|
|
size_t inline_image_bytes;
|
|
bool bad;
|
|
|
|
// State for strings
|
|
int string_depth;
|
|
int char_code;
|
|
char hex_char;
|
|
int digit_count;
|
|
};
|
|
|
|
// Accessor for the tokenizer's current token type (the `type` member). As noted on the
// declaration, this is only meaningful after nextToken() has been called.
inline QPDFTokenizer::token_type_e
QPDFTokenizer::getType() const noexcept
{
    return type;
}
|
|
inline std::string const&
|
|
QPDFTokenizer::getValue() const noexcept
|
|
{
|
|
return (this->type == tt_name || this->type == tt_string) ? this->val : this->raw_val;
|
|
}
|
|
inline std::string const&
|
|
QPDFTokenizer::getRawValue() const noexcept
|
|
{
|
|
return this->raw_val;
|
|
}
|
|
inline std::string const&
|
|
QPDFTokenizer::getErrorMessage() const noexcept
|
|
{
|
|
return this->error_message;
|
|
}
|
|
|
|
#endif // QPDFTOKENIZER_HH
|