2008-04-29 12:55:25 +00:00
|
|
|
#include <qpdf/QPDFTokenizer.hh>
|
|
|
|
|
|
|
|
// DO NOT USE ctype -- it is locale dependent for some things, and
|
|
|
|
// it's not worth the risk of including it in case it may accidentally
|
|
|
|
// be used.
|
|
|
|
|
2022-04-02 21:14:10 +00:00
|
|
|
#include <qpdf/QIntC.hh>
|
2012-07-21 09:48:13 +00:00
|
|
|
#include <qpdf/QPDFExc.hh>
|
2018-02-02 23:21:34 +00:00
|
|
|
#include <qpdf/QPDFObjectHandle.hh>
|
2022-04-02 21:14:10 +00:00
|
|
|
#include <qpdf/QTC.hh>
|
|
|
|
#include <qpdf/QUtil.hh>
|
2008-04-29 12:55:25 +00:00
|
|
|
|
2009-09-26 18:36:04 +00:00
|
|
|
#include <stdexcept>
|
2019-03-12 14:05:29 +00:00
|
|
|
#include <stdlib.h>
|
2008-05-04 16:02:53 +00:00
|
|
|
#include <string.h>
|
|
|
|
|
2022-04-02 21:14:10 +00:00
|
|
|
// Return true if ch is one of the characters this tokenizer treats as
// ending a token: the whitespace characters space, tab, newline,
// vertical tab, form feed and carriage return, the punctuation
// ( ) < > [ ] { } / %, or the null character.
static bool
is_delimiter(char ch)
{
    switch (ch) {
    case '\0': // the null character counts as a delimiter as well
    case ' ':
    case '\t':
    case '\n':
    case '\v':
    case '\f':
    case '\r':
    case '(':
    case ')':
    case '<':
    case '>':
    case '[':
    case ']':
    case '{':
    case '}':
    case '/':
    case '%':
        return true;
    default:
        return false;
    }
}
|
|
|
|
|
2022-04-16 17:21:57 +00:00
|
|
|
namespace
|
2019-01-30 19:20:56 +00:00
|
|
|
{
|
2022-04-16 17:21:57 +00:00
|
|
|
class QPDFWordTokenFinder: public InputSource::Finder
|
2019-01-30 19:20:56 +00:00
|
|
|
{
|
2022-04-16 17:21:57 +00:00
|
|
|
public:
|
|
|
|
QPDFWordTokenFinder(
|
|
|
|
std::shared_ptr<InputSource> is, std::string const& str) :
|
|
|
|
is(is),
|
|
|
|
str(str)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
virtual ~QPDFWordTokenFinder() = default;
|
|
|
|
virtual bool check();
|
2019-01-30 19:20:56 +00:00
|
|
|
|
2022-04-16 17:21:57 +00:00
|
|
|
private:
|
|
|
|
std::shared_ptr<InputSource> is;
|
|
|
|
std::string str;
|
|
|
|
};
|
|
|
|
} // namespace
|
2019-01-30 19:20:56 +00:00
|
|
|
|
|
|
|
bool
|
|
|
|
QPDFWordTokenFinder::check()
|
|
|
|
{
|
|
|
|
// Find a word token matching the given string, preceded by a
|
|
|
|
// delimiter, and followed by a delimiter or EOF.
|
|
|
|
QPDFTokenizer tokenizer;
|
|
|
|
QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true);
|
|
|
|
qpdf_offset_t pos = is->tell();
|
2022-04-02 21:14:10 +00:00
|
|
|
if (!(t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str))) {
|
2019-01-30 19:26:08 +00:00
|
|
|
QTC::TC("qpdf", "QPDFTokenizer finder found wrong word");
|
2019-01-30 19:20:56 +00:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
qpdf_offset_t token_start = is->getLastOffset();
|
|
|
|
char next;
|
|
|
|
bool next_okay = false;
|
2022-04-02 21:14:10 +00:00
|
|
|
if (is->read(&next, 1) == 0) {
|
2019-01-30 19:20:56 +00:00
|
|
|
QTC::TC("qpdf", "QPDFTokenizer inline image at EOF");
|
|
|
|
next_okay = true;
|
2022-04-02 21:14:10 +00:00
|
|
|
} else {
|
2019-01-30 19:20:56 +00:00
|
|
|
next_okay = is_delimiter(next);
|
|
|
|
}
|
|
|
|
is->seek(pos, SEEK_SET);
|
2022-04-02 21:14:10 +00:00
|
|
|
if (!next_okay) {
|
2019-01-30 19:20:56 +00:00
|
|
|
return false;
|
|
|
|
}
|
2022-04-02 21:14:10 +00:00
|
|
|
if (token_start == 0) {
|
2019-01-30 19:20:56 +00:00
|
|
|
// Can't actually happen...we never start the search at the
|
|
|
|
// beginning of the input.
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2018-01-30 01:00:06 +00:00
|
|
|
QPDFTokenizer::Members::Members() :
|
2018-01-28 23:28:45 +00:00
|
|
|
allow_eof(false),
|
|
|
|
include_ignorable(false)
|
2008-04-29 12:55:25 +00:00
|
|
|
{
|
|
|
|
reset();
|
|
|
|
}
|
|
|
|
|
2018-01-30 01:00:06 +00:00
|
|
|
void
|
|
|
|
QPDFTokenizer::Members::reset()
|
|
|
|
{
|
|
|
|
state = st_top;
|
|
|
|
type = tt_bad;
|
|
|
|
val = "";
|
|
|
|
raw_val = "";
|
|
|
|
error_message = "";
|
|
|
|
unread_char = false;
|
|
|
|
char_to_unread = '\0';
|
2019-01-30 19:20:56 +00:00
|
|
|
inline_image_bytes = 0;
|
2018-01-30 01:00:06 +00:00
|
|
|
string_depth = 0;
|
|
|
|
string_ignoring_newline = false;
|
|
|
|
last_char_was_bs = false;
|
2018-08-05 22:59:41 +00:00
|
|
|
last_char_was_cr = false;
|
2018-01-30 01:00:06 +00:00
|
|
|
}
|
|
|
|
|
2018-02-02 23:21:34 +00:00
|
|
|
// Build a token from a type and a value. The raw value starts out as
// a copy of the value; for string and name tokens it is replaced by
// the unparsed form of the corresponding QPDFObjectHandle.
QPDFTokenizer::Token::Token(token_type_e type, std::string const& value) :
    type(type),
    value(value),
    raw_value(value)
{
    switch (type) {
    case tt_string:
        raw_value = QPDFObjectHandle::newString(value).unparse();
        break;
    case tt_name:
        raw_value = QPDFObjectHandle::newName(value).unparse();
        break;
    default:
        // All other token types keep the value as their raw value.
        break;
    }
}
|
|
|
|
|
2018-01-30 01:00:06 +00:00
|
|
|
// Create a tokenizer with freshly constructed internal state.
QPDFTokenizer::QPDFTokenizer() :
    m(new Members)
{
}
|
|
|
|
|
2013-01-20 20:26:45 +00:00
|
|
|
void
|
|
|
|
QPDFTokenizer::allowEOF()
|
|
|
|
{
|
2018-01-30 01:00:06 +00:00
|
|
|
this->m->allow_eof = true;
|
2013-01-20 20:26:45 +00:00
|
|
|
}
|
|
|
|
|
2018-01-28 23:28:45 +00:00
|
|
|
void
|
|
|
|
QPDFTokenizer::includeIgnorable()
|
|
|
|
{
|
2018-01-30 01:00:06 +00:00
|
|
|
this->m->include_ignorable = true;
|
2018-01-28 23:28:45 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
bool
|
|
|
|
QPDFTokenizer::isSpace(char ch)
|
|
|
|
{
|
|
|
|
return ((ch == '\0') || QUtil::is_space(ch));
|
|
|
|
}
|
|
|
|
|
2018-01-30 01:57:04 +00:00
|
|
|
bool
|
|
|
|
QPDFTokenizer::isDelimiter(char ch)
|
|
|
|
{
|
2019-01-30 19:20:56 +00:00
|
|
|
return is_delimiter(ch);
|
2018-01-30 01:57:04 +00:00
|
|
|
}
|
|
|
|
|
2008-04-29 12:55:25 +00:00
|
|
|
void
|
2012-08-11 13:22:59 +00:00
|
|
|
QPDFTokenizer::resolveLiteral()
|
2008-04-29 12:55:25 +00:00
|
|
|
{
|
2022-04-02 21:14:10 +00:00
|
|
|
if ((this->m->val.length() > 0) && (this->m->val.at(0) == '/')) {
|
2018-01-30 01:00:06 +00:00
|
|
|
this->m->type = tt_name;
|
2012-08-11 13:22:59 +00:00
|
|
|
// Deal with # in name token. Note: '/' by itself is a
|
|
|
|
// valid name, so don't strip leading /. That way we
|
|
|
|
// don't have to deal with the empty string as a name.
|
|
|
|
std::string nval = "/";
|
2019-08-19 01:26:19 +00:00
|
|
|
size_t len = this->m->val.length();
|
2022-04-02 21:14:10 +00:00
|
|
|
for (size_t i = 1; i < len; ++i) {
|
2019-08-19 01:26:19 +00:00
|
|
|
char ch = this->m->val.at(i);
|
2022-04-02 21:14:10 +00:00
|
|
|
if (ch == '#') {
|
2019-08-19 01:26:19 +00:00
|
|
|
if ((i + 2 < len) &&
|
2022-04-02 21:14:10 +00:00
|
|
|
QUtil::is_hex_digit(this->m->val.at(i + 1)) &&
|
|
|
|
QUtil::is_hex_digit(this->m->val.at(i + 2))) {
|
2012-08-11 13:22:59 +00:00
|
|
|
char num[3];
|
2022-04-02 21:14:10 +00:00
|
|
|
num[0] = this->m->val.at(i + 1);
|
|
|
|
num[1] = this->m->val.at(i + 2);
|
2012-08-11 13:22:59 +00:00
|
|
|
num[2] = '\0';
|
2022-07-26 11:37:50 +00:00
|
|
|
char ch2 = static_cast<char>(strtol(num, nullptr, 16));
|
2022-04-02 21:14:10 +00:00
|
|
|
if (ch2 == '\0') {
|
2018-01-30 01:00:06 +00:00
|
|
|
this->m->type = tt_bad;
|
2018-01-30 01:57:04 +00:00
|
|
|
QTC::TC("qpdf", "QPDFTokenizer null in name");
|
2018-01-30 01:00:06 +00:00
|
|
|
this->m->error_message =
|
2012-08-11 13:22:59 +00:00
|
|
|
"null character not allowed in name token";
|
|
|
|
nval += "#00";
|
2022-04-02 21:14:10 +00:00
|
|
|
} else {
|
2020-04-16 15:43:37 +00:00
|
|
|
nval.append(1, ch2);
|
2012-08-11 13:22:59 +00:00
|
|
|
}
|
2019-08-19 01:26:19 +00:00
|
|
|
i += 2;
|
2022-04-02 21:14:10 +00:00
|
|
|
} else {
|
2018-01-30 01:57:04 +00:00
|
|
|
QTC::TC("qpdf", "QPDFTokenizer bad name");
|
2019-08-19 22:34:14 +00:00
|
|
|
this->m->error_message =
|
|
|
|
"name with stray # will not work with PDF >= 1.2";
|
2019-08-19 01:26:19 +00:00
|
|
|
// Use null to encode a bad # -- this is reversed
|
|
|
|
// in QPDF_Name::normalizeName.
|
|
|
|
nval += '\0';
|
2012-08-11 13:22:59 +00:00
|
|
|
}
|
2022-04-02 21:14:10 +00:00
|
|
|
} else {
|
2019-08-19 01:26:19 +00:00
|
|
|
nval.append(1, ch);
|
2012-08-11 13:22:59 +00:00
|
|
|
}
|
|
|
|
}
|
2018-01-30 01:00:06 +00:00
|
|
|
this->m->val = nval;
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if (QUtil::is_number(this->m->val.c_str())) {
|
|
|
|
if (this->m->val.find('.') != std::string::npos) {
|
2018-01-30 01:00:06 +00:00
|
|
|
this->m->type = tt_real;
|
2022-04-02 21:14:10 +00:00
|
|
|
} else {
|
2018-01-30 01:00:06 +00:00
|
|
|
this->m->type = tt_integer;
|
2012-08-11 13:22:59 +00:00
|
|
|
}
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if ((this->m->val == "true") || (this->m->val == "false")) {
|
2018-01-30 01:00:06 +00:00
|
|
|
this->m->type = tt_bool;
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if (this->m->val == "null") {
|
2018-01-30 01:00:06 +00:00
|
|
|
this->m->type = tt_null;
|
2022-04-02 21:14:10 +00:00
|
|
|
} else {
|
2012-08-11 13:22:59 +00:00
|
|
|
// I don't really know what it is, so leave it as tt_word.
|
|
|
|
// Lots of cases ($, #, etc.) other than actual words fall
|
|
|
|
// into this category, but that's okay at least for now.
|
2018-01-30 01:00:06 +00:00
|
|
|
this->m->type = tt_word;
|
2012-08-11 13:22:59 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
QPDFTokenizer::presentCharacter(char ch)
|
|
|
|
{
|
2022-04-02 21:14:10 +00:00
|
|
|
if (this->m->state == st_token_ready) {
|
2022-02-08 14:18:08 +00:00
|
|
|
throw std::logic_error(
|
|
|
|
"INTERNAL ERROR: QPDF tokenizer presented character "
|
|
|
|
"while token is waiting");
|
2008-04-29 12:55:25 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
char orig_ch = ch;
|
|
|
|
|
|
|
|
// State machine is implemented such that some characters may be
|
|
|
|
// handled more than once. This happens whenever you have to use
|
|
|
|
// the character that caused a state change in the new state.
|
|
|
|
|
|
|
|
bool handled = true;
|
2022-04-02 21:14:10 +00:00
|
|
|
if (this->m->state == st_top) {
|
2022-02-08 14:18:08 +00:00
|
|
|
// Note: we specifically do not use ctype here. It is
|
|
|
|
// locale-dependent.
|
2022-04-02 21:14:10 +00:00
|
|
|
if (isSpace(ch)) {
|
|
|
|
if (this->m->include_ignorable) {
|
2018-01-30 01:00:06 +00:00
|
|
|
this->m->state = st_in_space;
|
|
|
|
this->m->val += ch;
|
2018-01-28 23:28:45 +00:00
|
|
|
}
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if (ch == '%') {
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->state = st_in_comment;
|
2022-04-02 21:14:10 +00:00
|
|
|
if (this->m->include_ignorable) {
|
2018-01-30 01:00:06 +00:00
|
|
|
this->m->val += ch;
|
2018-01-28 23:28:45 +00:00
|
|
|
}
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if (ch == '(') {
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->string_depth = 1;
|
|
|
|
this->m->string_ignoring_newline = false;
|
2022-04-02 21:14:10 +00:00
|
|
|
memset(
|
|
|
|
this->m->bs_num_register,
|
|
|
|
'\0',
|
|
|
|
sizeof(this->m->bs_num_register));
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->last_char_was_bs = false;
|
|
|
|
this->m->last_char_was_cr = false;
|
|
|
|
this->m->state = st_in_string;
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if (ch == '<') {
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->state = st_lt;
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if (ch == '>') {
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->state = st_gt;
|
2022-04-02 21:14:10 +00:00
|
|
|
} else {
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->val += ch;
|
2022-04-02 21:14:10 +00:00
|
|
|
if (ch == ')') {
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->type = tt_bad;
|
|
|
|
QTC::TC("qpdf", "QPDFTokenizer bad )");
|
|
|
|
this->m->error_message = "unexpected )";
|
|
|
|
this->m->state = st_token_ready;
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if (ch == '[') {
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->type = tt_array_open;
|
|
|
|
this->m->state = st_token_ready;
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if (ch == ']') {
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->type = tt_array_close;
|
|
|
|
this->m->state = st_token_ready;
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if (ch == '{') {
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->type = tt_brace_open;
|
|
|
|
this->m->state = st_token_ready;
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if (ch == '}') {
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->type = tt_brace_close;
|
|
|
|
this->m->state = st_token_ready;
|
2022-04-02 21:14:10 +00:00
|
|
|
} else {
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->state = st_literal;
|
|
|
|
}
|
|
|
|
}
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if (this->m->state == st_in_space) {
|
2018-01-28 23:28:45 +00:00
|
|
|
// We only enter this state if include_ignorable is true.
|
2022-04-02 21:14:10 +00:00
|
|
|
if (!isSpace(ch)) {
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->type = tt_space;
|
|
|
|
this->m->unread_char = true;
|
|
|
|
this->m->char_to_unread = ch;
|
|
|
|
this->m->state = st_token_ready;
|
2022-04-02 21:14:10 +00:00
|
|
|
} else {
|
2018-01-30 01:00:06 +00:00
|
|
|
this->m->val += ch;
|
2018-01-28 23:28:45 +00:00
|
|
|
}
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if (this->m->state == st_in_comment) {
|
|
|
|
if ((ch == '\r') || (ch == '\n')) {
|
|
|
|
if (this->m->include_ignorable) {
|
2018-01-30 01:00:06 +00:00
|
|
|
this->m->type = tt_comment;
|
|
|
|
this->m->unread_char = true;
|
|
|
|
this->m->char_to_unread = ch;
|
|
|
|
this->m->state = st_token_ready;
|
2022-04-02 21:14:10 +00:00
|
|
|
} else {
|
2018-01-30 01:00:06 +00:00
|
|
|
this->m->state = st_top;
|
2018-01-28 23:28:45 +00:00
|
|
|
}
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if (this->m->include_ignorable) {
|
2018-01-30 01:00:06 +00:00
|
|
|
this->m->val += ch;
|
2018-01-28 23:28:45 +00:00
|
|
|
}
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if (this->m->state == st_lt) {
|
|
|
|
if (ch == '<') {
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->val = "<<";
|
|
|
|
this->m->type = tt_dict_open;
|
|
|
|
this->m->state = st_token_ready;
|
2022-04-02 21:14:10 +00:00
|
|
|
} else {
|
2022-02-08 14:18:08 +00:00
|
|
|
handled = false;
|
|
|
|
this->m->state = st_in_hexstring;
|
|
|
|
}
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if (this->m->state == st_gt) {
|
|
|
|
if (ch == '>') {
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->val = ">>";
|
|
|
|
this->m->type = tt_dict_close;
|
|
|
|
this->m->state = st_token_ready;
|
2022-04-02 21:14:10 +00:00
|
|
|
} else {
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->val = ">";
|
|
|
|
this->m->type = tt_bad;
|
|
|
|
QTC::TC("qpdf", "QPDFTokenizer bad >");
|
|
|
|
this->m->error_message = "unexpected >";
|
|
|
|
this->m->unread_char = true;
|
|
|
|
this->m->char_to_unread = ch;
|
|
|
|
this->m->state = st_token_ready;
|
|
|
|
}
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if (this->m->state == st_in_string) {
|
|
|
|
if (this->m->string_ignoring_newline && (ch != '\n')) {
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->string_ignoring_newline = false;
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t bs_num_count = strlen(this->m->bs_num_register);
|
|
|
|
bool ch_is_octal = ((ch >= '0') && (ch <= '7'));
|
2022-04-02 21:14:10 +00:00
|
|
|
if ((bs_num_count == 3) || ((bs_num_count > 0) && (!ch_is_octal))) {
|
2022-02-08 14:18:08 +00:00
|
|
|
// We've accumulated \ddd. PDF Spec says to ignore
|
|
|
|
// high-order overflow.
|
2022-04-02 21:14:10 +00:00
|
|
|
this->m->val +=
|
2022-07-26 11:37:50 +00:00
|
|
|
static_cast<char>(strtol(this->m->bs_num_register, nullptr, 8));
|
2022-04-02 21:14:10 +00:00
|
|
|
memset(
|
|
|
|
this->m->bs_num_register,
|
|
|
|
'\0',
|
|
|
|
sizeof(this->m->bs_num_register));
|
2022-02-08 14:18:08 +00:00
|
|
|
bs_num_count = 0;
|
|
|
|
}
|
2008-04-29 12:55:25 +00:00
|
|
|
|
2022-04-02 21:14:10 +00:00
|
|
|
if (this->m->string_ignoring_newline && (ch == '\n')) {
|
2022-02-08 14:18:08 +00:00
|
|
|
// ignore
|
2018-08-05 22:59:41 +00:00
|
|
|
this->m->string_ignoring_newline = false;
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if (
|
|
|
|
ch_is_octal && (this->m->last_char_was_bs || (bs_num_count > 0))) {
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->bs_num_register[bs_num_count++] = ch;
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if (this->m->last_char_was_bs) {
|
|
|
|
switch (ch) {
|
|
|
|
case 'n':
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->val += '\n';
|
2018-08-05 22:59:41 +00:00
|
|
|
break;
|
|
|
|
|
2022-04-02 21:14:10 +00:00
|
|
|
case 'r':
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->val += '\r';
|
|
|
|
break;
|
|
|
|
|
2022-04-02 21:14:10 +00:00
|
|
|
case 't':
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->val += '\t';
|
|
|
|
break;
|
|
|
|
|
2022-04-02 21:14:10 +00:00
|
|
|
case 'b':
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->val += '\b';
|
|
|
|
break;
|
|
|
|
|
2022-04-02 21:14:10 +00:00
|
|
|
case 'f':
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->val += '\f';
|
|
|
|
break;
|
|
|
|
|
2022-04-02 21:14:10 +00:00
|
|
|
case '\n':
|
2022-02-08 14:18:08 +00:00
|
|
|
break;
|
|
|
|
|
2022-04-02 21:14:10 +00:00
|
|
|
case '\r':
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->string_ignoring_newline = true;
|
|
|
|
break;
|
|
|
|
|
2022-04-02 21:14:10 +00:00
|
|
|
default:
|
2022-02-08 14:18:08 +00:00
|
|
|
// PDF spec says backslash is ignored before anything else
|
|
|
|
this->m->val += ch;
|
|
|
|
break;
|
|
|
|
}
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if (ch == '\\') {
|
2022-02-08 14:18:08 +00:00
|
|
|
// last_char_was_bs is set/cleared below as appropriate
|
2022-04-02 21:14:10 +00:00
|
|
|
if (bs_num_count) {
|
2022-02-08 14:18:08 +00:00
|
|
|
throw std::logic_error(
|
|
|
|
"INTERNAL ERROR: QPDFTokenizer: bs_num_count != 0 "
|
|
|
|
"when ch == '\\'");
|
|
|
|
}
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if (ch == '(') {
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->val += ch;
|
|
|
|
++this->m->string_depth;
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if ((ch == ')') && (--this->m->string_depth == 0)) {
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->type = tt_string;
|
|
|
|
this->m->state = st_token_ready;
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if (ch == '\r') {
|
2018-08-05 22:59:41 +00:00
|
|
|
// CR by itself is converted to LF
|
|
|
|
this->m->val += '\n';
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if (ch == '\n') {
|
2018-08-05 22:59:41 +00:00
|
|
|
// CR LF is converted to LF
|
2022-04-02 21:14:10 +00:00
|
|
|
if (!this->m->last_char_was_cr) {
|
2018-08-05 22:59:41 +00:00
|
|
|
this->m->val += ch;
|
|
|
|
}
|
2022-04-02 21:14:10 +00:00
|
|
|
} else {
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->val += ch;
|
|
|
|
}
|
2008-04-29 12:55:25 +00:00
|
|
|
|
2018-08-05 22:59:41 +00:00
|
|
|
this->m->last_char_was_cr =
|
2022-04-02 21:14:10 +00:00
|
|
|
((!this->m->string_ignoring_newline) && (ch == '\r'));
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->last_char_was_bs =
|
2022-04-02 21:14:10 +00:00
|
|
|
((!this->m->last_char_was_bs) && (ch == '\\'));
|
|
|
|
} else if (this->m->state == st_literal) {
|
|
|
|
if (isDelimiter(ch)) {
|
2022-02-08 14:18:08 +00:00
|
|
|
// A C-locale whitespace character or delimiter terminates
|
|
|
|
// token. It is important to unread the whitespace
|
|
|
|
// character even though it is ignored since it may be the
|
|
|
|
// newline after a stream keyword. Removing it here could
|
|
|
|
// make the stream-reading code break on some files,
|
|
|
|
// though not on any files in the test suite as of this
|
|
|
|
// writing.
|
|
|
|
|
|
|
|
this->m->type = tt_word;
|
|
|
|
this->m->unread_char = true;
|
|
|
|
this->m->char_to_unread = ch;
|
|
|
|
this->m->state = st_token_ready;
|
2022-04-02 21:14:10 +00:00
|
|
|
} else {
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->val += ch;
|
|
|
|
}
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if (this->m->state == st_inline_image) {
|
2019-01-30 19:14:46 +00:00
|
|
|
this->m->val += ch;
|
2018-01-30 01:57:04 +00:00
|
|
|
size_t len = this->m->val.length();
|
2022-04-02 21:14:10 +00:00
|
|
|
if (len == this->m->inline_image_bytes) {
|
2019-01-30 19:20:56 +00:00
|
|
|
QTC::TC("qpdf", "QPDFTokenizer found EI by byte count");
|
|
|
|
this->m->type = tt_inline_image;
|
|
|
|
this->m->inline_image_bytes = 0;
|
|
|
|
this->m->state = st_token_ready;
|
|
|
|
}
|
2022-04-02 21:14:10 +00:00
|
|
|
} else {
|
2022-02-08 14:18:08 +00:00
|
|
|
handled = false;
|
2008-04-29 12:55:25 +00:00
|
|
|
}
|
|
|
|
|
2022-04-02 21:14:10 +00:00
|
|
|
if (handled) {
|
2022-02-08 14:18:08 +00:00
|
|
|
// okay
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if (this->m->state == st_in_hexstring) {
|
|
|
|
if (ch == '>') {
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->type = tt_string;
|
|
|
|
this->m->state = st_token_ready;
|
2022-04-02 21:14:10 +00:00
|
|
|
if (this->m->val.length() % 2) {
|
2022-02-08 14:18:08 +00:00
|
|
|
// PDF spec says odd hexstrings have implicit
|
|
|
|
// trailing 0.
|
|
|
|
this->m->val += '0';
|
|
|
|
}
|
|
|
|
char num[3];
|
|
|
|
num[2] = '\0';
|
|
|
|
std::string nval;
|
2022-04-02 21:14:10 +00:00
|
|
|
for (unsigned int i = 0; i < this->m->val.length(); i += 2) {
|
2022-02-08 14:18:08 +00:00
|
|
|
num[0] = this->m->val.at(i);
|
2022-04-02 21:14:10 +00:00
|
|
|
num[1] = this->m->val.at(i + 1);
|
2022-07-26 11:37:50 +00:00
|
|
|
char nch = static_cast<char>(strtol(num, nullptr, 16));
|
2022-02-08 14:18:08 +00:00
|
|
|
nval += nch;
|
|
|
|
}
|
|
|
|
this->m->val = nval;
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if (QUtil::is_hex_digit(ch)) {
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->val += ch;
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if (isSpace(ch)) {
|
2022-02-08 14:18:08 +00:00
|
|
|
// ignore
|
2022-04-02 21:14:10 +00:00
|
|
|
} else {
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->type = tt_bad;
|
|
|
|
QTC::TC("qpdf", "QPDFTokenizer bad hexstring character");
|
2022-04-02 21:14:10 +00:00
|
|
|
this->m->error_message =
|
|
|
|
std::string("invalid character (") + ch + ") in hexstring";
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->state = st_token_ready;
|
|
|
|
}
|
2022-04-02 21:14:10 +00:00
|
|
|
} else {
|
2022-02-08 14:18:08 +00:00
|
|
|
throw std::logic_error(
|
|
|
|
"INTERNAL ERROR: invalid state while reading token");
|
2008-04-29 12:55:25 +00:00
|
|
|
}
|
|
|
|
|
2022-04-02 21:14:10 +00:00
|
|
|
if ((this->m->state == st_token_ready) && (this->m->type == tt_word)) {
|
2012-08-11 13:22:59 +00:00
|
|
|
resolveLiteral();
|
2008-04-29 12:55:25 +00:00
|
|
|
}
|
|
|
|
|
2022-04-02 21:14:10 +00:00
|
|
|
if (!(betweenTokens() ||
|
|
|
|
((this->m->state == st_token_ready) && this->m->unread_char))) {
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->raw_val += orig_ch;
|
2008-04-29 12:55:25 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
QPDFTokenizer::presentEOF()
|
|
|
|
{
|
2022-04-02 21:14:10 +00:00
|
|
|
if (this->m->state == st_literal) {
|
2018-01-30 01:57:04 +00:00
|
|
|
QTC::TC("qpdf", "QPDFTokenizer EOF reading appendable token");
|
2012-08-11 13:27:30 +00:00
|
|
|
resolveLiteral();
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if (
|
|
|
|
(this->m->include_ignorable) && (this->m->state == st_in_space)) {
|
2018-01-30 01:00:06 +00:00
|
|
|
this->m->type = tt_space;
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if (
|
|
|
|
(this->m->include_ignorable) && (this->m->state == st_in_comment)) {
|
2018-01-30 01:00:06 +00:00
|
|
|
this->m->type = tt_comment;
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if (betweenTokens()) {
|
2018-01-30 01:00:06 +00:00
|
|
|
this->m->type = tt_eof;
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if (this->m->state != st_token_ready) {
|
2018-01-30 01:57:04 +00:00
|
|
|
QTC::TC("qpdf", "QPDFTokenizer EOF reading token");
|
2018-01-30 01:00:06 +00:00
|
|
|
this->m->type = tt_bad;
|
|
|
|
this->m->error_message = "EOF while reading token";
|
2008-04-29 12:55:25 +00:00
|
|
|
}
|
2012-08-11 13:27:30 +00:00
|
|
|
|
2018-01-30 01:00:06 +00:00
|
|
|
this->m->state = st_token_ready;
|
2008-04-29 12:55:25 +00:00
|
|
|
}
|
|
|
|
|
2019-01-30 19:20:56 +00:00
|
|
|
void
|
2022-04-09 18:35:56 +00:00
|
|
|
QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input)
|
2019-01-30 19:20:56 +00:00
|
|
|
{
|
2022-04-02 21:14:10 +00:00
|
|
|
if (this->m->state != st_top) {
|
2019-01-30 19:26:08 +00:00
|
|
|
throw std::logic_error("QPDFTokenizer::expectInlineImage called"
|
|
|
|
" when tokenizer is in improper state");
|
|
|
|
}
|
|
|
|
findEI(input);
|
|
|
|
this->m->state = st_inline_image;
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
2022-04-09 18:35:56 +00:00
|
|
|
QPDFTokenizer::findEI(std::shared_ptr<InputSource> input)
|
2019-01-30 19:26:08 +00:00
|
|
|
{
|
2022-04-02 21:14:10 +00:00
|
|
|
if (!input.get()) {
|
2019-01-30 19:26:08 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
qpdf_offset_t last_offset = input->getLastOffset();
|
|
|
|
qpdf_offset_t pos = input->tell();
|
2019-01-30 19:20:56 +00:00
|
|
|
|
2019-01-30 19:26:08 +00:00
|
|
|
// Use QPDFWordTokenFinder to find EI surrounded by delimiters.
|
|
|
|
// Then read the next several tokens or up to EOF. If we find any
|
|
|
|
// suspicious-looking or tokens, this is probably still part of
|
|
|
|
// the image data, so keep looking for EI. Stop at the first EI
|
|
|
|
// that passes. If we get to the end without finding one, return
|
|
|
|
// the last EI we found. Store the number of bytes expected in the
|
|
|
|
// inline image including the EI and use that to break out of
|
|
|
|
// inline image, falling back to the old method if needed.
|
|
|
|
|
|
|
|
bool okay = false;
|
|
|
|
bool first_try = true;
|
2022-04-02 21:14:10 +00:00
|
|
|
while (!okay) {
|
2019-01-30 19:20:56 +00:00
|
|
|
QPDFWordTokenFinder f(input, "EI");
|
2022-04-02 21:14:10 +00:00
|
|
|
if (!input->findFirst("EI", input->tell(), 0, f)) {
|
2019-01-30 19:26:08 +00:00
|
|
|
break;
|
|
|
|
}
|
2019-06-21 03:35:23 +00:00
|
|
|
this->m->inline_image_bytes = QIntC::to_size(input->tell() - pos - 2);
|
2019-01-30 19:26:08 +00:00
|
|
|
|
|
|
|
QPDFTokenizer check;
|
|
|
|
bool found_bad = false;
|
|
|
|
// Look at the next 10 tokens or up to EOF. The next inline
|
|
|
|
// image's image data would look like bad tokens, but there
|
|
|
|
// will always be at least 10 tokens between one inline
|
|
|
|
// image's EI and the next valid one's ID since width, height,
|
|
|
|
// bits per pixel, and color space are all required as well as
|
|
|
|
// a BI and ID. If we get 10 good tokens in a row or hit EOF,
|
|
|
|
// we can be pretty sure we've found the actual EI.
|
2022-04-02 21:14:10 +00:00
|
|
|
for (int i = 0; i < 10; ++i) {
|
|
|
|
QPDFTokenizer::Token t = check.readToken(input, "checker", true);
|
2019-01-30 19:26:08 +00:00
|
|
|
token_type_e type = t.getType();
|
2022-04-02 21:14:10 +00:00
|
|
|
if (type == tt_eof) {
|
2019-01-30 19:26:08 +00:00
|
|
|
okay = true;
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if (type == tt_bad) {
|
2019-01-30 19:26:08 +00:00
|
|
|
found_bad = true;
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if (type == tt_word) {
|
2019-01-30 19:26:08 +00:00
|
|
|
// The qpdf tokenizer lumps alphabetic and otherwise
|
|
|
|
// uncategorized characters into "words". We recognize
|
|
|
|
// strings of alphabetic characters as potential valid
|
|
|
|
// operators for purposes of telling whether we're in
|
|
|
|
// valid content or not. It's not perfect, but it
|
|
|
|
// should work more reliably than what we used to do,
|
|
|
|
// which was already good enough for the vast majority
|
|
|
|
// of files.
|
|
|
|
bool found_alpha = false;
|
|
|
|
bool found_non_printable = false;
|
|
|
|
bool found_other = false;
|
2022-05-21 14:18:15 +00:00
|
|
|
for (char ch: t.getValue()) {
|
2019-01-30 19:26:08 +00:00
|
|
|
if (((ch >= 'a') && (ch <= 'z')) ||
|
2022-04-02 21:14:10 +00:00
|
|
|
((ch >= 'A') && (ch <= 'Z')) || (ch == '*')) {
|
2019-01-30 19:26:08 +00:00
|
|
|
// Treat '*' as alpha since there are valid
|
|
|
|
// PDF operators that contain * along with
|
|
|
|
// alphabetic characters.
|
|
|
|
found_alpha = true;
|
2022-04-02 21:14:10 +00:00
|
|
|
} else if (
|
|
|
|
(static_cast<signed char>(ch) < 32) && (!isSpace(ch))) {
|
2022-01-10 16:52:07 +00:00
|
|
|
// Compare ch as a signed char so characters
|
|
|
|
// outside of 7-bit will be < 0.
|
2019-01-30 19:26:08 +00:00
|
|
|
found_non_printable = true;
|
|
|
|
break;
|
2022-04-02 21:14:10 +00:00
|
|
|
} else {
|
2019-01-30 19:26:08 +00:00
|
|
|
found_other = true;
|
|
|
|
}
|
|
|
|
}
|
2022-04-02 21:14:10 +00:00
|
|
|
if (found_non_printable || (found_alpha && found_other)) {
|
2019-01-30 19:26:08 +00:00
|
|
|
found_bad = true;
|
|
|
|
}
|
|
|
|
}
|
2022-04-02 21:14:10 +00:00
|
|
|
if (okay || found_bad) {
|
2019-01-30 19:26:08 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2022-04-02 21:14:10 +00:00
|
|
|
if (!found_bad) {
|
2019-01-30 19:26:08 +00:00
|
|
|
okay = true;
|
|
|
|
}
|
2022-04-02 21:14:10 +00:00
|
|
|
if (!okay) {
|
2019-01-30 19:26:08 +00:00
|
|
|
first_try = false;
|
2019-01-30 19:20:56 +00:00
|
|
|
}
|
|
|
|
}
|
2022-04-02 21:14:10 +00:00
|
|
|
if (okay && (!first_try)) {
|
2019-01-30 19:26:08 +00:00
|
|
|
QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try");
|
2018-01-30 01:57:04 +00:00
|
|
|
}
|
2019-01-30 19:26:08 +00:00
|
|
|
|
|
|
|
input->seek(pos, SEEK_SET);
|
|
|
|
input->setLastOffset(last_offset);
|
2018-01-30 01:57:04 +00:00
|
|
|
}
|
|
|
|
|
2008-04-29 12:55:25 +00:00
|
|
|
bool
|
|
|
|
QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch)
|
|
|
|
{
|
2018-01-30 01:00:06 +00:00
|
|
|
bool ready = (this->m->state == st_token_ready);
|
|
|
|
unread_char = this->m->unread_char;
|
|
|
|
ch = this->m->char_to_unread;
|
2022-04-02 21:14:10 +00:00
|
|
|
if (ready) {
|
|
|
|
if (this->m->type == tt_bad) {
|
2018-01-30 01:00:06 +00:00
|
|
|
this->m->val = this->m->raw_val;
|
2018-01-28 23:28:45 +00:00
|
|
|
}
|
2022-04-02 21:14:10 +00:00
|
|
|
token = Token(
|
|
|
|
this->m->type,
|
|
|
|
this->m->val,
|
|
|
|
this->m->raw_val,
|
|
|
|
this->m->error_message);
|
2022-02-08 14:18:08 +00:00
|
|
|
this->m->reset();
|
2008-04-29 12:55:25 +00:00
|
|
|
}
|
|
|
|
return ready;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool
|
|
|
|
QPDFTokenizer::betweenTokens()
|
|
|
|
{
|
2022-04-02 21:14:10 +00:00
|
|
|
return (
|
|
|
|
(this->m->state == st_top) ||
|
|
|
|
((!this->m->include_ignorable) &&
|
|
|
|
((this->m->state == st_in_comment) ||
|
|
|
|
(this->m->state == st_in_space))));
|
2008-04-29 12:55:25 +00:00
|
|
|
}
|
2012-07-21 09:48:13 +00:00
|
|
|
|
|
|
|
// Read the next token from input, one character at a time.
//
// input: source to read from; the character that terminated the token
//   is pushed back when it is not part of the token.
// context: passed through to the QPDFExc thrown for a bad token.
// allow_bad: when true, a bad token is returned instead of thrown.
// max_len: when nonzero, a token whose raw text reaches this length
//   before it is complete is ended as a bad token.
//
// Unless the token is an EOF token, the input's last offset is set to
// the offset computed here for the token. Throws QPDFExc for a bad
// token when allow_bad is false, and std::logic_error if no token is
// available after EOF has been presented.
QPDFTokenizer::Token
QPDFTokenizer::readToken(
    std::shared_ptr<InputSource> input,
    std::string const& context,
    bool allow_bad,
    size_t max_len)
{
    qpdf_offset_t token_offset = input->tell();
    Token result;
    bool must_unread;
    char pending_ch;
    bool eof_presented = false;

    while (!getToken(result, must_unread, pending_ch)) {
        char next_ch;
        if (input->read(&next_ch, 1) == 0) {
            // End of input: EOF may be presented only once.
            if (eof_presented) {
                throw std::logic_error(
                    "getToken returned false after presenting EOF");
            }
            presentEOF();
            eof_presented = true;
            if ((m->type == tt_eof) && (!m->allow_eof)) {
                // Nothing in the qpdf library calls readToken
                // without allowEOF anymore, so this case is not
                // exercised.
                m->type = tt_bad;
                m->error_message = "unexpected EOF";
                token_offset = input->getLastOffset();
            }
            continue;
        }

        presentCharacter(next_ch);
        // While still between tokens, advance the recorded offset
        // past the character just read.
        if (betweenTokens() && (input->getLastOffset() == token_offset)) {
            ++token_offset;
        }
        bool const too_long = max_len && (m->raw_val.length() >= max_len) &&
            (m->state != st_token_ready);
        if (too_long) {
            // terminate this token now
            QTC::TC("qpdf", "QPDFTokenizer block long token");
            m->type = tt_bad;
            m->state = st_token_ready;
            m->error_message = "exceeded allowable length while reading token";
        }
    }

    if (must_unread) {
        input->unreadCh(pending_ch);
    }

    if (result.getType() != tt_eof) {
        input->setLastOffset(token_offset);
    }

    if (result.getType() == tt_bad) {
        if (!allow_bad) {
            throw QPDFExc(
                qpdf_e_damaged_pdf,
                input->getName(),
                context,
                token_offset,
                result.getErrorMessage());
        }
        QTC::TC("qpdf", "QPDFTokenizer allowing bad token");
    }

    return result;
}
|