qpdf/libqpdf/QPDFTokenizer.cc

459 lines
8.7 KiB
C++

#include <qpdf/QPDFTokenizer.hh>
// DO NOT USE ctype -- it is locale dependent for some things, and
// it's not worth the risk of including it in case it may accidentally
// be used.
#include <qpdf/PCRE.hh>
#include <qpdf/QEXC.hh>
#include <qpdf/QTC.hh>
// See note above about ctype.
// Return true if ch is one of the ASCII hexadecimal digits 0-9, a-f
// or A-F.  This is written as explicit range checks rather than with
// strchr() because strchr() also matches the terminating NUL of the
// search string, which would make '\0' look like a hex digit.  Like
// the rest of this file, it deliberately avoids ctype so that the
// result never depends on the locale.
static bool is_hex_digit(char ch)
{
    return (((ch >= '0') && (ch <= '9')) ||
            ((ch >= 'a') && (ch <= 'f')) ||
            ((ch >= 'A') && (ch <= 'F')));
}
// Construct a tokenizer that is ready to receive the first character
// of input.  By default '#' is treated specially inside name tokens;
// see allowPoundAnywhereInName() to turn that off.
QPDFTokenizer::QPDFTokenizer() : pound_special_in_name(true)
{
    this->reset();
}
void
QPDFTokenizer::allowPoundAnywhereInName()
{
QTC::TC("qpdf", "QPDFTokenizer allow pound anywhere in name");
this->pound_special_in_name = false;
}
void
QPDFTokenizer::reset()
{
state = st_top;
type = tt_bad;
val = "";
raw_val = "";
error_message = "";
unread_char = false;
char_to_unread = '\0';
string_depth = 0;
string_ignoring_newline = false;
last_char_was_bs = false;
}
// Feed one input character to the tokenizer's state machine.
//
// Characters are accumulated until a complete token has been
// recognized, at which point state becomes st_token_ready and type,
// val and (for bad tokens) error_message describe the token.  When
// the character that ended the token is not part of it, it is
// recorded in char_to_unread with unread_char set to true so that it
// can be presented again after the token is retrieved.
//
// raw_val receives every original character that belongs to the
// token: characters seen while between tokens (whitespace, comments)
// and the character being unread are not included.
//
// Throws QEXC::Internal if called while a completed token is still
// waiting, or if the state machine is in a state this function does
// not know how to handle.
void
QPDFTokenizer::presentCharacter(char ch)
{
// Matches an optionally signed integer or real, e.g. 12, -3.5, +.25.
// A trailing '.' with no digits after it (e.g. "1.") does not match.
static PCRE num_re("^[\\+\\-]?(?:\\.\\d+|\\d+(?:\\.\\d+)?)$");
if (state == st_token_ready)
{
throw QEXC::Internal("QPDF tokenizer presented character "
"while token is waiting");
}
// Remember the character as presented; this is what is appended to
// raw_val at the end of the function.
char orig_ch = ch;
// State machine is implemented such that some characters may be
// handled more than once. This happens whenever you have to use
// the character that caused a state change in the new state.
bool handled = true;
if (state == st_top)
{
// Note: we specifically do not use ctype here. It is
// locale-dependent.
if (strchr(" \t\n\v\f\r", ch))
{
// ignore
}
else if (ch == '%')
{
// Discard comments
state = st_in_comment;
}
else if (ch == '(')
{
// Start of a literal string: initialize nesting depth and clear
// all escape-processing state, including the octal digit
// accumulator.
string_depth = 1;
string_ignoring_newline = false;
memset(bs_num_register, '\0', sizeof(bs_num_register));
last_char_was_bs = false;
state = st_in_string;
}
else if (ch == '<')
{
// Could be "<<" or the start of a hex string; the next character
// decides.
state = st_lt;
}
else if (ch == '>')
{
// Only valid here as the first half of ">>"; the next character
// decides.
state = st_gt;
}
else
{
// Everything else contributes its own character to the token
// value: either a complete single-character token or the first
// character of a literal.
val += ch;
if (ch == ')')
{
type = tt_bad;
QTC::TC("qpdf", "QPDF_Tokenizer bad )");
error_message = "unexpected )";
state = st_token_ready;
}
else if (ch == '[')
{
type = tt_array_open;
state = st_token_ready;
}
else if (ch == ']')
{
type = tt_array_close;
state = st_token_ready;
}
else if (ch == '{')
{
type = tt_brace_open;
state = st_token_ready;
}
else if (ch == '}')
{
type = tt_brace_close;
state = st_token_ready;
}
else
{
state = st_literal;
}
}
}
else if (state == st_in_comment)
{
// A comment runs to the end of the line.
if ((ch == '\r') || (ch == '\n'))
{
state = st_top;
}
}
else if (state == st_lt)
{
if (ch == '<')
{
val = "<<";
type = tt_dict_open;
state = st_token_ready;
}
else
{
// Not "<<", so the '<' opened a hex string and this character
// is its first content character.  Mark it unhandled so that
// the st_in_hexstring code below processes it.
handled = false;
state = st_in_hexstring;
}
}
else if (state == st_gt)
{
if (ch == '>')
{
val = ">>";
type = tt_dict_close;
state = st_token_ready;
}
else
{
// A lone '>' is an error.  The character after it is not part
// of the bad token, so hand it back to be presented again.
val = ">";
type = tt_bad;
QTC::TC("qpdf", "QPDF_Tokenizer bad >");
error_message = "unexpected >";
unread_char = true;
char_to_unread = ch;
state = st_token_ready;
}
}
else if (state == st_in_string)
{
// Stop skipping line endings after a backslash-newline as soon as
// anything other than CR or LF is seen.
if (string_ignoring_newline && (! ((ch == '\r') || (ch == '\n'))))
{
string_ignoring_newline = false;
}
// Number of octal digits accumulated so far for a \ddd escape.
unsigned int bs_num_count = strlen(bs_num_register);
bool ch_is_octal = ((ch >= '0') && (ch <= '7'));
// An octal escape ends after three digits or at the first
// non-octal character.  Emit it before dealing with ch itself.
if ((bs_num_count == 3) || ((bs_num_count > 0) && (! ch_is_octal)))
{
// We've accumulated \ddd. PDF Spec says to ignore
// high-order overflow.
val += (char) strtol(bs_num_register, 0, 8);
memset(bs_num_register, '\0', sizeof(bs_num_register));
bs_num_count = 0;
}
if (string_ignoring_newline && ((ch == '\r') || (ch == '\n')))
{
// ignore
}
else if (ch_is_octal && (last_char_was_bs || (bs_num_count > 0)))
{
// First digit after a backslash, or a continuation of an octal
// escape already in progress.
bs_num_register[bs_num_count++] = ch;
}
else if (last_char_was_bs)
{
switch (ch)
{
case 'n':
val += '\n';
break;
case 'r':
val += '\r';
break;
case 't':
val += '\t';
break;
case 'b':
val += '\b';
break;
case 'f':
val += '\f';
break;
case '\r':
case '\n':
// Backslash at end of line: line continuation.  The line
// ending is dropped from the string.
string_ignoring_newline = true;
break;
default:
// PDF spec says backslash is ignored before anything else
val += ch;
break;
}
}
else if (ch == '\\')
{
// last_char_was_bs is set/cleared below as appropriate
if (bs_num_count)
{
throw QEXC::Internal("QPDFTokenizer: bs_num_count != 0 "
"when ch == '\\'");
}
}
else if (ch == '(')
{
// Unescaped nested parenthesis: part of the string, but track the
// depth so the matching ')' doesn't end the string.
val += ch;
++string_depth;
}
// Note that string_depth is decremented as a side effect for every
// unescaped ')' that reaches this test.  If the depth is still
// non-zero, the ')' closes a nested parenthesis and falls through
// to the final else, where it is appended to the string.
else if ((ch == ')') && (--string_depth == 0))
{
type = tt_string;
state = st_token_ready;
}
else
{
val += ch;
}
// A backslash starts an escape only if it was not itself escaped
// by the preceding backslash.
last_char_was_bs = ((! last_char_was_bs) && (ch == '\\'));
}
else if (state == st_literal)
{
if (strchr(" \t\n\v\f\r()<>[]{}/%", ch) != 0)
{
// A C-locale whitespace character or delimiter terminates
// token. It is important to unread the whitespace
// character even though it is ignored since it may be the
// newline after a stream keyword. Removing it here could
// make the stream-reading code break on some files,
// though not on any files in the test suite as of this
// writing.
type = tt_word;
unread_char = true;
char_to_unread = ch;
state = st_token_ready;
}
else
{
val += ch;
}
}
else
{
// Any other state is dealt with in the second stage below.
handled = false;
}
// Second stage: process characters that the first stage left
// unhandled, either because the state was not covered above or
// because the character caused a transition into st_in_hexstring.
if (handled)
{
// okay
}
else if (state == st_in_hexstring)
{
if (ch == '>')
{
// End of hex string: convert the accumulated hex digits in val
// into the bytes they represent.
type = tt_string;
state = st_token_ready;
if (val.length() % 2)
{
// PDF spec says odd hexstrings have implicit
// trailing 0.
val += '0';
}
char num[3];
num[2] = '\0';
std::string nval;
for (unsigned int i = 0; i < val.length(); i += 2)
{
num[0] = val[i];
num[1] = val[i+1];
char nch = (char)(strtol(num, 0, 16));
nval += nch;
}
val = nval;
}
else if (is_hex_digit(ch))
{
val += ch;
}
else if (strchr(" \t\n\v\f\r", ch))
{
// ignore
}
else
{
type = tt_bad;
QTC::TC("qpdf", "QPDF_Tokenizer bad (");
error_message = std::string("invalid character (") +
ch + ") in hexstring";
state = st_token_ready;
}
}
else
{
throw QEXC::Internal("invalid state while reading token");
}
// A literal has just been completed as a generic word.  Refine its
// type based on its content: name, number, boolean, null, or leave
// it as a word.
if ((state == st_token_ready) && (type == tt_word))
{
if ((val.length() > 0) && (val[0] == '/'))
{
type = tt_name;
// Deal with # in name token. Note: '/' by itself is a
// valid name, so don't strip leading /. That way we
// don't have to deal with the empty string as a name.
std::string nval = "/";
char const* valstr = val.c_str() + 1;
for (char const* p = valstr; *p; ++p)
{
if ((*p == '#') && this->pound_special_in_name)
{
// '#' must be followed by exactly two hex digits.  The checks
// on p[1] and p[2] keep us from reading past the end of the
// string.
if (p[1] && p[2] &&
is_hex_digit(p[1]) && is_hex_digit(p[2]))
{
char num[3];
num[0] = p[1];
num[1] = p[2];
num[2] = '\0';
// Note: this local ch shadows the function parameter within
// this scope.
char ch = (char)(strtol(num, 0, 16));
if (ch == '\0')
{
// "#00" is rejected; the escape is kept in its textual form
// in the value of the resulting bad token.
type = tt_bad;
QTC::TC("qpdf", "QPDF_Tokenizer null in name");
error_message =
"null character not allowed in name token";
nval += "#00";
}
else
{
nval += ch;
}
// Skip the two hex digits just consumed; the loop increment
// moves past them to the next character.
p += 2;
}
else
{
// '#' not followed by two hex digits: flag the token as bad
// but keep scanning, copying the '#' through unchanged.
QTC::TC("qpdf", "QPDF_Tokenizer bad name");
type = tt_bad;
error_message = "invalid name token";
nval += *p;
}
}
else
{
nval += *p;
}
}
val = nval;
}
else if (num_re.match(val.c_str()))
{
// Numeric: the presence of a decimal point distinguishes real
// from integer.
if (val.find('.') != std::string::npos)
{
type = tt_real;
}
else
{
type = tt_integer;
}
}
else if ((val == "true") || (val == "false"))
{
type = tt_bool;
}
else if (val == "null")
{
type = tt_null;
}
else
{
// I don't really know what it is, so leave it as tt_word.
// Lots of cases ($, #, etc.) other than actual words fall
// into this category, but that's okay at least for now.
type = tt_word;
}
}
// Record the raw character unless it was whitespace or comment text
// between tokens, or it is the terminating character that is being
// handed back via unread_char.
if (! (betweenTokens() || ((state == st_token_ready) && unread_char)))
{
this->raw_val += orig_ch;
}
}
// Tell the tokenizer that the input has ended.
//
// If a token is already waiting or nothing has been started, there
// is nothing to do.  An unterminated comment simply ends.  In every
// other state the token in progress is incomplete, so it is turned
// into a ready tt_bad token with an explanatory error message.
void QPDFTokenizer::presentEOF()
{
    if ((state == st_token_ready) || (state == st_top))
    {
        // okay
        return;
    }

    if (state == st_in_comment)
    {
        state = st_top;
        return;
    }

    type = tt_bad;
    error_message = "EOF while reading token";
    state = st_token_ready;
}
// Retrieve the completed token, if any.
//
// unread_char and ch are always filled in from the tokenizer's
// current "character to unread" state, whether or not a token is
// ready.  If a token is ready, it is stored in token, the tokenizer
// is reset so it can start on the next token, and true is returned.
// Otherwise token is left untouched and false is returned.
bool QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch)
{
    // Copy these out first: reset() below clears the members.
    unread_char = this->unread_char;
    ch = this->char_to_unread;

    if (this->state != st_token_ready)
    {
        return false;
    }

    token = Token(this->type, this->val, this->raw_val,
                  this->error_message);
    this->reset();
    return true;
}
bool
QPDFTokenizer::betweenTokens()
{
return ((state == st_top) || (state == st_in_comment));
}