2008-04-29 12:55:25 +00:00
|
|
|
#include <qpdf/Pl_QPDFTokenizer.hh>
|
|
|
|
#include <qpdf/QPDF_String.hh>
|
|
|
|
#include <qpdf/QPDF_Name.hh>
|
2011-04-30 18:20:35 +00:00
|
|
|
#include <qpdf/QTC.hh>
|
2018-01-23 00:23:42 +00:00
|
|
|
#include <qpdf/QUtil.hh>
|
2009-09-26 18:36:04 +00:00
|
|
|
#include <stdexcept>
|
2008-05-04 16:02:53 +00:00
|
|
|
#include <string.h>
|
2008-04-29 12:55:25 +00:00
|
|
|
|
|
|
|
// Construct the pipeline stage. The identifier and the downstream
// pipeline are passed straight through to the Pipeline base class.
// Every state flag starts out cleared, and the inline-image scan
// window is zero-filled.
Pl_QPDFTokenizer::Pl_QPDFTokenizer(char const* identifier, Pipeline* next) :
    Pipeline(identifier, next),
    newline_after_next_token(false),
    just_wrote_nl(false),
    last_char_was_cr(false),
    unread_char(false),
    char_to_unread('\0'),
    in_inline_image(false)
{
    // Start with an all-zero scan window; processChar() shifts
    // characters into it while inside an inline image.
    memset(this->image_buf, 0, IMAGE_BUF_SIZE);
}
|
|
|
|
|
|
|
|
// Destructor. This function body performs no cleanup of its own.
Pl_QPDFTokenizer::~Pl_QPDFTokenizer()
{
}
|
|
|
|
|
|
|
|
void
|
2012-06-20 15:20:57 +00:00
|
|
|
Pl_QPDFTokenizer::writeNext(char const* buf, size_t len)
|
2008-04-29 12:55:25 +00:00
|
|
|
{
|
|
|
|
if (len)
|
|
|
|
{
|
2018-01-23 00:23:42 +00:00
|
|
|
getNext()->write(QUtil::unsigned_char_pointer(buf), len);
|
2008-04-29 12:55:25 +00:00
|
|
|
this->just_wrote_nl = (buf[len-1] == '\n');
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
Pl_QPDFTokenizer::writeToken(QPDFTokenizer::Token& token)
|
|
|
|
{
|
|
|
|
std::string value = token.getRawValue();
|
|
|
|
|
|
|
|
switch (token.getType())
|
|
|
|
{
|
|
|
|
case QPDFTokenizer::tt_string:
|
|
|
|
value = QPDF_String(token.getValue()).unparse();
|
|
|
|
break;
|
|
|
|
|
|
|
|
case QPDFTokenizer::tt_name:
|
|
|
|
value = QPDF_Name(token.getValue()).unparse();
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
writeNext(value.c_str(), value.length());
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
Pl_QPDFTokenizer::processChar(char ch)
|
|
|
|
{
|
2011-04-30 18:20:35 +00:00
|
|
|
if (this->in_inline_image)
|
2008-04-29 12:55:25 +00:00
|
|
|
{
|
2011-04-30 18:20:35 +00:00
|
|
|
// Scan through the input looking for EI surrounded by
|
|
|
|
// whitespace. If that pattern appears in the inline image's
|
|
|
|
// representation, we're hosed, but this situation seems
|
|
|
|
// excessively unlikely, and this code path is only followed
|
|
|
|
// during content stream normalization, which is pretty much
|
|
|
|
// used for debugging and human inspection of PDF files.
|
|
|
|
memmove(this->image_buf,
|
|
|
|
this->image_buf + 1,
|
|
|
|
IMAGE_BUF_SIZE - 1);
|
|
|
|
this->image_buf[IMAGE_BUF_SIZE - 1] = ch;
|
|
|
|
if (strchr(" \t\n\v\f\r", this->image_buf[0]) &&
|
|
|
|
(this->image_buf[1] == 'E') &&
|
|
|
|
(this->image_buf[2] == 'I') &&
|
|
|
|
strchr(" \t\n\v\f\r", this->image_buf[3]))
|
|
|
|
{
|
|
|
|
// We've found an EI operator. We've already written the
|
|
|
|
// EI operator to output; terminate with a newline
|
|
|
|
// character and resume normal processing.
|
|
|
|
writeNext("\n", 1);
|
|
|
|
this->in_inline_image = false;
|
|
|
|
QTC::TC("qpdf", "Pl_QPDFTokenizer found EI");
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
writeNext(&ch, 1);
|
|
|
|
}
|
2008-04-29 12:55:25 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
tokenizer.presentCharacter(ch);
|
|
|
|
QPDFTokenizer::Token token;
|
|
|
|
if (tokenizer.getToken(token, this->unread_char, this->char_to_unread))
|
|
|
|
{
|
|
|
|
writeToken(token);
|
|
|
|
if (this->newline_after_next_token)
|
|
|
|
{
|
|
|
|
writeNext("\n", 1);
|
|
|
|
this->newline_after_next_token = false;
|
|
|
|
}
|
|
|
|
if ((token.getType() == QPDFTokenizer::tt_word) &&
|
2011-04-30 18:20:35 +00:00
|
|
|
(token.getValue() == "ID"))
|
2008-04-29 12:55:25 +00:00
|
|
|
{
|
2011-04-30 18:20:35 +00:00
|
|
|
// Suspend normal scanning until we find an EI token.
|
|
|
|
this->in_inline_image = true;
|
2008-04-29 12:55:25 +00:00
|
|
|
if (this->unread_char)
|
|
|
|
{
|
|
|
|
writeNext(&this->char_to_unread, 1);
|
|
|
|
this->unread_char = false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
bool suppress = false;
|
|
|
|
if ((ch == '\n') && (this->last_char_was_cr))
|
|
|
|
{
|
|
|
|
// Always ignore \n following \r
|
|
|
|
suppress = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((this->last_char_was_cr = (ch == '\r')))
|
|
|
|
{
|
|
|
|
ch = '\n';
|
|
|
|
}
|
|
|
|
|
|
|
|
if (this->tokenizer.betweenTokens())
|
|
|
|
{
|
|
|
|
if (! suppress)
|
|
|
|
{
|
|
|
|
writeNext(&ch, 1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if (ch == '\n')
|
|
|
|
{
|
|
|
|
this->newline_after_next_token = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
Pl_QPDFTokenizer::checkUnread()
|
|
|
|
{
|
|
|
|
if (this->unread_char)
|
|
|
|
{
|
|
|
|
processChar(this->char_to_unread);
|
|
|
|
if (this->unread_char)
|
|
|
|
{
|
2009-09-26 18:36:04 +00:00
|
|
|
throw std::logic_error(
|
|
|
|
"INTERNAL ERROR: unread_char still true after processing "
|
|
|
|
"unread character");
|
2008-04-29 12:55:25 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
2012-06-20 15:20:57 +00:00
|
|
|
Pl_QPDFTokenizer::write(unsigned char* buf, size_t len)
|
2008-04-29 12:55:25 +00:00
|
|
|
{
|
|
|
|
checkUnread();
|
2012-06-20 15:20:57 +00:00
|
|
|
for (size_t i = 0; i < len; ++i)
|
2008-04-29 12:55:25 +00:00
|
|
|
{
|
|
|
|
processChar(buf[i]);
|
|
|
|
checkUnread();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
Pl_QPDFTokenizer::finish()
|
|
|
|
{
|
|
|
|
this->tokenizer.presentEOF();
|
2011-04-30 18:20:35 +00:00
|
|
|
if (! this->in_inline_image)
|
2008-04-29 12:55:25 +00:00
|
|
|
{
|
|
|
|
QPDFTokenizer::Token token;
|
|
|
|
if (tokenizer.getToken(token, this->unread_char, this->char_to_unread))
|
|
|
|
{
|
|
|
|
writeToken(token);
|
|
|
|
if (unread_char)
|
|
|
|
{
|
|
|
|
if (this->char_to_unread == '\r')
|
|
|
|
{
|
|
|
|
this->char_to_unread = '\n';
|
|
|
|
}
|
|
|
|
writeNext(&this->char_to_unread, 1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (! this->just_wrote_nl)
|
|
|
|
{
|
|
|
|
writeNext("\n", 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
getNext()->finish();
|
|
|
|
}
|