mirror of
https://github.com/qpdf/qpdf.git
synced 2024-12-22 10:58:58 +00:00
Refactor QPDFTokenizer's inline image handling
Add a version of expectInlineImage that takes an input source and searches for EI. This is in preparation for improving the way EI is found. This commit just refactors the code without changing the functionality and adds tests to make sure the old and new code behave identically.
This commit is contained in:
parent
31372edce0
commit
ec9e310c9e
@ -27,6 +27,7 @@
|
||||
#include <qpdf/QPDFTokenizer.hh>
|
||||
#include <qpdf/PointerHolder.hh>
|
||||
#include <qpdf/QPDFObjectHandle.hh>
|
||||
#include <qpdf/Pl_Buffer.hh>
|
||||
|
||||
// Tokenize the incoming text using QPDFTokenizer and pass the tokens
|
||||
// in turn to a QPDFObjectHandle::TokenFilter object. All bytes of
|
||||
@ -56,9 +57,6 @@ class Pl_QPDFTokenizer: public Pipeline
|
||||
virtual void finish();
|
||||
|
||||
private:
|
||||
void processChar(char ch);
|
||||
void checkUnread();
|
||||
|
||||
class Members
|
||||
{
|
||||
friend class Pl_QPDFTokenizer;
|
||||
@ -73,9 +71,7 @@ class Pl_QPDFTokenizer: public Pipeline
|
||||
|
||||
QPDFObjectHandle::TokenFilter* filter;
|
||||
QPDFTokenizer tokenizer;
|
||||
bool last_char_was_cr;
|
||||
bool unread_char;
|
||||
char char_to_unread;
|
||||
Pl_Buffer buf;
|
||||
};
|
||||
PointerHolder<Members> m;
|
||||
};
|
||||
|
@ -178,7 +178,15 @@ class QPDFTokenizer
|
||||
// including the next EI token. After you call this method, the
|
||||
// next call to readToken (or the token created next time getToken
|
||||
// returns true) will either be tt_inline_image or tt_bad. This is
|
||||
// the only way readToken returns a tt_inline_image token.
|
||||
// the only way readToken returns a tt_inline_image token. The
|
||||
// version of this method that takes a PointerHolder<InputSource>
|
||||
// does a better job of locating the end of the inline image and
|
||||
// should be used whenever the input source is available. It
|
||||
// preserves both tell() and getLastOffset(). The version without
|
||||
// the input source will always end the inline image the first
|
||||
// time it sees something that looks like an EI operator.
|
||||
QPDF_DLL
|
||||
void expectInlineImage(PointerHolder<InputSource> input);
|
||||
QPDF_DLL
|
||||
void expectInlineImage();
|
||||
|
||||
@ -223,6 +231,7 @@ class QPDFTokenizer
|
||||
std::string error_message;
|
||||
bool unread_char;
|
||||
char char_to_unread;
|
||||
size_t inline_image_bytes;
|
||||
|
||||
// State for strings
|
||||
int string_depth;
|
||||
|
@ -1,13 +1,13 @@
|
||||
#include <qpdf/Pl_QPDFTokenizer.hh>
|
||||
#include <qpdf/QTC.hh>
|
||||
#include <qpdf/QUtil.hh>
|
||||
#include <qpdf/BufferInputSource.hh>
|
||||
#include <stdexcept>
|
||||
#include <string.h>
|
||||
|
||||
Pl_QPDFTokenizer::Members::Members() :
|
||||
filter(0),
|
||||
last_char_was_cr(false),
|
||||
unread_char(false),
|
||||
char_to_unread('\0')
|
||||
buf("tokenizer buffer")
|
||||
{
|
||||
}
|
||||
|
||||
@ -33,61 +33,36 @@ Pl_QPDFTokenizer::~Pl_QPDFTokenizer()
|
||||
}
|
||||
|
||||
void
|
||||
Pl_QPDFTokenizer::processChar(char ch)
|
||||
Pl_QPDFTokenizer::write(unsigned char* data, size_t len)
|
||||
{
|
||||
this->m->tokenizer.presentCharacter(ch);
|
||||
QPDFTokenizer::Token token;
|
||||
if (this->m->tokenizer.getToken(
|
||||
token, this->m->unread_char, this->m->char_to_unread))
|
||||
{
|
||||
this->m->filter->handleToken(token);
|
||||
if ((token.getType() == QPDFTokenizer::tt_word) &&
|
||||
(token.getValue() == "ID"))
|
||||
{
|
||||
QTC::TC("qpdf", "Pl_QPDFTokenizer found ID");
|
||||
this->m->tokenizer.expectInlineImage();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
Pl_QPDFTokenizer::checkUnread()
|
||||
{
|
||||
if (this->m->unread_char)
|
||||
{
|
||||
processChar(this->m->char_to_unread);
|
||||
if (this->m->unread_char)
|
||||
{
|
||||
throw std::logic_error(
|
||||
"INTERNAL ERROR: unread_char still true after processing "
|
||||
"unread character");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
Pl_QPDFTokenizer::write(unsigned char* buf, size_t len)
|
||||
{
|
||||
checkUnread();
|
||||
for (size_t i = 0; i < len; ++i)
|
||||
{
|
||||
processChar(buf[i]);
|
||||
checkUnread();
|
||||
}
|
||||
this->m->buf.write(data, len);
|
||||
}
|
||||
|
||||
void
|
||||
Pl_QPDFTokenizer::finish()
|
||||
{
|
||||
this->m->tokenizer.presentEOF();
|
||||
QPDFTokenizer::Token token;
|
||||
if (this->m->tokenizer.getToken(
|
||||
token, this->m->unread_char, this->m->char_to_unread))
|
||||
{
|
||||
this->m->filter->handleToken(token);
|
||||
}
|
||||
this->m->buf.finish();
|
||||
PointerHolder<InputSource> input =
|
||||
new BufferInputSource("tokenizer data",
|
||||
this->m->buf.getBuffer(), true);
|
||||
|
||||
while (true)
|
||||
{
|
||||
QPDFTokenizer::Token token = this->m->tokenizer.readToken(
|
||||
input, "offset " + QUtil::int_to_string(input->tell()),
|
||||
true);
|
||||
this->m->filter->handleToken(token);
|
||||
if (token.getType() == QPDFTokenizer::tt_eof)
|
||||
{
|
||||
break;
|
||||
}
|
||||
else if ((token.getType() == QPDFTokenizer::tt_word) &&
|
||||
(token.getValue() == "ID"))
|
||||
{
|
||||
QTC::TC("qpdf", "Pl_QPDFTokenizer found ID");
|
||||
this->m->tokenizer.expectInlineImage(input);
|
||||
}
|
||||
}
|
||||
this->m->filter->handleEOF();
|
||||
QPDFObjectHandle::TokenFilter::PipelineAccessor::setPipeline(
|
||||
m->filter, 0);
|
||||
|
@ -1558,7 +1558,7 @@ QPDFObjectHandle::parseContentStream_data(
|
||||
// terminated the token. Read until end of inline image.
|
||||
char ch;
|
||||
input->read(&ch, 1);
|
||||
tokenizer.expectInlineImage();
|
||||
tokenizer.expectInlineImage(input);
|
||||
QPDFTokenizer::Token t =
|
||||
tokenizer.readToken(input, description, true);
|
||||
if (t.getType() == QPDFTokenizer::tt_bad)
|
||||
|
@ -13,6 +13,79 @@
|
||||
#include <string.h>
|
||||
#include <cstdlib>
|
||||
|
||||
// Return true if ch is one of the characters that ends a word token:
// the whitespace characters space, tab, newline, vertical tab, form
// feed and carriage return, or one of ( ) < > [ ] { } / %. The NUL
// character is also reported as a delimiter.
static bool is_delimiter(char ch)
{
    switch (ch)
    {
      case '\0':
      case ' ':
      case '\t':
      case '\n':
      case '\v':
      case '\f':
      case '\r':
      case '(':
      case ')':
      case '<':
      case '>':
      case '[':
      case ']':
      case '{':
      case '}':
      case '/':
      case '%':
        return true;

      default:
        return false;
    }
}
|
||||
|
||||
class QPDFWordTokenFinder: public InputSource::Finder
|
||||
{
|
||||
public:
|
||||
QPDFWordTokenFinder(PointerHolder<InputSource> is,
|
||||
std::string const& str) :
|
||||
is(is),
|
||||
str(str)
|
||||
{
|
||||
}
|
||||
virtual ~QPDFWordTokenFinder()
|
||||
{
|
||||
}
|
||||
virtual bool check();
|
||||
|
||||
private:
|
||||
PointerHolder<InputSource> is;
|
||||
std::string str;
|
||||
};
|
||||
|
||||
bool
|
||||
QPDFWordTokenFinder::check()
|
||||
{
|
||||
// Find a word token matching the given string, preceded by a
|
||||
// delimiter, and followed by a delimiter or EOF.
|
||||
QPDFTokenizer tokenizer;
|
||||
QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true);
|
||||
qpdf_offset_t pos = is->tell();
|
||||
if (! (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str)))
|
||||
{
|
||||
/// QTC::TC("qpdf", "QPDFTokenizer finder found wrong word");
|
||||
return false;
|
||||
}
|
||||
qpdf_offset_t token_start = is->getLastOffset();
|
||||
char next;
|
||||
bool next_okay = false;
|
||||
if (is->read(&next, 1) == 0)
|
||||
{
|
||||
QTC::TC("qpdf", "QPDFTokenizer inline image at EOF");
|
||||
next_okay = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
next_okay = is_delimiter(next);
|
||||
}
|
||||
is->seek(pos, SEEK_SET);
|
||||
if (! next_okay)
|
||||
{
|
||||
/// QTC::TC("qpdf", "QPDFTokenizer finder word not followed by delimiter");
|
||||
return false;
|
||||
}
|
||||
if (token_start == 0)
|
||||
{
|
||||
// Can't actually happen...we never start the search at the
|
||||
// beginning of the input.
|
||||
return false;
|
||||
}
|
||||
is->seek(token_start - 1, SEEK_SET);
|
||||
char prev;
|
||||
bool prev_okay = ((is->read(&prev, 1) == 1) && is_delimiter(prev));
|
||||
is->seek(pos, SEEK_SET);
|
||||
if (! prev_okay)
|
||||
{
|
||||
/// QTC::TC("qpdf", "QPDFTokenizer finder word not preceded by delimiter");
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
QPDFTokenizer::Members::Members() :
|
||||
pound_special_in_name(true),
|
||||
allow_eof(false),
|
||||
@ -31,6 +104,7 @@ QPDFTokenizer::Members::reset()
|
||||
error_message = "";
|
||||
unread_char = false;
|
||||
char_to_unread = '\0';
|
||||
inline_image_bytes = 0;
|
||||
string_depth = 0;
|
||||
string_ignoring_newline = false;
|
||||
last_char_was_bs = false;
|
||||
@ -91,7 +165,7 @@ QPDFTokenizer::isSpace(char ch)
|
||||
bool
|
||||
QPDFTokenizer::isDelimiter(char ch)
|
||||
{
|
||||
return (strchr(" \t\n\v\f\r()<>[]{}/%", ch) != 0);
|
||||
return is_delimiter(ch);
|
||||
}
|
||||
|
||||
void
|
||||
@ -470,12 +544,21 @@ QPDFTokenizer::presentCharacter(char ch)
|
||||
{
|
||||
this->m->val += ch;
|
||||
size_t len = this->m->val.length();
|
||||
if ((len >= 4) &&
|
||||
isDelimiter(this->m->val.at(len-4)) &&
|
||||
(this->m->val.at(len-3) == 'E') &&
|
||||
(this->m->val.at(len-2) == 'I') &&
|
||||
isDelimiter(this->m->val.at(len-1)))
|
||||
if (len == this->m->inline_image_bytes)
|
||||
{
|
||||
QTC::TC("qpdf", "QPDFTokenizer found EI by byte count");
|
||||
this->m->type = tt_inline_image;
|
||||
this->m->inline_image_bytes = 0;
|
||||
this->m->state = st_token_ready;
|
||||
}
|
||||
else if ((this->m->inline_image_bytes == 0) &&
|
||||
(len >= 4) &&
|
||||
isDelimiter(this->m->val.at(len-4)) &&
|
||||
(this->m->val.at(len-3) == 'E') &&
|
||||
(this->m->val.at(len-2) == 'I') &&
|
||||
isDelimiter(this->m->val.at(len-1)))
|
||||
{
|
||||
QTC::TC("qpdf", "QPDFTokenizer found EI the old way");
|
||||
this->m->val.erase(len - 1);
|
||||
this->m->type = tt_inline_image;
|
||||
this->m->unread_char = true;
|
||||
@ -562,7 +645,7 @@ QPDFTokenizer::presentEOF()
|
||||
(this->m->val.at(len-2) == 'E') &&
|
||||
(this->m->val.at(len-1) == 'I'))
|
||||
{
|
||||
QTC::TC("qpdf", "QPDFTokenizer inline image at EOF");
|
||||
QTC::TC("qpdf", "QPDFTokenizer inline image at EOF the old way");
|
||||
this->m->type = tt_inline_image;
|
||||
this->m->state = st_token_ready;
|
||||
}
|
||||
@ -598,6 +681,26 @@ QPDFTokenizer::presentEOF()
|
||||
void
|
||||
QPDFTokenizer::expectInlineImage()
|
||||
{
|
||||
expectInlineImage(PointerHolder<InputSource>());
|
||||
}
|
||||
|
||||
void
|
||||
QPDFTokenizer::expectInlineImage(PointerHolder<InputSource> input)
|
||||
{
|
||||
if (input.getPointer())
|
||||
{
|
||||
qpdf_offset_t last_offset = input->getLastOffset();
|
||||
qpdf_offset_t pos = input->tell();
|
||||
|
||||
QPDFWordTokenFinder f(input, "EI");
|
||||
if (input->findFirst("EI", pos, 0, f))
|
||||
{
|
||||
this->m->inline_image_bytes = input->tell() - pos;
|
||||
}
|
||||
|
||||
input->seek(pos, SEEK_SET);
|
||||
input->setLastOffset(last_offset);
|
||||
}
|
||||
if (this->m->state != st_top)
|
||||
{
|
||||
throw std::logic_error("QPDFTokenizer::expectInlineImage called"
|
||||
|
@ -430,3 +430,6 @@ QPDFPageObjectHelper copy shared attribute 0
|
||||
qpdf from_nr from repeat_nr 0
|
||||
QPDF resolve duplicated page object 0
|
||||
QPDF handle direct page object 0
|
||||
QPDFTokenizer found EI the old way 0
|
||||
QPDFTokenizer found EI by byte count 0
|
||||
QPDFTokenizer inline image at EOF the old way 0
|
||||
|
@ -694,7 +694,7 @@ $td->runtest("check pass1 file",
|
||||
show_ntests();
|
||||
# ----------
|
||||
$td->notify("--- Tokenizer ---");
|
||||
$n_tests += 4;
|
||||
$n_tests += 5;
|
||||
|
||||
$td->runtest("tokenizer with no ignorable",
|
||||
{$td->COMMAND => "test_tokenizer -no-ignorable tokens.pdf"},
|
||||
@ -706,6 +706,11 @@ $td->runtest("tokenizer",
|
||||
{$td->FILE => "tokens.out", $td->EXIT_STATUS => 0},
|
||||
$td->NORMALIZE_NEWLINES);
|
||||
|
||||
$td->runtest("tokenizer with old inline image code",
|
||||
{$td->COMMAND => "test_tokenizer -old-ei tokens.pdf"},
|
||||
{$td->FILE => "tokens.out", $td->EXIT_STATUS => 0},
|
||||
$td->NORMALIZE_NEWLINES);
|
||||
|
||||
$td->runtest("tokenizer with max_len",
|
||||
{$td->COMMAND => "test_tokenizer -maxlen 50 tokens.pdf"},
|
||||
{$td->FILE => "tokens-maxlen.out", $td->EXIT_STATUS => 0},
|
||||
|
@ -16,7 +16,7 @@ static char const* whoami = 0;
|
||||
void usage()
|
||||
{
|
||||
std::cerr << "Usage: " << whoami
|
||||
<< " [-maxlen len | -no-ignorable] filename"
|
||||
<< " [-maxlen len | -no-ignorable | -old-ei] filename"
|
||||
<< std::endl;
|
||||
exit(2);
|
||||
}
|
||||
@ -132,7 +132,7 @@ try_skipping(QPDFTokenizer& tokenizer, PointerHolder<InputSource> is,
|
||||
static void
|
||||
dump_tokens(PointerHolder<InputSource> is, std::string const& label,
|
||||
size_t max_len, bool include_ignorable,
|
||||
bool skip_streams, bool skip_inline_images)
|
||||
bool skip_streams, bool skip_inline_images, bool old_ei)
|
||||
{
|
||||
Finder f1(is, "endstream");
|
||||
std::cout << "--- BEGIN " << label << " ---" << std::endl;
|
||||
@ -183,7 +183,14 @@ dump_tokens(PointerHolder<InputSource> is, std::string const& label,
|
||||
else if (skip_inline_images &&
|
||||
(token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "ID")))
|
||||
{
|
||||
tokenizer.expectInlineImage();
|
||||
if (old_ei)
|
||||
{
|
||||
tokenizer.expectInlineImage();
|
||||
}
|
||||
else
|
||||
{
|
||||
tokenizer.expectInlineImage(is);
|
||||
}
|
||||
inline_image_offset = is->tell();
|
||||
}
|
||||
else if (token.getType() == QPDFTokenizer::tt_eof)
|
||||
@ -195,7 +202,7 @@ dump_tokens(PointerHolder<InputSource> is, std::string const& label,
|
||||
}
|
||||
|
||||
static void process(char const* filename, bool include_ignorable,
|
||||
size_t max_len)
|
||||
size_t max_len, bool old_ei)
|
||||
{
|
||||
PointerHolder<InputSource> is;
|
||||
|
||||
@ -203,7 +210,7 @@ static void process(char const* filename, bool include_ignorable,
|
||||
FileInputSource* fis = new FileInputSource();
|
||||
fis->setFilename(filename);
|
||||
is = fis;
|
||||
dump_tokens(is, "FILE", max_len, include_ignorable, true, false);
|
||||
dump_tokens(is, "FILE", max_len, include_ignorable, true, false, false);
|
||||
|
||||
// Tokenize content streams, skipping inline images
|
||||
QPDF qpdf;
|
||||
@ -222,7 +229,7 @@ static void process(char const* filename, bool include_ignorable,
|
||||
"content data", content_data.getPointer());
|
||||
is = bis;
|
||||
dump_tokens(is, "PAGE " + QUtil::int_to_string(pageno),
|
||||
max_len, include_ignorable, false, true);
|
||||
max_len, include_ignorable, false, true, old_ei);
|
||||
}
|
||||
|
||||
// Tokenize object streams
|
||||
@ -241,7 +248,7 @@ static void process(char const* filename, bool include_ignorable,
|
||||
is = bis;
|
||||
dump_tokens(is, "OBJECT STREAM " +
|
||||
QUtil::int_to_string((*iter).getObjectID()),
|
||||
max_len, include_ignorable, false, false);
|
||||
max_len, include_ignorable, false, false, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -266,6 +273,7 @@ int main(int argc, char* argv[])
|
||||
char const* filename = 0;
|
||||
size_t max_len = 0;
|
||||
bool include_ignorable = true;
|
||||
bool old_ei = false;
|
||||
for (int i = 1; i < argc; ++i)
|
||||
{
|
||||
if (argv[i][0] == '-')
|
||||
@ -282,6 +290,10 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
include_ignorable = false;
|
||||
}
|
||||
else if (strcmp(argv[i], "-old-ei") == 0)
|
||||
{
|
||||
old_ei = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
usage();
|
||||
@ -303,7 +315,7 @@ int main(int argc, char* argv[])
|
||||
|
||||
try
|
||||
{
|
||||
process(filename, include_ignorable, max_len);
|
||||
process(filename, include_ignorable, max_len, old_ei);
|
||||
}
|
||||
catch (std::exception& e)
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user