// Patch summary: find the end of an inline image by searching the input
// source for a properly delimited "EI" word instead of stopping at the first
// byte sequence that looks like one.
//
// include/qpdf/Pl_QPDFTokenizer.hh, libqpdf/Pl_QPDFTokenizer.cc
//   - Removes processChar()/checkUnread() and the last_char_was_cr,
//     unread_char and char_to_unread members; adds a Pl_Buffer member "buf".
//   - write() now only appends the incoming bytes to that buffer, so nothing
//     is tokenized until finish() is called.
//   - finish() wraps the buffered data in a BufferInputSource and loops over
//     QPDFTokenizer::readToken(), handing every token to the filter's
//     handleToken() and stopping after a tt_eof token. After a word token
//     "ID" it calls expectInlineImage(input).
//   - NOTE(review): the tt_eof token itself is passed to handleToken() before
//     handleEOF() is called -- confirm that token filters expect this.
//
// include/qpdf/QPDFTokenizer.hh, libqpdf/QPDFTokenizer.cc
//   - Adds an expectInlineImage() overload that takes the input source, and a
//     size_t inline_image_bytes member that Members::reset() sets to 0.
//   - Adds a file-local is_delimiter() helper; isDelimiter() forwards to it.
//   - Adds QPDFWordTokenFinder, an InputSource::Finder whose check() reads one
//     token and accepts it only if it is a word equal to the search string,
//     is followed by a delimiter or EOF, and is preceded by a delimiter.
//   - expectInlineImage(input) saves tell() and getLastOffset(), calls
//     findFirst("EI", ...) with that finder, stores the distance from the
//     starting position to the position after the match in
//     inline_image_bytes, and then restores both offsets. The no-argument
//     overload forwards an empty PointerHolder, which skips the search.
//   - NOTE(review): the search runs before the "state != st_top" check that
//     throws std::logic_error.
//   - presentCharacter(): while collecting inline image data, the token is
//     finished as tt_inline_image once the accumulated length equals
//     inline_image_bytes; only when inline_image_bytes is 0 does it fall back
//     to the previous delimiter/'E'/'I'/delimiter test.
//
// libqpdf/QPDFObjectHandle.cc
//   - parseContentStream_data() now calls expectInlineImage(input).
//
// qpdf/qpdf.testcov, qpdf/qtest/qpdf.test, qpdf/test_tokenizer.cc
//   - Adds three coverage entries, raises the tokenizer test count from 4 to
//     5, and adds a "-old-ei" option to test_tokenizer. The flag is passed to
//     dump_tokens() only for page content streams and selects the
//     no-argument expectInlineImage(); the new test expects the same
//     tokens.out as the default run.
//
// NOTE(review): in this copy of the patch the original line breaks are
// collapsed and text inside angle brackets (#include targets, PointerHolder
// template arguments) appears to be missing -- compare against the upstream
// commit before applying.
diff --git a/include/qpdf/Pl_QPDFTokenizer.hh b/include/qpdf/Pl_QPDFTokenizer.hh index 52630d2a..a571b079 100644 --- a/include/qpdf/Pl_QPDFTokenizer.hh +++ b/include/qpdf/Pl_QPDFTokenizer.hh @@ -27,6 +27,7 @@ #include #include #include +#include // Tokenize the incoming text using QPDFTokenizer and pass the tokens // in turn to a QPDFObjectHandle::TokenFilter object. All bytes of @@ -56,9 +57,6 @@ class Pl_QPDFTokenizer: public Pipeline virtual void finish(); private: - void processChar(char ch); - void checkUnread(); - class Members { friend class Pl_QPDFTokenizer; @@ -73,9 +71,7 @@ class Pl_QPDFTokenizer: public Pipeline QPDFObjectHandle::TokenFilter* filter; QPDFTokenizer tokenizer; - bool last_char_was_cr; - bool unread_char; - char char_to_unread; + Pl_Buffer buf; }; PointerHolder m; }; diff --git a/include/qpdf/QPDFTokenizer.hh b/include/qpdf/QPDFTokenizer.hh index 370296b2..31f2f398 100644 --- a/include/qpdf/QPDFTokenizer.hh +++ b/include/qpdf/QPDFTokenizer.hh @@ -178,7 +178,15 @@ class QPDFTokenizer // including the next EI token. After you call this method, the // next call to readToken (or the token created next time getToken // returns true) will either be tt_inline_image or tt_bad. This is - // the only way readToken returns a tt_inline_image token. + // the only way readToken returns a tt_inline_image token. The + // version of this method that takes a PointerHolder + // does a better job of locating the end of the inline image and + // should be used whenever the input source is available. It + // preserves both tell() and getLastOffset(). The version without + // the input source will always end the inline image the first + // time it sees something that looks like an EI operator. 
+ QPDF_DLL + void expectInlineImage(PointerHolder input); QPDF_DLL void expectInlineImage(); @@ -223,6 +231,7 @@ class QPDFTokenizer std::string error_message; bool unread_char; char char_to_unread; + size_t inline_image_bytes; // State for strings int string_depth; diff --git a/libqpdf/Pl_QPDFTokenizer.cc b/libqpdf/Pl_QPDFTokenizer.cc index 577c5cc7..bd5d88ab 100644 --- a/libqpdf/Pl_QPDFTokenizer.cc +++ b/libqpdf/Pl_QPDFTokenizer.cc @@ -1,13 +1,13 @@ #include #include +#include +#include #include #include Pl_QPDFTokenizer::Members::Members() : filter(0), - last_char_was_cr(false), - unread_char(false), - char_to_unread('\0') + buf("tokenizer buffer") { } @@ -33,61 +33,36 @@ Pl_QPDFTokenizer::~Pl_QPDFTokenizer() } void -Pl_QPDFTokenizer::processChar(char ch) +Pl_QPDFTokenizer::write(unsigned char* data, size_t len) { - this->m->tokenizer.presentCharacter(ch); - QPDFTokenizer::Token token; - if (this->m->tokenizer.getToken( - token, this->m->unread_char, this->m->char_to_unread)) - { - this->m->filter->handleToken(token); - if ((token.getType() == QPDFTokenizer::tt_word) && - (token.getValue() == "ID")) - { - QTC::TC("qpdf", "Pl_QPDFTokenizer found ID"); - this->m->tokenizer.expectInlineImage(); - } - } -} - - -void -Pl_QPDFTokenizer::checkUnread() -{ - if (this->m->unread_char) - { - processChar(this->m->char_to_unread); - if (this->m->unread_char) - { - throw std::logic_error( - "INTERNAL ERROR: unread_char still true after processing " - "unread character"); - } - } -} - -void -Pl_QPDFTokenizer::write(unsigned char* buf, size_t len) -{ - checkUnread(); - for (size_t i = 0; i < len; ++i) - { - processChar(buf[i]); - checkUnread(); - } + this->m->buf.write(data, len); } void Pl_QPDFTokenizer::finish() { - this->m->tokenizer.presentEOF(); - QPDFTokenizer::Token token; - if (this->m->tokenizer.getToken( - token, this->m->unread_char, this->m->char_to_unread)) - { - this->m->filter->handleToken(token); - } + this->m->buf.finish(); + PointerHolder input = + new 
BufferInputSource("tokenizer data", + this->m->buf.getBuffer(), true); + while (true) + { + QPDFTokenizer::Token token = this->m->tokenizer.readToken( + input, "offset " + QUtil::int_to_string(input->tell()), + true); + this->m->filter->handleToken(token); + if (token.getType() == QPDFTokenizer::tt_eof) + { + break; + } + else if ((token.getType() == QPDFTokenizer::tt_word) && + (token.getValue() == "ID")) + { + QTC::TC("qpdf", "Pl_QPDFTokenizer found ID"); + this->m->tokenizer.expectInlineImage(input); + } + } this->m->filter->handleEOF(); QPDFObjectHandle::TokenFilter::PipelineAccessor::setPipeline( m->filter, 0); diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc index ecaa49bd..de5d56b3 100644 --- a/libqpdf/QPDFObjectHandle.cc +++ b/libqpdf/QPDFObjectHandle.cc @@ -1558,7 +1558,7 @@ QPDFObjectHandle::parseContentStream_data( // terminated the token. Read until end of inline image. char ch; input->read(&ch, 1); - tokenizer.expectInlineImage(); + tokenizer.expectInlineImage(input); QPDFTokenizer::Token t = tokenizer.readToken(input, description, true); if (t.getType() == QPDFTokenizer::tt_bad) diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc index c11c8218..e03f927b 100644 --- a/libqpdf/QPDFTokenizer.cc +++ b/libqpdf/QPDFTokenizer.cc @@ -13,6 +13,79 @@ #include #include +static bool is_delimiter(char ch) +{ + return (strchr(" \t\n\v\f\r()<>[]{}/%", ch) != 0); +} + +class QPDFWordTokenFinder: public InputSource::Finder +{ + public: + QPDFWordTokenFinder(PointerHolder is, + std::string const& str) : + is(is), + str(str) + { + } + virtual ~QPDFWordTokenFinder() + { + } + virtual bool check(); + + private: + PointerHolder is; + std::string str; +}; + +bool +QPDFWordTokenFinder::check() +{ + // Find a word token matching the given string, preceded by a + // delimiter, and followed by a delimiter or EOF. 
+ QPDFTokenizer tokenizer; + QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true); + qpdf_offset_t pos = is->tell(); + if (! (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str))) + { +/// QTC::TC("qpdf", "QPDFTokenizer finder found wrong word"); + return false; + } + qpdf_offset_t token_start = is->getLastOffset(); + char next; + bool next_okay = false; + if (is->read(&next, 1) == 0) + { + QTC::TC("qpdf", "QPDFTokenizer inline image at EOF"); + next_okay = true; + } + else + { + next_okay = is_delimiter(next); + } + is->seek(pos, SEEK_SET); + if (! next_okay) + { +/// QTC::TC("qpdf", "QPDFTokenizer finder word not followed by delimiter"); + return false; + } + if (token_start == 0) + { + // Can't actually happen...we never start the search at the + // beginning of the input. + return false; + } + is->seek(token_start - 1, SEEK_SET); + char prev; + bool prev_okay = ((is->read(&prev, 1) == 1) && is_delimiter(prev)); + is->seek(pos, SEEK_SET); + if (! prev_okay) + { +/// QTC::TC("qpdf", "QPDFTokenizer finder word not preceded by delimiter"); + return false; + } + return true; +} + QPDFTokenizer::Members::Members() : pound_special_in_name(true), allow_eof(false), @@ -31,6 +104,7 @@ QPDFTokenizer::Members::reset() error_message = ""; unread_char = false; char_to_unread = '\0'; + inline_image_bytes = 0; string_depth = 0; string_ignoring_newline = false; last_char_was_bs = false; @@ -91,7 +165,7 @@ QPDFTokenizer::isSpace(char ch) bool QPDFTokenizer::isDelimiter(char ch) { - return (strchr(" \t\n\v\f\r()<>[]{}/%", ch) != 0); + return is_delimiter(ch); } void @@ -470,12 +544,21 @@ QPDFTokenizer::presentCharacter(char ch) { this->m->val += ch; size_t len = this->m->val.length(); - if ((len >= 4) && - isDelimiter(this->m->val.at(len-4)) && - (this->m->val.at(len-3) == 'E') && - (this->m->val.at(len-2) == 'I') && - isDelimiter(this->m->val.at(len-1))) + if (len == this->m->inline_image_bytes) { + QTC::TC("qpdf", "QPDFTokenizer found EI by byte count"); + 
this->m->type = tt_inline_image; + this->m->inline_image_bytes = 0; + this->m->state = st_token_ready; + } + else if ((this->m->inline_image_bytes == 0) && + (len >= 4) && + isDelimiter(this->m->val.at(len-4)) && + (this->m->val.at(len-3) == 'E') && + (this->m->val.at(len-2) == 'I') && + isDelimiter(this->m->val.at(len-1))) + { + QTC::TC("qpdf", "QPDFTokenizer found EI the old way"); this->m->val.erase(len - 1); this->m->type = tt_inline_image; this->m->unread_char = true; @@ -562,7 +645,7 @@ QPDFTokenizer::presentEOF() (this->m->val.at(len-2) == 'E') && (this->m->val.at(len-1) == 'I')) { - QTC::TC("qpdf", "QPDFTokenizer inline image at EOF"); + QTC::TC("qpdf", "QPDFTokenizer inline image at EOF the old way"); this->m->type = tt_inline_image; this->m->state = st_token_ready; } @@ -598,6 +681,26 @@ QPDFTokenizer::presentEOF() void QPDFTokenizer::expectInlineImage() { + expectInlineImage(PointerHolder()); +} + +void +QPDFTokenizer::expectInlineImage(PointerHolder input) +{ + if (input.getPointer()) + { + qpdf_offset_t last_offset = input->getLastOffset(); + qpdf_offset_t pos = input->tell(); + + QPDFWordTokenFinder f(input, "EI"); + if (input->findFirst("EI", pos, 0, f)) + { + this->m->inline_image_bytes = input->tell() - pos; + } + + input->seek(pos, SEEK_SET); + input->setLastOffset(last_offset); + } if (this->m->state != st_top) { throw std::logic_error("QPDFTokenizer::expectInlineImage called" diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov index 5150e567..6dcebd6e 100644 --- a/qpdf/qpdf.testcov +++ b/qpdf/qpdf.testcov @@ -430,3 +430,6 @@ QPDFPageObjectHelper copy shared attribute 0 qpdf from_nr from repeat_nr 0 QPDF resolve duplicated page object 0 QPDF handle direct page object 0 +QPDFTokenizer found EI the old way 0 +QPDFTokenizer found EI by byte count 0 +QPDFTokenizer inline image at EOF the old way 0 diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test index ca7ea12b..6abc7edb 100644 --- a/qpdf/qtest/qpdf.test +++ b/qpdf/qtest/qpdf.test @@ -694,7 
+694,7 @@ $td->runtest("check pass1 file", show_ntests(); # ---------- $td->notify("--- Tokenizer ---"); -$n_tests += 4; +$n_tests += 5; $td->runtest("tokenizer with no ignorable", {$td->COMMAND => "test_tokenizer -no-ignorable tokens.pdf"}, @@ -706,6 +706,11 @@ $td->runtest("tokenizer", {$td->FILE => "tokens.out", $td->EXIT_STATUS => 0}, $td->NORMALIZE_NEWLINES); +$td->runtest("tokenizer with old inline image code", + {$td->COMMAND => "test_tokenizer -old-ei tokens.pdf"}, + {$td->FILE => "tokens.out", $td->EXIT_STATUS => 0}, + $td->NORMALIZE_NEWLINES); + $td->runtest("tokenizer with max_len", {$td->COMMAND => "test_tokenizer -maxlen 50 tokens.pdf"}, {$td->FILE => "tokens-maxlen.out", $td->EXIT_STATUS => 0}, diff --git a/qpdf/test_tokenizer.cc b/qpdf/test_tokenizer.cc index 9f65281b..ecbb3552 100644 --- a/qpdf/test_tokenizer.cc +++ b/qpdf/test_tokenizer.cc @@ -16,7 +16,7 @@ static char const* whoami = 0; void usage() { std::cerr << "Usage: " << whoami - << " [-maxlen len | -no-ignorable] filename" + << " [-maxlen len | -no-ignorable | -old-ei] filename" << std::endl; exit(2); } @@ -132,7 +132,7 @@ try_skipping(QPDFTokenizer& tokenizer, PointerHolder is, static void dump_tokens(PointerHolder is, std::string const& label, size_t max_len, bool include_ignorable, - bool skip_streams, bool skip_inline_images) + bool skip_streams, bool skip_inline_images, bool old_ei) { Finder f1(is, "endstream"); std::cout << "--- BEGIN " << label << " ---" << std::endl; @@ -183,7 +183,14 @@ dump_tokens(PointerHolder is, std::string const& label, else if (skip_inline_images && (token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "ID"))) { - tokenizer.expectInlineImage(); + if (old_ei) + { + tokenizer.expectInlineImage(); + } + else + { + tokenizer.expectInlineImage(is); + } inline_image_offset = is->tell(); } else if (token.getType() == QPDFTokenizer::tt_eof) @@ -195,7 +202,7 @@ dump_tokens(PointerHolder is, std::string const& label, } static void process(char const* filename, bool 
include_ignorable, - size_t max_len) + size_t max_len, bool old_ei) { PointerHolder is; @@ -203,7 +210,7 @@ static void process(char const* filename, bool include_ignorable, FileInputSource* fis = new FileInputSource(); fis->setFilename(filename); is = fis; - dump_tokens(is, "FILE", max_len, include_ignorable, true, false); + dump_tokens(is, "FILE", max_len, include_ignorable, true, false, false); // Tokenize content streams, skipping inline images QPDF qpdf; @@ -222,7 +229,7 @@ static void process(char const* filename, bool include_ignorable, "content data", content_data.getPointer()); is = bis; dump_tokens(is, "PAGE " + QUtil::int_to_string(pageno), - max_len, include_ignorable, false, true); + max_len, include_ignorable, false, true, old_ei); } // Tokenize object streams @@ -241,7 +248,7 @@ static void process(char const* filename, bool include_ignorable, is = bis; dump_tokens(is, "OBJECT STREAM " + QUtil::int_to_string((*iter).getObjectID()), - max_len, include_ignorable, false, false); + max_len, include_ignorable, false, false, false); } } } @@ -266,6 +273,7 @@ int main(int argc, char* argv[]) char const* filename = 0; size_t max_len = 0; bool include_ignorable = true; + bool old_ei = false; for (int i = 1; i < argc; ++i) { if (argv[i][0] == '-') @@ -282,6 +290,10 @@ int main(int argc, char* argv[]) { include_ignorable = false; } + else if (strcmp(argv[i], "-old-ei") == 0) + { + old_ei = true; + } else { usage(); @@ -303,7 +315,7 @@ int main(int argc, char* argv[]) try { - process(filename, include_ignorable, max_len); + process(filename, include_ignorable, max_len, old_ei); } catch (std::exception& e) {