diff --git a/ChangeLog b/ChangeLog index 44396f55..cbe9357e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,15 @@ +2019-01-30 Jay Berkenbilt + + * Improve locating of an inline image's EI operator to correctly + handle the case of EI appearing inside the image data. + + * Very low-level QPDFTokenizer API now includes an + expectInlineImage method that takes an input stream, enabling it + to locate an inline image's EI operator better. This is called + automatically everywhere within the qpdf library. Most user code + will never have to use the low-level tokenizer API. If you use + Pl_QPDFTokenizer, this will be done automatically for you. + 2019-01-29 Jay Berkenbilt * Bug fix: when returning an inline image token, the tokenizer no diff --git a/include/qpdf/QPDFTokenizer.hh b/include/qpdf/QPDFTokenizer.hh index 31f2f398..424ac099 100644 --- a/include/qpdf/QPDFTokenizer.hh +++ b/include/qpdf/QPDFTokenizer.hh @@ -198,6 +198,7 @@ class QPDFTokenizer void resolveLiteral(); bool isSpace(char); bool isDelimiter(char); + void findEI(PointerHolder input); enum state_e { st_top, st_in_space, st_in_comment, st_in_string, st_lt, st_gt, diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc index e03f927b..2671fcbb 100644 --- a/libqpdf/QPDFTokenizer.cc +++ b/libqpdf/QPDFTokenizer.cc @@ -47,7 +47,7 @@ QPDFWordTokenFinder::check() qpdf_offset_t pos = is->tell(); if (! (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str))) { -/// QTC::TC("qpdf", "QPDFTokenizer finder found wrong word"); + QTC::TC("qpdf", "QPDFTokenizer finder found wrong word"); return false; } qpdf_offset_t token_start = is->getLastOffset(); @@ -65,7 +65,6 @@ QPDFWordTokenFinder::check() is->seek(pos, SEEK_SET); if (! next_okay) { -/// QTC::TC("qpdf", "QPDFTokenizer finder word not followed by delimiter"); return false; } if (token_start == 0) @@ -80,7 +79,7 @@ QPDFWordTokenFinder::check() is->seek(pos, SEEK_SET); if (! 
prev_okay) { -/// QTC::TC("qpdf", "QPDFTokenizer finder word not preceded by delimiter"); + QTC::TC("qpdf", "QPDFTokenizer finder word not preceded by delimiter"); return false; } return true; @@ -687,28 +686,133 @@ QPDFTokenizer::expectInlineImage() void QPDFTokenizer::expectInlineImage(PointerHolder input) { - if (input.getPointer()) - { - qpdf_offset_t last_offset = input->getLastOffset(); - qpdf_offset_t pos = input->tell(); - - QPDFWordTokenFinder f(input, "EI"); - if (input->findFirst("EI", pos, 0, f)) - { - this->m->inline_image_bytes = input->tell() - pos; - } - - input->seek(pos, SEEK_SET); - input->setLastOffset(last_offset); - } if (this->m->state != st_top) { throw std::logic_error("QPDFTokenizer::expectInlineImage called" " when tokenizer is in improper state"); } + findEI(input); this->m->state = st_inline_image; } +void +QPDFTokenizer::findEI(PointerHolder input) +{ + if (! input.getPointer()) + { + return; + } + + qpdf_offset_t last_offset = input->getLastOffset(); + qpdf_offset_t pos = input->tell(); + + // Use QPDFWordTokenFinder to find EI surrounded by delimiters. + // Then read the next several tokens or up to EOF. If we find any + // suspicious-looking or bad tokens, this is probably still part of + // the image data, so keep looking for EI. Stop at the first EI + // that passes. If we get to the end without finding one, return + // the last EI we found. Store the number of bytes expected in the + // inline image including the EI and use that to break out of + // inline image, falling back to the old method if needed. + + bool okay = false; + bool first_try = true; + while (! okay) + { + QPDFWordTokenFinder f(input, "EI"); + if (! input->findFirst("EI", input->tell(), 0, f)) + { + break; + } + this->m->inline_image_bytes = input->tell() - pos; + + QPDFTokenizer check; + bool found_bad = false; + // Look at the next 10 tokens or up to EOF. 
The next inline + // image's image data would look like bad tokens, but there + // will always be at least 10 tokens between one inline + // image's EI and the next valid one's ID since width, height, + // bits per pixel, and color space are all required as well as + // a BI and ID. If we get 10 good tokens in a row or hit EOF, + // we can be pretty sure we've found the actual EI. + for (int i = 0; i < 10; ++i) + { + QPDFTokenizer::Token t = + check.readToken(input, "checker", true); + token_type_e type = t.getType(); + if (type == tt_eof) + { + okay = true; + } + else if (type == tt_bad) + { + found_bad = true; + } + else if (type == tt_word) + { + // The qpdf tokenizer lumps alphabetic and otherwise + // uncategorized characters into "words". We recognize + // strings of alphabetic characters as potential valid + // operators for purposes of telling whether we're in + // valid content or not. It's not perfect, but it + // should work more reliably than what we used to do, + // which was already good enough for the vast majority + // of files. + bool found_alpha = false; + bool found_non_printable = false; + bool found_other = false; + std::string value = t.getValue(); + for (std::string::iterator iter = value.begin(); + iter != value.end(); ++iter) + { + char ch = *iter; + if (((ch >= 'a') && (ch <= 'z')) || + ((ch >= 'A') && (ch <= 'Z')) || + (ch == '*')) + { + // Treat '*' as alpha since there are valid + // PDF operators that contain * along with + // alphabetic characters. + found_alpha = true; + } + else if (((ch < 32) && (! isSpace(ch))) || (ch > 127)) + { + found_non_printable = true; + break; + } + else + { + found_other = true; + } + } + if (found_non_printable || (found_alpha && found_other)) + { + found_bad = true; + } + } + if (okay || found_bad) + { + break; + } + } + if (! found_bad) + { + okay = true; + } + if (! okay) + { + first_try = false; + } + } + if (okay && (! 
first_try)) + { + QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try"); + } + + input->seek(pos, SEEK_SET); + input->setLastOffset(last_offset); +} + bool QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch) { diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov index 6dcebd6e..21b43dc8 100644 --- a/qpdf/qpdf.testcov +++ b/qpdf/qpdf.testcov @@ -430,6 +430,9 @@ QPDFPageObjectHelper copy shared attribute 0 qpdf from_nr from repeat_nr 0 QPDF resolve duplicated page object 0 QPDF handle direct page object 0 +QPDFTokenizer finder found wrong word 0 +QPDFTokenizer finder word not preceded by delimiter 0 QPDFTokenizer found EI the old way 0 QPDFTokenizer found EI by byte count 0 QPDFTokenizer inline image at EOF the old way 0 +QPDFTokenizer found EI after more than one try 0 diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test index 6abc7edb..da40d389 100644 --- a/qpdf/qtest/qpdf.test +++ b/qpdf/qtest/qpdf.test @@ -691,6 +691,26 @@ $td->runtest("check pass1 file", {$td->FILE => "b.pdf"}, {$td->FILE => "minimal-linearize-pass1.pdf"}); +show_ntests(); +# ---------- +$td->notify("--- Inline Images ---"); +$n_tests += 2; + +# The file large-inline-image.pdf is a hand-crafted file with several +# inline images of various sizes including one that is two megabytes, +# encoded in base85, and has a base85-encoding that contains EI +# surrounded by delimiters several times. This exercises the EI +# detection code added in qpdf 8.4. 
+ +$td->runtest("complex inline image parsing", + {$td->COMMAND => + "qpdf --qdf --static-id large-inline-image.pdf a.pdf"}, + {$td->STRING => "", $td->EXIT_STATUS => 0}, + $td->NORMALIZE_NEWLINES); +$td->runtest("check output", + {$td->FILE => "a.pdf"}, + {$td->FILE => "large-inline-image.qdf"}); + show_ntests(); # ---------- $td->notify("--- Tokenizer ---"); diff --git a/qpdf/qtest/qpdf/large-inline-image.pdf b/qpdf/qtest/qpdf/large-inline-image.pdf new file mode 100644 index 00000000..0a47c192 Binary files /dev/null and b/qpdf/qtest/qpdf/large-inline-image.pdf differ diff --git a/qpdf/qtest/qpdf/large-inline-image.qdf b/qpdf/qtest/qpdf/large-inline-image.qdf new file mode 100644 index 00000000..a82ea105 Binary files /dev/null and b/qpdf/qtest/qpdf/large-inline-image.qdf differ