diff --git a/ChangeLog b/ChangeLog index bcf8e9f8..46afbfed 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +2011-04-30 Jay Berkenbilt + + * libqpdf/Pl_QPDFTokenizer.cc (processChar): When an inline image + is detected, suspend normalization only up to the end of the + inline image rather than for the remainder of the content stream. + (Fixes qpdf-Bugs 3152169.) + 2011-01-31 Jay Berkenbilt * libqpdf/QPDF.cc (readObjectAtOffset): use -1 rather than 0 when diff --git a/TODO b/TODO index 41aee45d..37d8aa75 100644 --- a/TODO +++ b/TODO @@ -1,3 +1,11 @@ +Next +==== + + * Look for %PDF header somewhere within the first 1024 bytes of the + file. Also accept headers of the form "%!PS-Adobe-N.n PDF-M.m". + See Implementation notes 13 and 14 in appendix H of the PDF 1.7 + specification. This is bug 3267974. + General ======= @@ -174,6 +182,10 @@ Index: QPDFWriter.cc providing some mechanism to recover earlier versions of a file embedded prior to appended sections. + * From a suggestion in bug 3152169, consider having an option to + re-encode inline images with an ASCII encoding. + + Splitting by Pages ================== diff --git a/libqpdf/Pl_QPDFTokenizer.cc b/libqpdf/Pl_QPDFTokenizer.cc index 28ea4faa..3bd3fb6c 100644 --- a/libqpdf/Pl_QPDFTokenizer.cc +++ b/libqpdf/Pl_QPDFTokenizer.cc @@ -1,6 +1,7 @@ #include #include #include +#include #include #include @@ -11,8 +12,9 @@ Pl_QPDFTokenizer::Pl_QPDFTokenizer(char const* identifier, Pipeline* next) : last_char_was_cr(false), unread_char(false), char_to_unread('\0'), - pass_through(false) + in_inline_image(false) { + memset(this->image_buf, 0, IMAGE_BUF_SIZE); } Pl_QPDFTokenizer::~Pl_QPDFTokenizer() @@ -56,11 +58,34 @@ Pl_QPDFTokenizer::writeToken(QPDFTokenizer::Token& token) void Pl_QPDFTokenizer::processChar(char ch) { - if (this->pass_through) + if (this->in_inline_image) { - // We're not normalizing anymore -- just write this without - // looking at it. 
- writeNext(&ch, 1); + // Scan through the input looking for EI surrounded by + // whitespace. If that pattern appears in the inline image's + // representation, we're hosed, but this situation seems + // excessively unlikely, and this code path is only followed + // during content stream normalization, which is pretty much + // used for debugging and human inspection of PDF files. + memmove(this->image_buf, + this->image_buf + 1, + IMAGE_BUF_SIZE - 1); + this->image_buf[IMAGE_BUF_SIZE - 1] = ch; + if (strchr(" \t\n\v\f\r", this->image_buf[0]) && + (this->image_buf[1] == 'E') && + (this->image_buf[2] == 'I') && + strchr(" \t\n\v\f\r", this->image_buf[3])) + { + // We've found an EI operator. We've already written the + // EI operator to output; terminate with a newline + // character and resume normal processing. + writeNext("\n", 1); + this->in_inline_image = false; + QTC::TC("qpdf", "Pl_QPDFTokenizer found EI"); + } + else + { + writeNext(&ch, 1); + } return; } @@ -75,18 +100,10 @@ Pl_QPDFTokenizer::processChar(char ch) this->newline_after_next_token = false; } if ((token.getType() == QPDFTokenizer::tt_word) && - (token.getValue() == "BI")) + (token.getValue() == "ID")) { - // Uh oh.... we're not sophisticated enough to handle - // inline images safely. We'd have to to set up all the - // filters and pipe the image data through it until the - // filtered output was the right size for an image of the - // specified dimensions. Then we'd either have to write - // out raw image data or continue to write filtered data, - // resuming normalization when we get to the end. - // Instead, for now, we'll just turn off normalization for - // the remainder of this stream. - this->pass_through = true; + // Suspend normal scanning until we find an EI token. + this->in_inline_image = true; if (this->unread_char) { writeNext(&this->char_to_unread, 1); @@ -156,7 +173,7 @@ void Pl_QPDFTokenizer::finish() { this->tokenizer.presentEOF(); - if (! this->pass_through) + if (! 
this->in_inline_image) { QPDFTokenizer::Token token; if (tokenizer.getToken(token, this->unread_char, this->char_to_unread)) diff --git a/libqpdf/qpdf/Pl_QPDFTokenizer.hh b/libqpdf/qpdf/Pl_QPDFTokenizer.hh index d300d7cd..3f816f5d 100644 --- a/libqpdf/qpdf/Pl_QPDFTokenizer.hh +++ b/libqpdf/qpdf/Pl_QPDFTokenizer.hh @@ -33,7 +33,9 @@ class Pl_QPDFTokenizer: public Pipeline bool last_char_was_cr; bool unread_char; char char_to_unread; - bool pass_through; + bool in_inline_image; + static int const IMAGE_BUF_SIZE = 4; // must be >= 4 + char image_buf[IMAGE_BUF_SIZE]; }; #endif // __PL_QPDFTOKENIZER_HH__ diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov index f0b3e06a..51a6f0e8 100644 --- a/qpdf/qpdf.testcov +++ b/qpdf/qpdf.testcov @@ -187,3 +187,4 @@ QPDF_Stream getRawStreamData 0 QPDF_Stream getStreamData 0 QPDF_Stream expand filter abbreviation 0 qpdf-c called qpdf_read_memory 0 +Pl_QPDFTokenizer found EI 0 diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test index 2562f7c3..62eed000 100644 --- a/qpdf/qtest/qpdf.test +++ b/qpdf/qtest/qpdf.test @@ -1257,8 +1257,8 @@ my @flags = (["-qdf", # 1 "no arguments"], ); -$n_tests += (@files * @flags * 2 * 3); -$n_compare_pdfs += (@files * @flags * 2); +$n_tests += 1 + (@files * @flags * 2 * 3); +$n_compare_pdfs += 1 + (@files * @flags * 2); $n_acroread += (@files * @flags * 2); foreach my $file (@files) @@ -1311,6 +1311,14 @@ foreach my $file (@files) } } +# inline-images-cr.pdf is xbkm938-dies.pdf from PDF collection +$td->runtest("convert inline-images-cr to qdf", + {$td->COMMAND => "qpdf --static-id --no-original-object-ids" . 
+ " --qdf inline-images-cr.pdf a.pdf"}, + {$td->STRING => "", $td->EXIT_STATUS => 0}); + +compare_pdfs("inline-images-cr.pdf", "a.pdf"); + show_ntests(); # ---------- $td->notify("--- fix-qdf Tests ---"); diff --git a/qpdf/qtest/qpdf/inline-images-cr.pdf b/qpdf/qtest/qpdf/inline-images-cr.pdf new file mode 100644 index 00000000..48861a73 Binary files /dev/null and b/qpdf/qtest/qpdf/inline-images-cr.pdf differ