be less conservative when skipping over inline images in content normalization

git-svn-id: svn+q:///qpdf/trunk@1050 71b93d88-0707-0410-a8cf-f5a4172ac649
2024-12-22 02:49:00 +00:00 · 2011-04-30 18:20:35 +00:00 · 2011-04-30 18:20:35 +00:00 · 6405d3928f
commit 6405d3928f
parent a8f2248729
7 changed files with 67 additions and 20 deletions
--- a/7
+++ b/7
@ -1,3 +1,10 @@
+2011-04-30  Jay Berkenbilt  <ejb@ql.org>
+
+	* libqpdf/Pl_QPDFTokenizer.cc (processChar): When an inline image
+	is detected, suspend normalization only up to the end of the
+	inline image rather than for the remainder of the content stream.
+	(Fixes qpdf-Bugs 3152169.)
+
 2011-01-31  Jay Berkenbilt  <ejb@ql.org>

 	* libqpdf/QPDF.cc (readObjectAtOffset): use -1 rather than 0 when
--- a/12
+++ b/12
@ -1,3 +1,11 @@
+Next
+====
+
+ * Look for %PDF header somewhere within the first 1024 bytes of the
+   file.  Also accept headers of the form "%!PS-Adobe-N.n PDF-M.m".
+   See Implementation notes 13 and 14 in appendix H of the PDF 1.7
+   specification.  This is bug 3267974.
+
 General
 =======

@ -174,6 +182,10 @@ Index: QPDFWriter.cc
   providing some mechanism to recover earlier versions of a file
   embedded prior to appended sections.

+ * From a suggestion in bug 3152169, consider having an option to
+   re-encode inline images with an ASCII encoding.
+
+
 Splitting by Pages
 ==================

--- a/libqpdf/Pl_QPDFTokenizer.cc
+++ b/libqpdf/Pl_QPDFTokenizer.cc
@ -1,6 +1,7 @@
 #include <qpdf/Pl_QPDFTokenizer.hh>
 #include <qpdf/QPDF_String.hh>
 #include <qpdf/QPDF_Name.hh>
+#include <qpdf/QTC.hh>
 #include <stdexcept>
 #include <string.h>

@ -11,8 +12,9 @@ Pl_QPDFTokenizer::Pl_QPDFTokenizer(char const* identifier, Pipeline* next) :
    last_char_was_cr(false),
    unread_char(false),
    char_to_unread('\0'),
-    pass_through(false)
+    in_inline_image(false)
 {
+    memset(this->image_buf, 0, IMAGE_BUF_SIZE);
 }

 Pl_QPDFTokenizer::~Pl_QPDFTokenizer()
@ -56,11 +58,34 @@ Pl_QPDFTokenizer::writeToken(QPDFTokenizer::Token& token)
 void
 Pl_QPDFTokenizer::processChar(char ch)
 {
-    if (this->pass_through)
+    if (this->in_inline_image)
    {
-	// We're not normalizing anymore -- just write this without
-	// looking at it.
-	writeNext(&ch, 1);
+	// Scan through the input looking for EI surrounded by
+	// whitespace.  If that pattern appears in the inline image's
+	// representation, we're hosed, but this situation seems
+	// excessively unlikely, and this code path is only followed
+	// during content stream normalization, which is pretty much
+	// used for debugging and human inspection of PDF files.
+	memmove(this->image_buf,
+		this->image_buf + 1,
+		IMAGE_BUF_SIZE - 1);
+	this->image_buf[IMAGE_BUF_SIZE - 1] = ch;
+	if (strchr(" \t\n\v\f\r", this->image_buf[0]) &&
+	    (this->image_buf[1] == 'E') &&
+	    (this->image_buf[2] == 'I') &&
+	    strchr(" \t\n\v\f\r", this->image_buf[3]))
+	{
+	    // We've found an EI operator.  We've already written the
+	    // EI operator to output; terminate with a newline
+	    // character and resume normal processing.
+	    writeNext("\n", 1);
+	    this->in_inline_image = false;
+	    QTC::TC("qpdf", "Pl_QPDFTokenizer found EI");
+	}
+	else
+	{
+	    writeNext(&ch, 1);
+	}
 	return;
    }

@ -75,18 +100,10 @@ Pl_QPDFTokenizer::processChar(char ch)
 	    this->newline_after_next_token = false;
 	}
 	if ((token.getType() == QPDFTokenizer::tt_word) &&
-	    (token.getValue() == "BI"))
+	    (token.getValue() == "ID"))
 	{
-	    // Uh oh.... we're not sophisticated enough to handle
-	    // inline images safely.  We'd have to to set up all the
-	    // filters and pipe the image data through it until the
-	    // filtered output was the right size for an image of the
-	    // specified dimensions.  Then we'd either have to write
-	    // out raw image data or continue to write filtered data,
-	    // resuming normalization when we get to the end.
-	    // Instead, for now, we'll just turn off normalization for
-	    // the remainder of this stream.
-	    this->pass_through = true;
+	    // Suspend normal scanning until we find an EI token.
+	    this->in_inline_image = true;
 	    if (this->unread_char)
 	    {
 		writeNext(&this->char_to_unread, 1);
@ -156,7 +173,7 @@ void
 Pl_QPDFTokenizer::finish()
 {
    this->tokenizer.presentEOF();
-    if (! this->pass_through)
+    if (! this->in_inline_image)
    {
 	QPDFTokenizer::Token token;
 	if (tokenizer.getToken(token, this->unread_char, this->char_to_unread))
--- a/libqpdf/qpdf/Pl_QPDFTokenizer.hh
+++ b/libqpdf/qpdf/Pl_QPDFTokenizer.hh
@ -33,7 +33,9 @@ class Pl_QPDFTokenizer: public Pipeline
    bool last_char_was_cr;
    bool unread_char;
    char char_to_unread;
-    bool pass_through;
+    bool in_inline_image;
+    static int const IMAGE_BUF_SIZE = 4; // must be >= 4
+    char image_buf[IMAGE_BUF_SIZE];
 };

 #endif // __PL_QPDFTOKENIZER_HH__
--- a/qpdf/qpdf.testcov
+++ b/qpdf/qpdf.testcov
@ -187,3 +187,4 @@ QPDF_Stream getRawStreamData 0
 QPDF_Stream getStreamData 0
 QPDF_Stream expand filter abbreviation 0
 qpdf-c called qpdf_read_memory 0
+Pl_QPDFTokenizer found EI 0
--- a/qpdf/qtest/qpdf.test
+++ b/qpdf/qtest/qpdf.test
@ -1257,8 +1257,8 @@ my @flags = (["-qdf",						# 1
 	      "no arguments"],
 	     );

-$n_tests += (@files * @flags  * 2 * 3);
-$n_compare_pdfs += (@files * @flags * 2);
+$n_tests += 1 + (@files * @flags  * 2 * 3);
+$n_compare_pdfs += 1 + (@files * @flags * 2);
 $n_acroread += (@files * @flags * 2);

 foreach my $file (@files)
@ -1311,6 +1311,14 @@ foreach my $file (@files)
    }
 }

+# inline-images-cr.pdf is xbkm938-dies.pdf from PDF collection
+$td->runtest("convert inline-images-cr to qdf",
+	     {$td->COMMAND => "qpdf --static-id --no-original-object-ids" .
+		  " --qdf inline-images-cr.pdf a.pdf"},
+	     {$td->STRING => "", $td->EXIT_STATUS => 0});
+
+compare_pdfs("inline-images-cr.pdf", "a.pdf");
+
 show_ntests();
 # ----------
 $td->notify("--- fix-qdf Tests ---");
--- a/qpdf/qtest/qpdf/inline-images-cr.pdf
+++ b/qpdf/qtest/qpdf/inline-images-cr.pdf