Improve locating inline image's EI

We've actually seen a PDF file in the wild that contained EI surrounded by delimiters inside the image data, which confused qpdf's naive code. This significantly improves EI detection.
2024-12-22 10:58:58 +00:00 · 2019-01-30 14:26:08 -05:00 · 2019-01-30 14:26:08 -05:00 · 2b6c79bcae
commit 2b6c79bcae
parent ec9e310c9e
7 changed files with 157 additions and 17 deletions
--- a/12
+++ b/12
@ -1,3 +1,15 @@
+2019-01-30  Jay Berkenbilt  <ejb@ql.org>
+
+	* Improve locating of an inline image's EI operator to correctly
+	handle the case of EI appearing inside the image data.
+
+	* Very low-level QPDFTokenizer API now includes an
+	expectInlineImage method that takes an input stream, enabling it
+	to locate an inline image's EI operator better. This is called
+	automatically everywhere within the qpdf library. Most user code
+	will never have to use the low-level tokenizer API. If you use
+	Pl_QPDFTokenizer, this will be done automatically for you.
+
 2019-01-29  Jay Berkenbilt  <ejb@ql.org>

 	* Bug fix: when returning an inline image token, the tokenizer no
--- a/include/qpdf/QPDFTokenizer.hh
+++ b/include/qpdf/QPDFTokenizer.hh
@ -198,6 +198,7 @@ class QPDFTokenizer
    void resolveLiteral();
    bool isSpace(char);
    bool isDelimiter(char);
+    void findEI(PointerHolder<InputSource> input);

    enum state_e {
        st_top, st_in_space, st_in_comment, st_in_string, st_lt, st_gt,
--- a/libqpdf/QPDFTokenizer.cc
+++ b/libqpdf/QPDFTokenizer.cc
@ -47,7 +47,7 @@ QPDFWordTokenFinder::check()
    qpdf_offset_t pos = is->tell();
    if (! (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str)))
    {
-///        QTC::TC("qpdf", "QPDFTokenizer finder found wrong word");
+        QTC::TC("qpdf", "QPDFTokenizer finder found wrong word");
        return false;
    }
    qpdf_offset_t token_start = is->getLastOffset();
@ -65,7 +65,6 @@ QPDFWordTokenFinder::check()
    is->seek(pos, SEEK_SET);
    if (! next_okay)
    {
-///        QTC::TC("qpdf", "QPDFTokenizer finder word not followed by delimiter");
        return false;
    }
    if (token_start == 0)
@ -80,7 +79,7 @@ QPDFWordTokenFinder::check()
    is->seek(pos, SEEK_SET);
    if (! prev_okay)
    {
-///        QTC::TC("qpdf", "QPDFTokenizer finder word not preceded by delimiter");
+        QTC::TC("qpdf", "QPDFTokenizer finder word not preceded by delimiter");
        return false;
    }
    return true;
@ -687,28 +686,133 @@ QPDFTokenizer::expectInlineImage()
 void
 QPDFTokenizer::expectInlineImage(PointerHolder<InputSource> input)
 {
-    if (input.getPointer())
-    {
-        qpdf_offset_t last_offset = input->getLastOffset();
-        qpdf_offset_t pos = input->tell();
-
-        QPDFWordTokenFinder f(input, "EI");
-        if (input->findFirst("EI", pos, 0, f))
-        {
-            this->m->inline_image_bytes = input->tell() - pos;
-        }
-
-        input->seek(pos, SEEK_SET);
-        input->setLastOffset(last_offset);
-    }
    if (this->m->state != st_top)
    {
        throw std::logic_error("QPDFTokenizer::expectInlineImage called"
                               " when tokenizer is in improper state");
    }
+    findEI(input); // runs only after the state check; locates EI and records inline_image_bytes (returns at once if input is null)
    this->m->state = st_inline_image;
 }

+// Scan ahead in input, starting at its current position, for the EI
+// operator that terminates an inline image, and store in
+// m->inline_image_bytes the number of bytes expected in the inline
+// image including the EI. Does nothing if input is null. If no
+// delimited EI is found at all, inline_image_bytes is left untouched.
+// The input's position and last offset are restored before
+// returning.
+void
+QPDFTokenizer::findEI(PointerHolder<InputSource> input)
+{
+    if (! input.getPointer())
+    {
+        return;
+    }
+
+    // Remember where we started so the scan is invisible to the caller.
+    qpdf_offset_t last_offset = input->getLastOffset();
+    qpdf_offset_t pos = input->tell();
+
+    // Use QPDFWordTokenFinder to find EI surrounded by delimiters.
+    // Then read the next several tokens or up to EOF. If we find any
+    // suspicious-looking or bad tokens, this is probably still part
+    // of the image data, so keep looking for EI. Stop at the first EI
+    // that passes. If we get to the end without finding one, keep
+    // the last EI we found. Store the number of bytes expected in the
+    // inline image including the EI and use that to break out of
+    // inline image, falling back to the old method if needed.
+
+    bool okay = false;
+    bool first_try = true;
+    while (! okay)
+    {
+        // Each search resumes from wherever the previous round of
+        // token checking left the input.
+        QPDFWordTokenFinder f(input, "EI");
+        if (! input->findFirst("EI", input->tell(), 0, f))
+        {
+            break;
+        }
+        // Record this candidate immediately; it remains the answer
+        // if no later candidate is found.
+        this->m->inline_image_bytes = input->tell() - pos;
+
+        QPDFTokenizer check;
+        bool found_bad = false;
+        // Look at the next 10 tokens or up to EOF. The next inline
+        // image's image data would look like bad tokens, but there
+        // will always be at least 10 tokens between one inline
+        // image's EI and the next valid one's ID since width, height,
+        // bits per pixel, and color space are all required as well as
+        // a BI and ID. If we get 10 good tokens in a row or hit EOF,
+        // we can be pretty sure we've found the actual EI.
+        for (int i = 0; i < 10; ++i)
+        {
+            QPDFTokenizer::Token t =
+                check.readToken(input, "checker", true);
+            token_type_e type = t.getType();
+            if (type == tt_eof)
+            {
+                okay = true;
+            }
+            else if (type == tt_bad)
+            {
+                found_bad = true;
+            }
+            else if (type == tt_word)
+            {
+                // The qpdf tokenizer lumps alphabetic and otherwise
+                // uncategorized characters into "words". We recognize
+                // strings of alphabetic characters as potential valid
+                // operators for purposes of telling whether we're in
+                // valid content or not. It's not perfect, but it
+                // should work more reliably than what we used to do,
+                // which was already good enough for the vast majority
+                // of files.
+                bool found_alpha = false;
+                bool found_non_printable = false;
+                bool found_other = false;
+                // Bind by reference: t outlives this loop, so no
+                // copy of the token text is needed.
+                std::string const& value = t.getValue();
+                for (std::string::const_iterator iter = value.begin();
+                     iter != value.end(); ++iter)
+                {
+                    char ch = *iter;
+                    // Whether plain char is signed is
+                    // implementation-defined, so comparing ch itself
+                    // against 127 is never true where char is signed.
+                    // Classify through unsigned char so control and
+                    // high-bit bytes are detected the same way on
+                    // every platform.
+                    unsigned char uch = static_cast<unsigned char>(ch);
+                    if (((ch >= 'a') && (ch <= 'z')) ||
+                        ((ch >= 'A') && (ch <= 'Z')) ||
+                        (ch == '*'))
+                    {
+                        // Treat '*' as alpha since there are valid
+                        // PDF operators that contain * along with
+                        // alphabetic characters.
+                        found_alpha = true;
+                    }
+                    else if (((uch < 32) && (! isSpace(ch))) || (uch > 127))
+                    {
+                        // One non-printable byte is enough to reject
+                        // the word; no need to look further.
+                        found_non_printable = true;
+                        break;
+                    }
+                    else
+                    {
+                        found_other = true;
+                    }
+                }
+                // A word mixing letters with other printable
+                // characters is treated as suspicious, as is any
+                // word containing a non-printable byte.
+                if (found_non_printable || (found_alpha && found_other))
+                {
+                    found_bad = true;
+                }
+            }
+            if (okay || found_bad)
+            {
+                break;
+            }
+        }
+        if (! found_bad)
+        {
+            okay = true;
+        }
+        if (! okay)
+        {
+            first_try = false;
+        }
+    }
+    if (okay && (! first_try))
+    {
+        QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try");
+    }
+
+    // Put the input back exactly as we found it.
+    input->seek(pos, SEEK_SET);
+    input->setLastOffset(last_offset);
+}
+
 bool
 QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch)
 {
--- a/qpdf/qpdf.testcov
+++ b/qpdf/qpdf.testcov
@ -430,6 +430,9 @@ QPDFPageObjectHelper copy shared attribute 0
 qpdf from_nr from repeat_nr 0
 QPDF resolve duplicated page object 0
 QPDF handle direct page object 0
+QPDFTokenizer finder found wrong word 0
+QPDFTokenizer finder word not preceded by delimiter 0
 QPDFTokenizer found EI the old way 0
 QPDFTokenizer found EI by byte count 0
 QPDFTokenizer inline image at EOF the old way 0
+QPDFTokenizer found EI after more than one try 0
--- a/qpdf/qtest/qpdf.test
+++ b/qpdf/qtest/qpdf.test
@ -691,6 +691,26 @@ $td->runtest("check pass1 file",
 	     {$td->FILE => "b.pdf"},
 	     {$td->FILE => "minimal-linearize-pass1.pdf"});

+show_ntests();
+# ----------
+$td->notify("--- Inline Images ---");
+$n_tests += 2;
+
+# The file large-inline-image.pdf is a hand-crafted file with several
+# inline images of various sizes including one that is two megabytes,
+# encoded in base85, and has a base85-encoding that contains EI
+# surrounded by delimiters several times. This exercises the EI
+# detection code added in qpdf 8.4.
+
+$td->runtest("complex inline image parsing",
+             {$td->COMMAND =>
+                  "qpdf --qdf --static-id large-inline-image.pdf a.pdf"},
+             {$td->STRING => "", $td->EXIT_STATUS => 0},
+             $td->NORMALIZE_NEWLINES);
+$td->runtest("check output",
+	     {$td->FILE => "a.pdf"},
+	     {$td->FILE => "large-inline-image.qdf"});
+
 show_ntests();
 # ----------
 $td->notify("--- Tokenizer ---");
--- a/qpdf/qtest/qpdf/large-inline-image.pdf
+++ b/qpdf/qtest/qpdf/large-inline-image.pdf
--- a/qpdf/qtest/qpdf/large-inline-image.qdf
+++ b/qpdf/qtest/qpdf/large-inline-image.qdf