Better handle split content streams (fixes #73)

When parsing content streams, allow content to be split arbitrarily across stream boundaries.
2024-12-22 02:49:00 +00:00 · 2017-07-29 10:40:31 -04:00 · 2017-07-29 10:40:31 -04:00 · b389268f16
commit b389268f16
parent a136824243
11 changed files with 8793 additions and 20 deletions
--- a/5
+++ b/5
@ -1,3 +1,8 @@
+2017-07-29  Jay Berkenbilt  <ejb@ql.org>
+
+	* Fix content stream parsing to handle cases of structures within
+	the stream split across stream boundaries. Fixes #73.
+
 2017-07-28  Jay Berkenbilt  <ejb@ql.org>

 	* Add --preserve-unreferenced command-line option and
--- a/include/qpdf/QPDFObjectHandle.hh
+++ b/include/qpdf/QPDFObjectHandle.hh
@ -623,7 +623,9 @@ class QPDFObjectHandle
        bool in_array, bool in_dictionary,
        bool content_stream);
    static void parseContentStream_internal(
-        QPDFObjectHandle stream, ParserCallbacks* callbacks);
+        PointerHolder<Buffer> stream_data,
+        std::string const& description,
+        ParserCallbacks* callbacks);

    // Other methods
    static void warn(QPDF*, QPDFExc const&);
--- a/libqpdf/QPDFObjectHandle.cc
+++ b/libqpdf/QPDFObjectHandle.cc
@ -13,6 +13,7 @@
 #include <qpdf/QPDF_Dictionary.hh>
 #include <qpdf/QPDF_Stream.hh>
 #include <qpdf/QPDF_Reserved.hh>
+#include <qpdf/Pl_Buffer.hh>
 #include <qpdf/BufferInputSource.hh>
 #include <qpdf/QPDFExc.hh>

@ -739,37 +740,63 @@ QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array,
    {
        streams.push_back(stream_or_array);
    }
+    Pl_Buffer buf("concatenated stream data buffer");
+    std::string all_description = "content stream objects";
+    bool first = true;
    for (std::vector<QPDFObjectHandle>::iterator iter = streams.begin();
         iter != streams.end(); ++iter)
    {
        QPDFObjectHandle stream = *iter;
        if (! stream.isStream())
        {
-            throw std::logic_error(
-                "QPDFObjectHandle: parseContentStream called on non-stream");
+            QTC::TC("qpdf", "QPDFObjectHandle non-stream in parsecontent");
+            warn(stream.getOwningQPDF(),
+                 QPDFExc(qpdf_e_damaged_pdf, "content stream",
+                         "", 0,
+                         "ignoring non-stream while parsing content streams"));
        }
-        try
+        else
        {
-            parseContentStream_internal(stream, callbacks);
-        }
-        catch (TerminateParsing&)
-        {
-            return;
+            std::string og = QUtil::int_to_string(stream.getObjectID()) + " " +
+                QUtil::int_to_string(stream.getGeneration());
+            std::string description = "content stream object " + og;
+            if (first)
+            {
+                first = false;
+            }
+            else
+            {
+                all_description += ",";
+            }
+            all_description += " " + og;
+            if (! stream.pipeStreamData(&buf, true, false, false, false))
+            {
+                QTC::TC("qpdf", "QPDFObjectHandle errors in parsecontent");
+                warn(stream.getOwningQPDF(),
+                     QPDFExc(qpdf_e_damaged_pdf, "content stream",
+                             description, 0,
+                             "errors while decoding content stream"));
+            }
        }
    }
+    PointerHolder<Buffer> stream_data = buf.getBuffer();
+    try
+    {
+        parseContentStream_internal(stream_data, all_description, callbacks);
+    }
+    catch (TerminateParsing&)
+    {
+        return;
+    }
    callbacks->handleEOF();
 }

 void
-QPDFObjectHandle::parseContentStream_internal(QPDFObjectHandle stream,
+QPDFObjectHandle::parseContentStream_internal(PointerHolder<Buffer> stream_data,
+                                              std::string const& description,
                                              ParserCallbacks* callbacks)
 {
-    stream.assertStream();
-    PointerHolder<Buffer> stream_data = stream.getStreamData();
    size_t length = stream_data->getSize();
-    std::string description = "content stream object " +
-        QUtil::int_to_string(stream.getObjectID()) + " " +
-        QUtil::int_to_string(stream.getGeneration());
    PointerHolder<InputSource> input =
        new BufferInputSource(description, stream_data.getPointer());
    QPDFTokenizer tokenizer;
--- a/qpdf/qpdf.testcov
+++ b/qpdf/qpdf.testcov
@ -281,3 +281,5 @@ QPDFObjectHandle no val for last key 0
 QPDF resolve failure to null 0
 QPDFWriter precheck stream 0
 QPDFWriter preserve unreferenced standard 0
+QPDFObjectHandle non-stream in parsecontent 0
+QPDFObjectHandle errors in parsecontent 0
--- a/qpdf/qtest/qpdf.test
+++ b/qpdf/qtest/qpdf.test
@ -206,7 +206,7 @@ $td->runtest("remove page we don't have",
 show_ntests();
 # ----------
 $td->notify("--- Miscellaneous Tests ---");
-$n_tests += 86;
+$n_tests += 88;

 $td->runtest("qpdf version",
 	     {$td->COMMAND => "qpdf --version"},
@ -604,6 +604,20 @@ $td->runtest("no trailing space in xref table",
             {$td->FILE => "no-space-in-xref.out", $td->EXIT_STATUS => 0},
             $td->NORMALIZE_NEWLINES);

+# An array is split across multiple content streams, starting with
+# object 42. This was reported in GitHub issue 73. The file is modified
+# from that example.
+$td->runtest("parse split content stream",
+             {$td->COMMAND => "qpdf --check split-content-stream.pdf"},
+             {$td->FILE => "split-content-stream.out", $td->EXIT_STATUS => 0},
+             $td->NORMALIZE_NEWLINES);
+$td->runtest("split content stream errors",
+             {$td->COMMAND => "qpdf --check split-content-stream-errors.pdf"},
+             {$td->FILE => "split-content-stream-errors.out",
+              $td->EXIT_STATUS => 3},
+             $td->NORMALIZE_NEWLINES);
+
+
 show_ntests();
 # ----------
 $td->notify("--- Numeric range parsing tests ---");
--- a/qpdf/qtest/qpdf/content-stream-errors.out
+++ b/qpdf/qtest/qpdf/content-stream-errors.out
@ -2,6 +2,6 @@ checking content-stream-errors.pdf
 PDF Version: 1.3
 File is not encrypted
 File is not linearized
-page 1: content stream object 7 0 (content, file position 52): parse error while reading object
-page 3: content stream object 15 0 (stream data, file position 117): EOF found while reading inline image
-page 4: content stream object 19 0 (content, file position 53): parse error while reading object
+page 1: content stream objects 7 0 (content, file position 52): parse error while reading object
+page 3: content stream objects 15 0 (stream data, file position 117): EOF found while reading inline image
+page 4: content stream objects 19 0 (content, file position 53): parse error while reading object
--- a/qpdf/qtest/qpdf/eof-in-inline-image.out
+++ b/qpdf/qtest/qpdf/eof-in-inline-image.out
@ -22,4 +22,4 @@ name: /Fl
 name: /DP
 dictionary: << /Columns 1 /Predictor 15 >>
 operator: ID
-content stream object 4 0 (stream data, file position 139): EOF found while reading inline image
+content stream objects 4 0 (stream data, file position 139): EOF found while reading inline image
--- a/qpdf/qtest/qpdf/split-content-stream-errors.out
+++ b/qpdf/qtest/qpdf/split-content-stream-errors.out
@ -0,0 +1,11 @@
+WARNING: split-content-stream-errors.pdf: file is damaged
+WARNING: split-content-stream-errors.pdf (file position 802): xref not found
+WARNING: split-content-stream-errors.pdf: Attempting to reconstruct cross-reference table
+checking split-content-stream-errors.pdf
+PDF Version: 1.3
+File is not encrypted
+File is not linearized
+WARNING: split-content-stream-errors.pdf (file position 557): error decoding stream data for object 6 0: LZWDecoder: bad code received
+WARNING: content stream: ignoring non-stream while parsing content streams
+WARNING: split-content-stream-errors.pdf (file position 557): error decoding stream data for object 6 0: LZWDecoder: bad code received
+WARNING: content stream (content stream object 6 0): errors while decoding content stream
--- a/qpdf/qtest/qpdf/split-content-stream-errors.pdf
+++ b/qpdf/qtest/qpdf/split-content-stream-errors.pdf
@ -0,0 +1,113 @@
+%PDF-1.3
+%¿÷¢þ
+%QDF-1.0
+
+1 0 obj
+<<
+  /Pages 2 0 R
+  /Type /Catalog
+>>
+endobj
+
+2 0 obj
+<<
+  /Count 1
+  /Kids [
+    3 0 R
+  ]
+  /Type /Pages
+>>
+endobj
+
+%% Page 1
+3 0 obj
+<<
+  /Contents [
+    4 0 R
+    6 0 R
+  ]
+  /MediaBox [
+    0
+    0
+    612
+    792
+  ]
+  /Parent 2 0 R
+  /Resources <<
+    /Font <<
+      /F1 8 0 R
+    >>
+    /ProcSet 9 0 R
+  >>
+  /Type /Page
+>>
+endobj
+
+%% Contents for page 1
+4 0 obj
+<<
+  /Length 5 0 R
+  /Oops (Not a stream)
+>>
+endobj
+
+5 0 obj
+44
+endobj
+
+%% Contents for page 1
+6 0 obj
+<<
+  /Length 7 0 R
+  /Filter /LZWDecode
+>>
+stream
+BT
+  /F1 24 Tf
+  72 720 Td
+  (Encoding errors) Tj
+ET
+endstream
+endobj
+
+7 0 obj
+53
+endobj
+
+8 0 obj
+<<
+  /BaseFont /Helvetica
+  /Encoding /WinAnsiEncoding
+  /Name /F1
+  /Subtype /Type1
+  /Type /Font
+>>
+endobj
+
+9 0 obj
+[
+  /PDF
+  /Text
+]
+endobj
+
+xref
+0 10
+0000000000 65535 f 
+0000000025 00000 n 
+0000000079 00000 n 
+0000000161 00000 n 
+0000000396 00000 n 
+0000000457 00000 n 
+0000000499 00000 n 
+0000000630 00000 n 
+0000000649 00000 n 
+0000000767 00000 n 
+trailer <<
+  /Root 1 0 R
+  /Size 10
+  /ID [<cbdd966f9b7b2bb31ad606c532d7cce5><e5f7cff7a542641606230aadd53106a4>]
+>>
+startxref
+802
+%%EOF
--- a/qpdf/qtest/qpdf/split-content-stream.out
+++ b/qpdf/qtest/qpdf/split-content-stream.out
@ -0,0 +1,6 @@
+checking split-content-stream.pdf
+PDF Version: 1.4
+File is not encrypted
+File is not linearized
+No syntax or stream encoding errors found; the file may still contain
+errors that qpdf cannot detect
--- a/qpdf/qtest/qpdf/split-content-stream.pdf
+++ b/qpdf/qtest/qpdf/split-content-stream.pdf