Let optimize filter stream parameters instead of making them direct

Also removes preclusion of stream references in stream parameters of filterable streams and reduces write times by about 8% by eliminating an extra traversal of the objects.
2024-06-03 19:00:51 +00:00 · 2020-12-25 09:51:20 -05:00 · 2020-12-25 09:51:20 -05:00 · 858c7b89bc
commit 858c7b89bc
parent 1a62cce940
5 changed files with 53 additions and 114 deletions
--- a/8
+++ b/8
@ -1,3 +1,11 @@
 2020-12-25  Jay Berkenbilt  <ejb@ql.org>
 	* Refactor write code to eliminate an extra full traversal of
 	objects in the file and to remove assumptions that preclude stream
 	references from appearing in /DecodeParms of filterable streams.
 	This results in an approximately 8% performance reduction in write
 	times.
 2020-12-23  Jay Berkenbilt  <ejb@ql.org>
 	* Allow library users to provide their own decoders for stream
--- a/libqpdf/QPDFWriter.cc
+++ b/libqpdf/QPDFWriter.cc
@ -2452,115 +2452,36 @@ QPDFWriter::getTrimmedTrailer()
 void
 QPDFWriter::prepareFileForWrite()
 {
-    // Do a traversal of the entire PDF file structure replacing all
+    // Make document extension level information direct as required by
-    // indirect objects that QPDFWriter wants to be direct.  This
+    // the spec.
    // includes stream lengths, stream filtering parameters, and
    // document extension level information.
    this->m->pdf.fixDanglingReferences(true);
-    std::list<QPDFObjectHandle> queue;
+    QPDFObjectHandle root = this->m->pdf.getRoot();
-    queue.push_back(getTrimmedTrailer());
+    for (auto const& key: root.getKeys())
    std::set<int> visited;
    while (! queue.empty())
    {
-	QPDFObjectHandle node = queue.front();
+        QPDFObjectHandle oh = root.getKey(key);
-	queue.pop_front();
+        if ((key == "/Extensions") && (oh.isDictionary()))
-	if (node.isIndirect())
+        {
-	{
+            bool extensions_indirect = false;
-	    if (visited.count(node.getObjectID()) > 0)
+            if (oh.isIndirect())
 	    {
 		continue;
 	    }
            indicateProgress(false, false);
 	    visited.insert(node.getObjectID());
 	}
 	if (node.isArray())
 	{
 	    int nitems = node.getArrayNItems();
 	    for (int i = 0; i < nitems; ++i)
 	    {
 		QPDFObjectHandle oh = node.getArrayItem(i);
                if (! oh.isScalar())
 		{
 		    queue.push_back(oh);
 		}
 	    }
 	}
 	else if (node.isDictionary() || node.isStream())
 	{
            bool is_stream = false;
            bool is_root = false;
            bool filterable = false;
 	    QPDFObjectHandle dict = node;
 	    if (node.isStream())
 	    {
                is_stream = true;
 		dict = node.getDict();
                // See whether we are able to filter this stream.
                filterable = node.pipeStreamData(
                    0, 0, this->m->stream_decode_level, true);
 	    }
            else if (this->m->pdf.getRoot().getObjectID() == node.getObjectID())
            {
-                is_root = true;
+                QTC::TC("qpdf", "QPDFWriter make Extensions direct");
                extensions_indirect = true;
                oh = oh.shallowCopy();
                root.replaceKey(key, oh);
            }
-
+            if (oh.hasKey("/ADBE"))
-	    std::set<std::string> keys = dict.getKeys();
+            {
-	    for (std::set<std::string>::iterator iter = keys.begin();
+                QPDFObjectHandle adbe = oh.getKey("/ADBE");
-		 iter != keys.end(); ++iter)
+                if (adbe.isIndirect())
 	    {
 		std::string const& key = *iter;
 		QPDFObjectHandle oh = dict.getKey(key);
                bool add_to_queue = true;
                if (is_stream)
                {
-                    if (oh.isIndirect() &&
+                    QTC::TC("qpdf", "QPDFWriter make ADBE direct",
-                        ((key == "/Length") ||
+                            extensions_indirect ? 0 : 1);
-                         (filterable &&
+                    adbe.makeDirect();
-                          ((key == "/Filter") ||
+                    oh.replaceKey("/ADBE", adbe);
                           (key == "/DecodeParms")))))
                    {
                        QTC::TC("qpdf", "QPDFWriter make stream key direct");
                        add_to_queue = false;
                        oh.makeDirect();
                        dict.replaceKey(key, oh);
                    }
                }
-                else if (is_root)
+            }
-                {
+        }
                    if ((key == "/Extensions") && (oh.isDictionary()))
                    {
                        bool extensions_indirect = false;
                        if (oh.isIndirect())
                        {
                            QTC::TC("qpdf", "QPDFWriter make Extensions direct");
                            extensions_indirect = true;
                            add_to_queue = false;
                            oh = oh.shallowCopy();
                            dict.replaceKey(key, oh);
                        }
                        if (oh.hasKey("/ADBE"))
                        {
                            QPDFObjectHandle adbe = oh.getKey("/ADBE");
                            if (adbe.isIndirect())
                            {
                                QTC::TC("qpdf", "QPDFWriter make ADBE direct",
                                        extensions_indirect ? 0 : 1);
                                adbe.makeDirect();
                                oh.replaceKey("/ADBE", adbe);
                            }
                        }
                    }
                }
                if (add_to_queue)
                {
                    queue.push_back(oh);
 		}
 	    }
 	}
    }
 }
@ -2737,14 +2658,11 @@ QPDFWriter::write()
 {
    doWriteSetup();
-    // Set up progress reporting. We spent about equal amounts of time
+    // Set up progress reporting. For linearized files, we write two
-    // preparing and writing one pass. To get a rough estimate of
+    // passes. events_expected is an approximation, but it's good
-    // progress, we track handling of indirect objects. For linearized
+    // enough for progress reporting, which is mostly a guess anyway.
    // files, we write two passes. events_expected is an
    // approximation, but it's good enough for progress reporting,
    // which is mostly a guess anyway.
    this->m->events_expected = QIntC::to_int(
-        this->m->pdf.getObjectCount() * (this->m->linearized ? 3 : 2));
+        this->m->pdf.getObjectCount() * (this->m->linearized ? 2 : 1));
    prepareFileForWrite();
@ -3138,8 +3056,21 @@ QPDFWriter::writeLinearized()
    discardGeneration(this->m->object_to_object_stream,
                      this->m->object_to_object_stream_no_gen);
-    bool need_xref_stream = (! this->m->object_to_object_stream.empty());
+    auto skip_stream_parameters = [this](QPDFObjectHandle& stream) {
-    this->m->pdf.optimize(this->m->object_to_object_stream_no_gen);
+        bool compress_stream;
        bool is_metadata;
        if (willFilterStream(stream, compress_stream, is_metadata, nullptr))
        {
            return 2;
        }
        else
        {
            return 1;
        }
    };
    this->m->pdf.optimize(this->m->object_to_object_stream_no_gen,
                          true, skip_stream_parameters);
    std::vector<QPDFObjectHandle> part4;
    std::vector<QPDFObjectHandle> part6;
@ -3173,6 +3104,7 @@ QPDFWriter::writeLinearized()
    int after_second_half = 1 + second_half_uncompressed;
    this->m->next_objid = after_second_half;
    int second_half_xref = 0;
    bool need_xref_stream = (! this->m->object_to_object_stream.empty());
    if (need_xref_stream)
    {
 	second_half_xref = this->m->next_objid++;
--- a/qpdf/qpdf.testcov
+++ b/qpdf/qpdf.testcov
@ -234,7 +234,6 @@ QPDFWriter extra header text no newline 0
 QPDFWriter extra header text add newline 0
 QPDF bogus 0 offset 0
 QPDF global offset 0
 QPDFWriter make stream key direct 0
 QPDFWriter copy V5 0
 QPDFWriter increasing extension level 0
 QPDFWriter make Extensions direct 0
--- a/qpdf/qtest/qpdf.test
+++ b/qpdf/qtest/qpdf.test
@ -716,7 +716,7 @@ my @bug_tests = (
    ["99b", "object 0", 2],
    ["100", "xref reconstruction loop", 2],
    ["101", "resolve for exception text", 2],
-    ["117", "other infinite loop", 2],
+    ["117", "other infinite loop", 3],
    ["118", "other infinite loop", 2],
    ["119", "other infinite loop", 3],
    ["120", "other infinite loop", 3],
--- a/qpdf/qtest/qpdf/issue-117.out
+++ b/qpdf/qtest/qpdf/issue-117.out
@ -13,4 +13,4 @@ WARNING: issue-117.pdf (object 7 0, offset 1791): unknown token while reading ob
 WARNING: issue-117.pdf (object 7 0, offset 1267): /Length key in stream dictionary is not an integer
 WARNING: issue-117.pdf (object 7 0, offset 1418): attempting to recover stream length
 WARNING: issue-117.pdf (object 7 0, offset 1418): recovered stream length: 347
-attempt to make a stream into a direct object
+qpdf: operation succeeded with warnings; resulting file may have some problems