diff --git a/ChangeLog b/ChangeLog index 7c1b43d1..fc376cef 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +2020-12-25 Jay Berkenbilt + + * Refactor write code to eliminate an extra full traversal of + objects in the file and to remove assumptions that preclude stream + references from appearing in /DecodeParms of filterable streams. + This results in an approximately 8% performance reduction in write + times. + 2020-12-23 Jay Berkenbilt * Allow library users to provide their own decoders for stream diff --git a/libqpdf/QPDFWriter.cc b/libqpdf/QPDFWriter.cc index 2bee43e3..7057930b 100644 --- a/libqpdf/QPDFWriter.cc +++ b/libqpdf/QPDFWriter.cc @@ -2452,115 +2452,36 @@ QPDFWriter::getTrimmedTrailer() void QPDFWriter::prepareFileForWrite() { - // Do a traversal of the entire PDF file structure replacing all - // indirect objects that QPDFWriter wants to be direct. This - // includes stream lengths, stream filtering parameters, and - // document extension level information. + // Make document extension level information direct as required by + // the spec. this->m->pdf.fixDanglingReferences(true); - std::list queue; - queue.push_back(getTrimmedTrailer()); - std::set visited; - - while (! queue.empty()) + QPDFObjectHandle root = this->m->pdf.getRoot(); + for (auto const& key: root.getKeys()) { - QPDFObjectHandle node = queue.front(); - queue.pop_front(); - if (node.isIndirect()) - { - if (visited.count(node.getObjectID()) > 0) - { - continue; - } - indicateProgress(false, false); - visited.insert(node.getObjectID()); - } - - if (node.isArray()) - { - int nitems = node.getArrayNItems(); - for (int i = 0; i < nitems; ++i) - { - QPDFObjectHandle oh = node.getArrayItem(i); - if (! oh.isScalar()) - { - queue.push_back(oh); - } - } - } - else if (node.isDictionary() || node.isStream()) - { - bool is_stream = false; - bool is_root = false; - bool filterable = false; - QPDFObjectHandle dict = node; - if (node.isStream()) - { - is_stream = true; - dict = node.getDict(); - // See whether we are able to filter this stream. - filterable = node.pipeStreamData( - 0, 0, this->m->stream_decode_level, true); - } - else if (this->m->pdf.getRoot().getObjectID() == node.getObjectID()) + QPDFObjectHandle oh = root.getKey(key); + if ((key == "/Extensions") && (oh.isDictionary())) + { + bool extensions_indirect = false; + if (oh.isIndirect()) { - is_root = true; + QTC::TC("qpdf", "QPDFWriter make Extensions direct"); + extensions_indirect = true; + oh = oh.shallowCopy(); + root.replaceKey(key, oh); } - - std::set keys = dict.getKeys(); - for (std::set::iterator iter = keys.begin(); - iter != keys.end(); ++iter) - { - std::string const& key = *iter; - QPDFObjectHandle oh = dict.getKey(key); - bool add_to_queue = true; - if (is_stream) + if (oh.hasKey("/ADBE")) + { + QPDFObjectHandle adbe = oh.getKey("/ADBE"); + if (adbe.isIndirect()) { - if (oh.isIndirect() && - ((key == "/Length") || - (filterable && - ((key == "/Filter") || - (key == "/DecodeParms"))))) - { - QTC::TC("qpdf", "QPDFWriter make stream key direct"); - add_to_queue = false; - oh.makeDirect(); - dict.replaceKey(key, oh); - } + QTC::TC("qpdf", "QPDFWriter make ADBE direct", + extensions_indirect ? 0 : 1); + adbe.makeDirect(); + oh.replaceKey("/ADBE", adbe); } - else if (is_root) - { - if ((key == "/Extensions") && (oh.isDictionary())) - { - bool extensions_indirect = false; - if (oh.isIndirect()) - { - QTC::TC("qpdf", "QPDFWriter make Extensions direct"); - extensions_indirect = true; - add_to_queue = false; - oh = oh.shallowCopy(); - dict.replaceKey(key, oh); - } - if (oh.hasKey("/ADBE")) - { - QPDFObjectHandle adbe = oh.getKey("/ADBE"); - if (adbe.isIndirect()) - { - QTC::TC("qpdf", "QPDFWriter make ADBE direct", - extensions_indirect ? 0 : 1); - adbe.makeDirect(); - oh.replaceKey("/ADBE", adbe); - } - } - } - } - - if (add_to_queue) - { - queue.push_back(oh); - } - } - } + } + } } } @@ -2737,14 +2658,11 @@ QPDFWriter::write() { doWriteSetup(); - // Set up progress reporting. We spent about equal amounts of time - // preparing and writing one pass. To get a rough estimate of - // progress, we track handling of indirect objects. For linearized - // files, we write two passes. events_expected is an - // approximation, but it's good enough for progress reporting, - // which is mostly a guess anyway. + // Set up progress reporting. For linearized files, we write two + // passes. events_expected is an approximation, but it's good + // enough for progress reporting, which is mostly a guess anyway. this->m->events_expected = QIntC::to_int( - this->m->pdf.getObjectCount() * (this->m->linearized ? 3 : 2)); + this->m->pdf.getObjectCount() * (this->m->linearized ? 2 : 1)); prepareFileForWrite(); @@ -3138,8 +3056,21 @@ QPDFWriter::writeLinearized() discardGeneration(this->m->object_to_object_stream, this->m->object_to_object_stream_no_gen); - bool need_xref_stream = (! this->m->object_to_object_stream.empty()); - this->m->pdf.optimize(this->m->object_to_object_stream_no_gen); + auto skip_stream_parameters = [this](QPDFObjectHandle& stream) { + bool compress_stream; + bool is_metadata; + if (willFilterStream(stream, compress_stream, is_metadata, nullptr)) + { + return 2; + } + else + { + return 1; + } + }; + + this->m->pdf.optimize(this->m->object_to_object_stream_no_gen, + true, skip_stream_parameters); std::vector part4; std::vector part6; @@ -3173,6 +3104,7 @@ QPDFWriter::writeLinearized() int after_second_half = 1 + second_half_uncompressed; this->m->next_objid = after_second_half; int second_half_xref = 0; + bool need_xref_stream = (! this->m->object_to_object_stream.empty()); if (need_xref_stream) { second_half_xref = this->m->next_objid++; diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov index f0f96242..908f06e5 100644 --- a/qpdf/qpdf.testcov +++ b/qpdf/qpdf.testcov @@ -234,7 +234,6 @@ QPDFWriter extra header text no newline 0 QPDFWriter extra header text add newline 0 QPDF bogus 0 offset 0 QPDF global offset 0 -QPDFWriter make stream key direct 0 QPDFWriter copy V5 0 QPDFWriter increasing extension level 0 QPDFWriter make Extensions direct 0 diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test index 7dc7e261..6ae330a4 100644 --- a/qpdf/qtest/qpdf.test +++ b/qpdf/qtest/qpdf.test @@ -716,7 +716,7 @@ my @bug_tests = ( ["99b", "object 0", 2], ["100", "xref reconstruction loop", 2], ["101", "resolve for exception text", 2], - ["117", "other infinite loop", 2], + ["117", "other infinite loop", 3], ["118", "other infinite loop", 2], ["119", "other infinite loop", 3], ["120", "other infinite loop", 3], diff --git a/qpdf/qtest/qpdf/issue-117.out b/qpdf/qtest/qpdf/issue-117.out index 18157749..540b9c3c 100644 --- a/qpdf/qtest/qpdf/issue-117.out +++ b/qpdf/qtest/qpdf/issue-117.out @@ -13,4 +13,4 @@ WARNING: issue-117.pdf (object 7 0, offset 1791): unknown token while reading ob WARNING: issue-117.pdf (object 7 0, offset 1267): /Length key in stream dictionary is not an integer WARNING: issue-117.pdf (object 7 0, offset 1418): attempting to recover stream length WARNING: issue-117.pdf (object 7 0, offset 1418): recovered stream length: 347 -attempt to make a stream into a direct object +qpdf: operation succeeded with warnings; resulting file may have some problems