From 891751f618fb95b82af289edfd2e1219e3522e6f Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Mon, 4 Jan 2021 14:56:44 -0500 Subject: [PATCH] Remove unreferenced resources only from relevant pages --- ChangeLog | 5 +++++ TODO | 7 ------- manual/qpdf-manual.xml | 9 +++++++++ qpdf/qpdf.cc | 12 +++++++++--- qpdf/qtest/qpdf.test | 7 +++++-- qpdf/qtest/qpdf/shared-images-errors-1.out | 3 --- 6 files changed, 28 insertions(+), 15 deletions(-) delete mode 100644 qpdf/qtest/qpdf/shared-images-errors-1.out diff --git a/ChangeLog b/ChangeLog index 1e31efb4..40eb1014 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,10 @@ 2021-01-04 Jay Berkenbilt + * When qpdf CLI extracts pages, it now only attempts to remove + unreferenced resources from the pages that it is keeping. This + change dramatically reduces the time it takes to extract a small + number of pages from a large, complex file. + * Move getNext()->write() calls in some pipelines to ensure that state gates properly reset even if the next pipeline's write throws an exception (fuzz issue 28262). diff --git a/TODO b/TODO index 7b620a96..9d687d56 100644 --- a/TODO +++ b/TODO @@ -1,10 +1,3 @@ -Candidates for upcoming release -=============================== - -* Remember to check work `qpdf` project for private issues - * file with very slow page extraction - * big page even with --remove-unreferenced-resources=yes, even with --empty - Fuzz Errors =========== diff --git a/manual/qpdf-manual.xml b/manual/qpdf-manual.xml index f93f4a17..98fca51e 100644 --- a/manual/qpdf-manual.xml +++ b/manual/qpdf-manual.xml @@ -5001,6 +5001,15 @@ print "\n"; /DecodeParms. + + + When extracting pages, the qpdf CLI only + removes unreferenced resources from the pages that are being + kept, resulting in a significant performance improvement + when extracting small numbers of pages from large, complex + documents. 
+ + diff --git a/qpdf/qpdf.cc b/qpdf/qpdf.cc index 0d1ab988..2e35c96f 100644 --- a/qpdf/qpdf.cc +++ b/qpdf/qpdf.cc @@ -5120,6 +5120,7 @@ static void handle_page_specs(QPDF& pdf, Options& o) page_spec.range)); } + std::map remove_unreferenced; if (o.remove_unreferenced_page_resources != re_no) { for (std::map::iterator iter = @@ -5134,10 +5135,11 @@ static void handle_page_specs(QPDF& pdf, Options& o) cis->stayOpen(true); } QPDF& other(*((*iter).second)); - if (should_remove_unreferenced_resources(other, o)) + auto other_uuid = other.getUniqueId(); + if (remove_unreferenced.count(other_uuid) == 0) { - QPDFPageDocumentHelper dh(other); - dh.removeUnreferencedResources(); + remove_unreferenced[other_uuid] = + should_remove_unreferenced_resources(other, o); } if (cis) { @@ -5246,6 +5248,10 @@ static void handle_page_specs(QPDF& pdf, Options& o) else { copied_pages[from_uuid].insert(to_copy_og); + if (remove_unreferenced[from_uuid]) + { + to_copy.removeUnreferencedResources(); + } } dh.addPage(to_copy, false); if (page_data.qpdf == &pdf) diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test index 6919bfcf..83cbacd3 100644 --- a/qpdf/qtest/qpdf.test +++ b/qpdf/qtest/qpdf.test @@ -2247,12 +2247,15 @@ $td->runtest("check output", {$td->FILE => "a.pdf"}, {$td->FILE => "shared-images-errors-2-out.pdf"}); +# This test used to generate warnings about images on pages we didn't +# care about, but qpdf was modified not to process those pages, so the +# "irrelevant" errors went away. $td->runtest("shared resources irrelevant errors", {$td->COMMAND => "qpdf --qdf --static-id" . " shared-images-errors.pdf --pages . 
1 -- a.pdf"}, - {$td->FILE => "shared-images-errors-1.out", - $td->EXIT_STATUS => 3}, + {$td->STRING => "", + $td->EXIT_STATUS => 0}, $td->NORMALIZE_NEWLINES); $td->runtest("check output", {$td->FILE => "a.pdf"}, diff --git a/qpdf/qtest/qpdf/shared-images-errors-1.out b/qpdf/qtest/qpdf/shared-images-errors-1.out deleted file mode 100644 index 5b98f88f..00000000 --- a/qpdf/qtest/qpdf/shared-images-errors-1.out +++ /dev/null @@ -1,3 +0,0 @@ -WARNING: shared-images-errors.pdf (offset 4933): error decoding stream data for object 19 0: stream inflate: inflate: data: incorrect header check -WARNING: shared-images-errors.pdf, object 4 0 at offset 676: Unable to parse content stream: content stream (content stream object 19 0): errors while decoding content stream; not attempting to remove unreferenced objects from this page -qpdf: operation succeeded with warnings; resulting file may have some problems