Remove unreferenced resources only from relevant pages

This commit is contained in:
Jay Berkenbilt 2021-01-04 14:56:44 -05:00
parent dc92574c10
commit 891751f618
6 changed files with 28 additions and 15 deletions

View File

@ -1,5 +1,10 @@
2021-01-04 Jay Berkenbilt <ejb@ql.org>
* When qpdf CLI extracts pages, it now only attempts to remove
unreferenced resources from the pages that it is keeping. This
change dramatically reduces the time it takes to extract a small
number of pages from a large, complex file.
* Move getNext()->write() calls in some pipelines to ensure that
state gates properly reset even if the next pipeline's write
throws an exception (fuzz issue 28262).

7
TODO
View File

@ -1,10 +1,3 @@
Candidates for upcoming release
===============================
* Remember to check work `qpdf` project for private issues
* file with very slow page extraction
* big page even with --remove-unreferenced-resources=yes, even with --empty
Fuzz Errors
===========

View File

@ -5001,6 +5001,15 @@ print "\n";
<literal>/DecodeParms</literal>.
</para>
</listitem>
<listitem>
<para>
When extracting pages, the <command>qpdf</command> CLI only
removes unreferenced resources from the pages that are being
kept, resulting in a significant performance improvement
when extracting small numbers of pages from large, complex
documents.
</para>
</listitem>
</itemizedlist>
</listitem>
<listitem>

View File

@ -5120,6 +5120,7 @@ static void handle_page_specs(QPDF& pdf, Options& o)
page_spec.range));
}
std::map<unsigned long long, bool> remove_unreferenced;
if (o.remove_unreferenced_page_resources != re_no)
{
for (std::map<std::string, QPDF*>::iterator iter =
@ -5134,10 +5135,11 @@ static void handle_page_specs(QPDF& pdf, Options& o)
cis->stayOpen(true);
}
QPDF& other(*((*iter).second));
if (should_remove_unreferenced_resources(other, o))
auto other_uuid = other.getUniqueId();
if (remove_unreferenced.count(other_uuid) == 0)
{
QPDFPageDocumentHelper dh(other);
dh.removeUnreferencedResources();
remove_unreferenced[other_uuid] =
should_remove_unreferenced_resources(other, o);
}
if (cis)
{
@ -5246,6 +5248,10 @@ static void handle_page_specs(QPDF& pdf, Options& o)
else
{
copied_pages[from_uuid].insert(to_copy_og);
if (remove_unreferenced[from_uuid])
{
to_copy.removeUnreferencedResources();
}
}
dh.addPage(to_copy, false);
if (page_data.qpdf == &pdf)

View File

@ -2247,12 +2247,15 @@ $td->runtest("check output",
{$td->FILE => "a.pdf"},
{$td->FILE => "shared-images-errors-2-out.pdf"});
# This test used to generate warnings about images on pages we didn't
# care about, but qpdf was modified not to process those pages, so the
# "irrelevant" errors went away.
$td->runtest("shared resources irrelevant errors",
{$td->COMMAND =>
"qpdf --qdf --static-id" .
" shared-images-errors.pdf --pages . 1 -- a.pdf"},
{$td->FILE => "shared-images-errors-1.out",
$td->EXIT_STATUS => 3},
{$td->STRING => "",
$td->EXIT_STATUS => 0},
$td->NORMALIZE_NEWLINES);
$td->runtest("check output",
{$td->FILE => "a.pdf"},

View File

@ -1,3 +0,0 @@
WARNING: shared-images-errors.pdf (offset 4933): error decoding stream data for object 19 0: stream inflate: inflate: data: incorrect header check
WARNING: shared-images-errors.pdf, object 4 0 at offset 676: Unable to parse content stream: content stream (content stream object 19 0): errors while decoding content stream; not attempting to remove unreferenced objects from this page
qpdf: operation succeeded with warnings; resulting file may have some problems