Add flags to control zlib compression level (fixes #113)

This commit is contained in:
Jay Berkenbilt 2019-08-23 19:59:38 -04:00
parent dac0598b94
commit 2794bfb1a6
9 changed files with 178 additions and 22 deletions

View File

@ -1,8 +1,22 @@
2019-08-23 Jay Berkenbilt <ejb@ql.org>
* Add --recompress-flate option to qpdf and
QPDFWriter::setRecompressFlate to cause QPDFWriter to recompress
streams that are already compressed with /FlateDecode.
* Add option Pl_Flate::setCompressionLevel to globally set the
zlib compression level used by all Pl_Flate pipelines.
* Add --compression-level flag to qpdf to set the zlib compression
level. When combined with --recompress-flate, this will cause most
of qpdf's streams to use the maximum compression level. This
results in only a very small amount of savings in size that comes
at a fairly significant performance cost, but it could be useful
for archival files or other cases where every byte counts and
creation time doesn't matter so much. Note that using
--object-streams=generate in combination with these options gives
you the biggest advantage. Fixes #113.
2019-08-22 Jay Berkenbilt <ejb@ql.org>
* In QPDFObjectHandle::ParserCallbacks, in addition to

View File

@ -189,10 +189,11 @@ class QPDFWriter
// filters on the input. When combined with
// setCompressStreams(true), which is the default, the effect of this
// is that streams filtered with these older and less efficient
// filters will be recompressed with the Flate filter. As a
// special case, if a stream is already compressed with
// filters will be recompressed with the Flate filter. By default,
// as a special case, if a stream is already compressed with
// FlateDecode and setCompressStreams is enabled, the original
// compressed data will be preserved.
// compressed data will be preserved. This behavior can be
// overridden by calling setRecompressFlate(true).
//
// qpdf_dl_specialized: In addition to uncompressing the
// generalized compression formats, supported non-lossy
@ -209,6 +210,15 @@ class QPDFWriter
QPDF_DLL
void setDecodeLevel(qpdf_stream_decode_level_e);
// By default, when both the input and output contents of a stream
// are compressed with Flate, qpdf does not uncompress and
// recompress the stream. Passing true here causes it to do so.
// This can be useful if recompressing all streams with a higher
// compression level, which can be set by calling the static
// method Pl_Flate::setCompressionLevel.
QPDF_DLL
void setRecompressFlate(bool);
// Set value of content stream normalization. The default is
// "false". If true, we attempt to normalize newlines inside of
// content streams. Some constructs such as inline images may
@ -597,6 +607,7 @@ class QPDFWriter
bool compress_streams_set;
qpdf_stream_decode_level_e stream_decode_level;
bool stream_decode_level_set;
bool recompress_flate;
bool qdf_mode;
bool preserve_unreferenced_objects;
bool newline_before_endstream;

View File

@ -37,6 +37,7 @@ QPDFWriter::Members::Members(QPDF& pdf) :
compress_streams_set(false),
stream_decode_level(qpdf_dl_none),
stream_decode_level_set(false),
recompress_flate(false),
qdf_mode(false),
preserve_unreferenced_objects(false),
newline_before_endstream(false),
@ -206,6 +207,12 @@ QPDFWriter::setDecodeLevel(qpdf_stream_decode_level_e val)
this->m->stream_decode_level_set = true;
}
void
QPDFWriter::setRecompressFlate(bool val)
{
this->m->recompress_flate = val;
}
void
QPDFWriter::setContentNormalization(bool val)
{
@ -1716,13 +1723,14 @@ QPDFWriter::unparseObject(QPDFObjectHandle object, int level,
if (this->m->compress_streams)
{
// Don't filter if the stream is already compressed with
// FlateDecode. We don't want to make it worse by getting
// rid of a predictor or otherwise messing with it. We
// should also avoid messing with anything that's
// compressed with a lossy compression scheme, but we
// don't support any of those right now.
// FlateDecode. This way we don't make it worse if the
// original file used a better Flate algorithm, and we
// don't spend time and CPU cycles uncompressing and
// recompressing stuff. This can be overridden with
// setRecompressFlate(true).
QPDFObjectHandle filter_obj = stream_dict.getKey("/Filter");
if ((! object.isDataModified()) &&
if ((! this->m->recompress_flate) &&
(! object.isDataModified()) &&
filter_obj.isName() &&
((filter_obj.getName() == "/FlateDecode") ||
(filter_obj.getName() == "/Fl")))

View File

@ -26,7 +26,8 @@ endif
$(OUTDOC).pdf: $(OUTDOC).fo qpdf/build/qpdf
$(FOP) $< -pdf $@.tmp
qpdf/build/qpdf --linearize $@.tmp $@
qpdf/build/qpdf --linearize --object-streams=generate \
--recompress-flate --compression-level=9 $@.tmp $@
$(OUTDOC).html: $(INDOC).xml manual/html.xsl $(VALIDATE)
$(XSLTPROC) --output $@ manual/html.xsl $<

View File

@ -1433,27 +1433,32 @@ outfile.pdf</option>
<listitem>
<para>
<option>generalized</option>: decode streams filtered with
supported generalized filters: <option>/LZWDecode</option>,
<option>/FlateDecode</option>,
<option>/ASCII85Decode</option>, and
<option>/ASCIIHexDecode</option>. We define generalized
supported generalized filters:
<literal>/LZWDecode</literal>,
<literal>/FlateDecode</literal>,
<literal>/ASCII85Decode</literal>, and
<literal>/ASCIIHexDecode</literal>. We define generalized
filters as those to be used for general-purpose compression
or encoding, as opposed to filters specifically designed
for image data.
for image data. Note that, by default, streams already
compressed with <literal>/FlateDecode</literal> are not
uncompressed and recompressed unless you also specify
<option>--recompress-flate</option>.
</para>
</listitem>
<listitem>
<para>
<option>specialized</option>: in addition to generalized,
decode streams with supported non-lossy specialized
filters; currently this is just <option>/RunLengthDecode</option>
filters; currently this is just
<literal>/RunLengthDecode</literal>
</para>
</listitem>
<listitem>
<para>
<option>all</option>: in addition to generalized and
specialized, decode streams with supported lossy filters;
currently this is just <option>/DCTDecode</option> (JPEG)
currently this is just <literal>/DCTDecode</literal> (JPEG)
</para>
</listitem>
</itemizedlist>
@ -1476,7 +1481,10 @@ outfile.pdf</option>
<option>compress</option>: recompress stream data when
possible (default); equivalent to
<option>--compress-streams=y</option>
<option>--decode-level=generalized</option>
<option>--decode-level=generalized</option>. Does not
recompress streams already compressed with
<literal>/FlateDecode</literal> unless
<option>--recompress-flate</option> is also specified.
</para>
</listitem>
<listitem>
@ -1498,6 +1506,37 @@ outfile.pdf</option>
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>--recompress-flate</option></term>
<listitem>
<para>
By default, streams already compressed with
<literal>/FlateDecode</literal> are left alone rather than
being uncompressed and recompressed. This option causes qpdf
to uncompress and recompress the streams. There is a
significant performance cost to using this option, but you
probably want to use it if you specify
<option>--compression-level</option>.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>--compression-level=<replaceable>level</replaceable></option></term>
<listitem>
<para>
When writing new streams that are compressed with
<literal>/FlateDecode</literal>, use the specified compression
level. The value of <replaceable>level</replaceable> should be a number
from 1 to 9 and is passed directly to zlib, which implements
deflate compression. Note that qpdf doesn't uncompress and
recompress streams by default. To have this option apply to
already compressed streams, you should also specify
<option>--recompress-flate</option>. If your goal is to shrink
the size of PDF files, you should also use
<option>--object-streams=generate</option>.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>--normalize-content=[yn]</option></term>
<listitem>
@ -4449,7 +4488,7 @@ print "\n";
</listitem>
<listitem>
<para>
Library Enhancements
Library and CLI Enhancements
</para>
<itemizedlist>
<listitem>
@ -4508,6 +4547,41 @@ print "\n";
bytes of the combined contents.
</para>
</listitem>
<listitem>
<para>
Static method
<function>Pl_Flate::setCompressionLevel</function> can be
called to set the zlib compression level globally used by
all instances of Pl_Flate in deflate mode.
</para>
</listitem>
<listitem>
<para>
The method
<function>QPDFWriter::setRecompressFlate</function> can be
called to tell <classname>QPDFWriter</classname> to
uncompress and recompress streams already compressed with
<literal>/FlateDecode</literal>.
</para>
</listitem>
<listitem>
<para>
CLI enhancement: the <option>--recompress-flate</option>
option instructs <command>qpdf</command> to recompress streams that
are already compressed with <literal>/FlateDecode</literal>.
Useful with <option>--compression-level</option>.
</para>
</listitem>
<listitem>
<para>
CLI enhancement: the
<option>--compression-level=<replaceable>level</replaceable></option>
option sets the zlib compression level used for any streams
compressed by <literal>/FlateDecode</literal>. Most
effective when combined with
<option>--recompress-flate</option>.
</para>
</listitem>
<listitem>
<para>
The underlying implementation of QPDF arrays has been
@ -5699,9 +5773,9 @@ print "\n";
<listitem>
<para>
Disregard data check errors when uncompressing
<option>/FlateDecode</option> streams. This is consistent with
most other PDF readers and allows qpdf to recover data from
another class of malformed PDF files.
<literal>/FlateDecode</literal> streams. This is consistent
with most other PDF readers and allows qpdf to recover data
from another class of malformed PDF files.
</para>
</listitem>
<listitem>

View File

@ -13,6 +13,7 @@
#include <qpdf/Pl_Discard.hh>
#include <qpdf/Pl_DCT.hh>
#include <qpdf/Pl_Count.hh>
#include <qpdf/Pl_Flate.hh>
#include <qpdf/PointerHolder.hh>
#include <qpdf/QPDF.hh>
@ -124,6 +125,9 @@ struct Options
stream_data_mode(qpdf_s_compress),
compress_streams(true),
compress_streams_set(false),
recompress_flate(false),
recompress_flate_set(false),
compression_level(-1),
decode_level(qpdf_dl_generalized),
decode_level_set(false),
normalize_set(false),
@ -217,6 +221,9 @@ struct Options
qpdf_stream_data_e stream_data_mode;
bool compress_streams;
bool compress_streams_set;
bool recompress_flate;
bool recompress_flate_set;
int compression_level;
qpdf_stream_decode_level_e decode_level;
bool decode_level_set;
bool normalize_set;
@ -632,6 +639,8 @@ class ArgParser
void argCollate();
void argStreamData(char* parameter);
void argCompressStreams(char* parameter);
void argRecompressFlate();
void argCompressionLevel(char* parameter);
void argDecodeLevel(char* parameter);
void argNormalizeContent(char* parameter);
void argSuppressRecovery();
@ -847,6 +856,9 @@ ArgParser::initOptionTable()
&ArgParser::argStreamData, stream_data_choices);
(*t)["compress-streams"] = oe_requiredChoices(
&ArgParser::argCompressStreams, yn);
(*t)["recompress-flate"] = oe_bare(&ArgParser::argRecompressFlate);
(*t)["compression-level"] = oe_requiredParameter(
&ArgParser::argCompressionLevel, "level");
char const* decode_level_choices[] =
{"none", "generalized", "specialized", "all", 0};
(*t)["decode-level"] = oe_requiredChoices(
@ -1328,6 +1340,9 @@ ArgParser::argHelp()
<< "--stream-data=option controls transformation of stream data (below)\n"
<< "--compress-streams=[yn] controls whether to compress streams on output\n"
<< "--decode-level=option controls how to filter streams from the input\n"
<< "--recompress-flate recompress streams already compressed with Flate\n"
<< "--compression-level=n set zlib compression level; most effective with\n"
<< " --recompress-flate --object-streams=generate\n"
<< "--normalize-content=[yn] enables or disables normalization of content streams\n"
<< "--object-streams=mode controls handing of object streams\n"
<< "--preserve-unreferenced preserve unreferenced objects\n"
@ -1724,6 +1739,19 @@ ArgParser::argCompressStreams(char* parameter)
o.compress_streams = (strcmp(parameter, "y") == 0);
}
// Handler for the bare --recompress-flate option: turns the
// recompress-flate setting on and records that the option was
// explicitly given on the command line.
void
ArgParser::argRecompressFlate()
{
    o.recompress_flate = true;
    o.recompress_flate_set = true;
}
// Handler for --compression-level=level: converts the parameter
// text to an integer and stores it in the options.
// NOTE(review): no range check is performed here; confirm how
// values outside what zlib accepts are meant to be handled.
void
ArgParser::argCompressionLevel(char* parameter)
{
    int const level = QUtil::string_to_int(parameter);
    o.compression_level = level;
}
void
ArgParser::argDecodeLevel(char* parameter)
{
@ -4889,6 +4917,10 @@ static void set_encryption_options(QPDF& pdf, Options& o, QPDFWriter& w)
static void set_writer_options(QPDF& pdf, Options& o, QPDFWriter& w)
{
if (o.compression_level >= 0)
{
Pl_Flate::setCompressionLevel(o.compression_level);
}
if (o.qdf_mode)
{
w.setQDFMode(true);
@ -4913,6 +4945,10 @@ static void set_writer_options(QPDF& pdf, Options& o, QPDFWriter& w)
{
w.setCompressStreams(o.compress_streams);
}
if (o.recompress_flate_set)
{
w.setRecompressFlate(o.recompress_flate);
}
if (o.decode_level_set)
{
w.setDecodeLevel(o.decode_level);

View File

@ -3876,8 +3876,20 @@ $td->runtest("convert inline-images to qdf",
compare_pdfs("inline-images.pdf", "a.pdf");
show_ntests();
# ----------
$td->notify("--- Compression Level ---");
# NOTE(review): two check_pdf calls are counted as four tests here;
# this assumes each check_pdf call runs two tests -- confirm.
$n_tests += 4;

# Exercise --recompress-flate with compression levels 9 and 1. Each
# call carries its own description so that a failure report
# identifies which level was being tested.
check_pdf("recompress with level 9",
          "qpdf --static-id --recompress-flate --compression-level=9" .
          " --object-streams=generate minimal.pdf",
          "minimal-9.pdf", 0);
check_pdf("recompress with level 1",
          "qpdf --static-id --recompress-flate --compression-level=1" .
          " --object-streams=generate minimal.pdf",
          "minimal-1.pdf", 0);

show_ntests();
# ----------
$td->notify("--- Specialized filtering Tests ---");
$n_tests += 3;

Binary file not shown.

Binary file not shown.