diff --git a/ChangeLog b/ChangeLog index cdfa3fce..915d73f8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,8 +1,22 @@ 2019-08-23 Jay Berkenbilt + * Add --recompress-flate option to qpdf and + QPDFWriter::setRecompressFlate to cause QPDFWriter to recompress + streams that are already compressed with /FlateDecode. + * Add option Pl_Flate::setCompressionLevel to globally set the zlib compression level used by all Pl_Flate pipelines. + * Add --compression-level flag to qpdf to set the zlib compression + level. When combined with --recompress-flate, this will cause most + of qpdf's streams to use the maximum compression level. This + results in only a very small amount of savings in size that comes + at a fairly significant performance cost, but it could be useful + for archival files or other cases where every byte counts and + creation time doesn't matter so much. Note that using + --object-streams=generate in combination with these options gives + you the biggest advantage. Fixes #113. + 2019-08-22 Jay Berkenbilt * In QPDFObjectHandle::ParserCallbacks, in addition to diff --git a/include/qpdf/QPDFWriter.hh b/include/qpdf/QPDFWriter.hh index 860b0630..0fd114db 100644 --- a/include/qpdf/QPDFWriter.hh +++ b/include/qpdf/QPDFWriter.hh @@ -189,10 +189,11 @@ class QPDFWriter // filters on the input. When combined with // setCompressStreams(true), which the default, the effect of this // is that streams filtered with these older and less efficient - // filters will be recompressed with the Flate filter. As a - // special case, if a stream is already compressed with + // filters will be recompressed with the Flate filter. By default, + // as a special case, if a stream is already compressed with // FlateDecode and setCompressStreams is enabled, the original - // compressed data will be preserved. + // compressed data will be preserved. This behavior can be + // overridden by calling setRecompressFlate(true). 
// // qpdf_dl_specialized: In addition to uncompressing the // generalized compression formats, supported non-lossy @@ -209,6 +210,15 @@ class QPDFWriter QPDF_DLL void setDecodeLevel(qpdf_stream_decode_level_e); + // By default, when both the input and output contents of a stream + // are compressed with Flate, qpdf does not uncompress and + // recompress the stream. Passing true here causes it to do so. + // This can be useful if recompressing all streams with a higher + // compression level, which can be set by calling the static + // method Pl_Flate::setCompressionLevel. + QPDF_DLL + void setRecompressFlate(bool); + // Set value of content stream normalization. The default is // "false". If true, we attempt to normalize newlines inside of // content streams. Some constructs such as inline images may @@ -597,6 +607,7 @@ class QPDFWriter bool compress_streams_set; qpdf_stream_decode_level_e stream_decode_level; bool stream_decode_level_set; + bool recompress_flate; bool qdf_mode; bool preserve_unreferenced_objects; bool newline_before_endstream; diff --git a/libqpdf/QPDFWriter.cc b/libqpdf/QPDFWriter.cc index 6c92338d..30bc1fcb 100644 --- a/libqpdf/QPDFWriter.cc +++ b/libqpdf/QPDFWriter.cc @@ -37,6 +37,7 @@ QPDFWriter::Members::Members(QPDF& pdf) : compress_streams_set(false), stream_decode_level(qpdf_dl_none), stream_decode_level_set(false), + recompress_flate(false), qdf_mode(false), preserve_unreferenced_objects(false), newline_before_endstream(false), @@ -206,6 +207,12 @@ QPDFWriter::setDecodeLevel(qpdf_stream_decode_level_e val) this->m->stream_decode_level_set = true; } +void +QPDFWriter::setRecompressFlate(bool val) +{ + this->m->recompress_flate = val; +} + void QPDFWriter::setContentNormalization(bool val) { @@ -1716,13 +1723,14 @@ QPDFWriter::unparseObject(QPDFObjectHandle object, int level, if (this->m->compress_streams) { // Don't filter if the stream is already compressed with - // FlateDecode. 
We don't want to make it worse by getting - // rid of a predictor or otherwise messing with it. We - // should also avoid messing with anything that's - // compressed with a lossy compression scheme, but we - // don't support any of those right now. + // FlateDecode. This way we don't make it worse if the + // original file used a better Flate algorithm, and we + // don't spend time and CPU cycles uncompressing and + // recompressing stuff. This can be overridden with + // setRecompressFlate(true). QPDFObjectHandle filter_obj = stream_dict.getKey("/Filter"); - if ((! object.isDataModified()) && + if ((! this->m->recompress_flate) && + (! object.isDataModified()) && filter_obj.isName() && ((filter_obj.getName() == "/FlateDecode") || (filter_obj.getName() == "/Fl"))) diff --git a/manual/build.mk b/manual/build.mk index 03e8fe56..3911b8e2 100644 --- a/manual/build.mk +++ b/manual/build.mk @@ -26,7 +26,8 @@ endif $(OUTDOC).pdf: $(OUTDOC).fo qpdf/build/qpdf $(FOP) $< -pdf $@.tmp - qpdf/build/qpdf --linearize $@.tmp $@ + qpdf/build/qpdf --linearize --object-streams=generate \ + --recompress-flate --compression-level=9 $@.tmp $@ $(OUTDOC).html: $(INDOC).xml manual/html.xsl $(VALIDATE) $(XSLTPROC) --output $@ manual/html.xsl $< diff --git a/manual/qpdf-manual.xml b/manual/qpdf-manual.xml index db2a54fa..6e72456e 100644 --- a/manual/qpdf-manual.xml +++ b/manual/qpdf-manual.xml @@ -1433,27 +1433,32 @@ outfile.pdf : decode streams filtered with - supported generalized filters: , - , - , and - . We define generalized + supported generalized filters: + /LZWDecode, + /FlateDecode, + /ASCII85Decode, and + /ASCIIHexDecode. We define generalized filters as those to be used for general-purpose compression or encoding, as opposed to filters specifically designed - for image data. + for image data. Note that, by default, streams already + compressed with /FlateDecode are not + uncompressed and recompressed unless you also specify + . 
: in addition to generalized, decode streams with supported non-lossy specialized - filters; currently this is just + filters; currently this is just + /RunLengthDecode : in addition to generalized and specialized, decode streams with supported lossy filters; - currently this is just (JPEG) + currently this is just /DCTDecode (JPEG) @@ -1476,7 +1481,10 @@ outfile.pdf : recompress stream data when possible (default); equivalent to - + . Does not + recompress streams already compressed with + /FlateDecode unless + is also specified. @@ -1498,6 +1506,37 @@ outfile.pdf + + + + + By default, streams already compressed with + /FlateDecode are left alone rather than + being uncompressed and recompressed. This option causes qpdf + to uncompress and recompress the streams. There is a + significant performance cost to using this option, but you + probably want to use it if you specify + . + + + + + + + + When writing new streams that are compressed with + /FlateDecode, use the specified compression + level. The value of should be a number + from 1 to 9 and is passed directly to zlib, which implements + deflate compression. Note that qpdf doesn't uncompress and + recompress streams by default. To have this option apply to + already compressed streams, you should also specify + . If your goal is to shrink + the size of PDF files, you should also use + . + + + @@ -4449,7 +4488,7 @@ print "\n"; - Library Enhancements + Library and CLI Enhancements @@ -4508,6 +4547,41 @@ print "\n"; bytes of the combined contents. + + + Static method + Pl_Flate::setCompressionLevel can be + called to set the zlib compression level globally used by + all instances of Pl_Flate in deflate mode. + + + + + The method + QPDFWriter::setRecompressFlate can be + called to tell QPDFWriter to + uncompress and recompress streams already compressed with + /FlateDecode. + + + + + CLI enhancement: the + instructs qpdf to recompress streams that + are already compressed with /FlateDecode. + Useful with . 
+ + + + + CLI enhancement: the + + sets the zlib compression level used for any streams + compressed by /FlateDecode. Most + effective when combined with + . + + The underlying implementation of QPDF arrays has been @@ -5699,9 +5773,9 @@ print "\n"; Disregard data check errors when uncompressing - streams. This is consistent with - most other PDF readers and allows qpdf to recover data from - another class of malformed PDF files. + /FlateDecode streams. This is consistent + with most other PDF readers and allows qpdf to recover data + from another class of malformed PDF files. diff --git a/qpdf/qpdf.cc b/qpdf/qpdf.cc index a0f7f7ea..a5eef425 100644 --- a/qpdf/qpdf.cc +++ b/qpdf/qpdf.cc @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -124,6 +125,9 @@ struct Options stream_data_mode(qpdf_s_compress), compress_streams(true), compress_streams_set(false), + recompress_flate(false), + recompress_flate_set(false), + compression_level(-1), decode_level(qpdf_dl_generalized), decode_level_set(false), normalize_set(false), @@ -217,6 +221,9 @@ struct Options qpdf_stream_data_e stream_data_mode; bool compress_streams; bool compress_streams_set; + bool recompress_flate; + bool recompress_flate_set; + int compression_level; qpdf_stream_decode_level_e decode_level; bool decode_level_set; bool normalize_set; @@ -632,6 +639,8 @@ class ArgParser void argCollate(); void argStreamData(char* parameter); void argCompressStreams(char* parameter); + void argRecompressFlate(); + void argCompressionLevel(char* parameter); void argDecodeLevel(char* parameter); void argNormalizeContent(char* parameter); void argSuppressRecovery(); @@ -847,6 +856,9 @@ ArgParser::initOptionTable() &ArgParser::argStreamData, stream_data_choices); (*t)["compress-streams"] = oe_requiredChoices( &ArgParser::argCompressStreams, yn); + (*t)["recompress-flate"] = oe_bare(&ArgParser::argRecompressFlate); + (*t)["compression-level"] = oe_requiredParameter( + &ArgParser::argCompressionLevel, 
"level"); char const* decode_level_choices[] = {"none", "generalized", "specialized", "all", 0}; (*t)["decode-level"] = oe_requiredChoices( @@ -1328,6 +1340,9 @@ ArgParser::argHelp() << "--stream-data=option controls transformation of stream data (below)\n" << "--compress-streams=[yn] controls whether to compress streams on output\n" << "--decode-level=option controls how to filter streams from the input\n" + << "--recompress-flate recompress streams already compressed with Flate\n" + << "--compression-level=n set zlib compression level; most effective with\n" + << " --recompress-flate --object-streams=generate\n" << "--normalize-content=[yn] enables or disables normalization of content streams\n" << "--object-streams=mode controls handing of object streams\n" << "--preserve-unreferenced preserve unreferenced objects\n" @@ -1724,6 +1739,19 @@ ArgParser::argCompressStreams(char* parameter) o.compress_streams = (strcmp(parameter, "y") == 0); } +void +ArgParser::argRecompressFlate() +{ + o.recompress_flate_set = true; + o.recompress_flate = true; +} + +void +ArgParser::argCompressionLevel(char* parameter) +{ + o.compression_level = QUtil::string_to_int(parameter); +} + void ArgParser::argDecodeLevel(char* parameter) { @@ -4889,6 +4917,10 @@ static void set_encryption_options(QPDF& pdf, Options& o, QPDFWriter& w) static void set_writer_options(QPDF& pdf, Options& o, QPDFWriter& w) { + if (o.compression_level >= 0) + { + Pl_Flate::setCompressionLevel(o.compression_level); + } if (o.qdf_mode) { w.setQDFMode(true); @@ -4913,6 +4945,10 @@ static void set_writer_options(QPDF& pdf, Options& o, QPDFWriter& w) { w.setCompressStreams(o.compress_streams); } + if (o.recompress_flate_set) + { + w.setRecompressFlate(o.recompress_flate); + } if (o.decode_level_set) { w.setDecodeLevel(o.decode_level); diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test index d7046e8b..9474d723 100644 --- a/qpdf/qtest/qpdf.test +++ b/qpdf/qtest/qpdf.test @@ -3876,8 +3876,20 @@ 
$td->runtest("convert inline-images to qdf", compare_pdfs("inline-images.pdf", "a.pdf"); show_ntests(); +# ---------- +$td->notify("--- Compression Level ---"); +$n_tests += 4; +check_pdf("recompress with level", + "qpdf --static-id --recompress-flate --compression-level=9" . + " --object-streams=generate minimal.pdf", + "minimal-9.pdf", 0); +check_pdf("recompress with level", + "qpdf --static-id --recompress-flate --compression-level=1" . + " --object-streams=generate minimal.pdf", + "minimal-1.pdf", 0); +show_ntests(); # ---------- $td->notify("--- Specialized filtering Tests ---"); $n_tests += 3; diff --git a/qpdf/qtest/qpdf/minimal-1.pdf b/qpdf/qtest/qpdf/minimal-1.pdf new file mode 100644 index 00000000..726a9d11 Binary files /dev/null and b/qpdf/qtest/qpdf/minimal-1.pdf differ diff --git a/qpdf/qtest/qpdf/minimal-9.pdf b/qpdf/qtest/qpdf/minimal-9.pdf new file mode 100644 index 00000000..46becb36 Binary files /dev/null and b/qpdf/qtest/qpdf/minimal-9.pdf differ