From 2794bfb1a665cad93a38144bea0ba0daea7152e7 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Fri, 23 Aug 2019 19:59:38 -0400 Subject: [PATCH] Add flags to control zlib compression level (fixes #113) --- ChangeLog | 14 +++++ include/qpdf/QPDFWriter.hh | 17 ++++-- libqpdf/QPDFWriter.cc | 20 ++++--- manual/build.mk | 3 +- manual/qpdf-manual.xml | 98 +++++++++++++++++++++++++++++----- qpdf/qpdf.cc | 36 +++++++++++++ qpdf/qtest/qpdf.test | 12 +++++ qpdf/qtest/qpdf/minimal-1.pdf | Bin 0 -> 750 bytes qpdf/qtest/qpdf/minimal-9.pdf | Bin 0 -> 743 bytes 9 files changed, 178 insertions(+), 22 deletions(-) create mode 100644 qpdf/qtest/qpdf/minimal-1.pdf create mode 100644 qpdf/qtest/qpdf/minimal-9.pdf diff --git a/ChangeLog b/ChangeLog index cdfa3fce..915d73f8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,8 +1,22 @@ 2019-08-23 Jay Berkenbilt + * Add --recompress-flate option to qpdf and + QPDFWriter::setRecompressFlate to cause QPDFWriter to recompress + streams that are already compressed with /FlateDecode. + * Add option Pl_Flate::setCompressionLevel to globally set the zlib compression level used by all Pl_Flate pipelines. + * Add --compression-level flag to qpdf to set the zlib compression + level. When combined with --recompress-flate, this will cause most + of qpdf's streams to use the maximum compression level. This + results in only a very small amount of savings in size that comes + at a fairly significant performance cost, but it could be useful + for archival files or other cases where every byte counts and + creation time doesn't matter so much. Note that using + --object-streams=generate in combination with these options gives + you the biggest advantage. Fixes #113. 
+ 2019-08-22 Jay Berkenbilt * In QPDFObjectHandle::ParserCallbacks, in addition to diff --git a/include/qpdf/QPDFWriter.hh b/include/qpdf/QPDFWriter.hh index 860b0630..0fd114db 100644 --- a/include/qpdf/QPDFWriter.hh +++ b/include/qpdf/QPDFWriter.hh @@ -189,10 +189,11 @@ class QPDFWriter // filters on the input. When combined with // setCompressStreams(true), which the default, the effect of this // is that streams filtered with these older and less efficient - // filters will be recompressed with the Flate filter. As a - // special case, if a stream is already compressed with + // filters will be recompressed with the Flate filter. By default, + // as a special case, if a stream is already compressed with // FlateDecode and setCompressStreams is enabled, the original - // compressed data will be preserved. + // compressed data will be preserved. This behavior can be + // overridden by calling setRecompressFlate(true). // // qpdf_dl_specialized: In addition to uncompressing the // generalized compression formats, supported non-lossy @@ -209,6 +210,15 @@ class QPDFWriter QPDF_DLL void setDecodeLevel(qpdf_stream_decode_level_e); + // By default, when both the input and output contents of a stream + // are compressed with Flate, qpdf does not uncompress and + // recompress the stream. Passing true here causes it to do so. + // This can be useful if recompressing all streams with a higher + // compression level, which can be set by calling the static + // method Pl_Flate::setCompressionLevel. + QPDF_DLL + void setRecompressFlate(bool); + // Set value of content stream normalization. The default is // "false". If true, we attempt to normalize newlines inside of // content streams. 
Some constructs such as inline images may @@ -597,6 +607,7 @@ class QPDFWriter bool compress_streams_set; qpdf_stream_decode_level_e stream_decode_level; bool stream_decode_level_set; + bool recompress_flate; bool qdf_mode; bool preserve_unreferenced_objects; bool newline_before_endstream; diff --git a/libqpdf/QPDFWriter.cc b/libqpdf/QPDFWriter.cc index 6c92338d..30bc1fcb 100644 --- a/libqpdf/QPDFWriter.cc +++ b/libqpdf/QPDFWriter.cc @@ -37,6 +37,7 @@ QPDFWriter::Members::Members(QPDF& pdf) : compress_streams_set(false), stream_decode_level(qpdf_dl_none), stream_decode_level_set(false), + recompress_flate(false), qdf_mode(false), preserve_unreferenced_objects(false), newline_before_endstream(false), @@ -206,6 +207,12 @@ QPDFWriter::setDecodeLevel(qpdf_stream_decode_level_e val) this->m->stream_decode_level_set = true; } +void +QPDFWriter::setRecompressFlate(bool val) +{ + this->m->recompress_flate = val; +} + void QPDFWriter::setContentNormalization(bool val) { @@ -1716,13 +1723,14 @@ QPDFWriter::unparseObject(QPDFObjectHandle object, int level, if (this->m->compress_streams) { // Don't filter if the stream is already compressed with - // FlateDecode. We don't want to make it worse by getting - // rid of a predictor or otherwise messing with it. We - // should also avoid messing with anything that's - // compressed with a lossy compression scheme, but we - // don't support any of those right now. + // FlateDecode. This way we don't make it worse if the + // original file used a better Flate algorithm, and we + // don't spend time and CPU cycles uncompressing and + // recompressing stuff. This can be overridden with + // setRecompressFlate(true). QPDFObjectHandle filter_obj = stream_dict.getKey("/Filter"); - if ((! object.isDataModified()) && + if ((! this->m->recompress_flate) && + (! 
object.isDataModified()) && filter_obj.isName() && ((filter_obj.getName() == "/FlateDecode") || (filter_obj.getName() == "/Fl"))) diff --git a/manual/build.mk b/manual/build.mk index 03e8fe56..3911b8e2 100644 --- a/manual/build.mk +++ b/manual/build.mk @@ -26,7 +26,8 @@ endif $(OUTDOC).pdf: $(OUTDOC).fo qpdf/build/qpdf $(FOP) $< -pdf $@.tmp - qpdf/build/qpdf --linearize $@.tmp $@ + qpdf/build/qpdf --linearize --object-streams=generate \ + --recompress-flate --compression-level=9 $@.tmp $@ $(OUTDOC).html: $(INDOC).xml manual/html.xsl $(VALIDATE) $(XSLTPROC) --output $@ manual/html.xsl $< diff --git a/manual/qpdf-manual.xml b/manual/qpdf-manual.xml index db2a54fa..6e72456e 100644 --- a/manual/qpdf-manual.xml +++ b/manual/qpdf-manual.xml @@ -1433,27 +1433,32 @@ outfile.pdf : decode streams filtered with - supported generalized filters: , - , - , and - . We define generalized + supported generalized filters: + /LZWDecode, + /FlateDecode, + /ASCII85Decode, and + /ASCIIHexDecode. We define generalized filters as those to be used for general-purpose compression or encoding, as opposed to filters specifically designed - for image data. + for image data. Note that, by default, streams already + compressed with /FlateDecode are not + uncompressed and recompressed unless you also specify + . : in addition to generalized, decode streams with supported non-lossy specialized - filters; currently this is just + filters; currently this is just + /RunLengthDecode : in addition to generalized and specialized, decode streams with supported lossy filters; - currently this is just (JPEG) + currently this is just /DCTDecode (JPEG) @@ -1476,7 +1481,10 @@ outfile.pdf : recompress stream data when possible (default); equivalent to - + . Does not + recompress streams already compressed with + /FlateDecode unless + is also specified. 
@@ -1498,6 +1506,37 @@ outfile.pdf + + + + + By default, streams already compressed with + /FlateDecode are left alone rather than + being uncompressed and recompressed. This option causes qpdf + to uncompress and recompress the streams. There is a + significant performance cost to using this option, but you + probably want to use it if you specify + . + + + + + + + + When writing new streams that are compressed with + /FlateDecode, use the specified compression + level. The value of should be a number + from 1 to 9 and is passed directly to zlib, which implements + deflate compression. Note that qpdf doesn't uncompress and + recompress streams by default. To have this option apply to + already compressed streams, you should also specify + . If your goal is to shrink + the size of PDF files, you should also use + . + + + @@ -4449,7 +4488,7 @@ print "\n"; - Library Enhancements + Library and CLI Enhancements @@ -4508,6 +4547,41 @@ print "\n"; bytes of the combined contents. + + + Static method + Pl_Flate::setCompressionLevel can be + called to set the zlib compression level globally used by + all instances of Pl_Flate in deflate mode. + + + + + The method + QPDFWriter::setRecompressFlate can be + called to tell QPDFWriter to + uncompress and recompress streams already compressed with + /FlateDecode. + + + + + CLI enhancement: the + instructs qpdf to recompress streams that + are already compressed with /FlateDecode. + Useful with . + + + + + CLI enhancement: the + + sets the zlib compression level used for any streams + compressed by /FlateDecode. Most + effective when combined with + . + + The underlying implementation of QPDF arrays has been @@ -5699,9 +5773,9 @@ print "\n"; Disregard data check errors when uncompressing - streams. This is consistent with - most other PDF readers and allows qpdf to recover data from - another class of malformed PDF files. + /FlateDecode streams. 
This is consistent + with most other PDF readers and allows qpdf to recover data + from another class of malformed PDF files. diff --git a/qpdf/qpdf.cc b/qpdf/qpdf.cc index a0f7f7ea..a5eef425 100644 --- a/qpdf/qpdf.cc +++ b/qpdf/qpdf.cc @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -124,6 +125,9 @@ struct Options stream_data_mode(qpdf_s_compress), compress_streams(true), compress_streams_set(false), + recompress_flate(false), + recompress_flate_set(false), + compression_level(-1), decode_level(qpdf_dl_generalized), decode_level_set(false), normalize_set(false), @@ -217,6 +221,9 @@ struct Options qpdf_stream_data_e stream_data_mode; bool compress_streams; bool compress_streams_set; + bool recompress_flate; + bool recompress_flate_set; + int compression_level; qpdf_stream_decode_level_e decode_level; bool decode_level_set; bool normalize_set; @@ -632,6 +639,8 @@ class ArgParser void argCollate(); void argStreamData(char* parameter); void argCompressStreams(char* parameter); + void argRecompressFlate(); + void argCompressionLevel(char* parameter); void argDecodeLevel(char* parameter); void argNormalizeContent(char* parameter); void argSuppressRecovery(); @@ -847,6 +856,9 @@ ArgParser::initOptionTable() &ArgParser::argStreamData, stream_data_choices); (*t)["compress-streams"] = oe_requiredChoices( &ArgParser::argCompressStreams, yn); + (*t)["recompress-flate"] = oe_bare(&ArgParser::argRecompressFlate); + (*t)["compression-level"] = oe_requiredParameter( + &ArgParser::argCompressionLevel, "level"); char const* decode_level_choices[] = {"none", "generalized", "specialized", "all", 0}; (*t)["decode-level"] = oe_requiredChoices( @@ -1328,6 +1340,9 @@ ArgParser::argHelp() << "--stream-data=option controls transformation of stream data (below)\n" << "--compress-streams=[yn] controls whether to compress streams on output\n" << "--decode-level=option controls how to filter streams from the input\n" + << "--recompress-flate recompress streams 
already compressed with Flate\n" + << "--compression-level=n set zlib compression level; most effective with\n" + << " --recompress-flate --object-streams=generate\n" << "--normalize-content=[yn] enables or disables normalization of content streams\n" << "--object-streams=mode controls handing of object streams\n" << "--preserve-unreferenced preserve unreferenced objects\n" @@ -1724,6 +1739,19 @@ ArgParser::argCompressStreams(char* parameter) o.compress_streams = (strcmp(parameter, "y") == 0); } +void +ArgParser::argRecompressFlate() +{ + o.recompress_flate_set = true; + o.recompress_flate = true; +} + +void +ArgParser::argCompressionLevel(char* parameter) +{ + o.compression_level = QUtil::string_to_int(parameter); +} + void ArgParser::argDecodeLevel(char* parameter) { @@ -4889,6 +4917,10 @@ static void set_encryption_options(QPDF& pdf, Options& o, QPDFWriter& w) static void set_writer_options(QPDF& pdf, Options& o, QPDFWriter& w) { + if (o.compression_level >= 0) + { + Pl_Flate::setCompressionLevel(o.compression_level); + } if (o.qdf_mode) { w.setQDFMode(true); @@ -4913,6 +4945,10 @@ static void set_writer_options(QPDF& pdf, Options& o, QPDFWriter& w) { w.setCompressStreams(o.compress_streams); } + if (o.recompress_flate_set) + { + w.setRecompressFlate(o.recompress_flate); + } if (o.decode_level_set) { w.setDecodeLevel(o.decode_level); diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test index d7046e8b..9474d723 100644 --- a/qpdf/qtest/qpdf.test +++ b/qpdf/qtest/qpdf.test @@ -3876,8 +3876,20 @@ $td->runtest("convert inline-images to qdf", compare_pdfs("inline-images.pdf", "a.pdf"); show_ntests(); +# ---------- +$td->notify("--- Compression Level ---"); +$n_tests += 4; +check_pdf("recompress with level", + "qpdf --static-id --recompress-flate --compression-level=9" . + " --object-streams=generate minimal.pdf", + "minimal-9.pdf", 0); +check_pdf("recompress with level", + "qpdf --static-id --recompress-flate --compression-level=1" . 
+ " --object-streams=generate minimal.pdf", + "minimal-1.pdf", 0); +show_ntests(); # ---------- $td->notify("--- Specialized filtering Tests ---"); $n_tests += 3; diff --git a/qpdf/qtest/qpdf/minimal-1.pdf b/qpdf/qtest/qpdf/minimal-1.pdf new file mode 100644 index 0000000000000000000000000000000000000000..726a9d119ceca12de21b7991407d705de99768ff GIT binary patch literal 750 zcmY!laBZ^4|D>$ol3WFSpVYkck_-hS zBTEH+x6GW9)FL3AlUS1KlA4^K0#xXyU<#5cDlSnlGE=a#<0>vGN=?k=s$g`TaJTD_ zfk5l~@0#bNm-OD!@;JC(siXVFp|}q5jj~=2!kIyZ_wRXn#THFIGsp7zY!4NQM3Z9@ zKShL3s5rA}US9uG=FtMFqC5TtEkAx%3moyPEZxl8rKAPm&YXeenz8pa)#Qt(9}8Waek#yJIC_`u$=6zkd%=IuR z;6k}^h=EZchbQxl+5Ik{2~QIBVep5=!Yvr zD;O#m0dcH?eo%fsFuH-Z1}W$VXI7;u08@vji$b)Gv7w2fsil#bsj;brrMacCk+Fq| znUR@^v9X1*k-4R*9bpx*D2XUJiGfMYjjc}LV`n4hiDVb`f5vV98M~$$GBB7bGC$_R Z2#ey9#G;alqSQ1lpv7FOs;>TSTmWpx3w!_o literal 0 HcmV?d00001 diff --git a/qpdf/qtest/qpdf/minimal-9.pdf b/qpdf/qtest/qpdf/minimal-9.pdf new file mode 100644 index 0000000000000000000000000000000000000000..46becb3684da0a975dd7b76e4022413f5b76c087 GIT binary patch literal 743 zcmY!laBZ^4|D>$ol3WFSpVYkck_-hS zBVz@9x6GW9)FL3AlUS1KlA4^K0#xXyU<#5cDlSnlGE=a#<0>vGN=?k=s<`FaAJ~1! zK%nLRU(wI}mt1oFCmdR--c->c_d~(%$*rX&N%$K$ zer(hXnWo$xRAe&W`Awc{#DqSDKbUXIx1XN z^8_6{_`aDgJae7GlLR-}tH!ghjI~XGM=ycdRvu%wQlD{+eR_gEjD!(+O z@J*fbqT@W4E~;Ygdl~vC$1Y9HOM!Y6$N~kOIZ_Zp1I@$&dzc{yQZd)VpnwbI#vukq zfgGO9o6hz&oN!Q7VLfq;f4dBwHa77ZH@229Gvl zBkV5Fhs0h$Vo`3f0w~g*^K(jb^NJNr6!Zg%Qd2UMOY(~p42^*Csh}UO5UpUSU4R{VPa-vW@2n? zVQge>X=+DUMJ!74Nls#5QgdUg6ZqKK$ax~!Mg5;K+dsxlrz99mzBAwD!U&4ulEk8t TilWpsE)!ETE>%@me>W}wTI&Z@ literal 0 HcmV?d00001