From 7f8892525f897b17049f9e59bc4ce8ac28c9e082 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Thu, 27 Jul 2017 23:42:27 -0400 Subject: [PATCH] Add precheck streams capability When requested, QPDFWriter will do more aggressive prechecking of streams to make sure it can actually succeed in decoding them before attempting to do so. This will allow preservation of raw data even when the raw data is corrupted relative to the specified filters. --- ChangeLog | 4 ++++ include/qpdf/QPDF.hh | 14 +++++++------ include/qpdf/QPDFObjectHandle.hh | 3 ++- include/qpdf/QPDFWriter.hh | 12 +++++++++++ libqpdf/QPDF.cc | 28 +++++++++++++++++--------- libqpdf/QPDFObjectHandle.cc | 5 +++-- libqpdf/QPDFWriter.cc | 22 ++++++++++++++++++++ libqpdf/QPDF_Stream.cc | 17 ++++++++++------ libqpdf/qpdf/QPDF_Stream.hh | 3 ++- manual/qpdf-manual.xml | 17 ++++++++++++++++ qpdf/qpdf.cc | 10 +++++++++ qpdf/qpdf.testcov | 1 + qpdf/qtest/qpdf.test | 20 ++++++++++++++++++ qpdf/qtest/qpdf/bad-data-out.pdf | Bin 0 -> 759 bytes qpdf/qtest/qpdf/bad-data-precheck.pdf | Bin 0 -> 797 bytes qpdf/qtest/qpdf/bad-data.out | 2 ++ qpdf/qtest/qpdf/bad-data.pdf | Bin 0 -> 799 bytes 17 files changed, 133 insertions(+), 25 deletions(-) create mode 100644 qpdf/qtest/qpdf/bad-data-out.pdf create mode 100644 qpdf/qtest/qpdf/bad-data-precheck.pdf create mode 100644 qpdf/qtest/qpdf/bad-data.out create mode 100644 qpdf/qtest/qpdf/bad-data.pdf diff --git a/ChangeLog b/ChangeLog index 119a4c6c..026833d4 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,9 @@ 2017-07-27 Jay Berkenbilt + * Add --precheck-streams command-line option and setPrecheckStreams + option to QPDFWriter to tell QPDFWriter to attempt decoding a + stream fully before deciding whether to filter it or not. + * Recover gracefully from streams that aren't filterable because the filter parameters are invalid in the stream dictionary or the dictionary itself is invalid. 
diff --git a/include/qpdf/QPDF.hh b/include/qpdf/QPDF.hh index 18a6851f..ef9ce597 100644 --- a/include/qpdf/QPDF.hh +++ b/include/qpdf/QPDF.hh @@ -540,13 +540,14 @@ class QPDF { friend class QPDF_Stream; private: - static void pipeStreamData(QPDF* qpdf, int objid, int generation, + static bool pipeStreamData(QPDF* qpdf, int objid, int generation, qpdf_offset_t offset, size_t length, QPDFObjectHandle dict, - Pipeline* pipeline) + Pipeline* pipeline, bool suppress_warnings) { - qpdf->pipeStreamData( - objid, generation, offset, length, dict, pipeline); + return qpdf->pipeStreamData( + objid, generation, offset, length, dict, pipeline, + suppress_warnings); } }; friend class Pipe; @@ -666,10 +667,11 @@ class QPDF void findAttachmentStreams(); // Calls finish() on the pipeline when done but does not delete it - void pipeStreamData(int objid, int generation, + bool pipeStreamData(int objid, int generation, qpdf_offset_t offset, size_t length, QPDFObjectHandle dict, - Pipeline* pipeline); + Pipeline* pipeline, + bool suppress_warnings); // For QPDFWriter: diff --git a/include/qpdf/QPDFObjectHandle.hh b/include/qpdf/QPDFObjectHandle.hh index 0fc989a5..11a52596 100644 --- a/include/qpdf/QPDFObjectHandle.hh +++ b/include/qpdf/QPDFObjectHandle.hh @@ -394,7 +394,8 @@ class QPDFObjectHandle // replaced if writing a new stream object. QPDF_DLL bool pipeStreamData(Pipeline*, bool filter, - bool normalize, bool compress); + bool normalize, bool compress, + bool suppress_warnings = false); // Replace a stream's dictionary. The new dictionary must be // consistent with the stream's data. This is most appropriately diff --git a/include/qpdf/QPDFWriter.hh b/include/qpdf/QPDFWriter.hh index b2738c1f..2687cce0 100644 --- a/include/qpdf/QPDFWriter.hh +++ b/include/qpdf/QPDFWriter.hh @@ -144,6 +144,17 @@ class QPDFWriter QPDF_DLL void setQDFMode(bool); + // Enable stream precheck mode. 
In this mode, all filterable + // streams are checked by actually attempting to decode them + // before filtering. This may add significant time to the process + // of writing the data because all streams from the input must be + // read twice, but it enables the raw stream data to be preserved + // even in cases where qpdf would run into errors decoding the + // stream after it determines that it should be able to do it. + // Examples would include compressed data with errors in it. + QPDF_DLL + void setPrecheckStreams(bool); + // Set the minimum PDF version. If the PDF version of the input // file (or previously set minimum version) is less than the // version passed to this method, the PDF version of the output @@ -415,6 +426,7 @@ class QPDFWriter bool stream_data_mode_set; qpdf_stream_data_e stream_data_mode; bool qdf_mode; + bool precheck_streams; bool static_id; bool suppress_original_object_ids; bool direct_stream_lengths; diff --git a/libqpdf/QPDF.cc b/libqpdf/QPDF.cc index 32c8cdf9..b5c1212c 100644 --- a/libqpdf/QPDF.cc +++ b/libqpdf/QPDF.cc @@ -2134,12 +2134,14 @@ QPDF::getCompressibleObjGens() return result; } -void +bool QPDF::pipeStreamData(int objid, int generation, qpdf_offset_t offset, size_t length, QPDFObjectHandle stream_dict, - Pipeline* pipeline) + Pipeline* pipeline, + bool suppress_warnings) { + bool success = false; std::vector > to_delete; if (this->encrypted) { @@ -2165,21 +2167,29 @@ QPDF::pipeStreamData(int objid, int generation, length -= len; pipeline->write(QUtil::unsigned_char_pointer(buf), len); } + success = true; } catch (QPDFExc& e) { - warn(e); + if (! suppress_warnings) + { + warn(e); + } } catch (std::runtime_error& e) { - QTC::TC("qpdf", "QPDF decoding error warning"); - warn(QPDFExc(qpdf_e_damaged_pdf, this->file->getName(), - "", this->file->getLastOffset(), - "error decoding stream data for object " + - QUtil::int_to_string(objid) + " " + - QUtil::int_to_string(generation) + ": " + e.what())); + if (! 
suppress_warnings) + { + QTC::TC("qpdf", "QPDF decoding error warning"); + warn(QPDFExc(qpdf_e_damaged_pdf, this->file->getName(), + "", this->file->getLastOffset(), + "error decoding stream data for object " + + QUtil::int_to_string(objid) + " " + + QUtil::int_to_string(generation) + ": " + e.what())); + } } pipeline->finish(); + return success; } void diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc index 7618cdf3..bac233df 100644 --- a/libqpdf/QPDFObjectHandle.cc +++ b/libqpdf/QPDFObjectHandle.cc @@ -496,11 +496,12 @@ QPDFObjectHandle::getRawStreamData() bool QPDFObjectHandle::pipeStreamData(Pipeline* p, bool filter, - bool normalize, bool compress) + bool normalize, bool compress, + bool suppress_warnings) { assertStream(); return dynamic_cast(obj.getPointer())->pipeStreamData( - p, filter, normalize, compress); + p, filter, normalize, compress, suppress_warnings); } void diff --git a/libqpdf/QPDFWriter.cc b/libqpdf/QPDFWriter.cc index 01748fc7..59e306fc 100644 --- a/libqpdf/QPDFWriter.cc +++ b/libqpdf/QPDFWriter.cc @@ -57,6 +57,7 @@ QPDFWriter::init() stream_data_mode_set = false; stream_data_mode = qpdf_s_compress; qdf_mode = false; + precheck_streams = false; static_id = false; suppress_original_object_ids = false; direct_stream_lengths = true; @@ -176,6 +177,12 @@ QPDFWriter::setQDFMode(bool val) this->qdf_mode = val; } +void +QPDFWriter::setPrecheckStreams(bool val) +{ + this->precheck_streams = val; +} + void QPDFWriter::setMinimumPDFVersion(std::string const& version) { @@ -1522,6 +1529,21 @@ QPDFWriter::unparseObject(QPDFObjectHandle object, int level, flags |= f_stream; + if (filter && this->precheck_streams) + { + try + { + QTC::TC("qpdf", "QPDFWriter precheck stream"); + Pl_Discard discard; + filter = object.pipeStreamData( + &discard, true, false, false, true); + } + catch (std::exception) + { + filter = false; + } + } + pushPipeline(new Pl_Buffer("stream data")); activatePipelineStack(); bool filtered = diff --git 
a/libqpdf/QPDF_Stream.cc b/libqpdf/QPDF_Stream.cc index b4d14441..31d583b8 100644 --- a/libqpdf/QPDF_Stream.cc +++ b/libqpdf/QPDF_Stream.cc @@ -85,7 +85,7 @@ PointerHolder QPDF_Stream::getStreamData() { Pl_Buffer buf("stream data buffer"); - if (! pipeStreamData(&buf, true, false, false)) + if (! pipeStreamData(&buf, true, false, false, false)) { throw std::logic_error("getStreamData called on unfilterable stream"); } @@ -97,7 +97,7 @@ PointerHolder QPDF_Stream::getRawStreamData() { Pl_Buffer buf("stream data buffer"); - pipeStreamData(&buf, false, false, false); + pipeStreamData(&buf, false, false, false, false); QTC::TC("qpdf", "QPDF_Stream getRawStreamData"); return buf.getBuffer(); } @@ -351,7 +351,8 @@ QPDF_Stream::filterable(std::vector& filters, bool QPDF_Stream::pipeStreamData(Pipeline* pipeline, bool filter, - bool normalize, bool compress) + bool normalize, bool compress, + bool suppress_warnings) { std::vector filters; int predictor = 1; @@ -487,9 +488,13 @@ QPDF_Stream::pipeStreamData(Pipeline* pipeline, bool filter, else { QTC::TC("qpdf", "QPDF_Stream pipe original stream data"); - QPDF::Pipe::pipeStreamData(this->qpdf, this->objid, this->generation, - this->offset, this->length, - this->stream_dict, pipeline); + if (! QPDF::Pipe::pipeStreamData(this->qpdf, this->objid, this->generation, + this->offset, this->length, + this->stream_dict, pipeline, + suppress_warnings)) + { + filter = false; + } } return filter; diff --git a/libqpdf/qpdf/QPDF_Stream.hh b/libqpdf/qpdf/QPDF_Stream.hh index fa405d70..d053fd0f 100644 --- a/libqpdf/qpdf/QPDF_Stream.hh +++ b/libqpdf/qpdf/QPDF_Stream.hh @@ -23,7 +23,8 @@ class QPDF_Stream: public QPDFObject // See comments in QPDFObjectHandle.hh for these methods. 
bool pipeStreamData(Pipeline*, bool filter, - bool normalize, bool compress); + bool normalize, bool compress, + bool suppress_warnings); PointerHolder getStreamData(); PointerHolder getRawStreamData(); void replaceStreamData(PointerHolder data, diff --git a/manual/qpdf-manual.xml b/manual/qpdf-manual.xml index a4c34e90..cd35718d 100644 --- a/manual/qpdf-manual.xml +++ b/manual/qpdf-manual.xml @@ -821,6 +821,23 @@ outfile.pdf + + + + + Tells qpdf to precheck each stream for the ability to decode + it. Ordinarily qpdf tries to decode streams that it thinks it + can decode based on the filters, and if there ends up being an + error when actually trying to do the decode, the stream data + is truncated. This flag causes qpdf to actually read the + stream fully before deciding whether to filter the stream. + This option will slow qpdf down since it will have to read the + stream twice, but it allows raw stream data to be preserved in + cases where the decoding of the stream would fail for some + reason. This may be useful in working with some damaged files. 
+ + + diff --git a/qpdf/qpdf.cc b/qpdf/qpdf.cc index c52e1125..99cfd3a1 100644 --- a/qpdf/qpdf.cc +++ b/qpdf/qpdf.cc @@ -202,6 +202,7 @@ familiar with the PDF file format or who are PDF developers.\n\ --suppress-recovery prevents qpdf from attempting to recover damaged files\n\ --object-streams=mode controls handing of object streams\n\ --ignore-xref-streams tells qpdf to ignore any cross-reference streams\n\ +--precheck-streams precheck ability to decode streams\n\ --qdf turns on \"QDF mode\" (below)\n\ --min-version=version sets the minimum PDF version of the output file\n\ --force-version=version forces this to be the PDF version of the output file\n\ @@ -1028,6 +1029,7 @@ int main(int argc, char* argv[]) qpdf_object_stream_e object_stream_mode = qpdf_o_preserve; bool ignore_xref_streams = false; bool qdf_mode = false; + bool precheck_streams = false; std::string min_version; std::string force_version; @@ -1213,6 +1215,10 @@ int main(int argc, char* argv[]) { qdf_mode = true; } + else if (strcmp(arg, "precheck-streams") == 0) + { + precheck_streams = true; + } else if (strcmp(arg, "min-version") == 0) { if (parameter == 0) @@ -1704,6 +1710,10 @@ int main(int argc, char* argv[]) { w.setQDFMode(true); } + if (precheck_streams) + { + w.setPrecheckStreams(true); + } if (normalize_set) { w.setContentNormalization(normalize); diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov index 268ecb16..bf227c7a 100644 --- a/qpdf/qpdf.testcov +++ b/qpdf/qpdf.testcov @@ -279,3 +279,4 @@ QPDFObjectHandle treat word as string 0 QPDFObjectHandle found fake 1 QPDFObjectHandle no val for last key 0 QPDF resolve failure to null 0 +QPDFWriter precheck stream 0 diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test index b80ab9cb..b61882b9 100644 --- a/qpdf/qtest/qpdf.test +++ b/qpdf/qtest/qpdf.test @@ -723,6 +723,26 @@ $td->runtest("check output", {$td->FILE => "from-scratch-0.pdf"}); show_ntests(); # ---------- +$td->notify("--- Precheck streams ---"); +$n_tests += 4; + 
+$td->runtest("bad stream without precheck", + {$td->COMMAND => "qpdf --static-id bad-data.pdf a.pdf"}, + {$td->FILE => "bad-data.out", $td->EXIT_STATUS => 3}, + $td->NORMALIZE_NEWLINES); +$td->runtest("check output", + {$td->FILE => "a.pdf"}, + {$td->FILE => "bad-data-out.pdf"}); +$td->runtest("bad stream with precheck", + {$td->COMMAND => + "qpdf --static-id --precheck-streams bad-data.pdf a.pdf"}, + {$td->STRING => "", $td->EXIT_STATUS => 0}, + $td->NORMALIZE_NEWLINES); +$td->runtest("check output", + {$td->FILE => "a.pdf"}, + {$td->FILE => "bad-data-precheck.pdf"}); +show_ntests(); +# ---------- $td->notify("--- Copy Foreign Objects ---"); $n_tests += 7; diff --git a/qpdf/qtest/qpdf/bad-data-out.pdf b/qpdf/qtest/qpdf/bad-data-out.pdf new file mode 100644 index 0000000000000000000000000000000000000000..f43006628160dc0f405bbb1c76b5f8df01cdbd4a GIT binary patch literal 759 zcmZWn%Z}4P5Cl$WzM?NVf;|tvq{tGxSqsFnLu{$tFL4{rJN;gkU$H*>noQT<@?2LC6Avci+As1eekE4%m5|<; z3rI2nq&3$K6%Mu**xEx=l27lruyoRWaELmgJSRqD%OyI9Pe?zE_w&yrzVl$GsP_&V z2(CZg&akuN|G4|NY_VqbKhlmz2f>_K=HjKd4>k7CURfOWc1& zSC<-sD?_@q*+Fnyn-`|7N8m=^QNx0QTpl*A3*I}X|K50~HRkB&@~u`h>a3t5JUUnp z3pzHqWg#RiJUtLck&IvqCybL_|9}~>m;yP&xS*Y#VIrpfXP8v~b5%rlYp8RLuJ;tM z-_^oee`&f~wyeJ+BoMrs!D^~>gff*HQD#Yql~TA#6CqWVY%{s#`BZYnqfEp!O`0W{ f6f#w@h?OLoNU|tj_si0a>tawTXOqd*+k*WCNTte! 
literal 0 HcmV?d00001 diff --git a/qpdf/qtest/qpdf/bad-data-precheck.pdf b/qpdf/qtest/qpdf/bad-data-precheck.pdf new file mode 100644 index 0000000000000000000000000000000000000000..4314025acf6be5a0e4293faaed5e0cd9e3dbc582 GIT binary patch literal 797 zcmZ8fOODe(5alDJV2L{v64?QEcl#?v7R+QUA_fRaq>+fS=){#^lxZWoGZR>G0dByG zBX9w>86hNAupTbK4pHrlW0dyB{kmV(d#}3P?7@E$;8lg5Mq8=?k( zhiuIlJ^wi|oohKsw$xaFef;XAi?ci>>&K`T`Zc(rp_IBnd1|dcmDc5;|KnVY9R9M^ zyZPn&#}AuFckkWW{A_LP-u;VTqt?am;?CubYu~Ql-cV*&%&YW_$hzl&EU_|RKf&c0 z>TD*#K2p>@Qwy+9GWA%MSpz(mrz9((xQDB$uEt}0(SB>Ms@0@z+?!To3Z$FRC0uKm z)MH#1IJX=KEN8dC_dMaj9CjEZxn_ZCFrQ-E!x*Q@?qS@g{(G1R>GAAgZWuL~E@ZZ( zF_?)C^4vVF>Mahl_Xq*ly)KM9uJllZLL!PNa4>aUoQ44xZW_!ZF=uf{Fqe4|_sLBd i1(Dz)bbao-f@nO5ym+!5sVsCAgU2GP-9CDeSpNX@ZQAqz literal 0 HcmV?d00001 diff --git a/qpdf/qtest/qpdf/bad-data.out b/qpdf/qtest/qpdf/bad-data.out new file mode 100644 index 00000000..3ea1d07f --- /dev/null +++ b/qpdf/qtest/qpdf/bad-data.out @@ -0,0 +1,2 @@ +WARNING: bad-data.pdf (file position 319): error decoding stream data for object 4 0: LZWDecoder: bad code received +qpdf: operation succeeded with warnings; resulting file may have some problems diff --git a/qpdf/qtest/qpdf/bad-data.pdf b/qpdf/qtest/qpdf/bad-data.pdf new file mode 100644 index 0000000000000000000000000000000000000000..94ddafd449dcd13cfec0a42e5221e3dc18655064 GIT binary patch literal 799 zcmah{OODe(5alDJ&=PkjB(ekAcKee^Q3Nv?i--Y25@{r&EIM%|7-hPV-GRW03vdHg z9Dxh4%?Kf}g7t6-c8F?w9HX$Hy>Y*)SM};uHwV4=j(bx(6bk;(*Zt^ zOU%I|WC(ox<|P6@lt!-f61rW6DzS++ToDfSS{ZPGKTMMxCLjvxv?7=FYN`#_RAo?^ z9Q?A~BTQ0xpf^;-p)Yhjh}s@dt%1xav63~!T(7e^>9mStO_Acqg`gd zcSt#m)$`XRwsR%t7}e4|2fsx%)0M#t`0=xo9?o@w0G5=S4CN`?e8}1PaPaN57(3q6 zM*r%kukYV&@7=t0ZTr2k<^9`dKgNx-pV^J`E0?}pzP=^bE|?eoS(#PW1DRtn0)K$3 zS7_3?1b(EbX{wgMPf~SXVVxZVz)k+U*|Zh17o!6Rm%_ew*H4m1i=U zuE>46)1lV(aTTv|l)goXfcJYa>G(20(Gn6-v?B)--^W=i@`Rs6i?&#}-T!Ay%azK^ P6gC8&%bLxjCo%g2aGTv+ literal 0 HcmV?d00001