diff --git a/ChangeLog b/ChangeLog index 119a4c6c..026833d4 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,9 @@ 2017-07-27 Jay Berkenbilt + * Add --precheck-streams command-line option and setPrecheckStreams + option to QPDFWriter to tell QPDFWriter to attempt decoding a + stream fully before deciding whether to filter it or not. + * Recover gracefully from streams that aren't filterable because the filter parameters are invalid in the stream dictionary or the dictionary itself is invalid. diff --git a/include/qpdf/QPDF.hh b/include/qpdf/QPDF.hh index 18a6851f..ef9ce597 100644 --- a/include/qpdf/QPDF.hh +++ b/include/qpdf/QPDF.hh @@ -540,13 +540,14 @@ class QPDF { friend class QPDF_Stream; private: - static void pipeStreamData(QPDF* qpdf, int objid, int generation, + static bool pipeStreamData(QPDF* qpdf, int objid, int generation, qpdf_offset_t offset, size_t length, QPDFObjectHandle dict, - Pipeline* pipeline) + Pipeline* pipeline, bool suppress_warnings) { - qpdf->pipeStreamData( - objid, generation, offset, length, dict, pipeline); + return qpdf->pipeStreamData( + objid, generation, offset, length, dict, pipeline, + suppress_warnings); } }; friend class Pipe; @@ -666,10 +667,11 @@ class QPDF void findAttachmentStreams(); // Calls finish() on the pipeline when done but does not delete it - void pipeStreamData(int objid, int generation, + bool pipeStreamData(int objid, int generation, qpdf_offset_t offset, size_t length, QPDFObjectHandle dict, - Pipeline* pipeline); + Pipeline* pipeline, + bool suppress_warnings); // For QPDFWriter: diff --git a/include/qpdf/QPDFObjectHandle.hh b/include/qpdf/QPDFObjectHandle.hh index 0fc989a5..11a52596 100644 --- a/include/qpdf/QPDFObjectHandle.hh +++ b/include/qpdf/QPDFObjectHandle.hh @@ -394,7 +394,8 @@ class QPDFObjectHandle // replaced if writing a new stream object. 
QPDF_DLL bool pipeStreamData(Pipeline*, bool filter, - bool normalize, bool compress); + bool normalize, bool compress, + bool suppress_warnings = false); // Replace a stream's dictionary. The new dictionary must be // consistent with the stream's data. This is most appropriately diff --git a/include/qpdf/QPDFWriter.hh b/include/qpdf/QPDFWriter.hh index b2738c1f..2687cce0 100644 --- a/include/qpdf/QPDFWriter.hh +++ b/include/qpdf/QPDFWriter.hh @@ -144,6 +144,17 @@ class QPDFWriter QPDF_DLL void setQDFMode(bool); + // Enable stream precheck mode. In this mode, all filterable + // streams are checked by actually attempting to decode them + // before filtering. This may add significant time to the process + // of writing the data because all streams from the input must be + // read twice, but it enables the raw stream data to be preserved + // even in cases where qpdf would run into errors decoding the + // stream after it determines that it should be able to do it. + // Examples would include compressed data with errors in it. + QPDF_DLL + void setPrecheckStreams(bool); + // Set the minimum PDF version. 
If the PDF version of the input // file (or previously set minimum version) is less than the // version passed to this method, the PDF version of the output @@ -415,6 +426,7 @@ class QPDFWriter bool stream_data_mode_set; qpdf_stream_data_e stream_data_mode; bool qdf_mode; + bool precheck_streams; bool static_id; bool suppress_original_object_ids; bool direct_stream_lengths; diff --git a/libqpdf/QPDF.cc b/libqpdf/QPDF.cc index 32c8cdf9..b5c1212c 100644 --- a/libqpdf/QPDF.cc +++ b/libqpdf/QPDF.cc @@ -2134,12 +2134,14 @@ QPDF::getCompressibleObjGens() return result; } -void +bool QPDF::pipeStreamData(int objid, int generation, qpdf_offset_t offset, size_t length, QPDFObjectHandle stream_dict, - Pipeline* pipeline) + Pipeline* pipeline, + bool suppress_warnings) { + bool success = false; std::vector > to_delete; if (this->encrypted) { @@ -2165,21 +2167,29 @@ QPDF::pipeStreamData(int objid, int generation, length -= len; pipeline->write(QUtil::unsigned_char_pointer(buf), len); } + success = true; } catch (QPDFExc& e) { - warn(e); + if (! suppress_warnings) + { + warn(e); + } } catch (std::runtime_error& e) { - QTC::TC("qpdf", "QPDF decoding error warning"); - warn(QPDFExc(qpdf_e_damaged_pdf, this->file->getName(), - "", this->file->getLastOffset(), - "error decoding stream data for object " + - QUtil::int_to_string(objid) + " " + - QUtil::int_to_string(generation) + ": " + e.what())); + if (! 
suppress_warnings) + { + QTC::TC("qpdf", "QPDF decoding error warning"); + warn(QPDFExc(qpdf_e_damaged_pdf, this->file->getName(), + "", this->file->getLastOffset(), + "error decoding stream data for object " + + QUtil::int_to_string(objid) + " " + + QUtil::int_to_string(generation) + ": " + e.what())); + } } pipeline->finish(); + return success; } void diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc index 7618cdf3..bac233df 100644 --- a/libqpdf/QPDFObjectHandle.cc +++ b/libqpdf/QPDFObjectHandle.cc @@ -496,11 +496,12 @@ QPDFObjectHandle::getRawStreamData() bool QPDFObjectHandle::pipeStreamData(Pipeline* p, bool filter, - bool normalize, bool compress) + bool normalize, bool compress, + bool suppress_warnings) { assertStream(); return dynamic_cast(obj.getPointer())->pipeStreamData( - p, filter, normalize, compress); + p, filter, normalize, compress, suppress_warnings); } void diff --git a/libqpdf/QPDFWriter.cc b/libqpdf/QPDFWriter.cc index 01748fc7..59e306fc 100644 --- a/libqpdf/QPDFWriter.cc +++ b/libqpdf/QPDFWriter.cc @@ -57,6 +57,7 @@ QPDFWriter::init() stream_data_mode_set = false; stream_data_mode = qpdf_s_compress; qdf_mode = false; + precheck_streams = false; static_id = false; suppress_original_object_ids = false; direct_stream_lengths = true; @@ -176,6 +177,12 @@ QPDFWriter::setQDFMode(bool val) this->qdf_mode = val; } +void +QPDFWriter::setPrecheckStreams(bool val) +{ + this->precheck_streams = val; +} + void QPDFWriter::setMinimumPDFVersion(std::string const& version) { @@ -1522,6 +1529,22 @@ QPDFWriter::unparseObject(QPDFObjectHandle object, int level, flags |= f_stream; + if (filter && this->precheck_streams) + { + try + { + QTC::TC("qpdf", "QPDFWriter precheck stream"); + Pl_Discard discard; + filter = object.pipeStreamData( + &discard, true, false, false, true); + } + catch (std::exception const&) + { + // Decoding failed during the precheck; leave this stream unfiltered. + filter = false; + } + } + pushPipeline(new Pl_Buffer("stream data")); activatePipelineStack(); bool filtered = diff --git 
a/libqpdf/QPDF_Stream.cc b/libqpdf/QPDF_Stream.cc index b4d14441..31d583b8 100644 --- a/libqpdf/QPDF_Stream.cc +++ b/libqpdf/QPDF_Stream.cc @@ -85,7 +85,7 @@ PointerHolder QPDF_Stream::getStreamData() { Pl_Buffer buf("stream data buffer"); - if (! pipeStreamData(&buf, true, false, false)) + if (! pipeStreamData(&buf, true, false, false, false)) { throw std::logic_error("getStreamData called on unfilterable stream"); } @@ -97,7 +97,7 @@ PointerHolder QPDF_Stream::getRawStreamData() { Pl_Buffer buf("stream data buffer"); - pipeStreamData(&buf, false, false, false); + pipeStreamData(&buf, false, false, false, false); QTC::TC("qpdf", "QPDF_Stream getRawStreamData"); return buf.getBuffer(); } @@ -351,7 +351,8 @@ QPDF_Stream::filterable(std::vector& filters, bool QPDF_Stream::pipeStreamData(Pipeline* pipeline, bool filter, - bool normalize, bool compress) + bool normalize, bool compress, + bool suppress_warnings) { std::vector filters; int predictor = 1; @@ -487,9 +488,13 @@ QPDF_Stream::pipeStreamData(Pipeline* pipeline, bool filter, else { QTC::TC("qpdf", "QPDF_Stream pipe original stream data"); - QPDF::Pipe::pipeStreamData(this->qpdf, this->objid, this->generation, - this->offset, this->length, - this->stream_dict, pipeline); + if (! QPDF::Pipe::pipeStreamData(this->qpdf, this->objid, this->generation, + this->offset, this->length, + this->stream_dict, pipeline, + suppress_warnings)) + { + filter = false; + } } return filter; diff --git a/libqpdf/qpdf/QPDF_Stream.hh b/libqpdf/qpdf/QPDF_Stream.hh index fa405d70..d053fd0f 100644 --- a/libqpdf/qpdf/QPDF_Stream.hh +++ b/libqpdf/qpdf/QPDF_Stream.hh @@ -23,7 +23,8 @@ class QPDF_Stream: public QPDFObject // See comments in QPDFObjectHandle.hh for these methods. 
bool pipeStreamData(Pipeline*, bool filter, - bool normalize, bool compress); + bool normalize, bool compress, + bool suppress_warnings); PointerHolder getStreamData(); PointerHolder getRawStreamData(); void replaceStreamData(PointerHolder data, diff --git a/manual/qpdf-manual.xml b/manual/qpdf-manual.xml index a4c34e90..cd35718d 100644 --- a/manual/qpdf-manual.xml +++ b/manual/qpdf-manual.xml @@ -821,6 +821,23 @@ outfile.pdf + + + + + Tells qpdf to precheck each stream for the ability to decode + it. Ordinarily qpdf tries to decode streams that it thinks it + can decode based on the filters, and if there ends up being an + error when actually trying to do the decode, the stream data + is truncated. This flag causes qpdf to actually read the + stream fully before deciding whether to filter the stream. + This option will slow qpdf down since it will have to read the + stream twice, but it allows raw stream data to be preserved in + cases where the decoding of the stream would fail for some + reason. This may be useful in working with some damaged files. 
+ + + diff --git a/qpdf/qpdf.cc b/qpdf/qpdf.cc index c52e1125..99cfd3a1 100644 --- a/qpdf/qpdf.cc +++ b/qpdf/qpdf.cc @@ -202,6 +202,7 @@ familiar with the PDF file format or who are PDF developers.\n\ --suppress-recovery prevents qpdf from attempting to recover damaged files\n\ --object-streams=mode controls handing of object streams\n\ --ignore-xref-streams tells qpdf to ignore any cross-reference streams\n\ +--precheck-streams precheck ability to decode streams\n\ --qdf turns on \"QDF mode\" (below)\n\ --min-version=version sets the minimum PDF version of the output file\n\ --force-version=version forces this to be the PDF version of the output file\n\ @@ -1028,6 +1029,7 @@ int main(int argc, char* argv[]) qpdf_object_stream_e object_stream_mode = qpdf_o_preserve; bool ignore_xref_streams = false; bool qdf_mode = false; + bool precheck_streams = false; std::string min_version; std::string force_version; @@ -1213,6 +1215,10 @@ int main(int argc, char* argv[]) { qdf_mode = true; } + else if (strcmp(arg, "precheck-streams") == 0) + { + precheck_streams = true; + } else if (strcmp(arg, "min-version") == 0) { if (parameter == 0) @@ -1704,6 +1710,10 @@ int main(int argc, char* argv[]) { w.setQDFMode(true); } + if (precheck_streams) + { + w.setPrecheckStreams(true); + } if (normalize_set) { w.setContentNormalization(normalize); diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov index 268ecb16..bf227c7a 100644 --- a/qpdf/qpdf.testcov +++ b/qpdf/qpdf.testcov @@ -279,3 +279,4 @@ QPDFObjectHandle treat word as string 0 QPDFObjectHandle found fake 1 QPDFObjectHandle no val for last key 0 QPDF resolve failure to null 0 +QPDFWriter precheck stream 0 diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test index b80ab9cb..b61882b9 100644 --- a/qpdf/qtest/qpdf.test +++ b/qpdf/qtest/qpdf.test @@ -723,6 +723,26 @@ $td->runtest("check output", {$td->FILE => "from-scratch-0.pdf"}); show_ntests(); # ---------- +$td->notify("--- Precheck streams ---"); +$n_tests += 4; + 
+$td->runtest("bad stream without precheck", + {$td->COMMAND => "qpdf --static-id bad-data.pdf a.pdf"}, + {$td->FILE => "bad-data.out", $td->EXIT_STATUS => 3}, + $td->NORMALIZE_NEWLINES); +$td->runtest("check output", + {$td->FILE => "a.pdf"}, + {$td->FILE => "bad-data-out.pdf"}); +$td->runtest("bad stream with precheck", + {$td->COMMAND => + "qpdf --static-id --precheck-streams bad-data.pdf a.pdf"}, + {$td->STRING => "", $td->EXIT_STATUS => 0}, + $td->NORMALIZE_NEWLINES); +$td->runtest("check output", + {$td->FILE => "a.pdf"}, + {$td->FILE => "bad-data-precheck.pdf"}); +show_ntests(); +# ---------- $td->notify("--- Copy Foreign Objects ---"); $n_tests += 7; diff --git a/qpdf/qtest/qpdf/bad-data-out.pdf b/qpdf/qtest/qpdf/bad-data-out.pdf new file mode 100644 index 00000000..f4300662 Binary files /dev/null and b/qpdf/qtest/qpdf/bad-data-out.pdf differ diff --git a/qpdf/qtest/qpdf/bad-data-precheck.pdf b/qpdf/qtest/qpdf/bad-data-precheck.pdf new file mode 100644 index 00000000..4314025a Binary files /dev/null and b/qpdf/qtest/qpdf/bad-data-precheck.pdf differ diff --git a/qpdf/qtest/qpdf/bad-data.out b/qpdf/qtest/qpdf/bad-data.out new file mode 100644 index 00000000..3ea1d07f --- /dev/null +++ b/qpdf/qtest/qpdf/bad-data.out @@ -0,0 +1,2 @@ +WARNING: bad-data.pdf (file position 319): error decoding stream data for object 4 0: LZWDecoder: bad code received +qpdf: operation succeeded with warnings; resulting file may have some problems diff --git a/qpdf/qtest/qpdf/bad-data.pdf b/qpdf/qtest/qpdf/bad-data.pdf new file mode 100644 index 00000000..94ddafd4 Binary files /dev/null and b/qpdf/qtest/qpdf/bad-data.pdf differ