Add precheck streams capability

When requested, QPDFWriter will do more aggressive prechecking of streams
to make sure it can actually succeed in decoding them before
attempting to do so. This will allow preservation of raw data even
when the raw data is corrupted relative to the specified filters.
This commit is contained in:
Jay Berkenbilt 2017-07-27 23:42:27 -04:00
parent 428d96dfe1
commit 7f8892525f
17 changed files with 133 additions and 25 deletions

View File

@ -1,5 +1,9 @@
2017-07-27 Jay Berkenbilt <ejb@ql.org>
* Add --precheck-streams command-line option and setStreamPrecheck
option to QPDFWriter to tell QPDFWriter to attempt decoding a
stream fully before deciding whether to filter it or not.
* Recover gracefully from streams that aren't filterable because
the filter parameters are invalid in the stream dictionary or the
dictionary itself is invalid.

View File

@ -540,13 +540,14 @@ class QPDF
{
friend class QPDF_Stream;
private:
static void pipeStreamData(QPDF* qpdf, int objid, int generation,
static bool pipeStreamData(QPDF* qpdf, int objid, int generation,
qpdf_offset_t offset, size_t length,
QPDFObjectHandle dict,
Pipeline* pipeline)
Pipeline* pipeline, bool suppress_warnings)
{
qpdf->pipeStreamData(
objid, generation, offset, length, dict, pipeline);
return qpdf->pipeStreamData(
objid, generation, offset, length, dict, pipeline,
suppress_warnings);
}
};
friend class Pipe;
@ -666,10 +667,11 @@ class QPDF
void findAttachmentStreams();
// Calls finish() on the pipeline when done but does not delete it
void pipeStreamData(int objid, int generation,
bool pipeStreamData(int objid, int generation,
qpdf_offset_t offset, size_t length,
QPDFObjectHandle dict,
Pipeline* pipeline);
Pipeline* pipeline,
bool suppress_warnings);
// For QPDFWriter:

View File

@ -394,7 +394,8 @@ class QPDFObjectHandle
// replaced if writing a new stream object.
QPDF_DLL
bool pipeStreamData(Pipeline*, bool filter,
bool normalize, bool compress);
bool normalize, bool compress,
bool suppress_warnings = false);
// Replace a stream's dictionary. The new dictionary must be
// consistent with the stream's data. This is most appropriately

View File

@ -144,6 +144,17 @@ class QPDFWriter
QPDF_DLL
void setQDFMode(bool);
// Enable stream precheck mode. In this mode, all filterable
// streams are checked by actually attempting to decode them
// before filtering. This may add significant time to the process
// of writing the data because all streams from the input must be
// read twice, but it enables the raw stream data to be preserved
// even in cases where qpdf would run into errors decoding the
// stream after it determines that it should be able to do it.
// Examples would include compressed data with errors in it.
QPDF_DLL
void setPrecheckStreams(bool);
// Set the minimum PDF version. If the PDF version of the input
// file (or previously set minimum version) is less than the
// version passed to this method, the PDF version of the output
@ -415,6 +426,7 @@ class QPDFWriter
bool stream_data_mode_set;
qpdf_stream_data_e stream_data_mode;
bool qdf_mode;
bool precheck_streams;
bool static_id;
bool suppress_original_object_ids;
bool direct_stream_lengths;

View File

@ -2134,12 +2134,14 @@ QPDF::getCompressibleObjGens()
return result;
}
void
bool
QPDF::pipeStreamData(int objid, int generation,
qpdf_offset_t offset, size_t length,
QPDFObjectHandle stream_dict,
Pipeline* pipeline)
Pipeline* pipeline,
bool suppress_warnings)
{
bool success = false;
std::vector<PointerHolder<Pipeline> > to_delete;
if (this->encrypted)
{
@ -2165,21 +2167,29 @@ QPDF::pipeStreamData(int objid, int generation,
length -= len;
pipeline->write(QUtil::unsigned_char_pointer(buf), len);
}
success = true;
}
catch (QPDFExc& e)
{
warn(e);
if (! suppress_warnings)
{
warn(e);
}
}
catch (std::runtime_error& e)
{
QTC::TC("qpdf", "QPDF decoding error warning");
warn(QPDFExc(qpdf_e_damaged_pdf, this->file->getName(),
"", this->file->getLastOffset(),
"error decoding stream data for object " +
QUtil::int_to_string(objid) + " " +
QUtil::int_to_string(generation) + ": " + e.what()));
if (! suppress_warnings)
{
QTC::TC("qpdf", "QPDF decoding error warning");
warn(QPDFExc(qpdf_e_damaged_pdf, this->file->getName(),
"", this->file->getLastOffset(),
"error decoding stream data for object " +
QUtil::int_to_string(objid) + " " +
QUtil::int_to_string(generation) + ": " + e.what()));
}
}
pipeline->finish();
return success;
}
void

View File

@ -496,11 +496,12 @@ QPDFObjectHandle::getRawStreamData()
bool
QPDFObjectHandle::pipeStreamData(Pipeline* p, bool filter,
bool normalize, bool compress)
bool normalize, bool compress,
bool suppress_warnings)
{
assertStream();
return dynamic_cast<QPDF_Stream*>(obj.getPointer())->pipeStreamData(
p, filter, normalize, compress);
p, filter, normalize, compress, suppress_warnings);
}
void

View File

@ -57,6 +57,7 @@ QPDFWriter::init()
stream_data_mode_set = false;
stream_data_mode = qpdf_s_compress;
qdf_mode = false;
precheck_streams = false;
static_id = false;
suppress_original_object_ids = false;
direct_stream_lengths = true;
@ -176,6 +177,12 @@ QPDFWriter::setQDFMode(bool val)
this->qdf_mode = val;
}
void
QPDFWriter::setPrecheckStreams(bool val)
{
this->precheck_streams = val;
}
void
QPDFWriter::setMinimumPDFVersion(std::string const& version)
{
@ -1522,6 +1529,21 @@ QPDFWriter::unparseObject(QPDFObjectHandle object, int level,
flags |= f_stream;
// Precheck mode: before committing to filtering this stream, run a full
// trial decode into a discarding pipeline. If the trial reports failure
// or throws, clear `filter` so the stream is not filtered.
if (filter && this->precheck_streams)
{
    try
    {
        QTC::TC("qpdf", "QPDFWriter precheck stream");
        Pl_Discard discard;
        // Last argument is suppress_warnings: the trial decode should not
        // emit warnings of its own.
        filter = object.pipeStreamData(
            &discard, true, false, false, true);
    }
    catch (std::exception const&)
    {
        // Caught by const reference so the exception object is neither
        // copied nor sliced; any failure here just disables filtering.
        filter = false;
    }
}
pushPipeline(new Pl_Buffer("stream data"));
activatePipelineStack();
bool filtered =

View File

@ -85,7 +85,7 @@ PointerHolder<Buffer>
QPDF_Stream::getStreamData()
{
Pl_Buffer buf("stream data buffer");
if (! pipeStreamData(&buf, true, false, false))
if (! pipeStreamData(&buf, true, false, false, false))
{
throw std::logic_error("getStreamData called on unfilterable stream");
}
@ -97,7 +97,7 @@ PointerHolder<Buffer>
QPDF_Stream::getRawStreamData()
{
Pl_Buffer buf("stream data buffer");
pipeStreamData(&buf, false, false, false);
pipeStreamData(&buf, false, false, false, false);
QTC::TC("qpdf", "QPDF_Stream getRawStreamData");
return buf.getBuffer();
}
@ -351,7 +351,8 @@ QPDF_Stream::filterable(std::vector<std::string>& filters,
bool
QPDF_Stream::pipeStreamData(Pipeline* pipeline, bool filter,
bool normalize, bool compress)
bool normalize, bool compress,
bool suppress_warnings)
{
std::vector<std::string> filters;
int predictor = 1;
@ -487,9 +488,13 @@ QPDF_Stream::pipeStreamData(Pipeline* pipeline, bool filter,
else
{
QTC::TC("qpdf", "QPDF_Stream pipe original stream data");
QPDF::Pipe::pipeStreamData(this->qpdf, this->objid, this->generation,
this->offset, this->length,
this->stream_dict, pipeline);
if (! QPDF::Pipe::pipeStreamData(this->qpdf, this->objid, this->generation,
this->offset, this->length,
this->stream_dict, pipeline,
suppress_warnings))
{
filter = false;
}
}
return filter;

View File

@ -23,7 +23,8 @@ class QPDF_Stream: public QPDFObject
// See comments in QPDFObjectHandle.hh for these methods.
bool pipeStreamData(Pipeline*, bool filter,
bool normalize, bool compress);
bool normalize, bool compress,
bool suppress_warnings);
PointerHolder<Buffer> getStreamData();
PointerHolder<Buffer> getRawStreamData();
void replaceStreamData(PointerHolder<Buffer> data,

View File

@ -821,6 +821,23 @@ outfile.pdf</option>
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>--precheck-streams</option></term>
<listitem>
<para>
Tells qpdf to precheck each stream for the ability to decode
it. Ordinarily qpdf tries to decode streams that it thinks it
can decode based on the filters, and if there ends up being an
error when actually trying to do the decode, the stream data
is truncated. This flag causes qpdf to actually read the
stream fully before deciding whether to filter the stream.
This option will slow qpdf down since it will have to read the
stream twice, but it allows raw stream data to be preserved in
cases where the decoding of the stream would fail for some
reason. This may be useful in working with some damaged files.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>--qdf</option></term>
<listitem>

View File

@ -202,6 +202,7 @@ familiar with the PDF file format or who are PDF developers.\n\
--suppress-recovery prevents qpdf from attempting to recover damaged files\n\
--object-streams=mode controls handing of object streams\n\
--ignore-xref-streams tells qpdf to ignore any cross-reference streams\n\
--precheck-streams precheck ability to decode streams\n\
--qdf turns on \"QDF mode\" (below)\n\
--min-version=version sets the minimum PDF version of the output file\n\
--force-version=version forces this to be the PDF version of the output file\n\
@ -1028,6 +1029,7 @@ int main(int argc, char* argv[])
qpdf_object_stream_e object_stream_mode = qpdf_o_preserve;
bool ignore_xref_streams = false;
bool qdf_mode = false;
bool precheck_streams = false;
std::string min_version;
std::string force_version;
@ -1213,6 +1215,10 @@ int main(int argc, char* argv[])
{
qdf_mode = true;
}
else if (strcmp(arg, "precheck-streams") == 0)
{
precheck_streams = true;
}
else if (strcmp(arg, "min-version") == 0)
{
if (parameter == 0)
@ -1704,6 +1710,10 @@ int main(int argc, char* argv[])
{
w.setQDFMode(true);
}
if (precheck_streams)
{
w.setPrecheckStreams(true);
}
if (normalize_set)
{
w.setContentNormalization(normalize);

View File

@ -279,3 +279,4 @@ QPDFObjectHandle treat word as string 0
QPDFObjectHandle found fake 1
QPDFObjectHandle no val for last key 0
QPDF resolve failure to null 0
QPDFWriter precheck stream 0

View File

@ -723,6 +723,26 @@ $td->runtest("check output",
{$td->FILE => "from-scratch-0.pdf"});
show_ntests();
# ----------
# Precheck streams: run qpdf on bad-data.pdf with and without
# --precheck-streams and compare both the command output and the
# resulting PDF against the expected files.
$td->notify("--- Precheck streams ---");
# Four runtest calls follow.
$n_tests += 4;
# Without precheck: the command output must match bad-data.out and the
# exit status must be 3.
$td->runtest("bad stream without precheck",
{$td->COMMAND => "qpdf --static-id bad-data.pdf a.pdf"},
{$td->FILE => "bad-data.out", $td->EXIT_STATUS => 3},
$td->NORMALIZE_NEWLINES);
$td->runtest("check output",
{$td->FILE => "a.pdf"},
{$td->FILE => "bad-data-out.pdf"});
# With precheck: the command must produce no output and exit with
# status 0.
$td->runtest("bad stream with precheck",
{$td->COMMAND =>
"qpdf --static-id --precheck-streams bad-data.pdf a.pdf"},
{$td->STRING => "", $td->EXIT_STATUS => 0},
$td->NORMALIZE_NEWLINES);
$td->runtest("check output",
{$td->FILE => "a.pdf"},
{$td->FILE => "bad-data-precheck.pdf"});
show_ntests();
# ----------
$td->notify("--- Copy Foreign Objects ---");
$n_tests += 7;

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,2 @@
WARNING: bad-data.pdf (file position 319): error decoding stream data for object 4 0: LZWDecoder: bad code received
qpdf: operation succeeded with warnings; resulting file may have some problems

Binary file not shown.