mirror of
https://github.com/qpdf/qpdf.git
synced 2025-01-31 10:58:25 +00:00
Add precheck streams capability
When requested, QPDFWriter will do more aggressive prechecking of streams to make sure it can actually succeed in decoding them before attempting to do so. This will allow preservation of raw data even when the raw data is corrupted relative to the specified filters.
This commit is contained in:
parent
428d96dfe1
commit
7f8892525f
@ -1,5 +1,9 @@
|
||||
2017-07-27 Jay Berkenbilt <ejb@ql.org>
|
||||
|
||||
* Add --precheck-streams command-line option and setPrecheckStreams
|
||||
option to QPDFWriter to tell QPDFWriter to attempt decoding a
|
||||
stream fully before deciding whether to filter it or not.
|
||||
|
||||
* Recover gracefully from streams that aren't filterable because
|
||||
the filter parameters are invalid in the stream dictionary or the
|
||||
dictionary itself is invalid.
|
||||
|
@ -540,13 +540,14 @@ class QPDF
|
||||
{
|
||||
friend class QPDF_Stream;
|
||||
private:
|
||||
static void pipeStreamData(QPDF* qpdf, int objid, int generation,
|
||||
static bool pipeStreamData(QPDF* qpdf, int objid, int generation,
|
||||
qpdf_offset_t offset, size_t length,
|
||||
QPDFObjectHandle dict,
|
||||
Pipeline* pipeline)
|
||||
Pipeline* pipeline, bool suppress_warnings)
|
||||
{
|
||||
qpdf->pipeStreamData(
|
||||
objid, generation, offset, length, dict, pipeline);
|
||||
return qpdf->pipeStreamData(
|
||||
objid, generation, offset, length, dict, pipeline,
|
||||
suppress_warnings);
|
||||
}
|
||||
};
|
||||
friend class Pipe;
|
||||
@ -666,10 +667,11 @@ class QPDF
|
||||
void findAttachmentStreams();
|
||||
|
||||
// Calls finish() on the pipeline when done but does not delete it
|
||||
void pipeStreamData(int objid, int generation,
|
||||
bool pipeStreamData(int objid, int generation,
|
||||
qpdf_offset_t offset, size_t length,
|
||||
QPDFObjectHandle dict,
|
||||
Pipeline* pipeline);
|
||||
Pipeline* pipeline,
|
||||
bool suppress_warnings);
|
||||
|
||||
// For QPDFWriter:
|
||||
|
||||
|
@ -394,7 +394,8 @@ class QPDFObjectHandle
|
||||
// replaced if writing a new stream object.
|
||||
QPDF_DLL
|
||||
bool pipeStreamData(Pipeline*, bool filter,
|
||||
bool normalize, bool compress);
|
||||
bool normalize, bool compress,
|
||||
bool suppress_warnings = false);
|
||||
|
||||
// Replace a stream's dictionary. The new dictionary must be
|
||||
// consistent with the stream's data. This is most appropriately
|
||||
|
@ -144,6 +144,17 @@ class QPDFWriter
|
||||
QPDF_DLL
|
||||
void setQDFMode(bool);
|
||||
|
||||
// Enable stream precheck mode. In this mode, all filterable
|
||||
// streams are checked by actually attempting to decode them
|
||||
// before filtering. This may add significant time to the process
|
||||
// of writing the data because all streams from the input must be
|
||||
// read twice, but it enables the raw stream data to be preserved
|
||||
// even in cases where qpdf would run into errors decoding the
|
||||
// stream after it determines that it should be able to do it.
|
||||
// Examples would include compressed data with errors in it.
|
||||
QPDF_DLL
|
||||
void setPrecheckStreams(bool);
|
||||
|
||||
// Set the minimum PDF version. If the PDF version of the input
|
||||
// file (or previously set minimum version) is less than the
|
||||
// version passed to this method, the PDF version of the output
|
||||
@ -415,6 +426,7 @@ class QPDFWriter
|
||||
bool stream_data_mode_set;
|
||||
qpdf_stream_data_e stream_data_mode;
|
||||
bool qdf_mode;
|
||||
bool precheck_streams;
|
||||
bool static_id;
|
||||
bool suppress_original_object_ids;
|
||||
bool direct_stream_lengths;
|
||||
|
@ -2134,12 +2134,14 @@ QPDF::getCompressibleObjGens()
|
||||
return result;
|
||||
}
|
||||
|
||||
void
|
||||
bool
|
||||
QPDF::pipeStreamData(int objid, int generation,
|
||||
qpdf_offset_t offset, size_t length,
|
||||
QPDFObjectHandle stream_dict,
|
||||
Pipeline* pipeline)
|
||||
Pipeline* pipeline,
|
||||
bool suppress_warnings)
|
||||
{
|
||||
bool success = false;
|
||||
std::vector<PointerHolder<Pipeline> > to_delete;
|
||||
if (this->encrypted)
|
||||
{
|
||||
@ -2165,21 +2167,29 @@ QPDF::pipeStreamData(int objid, int generation,
|
||||
length -= len;
|
||||
pipeline->write(QUtil::unsigned_char_pointer(buf), len);
|
||||
}
|
||||
success = true;
|
||||
}
|
||||
catch (QPDFExc& e)
|
||||
{
|
||||
warn(e);
|
||||
if (! suppress_warnings)
|
||||
{
|
||||
warn(e);
|
||||
}
|
||||
}
|
||||
catch (std::runtime_error& e)
|
||||
{
|
||||
QTC::TC("qpdf", "QPDF decoding error warning");
|
||||
warn(QPDFExc(qpdf_e_damaged_pdf, this->file->getName(),
|
||||
"", this->file->getLastOffset(),
|
||||
"error decoding stream data for object " +
|
||||
QUtil::int_to_string(objid) + " " +
|
||||
QUtil::int_to_string(generation) + ": " + e.what()));
|
||||
if (! suppress_warnings)
|
||||
{
|
||||
QTC::TC("qpdf", "QPDF decoding error warning");
|
||||
warn(QPDFExc(qpdf_e_damaged_pdf, this->file->getName(),
|
||||
"", this->file->getLastOffset(),
|
||||
"error decoding stream data for object " +
|
||||
QUtil::int_to_string(objid) + " " +
|
||||
QUtil::int_to_string(generation) + ": " + e.what()));
|
||||
}
|
||||
}
|
||||
pipeline->finish();
|
||||
return success;
|
||||
}
|
||||
|
||||
void
|
||||
|
@ -496,11 +496,12 @@ QPDFObjectHandle::getRawStreamData()
|
||||
|
||||
bool
|
||||
QPDFObjectHandle::pipeStreamData(Pipeline* p, bool filter,
|
||||
bool normalize, bool compress)
|
||||
bool normalize, bool compress,
|
||||
bool suppress_warnings)
|
||||
{
|
||||
assertStream();
|
||||
return dynamic_cast<QPDF_Stream*>(obj.getPointer())->pipeStreamData(
|
||||
p, filter, normalize, compress);
|
||||
p, filter, normalize, compress, suppress_warnings);
|
||||
}
|
||||
|
||||
void
|
||||
|
@ -57,6 +57,7 @@ QPDFWriter::init()
|
||||
stream_data_mode_set = false;
|
||||
stream_data_mode = qpdf_s_compress;
|
||||
qdf_mode = false;
|
||||
precheck_streams = false;
|
||||
static_id = false;
|
||||
suppress_original_object_ids = false;
|
||||
direct_stream_lengths = true;
|
||||
@ -176,6 +177,12 @@ QPDFWriter::setQDFMode(bool val)
|
||||
this->qdf_mode = val;
|
||||
}
|
||||
|
||||
void
|
||||
QPDFWriter::setPrecheckStreams(bool val)
|
||||
{
|
||||
this->precheck_streams = val;
|
||||
}
|
||||
|
||||
void
|
||||
QPDFWriter::setMinimumPDFVersion(std::string const& version)
|
||||
{
|
||||
@ -1522,6 +1529,21 @@ QPDFWriter::unparseObject(QPDFObjectHandle object, int level,
|
||||
|
||||
flags |= f_stream;
|
||||
|
||||
if (filter && this->precheck_streams)
|
||||
{
|
||||
try
|
||||
{
|
||||
QTC::TC("qpdf", "QPDFWriter precheck stream");
|
||||
Pl_Discard discard;
|
||||
filter = object.pipeStreamData(
|
||||
&discard, true, false, false, true);
|
||||
}
|
||||
catch (std::exception)
|
||||
{
|
||||
filter = false;
|
||||
}
|
||||
}
|
||||
|
||||
pushPipeline(new Pl_Buffer("stream data"));
|
||||
activatePipelineStack();
|
||||
bool filtered =
|
||||
|
@ -85,7 +85,7 @@ PointerHolder<Buffer>
|
||||
QPDF_Stream::getStreamData()
|
||||
{
|
||||
Pl_Buffer buf("stream data buffer");
|
||||
if (! pipeStreamData(&buf, true, false, false))
|
||||
if (! pipeStreamData(&buf, true, false, false, false))
|
||||
{
|
||||
throw std::logic_error("getStreamData called on unfilterable stream");
|
||||
}
|
||||
@ -97,7 +97,7 @@ PointerHolder<Buffer>
|
||||
QPDF_Stream::getRawStreamData()
|
||||
{
|
||||
Pl_Buffer buf("stream data buffer");
|
||||
pipeStreamData(&buf, false, false, false);
|
||||
pipeStreamData(&buf, false, false, false, false);
|
||||
QTC::TC("qpdf", "QPDF_Stream getRawStreamData");
|
||||
return buf.getBuffer();
|
||||
}
|
||||
@ -351,7 +351,8 @@ QPDF_Stream::filterable(std::vector<std::string>& filters,
|
||||
|
||||
bool
|
||||
QPDF_Stream::pipeStreamData(Pipeline* pipeline, bool filter,
|
||||
bool normalize, bool compress)
|
||||
bool normalize, bool compress,
|
||||
bool suppress_warnings)
|
||||
{
|
||||
std::vector<std::string> filters;
|
||||
int predictor = 1;
|
||||
@ -487,9 +488,13 @@ QPDF_Stream::pipeStreamData(Pipeline* pipeline, bool filter,
|
||||
else
|
||||
{
|
||||
QTC::TC("qpdf", "QPDF_Stream pipe original stream data");
|
||||
QPDF::Pipe::pipeStreamData(this->qpdf, this->objid, this->generation,
|
||||
this->offset, this->length,
|
||||
this->stream_dict, pipeline);
|
||||
if (! QPDF::Pipe::pipeStreamData(this->qpdf, this->objid, this->generation,
|
||||
this->offset, this->length,
|
||||
this->stream_dict, pipeline,
|
||||
suppress_warnings))
|
||||
{
|
||||
filter = false;
|
||||
}
|
||||
}
|
||||
|
||||
return filter;
|
||||
|
@ -23,7 +23,8 @@ class QPDF_Stream: public QPDFObject
|
||||
|
||||
// See comments in QPDFObjectHandle.hh for these methods.
|
||||
bool pipeStreamData(Pipeline*, bool filter,
|
||||
bool normalize, bool compress);
|
||||
bool normalize, bool compress,
|
||||
bool suppress_warnings);
|
||||
PointerHolder<Buffer> getStreamData();
|
||||
PointerHolder<Buffer> getRawStreamData();
|
||||
void replaceStreamData(PointerHolder<Buffer> data,
|
||||
|
@ -821,6 +821,23 @@ outfile.pdf</option>
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term><option>--precheck-streams</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Tells qpdf to precheck each stream for the ability to decode
|
||||
it. Ordinarily qpdf tries to decode streams that it thinks it
|
||||
can decode based on the filters, and if there ends up being an
|
||||
error when actually trying to do the decode, the stream data
|
||||
is truncated. This flag causes qpdf to actually read the
|
||||
stream fully before deciding whether to filter the stream.
|
||||
This option will slow qpdf down since it will have to read the
|
||||
stream twice, but it allows raw stream data to be preserved in
|
||||
cases where the decoding of the stream would fail for some
|
||||
reason. This may be useful in working with some damaged files.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term><option>--qdf</option></term>
|
||||
<listitem>
|
||||
|
10
qpdf/qpdf.cc
10
qpdf/qpdf.cc
@ -202,6 +202,7 @@ familiar with the PDF file format or who are PDF developers.\n\
|
||||
--suppress-recovery prevents qpdf from attempting to recover damaged files\n\
|
||||
--object-streams=mode controls handing of object streams\n\
|
||||
--ignore-xref-streams tells qpdf to ignore any cross-reference streams\n\
|
||||
--precheck-streams precheck ability to decode streams\n\
|
||||
--qdf turns on \"QDF mode\" (below)\n\
|
||||
--min-version=version sets the minimum PDF version of the output file\n\
|
||||
--force-version=version forces this to be the PDF version of the output file\n\
|
||||
@ -1028,6 +1029,7 @@ int main(int argc, char* argv[])
|
||||
qpdf_object_stream_e object_stream_mode = qpdf_o_preserve;
|
||||
bool ignore_xref_streams = false;
|
||||
bool qdf_mode = false;
|
||||
bool precheck_streams = false;
|
||||
std::string min_version;
|
||||
std::string force_version;
|
||||
|
||||
@ -1213,6 +1215,10 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
qdf_mode = true;
|
||||
}
|
||||
else if (strcmp(arg, "precheck-streams") == 0)
|
||||
{
|
||||
precheck_streams = true;
|
||||
}
|
||||
else if (strcmp(arg, "min-version") == 0)
|
||||
{
|
||||
if (parameter == 0)
|
||||
@ -1704,6 +1710,10 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
w.setQDFMode(true);
|
||||
}
|
||||
if (precheck_streams)
|
||||
{
|
||||
w.setPrecheckStreams(true);
|
||||
}
|
||||
if (normalize_set)
|
||||
{
|
||||
w.setContentNormalization(normalize);
|
||||
|
@ -279,3 +279,4 @@ QPDFObjectHandle treat word as string 0
|
||||
QPDFObjectHandle found fake 1
|
||||
QPDFObjectHandle no val for last key 0
|
||||
QPDF resolve failure to null 0
|
||||
QPDFWriter precheck stream 0
|
||||
|
@ -723,6 +723,26 @@ $td->runtest("check output",
|
||||
{$td->FILE => "from-scratch-0.pdf"});
|
||||
show_ntests();
|
||||
# ----------
|
||||
$td->notify("--- Precheck streams ---");
|
||||
$n_tests += 4;
|
||||
|
||||
$td->runtest("bad stream without precheck",
|
||||
{$td->COMMAND => "qpdf --static-id bad-data.pdf a.pdf"},
|
||||
{$td->FILE => "bad-data.out", $td->EXIT_STATUS => 3},
|
||||
$td->NORMALIZE_NEWLINES);
|
||||
$td->runtest("check output",
|
||||
{$td->FILE => "a.pdf"},
|
||||
{$td->FILE => "bad-data-out.pdf"});
|
||||
$td->runtest("bad stream with precheck",
|
||||
{$td->COMMAND =>
|
||||
"qpdf --static-id --precheck-streams bad-data.pdf a.pdf"},
|
||||
{$td->STRING => "", $td->EXIT_STATUS => 0},
|
||||
$td->NORMALIZE_NEWLINES);
|
||||
$td->runtest("check output",
|
||||
{$td->FILE => "a.pdf"},
|
||||
{$td->FILE => "bad-data-precheck.pdf"});
|
||||
show_ntests();
|
||||
# ----------
|
||||
$td->notify("--- Copy Foreign Objects ---");
|
||||
$n_tests += 7;
|
||||
|
||||
|
BIN
qpdf/qtest/qpdf/bad-data-out.pdf
Normal file
BIN
qpdf/qtest/qpdf/bad-data-out.pdf
Normal file
Binary file not shown.
BIN
qpdf/qtest/qpdf/bad-data-precheck.pdf
Normal file
BIN
qpdf/qtest/qpdf/bad-data-precheck.pdf
Normal file
Binary file not shown.
2
qpdf/qtest/qpdf/bad-data.out
Normal file
2
qpdf/qtest/qpdf/bad-data.out
Normal file
@ -0,0 +1,2 @@
|
||||
WARNING: bad-data.pdf (file position 319): error decoding stream data for object 4 0: LZWDecoder: bad code received
|
||||
qpdf: operation succeeded with warnings; resulting file may have some problems
|
BIN
qpdf/qtest/qpdf/bad-data.pdf
Normal file
BIN
qpdf/qtest/qpdf/bad-data.pdf
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user