Refactor parseContentStream

This commit is contained in:
Jay Berkenbilt 2018-01-30 21:25:51 -05:00
parent 05ff619b09
commit fcd611b61e
5 changed files with 155 additions and 74 deletions

View File

@ -45,6 +45,18 @@
characters may surround the EI operator that marks the end of an
inline image.
* New method QPDFObjectHandle::parsePageContents() to improve upon
QPDFObjectHandle::parseContentStream(). The parseContentStream
method used to operate on a single content stream, but was fixed
to properly handle pages with contents split across multiple
streams in an earlier release. The new method parsePageContents()
can be called on the page object rather than the value of the
page dictionary's /Contents key. This removes a few lines of
boilerplate code from any code that uses parseContentStream, and
it also enables more helpful error messages when problems are
encountered, since the messages can include information about
which page the streams come from.
2018-02-04 Jay Berkenbilt <ejb@ql.org>
* Add QPDFWriter::setLinearizationPass1Filename method and

View File

@ -88,7 +88,7 @@ class QPDFObjectHandle
virtual void decryptString(std::string& val) = 0;
};
// This class is used by parseContentStream. Callers must
// This class is used by parsePageContents. Callers must
// instantiate a subclass of this with handlers defined to accept
// QPDFObjectHandles that are parsed from the stream.
class ParserCallbacks
@ -103,8 +103,8 @@ class QPDFObjectHandle
protected:
// Implementors may call this method during parsing to
// terminate parsing early. This method throws an exception
// that is caught by parseContentStream, so its effect is
// terminate parsing early. This method throws an exception
// that is caught by parsePageContents, so its effect is
// immediate.
QPDF_DLL
void terminateParsing();
@ -187,6 +187,24 @@ class QPDFObjectHandle
QPDF* context);
// Helpers for parsing content streams
// Parse a page's contents through ParserCallbacks, described
// above. This method works whether the contents are a single
// stream or an array of streams. Call on a page object.
QPDF_DLL
void parsePageContents(ParserCallbacks* callbacks);
// Pipe a page's contents through the given pipeline. This method
// works whether the contents are a single stream or an array of
// streams. Call on a page object.
QPDF_DLL
void pipePageContents(Pipeline* p);
// Older method: stream_or_array should be the value of /Contents
// from a page object. It's more convenient to just call
// parsePageContents on the page object, and error messages will
// also be more useful because the page object information will be
// known.
QPDF_DLL
static void parseContentStream(QPDFObjectHandle stream_or_array,
ParserCallbacks* callbacks);
@ -697,12 +715,17 @@ class QPDFObjectHandle
QPDFTokenizer& tokenizer, bool& empty,
StringDecrypter* decrypter, QPDF* context,
bool content_stream);
static void parseContentStream_internal(
PointerHolder<Buffer> stream_data,
void parseContentStream_internal(
std::string const& description,
ParserCallbacks* callbacks);
// Other methods
static void parseContentStream_data(
PointerHolder<Buffer>,
std::string const& description,
ParserCallbacks* callbacks);
std::vector<QPDFObjectHandle> arrayOrStreamToStreamArray(
std::string const& description, std::string& all_description);
void pipeContentStreams(Pipeline* p, std::string const& description,
std::string& all_description);
static void warn(QPDF*, QPDFExc const&);
bool initialized;

View File

@ -628,44 +628,78 @@ QPDFObjectHandle::getPageImages()
}
std::vector<QPDFObjectHandle>
QPDFObjectHandle::getPageContents()
QPDFObjectHandle::arrayOrStreamToStreamArray(
std::string const& description, std::string& all_description)
{
assertPageObject();
all_description = description;
std::vector<QPDFObjectHandle> result;
QPDFObjectHandle contents = this->getKey("/Contents");
if (contents.isArray())
if (isArray())
{
int n_items = contents.getArrayNItems();
int n_items = getArrayNItems();
for (int i = 0; i < n_items; ++i)
{
QPDFObjectHandle item = contents.getArrayItem(i);
QPDFObjectHandle item = getArrayItem(i);
if (item.isStream())
{
result.push_back(item);
}
else
{
result.push_back(item);
}
else
{
throw std::runtime_error(
"unknown item type while inspecting "
"element of /Contents array in page "
"dictionary");
QTC::TC("qpdf", "QPDFObjectHandle non-stream in stream array");
warn(item.getOwningQPDF(),
QPDFExc(qpdf_e_damaged_pdf, description,
"item index " + QUtil::int_to_string(i) +
" (from 0)", 0,
"ignoring non-stream in an array of streams"));
}
}
}
else if (contents.isStream())
else if (isStream())
{
result.push_back(contents);
result.push_back(*this);
}
else if (! contents.isNull())
else if (! isNull())
{
throw std::runtime_error("unknown object type inspecting /Contents "
"key in page dictionary");
warn(getOwningQPDF(),
QPDFExc(qpdf_e_damaged_pdf, "", description, 0,
" object is supposed to be a stream or an"
" array of streams but is neither"));
}
bool first = true;
for (std::vector<QPDFObjectHandle>::iterator iter = result.begin();
iter != result.end(); ++iter)
{
QPDFObjectHandle item = *iter;
std::string og =
QUtil::int_to_string(item.getObjectID()) + " " +
QUtil::int_to_string(item.getGeneration());
if (first)
{
first = false;
}
else
{
all_description += ",";
}
all_description += " stream " + og;
}
return result;
}
std::vector<QPDFObjectHandle>
QPDFObjectHandle::getPageContents()
{
assertPageObject();
std::string description = "page object " +
QUtil::int_to_string(this->objid) + " " +
QUtil::int_to_string(this->generation);
std::string all_description;
return this->getKey("/Contents").arrayOrStreamToStreamArray(
description, all_description);
}
void
QPDFObjectHandle::addPageContents(QPDFObjectHandle new_contents, bool first)
{
@ -806,61 +840,72 @@ QPDFObjectHandle::parse(std::string const& object_str,
}
void
QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array,
ParserCallbacks* callbacks)
QPDFObjectHandle::pipePageContents(Pipeline* p)
{
std::vector<QPDFObjectHandle> streams;
if (stream_or_array.isArray())
{
streams = stream_or_array.getArrayAsVector();
}
else
{
streams.push_back(stream_or_array);
}
Pl_Buffer buf("concatenated stream data buffer");
std::string all_description = "content stream objects";
bool first = true;
std::string description = "page object " +
QUtil::int_to_string(this->objid) + " " +
QUtil::int_to_string(this->generation);
std::string all_description;
this->getKey("/Contents").pipeContentStreams(
p, description, all_description);
}
void
QPDFObjectHandle::pipeContentStreams(
Pipeline* p, std::string const& description, std::string& all_description)
{
std::vector<QPDFObjectHandle> streams =
arrayOrStreamToStreamArray(
description, all_description);
for (std::vector<QPDFObjectHandle>::iterator iter = streams.begin();
iter != streams.end(); ++iter)
{
QPDFObjectHandle stream = *iter;
if (! stream.isStream())
std::string og =
QUtil::int_to_string(stream.getObjectID()) + " " +
QUtil::int_to_string(stream.getGeneration());
std::string description = "content stream object " + og;
if (! stream.pipeStreamData(p, 0, qpdf_dl_specialized))
{
QTC::TC("qpdf", "QPDFObjectHandle non-stream in parsecontent");
QTC::TC("qpdf", "QPDFObjectHandle errors in parsecontent");
warn(stream.getOwningQPDF(),
QPDFExc(qpdf_e_damaged_pdf, "content stream",
"", 0,
"ignoring non-stream while parsing content streams"));
}
else
{
std::string og = QUtil::int_to_string(stream.getObjectID()) + " " +
QUtil::int_to_string(stream.getGeneration());
std::string description = "content stream object " + og;
if (first)
{
first = false;
}
else
{
all_description += ",";
}
all_description += " " + og;
if (! stream.pipeStreamData(&buf, 0, qpdf_dl_specialized))
{
QTC::TC("qpdf", "QPDFObjectHandle errors in parsecontent");
warn(stream.getOwningQPDF(),
QPDFExc(qpdf_e_damaged_pdf, "content stream",
description, 0,
"errors while decoding content stream"));
}
description, 0,
"errors while decoding content stream"));
}
}
}
// Parse this page's contents, delivering the parsed objects to the
// supplied callbacks. Must be called on a page object; the value of
// the page's /Contents key is forwarded to
// parseContentStream_internal together with a description of the
// page so that diagnostics can identify where the streams came from.
//
// callbacks: receiver for the parsed content; passed through
// unchanged to parseContentStream_internal.
void
QPDFObjectHandle::parsePageContents(ParserCallbacks* callbacks)
{
    // Same guard getPageContents() applies before reading /Contents,
    // so that misuse on a non-page object is reported consistently
    // across the page-level entry points.
    assertPageObject();

    // Description has the form "page object <objid> <generation>".
    std::string description = "page object " +
        QUtil::int_to_string(this->objid) + " " +
        QUtil::int_to_string(this->generation);
    this->getKey("/Contents").parseContentStream_internal(
        description, callbacks);
}
// Older static entry point: stream_or_array is the value of a page's
// /Contents key. Because the owning page is not known here, a generic
// description is supplied instead of a page-specific one.
void
QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array,
                                     ParserCallbacks* callbacks)
{
    std::string const generic_description("content stream objects");
    stream_or_array.parseContentStream_internal(
        generic_description, callbacks);
}
void
QPDFObjectHandle::parseContentStream_internal(
std::string const& description,
ParserCallbacks* callbacks)
{
Pl_Buffer buf("concatenated stream data buffer");
std::string all_description;
pipeContentStreams(&buf, description, all_description);
PointerHolder<Buffer> stream_data = buf.getBuffer();
try
{
parseContentStream_internal(stream_data, all_description, callbacks);
parseContentStream_data(stream_data, all_description, callbacks);
}
catch (TerminateParsing&)
{
@ -870,9 +915,10 @@ QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array,
}
void
QPDFObjectHandle::parseContentStream_internal(PointerHolder<Buffer> stream_data,
std::string const& description,
ParserCallbacks* callbacks)
QPDFObjectHandle::parseContentStream_data(
PointerHolder<Buffer> stream_data,
std::string const& description,
ParserCallbacks* callbacks)
{
size_t length = stream_data->getSize();
PointerHolder<InputSource> input =

View File

@ -277,7 +277,6 @@ QPDFObjectHandle found fake 1
QPDFObjectHandle no val for last key 0
QPDF resolve failure to null 0
QPDFWriter preserve unreferenced standard 0
QPDFObjectHandle non-stream in parsecontent 0
QPDFObjectHandle errors in parsecontent 0
QPDF stream with non-space 0
qpdf same file error 0
@ -304,3 +303,4 @@ QPDF_Stream TIFF predictor 0
QPDFTokenizer EOF when not allowed 0
QPDFTokenizer inline image at EOF 0
Pl_QPDFTokenizer found ID 0
QPDFObjectHandle non-stream in stream array 0

View File

@ -4,6 +4,6 @@ File is not encrypted
File is not linearized
WARNING: split-content-stream-errors.pdf (file position 557): error decoding stream data for object 6 0: LZWDecoder: bad code received
WARNING: split-content-stream-errors.pdf (file position 557): stream will be re-processed without filtering to avoid data loss
WARNING: content stream: ignoring non-stream while parsing content streams
WARNING: content stream objects (item index 0 (from 0)): ignoring non-stream in an array of streams
WARNING: split-content-stream-errors.pdf (file position 557): error decoding stream data for object 6 0: LZWDecoder: bad code received
WARNING: content stream (content stream object 6 0): errors while decoding content stream