2
1
mirror of https://github.com/qpdf/qpdf.git synced 2024-06-03 19:00:51 +00:00

Refactor parseContentStream

This commit is contained in:
Jay Berkenbilt 2018-01-30 21:25:51 -05:00
parent 05ff619b09
commit fcd611b61e
5 changed files with 155 additions and 74 deletions

View File

@ -45,6 +45,18 @@
characters may surround the EI operator that marks the end of an characters may surround the EI operator that marks the end of an
inline image. inline image.
* New method QPDFObjectHandle::parsePageContents() to improve upon
QPDFObjectHandle::parseContentStream(). The parseContentStream
method used to operate on a single content stream, but was fixed
to properly handle pages with contents split across multiple
streams in an earlier release. The new method parsePageContents()
can be called on the page object rather than the value of the
page dictionary's /Contents key. This removes a few lines of
boilerplate from any caller of parseContentStream, and
it also enables creation of more helpful error messages if
problems are encountered as the error messages can include
information about which page the streams come from.
2018-02-04 Jay Berkenbilt <ejb@ql.org> 2018-02-04 Jay Berkenbilt <ejb@ql.org>
* Add QPDFWriter::setLinearizationPass1Filename method and * Add QPDFWriter::setLinearizationPass1Filename method and

View File

@ -88,7 +88,7 @@ class QPDFObjectHandle
virtual void decryptString(std::string& val) = 0; virtual void decryptString(std::string& val) = 0;
}; };
// This class is used by parseContentStream. Callers must // This class is used by parsePageContents. Callers must
// instantiate a subclass of this with handlers defined to accept // instantiate a subclass of this with handlers defined to accept
// QPDFObjectHandles that are parsed from the stream. // QPDFObjectHandles that are parsed from the stream.
class ParserCallbacks class ParserCallbacks
@ -103,8 +103,8 @@ class QPDFObjectHandle
protected: protected:
// Implementors may call this method during parsing to // Implementors may call this method during parsing to
// terminate parsing early. This method throws an exception // terminate parsing early. This method throws an exception
// that is caught by parseContentStream, so its effect is // that is caught by parsePageContents, so its effect is
// immediate. // immediate.
QPDF_DLL QPDF_DLL
void terminateParsing(); void terminateParsing();
@ -187,6 +187,24 @@ class QPDFObjectHandle
QPDF* context); QPDF* context);
// Helpers for parsing content streams // Helpers for parsing content streams
// Parse a page's contents through ParserCallbacks, described
// above. This method works whether the contents are a single
// stream or an array of streams. Call on a page object.
QPDF_DLL
void parsePageContents(ParserCallbacks* callbacks);
// Pipe a page's contents through the given pipeline. This method
// works whether the contents are a single stream or an array of
// streams. Call on a page object.
QPDF_DLL
void pipePageContents(Pipeline* p);
// Older method: stream_or_array should be the value of /Contents
// from a page object. It's more convenient to just call
// parsePageContents on the page object, and error messages will
// also be more useful because the page object information will be
// known.
QPDF_DLL QPDF_DLL
static void parseContentStream(QPDFObjectHandle stream_or_array, static void parseContentStream(QPDFObjectHandle stream_or_array,
ParserCallbacks* callbacks); ParserCallbacks* callbacks);
@ -697,12 +715,17 @@ class QPDFObjectHandle
QPDFTokenizer& tokenizer, bool& empty, QPDFTokenizer& tokenizer, bool& empty,
StringDecrypter* decrypter, QPDF* context, StringDecrypter* decrypter, QPDF* context,
bool content_stream); bool content_stream);
static void parseContentStream_internal( void parseContentStream_internal(
PointerHolder<Buffer> stream_data,
std::string const& description, std::string const& description,
ParserCallbacks* callbacks); ParserCallbacks* callbacks);
static void parseContentStream_data(
// Other methods PointerHolder<Buffer>,
std::string const& description,
ParserCallbacks* callbacks);
std::vector<QPDFObjectHandle> arrayOrStreamToStreamArray(
std::string const& description, std::string& all_description);
void pipeContentStreams(Pipeline* p, std::string const& description,
std::string& all_description);
static void warn(QPDF*, QPDFExc const&); static void warn(QPDF*, QPDFExc const&);
bool initialized; bool initialized;

View File

@ -628,44 +628,78 @@ QPDFObjectHandle::getPageImages()
} }
std::vector<QPDFObjectHandle> std::vector<QPDFObjectHandle>
QPDFObjectHandle::getPageContents() QPDFObjectHandle::arrayOrStreamToStreamArray(
std::string const& description, std::string& all_description)
{ {
assertPageObject(); all_description = description;
std::vector<QPDFObjectHandle> result; std::vector<QPDFObjectHandle> result;
QPDFObjectHandle contents = this->getKey("/Contents"); if (isArray())
if (contents.isArray())
{ {
int n_items = contents.getArrayNItems(); int n_items = getArrayNItems();
for (int i = 0; i < n_items; ++i) for (int i = 0; i < n_items; ++i)
{ {
QPDFObjectHandle item = contents.getArrayItem(i); QPDFObjectHandle item = getArrayItem(i);
if (item.isStream()) if (item.isStream())
{
result.push_back(item);
}
else
{ {
result.push_back(item); QTC::TC("qpdf", "QPDFObjectHandle non-stream in stream array");
} warn(item.getOwningQPDF(),
else QPDFExc(qpdf_e_damaged_pdf, description,
{ "item index " + QUtil::int_to_string(i) +
throw std::runtime_error( " (from 0)", 0,
"unknown item type while inspecting " "ignoring non-stream in an array of streams"));
"element of /Contents array in page "
"dictionary");
} }
} }
} }
else if (contents.isStream()) else if (isStream())
{ {
result.push_back(contents); result.push_back(*this);
} }
else if (! contents.isNull()) else if (! isNull())
{ {
throw std::runtime_error("unknown object type inspecting /Contents " warn(getOwningQPDF(),
"key in page dictionary"); QPDFExc(qpdf_e_damaged_pdf, "", description, 0,
" object is supposed to be a stream or an"
" array of streams but is neither"));
}
bool first = true;
for (std::vector<QPDFObjectHandle>::iterator iter = result.begin();
iter != result.end(); ++iter)
{
QPDFObjectHandle item = *iter;
std::string og =
QUtil::int_to_string(item.getObjectID()) + " " +
QUtil::int_to_string(item.getGeneration());
if (first)
{
first = false;
}
else
{
all_description += ",";
}
all_description += " stream " + og;
} }
return result; return result;
} }
std::vector<QPDFObjectHandle>
QPDFObjectHandle::getPageContents()
{
assertPageObject();
std::string description = "page object " +
QUtil::int_to_string(this->objid) + " " +
QUtil::int_to_string(this->generation);
std::string all_description;
return this->getKey("/Contents").arrayOrStreamToStreamArray(
description, all_description);
}
void void
QPDFObjectHandle::addPageContents(QPDFObjectHandle new_contents, bool first) QPDFObjectHandle::addPageContents(QPDFObjectHandle new_contents, bool first)
{ {
@ -806,61 +840,72 @@ QPDFObjectHandle::parse(std::string const& object_str,
} }
void void
QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array, QPDFObjectHandle::pipePageContents(Pipeline* p)
ParserCallbacks* callbacks)
{ {
std::vector<QPDFObjectHandle> streams; std::string description = "page object " +
if (stream_or_array.isArray()) QUtil::int_to_string(this->objid) + " " +
{ QUtil::int_to_string(this->generation);
streams = stream_or_array.getArrayAsVector(); std::string all_description;
} this->getKey("/Contents").pipeContentStreams(
else p, description, all_description);
{ }
streams.push_back(stream_or_array);
} void
Pl_Buffer buf("concatenated stream data buffer"); QPDFObjectHandle::pipeContentStreams(
std::string all_description = "content stream objects"; Pipeline* p, std::string const& description, std::string& all_description)
bool first = true; {
std::vector<QPDFObjectHandle> streams =
arrayOrStreamToStreamArray(
description, all_description);
for (std::vector<QPDFObjectHandle>::iterator iter = streams.begin(); for (std::vector<QPDFObjectHandle>::iterator iter = streams.begin();
iter != streams.end(); ++iter) iter != streams.end(); ++iter)
{ {
QPDFObjectHandle stream = *iter; QPDFObjectHandle stream = *iter;
if (! stream.isStream()) std::string og =
QUtil::int_to_string(stream.getObjectID()) + " " +
QUtil::int_to_string(stream.getGeneration());
std::string description = "content stream object " + og;
if (! stream.pipeStreamData(p, 0, qpdf_dl_specialized))
{ {
QTC::TC("qpdf", "QPDFObjectHandle non-stream in parsecontent"); QTC::TC("qpdf", "QPDFObjectHandle errors in parsecontent");
warn(stream.getOwningQPDF(), warn(stream.getOwningQPDF(),
QPDFExc(qpdf_e_damaged_pdf, "content stream", QPDFExc(qpdf_e_damaged_pdf, "content stream",
"", 0, description, 0,
"ignoring non-stream while parsing content streams")); "errors while decoding content stream"));
}
else
{
std::string og = QUtil::int_to_string(stream.getObjectID()) + " " +
QUtil::int_to_string(stream.getGeneration());
std::string description = "content stream object " + og;
if (first)
{
first = false;
}
else
{
all_description += ",";
}
all_description += " " + og;
if (! stream.pipeStreamData(&buf, 0, qpdf_dl_specialized))
{
QTC::TC("qpdf", "QPDFObjectHandle errors in parsecontent");
warn(stream.getOwningQPDF(),
QPDFExc(qpdf_e_damaged_pdf, "content stream",
description, 0,
"errors while decoding content stream"));
}
} }
} }
}
// Parse this page's contents through the supplied ParserCallbacks.
// The value of the /Contents key is passed to
// parseContentStream_internal() along with a
// "page object <objid> <generation>" description.
void
QPDFObjectHandle::parsePageContents(ParserCallbacks* callbacks)
{
    std::string page_label("page object ");
    page_label += QUtil::int_to_string(this->objid);
    page_label += " ";
    page_label += QUtil::int_to_string(this->generation);

    QPDFObjectHandle contents = this->getKey("/Contents");
    contents.parseContentStream_internal(page_label, callbacks);
}
// Older static entry point. stream_or_array is the object to parse
// (a stream or an array of streams); it is forwarded to
// parseContentStream_internal() with the generic description
// "content stream objects", since no page object is known here.
void
QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array,
                                     ParserCallbacks* callbacks)
{
    std::string const description("content stream objects");
    stream_or_array.parseContentStream_internal(description, callbacks);
}
void
QPDFObjectHandle::parseContentStream_internal(
std::string const& description,
ParserCallbacks* callbacks)
{
Pl_Buffer buf("concatenated stream data buffer");
std::string all_description;
pipeContentStreams(&buf, description, all_description);
PointerHolder<Buffer> stream_data = buf.getBuffer(); PointerHolder<Buffer> stream_data = buf.getBuffer();
try try
{ {
parseContentStream_internal(stream_data, all_description, callbacks); parseContentStream_data(stream_data, all_description, callbacks);
} }
catch (TerminateParsing&) catch (TerminateParsing&)
{ {
@ -870,9 +915,10 @@ QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array,
} }
void void
QPDFObjectHandle::parseContentStream_internal(PointerHolder<Buffer> stream_data, QPDFObjectHandle::parseContentStream_data(
std::string const& description, PointerHolder<Buffer> stream_data,
ParserCallbacks* callbacks) std::string const& description,
ParserCallbacks* callbacks)
{ {
size_t length = stream_data->getSize(); size_t length = stream_data->getSize();
PointerHolder<InputSource> input = PointerHolder<InputSource> input =

View File

@ -277,7 +277,6 @@ QPDFObjectHandle found fake 1
QPDFObjectHandle no val for last key 0 QPDFObjectHandle no val for last key 0
QPDF resolve failure to null 0 QPDF resolve failure to null 0
QPDFWriter preserve unreferenced standard 0 QPDFWriter preserve unreferenced standard 0
QPDFObjectHandle non-stream in parsecontent 0
QPDFObjectHandle errors in parsecontent 0 QPDFObjectHandle errors in parsecontent 0
QPDF stream with non-space 0 QPDF stream with non-space 0
qpdf same file error 0 qpdf same file error 0
@ -304,3 +303,4 @@ QPDF_Stream TIFF predictor 0
QPDFTokenizer EOF when not allowed 0 QPDFTokenizer EOF when not allowed 0
QPDFTokenizer inline image at EOF 0 QPDFTokenizer inline image at EOF 0
Pl_QPDFTokenizer found ID 0 Pl_QPDFTokenizer found ID 0
QPDFObjectHandle non-stream in stream array 0

View File

@ -4,6 +4,6 @@ File is not encrypted
File is not linearized File is not linearized
WARNING: split-content-stream-errors.pdf (file position 557): error decoding stream data for object 6 0: LZWDecoder: bad code received WARNING: split-content-stream-errors.pdf (file position 557): error decoding stream data for object 6 0: LZWDecoder: bad code received
WARNING: split-content-stream-errors.pdf (file position 557): stream will be re-processed without filtering to avoid data loss WARNING: split-content-stream-errors.pdf (file position 557): stream will be re-processed without filtering to avoid data loss
WARNING: content stream: ignoring non-stream while parsing content streams WARNING: content stream objects (item index 0 (from 0)): ignoring non-stream in an array of streams
WARNING: split-content-stream-errors.pdf (file position 557): error decoding stream data for object 6 0: LZWDecoder: bad code received WARNING: split-content-stream-errors.pdf (file position 557): error decoding stream data for object 6 0: LZWDecoder: bad code received
WARNING: content stream (content stream object 6 0): errors while decoding content stream WARNING: content stream (content stream object 6 0): errors while decoding content stream