mirror of
https://github.com/qpdf/qpdf.git
synced 2024-12-22 02:49:00 +00:00
Refactor parseContentStream
This commit is contained in:
parent
05ff619b09
commit
fcd611b61e
12
ChangeLog
12
ChangeLog
@ -45,6 +45,18 @@
|
||||
characters may surround the EI operator that marks the end of an
|
||||
inline image.
|
||||
|
||||
* New method QPDFObjectHandle::parsePageContents() to improve upon
|
||||
QPDFObjectHandle::parseContentStream(). The parseContentStream
|
||||
method used to operate on a single content stream, but an earlier
|
||||
release fixed it to properly handle pages with contents split
|
||||
across multiple streams. The new method parsePageContents()
|
||||
can be called on the page object rather than the value of the
|
||||
page dictionary's /Contents key. This removes a few lines of
|
||||
boilerplate code from any code that uses parseContentStream, and
|
||||
it also enables creation of more helpful error messages if
|
||||
problems are encountered, because the error messages can include
|
||||
information about which page the streams come from.
|
||||
|
||||
2018-02-04 Jay Berkenbilt <ejb@ql.org>
|
||||
|
||||
* Add QPDFWriter::setLinearizationPass1Filename method and
|
||||
|
@ -88,7 +88,7 @@ class QPDFObjectHandle
|
||||
virtual void decryptString(std::string& val) = 0;
|
||||
};
|
||||
|
||||
// This class is used by parseContentStream. Callers must
|
||||
// This class is used by parsePageContents. Callers must
|
||||
// instantiate a subclass of this with handlers defined to accept
|
||||
// QPDFObjectHandles that are parsed from the stream.
|
||||
class ParserCallbacks
|
||||
@ -103,8 +103,8 @@ class QPDFObjectHandle
|
||||
|
||||
protected:
|
||||
// Implementors may call this method during parsing to
|
||||
// terminate parsing early. This method throws an exception
|
||||
// that is caught by parseContentStream, so its effect is
|
||||
// terminate parsing early. This method throws an exception
|
||||
// that is caught by parsePageContents, so its effect is
|
||||
// immediate.
|
||||
QPDF_DLL
|
||||
void terminateParsing();
|
||||
@ -187,6 +187,24 @@ class QPDFObjectHandle
|
||||
QPDF* context);
|
||||
|
||||
// Helpers for parsing content streams
|
||||
|
||||
// Parse a page's contents through ParserCallbacks, described
|
||||
// above. This method works whether the contents are a single
|
||||
// stream or an array of streams. Call on a page object.
|
||||
QPDF_DLL
|
||||
void parsePageContents(ParserCallbacks* callbacks);
|
||||
|
||||
// Pipe a page's contents through the given pipeline. This method
|
||||
// works whether the contents are a single stream or an array of
|
||||
// streams. Call on a page object.
|
||||
QPDF_DLL
|
||||
void pipePageContents(Pipeline* p);
|
||||
|
||||
// Older method: stream_or_array should be the value of /Contents
|
||||
// from a page object. It's more convenient to just call
|
||||
// parsePageContents on the page object, and error messages will
|
||||
// also be more useful because the page object information will be
|
||||
// known.
|
||||
QPDF_DLL
|
||||
static void parseContentStream(QPDFObjectHandle stream_or_array,
|
||||
ParserCallbacks* callbacks);
|
||||
@ -697,12 +715,17 @@ class QPDFObjectHandle
|
||||
QPDFTokenizer& tokenizer, bool& empty,
|
||||
StringDecrypter* decrypter, QPDF* context,
|
||||
bool content_stream);
|
||||
static void parseContentStream_internal(
|
||||
PointerHolder<Buffer> stream_data,
|
||||
void parseContentStream_internal(
|
||||
std::string const& description,
|
||||
ParserCallbacks* callbacks);
|
||||
|
||||
// Other methods
|
||||
static void parseContentStream_data(
|
||||
PointerHolder<Buffer>,
|
||||
std::string const& description,
|
||||
ParserCallbacks* callbacks);
|
||||
std::vector<QPDFObjectHandle> arrayOrStreamToStreamArray(
|
||||
std::string const& description, std::string& all_description);
|
||||
void pipeContentStreams(Pipeline* p, std::string const& description,
|
||||
std::string& all_description);
|
||||
static void warn(QPDF*, QPDFExc const&);
|
||||
|
||||
bool initialized;
|
||||
|
@ -628,44 +628,78 @@ QPDFObjectHandle::getPageImages()
|
||||
}
|
||||
|
||||
std::vector<QPDFObjectHandle>
|
||||
QPDFObjectHandle::getPageContents()
|
||||
QPDFObjectHandle::arrayOrStreamToStreamArray(
|
||||
std::string const& description, std::string& all_description)
|
||||
{
|
||||
assertPageObject();
|
||||
|
||||
all_description = description;
|
||||
std::vector<QPDFObjectHandle> result;
|
||||
QPDFObjectHandle contents = this->getKey("/Contents");
|
||||
if (contents.isArray())
|
||||
if (isArray())
|
||||
{
|
||||
int n_items = contents.getArrayNItems();
|
||||
int n_items = getArrayNItems();
|
||||
for (int i = 0; i < n_items; ++i)
|
||||
{
|
||||
QPDFObjectHandle item = contents.getArrayItem(i);
|
||||
QPDFObjectHandle item = getArrayItem(i);
|
||||
if (item.isStream())
|
||||
{
|
||||
result.push_back(item);
|
||||
}
|
||||
else
|
||||
{
|
||||
result.push_back(item);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"unknown item type while inspecting "
|
||||
"element of /Contents array in page "
|
||||
"dictionary");
|
||||
QTC::TC("qpdf", "QPDFObjectHandle non-stream in stream array");
|
||||
warn(item.getOwningQPDF(),
|
||||
QPDFExc(qpdf_e_damaged_pdf, description,
|
||||
"item index " + QUtil::int_to_string(i) +
|
||||
" (from 0)", 0,
|
||||
"ignoring non-stream in an array of streams"));
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (contents.isStream())
|
||||
else if (isStream())
|
||||
{
|
||||
result.push_back(contents);
|
||||
result.push_back(*this);
|
||||
}
|
||||
else if (! contents.isNull())
|
||||
else if (! isNull())
|
||||
{
|
||||
throw std::runtime_error("unknown object type inspecting /Contents "
|
||||
"key in page dictionary");
|
||||
warn(getOwningQPDF(),
|
||||
QPDFExc(qpdf_e_damaged_pdf, "", description, 0,
|
||||
" object is supposed to be a stream or an"
|
||||
" array of streams but is neither"));
|
||||
}
|
||||
|
||||
bool first = true;
|
||||
for (std::vector<QPDFObjectHandle>::iterator iter = result.begin();
|
||||
iter != result.end(); ++iter)
|
||||
{
|
||||
QPDFObjectHandle item = *iter;
|
||||
std::string og =
|
||||
QUtil::int_to_string(item.getObjectID()) + " " +
|
||||
QUtil::int_to_string(item.getGeneration());
|
||||
if (first)
|
||||
{
|
||||
first = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
all_description += ",";
|
||||
}
|
||||
all_description += " stream " + og;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
std::vector<QPDFObjectHandle>
|
||||
QPDFObjectHandle::getPageContents()
|
||||
{
|
||||
assertPageObject();
|
||||
std::string description = "page object " +
|
||||
QUtil::int_to_string(this->objid) + " " +
|
||||
QUtil::int_to_string(this->generation);
|
||||
std::string all_description;
|
||||
return this->getKey("/Contents").arrayOrStreamToStreamArray(
|
||||
description, all_description);
|
||||
}
|
||||
|
||||
void
|
||||
QPDFObjectHandle::addPageContents(QPDFObjectHandle new_contents, bool first)
|
||||
{
|
||||
@ -806,61 +840,72 @@ QPDFObjectHandle::parse(std::string const& object_str,
|
||||
}
|
||||
|
||||
void
|
||||
QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array,
|
||||
ParserCallbacks* callbacks)
|
||||
QPDFObjectHandle::pipePageContents(Pipeline* p)
|
||||
{
|
||||
std::vector<QPDFObjectHandle> streams;
|
||||
if (stream_or_array.isArray())
|
||||
{
|
||||
streams = stream_or_array.getArrayAsVector();
|
||||
}
|
||||
else
|
||||
{
|
||||
streams.push_back(stream_or_array);
|
||||
}
|
||||
Pl_Buffer buf("concatenated stream data buffer");
|
||||
std::string all_description = "content stream objects";
|
||||
bool first = true;
|
||||
std::string description = "page object " +
|
||||
QUtil::int_to_string(this->objid) + " " +
|
||||
QUtil::int_to_string(this->generation);
|
||||
std::string all_description;
|
||||
this->getKey("/Contents").pipeContentStreams(
|
||||
p, description, all_description);
|
||||
}
|
||||
|
||||
void
|
||||
QPDFObjectHandle::pipeContentStreams(
|
||||
Pipeline* p, std::string const& description, std::string& all_description)
|
||||
{
|
||||
std::vector<QPDFObjectHandle> streams =
|
||||
arrayOrStreamToStreamArray(
|
||||
description, all_description);
|
||||
for (std::vector<QPDFObjectHandle>::iterator iter = streams.begin();
|
||||
iter != streams.end(); ++iter)
|
||||
{
|
||||
QPDFObjectHandle stream = *iter;
|
||||
if (! stream.isStream())
|
||||
std::string og =
|
||||
QUtil::int_to_string(stream.getObjectID()) + " " +
|
||||
QUtil::int_to_string(stream.getGeneration());
|
||||
std::string description = "content stream object " + og;
|
||||
if (! stream.pipeStreamData(p, 0, qpdf_dl_specialized))
|
||||
{
|
||||
QTC::TC("qpdf", "QPDFObjectHandle non-stream in parsecontent");
|
||||
QTC::TC("qpdf", "QPDFObjectHandle errors in parsecontent");
|
||||
warn(stream.getOwningQPDF(),
|
||||
QPDFExc(qpdf_e_damaged_pdf, "content stream",
|
||||
"", 0,
|
||||
"ignoring non-stream while parsing content streams"));
|
||||
}
|
||||
else
|
||||
{
|
||||
std::string og = QUtil::int_to_string(stream.getObjectID()) + " " +
|
||||
QUtil::int_to_string(stream.getGeneration());
|
||||
std::string description = "content stream object " + og;
|
||||
if (first)
|
||||
{
|
||||
first = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
all_description += ",";
|
||||
}
|
||||
all_description += " " + og;
|
||||
if (! stream.pipeStreamData(&buf, 0, qpdf_dl_specialized))
|
||||
{
|
||||
QTC::TC("qpdf", "QPDFObjectHandle errors in parsecontent");
|
||||
warn(stream.getOwningQPDF(),
|
||||
QPDFExc(qpdf_e_damaged_pdf, "content stream",
|
||||
description, 0,
|
||||
"errors while decoding content stream"));
|
||||
}
|
||||
description, 0,
|
||||
"errors while decoding content stream"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
QPDFObjectHandle::parsePageContents(ParserCallbacks* callbacks)
|
||||
{
|
||||
std::string description = "page object " +
|
||||
QUtil::int_to_string(this->objid) + " " +
|
||||
QUtil::int_to_string(this->generation);
|
||||
this->getKey("/Contents").parseContentStream_internal(
|
||||
description, callbacks);
|
||||
}
|
||||
|
||||
void
|
||||
QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array,
|
||||
ParserCallbacks* callbacks)
|
||||
{
|
||||
stream_or_array.parseContentStream_internal(
|
||||
"content stream objects", callbacks);
|
||||
}
|
||||
|
||||
void
|
||||
QPDFObjectHandle::parseContentStream_internal(
|
||||
std::string const& description,
|
||||
ParserCallbacks* callbacks)
|
||||
{
|
||||
Pl_Buffer buf("concatenated stream data buffer");
|
||||
std::string all_description;
|
||||
pipeContentStreams(&buf, description, all_description);
|
||||
PointerHolder<Buffer> stream_data = buf.getBuffer();
|
||||
try
|
||||
{
|
||||
parseContentStream_internal(stream_data, all_description, callbacks);
|
||||
parseContentStream_data(stream_data, all_description, callbacks);
|
||||
}
|
||||
catch (TerminateParsing&)
|
||||
{
|
||||
@ -870,9 +915,10 @@ QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array,
|
||||
}
|
||||
|
||||
void
|
||||
QPDFObjectHandle::parseContentStream_internal(PointerHolder<Buffer> stream_data,
|
||||
std::string const& description,
|
||||
ParserCallbacks* callbacks)
|
||||
QPDFObjectHandle::parseContentStream_data(
|
||||
PointerHolder<Buffer> stream_data,
|
||||
std::string const& description,
|
||||
ParserCallbacks* callbacks)
|
||||
{
|
||||
size_t length = stream_data->getSize();
|
||||
PointerHolder<InputSource> input =
|
||||
|
@ -277,7 +277,6 @@ QPDFObjectHandle found fake 1
|
||||
QPDFObjectHandle no val for last key 0
|
||||
QPDF resolve failure to null 0
|
||||
QPDFWriter preserve unreferenced standard 0
|
||||
QPDFObjectHandle non-stream in parsecontent 0
|
||||
QPDFObjectHandle errors in parsecontent 0
|
||||
QPDF stream with non-space 0
|
||||
qpdf same file error 0
|
||||
@ -304,3 +303,4 @@ QPDF_Stream TIFF predictor 0
|
||||
QPDFTokenizer EOF when not allowed 0
|
||||
QPDFTokenizer inline image at EOF 0
|
||||
Pl_QPDFTokenizer found ID 0
|
||||
QPDFObjectHandle non-stream in stream array 0
|
||||
|
@ -4,6 +4,6 @@ File is not encrypted
|
||||
File is not linearized
|
||||
WARNING: split-content-stream-errors.pdf (file position 557): error decoding stream data for object 6 0: LZWDecoder: bad code received
|
||||
WARNING: split-content-stream-errors.pdf (file position 557): stream will be re-processed without filtering to avoid data loss
|
||||
WARNING: content stream: ignoring non-stream while parsing content streams
|
||||
WARNING: content stream objects (item index 0 (from 0)): ignoring non-stream in an array of streams
|
||||
WARNING: split-content-stream-errors.pdf (file position 557): error decoding stream data for object 6 0: LZWDecoder: bad code received
|
||||
WARNING: content stream (content stream object 6 0): errors while decoding content stream
|
||||
|
Loading…
Reference in New Issue
Block a user