From fcd611b61eb6cc352b4e072fc791681ad927aee2 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Tue, 30 Jan 2018 21:25:51 -0500 Subject: [PATCH] Refactor parseContentStream --- ChangeLog | 12 ++ include/qpdf/QPDFObjectHandle.hh | 37 +++- libqpdf/QPDFObjectHandle.cc | 176 +++++++++++------- qpdf/qpdf.testcov | 2 +- .../qpdf/split-content-stream-errors.out | 2 +- 5 files changed, 155 insertions(+), 74 deletions(-) diff --git a/ChangeLog b/ChangeLog index e9dea347..b29e6548 100644 --- a/ChangeLog +++ b/ChangeLog @@ -45,6 +45,18 @@ characters may surround the EI operator that marks the end of an inline image. + * New method QPDFObjectHandle::parsePageContents() to improve upon + QPDFObjectHandle::parseContentStream(). The parseContentStream + method used to operate on a single content stream, but was fixed + to properly handle pages with contents split across multiple + streams in an earlier release. The new method parsePageContents() + can be called on the page object rather than the value of the + page dictionary's /Contents key. This removes a few lines of + boiler-plate code from any code that uses parseContentStream, and + it also enables creation of more helpful error messages if + problems are encountered as the error messages can include + information about which page the streams come from. + 2018-02-04 Jay Berkenbilt * Add QPDFWriter::setLinearizationPass1Filename method and diff --git a/include/qpdf/QPDFObjectHandle.hh b/include/qpdf/QPDFObjectHandle.hh index cd4c4767..86fa0202 100644 --- a/include/qpdf/QPDFObjectHandle.hh +++ b/include/qpdf/QPDFObjectHandle.hh @@ -88,7 +88,7 @@ class QPDFObjectHandle virtual void decryptString(std::string& val) = 0; }; - // This class is used by parseContentStream. Callers must + // This class is used by parsePageContents. Callers must // instantiate a subclass of this with handlers defined to accept // QPDFObjectHandles that are parsed from the stream. class ParserCallbacks @@ -103,8 +103,8 @@ class QPDFObjectHandle protected: // Implementors may call this method during parsing to - // terminate parsing early. This method throws an exception - // that is caught by parseContentStream, so its effect is + // terminate parsing early. This method throws an exception + // that is caught by parsePageContents, so its effect is // immediate. QPDF_DLL void terminateParsing(); @@ -187,6 +187,24 @@ class QPDFObjectHandle QPDF* context); // Helpers for parsing content streams + + // Parse a page's contents through ParserCallbacks, described + // above. This method works whether the contents are a single + // stream or an array of streams. Call on a page object. + QPDF_DLL + void parsePageContents(ParserCallbacks* callbacks); + + // Pipe a page's contents through the given pipeline. This method + // works whether the contents are a single stream or an array of + // streams. Call on a page object. + QPDF_DLL + void pipePageContents(Pipeline* p); + + // Older method: stream_or_array should be the value of /Contents + // from a page object. It's more convenient to just call + // parsePageContents on the page object, and error messages will + // also be more useful because the page object information will be + // known. QPDF_DLL static void parseContentStream(QPDFObjectHandle stream_or_array, ParserCallbacks* callbacks); @@ -697,12 +715,17 @@ class QPDFObjectHandle QPDFTokenizer& tokenizer, bool& empty, StringDecrypter* decrypter, QPDF* context, bool content_stream); - static void parseContentStream_internal( - PointerHolder stream_data, + void parseContentStream_internal( std::string const& description, ParserCallbacks* callbacks); - - // Other methods + static void parseContentStream_data( + PointerHolder, + std::string const& description, + ParserCallbacks* callbacks); + std::vector arrayOrStreamToStreamArray( + std::string const& description, std::string& all_description); + void pipeContentStreams(Pipeline* p, std::string const& description, + std::string& all_description); static void warn(QPDF*, QPDFExc const&); bool initialized; diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc index fb15cb1c..1e73f9a6 100644 --- a/libqpdf/QPDFObjectHandle.cc +++ b/libqpdf/QPDFObjectHandle.cc @@ -628,44 +628,78 @@ QPDFObjectHandle::getPageImages() } std::vector -QPDFObjectHandle::getPageContents() +QPDFObjectHandle::arrayOrStreamToStreamArray( + std::string const& description, std::string& all_description) { - assertPageObject(); - + all_description = description; std::vector result; - QPDFObjectHandle contents = this->getKey("/Contents"); - if (contents.isArray()) + if (isArray()) { - int n_items = contents.getArrayNItems(); + int n_items = getArrayNItems(); for (int i = 0; i < n_items; ++i) { - QPDFObjectHandle item = contents.getArrayItem(i); + QPDFObjectHandle item = getArrayItem(i); if (item.isStream()) + { + result.push_back(item); + } + else { - result.push_back(item); - } - else - { - throw std::runtime_error( - "unknown item type while inspecting " - "element of /Contents array in page " - "dictionary"); + QTC::TC("qpdf", "QPDFObjectHandle non-stream in stream array"); + warn(item.getOwningQPDF(), + QPDFExc(qpdf_e_damaged_pdf, description, + "item index " + QUtil::int_to_string(i) + + " (from 0)", 0, + "ignoring non-stream in an array of streams")); } } } - else if (contents.isStream()) + else if (isStream()) { - result.push_back(contents); + result.push_back(*this); } - else if (! contents.isNull()) + else if (! isNull()) { - throw std::runtime_error("unknown object type inspecting /Contents " - "key in page dictionary"); + warn(getOwningQPDF(), + QPDFExc(qpdf_e_damaged_pdf, "", description, 0, + " object is supposed to be a stream or an" + " array of streams but is neither")); + } + + bool first = true; + for (std::vector::iterator iter = result.begin(); + iter != result.end(); ++iter) + { + QPDFObjectHandle item = *iter; + std::string og = + QUtil::int_to_string(item.getObjectID()) + " " + + QUtil::int_to_string(item.getGeneration()); + if (first) + { + first = false; + } + else + { + all_description += ","; + } + all_description += " stream " + og; } return result; } +std::vector +QPDFObjectHandle::getPageContents() +{ + assertPageObject(); + std::string description = "page object " + + QUtil::int_to_string(this->objid) + " " + + QUtil::int_to_string(this->generation); + std::string all_description; + return this->getKey("/Contents").arrayOrStreamToStreamArray( + description, all_description); +} + void QPDFObjectHandle::addPageContents(QPDFObjectHandle new_contents, bool first) { @@ -806,61 +840,72 @@ QPDFObjectHandle::parse(std::string const& object_str, } void -QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array, - ParserCallbacks* callbacks) +QPDFObjectHandle::pipePageContents(Pipeline* p) { - std::vector streams; - if (stream_or_array.isArray()) - { - streams = stream_or_array.getArrayAsVector(); - } - else - { - streams.push_back(stream_or_array); - } - Pl_Buffer buf("concatenated stream data buffer"); - std::string all_description = "content stream objects"; - bool first = true; + std::string description = "page object " + + QUtil::int_to_string(this->objid) + " " + + QUtil::int_to_string(this->generation); + std::string all_description; + this->getKey("/Contents").pipeContentStreams( + p, description, all_description); +} + +void +QPDFObjectHandle::pipeContentStreams( + Pipeline* p, std::string const& description, std::string& all_description) +{ + std::vector streams = + arrayOrStreamToStreamArray( + description, all_description); for (std::vector::iterator iter = streams.begin(); iter != streams.end(); ++iter) { QPDFObjectHandle stream = *iter; - if (! stream.isStream()) + std::string og = + QUtil::int_to_string(stream.getObjectID()) + " " + + QUtil::int_to_string(stream.getGeneration()); + std::string description = "content stream object " + og; + if (! stream.pipeStreamData(p, 0, qpdf_dl_specialized)) { - QTC::TC("qpdf", "QPDFObjectHandle non-stream in parsecontent"); + QTC::TC("qpdf", "QPDFObjectHandle errors in parsecontent"); warn(stream.getOwningQPDF(), QPDFExc(qpdf_e_damaged_pdf, "content stream", - "", 0, - "ignoring non-stream while parsing content streams")); - } - else - { - std::string og = QUtil::int_to_string(stream.getObjectID()) + " " + - QUtil::int_to_string(stream.getGeneration()); - std::string description = "content stream object " + og; - if (first) - { - first = false; - } - else - { - all_description += ","; - } - all_description += " " + og; - if (! stream.pipeStreamData(&buf, 0, qpdf_dl_specialized)) - { - QTC::TC("qpdf", "QPDFObjectHandle errors in parsecontent"); - warn(stream.getOwningQPDF(), - QPDFExc(qpdf_e_damaged_pdf, "content stream", - description, 0, - "errors while decoding content stream")); - } + description, 0, + "errors while decoding content stream")); } } +} + +void +QPDFObjectHandle::parsePageContents(ParserCallbacks* callbacks) +{ + std::string description = "page object " + + QUtil::int_to_string(this->objid) + " " + + QUtil::int_to_string(this->generation); + this->getKey("/Contents").parseContentStream_internal( + description, callbacks); +} + +void +QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array, + ParserCallbacks* callbacks) +{ + stream_or_array.parseContentStream_internal( + "content stream objects", callbacks); +} + +void +QPDFObjectHandle::parseContentStream_internal( + std::string const& description, + ParserCallbacks* callbacks) +{ + Pl_Buffer buf("concatenated stream data buffer"); + std::string all_description; + pipeContentStreams(&buf, description, all_description); PointerHolder stream_data = buf.getBuffer(); try { - parseContentStream_internal(stream_data, all_description, callbacks); + parseContentStream_data(stream_data, all_description, callbacks); } catch (TerminateParsing&) { @@ -870,9 +915,10 @@ QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array, } void -QPDFObjectHandle::parseContentStream_internal(PointerHolder stream_data, - std::string const& description, - ParserCallbacks* callbacks) +QPDFObjectHandle::parseContentStream_data( + PointerHolder stream_data, + std::string const& description, + ParserCallbacks* callbacks) { size_t length = stream_data->getSize(); PointerHolder input = diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov index 57fd4fd4..35ca70d3 100644 --- a/qpdf/qpdf.testcov +++ b/qpdf/qpdf.testcov @@ -277,7 +277,6 @@ QPDFObjectHandle found fake 1 QPDFObjectHandle no val for last key 0 QPDF resolve failure to null 0 QPDFWriter preserve unreferenced standard 0 -QPDFObjectHandle non-stream in parsecontent 0 QPDFObjectHandle errors in parsecontent 0 QPDF stream with non-space 0 qpdf same file error 0 @@ -304,3 +303,4 @@ QPDF_Stream TIFF predictor 0 QPDFTokenizer EOF when not allowed 0 QPDFTokenizer inline image at EOF 0 Pl_QPDFTokenizer found ID 0 +QPDFObjectHandle non-stream in stream array 0 diff --git a/qpdf/qtest/qpdf/split-content-stream-errors.out b/qpdf/qtest/qpdf/split-content-stream-errors.out index 81e6b8cb..fbfe020d 100644 --- a/qpdf/qtest/qpdf/split-content-stream-errors.out +++ b/qpdf/qtest/qpdf/split-content-stream-errors.out @@ -4,6 +4,6 @@ File is not encrypted File is not linearized WARNING: split-content-stream-errors.pdf (file position 557): error decoding stream data for object 6 0: LZWDecoder: bad code received WARNING: split-content-stream-errors.pdf (file position 557): stream will be re-processed without filtering to avoid data loss -WARNING: content stream: ignoring non-stream while parsing content streams +WARNING: content stream objects (item index 0 (from 0)): ignoring non-stream in an array of streams WARNING: split-content-stream-errors.pdf (file position 557): error decoding stream data for object 6 0: LZWDecoder: bad code received WARNING: content stream (content stream object 6 0): errors while decoding content stream