diff --git a/ChangeLog b/ChangeLog index 39a7cbf7..256d83ea 100644 --- a/ChangeLog +++ b/ChangeLog @@ -78,6 +78,35 @@ production use. Even if it did, it would be very unusual for a PDF file to actually be adversely affected by this issue. + * Add support for coalescing a page's contents into a single + stream if they are represented as an array of streams. This can be + performed from the command line using the --coalesce-contents + option. Coalescing content streams can simplify things for + software that wants to operate on a page's content streams without + having to handle weird edge cases like content streams split in + the middle of tokens. Note that + QPDFObjectHandle::parsePageContents and + QPDFObjectHandle::parseContentStream already handled split content + streams. This is mainly to set the stage for new methods of + operating on page contents. The new method + QPDFObjectHandle::pipeContentStreams will pipe all of a page's + content streams through a single pipeline. The new method + QPDFObjectHandle::coalesceContentStreams, when called on a page + object, will do nothing if the page's contents are a single + stream, but if they are an array of streams, it will replace the + page's contents with a single stream whose contents are the + concatenation of the original streams. + + * A few library routines throw exceptions if called on non-page + objects. These constraints have been relaxed somewhat to make qpdf + more tolerant of files whose page dictionaries are not properly + marked as such. Mostly exceptions about page operations being + called on non-page objects will only be thrown in cases where the + operation had no chance of succeeding anyway. This change has no + impact on any default mode operations, but it could allow + applications that use page-level APIs in QPDFObjectHandle to be + more tolerant of certain types of damaged files. 
+ 2018-02-04 Jay Berkenbilt * Add QPDFWriter::setLinearizationPass1Filename method and diff --git a/include/qpdf/QPDFObjectHandle.hh b/include/qpdf/QPDFObjectHandle.hh index 86fa0202..14dadd6c 100644 --- a/include/qpdf/QPDFObjectHandle.hh +++ b/include/qpdf/QPDFObjectHandle.hh @@ -200,6 +200,20 @@ class QPDFObjectHandle QPDF_DLL void pipePageContents(Pipeline* p); + // When called on a stream or stream array that is some page's + // content streams, do the same as pipePageContents. This method + // is a lower level way to do what pipePageContents does, but it + // allows you to perform this operation on a contents object that + // is disconnected from a page object. The description argument + // should describe the containing page and is used in error + // messages. The all_description argument is initialized to + // something that could be used to describe the result of the + // pipeline. It is the description amended with the identifiers of + // the underlying objects. + QPDF_DLL + void pipeContentStreams(Pipeline* p, std::string const& description, + std::string& all_description); + // Older method: stream_or_array should be the value of /Contents // from a page object. It's more convenient to just call // parsePageContents on the page object, and error messages will @@ -556,30 +570,30 @@ class QPDFObjectHandle // Convenience routines for commonly performed functions - // Throws an exception if this is not a Page object. Returns an - // empty map if there are no images or no resources. This - // function does not presently support inherited resources. If - // this is a significant concern, call + // Returns an empty map if there are no images or no resources. + // This function does not presently support inherited resources. + // If this is a significant concern, call // pushInheritedAttributesToPage() on the QPDF object that owns - // this page. See comment in the source for details. 
Return - // value is a map from XObject name to the image object, which is - // always a stream. + // this page. See comment in the source for details. Return value + // is a map from XObject name to the image object, which is always + // a stream. QPDF_DLL std::map getPageImages(); // Returns a vector of stream objects representing the content // streams for the given page. This routine allows the caller to // not care whether there are one or more than one content streams - // for a page. Throws an exception if this is not a Page object. + // for a page. QPDF_DLL std::vector getPageContents(); - // Add the given object as a new content stream for this page. If - // parameter 'first' is true, add to the beginning. Otherwise, - // add to the end. This routine automatically converts the page + // Add the given object as a new content stream for this page. If + // parameter 'first' is true, add to the beginning. Otherwise, add + // to the end. This routine automatically converts the page // contents to an array if it is a scalar, allowing the caller not - // to care what the initial structure is. Throws an exception if - // this is not a Page object. + // to care what the initial structure is. You can call + // coalesceContentStreams() afterwards if you want to force it to + // be a single stream. QPDF_DLL void addPageContents(QPDFObjectHandle contents, bool first); @@ -590,6 +604,16 @@ class QPDFObjectHandle QPDF_DLL void rotatePage(int angle, bool relative); + // Coalesce a page's content streams. A page's content may be a + // stream or an array of streams. If this page's content is an + // array, concatenate the streams into a single stream. This can + // be useful when working with files that split content streams in + // arbitrary spots, such as in the middle of a token, as that can + // confuse some software. You could also call this after calling + // addPageContents. + QPDF_DLL + void coalesceContentStreams(); + // Initializers for objects. 
This Factory class gives the QPDF // class specific permission to call factory methods without // making it a friend of the whole QPDFObjectHandle class. @@ -724,8 +748,6 @@ class QPDFObjectHandle ParserCallbacks* callbacks); std::vector arrayOrStreamToStreamArray( std::string const& description, std::string& all_description); - void pipeContentStreams(Pipeline* p, std::string const& description, - std::string& all_description); static void warn(QPDF*, QPDFExc const&); bool initialized; diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc index 1e73f9a6..51de87e1 100644 --- a/libqpdf/QPDFObjectHandle.cc +++ b/libqpdf/QPDFObjectHandle.cc @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -28,6 +29,39 @@ class TerminateParsing { }; +class CoalesceProvider: public QPDFObjectHandle::StreamDataProvider +{ + public: + CoalesceProvider(QPDFObjectHandle containing_page, + QPDFObjectHandle old_contents) : + containing_page(containing_page), + old_contents(old_contents) + { + } + virtual ~CoalesceProvider() + { + } + virtual void provideStreamData(int objid, int generation, + Pipeline* pipeline); + + private: + QPDFObjectHandle containing_page; + QPDFObjectHandle old_contents; +}; + +void +CoalesceProvider::provideStreamData(int, int, Pipeline* p) +{ + QTC::TC("qpdf", "QPDFObjectHandle coalesce provide stream data"); + Pl_Concatenate concat("concatenate", p); + std::string description = "page object " + + QUtil::int_to_string(containing_page.getObjectID()) + " " + + QUtil::int_to_string(containing_page.getGeneration()); + std::string all_description; + old_contents.pipeContentStreams(&concat, description, all_description); + concat.manualFinish(); +} + void QPDFObjectHandle::ParserCallbacks::terminateParsing() { @@ -691,7 +725,6 @@ QPDFObjectHandle::arrayOrStreamToStreamArray( std::vector QPDFObjectHandle::getPageContents() { - assertPageObject(); std::string description = "page object " + QUtil::int_to_string(this->objid) + " " + 
QUtil::int_to_string(this->generation); @@ -703,7 +736,6 @@ QPDFObjectHandle::getPageContents() void QPDFObjectHandle::addPageContents(QPDFObjectHandle new_contents, bool first) { - assertPageObject(); new_contents.assertStream(); std::vector orig_contents = getPageContents(); @@ -785,6 +817,33 @@ QPDFObjectHandle::rotatePage(int angle, bool relative) replaceKey("/Rotate", QPDFObjectHandle::newInteger(new_angle)); } +void +QPDFObjectHandle::coalesceContentStreams() +{ + assertPageObject(); + QPDFObjectHandle contents = this->getKey("/Contents"); + if (contents.isStream()) + { + QTC::TC("qpdf", "QPDFObjectHandle coalesce called on stream"); + return; + } + QPDF* qpdf = getOwningQPDF(); + if (qpdf == 0) + { + // Should not be possible for a page object to not have an + // owning PDF unless it was manually constructed in some + // incorrect way. + throw std::logic_error("coalesceContentStreams called on object" + " with no associated PDF file"); + } + QPDFObjectHandle new_contents = newStream(qpdf); + this->replaceKey("/Contents", new_contents); + + PointerHolder provider = + new CoalesceProvider(*this, contents); + new_contents.replaceStreamData(provider, newNull(), newNull()); +} + std::string QPDFObjectHandle::unparse() { @@ -842,6 +901,7 @@ QPDFObjectHandle::parse(std::string const& object_str, void QPDFObjectHandle::pipePageContents(Pipeline* p) { + assertPageObject(); std::string description = "page object " + QUtil::int_to_string(this->objid) + " " + QUtil::int_to_string(this->generation); @@ -879,6 +939,7 @@ QPDFObjectHandle::pipeContentStreams( void QPDFObjectHandle::parsePageContents(ParserCallbacks* callbacks) { + assertPageObject(); std::string description = "page object " + QUtil::int_to_string(this->objid) + " " + QUtil::int_to_string(this->generation); @@ -1728,15 +1789,15 @@ QPDFObjectHandle::assertNumber() bool QPDFObjectHandle::isPageObject() { - return (this->isDictionary() && this->hasKey("/Type") && - (this->getKey("/Type").getName() == "/Page")); 
+ // Some PDF files have /Type broken on pages. + return (this->isDictionary() && this->hasKey("/Contents")); } bool QPDFObjectHandle::isPagesObject() { - return (this->isDictionary() && this->hasKey("/Type") && - (this->getKey("/Type").getName() == "/Pages")); + // Some PDF files have /Type broken on pages. + return (this->isDictionary() && this->hasKey("/Kids")); } void diff --git a/qpdf/qpdf.cc b/qpdf/qpdf.cc index bf5d7078..74285bcd 100644 --- a/qpdf/qpdf.cc +++ b/qpdf/qpdf.cc @@ -90,6 +90,7 @@ struct Options qdf_mode(false), preserve_unreferenced_objects(false), newline_before_endstream(false), + coalesce_contents(false), show_npages(false), deterministic_id(false), static_id(false), @@ -154,6 +155,7 @@ struct Options bool preserve_unreferenced_objects; bool newline_before_endstream; std::string linearize_pass1; + bool coalesce_contents; std::string min_version; std::string force_version; bool show_npages; @@ -391,6 +393,7 @@ familiar with the PDF file format or who are PDF developers.\n\ --object-streams=mode controls handing of object streams\n\ --preserve-unreferenced preserve unreferenced objects\n\ --newline-before-endstream always put a newline before endstream\n\ +--coalesce-contents force all pages' content to be a single stream\n\ --qdf turns on \"QDF mode\" (below)\n\ --linearize-pass1=file write intermediate pass of linearized file\n\ for debugging\n\ @@ -1543,6 +1546,10 @@ static void parse_options(int argc, char* argv[], Options& o) } o.linearize_pass1 = parameter; } + else if (strcmp(arg, "coalesce-contents") == 0) + { + o.coalesce_contents = true; + } else if (strcmp(arg, "min-version") == 0) { if (parameter == 0) @@ -1960,6 +1967,19 @@ static void do_inspection(QPDF& pdf, Options& o) } } +static void handle_transformations(QPDF& pdf, Options& o) +{ + if (o.coalesce_contents) + { + std::vector pages = pdf.getAllPages(); + for (std::vector::iterator iter = pages.begin(); + iter != pages.end(); ++iter) + { + (*iter).coalesceContentStreams(); + } + 
} +} + static void handle_page_specs(QPDF& pdf, Options& o, std::vector >& page_heap) { @@ -2382,6 +2402,7 @@ int main(int argc, char* argv[]) pdf.processFile(o.infilename, o.password); } + handle_transformations(pdf, o); std::vector > page_heap; if (! o.page_specs.empty()) { diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov index 35ca70d3..a1ce662d 100644 --- a/qpdf/qpdf.testcov +++ b/qpdf/qpdf.testcov @@ -304,3 +304,5 @@ QPDFTokenizer EOF when not allowed 0 QPDFTokenizer inline image at EOF 0 Pl_QPDFTokenizer found ID 0 QPDFObjectHandle non-stream in stream array 0 +QPDFObjectHandle coalesce called on stream 0 +QPDFObjectHandle coalesce provide stream data 0 diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test index d2afff4f..9d279267 100644 --- a/qpdf/qtest/qpdf.test +++ b/qpdf/qtest/qpdf.test @@ -734,6 +734,28 @@ $td->runtest("stream with tiff predictor", $td->EXIT_STATUS => 0}, $td->NORMALIZE_NEWLINES); +show_ntests(); +# ---------- +$td->notify("--- Coalesce contents ---"); +$n_tests += 4; + +$td->runtest("coalesce contents with qdf", + {$td->COMMAND => + "qpdf --qdf --static-id" . + " --coalesce-contents coalesce.pdf a.pdf"}, + {$td->STRING => "", $td->EXIT_STATUS => 0}); +$td->runtest("check output", + {$td->FILE => "a.pdf"}, + {$td->FILE => "coalesce-out.qdf"}); +$td->runtest("coalesce contents without qdf", + {$td->COMMAND => + "qpdf --static-id" . 
+ " --coalesce-contents coalesce.pdf a.pdf"}, + {$td->STRING => "", $td->EXIT_STATUS => 0}); +$td->runtest("check output", + {$td->FILE => "a.pdf"}, + {$td->FILE => "coalesce-out.pdf"}); + show_ntests(); # ---------- $td->notify("--- Newline before endstream ---"); diff --git a/qpdf/qtest/qpdf/coalesce-out.pdf b/qpdf/qtest/qpdf/coalesce-out.pdf new file mode 100644 index 00000000..78505aba Binary files /dev/null and b/qpdf/qtest/qpdf/coalesce-out.pdf differ diff --git a/qpdf/qtest/qpdf/coalesce-out.qdf b/qpdf/qtest/qpdf/coalesce-out.qdf new file mode 100644 index 00000000..9a7129f3 --- /dev/null +++ b/qpdf/qtest/qpdf/coalesce-out.qdf @@ -0,0 +1,171 @@ +%PDF-1.3 +% +%QDF-1.0 + +%% Original object ID: 1 0 +1 0 obj +<< + /Pages 2 0 R + /Type /Catalog +>> +endobj + +%% Original object ID: 2 0 +2 0 obj +<< + /Count 2 + /Kids [ + 3 0 R + 4 0 R + ] + /Type /Pages +>> +endobj + +%% Page 1 +%% Original object ID: 3 0 +3 0 obj +<< + /Contents 5 0 R + /MediaBox [ + 0 + 0 + 612 + 792 + ] + /Parent 2 0 R + /Resources << + /Font << + /F1 7 0 R + >> + /ProcSet 8 0 R + >> + /Type /Page +>> +endobj + +%% Page 2 +%% Original object ID: 4 0 +4 0 obj +<< + /Contents 9 0 R + /MediaBox [ + 0 + 0 + 612 + 792 + ] + /Parent 2 0 R + /Resources << + /Font << + /F1 11 0 R + >> + /ProcSet 12 0 R + >> + /Type /Page +>> +endobj + +%% Contents for page 1 +%% Original object ID: 19 0 +5 0 obj +<< + /Length 6 0 R +>> +stream +BT + /F1 24 Tf + 72 720 Td + (Potato) Tj +ET [ /array/split ] BI +/CS /G/W 66/H 47/BPC 8/F/Fl/DP<> +ID xI P|C;U`7Z Ę}D_W->>^&u]"!*&E|Sy d-<B0B@N+<hlK/56L >0>Y!c\Y %Y8?&}j;3lpsHtQTt*hUw%)p"DiRjDYNUAvF& u#cW ߉WO +EI +endstream +endobj + +%QDF: ignore_newline +6 0 obj +371 +endobj + +%% Original object ID: 13 0 +7 0 obj +<< + /BaseFont /Helvetica + /Encoding /WinAnsiEncoding + /Name /F1 + /Subtype /Type1 + /Type /Font +>> +endobj + +%% Original object ID: 14 0 +8 0 obj +[ + /PDF + /Text +] +endobj + +%% Contents for page 2 +%% Original object ID: 15 0 +9 0 obj +<< + /Length 10 0 
R +>> +stream +BT + /F1 24 Tf + 72 720 Td + (Potato) Tj +ET +endstream +endobj + +10 0 obj +44 +endobj + +%% Original object ID: 17 0 +11 0 obj +<< + /BaseFont /Helvetica + /Encoding /WinAnsiEncoding + /Name /F1 + /Subtype /Type1 + /Type /Font +>> +endobj + +%% Original object ID: 18 0 +12 0 obj +[ + /PDF + /Text +] +endobj + +xref +0 13 +0000000000 65535 f +0000000052 00000 n +0000000133 00000 n +0000000252 00000 n +0000000481 00000 n +0000000726 00000 n +0000001174 00000 n +0000001222 00000 n +0000001368 00000 n +0000001454 00000 n +0000001554 00000 n +0000001602 00000 n +0000001749 00000 n +trailer << + /Root 1 0 R + /Size 13 + /ID [<31415926535897932384626433832795>] +>> +startxref +1785 +%%EOF diff --git a/qpdf/qtest/qpdf/coalesce.pdf b/qpdf/qtest/qpdf/coalesce.pdf new file mode 100644 index 00000000..ba5d959b --- /dev/null +++ b/qpdf/qtest/qpdf/coalesce.pdf @@ -0,0 +1,217 @@ +%PDF-1.3 +% +%QDF-1.0 + +1 0 obj +<< + /Pages 2 0 R + /Type /Catalog +>> +endobj + +2 0 obj +<< + /Count 2 + /Kids [ + 3 0 R + 4 0 R + ] + /Type /Pages +>> +endobj + +%% Page 1 +3 0 obj +<< + /Contents [ + 5 0 R + 7 0 R + 9 0 R + 11 0 R + ] + /MediaBox [ + 0 + 0 + 612 + 792 + ] + /Parent 2 0 R + /Resources << + /Font << + /F1 13 0 R + >> + /ProcSet 14 0 R + >> + /Type /Page +>> +endobj + +%% Page 2 +4 0 obj +<< + /Contents 15 0 R + /MediaBox [ + 0 + 0 + 612 + 792 + ] + /Parent 2 0 R + /Resources << + /Font << + /F1 17 0 R + >> + /ProcSet 18 0 R + >> + /Type /Page +>> +endobj + +%% Contents for page 1 +5 0 obj +<< + /Length 6 0 R +>> +stream +BT + /F1 24 Tf + 72 720 Td + (Pot +endstream +endobj + +%QDF: ignore_newline +6 0 obj +33 +endobj + +%% Contents for page 1 +7 0 obj +<< + /Length 8 0 R +>> +stream +ato) Tj +ET [ /array +endstream +endobj + +%QDF: ignore_newline +8 0 obj +19 +endobj + +%% Contents for page 1 +9 0 obj +<< + /Length 10 0 R +>> +stream +/split ] BI +/CS /G/W 66/H 47/BPC 8/F/Fl/DP<> +ID xI P|C;U`7Z Ę}D_W->>^&u]"!*&E|Sy d-<B0B@N+<hlK/56L >0>Y!c\Y %Y8?&}j;3lpsHt 
+endstream +endobj + +%QDF: ignore_newline +10 0 obj +253 +endobj + +%% Contents for page 1 +11 0 obj +<< + /Length 12 0 R +>> +stream +QTt*hUw%)p"DiRjDYNUAvF& u#cW ߉WO +EI +endstream +endobj + +%QDF: ignore_newline +12 0 obj +66 +endobj + +13 0 obj +<< + /BaseFont /Helvetica + /Encoding /WinAnsiEncoding + /Name /F1 + /Subtype /Type1 + /Type /Font +>> +endobj + +14 0 obj +[ + /PDF + /Text +] +endobj + +%% Contents for page 2 +15 0 obj +<< + /Length 16 0 R +>> +stream +BT + /F1 24 Tf + 72 720 Td + (Potato) Tj +ET +endstream +endobj + +16 0 obj +44 +endobj + +17 0 obj +<< + /BaseFont /Helvetica + /Encoding /WinAnsiEncoding + /Name /F1 + /Subtype /Type1 + /Type /Font +>> +endobj + +18 0 obj +[ + /PDF + /Text +] +endobj + +xref +0 19 +0000000000 65535 f +0000000025 00000 n +0000000079 00000 n +0000000171 00000 n +0000000416 00000 n +0000000634 00000 n +0000000744 00000 n +0000000786 00000 n +0000000882 00000 n +0000000924 00000 n +0000001255 00000 n +0000001299 00000 n +0000001444 00000 n +0000001464 00000 n +0000001583 00000 n +0000001642 00000 n +0000001743 00000 n +0000001763 00000 n +0000001882 00000 n +trailer << + /Root 1 0 R + /Size 19 + /ID [<6af379f20e8dcd4e724869daec3ba023>] +>> +startxref +1918 +%%EOF