diff --git a/ChangeLog b/ChangeLog index 9536fd79..f24e1b75 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +2019-01-28 Jay Berkenbilt + + * When linearizing or getting the list of all pages in a file, + replace duplicated page objects with a shallow copy of the page + object. Linearization and all page manipulation APIs require page + objects to be unique. Pages that were originally duplicated will + still share contents and any other indirect resources. Fixes #268. + 2019-01-26 Jay Berkenbilt * Add --overlay and --underlay options. Fixes #207. diff --git a/include/qpdf/QPDF.hh b/include/qpdf/QPDF.hh index ef0f0eb7..cfb10aac 100644 --- a/include/qpdf/QPDF.hh +++ b/include/qpdf/QPDF.hh @@ -527,15 +527,16 @@ class QPDF void optimize(std::map const& object_stream_data, bool allow_changes = true); - // Traverse page tree return all /Page objects. For efficiency, - // this method returns a const reference to an internal vector of - // pages. Calls to addPage, addPageAt, and removePage safely - // update this, but directly manipulation of the pages three or - // pushing inheritable objects to the page level may invalidate - // it. See comments for updateAllPagesCache() for additional - // notes. Newer code should use - // QPDFPageDocumentHelper::getAllPages instead. The decision to - // expose this internal cache was arguably incorrect, but it is + // Traverse page tree return all /Page objects. It also detects + // and resolves cases in which the same /Page object is + // duplicated. For efficiency, this method returns a const + // reference to an internal vector of pages. Calls to addPage, + // addPageAt, and removePage safely update this, but directly + // manipulation of the pages three or pushing inheritable objects + // to the page level may invalidate it. See comments for + // updateAllPagesCache() for additional notes. Newer code should + // use QPDFPageDocumentHelper::getAllPages instead. The decision + // to expose this internal cache was arguably incorrect, but it is // being left here for compatibility. It is, however, completely // safe to use this for files that you are not modifying. QPDF_DLL @@ -895,6 +896,10 @@ class QPDF void getAllPagesInternal2(QPDFObjectHandle cur_pages, std::vector& result, std::set& visited); + void getAllPagesInternal3(QPDFObjectHandle cur_pages, + std::vector& result, + std::set& visited, + std::set& seen); void insertPage(QPDFObjectHandle newpage, int pos); int findPage(QPDFObjGen const& og); int findPage(QPDFObjectHandle& page); diff --git a/libqpdf/QPDF_optimization.cc b/libqpdf/QPDF_optimization.cc index f282e5f3..ecece7db 100644 --- a/libqpdf/QPDF_optimization.cc +++ b/libqpdf/QPDF_optimization.cc @@ -156,6 +156,9 @@ QPDF::pushInheritedAttributesToPage(bool allow_changes, bool warn_skipped_keys) return; } + // Calling getAllPages() resolves any duplicated page objects. + getAllPages(); + // key_ancestors is a mapping of page attribute keys to a stack of // Pages nodes that contain values for them. std::map > key_ancestors; diff --git a/libqpdf/QPDF_pages.cc b/libqpdf/QPDF_pages.cc index ea5afdb5..7b6b369e 100644 --- a/libqpdf/QPDF_pages.cc +++ b/libqpdf/QPDF_pages.cc @@ -62,8 +62,18 @@ QPDF::getAllPagesInternal(QPDFObjectHandle cur_pages, void QPDF::getAllPagesInternal2(QPDFObjectHandle cur_pages, - std::vector& result, - std::set& visited) + std::vector& result, + std::set& visited) +{ + std::set seen; + getAllPagesInternal3(cur_pages, result, visited, seen); +} + +void +QPDF::getAllPagesInternal3(QPDFObjectHandle cur_pages, + std::vector& result, + std::set& visited, + std::set& seen) { QPDFObjGen this_og = cur_pages.getObjGen(); if (visited.count(this_og) > 0) @@ -94,11 +104,21 @@ QPDF::getAllPagesInternal2(QPDFObjectHandle cur_pages, int n = kids.getArrayNItems(); for (int i = 0; i < n; ++i) { - getAllPagesInternal2(kids.getArrayItem(i), result, visited); + QPDFObjectHandle kid = kids.getArrayItem(i); + if (seen.count(kid.getObjGen())) + { + // Make a copy of the page. This does the same as + // shallowCopyPage in QPDFPageObjectHelper. + QTC::TC("qpdf", "QPDF resolve duplicated page object"); + kid = makeIndirectObject(QPDFObjectHandle(kid).shallowCopy()); + kids.setArrayItem(i, kid); + } + getAllPagesInternal3(kid, result, visited, seen); } } else if (type == "/Page") { + seen.insert(this_og); result.push_back(cur_pages); } else diff --git a/manual/qpdf-manual.xml b/manual/qpdf-manual.xml index b673be25..aaeaeb54 100644 --- a/manual/qpdf-manual.xml +++ b/manual/qpdf-manual.xml @@ -4410,6 +4410,13 @@ print "\n"; suite and properly handled. + + + Linearization and page manipulation APIs now detect and + recover from files that have duplicate Page objects in the + pages tree. + + diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov index 821880cf..98857597 100644 --- a/qpdf/qpdf.testcov +++ b/qpdf/qpdf.testcov @@ -428,3 +428,4 @@ QPDFFormFieldObjectHelper fallback Tf 0 QPDFPageObjectHelper non-trivial inheritance 0 QPDFPageObjectHelper copy shared attribute 0 qpdf from_nr from repeat_nr 0 +QPDF resolve duplicated page object 0 diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test index 946ffff6..9b72706f 100644 --- a/qpdf/qtest/qpdf.test +++ b/qpdf/qtest/qpdf.test @@ -582,7 +582,7 @@ $td->runtest("check output", {$td->FILE => "page_api_1-out2.pdf"}); $td->runtest("duplicate page", {$td->COMMAND => "test_driver 17 page_api_2.pdf"}, - {$td->FILE => "page_api_2.out", $td->EXIT_STATUS => 2}, + {$td->FILE => "page_api_2.out", $td->EXIT_STATUS => 0}, $td->NORMALIZE_NEWLINES); $td->runtest("delete and re-add a page", {$td->COMMAND => "test_driver 18 page_api_1.pdf"}, @@ -1727,6 +1727,30 @@ foreach my $f (qw(screen print)) {$td->FILE => "manual-appearances-$f-out.pdf"}); } +show_ntests(); +# ---------- +$td->notify("--- Duplicated Page Object ---"); +$n_tests += 4; + +$td->runtest("linearize duplicated pages", + {$td->COMMAND => + "qpdf --static-id --linearize" . + " page_api_2.pdf a.pdf"}, + {$td->STRING => "", $td->EXIT_STATUS => 0}, + $td->NORMALIZE_NEWLINES); +$td->runtest("compare files", + {$td->FILE => "a.pdf"}, + {$td->FILE => "linearize-duplicate-page.pdf"}); +$td->runtest("extract duplicated pages", + {$td->COMMAND => + "qpdf --static-id page_api_2.pdf" . + " --pages . -- a.pdf"}, + {$td->STRING => "", $td->EXIT_STATUS => 0}, + $td->NORMALIZE_NEWLINES); +$td->runtest("compare files", + {$td->FILE => "a.pdf"}, + {$td->FILE => "extract-duplicate-page.pdf"}); + show_ntests(); # ---------- $td->notify("--- Merging and Splitting ---"); diff --git a/qpdf/qtest/qpdf/extract-duplicate-page.pdf b/qpdf/qtest/qpdf/extract-duplicate-page.pdf new file mode 100644 index 00000000..191943aa Binary files /dev/null and b/qpdf/qtest/qpdf/extract-duplicate-page.pdf differ diff --git a/qpdf/qtest/qpdf/linearize-duplicate-page.pdf b/qpdf/qtest/qpdf/linearize-duplicate-page.pdf new file mode 100644 index 00000000..27a8898e Binary files /dev/null and b/qpdf/qtest/qpdf/linearize-duplicate-page.pdf differ diff --git a/qpdf/qtest/qpdf/page_api_2.out b/qpdf/qtest/qpdf/page_api_2.out index 9fc019c2..30e9b9c3 100644 --- a/qpdf/qtest/qpdf/page_api_2.out +++ b/qpdf/qtest/qpdf/page_api_2.out @@ -1 +1 @@ -page_api_2.pdf (page 1 (numbered from zero): object 4 0): duplicate page reference found; this would cause loss of data +test 17 done diff --git a/qpdf/test_driver.cc b/qpdf/test_driver.cc index a03c4b98..743a0082 100644 --- a/qpdf/test_driver.cc +++ b/qpdf/test_driver.cc @@ -923,11 +923,24 @@ void runtest(int n, char const* filename1, char const* arg2) } else if (n == 17) { - // The input file to this test case is broken to exercise an - // error condition. + // The input file to this test case has a duplicated page. + QPDFObjectHandle page_kids = + pdf.getRoot().getKey("/Pages").getKey("/Kids"); + assert(page_kids.getArrayItem(0).getObjGen() == + page_kids.getArrayItem(1).getObjGen()); std::vector const& pages = pdf.getAllPages(); + assert(pages.size() == 3); + assert(! (pages.at(0).getObjGen() == pages.at(1).getObjGen())); + assert(QPDFObjectHandle(pages.at(0)).getKey("/Contents").getObjGen() == + QPDFObjectHandle(pages.at(1)).getKey("/Contents").getObjGen()); pdf.removePage(pages.at(0)); - std::cout << "you can't see this" << std::endl; + assert(pages.size() == 2); + PointerHolder b = QPDFObjectHandle(pages.at(0)). + getKey("/Contents").getStreamData(); + std::string contents = std::string( + reinterpret_cast(b->getBuffer()), + b->getSize()); + assert(contents.find("page 0") != std::string::npos); } else if (n == 18) {