From 52f9d326a56e6409a1487c724241f91de33e3ba6 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Mon, 28 Jan 2019 20:13:10 -0500 Subject: [PATCH] Resolve duplicated page objects (fixes #268) When linearizing a file or getting the list of all pages in a file, detect if the pages tree contains a duplicated page object and, if so, shallow copy it. This makes it possible to have a one to one mapping of page positions to page objects. --- ChangeLog | 8 ++++++ include/qpdf/QPDF.hh | 23 +++++++++------- libqpdf/QPDF_optimization.cc | 3 +++ libqpdf/QPDF_pages.cc | 26 ++++++++++++++++--- manual/qpdf-manual.xml | 7 +++++ qpdf/qpdf.testcov | 1 + qpdf/qtest/qpdf.test | 26 ++++++++++++++++++- qpdf/qtest/qpdf/extract-duplicate-page.pdf | Bin 0 -> 1370 bytes qpdf/qtest/qpdf/linearize-duplicate-page.pdf | Bin 0 -> 1899 bytes qpdf/qtest/qpdf/page_api_2.out | 2 +- qpdf/test_driver.cc | 19 +++++++++++--- 11 files changed, 98 insertions(+), 17 deletions(-) create mode 100644 qpdf/qtest/qpdf/extract-duplicate-page.pdf create mode 100644 qpdf/qtest/qpdf/linearize-duplicate-page.pdf diff --git a/ChangeLog b/ChangeLog index 9536fd79..f24e1b75 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +2019-01-28 Jay Berkenbilt + + * When linearizing or getting the list of all pages in a file, + replace duplicated page objects with a shallow copy of the page + object. Linearization and all page manipulation APIs require page + objects to be unique. Pages that were originally duplicated will + still share contents and any other indirect resources. Fixes #268. + 2019-01-26 Jay Berkenbilt * Add --overlay and --underlay options. Fixes #207. diff --git a/include/qpdf/QPDF.hh b/include/qpdf/QPDF.hh index ef0f0eb7..cfb10aac 100644 --- a/include/qpdf/QPDF.hh +++ b/include/qpdf/QPDF.hh @@ -527,15 +527,16 @@ class QPDF void optimize(std::map const& object_stream_data, bool allow_changes = true); - // Traverse page tree return all /Page objects. For efficiency, - // this method returns a const reference to an internal vector of - // pages. Calls to addPage, addPageAt, and removePage safely - // update this, but directly manipulation of the pages three or - // pushing inheritable objects to the page level may invalidate - // it. See comments for updateAllPagesCache() for additional - // notes. Newer code should use - // QPDFPageDocumentHelper::getAllPages instead. The decision to - // expose this internal cache was arguably incorrect, but it is + // Traverse page tree return all /Page objects. It also detects + // and resolves cases in which the same /Page object is + // duplicated. For efficiency, this method returns a const + // reference to an internal vector of pages. Calls to addPage, + // addPageAt, and removePage safely update this, but directly + // manipulation of the pages three or pushing inheritable objects + // to the page level may invalidate it. See comments for + // updateAllPagesCache() for additional notes. Newer code should + // use QPDFPageDocumentHelper::getAllPages instead. The decision + // to expose this internal cache was arguably incorrect, but it is // being left here for compatibility. It is, however, completely // safe to use this for files that you are not modifying. QPDF_DLL @@ -895,6 +896,10 @@ class QPDF void getAllPagesInternal2(QPDFObjectHandle cur_pages, std::vector& result, std::set& visited); + void getAllPagesInternal3(QPDFObjectHandle cur_pages, + std::vector& result, + std::set& visited, + std::set& seen); void insertPage(QPDFObjectHandle newpage, int pos); int findPage(QPDFObjGen const& og); int findPage(QPDFObjectHandle& page); diff --git a/libqpdf/QPDF_optimization.cc b/libqpdf/QPDF_optimization.cc index f282e5f3..ecece7db 100644 --- a/libqpdf/QPDF_optimization.cc +++ b/libqpdf/QPDF_optimization.cc @@ -156,6 +156,9 @@ QPDF::pushInheritedAttributesToPage(bool allow_changes, bool warn_skipped_keys) return; } + // Calling getAllPages() resolves any duplicated page objects. + getAllPages(); + // key_ancestors is a mapping of page attribute keys to a stack of // Pages nodes that contain values for them. std::map > key_ancestors; diff --git a/libqpdf/QPDF_pages.cc b/libqpdf/QPDF_pages.cc index ea5afdb5..7b6b369e 100644 --- a/libqpdf/QPDF_pages.cc +++ b/libqpdf/QPDF_pages.cc @@ -62,8 +62,18 @@ QPDF::getAllPagesInternal(QPDFObjectHandle cur_pages, void QPDF::getAllPagesInternal2(QPDFObjectHandle cur_pages, - std::vector& result, - std::set& visited) + std::vector& result, + std::set& visited) +{ + std::set seen; + getAllPagesInternal3(cur_pages, result, visited, seen); +} + +void +QPDF::getAllPagesInternal3(QPDFObjectHandle cur_pages, + std::vector& result, + std::set& visited, + std::set& seen) { QPDFObjGen this_og = cur_pages.getObjGen(); if (visited.count(this_og) > 0) @@ -94,11 +104,21 @@ QPDF::getAllPagesInternal2(QPDFObjectHandle cur_pages, int n = kids.getArrayNItems(); for (int i = 0; i < n; ++i) { - getAllPagesInternal2(kids.getArrayItem(i), result, visited); + QPDFObjectHandle kid = kids.getArrayItem(i); + if (seen.count(kid.getObjGen())) + { + // Make a copy of the page. This does the same as + // shallowCopyPage in QPDFPageObjectHelper. + QTC::TC("qpdf", "QPDF resolve duplicated page object"); + kid = makeIndirectObject(QPDFObjectHandle(kid).shallowCopy()); + kids.setArrayItem(i, kid); + } + getAllPagesInternal3(kid, result, visited, seen); } } else if (type == "/Page") { + seen.insert(this_og); result.push_back(cur_pages); } else diff --git a/manual/qpdf-manual.xml b/manual/qpdf-manual.xml index b673be25..aaeaeb54 100644 --- a/manual/qpdf-manual.xml +++ b/manual/qpdf-manual.xml @@ -4410,6 +4410,13 @@ print "\n"; suite and properly handled. + + + Linearization and page manipulation APIs now detect and + recover from files that have duplicate Page objects in the + pages tree. + + diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov index 821880cf..98857597 100644 --- a/qpdf/qpdf.testcov +++ b/qpdf/qpdf.testcov @@ -428,3 +428,4 @@ QPDFFormFieldObjectHelper fallback Tf 0 QPDFPageObjectHelper non-trivial inheritance 0 QPDFPageObjectHelper copy shared attribute 0 qpdf from_nr from repeat_nr 0 +QPDF resolve duplicated page object 0 diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test index 946ffff6..9b72706f 100644 --- a/qpdf/qtest/qpdf.test +++ b/qpdf/qtest/qpdf.test @@ -582,7 +582,7 @@ $td->runtest("check output", {$td->FILE => "page_api_1-out2.pdf"}); $td->runtest("duplicate page", {$td->COMMAND => "test_driver 17 page_api_2.pdf"}, - {$td->FILE => "page_api_2.out", $td->EXIT_STATUS => 2}, + {$td->FILE => "page_api_2.out", $td->EXIT_STATUS => 0}, $td->NORMALIZE_NEWLINES); $td->runtest("delete and re-add a page", {$td->COMMAND => "test_driver 18 page_api_1.pdf"}, @@ -1727,6 +1727,30 @@ foreach my $f (qw(screen print)) {$td->FILE => "manual-appearances-$f-out.pdf"}); } +show_ntests(); +# ---------- +$td->notify("--- Duplicated Page Object ---"); +$n_tests += 4; + +$td->runtest("linearize duplicated pages", + {$td->COMMAND => + "qpdf --static-id --linearize" . + " page_api_2.pdf a.pdf"}, + {$td->STRING => "", $td->EXIT_STATUS => 0}, + $td->NORMALIZE_NEWLINES); +$td->runtest("compare files", + {$td->FILE => "a.pdf"}, + {$td->FILE => "linearize-duplicate-page.pdf"}); +$td->runtest("extract duplicated pages", + {$td->COMMAND => + "qpdf --static-id page_api_2.pdf" . + " --pages . -- a.pdf"}, + {$td->STRING => "", $td->EXIT_STATUS => 0}, + $td->NORMALIZE_NEWLINES); +$td->runtest("compare files", + {$td->FILE => "a.pdf"}, + {$td->FILE => "extract-duplicate-page.pdf"}); + show_ntests(); # ---------- $td->notify("--- Merging and Splitting ---"); diff --git a/qpdf/qtest/qpdf/extract-duplicate-page.pdf b/qpdf/qtest/qpdf/extract-duplicate-page.pdf new file mode 100644 index 0000000000000000000000000000000000000000..191943aa3b936be702846353580384efc10251c9 GIT binary patch literal 1370 zcmd5+&ubGw6c$TBmI@w3FCHFBpher=nc1Hb0&UYk(PB&@JroaIqG+x>VVz}@4TkJJ$Abw|j7Nu&!K_Hw~>reuR_0UM3y)FadjDIwJN ze6ooZb2XUsMGhM~Ct?cN`Y6w|$Tt@XntmmMJvEUU5x6h1K>^3$TP%UShxY!qM0N#s zj?1LZ%SbJxE+Ddd+(R+Q_^z5_MGvo#P>2!=SgXZzOe{IKMWH5nA4M>7G!=faF9c#+ zs9v{}`W>OMl#6r_jhJezWIkOO<;_~9_`J>tA z^UH2!?QnMbtn%aD_N@n%^`~g)>+RRpHb1#+y~_(>4d8aOu_(6MYRo0L`w}~6atQ8G zCU<0!%>$iDPurO>HXx-NY>l}~jlgA#zs>H+oZZ9qtrd3DT%?W%#B;ox1z`}d08+S# z5%jBBAm_0VXKN`&{M9^4&{Io!EJRV4Vgbb=T#AJeo-0eSC|T_hCkt3~&a)Ak!p!J_ zOcj{336HL}QX0&`hVPy1lR(ax(17Dw+9Poi(;y^9QUhcz^%^ literal 0 HcmV?d00001 diff --git a/qpdf/qtest/qpdf/linearize-duplicate-page.pdf b/qpdf/qtest/qpdf/linearize-duplicate-page.pdf new file mode 100644 index 0000000000000000000000000000000000000000..27a8898eb9424e1774f871d84cedb423d24b6ebb GIT binary patch literal 1899 zcmd5-O=}ZD7><^xEEN<*FCJc$K#R7!v)?Nr*d`6FS{su{g;Ll_G8j1%TKupU5aS_%)(`6tEfprj90M8bHNCw`y2MUNyAi7SV zR1%oi;j!QtmM{pIgWc$YG+IQ1cn2ONXBttmXV}{B_jD_J&s2@fJqs~7dK(4Q?Xdtj zl1rZB1LlW1h;`qO0N?U`1HlC%gIEjLQ-n=ZvQUXj6<8}eEn>C|Q+FJrrD&!hX&q`( zO_6DfnvU+2iV9JQX2}>sGcChXWW`iz)MLc zUZY(Wf;ud&`FxXsSf&x}`Q6kw&D4Xc+l#PAvD%{%tFX4;!6yw+HRKEPy3r%h)M46y z(G6knwr$&X$F}8`-LmaBpLajyCP2UN@YTC-C(5sHpW3!><(w(UUZnQ>gWcU+?%>Jk z^PERv6bBeF%1V_`_PvOCQHT^A*1gO+E}i!W$WwlzBff?ubFS8CfH#x=>MZp8K^q(9 zBdTJU;f4UgG2(biB|aFx-Cz;M5>Y5fZZ?Q86#r?64~HxTPFzF+X959=;!i-u;~3!* z3irP}Q<{?c;LK7(GEQEd*xJ6BKl^iM|8W1q-j%JFdxy>Z#PZhmjr`Yhv$;$8$?F+Z z;u)jFQSj!xl zB*p)QJjym(8S0kn>{uzmYe*pb+l;t2X2j*mnG7Sil9SYBzmE(@D&BEBA@Z4PsE)?Y zKRI@Fu8=XO!xX8xa~*|2!bYy|RZ#FSRkw7&on)bz># literal 0 HcmV?d00001 diff --git a/qpdf/qtest/qpdf/page_api_2.out b/qpdf/qtest/qpdf/page_api_2.out index 9fc019c2..30e9b9c3 100644 --- a/qpdf/qtest/qpdf/page_api_2.out +++ b/qpdf/qtest/qpdf/page_api_2.out @@ -1 +1 @@ -page_api_2.pdf (page 1 (numbered from zero): object 4 0): duplicate page reference found; this would cause loss of data +test 17 done diff --git a/qpdf/test_driver.cc b/qpdf/test_driver.cc index a03c4b98..743a0082 100644 --- a/qpdf/test_driver.cc +++ b/qpdf/test_driver.cc @@ -923,11 +923,24 @@ void runtest(int n, char const* filename1, char const* arg2) } else if (n == 17) { - // The input file to this test case is broken to exercise an - // error condition. + // The input file to this test case has a duplicated page. + QPDFObjectHandle page_kids = + pdf.getRoot().getKey("/Pages").getKey("/Kids"); + assert(page_kids.getArrayItem(0).getObjGen() == + page_kids.getArrayItem(1).getObjGen()); std::vector const& pages = pdf.getAllPages(); + assert(pages.size() == 3); + assert(! (pages.at(0).getObjGen() == pages.at(1).getObjGen())); + assert(QPDFObjectHandle(pages.at(0)).getKey("/Contents").getObjGen() == + QPDFObjectHandle(pages.at(1)).getKey("/Contents").getObjGen()); pdf.removePage(pages.at(0)); - std::cout << "you can't see this" << std::endl; + assert(pages.size() == 2); + PointerHolder b = QPDFObjectHandle(pages.at(0)). + getKey("/Contents").getStreamData(); + std::string contents = std::string( + reinterpret_cast(b->getBuffer()), + b->getSize()); + assert(contents.find("page 0") != std::string::npos); } else if (n == 18) {