From eb802cfa8c7109504ad10bf4c89c47c876d9a382 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Thu, 21 Jun 2012 10:42:18 -0400 Subject: [PATCH] Implement page manipulation APIs --- include/qpdf/Constants.h | 3 +- include/qpdf/QPDF.hh | 34 +++-- libqpdf/QPDF_pages.cc | 271 +++++++++++++++++++++++---------------- qpdf/qpdf.testcov | 4 + 4 files changed, 192 insertions(+), 120 deletions(-) diff --git a/include/qpdf/Constants.h b/include/qpdf/Constants.h index 97708362..e50950f8 100644 --- a/include/qpdf/Constants.h +++ b/include/qpdf/Constants.h @@ -22,7 +22,8 @@ enum qpdf_error_code_e qpdf_e_system, /* I/O error, memory error, etc. */ qpdf_e_unsupported, /* PDF feature not (yet) supported by qpdf */ qpdf_e_password, /* incorrect password for encrypted file */ - qpdf_e_damaged_pdf /* syntax errors or other damage in PDF */ + qpdf_e_damaged_pdf, /* syntax errors or other damage in PDF */ + qpdf_e_pages, /* erroneous or unsupported pages structure */ }; /* Write Parameters */ diff --git a/include/qpdf/QPDF.hh b/include/qpdf/QPDF.hh index 04a254bc..3d53e466 100644 --- a/include/qpdf/QPDF.hh +++ b/include/qpdf/QPDF.hh @@ -340,14 +340,26 @@ class QPDF // Convenience routines for common functions. See also // QPDFObjectHandle.hh for additional convenience routines. - // Traverse page tree return all /Page objects. + // Page handling API + + // Traverse page tree return all /Page objects. Note that calls + // to page manipulation APIs will change the internal vector that + // this routine returns a pointer to. If you don't want that, + // assign this to a regular vector rather than a const reference. QPDF_DLL std::vector const& getAllPages(); - // QPDF internally caches the /Pages tree. This method will clear - // the cache when e.g. direct modifications have been made. + // This method synchronizes QPDF's cache of the page structure + // with the actual /Pages tree. 
If you restrict changes to the + // /Pages tree, including addition, removal, or replacement of + // pages or changes to any /Pages objects, to calls to these page + // handling APIs, you never need to call this method. If you + // modify /Pages structures directly, you must call this method + // afterwards. This method updates the internal list of pages, so + // after calling this method, any previous references returned by + // getAllPages() will be valid again. QPDF_DLL - void clearPagesCache(); + void updateAllPagesCache(); // Add new page at the beginning or the end of the current pdf QPDF_DLL @@ -356,11 +368,11 @@ class QPDF // Add new page before or after refpage QPDF_DLL void addPageAt(QPDFObjectHandle newpage, bool before, - QPDFObjectHandle const& refpage); + QPDFObjectHandle refpage); - // Remove pageoh from the pdf. + // Remove page from the pdf. QPDF_DLL - void removePage(QPDFObjectHandle const& pageoh); + void removePage(QPDFObjectHandle page); // Resolver class is restricted to QPDFObjectHandle so that only // it can resolve indirect references. 
@@ -541,12 +553,12 @@ class QPDF void getAllPagesInternal(QPDFObjectHandle cur_pages, std::vector& result); - // creates pageobj_to_pages_pos if necessary - // returns position, or -1 if not found + void insertPage(QPDFObjectHandle newpage, int pos); int findPage(int objid, int generation); - int findPage(QPDFObjectHandle const& pageoh); // convenience - + int findPage(QPDFObjectHandle& page); void flattenPagesTree(); + void insertPageobjToPage(QPDFObjectHandle const& obj, int pos, + bool check_duplicate); // methods to support encryption -- implemented in QPDF_encryption.cc encryption_method_e interpretCF(QPDFObjectHandle); diff --git a/libqpdf/QPDF_pages.cc b/libqpdf/QPDF_pages.cc index bd631c96..930e8bd1 100644 --- a/libqpdf/QPDF_pages.cc +++ b/libqpdf/QPDF_pages.cc @@ -6,6 +6,40 @@ #include #include +// In support of page manipulation APIs, these methods internally +// maintain state about pages in a pair of data structures: all_pages, +// which is a vector of page objects, and pageobj_to_pages_pos, which +// maps a page object to its position in the all_pages array. +// Unfortunately, the getAllPages() method returns a const reference +// to all_pages and has been in the public API long before the +// introduction of mutation APIs, so we're pretty much stuck with it. +// Anyway, there are lots of calls to it in the library, so the +// efficiency of having it cached is probably worth keeping it. + +// The goal of this code is to ensure that the all_pages vector, which +// users may have a reference to, and the pageobj_to_pages_pos map, +// which users will not have access to, remain consistent outside of +// any call to the library. As long as users only touch the /Pages +// structure through page-specific API calls, they never have to worry +// about anything, and this will also stay consistent. 
If a user +// touches anything about the /Pages structure outside of these calls +// (such as by directly looking up and manipulating the underlying +// objects), they can call updateAllPagesCache() to bring things back in +// sync. + +// If the user doesn't ever use the page manipulation APIs, then qpdf +// leaves the /Pages structure alone. If the user does use the APIs, +// then we push all inheritable objects down and flatten the /Pages +// tree. This makes it easier for us to keep /Pages, all_pages, and +// pageobj_to_pages_pos internally consistent at all times. + +// Responsibility for keeping all_pages, pageobj_to_pages_pos, and the +// Pages structure consistent should remain in as few places as +// possible. As of initial writing, only flattenPagesTree, +// insertPage, and removePage, along with methods they call, are +// concerned with it. Everything else goes through one of those +// methods. + std::vector const& QPDF::getAllPages() { @@ -44,152 +78,173 @@ QPDF::getAllPagesInternal(QPDFObjectHandle cur_pages, } } -// FIXXX here down - void -QPDF::clearPagesCache() +QPDF::updateAllPagesCache() { + // Force regeneration of the pages cache. We force immediate + // recalculation of all_pages since users may have references to + // it that they got from calls to getAllPages(). We can defer + // recalculation of pageobj_to_pages_pos until needed. + QTC::TC("qpdf", "QPDF updateAllPagesCache"); this->all_pages.clear(); this->pageobj_to_pages_pos.clear(); + getAllPages(); } void QPDF::flattenPagesTree() { - clearPagesCache(); + // If not already done, flatten the /Pages structure and + // initialize pageobj_to_pages_pos. - // FIXME: more specific method, we don't want to generate the extra stuff. - // We also need cheap fixup after addPage/removePage. - - // no compressed objects to be produced here...
- std::map object_stream_data; - optimize(object_stream_data); // push down inheritance - - std::vector kids = this->getAllPages(); - QPDFObjectHandle pages = this->trailer.getKey("/Root").getKey("/Pages"); - - const int len = kids.size(); - for (int pos = 0; pos < len; ++pos) + if (! this->pageobj_to_pages_pos.empty()) { - // populate pageobj_to_pages_pos - ObjGen og(kids[pos].getObjectID(), kids[pos].getGeneration()); - if (! this->pageobj_to_pages_pos.insert(std::make_pair(og, pos)).second) - { - // insert failed: duplicate entry found - *out_stream << "WARNING: duplicate page reference found, " - << "but currently not fully supported." << std::endl; - } - - // fix parent links - kids[pos].replaceKey("/Parent", pages); + return; } - pages.replaceKey("/Kids", QPDFObjectHandle::newArray(kids)); + // Push inherited objects down to the /Page level + optimizePagesTree(true); + getAllPages(); + + QPDFObjectHandle pages = this->trailer.getKey("/Root").getKey("/Pages"); + + int const len = (int)this->all_pages.size(); + for (int pos = 0; pos < len; ++pos) + { + // populate pageobj_to_pages_pos and fix parent pointer + insertPageobjToPage(this->all_pages[pos], pos, true); + this->all_pages[pos].replaceKey("/Parent", pages); + } + + pages.replaceKey("/Kids", QPDFObjectHandle::newArray(this->all_pages)); // /Count has not changed assert(pages.getKey("/Count").getIntValue() == len); } -int -QPDF::findPage(int objid, int generation) +void +QPDF::insertPageobjToPage(QPDFObjectHandle const& obj, int pos, + bool check_duplicate) { - if (this->pageobj_to_pages_pos.empty()) + ObjGen og(obj.getObjectID(), obj.getGeneration()); + bool duplicate = + (! 
this->pageobj_to_pages_pos.insert(std::make_pair(og, pos)).second); + if (duplicate && check_duplicate) { - flattenPagesTree(); + QTC::TC("qpdf", "QPDF duplicate page reference"); + setLastObjectDescription("page " + QUtil::int_to_string(pos) + + " (numbered from zero)", + og.obj, og.gen); + throw QPDFExc(qpdf_e_pages, this->file->getName(), + this->last_object_description, 0, + "duplicate page reference found;" + " this would cause loss of data"); } - std::map::iterator it = - this->pageobj_to_pages_pos.find(ObjGen(objid, generation)); - if (it != this->pageobj_to_pages_pos.end()) - { - return (*it).second; - } - return -1; // throw? -} - -int -QPDF::findPage(QPDFObjectHandle const& pageoh) -{ - if (!pageoh.isInitialized()) - { - return -1; - // TODO? throw - } - return findPage(pageoh.getObjectID(), pageoh.getGeneration()); } void -QPDF::addPage(QPDFObjectHandle newpage, bool first) +QPDF::insertPage(QPDFObjectHandle newpage, int pos) { - if (this->pageobj_to_pages_pos.empty()) - { - flattenPagesTree(); - } - - newpage.assertPageObject(); // FIXME: currently private - - QPDFObjectHandle pages = this->trailer.getKey("/Root").getKey("/Pages"); - QPDFObjectHandle kids = pages.getKey("/Kids"); - - newpage.replaceKey("/Parent", pages); - if (first) - { - kids.insertItem(0, newpage); - } - else - { - kids.appendItem(newpage); - } - pages.replaceKey("/Count", - QPDFObjectHandle::newInteger(kids.getArrayNItems())); - - // FIXME: this is overkill, but cache is now stale - clearPagesCache(); -} - -void -QPDF::addPageAt(QPDFObjectHandle newpage, bool before, - QPDFObjectHandle const &refpage) -{ - int refpos = findPage(refpage); // also ensures flat /Pages - if (refpos == -1) - { - throw "Could not find refpage"; - } + // pos is numbered from 0, so pos = 0 inserts at the beginning and + // pos = npages adds to the end. + flattenPagesTree(); newpage.assertPageObject(); + QTC::TC("qpdf", "QPDF insert page", + (pos == 0) ?
0 : // insert at beginning + (pos == ((int)this->all_pages.size())) ? 1 : // insert at end + 2); // insert in middle + QPDFObjectHandle pages = this->trailer.getKey("/Root").getKey("/Pages"); QPDFObjectHandle kids = pages.getKey("/Kids"); - - if (! before) - { - ++refpos; - } + assert ((pos >= 0) && (pos <= (int)this->all_pages.size())); newpage.replaceKey("/Parent", pages); - kids.insertItem(refpos, newpage); - pages.replaceKey("/Count", - QPDFObjectHandle::newInteger(kids.getArrayNItems())); - - // FIXME: this is overkill, but cache is now stale - clearPagesCache(); + kids.insertItem(pos, newpage); + int npages = kids.getArrayNItems(); + pages.replaceKey("/Count", QPDFObjectHandle::newInteger(npages)); + this->all_pages.insert(this->all_pages.begin() + pos, newpage); + assert((int)this->all_pages.size() == npages); + for (int i = pos + 1; i < npages; ++i) + { + insertPageobjToPage(this->all_pages[i], i, false); + } + insertPageobjToPage(newpage, pos, true); + assert((int)this->pageobj_to_pages_pos.size() == npages); } void -QPDF::removePage(QPDFObjectHandle const& pageoh) +QPDF::removePage(QPDFObjectHandle page) { - int pos = findPage(pageoh); // also ensures flat /Pages - if (pos == -1) - { - throw "Can't remove non-existing page"; - } + int pos = findPage(page); // also ensures flat /Pages + QTC::TC("qpdf", "QPDF remove page", + (pos == 0) ? 0 : // remove at beginning + (pos == ((int)this->all_pages.size() - 1)) ? 
1 : // remove at end + 2); // remove in middle QPDFObjectHandle pages = this->trailer.getKey("/Root").getKey("/Pages"); QPDFObjectHandle kids = pages.getKey("/Kids"); kids.eraseItem(pos); - pages.replaceKey("/Count", - QPDFObjectHandle::newInteger(kids.getArrayNItems())); - - // FIXME: this is overkill, but cache is now stale - clearPagesCache(); + int npages = kids.getArrayNItems(); + pages.replaceKey("/Count", QPDFObjectHandle::newInteger(npages)); + this->all_pages.erase(this->all_pages.begin() + pos); + assert((int)this->all_pages.size() == npages); + this->pageobj_to_pages_pos.erase( + ObjGen(page.getObjectID(), page.getGeneration())); + assert((int)this->pageobj_to_pages_pos.size() == npages); + for (int i = pos; i < npages; ++i) + { + insertPageobjToPage(this->all_pages[i], i, false); + } +} + +void +QPDF::addPageAt(QPDFObjectHandle newpage, bool before, + QPDFObjectHandle refpage) +{ + int refpos = findPage(refpage); + if (! before) + { + ++refpos; + } + insertPage(newpage, refpos); +} + + +void +QPDF::addPage(QPDFObjectHandle newpage, bool first) +{ + getAllPages(); + if (first) + { + insertPage(newpage, 0); + } + else + { + insertPage(newpage, (int)this->all_pages.size()); + } +} + +int +QPDF::findPage(QPDFObjectHandle& page) +{ + page.assertPageObject(); + return findPage(page.getObjectID(), page.getGeneration()); +} + +int +QPDF::findPage(int objid, int generation) +{ + flattenPagesTree(); + std::map::iterator it = + this->pageobj_to_pages_pos.find(ObjGen(objid, generation)); + if (it == this->pageobj_to_pages_pos.end()) + { + setLastObjectDescription("page object", objid, generation); + throw QPDFExc(qpdf_e_pages, this->file->getName(), + this->last_object_description, 0, + "page object not referenced in /Pages tree"); + } + return (*it).second; } diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov index 8cfe81e4..249497a3 100644 --- a/qpdf/qpdf.testcov +++ b/qpdf/qpdf.testcov @@ -203,3 +203,7 @@ qpdf-c called qpdf_init_write_memory 0 exercise
processFile(name) 0 exercise processFile(FILE*) 0 exercise processMemoryFile 0 +QPDF duplicate page reference 0 +QPDF remove page 2 +QPDF insert page 2 +QPDF updateAllPagesCache 0