Resolve duplicated page objects (fixes #268)

When linearizing a file or getting the list of all pages in a file,
detect if the pages tree contains a duplicated page object and, if so,
shallow copy it. This makes it possible to have a one-to-one mapping
of page positions to page objects.
This commit is contained in:
Jay Berkenbilt 2019-01-28 20:13:10 -05:00
parent 9e01c8bd99
commit 52f9d326a5
11 changed files with 98 additions and 17 deletions

View File

@ -1,3 +1,11 @@
2019-01-28 Jay Berkenbilt <ejb@ql.org>
* When linearizing or getting the list of all pages in a file,
replace duplicated page objects with a shallow copy of the page
object. Linearization and all page manipulation APIs require page
objects to be unique. Pages that were originally duplicated will
still share contents and any other indirect resources. Fixes #268.
2019-01-26 Jay Berkenbilt <ejb@ql.org>
* Add --overlay and --underlay options. Fixes #207.

View File

@ -527,15 +527,16 @@ class QPDF
void optimize(std::map<int, int> const& object_stream_data,
bool allow_changes = true);
// Traverse page tree return all /Page objects. For efficiency,
// this method returns a const reference to an internal vector of
// pages. Calls to addPage, addPageAt, and removePage safely
// update this, but directly manipulation of the pages three or
// pushing inheritable objects to the page level may invalidate
// it. See comments for updateAllPagesCache() for additional
// notes. Newer code should use
// QPDFPageDocumentHelper::getAllPages instead. The decision to
// expose this internal cache was arguably incorrect, but it is
// Traverse the page tree and return all /Page objects. It also detects
// and resolves cases in which the same /Page object is
// duplicated. For efficiency, this method returns a const
// reference to an internal vector of pages. Calls to addPage,
// addPageAt, and removePage safely update this, but direct
// manipulation of the pages tree or pushing inheritable objects
// to the page level may invalidate it. See comments for
// updateAllPagesCache() for additional notes. Newer code should
// use QPDFPageDocumentHelper::getAllPages instead. The decision
// to expose this internal cache was arguably incorrect, but it is
// being left here for compatibility. It is, however, completely
// safe to use this for files that you are not modifying.
QPDF_DLL
@ -895,6 +896,10 @@ class QPDF
void getAllPagesInternal2(QPDFObjectHandle cur_pages,
std::vector<QPDFObjectHandle>& result,
std::set<QPDFObjGen>& visited);
void getAllPagesInternal3(QPDFObjectHandle cur_pages,
std::vector<QPDFObjectHandle>& result,
std::set<QPDFObjGen>& visited,
std::set<QPDFObjGen>& seen);
void insertPage(QPDFObjectHandle newpage, int pos);
int findPage(QPDFObjGen const& og);
int findPage(QPDFObjectHandle& page);

View File

@ -156,6 +156,9 @@ QPDF::pushInheritedAttributesToPage(bool allow_changes, bool warn_skipped_keys)
return;
}
// Calling getAllPages() resolves any duplicated page objects.
getAllPages();
// key_ancestors is a mapping of page attribute keys to a stack of
// Pages nodes that contain values for them.
std::map<std::string, std::vector<QPDFObjectHandle> > key_ancestors;

View File

@ -62,8 +62,18 @@ QPDF::getAllPagesInternal(QPDFObjectHandle cur_pages,
void
QPDF::getAllPagesInternal2(QPDFObjectHandle cur_pages,
std::vector<QPDFObjectHandle>& result,
std::set<QPDFObjGen>& visited)
std::vector<QPDFObjectHandle>& result,
std::set<QPDFObjGen>& visited)
{
std::set<QPDFObjGen> seen;
getAllPagesInternal3(cur_pages, result, visited, seen);
}
void
QPDF::getAllPagesInternal3(QPDFObjectHandle cur_pages,
std::vector<QPDFObjectHandle>& result,
std::set<QPDFObjGen>& visited,
std::set<QPDFObjGen>& seen)
{
QPDFObjGen this_og = cur_pages.getObjGen();
if (visited.count(this_og) > 0)
@ -94,11 +104,21 @@ QPDF::getAllPagesInternal2(QPDFObjectHandle cur_pages,
int n = kids.getArrayNItems();
for (int i = 0; i < n; ++i)
{
getAllPagesInternal2(kids.getArrayItem(i), result, visited);
QPDFObjectHandle kid = kids.getArrayItem(i);
if (seen.count(kid.getObjGen()))
{
// Make a copy of the page. This does the same as
// shallowCopyPage in QPDFPageObjectHelper.
QTC::TC("qpdf", "QPDF resolve duplicated page object");
kid = makeIndirectObject(QPDFObjectHandle(kid).shallowCopy());
kids.setArrayItem(i, kid);
}
getAllPagesInternal3(kid, result, visited, seen);
}
}
else if (type == "/Page")
{
seen.insert(this_og);
result.push_back(cur_pages);
}
else

View File

@ -4410,6 +4410,13 @@ print "\n";
suite and properly handled.
</para>
</listitem>
<listitem>
<para>
Linearization and page manipulation APIs now detect and
recover from files that have duplicate Page objects in the
pages tree.
</para>
</listitem>
</itemizedlist>
</listitem>
<listitem>

View File

@ -428,3 +428,4 @@ QPDFFormFieldObjectHelper fallback Tf 0
QPDFPageObjectHelper non-trivial inheritance 0
QPDFPageObjectHelper copy shared attribute 0
qpdf from_nr from repeat_nr 0
QPDF resolve duplicated page object 0

View File

@ -582,7 +582,7 @@ $td->runtest("check output",
{$td->FILE => "page_api_1-out2.pdf"});
$td->runtest("duplicate page",
{$td->COMMAND => "test_driver 17 page_api_2.pdf"},
{$td->FILE => "page_api_2.out", $td->EXIT_STATUS => 2},
{$td->FILE => "page_api_2.out", $td->EXIT_STATUS => 0},
$td->NORMALIZE_NEWLINES);
$td->runtest("delete and re-add a page",
{$td->COMMAND => "test_driver 18 page_api_1.pdf"},
@ -1727,6 +1727,30 @@ foreach my $f (qw(screen print))
{$td->FILE => "manual-appearances-$f-out.pdf"});
}
show_ntests();
# ----------
$td->notify("--- Duplicated Page Object ---");
$n_tests += 4;
$td->runtest("linearize duplicated pages",
{$td->COMMAND =>
"qpdf --static-id --linearize" .
" page_api_2.pdf a.pdf"},
{$td->STRING => "", $td->EXIT_STATUS => 0},
$td->NORMALIZE_NEWLINES);
$td->runtest("compare files",
{$td->FILE => "a.pdf"},
{$td->FILE => "linearize-duplicate-page.pdf"});
$td->runtest("extract duplicated pages",
{$td->COMMAND =>
"qpdf --static-id page_api_2.pdf" .
" --pages . -- a.pdf"},
{$td->STRING => "", $td->EXIT_STATUS => 0},
$td->NORMALIZE_NEWLINES);
$td->runtest("compare files",
{$td->FILE => "a.pdf"},
{$td->FILE => "extract-duplicate-page.pdf"});
show_ntests();
# ----------
$td->notify("--- Merging and Splitting ---");

Binary file not shown.

Binary file not shown.

View File

@ -1 +1 @@
page_api_2.pdf (page 1 (numbered from zero): object 4 0): duplicate page reference found; this would cause loss of data
test 17 done

View File

@ -923,11 +923,24 @@ void runtest(int n, char const* filename1, char const* arg2)
}
else if (n == 17)
{
// The input file to this test case is broken to exercise an
// error condition.
// The input file to this test case has a duplicated page.
QPDFObjectHandle page_kids =
pdf.getRoot().getKey("/Pages").getKey("/Kids");
assert(page_kids.getArrayItem(0).getObjGen() ==
page_kids.getArrayItem(1).getObjGen());
std::vector<QPDFObjectHandle> const& pages = pdf.getAllPages();
assert(pages.size() == 3);
assert(! (pages.at(0).getObjGen() == pages.at(1).getObjGen()));
assert(QPDFObjectHandle(pages.at(0)).getKey("/Contents").getObjGen() ==
QPDFObjectHandle(pages.at(1)).getKey("/Contents").getObjGen());
pdf.removePage(pages.at(0));
std::cout << "you can't see this" << std::endl;
assert(pages.size() == 2);
PointerHolder<Buffer> b = QPDFObjectHandle(pages.at(0)).
getKey("/Contents").getStreamData();
std::string contents = std::string(
reinterpret_cast<char const*>(b->getBuffer()),
b->getSize());
assert(contents.find("page 0") != std::string::npos);
}
else if (n == 18)
{