mirror of
https://github.com/qpdf/qpdf.git
synced 2024-12-22 02:49:00 +00:00
Resolve duplicated page objects (fixes #268)
When linearizing a file or getting the list of all pages in a file, detect if the pages tree contains a duplicated page object and, if so, shallow copy it. This makes it possible to have a one to one mapping of page positions to page objects.
This commit is contained in:
parent
9e01c8bd99
commit
52f9d326a5
@ -1,3 +1,11 @@
|
||||
2019-01-28 Jay Berkenbilt <ejb@ql.org>
|
||||
|
||||
* When linearizing or getting the list of all pages in a file,
|
||||
replace duplicated page objects with a shallow copy of the page
|
||||
object. Linearization and all page manipulation APIs require page
|
||||
objects to be unique. Pages that were originally duplicated will
|
||||
still share contents and any other indirect resources. Fixes #268.
|
||||
|
||||
2019-01-26 Jay Berkenbilt <ejb@ql.org>
|
||||
|
||||
* Add --overlay and --underlay options. Fixes #207.
|
||||
|
@ -527,15 +527,16 @@ class QPDF
|
||||
void optimize(std::map<int, int> const& object_stream_data,
|
||||
bool allow_changes = true);
|
||||
|
||||
// Traverse page tree return all /Page objects. For efficiency,
|
||||
// this method returns a const reference to an internal vector of
|
||||
// pages. Calls to addPage, addPageAt, and removePage safely
|
||||
// update this, but directly manipulation of the pages three or
|
||||
// pushing inheritable objects to the page level may invalidate
|
||||
// it. See comments for updateAllPagesCache() for additional
|
||||
// notes. Newer code should use
|
||||
// QPDFPageDocumentHelper::getAllPages instead. The decision to
|
||||
// expose this internal cache was arguably incorrect, but it is
|
||||
// Traverse page tree return all /Page objects. It also detects
|
||||
// and resolves cases in which the same /Page object is
|
||||
// duplicated. For efficiency, this method returns a const
|
||||
// reference to an internal vector of pages. Calls to addPage,
|
||||
// addPageAt, and removePage safely update this, but directly
|
||||
// manipulation of the pages three or pushing inheritable objects
|
||||
// to the page level may invalidate it. See comments for
|
||||
// updateAllPagesCache() for additional notes. Newer code should
|
||||
// use QPDFPageDocumentHelper::getAllPages instead. The decision
|
||||
// to expose this internal cache was arguably incorrect, but it is
|
||||
// being left here for compatibility. It is, however, completely
|
||||
// safe to use this for files that you are not modifying.
|
||||
QPDF_DLL
|
||||
@ -895,6 +896,10 @@ class QPDF
|
||||
void getAllPagesInternal2(QPDFObjectHandle cur_pages,
|
||||
std::vector<QPDFObjectHandle>& result,
|
||||
std::set<QPDFObjGen>& visited);
|
||||
void getAllPagesInternal3(QPDFObjectHandle cur_pages,
|
||||
std::vector<QPDFObjectHandle>& result,
|
||||
std::set<QPDFObjGen>& visited,
|
||||
std::set<QPDFObjGen>& seen);
|
||||
void insertPage(QPDFObjectHandle newpage, int pos);
|
||||
int findPage(QPDFObjGen const& og);
|
||||
int findPage(QPDFObjectHandle& page);
|
||||
|
@ -156,6 +156,9 @@ QPDF::pushInheritedAttributesToPage(bool allow_changes, bool warn_skipped_keys)
|
||||
return;
|
||||
}
|
||||
|
||||
// Calling getAllPages() resolves any duplicated page objects.
|
||||
getAllPages();
|
||||
|
||||
// key_ancestors is a mapping of page attribute keys to a stack of
|
||||
// Pages nodes that contain values for them.
|
||||
std::map<std::string, std::vector<QPDFObjectHandle> > key_ancestors;
|
||||
|
@ -62,8 +62,18 @@ QPDF::getAllPagesInternal(QPDFObjectHandle cur_pages,
|
||||
|
||||
void
|
||||
QPDF::getAllPagesInternal2(QPDFObjectHandle cur_pages,
|
||||
std::vector<QPDFObjectHandle>& result,
|
||||
std::set<QPDFObjGen>& visited)
|
||||
std::vector<QPDFObjectHandle>& result,
|
||||
std::set<QPDFObjGen>& visited)
|
||||
{
|
||||
std::set<QPDFObjGen> seen;
|
||||
getAllPagesInternal3(cur_pages, result, visited, seen);
|
||||
}
|
||||
|
||||
void
|
||||
QPDF::getAllPagesInternal3(QPDFObjectHandle cur_pages,
|
||||
std::vector<QPDFObjectHandle>& result,
|
||||
std::set<QPDFObjGen>& visited,
|
||||
std::set<QPDFObjGen>& seen)
|
||||
{
|
||||
QPDFObjGen this_og = cur_pages.getObjGen();
|
||||
if (visited.count(this_og) > 0)
|
||||
@ -94,11 +104,21 @@ QPDF::getAllPagesInternal2(QPDFObjectHandle cur_pages,
|
||||
int n = kids.getArrayNItems();
|
||||
for (int i = 0; i < n; ++i)
|
||||
{
|
||||
getAllPagesInternal2(kids.getArrayItem(i), result, visited);
|
||||
QPDFObjectHandle kid = kids.getArrayItem(i);
|
||||
if (seen.count(kid.getObjGen()))
|
||||
{
|
||||
// Make a copy of the page. This does the same as
|
||||
// shallowCopyPage in QPDFPageObjectHelper.
|
||||
QTC::TC("qpdf", "QPDF resolve duplicated page object");
|
||||
kid = makeIndirectObject(QPDFObjectHandle(kid).shallowCopy());
|
||||
kids.setArrayItem(i, kid);
|
||||
}
|
||||
getAllPagesInternal3(kid, result, visited, seen);
|
||||
}
|
||||
}
|
||||
else if (type == "/Page")
|
||||
{
|
||||
seen.insert(this_og);
|
||||
result.push_back(cur_pages);
|
||||
}
|
||||
else
|
||||
|
@ -4410,6 +4410,13 @@ print "\n";
|
||||
suite and properly handled.
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
Linearization and page manipulation APIs now detect and
|
||||
recover from files that have duplicate Page objects in the
|
||||
pages tree.
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</listitem>
|
||||
<listitem>
|
||||
|
@ -428,3 +428,4 @@ QPDFFormFieldObjectHelper fallback Tf 0
|
||||
QPDFPageObjectHelper non-trivial inheritance 0
|
||||
QPDFPageObjectHelper copy shared attribute 0
|
||||
qpdf from_nr from repeat_nr 0
|
||||
QPDF resolve duplicated page object 0
|
||||
|
@ -582,7 +582,7 @@ $td->runtest("check output",
|
||||
{$td->FILE => "page_api_1-out2.pdf"});
|
||||
$td->runtest("duplicate page",
|
||||
{$td->COMMAND => "test_driver 17 page_api_2.pdf"},
|
||||
{$td->FILE => "page_api_2.out", $td->EXIT_STATUS => 2},
|
||||
{$td->FILE => "page_api_2.out", $td->EXIT_STATUS => 0},
|
||||
$td->NORMALIZE_NEWLINES);
|
||||
$td->runtest("delete and re-add a page",
|
||||
{$td->COMMAND => "test_driver 18 page_api_1.pdf"},
|
||||
@ -1727,6 +1727,30 @@ foreach my $f (qw(screen print))
|
||||
{$td->FILE => "manual-appearances-$f-out.pdf"});
|
||||
}
|
||||
|
||||
show_ntests();
|
||||
# ----------
|
||||
$td->notify("--- Duplicated Page Object ---");
|
||||
$n_tests += 4;
|
||||
|
||||
$td->runtest("linearize duplicated pages",
|
||||
{$td->COMMAND =>
|
||||
"qpdf --static-id --linearize" .
|
||||
" page_api_2.pdf a.pdf"},
|
||||
{$td->STRING => "", $td->EXIT_STATUS => 0},
|
||||
$td->NORMALIZE_NEWLINES);
|
||||
$td->runtest("compare files",
|
||||
{$td->FILE => "a.pdf"},
|
||||
{$td->FILE => "linearize-duplicate-page.pdf"});
|
||||
$td->runtest("extract duplicated pages",
|
||||
{$td->COMMAND =>
|
||||
"qpdf --static-id page_api_2.pdf" .
|
||||
" --pages . -- a.pdf"},
|
||||
{$td->STRING => "", $td->EXIT_STATUS => 0},
|
||||
$td->NORMALIZE_NEWLINES);
|
||||
$td->runtest("compare files",
|
||||
{$td->FILE => "a.pdf"},
|
||||
{$td->FILE => "extract-duplicate-page.pdf"});
|
||||
|
||||
show_ntests();
|
||||
# ----------
|
||||
$td->notify("--- Merging and Splitting ---");
|
||||
|
BIN
qpdf/qtest/qpdf/extract-duplicate-page.pdf
Normal file
BIN
qpdf/qtest/qpdf/extract-duplicate-page.pdf
Normal file
Binary file not shown.
BIN
qpdf/qtest/qpdf/linearize-duplicate-page.pdf
Normal file
BIN
qpdf/qtest/qpdf/linearize-duplicate-page.pdf
Normal file
Binary file not shown.
@ -1 +1 @@
|
||||
page_api_2.pdf (page 1 (numbered from zero): object 4 0): duplicate page reference found; this would cause loss of data
|
||||
test 17 done
|
||||
|
@ -923,11 +923,24 @@ void runtest(int n, char const* filename1, char const* arg2)
|
||||
}
|
||||
else if (n == 17)
|
||||
{
|
||||
// The input file to this test case is broken to exercise an
|
||||
// error condition.
|
||||
// The input file to this test case has a duplicated page.
|
||||
QPDFObjectHandle page_kids =
|
||||
pdf.getRoot().getKey("/Pages").getKey("/Kids");
|
||||
assert(page_kids.getArrayItem(0).getObjGen() ==
|
||||
page_kids.getArrayItem(1).getObjGen());
|
||||
std::vector<QPDFObjectHandle> const& pages = pdf.getAllPages();
|
||||
assert(pages.size() == 3);
|
||||
assert(! (pages.at(0).getObjGen() == pages.at(1).getObjGen()));
|
||||
assert(QPDFObjectHandle(pages.at(0)).getKey("/Contents").getObjGen() ==
|
||||
QPDFObjectHandle(pages.at(1)).getKey("/Contents").getObjGen());
|
||||
pdf.removePage(pages.at(0));
|
||||
std::cout << "you can't see this" << std::endl;
|
||||
assert(pages.size() == 2);
|
||||
PointerHolder<Buffer> b = QPDFObjectHandle(pages.at(0)).
|
||||
getKey("/Contents").getStreamData();
|
||||
std::string contents = std::string(
|
||||
reinterpret_cast<char const*>(b->getBuffer()),
|
||||
b->getSize());
|
||||
assert(contents.find("page 0") != std::string::npos);
|
||||
}
|
||||
else if (n == 18)
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user