Add optional conflict detection to mergeResources

Also improve behavior around direct vs. indirect resources.
This commit is contained in:
Jay Berkenbilt 2021-03-02 06:28:55 -05:00
parent e17585c2d2
commit d7ffdfa994
9 changed files with 557 additions and 408 deletions

View File

@ -1,3 +1,14 @@
2021-03-03 Jay Berkenbilt <ejb@ql.org>
* Add QPDFObjectHandle::makeResourcesIndirect
2021-03-02 Jay Berkenbilt <ejb@ql.org>
* Add an optional resource_names argument to getUniqueResourceName
for added efficiency.
* Add conflict detection to QPDFObjectHandle::mergeResources.
2021-03-01 Jay Berkenbilt <ejb@ql.org>
* Improve code that finds unreferenced resources to ignore names

View File

@ -731,13 +731,27 @@ class QPDFObjectHandle
QPDF_DLL
bool isOrHasName(std::string const&);
// Merge resource dictionaries. Assumes resource dictionaries have
// the property that the collection of keys of all first-level
// dictionary members contains no duplicates. This method does
// nothing if both this object and the other object are not
// dictionaries. Otherwise, it has following behavior, where
// "object" refers to the object whose method is invoked, and
// "other" refers to the argument:
// Make all resources in a resource dictionary indirect. This just
// goes through all entries of top-level subdictionaries and
// converts any direct objects to indirect objects. This can be
// useful to call before mergeResources if it is going to be
// called multiple times to prevent resources from being copied
// multiple times.
QPDF_DLL
void makeResourcesIndirect(QPDF& owning_qpdf);
// Merge resource dictionaries. If the "conflicts" parameter is
// provided, conflicts in dictionary subitems are resolved, and
// "conflicts" is initialized to a map such that
// conflicts[resource_type][old_key] == [new_key]
//
// See also makeResourcesIndirect, which can be useful to call
// before calling this.
//
// This method does nothing unless both this object and the other
// object are dictionaries. Otherwise, it has the following
// behavior, where "object" refers to the object whose method is
// invoked, and "other" refers to the argument:
//
// * For each key in "other" whose value is an array:
// * If "object" does not have that entry, shallow copy it.
@ -747,20 +761,32 @@ class QPDFObjectHandle
// * For each key in "other" whose value is a dictionary:
// * If "object" does not have that entry, shallow copy it.
// * Otherwise, for each key in the subdictionary:
// * If key is not present in "object"'s entry, shallow copy it.
// * Otherwise, ignore. Conflicts are not detected.
// * If key is not present in "object"'s entry, shallow copy
// it if direct or just add it if indirect.
// * Otherwise, if conflicts are being detected:
// * If there is a key (oldkey) already in the dictionary
// that points to the same indirect destination as key,
// indicate that key was replaced by oldkey. This would
// happen if these two resource dictionaries have
// previously been merged.
// * Otherwise pick a new key (newkey) that is unique within
// the resource dictionary, store that in the resource
// dictionary with key's destination as its destination,
// and indicate that key was replaced by newkey.
//
// The primary purpose of this method is to facilitate merging of
// resource dictionaries that are supposed to have the same scope
// as each other. For example, this can be used to merge a form
// XObject's /Resources dictionary with a form field's /DR.
// Conflicts are not detected. If, in the future, there should be
// a need to detect conflicts, this method could detect them and
// return a mapping from old to new names. This mapping could be
// used for filtering the stream. This would be necessary, for
// example, to merge a form XObject's resources with a page's
// resources with the intention of concatenating the content
// streams.
// XObject's /Resources dictionary with a form field's /DR or to
// merge two /DR dictionaries. The "conflicts" parameter may be
// previously initialized. This method adds to whatever is already
// there, which can be useful when merging with multiple things.
QPDF_DLL
void mergeResources(
QPDFObjectHandle other,
std::map<std::string, std::map<std::string, std::string>>* conflicts);
// ABI: eliminate version without conflicts and make conflicts
// default to nullptr.
QPDF_DLL
void mergeResources(QPDFObjectHandle other);
@ -779,7 +805,19 @@ class QPDFObjectHandle
// increase efficiency if adding multiple items with the same
// prefix. (Why doesn't it set min_suffix to the next number?
// Well, maybe you aren't going to actually use the name it
// returns.)
// returns.) If you are calling this multiple times on the same
// resource dictionary, you can initialize resource_names by
// calling getResourceNames(), incrementally update it as you add
// resources, and keep passing it in so that getUniqueResourceName
// doesn't have to traverse the resource dictionary each time it's
// called.
QPDF_DLL
std::string getUniqueResourceName(
std::string const& prefix,
int& min_suffix,
std::set<std::string>* resource_names);
// ABI: remove this version and make resource_names default to
// nullptr.
QPDF_DLL
std::string getUniqueResourceName(std::string const& prefix,
int& min_suffix);

View File

@ -1056,60 +1056,143 @@ QPDFObjectHandle::isOrHasName(std::string const& value)
return false;
}
// Convert every direct value found in the top-level subdictionaries
// of this dictionary into an indirect object created through
// owning_qpdf. Nothing happens when this object is not a dictionary,
// and top-level entries that are not dictionaries are skipped.
void
QPDFObjectHandle::makeResourcesIndirect(QPDF& owning_qpdf)
{
    if (isDictionary())
    {
        for (auto const& top_entry: ditems())
        {
            QPDFObjectHandle subdict = top_entry.second;
            if (subdict.isDictionary())
            {
                for (auto entry: subdict.ditems())
                {
                    QPDFObjectHandle value = entry.second;
                    if (value.isIndirect())
                    {
                        // Already indirect; leave this entry alone.
                        continue;
                    }
                    subdict.replaceKey(
                        entry.first,
                        owning_qpdf.makeIndirectObject(value));
                }
            }
        }
    }
}
// Convenience overload: merge "other" into this object without
// conflict detection by delegating to the two-argument version with
// a null conflicts pointer.
void
QPDFObjectHandle::mergeResources(QPDFObjectHandle other)
{
mergeResources(other, nullptr);
}
void
QPDFObjectHandle::mergeResources(
QPDFObjectHandle other,
std::map<std::string, std::map<std::string, std::string>>* conflicts)
{
if (! (isDictionary() && other.isDictionary()))
{
QTC::TC("qpdf", "QPDFObjectHandle merge top type mismatch");
return;
}
std::set<std::string> other_keys = other.getKeys();
for (std::set<std::string>::iterator iter = other_keys.begin();
iter != other_keys.end(); ++iter)
auto make_og_to_name = [](
QPDFObjectHandle& dict,
std::map<QPDFObjGen, std::string>& og_to_name)
{
std::string const& key = *iter;
QPDFObjectHandle other_val = other.getKey(key);
if (hasKey(key))
for (auto i: dict.ditems())
{
QPDFObjectHandle this_val = getKey(key);
if (i.second.isIndirect())
{
og_to_name[i.second.getObjGen()] = i.first;
}
}
};
// This algorithm is described in comments in QPDFObjectHandle.hh
// above the declaration of mergeResources.
for (auto o_top: other.ditems())
{
std::string const& rtype = o_top.first;
QPDFObjectHandle other_val = o_top.second;
if (hasKey(rtype))
{
QPDFObjectHandle this_val = getKey(rtype);
if (this_val.isDictionary() && other_val.isDictionary())
{
if (this_val.isIndirect())
{
// Do this even if there are no keys. Various
// places in the code call mergeResources with
// resource dictionaries that contain empty
// subdictionaries just to get this shallow copy
// functionality.
QTC::TC("qpdf", "QPDFObjectHandle replace with copy");
this_val = this_val.shallowCopy();
replaceKey(key, this_val);
replaceKey(rtype, this_val);
}
std::set<std::string> other_val_keys = other_val.getKeys();
for (std::set<std::string>::iterator i2 =
other_val_keys.begin();
i2 != other_val_keys.end(); ++i2)
std::map<QPDFObjGen, std::string> og_to_name;
std::set<std::string> rnames;
int min_suffix = 1;
bool initialized_maps = false;
for (auto ov_iter: other_val.ditems())
{
if (! this_val.hasKey(*i2))
std::string const& key = ov_iter.first;
QPDFObjectHandle rval = ov_iter.second;
if (! this_val.hasKey(key))
{
QTC::TC("qpdf", "QPDFObjectHandle merge shallow copy");
this_val.replaceKey(
*i2, other_val.getKey(*i2).shallowCopy());
if (! rval.isIndirect())
{
QTC::TC("qpdf", "QPDFObjectHandle merge shallow copy");
rval = rval.shallowCopy();
}
this_val.replaceKey(key, rval);
}
else if (conflicts)
{
if (! initialized_maps)
{
make_og_to_name(this_val, og_to_name);
rnames = this_val.getResourceNames();
initialized_maps = true;
}
auto rval_og = rval.getObjGen();
if (rval.isIndirect() &&
og_to_name.count(rval_og))
{
QTC::TC("qpdf", "QPDFObjectHandle merge reuse");
auto new_key = og_to_name[rval_og];
if (new_key != key)
{
(*conflicts)[rtype][key] = new_key;
}
}
else
{
QTC::TC("qpdf", "QPDFObjectHandle merge generate");
std::string new_key = getUniqueResourceName(
key + "_", min_suffix, &rnames);
(*conflicts)[rtype][key] = new_key;
this_val.replaceKey(new_key, rval);
}
}
}
}
else if (this_val.isArray() && other_val.isArray())
{
std::set<std::string> scalars;
int n = this_val.getArrayNItems();
for (int i = 0; i < n; ++i)
for (auto this_item: this_val.aitems())
{
QPDFObjectHandle this_item = this_val.getArrayItem(i);
if (this_item.isScalar())
{
scalars.insert(this_item.unparse());
}
}
n = other_val.getArrayNItems();
for (int i = 0; i < n; ++i)
for (auto other_item: other_val.aitems())
{
QPDFObjectHandle other_item = other_val.getArrayItem(i);
if (other_item.isScalar())
{
if (scalars.count(other_item.unparse()) == 0)
@ -1128,7 +1211,7 @@ QPDFObjectHandle::mergeResources(QPDFObjectHandle other)
else
{
QTC::TC("qpdf", "QPDFObjectHandle merge copy from other");
replaceKey(key, other_val.shallowCopy());
replaceKey(rtype, other_val.shallowCopy());
}
}
}
@ -1165,7 +1248,16 @@ std::string
QPDFObjectHandle::getUniqueResourceName(std::string const& prefix,
int& min_suffix)
{
std::set<std::string> names = getResourceNames();
return getUniqueResourceName(prefix, min_suffix, nullptr);
}
std::string
QPDFObjectHandle::getUniqueResourceName(std::string const& prefix,
int& min_suffix,
std::set<std::string>* namesp)
{
std::set<std::string> names = (namesp ? *namesp : getResourceNames());
int max_suffix = min_suffix + QIntC::to_int(names.size());
while (min_suffix <= max_suffix)
{

View File

@ -581,3 +581,5 @@ qpdf copy form fields in pages 0
qpdf keep some fields in pages 0
qpdf pages keeping field from original 0
qpdf no more fields in pages 0
QPDFObjectHandle merge reuse 0
QPDFObjectHandle merge generate 0

View File

@ -1598,7 +1598,7 @@ $td->runtest("merge dictionary",
$td->NORMALIZE_NEWLINES);
$td->runtest("unique resource name",
{$td->COMMAND => "test_driver 60 minimal.pdf"},
{$td->STRING => "test 60 done\n", $td->EXIT_STATUS => 0},
{$td->FILE => "test60.out", $td->EXIT_STATUS => 0},
$td->NORMALIZE_NEWLINES);
$td->runtest("check output",
{$td->FILE => "a.pdf"},

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,24 @@
first merge
/Y:
/F3 -> /F3_1
/Z:
/F2 -> /F2_1
second merge
/Y:
/F3 -> /F3_1
/F5 -> /F5_1
/Z:
/F2 -> /F2_1
third merge
/Y:
/F3 -> /F3_1
/F5 -> /F5_1
/Z:
/F2 -> /F2_1
fourth merge
/Y:
/F3 -> /F3_1
/F5 -> /F5_1
/Z:
/F2 -> /F2_1
test 60 done

Binary file not shown.

View File

@ -2362,7 +2362,9 @@ void runtest(int n, char const* filename1, char const* arg2)
}
else if (n == 60)
{
// Boundary condition testing for getUniqueResourceName
// Boundary condition testing for getUniqueResourceName;
// additional testing of mergeResources with conflict
// detection
QPDFObjectHandle r1 = QPDFObjectHandle::newDictionary();
int min_suffix = 1;
for (int i = 1; i < 3; ++i)
@ -2372,8 +2374,69 @@ void runtest(int n, char const* filename1, char const* arg2)
r1.getKey("/Z").replaceKey(
name, QPDFObjectHandle::newString("moo"));
}
pdf.getTrailer().replaceKey("/QTest", r1);
auto make_resource = [&](QPDFObjectHandle& dict,
std::string const& key,
std::string const& str) {
auto o1 = QPDFObjectHandle::newArray();
o1.appendItem(QPDFObjectHandle::newString(str));
dict.replaceKey(key, pdf.makeIndirectObject(o1));
};
auto z = r1.getKey("/Z");
r1.replaceKey("/Y", QPDFObjectHandle::newDictionary());
auto y = r1.getKey("/Y");
make_resource(z, "/F1", "r1.Z.F1");
make_resource(z, "/F2", "r1.Z.F2");
make_resource(y, "/F2", "r1.Y.F2");
make_resource(y, "/F3", "r1.Y.F3");
QPDFObjectHandle r2 =
QPDFObjectHandle::parse("<< /Z << >> /Y << >> >>");
z = r2.getKey("/Z");
y = r2.getKey("/Y");
make_resource(z, "/F2", "r2.Z.F2");
make_resource(y, "/F3", "r2.Y.F3");
make_resource(y, "/F4", "r2.Y.F4");
// Add a direct object
y.replaceKey("/F5", QPDFObjectHandle::newString("direct r2.Y.F5"));
std::map<std::string, std::map<std::string, std::string>> conflicts;
auto show_conflicts = [&](std::string const& msg) {
std::cout << msg << std::endl;
for (auto const& i1: conflicts)
{
std::cout << i1.first << ":" << std::endl;
for (auto const& i2: i1.second)
{
std::cout << " " << i2.first << " -> " << i2.second
<< std::endl;
}
}
};
r1.mergeResources(r2, &conflicts);
show_conflicts("first merge");
auto r3 = r1.shallowCopy();
// Merge again. The direct object gets recopied. Everything
// else is the same.
r1.mergeResources(r2, &conflicts);
show_conflicts("second merge");
// Make all resources in r2 indirect. Then merge two more times.
// We should get the one previously direct object copied one
// time as an indirect object.
r2.makeResourcesIndirect(pdf);
r1.mergeResources(r2, &conflicts);
show_conflicts("third merge");
r1.mergeResources(r2, &conflicts);
show_conflicts("fourth merge");
// The only differences between /QTest1 and /QTest3 should be
// the direct objects merged from r2.
pdf.getTrailer().replaceKey("/QTest1", r1);
pdf.getTrailer().replaceKey("/QTest2", r2);
pdf.getTrailer().replaceKey("/QTest3", r3);
QPDFWriter w(pdf, "a.pdf");
w.setQDFMode(true);
w.setStaticID(true);
w.write();
}