qpdf/libqpdf/QPDF_optimization.cc

467 lines
16 KiB
C++

// See the "Optimization" section of the manual.
#include <qpdf/assert_debug.h>
#include <qpdf/QPDF.hh>
#include <qpdf/QPDFExc.hh>
#include <qpdf/QPDF_Array.hh>
#include <qpdf/QPDF_Dictionary.hh>
#include <qpdf/QTC.hh>
QPDF::ObjUser::ObjUser() :
ou_type(ou_bad),
pageno(0)
{
}
QPDF::ObjUser::ObjUser(user_e type) :
ou_type(type),
pageno(0)
{
qpdf_assert_debug(type == ou_root);
}
QPDF::ObjUser::ObjUser(user_e type, int pageno) :
ou_type(type),
pageno(pageno)
{
qpdf_assert_debug((type == ou_page) || (type == ou_thumb));
}
QPDF::ObjUser::ObjUser(user_e type, std::string const& key) :
ou_type(type),
pageno(0),
key(key)
{
qpdf_assert_debug((type == ou_trailer_key) || (type == ou_root_key));
}
bool
QPDF::ObjUser::operator<(ObjUser const& rhs) const
{
if (this->ou_type < rhs.ou_type) {
return true;
} else if (this->ou_type == rhs.ou_type) {
if (this->pageno < rhs.pageno) {
return true;
} else if (this->pageno == rhs.pageno) {
return (this->key < rhs.key);
}
}
return false;
}
void
QPDF::optimize(
std::map<int, int> const& object_stream_data,
bool allow_changes,
std::function<int(QPDFObjectHandle&)> skip_stream_parameters)
{
if (!this->m->obj_user_to_objects.empty()) {
// already optimized
return;
}
// The PDF specification indicates that /Outlines is supposed to
// be an indirect reference. Force it to be so if it exists and
// is direct. (This has been seen in the wild.)
QPDFObjectHandle root = getRoot();
if (root.getKey("/Outlines").isDictionary()) {
QPDFObjectHandle outlines = root.getKey("/Outlines");
if (!outlines.isIndirect()) {
QTC::TC("qpdf", "QPDF_optimization indirect outlines");
root.replaceKey("/Outlines", makeIndirectObject(outlines));
}
}
// Traverse pages tree pushing all inherited resources down to the
// page level. This also initializes this->m->all_pages.
pushInheritedAttributesToPage(allow_changes, false);
// Traverse pages
int n = toI(this->m->all_pages.size());
for (int pageno = 0; pageno < n; ++pageno) {
updateObjectMaps(
ObjUser(ObjUser::ou_page, pageno),
this->m->all_pages.at(toS(pageno)),
skip_stream_parameters);
}
// Traverse document-level items
std::set<std::string> keys = this->m->trailer.getKeys();
for (auto const& key: keys) {
if (key == "/Root") {
// handled separately
} else {
updateObjectMaps(
ObjUser(ObjUser::ou_trailer_key, key),
this->m->trailer.getKey(key),
skip_stream_parameters);
}
}
keys = root.getKeys();
for (auto const& key: keys) {
// Technically, /I keys from /Thread dictionaries are supposed
// to be handled separately, but we are going to disregard
// that specification for now. There is loads of evidence
// that pdlin and Acrobat both disregard things like this from
// time to time, so this is almost certain not to cause any
// problems.
updateObjectMaps(
ObjUser(ObjUser::ou_root_key, key),
root.getKey(key),
skip_stream_parameters);
}
ObjUser root_ou = ObjUser(ObjUser::ou_root);
QPDFObjGen root_og = QPDFObjGen(root.getObjGen());
this->m->obj_user_to_objects[root_ou].insert(root_og);
this->m->object_to_obj_users[root_og].insert(root_ou);
filterCompressedObjects(object_stream_data);
}
void
QPDF::pushInheritedAttributesToPage()
{
// Public API should not have access to allow_changes.
pushInheritedAttributesToPage(true, false);
}
void
QPDF::pushInheritedAttributesToPage(bool allow_changes, bool warn_skipped_keys)
{
// Traverse pages tree pushing all inherited resources down to the
// page level.
// The record of whether we've done this is cleared by
// updateAllPagesCache(). If we're warning for skipped keys,
// re-traverse unconditionally.
if (this->m->pushed_inherited_attributes_to_pages && (!warn_skipped_keys)) {
return;
}
// Calling getAllPages() resolves any duplicated page objects.
getAllPages();
// key_ancestors is a mapping of page attribute keys to a stack of
// Pages nodes that contain values for them.
std::map<std::string, std::vector<QPDFObjectHandle>> key_ancestors;
this->m->all_pages.clear();
std::set<QPDFObjGen> visited;
pushInheritedAttributesToPageInternal(
this->m->trailer.getKey("/Root").getKey("/Pages"),
key_ancestors,
this->m->all_pages,
allow_changes,
warn_skipped_keys,
visited);
if (!key_ancestors.empty()) {
throw std::logic_error("key_ancestors not empty after"
" pushing inherited attributes to pages");
}
this->m->pushed_inherited_attributes_to_pages = true;
}
void
QPDF::pushInheritedAttributesToPageInternal(
QPDFObjectHandle cur_pages,
std::map<std::string, std::vector<QPDFObjectHandle>>& key_ancestors,
std::vector<QPDFObjectHandle>& pages,
bool allow_changes,
bool warn_skipped_keys,
std::set<QPDFObjGen>& visited)
{
QPDFObjGen this_og = cur_pages.getObjGen();
if (visited.count(this_og) > 0) {
throw QPDFExc(
qpdf_e_pages,
this->m->file->getName(),
this->m->last_object_description,
0,
"Loop detected in /Pages structure (inherited attributes)");
}
visited.insert(this_og);
if (!cur_pages.isDictionary()) {
throw QPDFExc(
qpdf_e_damaged_pdf,
this->m->file->getName(),
this->m->last_object_description,
this->m->file->getLastOffset(),
"invalid object in page tree");
}
// Extract the underlying dictionary object
std::string type = cur_pages.getKey("/Type").getName();
if (type == "/Pages") {
// Make a list of inheritable keys. Only the keys /MediaBox,
// /CropBox, /Resources, and /Rotate are inheritable
// attributes. Push this object onto the stack of pages nodes
// that have values for this attribute.
std::set<std::string> inheritable_keys;
std::set<std::string> keys = cur_pages.getKeys();
for (auto const& key: keys) {
if ((key == "/MediaBox") || (key == "/CropBox") ||
(key == "/Resources") || (key == "/Rotate")) {
if (!allow_changes) {
throw QPDFExc(
qpdf_e_internal,
this->m->file->getName(),
this->m->last_object_description,
this->m->file->getLastOffset(),
"optimize detected an "
"inheritable attribute when called "
"in no-change mode");
}
// This is an inheritable resource
inheritable_keys.insert(key);
QPDFObjectHandle oh = cur_pages.getKey(key);
QTC::TC(
"qpdf",
"QPDF opt direct pages resource",
oh.isIndirect() ? 0 : 1);
if (!oh.isIndirect()) {
if (!oh.isScalar()) {
// Replace shared direct object non-scalar
// resources with indirect objects to avoid
// copying large structures around.
cur_pages.replaceKey(key, makeIndirectObject(oh));
oh = cur_pages.getKey(key);
} else {
// It's okay to copy scalars.
QTC::TC("qpdf", "QPDF opt inherited scalar");
}
}
key_ancestors[key].push_back(oh);
if (key_ancestors[key].size() > 1) {
QTC::TC("qpdf", "QPDF opt key ancestors depth > 1");
}
// Remove this resource from this node. It will be
// reattached at the page level.
cur_pages.removeKey(key);
} else if (!((key == "/Type") || (key == "/Parent") ||
(key == "/Kids") || (key == "/Count"))) {
// Warn when flattening, but not if the key is at the top
// level (i.e. "/Parent" not set), as we don't change these;
// but flattening removes intermediate /Pages nodes.
if ((warn_skipped_keys) && (cur_pages.hasKey("/Parent"))) {
QTC::TC("qpdf", "QPDF unknown key not inherited");
setLastObjectDescription(
"Pages object",
cur_pages.getObjectID(),
cur_pages.getGeneration());
warn(
qpdf_e_pages,
this->m->last_object_description,
0,
("Unknown key " + key +
" in /Pages object"
" is being discarded as a result of"
" flattening the /Pages tree"));
}
}
}
// Visit descendant nodes.
QPDFObjectHandle kids = cur_pages.getKey("/Kids");
int n = kids.getArrayNItems();
for (int i = 0; i < n; ++i) {
pushInheritedAttributesToPageInternal(
kids.getArrayItem(i),
key_ancestors,
pages,
allow_changes,
warn_skipped_keys,
visited);
}
// For each inheritable key, pop the stack. If the stack
// becomes empty, remove it from the map. That way, the
// invariant that the list of keys in key_ancestors is exactly
// those keys for which inheritable attributes are available.
if (!inheritable_keys.empty()) {
QTC::TC("qpdf", "QPDF opt inheritable keys");
for (auto const& key: inheritable_keys) {
key_ancestors[key].pop_back();
if (key_ancestors[key].empty()) {
QTC::TC("qpdf", "QPDF opt erase empty key ancestor");
key_ancestors.erase(key);
}
}
} else {
QTC::TC("qpdf", "QPDF opt no inheritable keys");
}
} else if (type == "/Page") {
// Add all available inheritable attributes not present in
// this object to this object.
for (auto const& iter: key_ancestors) {
std::string const& key = iter.first;
if (!cur_pages.hasKey(key)) {
QTC::TC("qpdf", "QPDF opt resource inherited");
cur_pages.replaceKey(key, iter.second.back());
} else {
QTC::TC("qpdf", "QPDF opt page resource hides ancestor");
}
}
pages.push_back(cur_pages);
} else {
throw QPDFExc(
qpdf_e_damaged_pdf,
this->m->file->getName(),
this->m->last_object_description,
this->m->file->getLastOffset(),
"invalid Type " + type + " in page tree");
}
visited.erase(this_og);
}
void
QPDF::updateObjectMaps(
ObjUser const& ou,
QPDFObjectHandle oh,
std::function<int(QPDFObjectHandle&)> skip_stream_parameters)
{
std::set<QPDFObjGen> visited;
updateObjectMapsInternal(ou, oh, skip_stream_parameters, visited, true, 0);
}
void
QPDF::updateObjectMapsInternal(
ObjUser const& ou,
QPDFObjectHandle oh,
std::function<int(QPDFObjectHandle&)> skip_stream_parameters,
std::set<QPDFObjGen>& visited,
bool top,
int depth)
{
// Traverse the object tree from this point taking care to avoid
// crossing page boundaries.
bool is_page_node = false;
if (oh.isDictionaryOfType("/Page")) {
is_page_node = true;
if (!top) {
return;
}
}
if (oh.isIndirect()) {
QPDFObjGen og(oh.getObjGen());
if (visited.count(og)) {
QTC::TC("qpdf", "QPDF opt loop detected");
return;
}
this->m->obj_user_to_objects[ou].insert(og);
this->m->object_to_obj_users[og].insert(ou);
visited.insert(og);
}
if (oh.isArray()) {
int n = oh.getArrayNItems();
for (int i = 0; i < n; ++i) {
updateObjectMapsInternal(
ou,
oh.getArrayItem(i),
skip_stream_parameters,
visited,
false,
1 + depth);
}
} else if (oh.isDictionary() || oh.isStream()) {
QPDFObjectHandle dict = oh;
bool is_stream = oh.isStream();
int ssp = 0;
if (is_stream) {
dict = oh.getDict();
if (skip_stream_parameters) {
ssp = skip_stream_parameters(oh);
}
}
std::set<std::string> keys = dict.getKeys();
for (auto const& key: keys) {
if (is_page_node && (key == "/Thumb")) {
// Traverse page thumbnail dictionaries as a special
// case.
updateObjectMapsInternal(
ObjUser(ObjUser::ou_thumb, ou.pageno),
dict.getKey(key),
skip_stream_parameters,
visited,
false,
1 + depth);
} else if (is_page_node && (key == "/Parent")) {
// Don't traverse back up the page tree
} else if (
((ssp >= 1) && (key == "/Length")) ||
((ssp >= 2) &&
((key == "/Filter") || (key == "/DecodeParms")))) {
// Don't traverse into stream parameters that we are
// not going to write.
} else {
updateObjectMapsInternal(
ou,
dict.getKey(key),
skip_stream_parameters,
visited,
false,
1 + depth);
}
}
}
}
void
QPDF::filterCompressedObjects(std::map<int, int> const& object_stream_data)
{
if (object_stream_data.empty()) {
return;
}
// Transform object_to_obj_users and obj_user_to_objects so that
// they refer only to uncompressed objects. If something is a
// user of a compressed object, then it is really a user of the
// object stream that contains it.
std::map<ObjUser, std::set<QPDFObjGen>> t_obj_user_to_objects;
std::map<QPDFObjGen, std::set<ObjUser>> t_object_to_obj_users;
for (auto const& i1: this->m->obj_user_to_objects) {
ObjUser const& ou = i1.first;
std::set<QPDFObjGen> const& objects = i1.second;
for (auto const& og: objects) {
auto i2 = object_stream_data.find(og.getObj());
if (i2 == object_stream_data.end()) {
t_obj_user_to_objects[ou].insert(og);
} else {
t_obj_user_to_objects[ou].insert(QPDFObjGen(i2->second, 0));
}
}
}
for (auto const& i1: this->m->object_to_obj_users) {
QPDFObjGen const& og = i1.first;
std::set<ObjUser> const& objusers = i1.second;
for (auto const& ou: objusers) {
auto i2 = object_stream_data.find(og.getObj());
if (i2 == object_stream_data.end()) {
t_object_to_obj_users[og].insert(ou);
} else {
t_object_to_obj_users[QPDFObjGen(i2->second, 0)].insert(ou);
}
}
}
this->m->obj_user_to_objects = t_obj_user_to_objects;
this->m->object_to_obj_users = t_object_to_obj_users;
}