diff --git a/ChangeLog b/ChangeLog index 036c2bca..d9b09752 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,12 @@ 2021-03-01 Jay Berkenbilt + * Improve code that finds unreferenced resources to ignore names + in the content stream that are not fonts or XObjects. This should + reduce the number of cases when qpdf needlessly decides not to + remove unreferenced resources. Hopefully it doesn't create any new + bugs where it removes unreferenced resources that it isn't + supposed to. + * QPDFObjectHandle::ParserCallbacks: add virtual handleWarning method, and provide default (empty) implementation of it and handleEOF(). diff --git a/TODO b/TODO index b7562008..5a0b8e0b 100644 --- a/TODO +++ b/TODO @@ -34,15 +34,6 @@ Document-level work --copy-attachments-from to preserve these. What will the strategy be for deduplicating in the automatic case? -* When I get to tagged PDF, note that the presence of /Artifact and - /Standard (and maybe others?) causes a false positive on detection - of unresolved names. Example: form-fields-and-annotations.pdf. This - used to give a warning (never in a released version), but the - warning was removed. See comments about tagged pdf in - QPDFPageObjectHelper::removeUnreferencedResourcesHelper. Another - potential solution is to recognize names that refer to fonts and - xobjects but only looking at names used with Tf and Do operators. - Fuzz Errors =========== diff --git a/libqpdf/QPDFPageObjectHelper.cc b/libqpdf/QPDFPageObjectHelper.cc index 58144a3f..344ff15e 100644 --- a/libqpdf/QPDFPageObjectHelper.cc +++ b/libqpdf/QPDFPageObjectHelper.cc @@ -684,7 +684,7 @@ QPDFPageObjectHelper::removeUnreferencedResourcesHelper( ResourceFinder rf; try { - ph.filterContents(&rf); + ph.parseContents(&rf); } catch (std::exception& e) { @@ -711,9 +711,9 @@ QPDFPageObjectHelper::removeUnreferencedResourcesHelper( QPDFObjectHandle resources = ph.getAttribute("/Resources", true); std::vector rdicts; std::set known_names; + std::vector to_filter = {"/Font", "/XObject"}; if (resources.isDictionary()) { - std::vector to_filter = {"/Font", "/XObject"}; for (auto const& iter: to_filter) { QPDFObjectHandle dict = resources.getKey(iter); @@ -729,12 +729,17 @@ QPDFPageObjectHelper::removeUnreferencedResourcesHelper( } std::set local_unresolved; - for (auto const& name: rf.getNames()) + auto names_by_rtype = rf.getNamesByResourceType(); + for (auto const& i1: to_filter) { - if (! known_names.count(name)) + for (auto const& n_iter: names_by_rtype[i1]) { - unresolved.insert(name); - local_unresolved.insert(name); + std::string const& name = n_iter.first; + if (! known_names.count(name)) + { + unresolved.insert(name); + local_unresolved.insert(name); + } } } // Older versions of the PDF spec allowed form XObjects to omit @@ -754,11 +759,17 @@ QPDFPageObjectHelper::removeUnreferencedResourcesHelper( if ((! local_unresolved.empty()) && resources.isDictionary()) { - // Don't issue a warning for this case. There are some cases - // of names that aren't XObject references, for example, - // /Artifact in tagged PDF. Until we are certain that we know - // the meaning of every name in a content stream, we don't - // want to give warnings because they will be false positives. + // It's not worth issuing a warning for this case. From qpdf + // 10.3, we are hopefully only looking at names that are + // referencing fonts and XObjects, but until we're certain + // that we know the meaning of every name in a content stream, + // we don't want to give warnings that might be false + // positives. Also, this can happen in legitimate cases with + // older PDFs, and there's nothing to be done about it, so + // there's no good reason to issue a warning. The only sad + // thing is that it was a false positive that alerted me to a + // logic error in the code, and any future such errors would + // now be hidden. QTC::TC("qpdf", "QPDFPageObjectHelper unresolved names"); return false; } diff --git a/libqpdf/ResourceFinder.cc b/libqpdf/ResourceFinder.cc index 74ba671f..6b9929e4 100644 --- a/libqpdf/ResourceFinder.cc +++ b/libqpdf/ResourceFinder.cc @@ -1,28 +1,53 @@ #include ResourceFinder::ResourceFinder() : + last_name_offset(0), saw_bad(false) { } void -ResourceFinder::handleToken(QPDFTokenizer::Token const& token) +ResourceFinder::handleObject(QPDFObjectHandle obj, size_t offset, size_t) { - if ((token.getType() == QPDFTokenizer::tt_word) && - (! this->last_name.empty())) + if (obj.isOperator() && (! this->last_name.empty())) { - this->names.insert(this->last_name); + static std::map op_to_rtype = { + {"CS", "/ColorSpace"}, + {"cs", "/ColorSpace"}, + {"gs", "/ExtGState"}, + {"Tf", "/Font"}, + {"SCN", "/Pattern"}, + {"scn", "/Pattern"}, + {"BDC", "/Properties"}, + {"DP", "/Properties"}, + {"sh", "/Shading"}, + {"Do", "/XObject"}, + }; + std::string op = obj.getOperatorValue(); + std::string resource_type; + auto iter = op_to_rtype.find(op); + if (iter != op_to_rtype.end()) + { + resource_type = iter->second; + } + if (! resource_type.empty()) + { + this->names.insert(this->last_name); + this->names_by_resource_type[ + resource_type][this->last_name].insert(this->last_name_offset); + } } - else if (token.getType() == QPDFTokenizer::tt_name) + else if (obj.isName()) { - this->last_name = - QPDFObjectHandle::newName(token.getValue()).getName(); + this->last_name = obj.getName(); + this->last_name_offset = offset; } - else if (token.getType() == QPDFTokenizer::tt_bad) - { - saw_bad = true; - } - writeToken(token); +} + +void +ResourceFinder::handleWarning() +{ + this->saw_bad = true; } std::set const& @@ -31,6 +56,12 @@ ResourceFinder::getNames() const return this->names; } +std::map>> const& +ResourceFinder::getNamesByResourceType() const +{ + return this->names_by_resource_type; +} + bool ResourceFinder::sawBad() const { diff --git a/libqpdf/qpdf/ResourceFinder.hh b/libqpdf/qpdf/ResourceFinder.hh index 0ac74eab..ac3d5b4c 100644 --- a/libqpdf/qpdf/ResourceFinder.hh +++ b/libqpdf/qpdf/ResourceFinder.hh @@ -3,19 +3,26 @@ #include -class ResourceFinder: public QPDFObjectHandle::TokenFilter +class ResourceFinder: public QPDFObjectHandle::ParserCallbacks { public: ResourceFinder(); virtual ~ResourceFinder() = default; - virtual void handleToken(QPDFTokenizer::Token const&) override; + virtual void handleObject(QPDFObjectHandle, size_t, size_t) override; + virtual void handleWarning() override; std::set const& getNames() const; + std::map>> const& getNamesByResourceType() const; bool sawBad() const; private: std::string last_name; + size_t last_name_offset; std::set names; - std::map> names_by_resource_type; + std::map>> names_by_resource_type; bool saw_bad; }; diff --git a/qpdf/qtest/qpdf/split-tokens-split.out b/qpdf/qtest/qpdf/split-tokens-split.out index ab9f3b7a..8e1003be 100644 --- a/qpdf/qtest/qpdf/split-tokens-split.out +++ b/qpdf/qtest/qpdf/split-tokens-split.out @@ -1,3 +1,4 @@ +WARNING: page object 3 0 stream 5 0, stream 7 0, stream 9 0, stream 11 0 (content, offset 375): null character not allowed in name token WARNING: split-tokens.pdf, object 3 0 at offset 181: Bad token found while scanning content stream; not attempting to remove unreferenced objects from this object WARNING: empty PDF: content normalization encountered bad tokens WARNING: empty PDF: normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents