2
1
mirror of https://github.com/qpdf/qpdf.git synced 2024-05-29 00:10:54 +00:00

Remove unreferenced: ignore names that are not Fonts or XObjects

Converted ResourceFinder to ParserCallbacks so we can better detect
the name that precedes various operators and use the operators to sort
the names into resource types. This enables us to be smarter about
detecting unreferenced resources in pages and also sets the stage for
reconciling differences in /DR across documents.
This commit is contained in:
Jay Berkenbilt 2021-03-01 16:43:35 -05:00
parent a15ec6967d
commit e17585c2d2
6 changed files with 83 additions and 35 deletions

View File

@ -1,5 +1,12 @@
2021-03-01 Jay Berkenbilt <ejb@ql.org> 2021-03-01 Jay Berkenbilt <ejb@ql.org>
* Improve code that finds unreferenced resources to ignore names
in the content stream that are not fonts or XObjects. This should
reduce the number of cases when qpdf needlessly decides not to
remove unreferenced resources. Hopefully it doesn't create any new
bugs where it removes resources that it isn't supposed to
remove.
* QPDFObjectHandle::ParserCallbacks: add virtual handleWarning * QPDFObjectHandle::ParserCallbacks: add virtual handleWarning
method, and provide default (empty) implementation of it and method, and provide default (empty) implementation of it and
handleEOF(). handleEOF().

9
TODO
View File

@ -34,15 +34,6 @@ Document-level work
--copy-attachments-from to preserve these. What will the strategy be --copy-attachments-from to preserve these. What will the strategy be
for deduplicating in the automatic case? for deduplicating in the automatic case?
* When I get to tagged PDF, note that the presence of /Artifact and
/Standard (and maybe others?) causes a false positive on detection
of unresolved names. Example: form-fields-and-annotations.pdf. This
used to give a warning (never in a released version), but the
warning was removed. See comments about tagged pdf in
QPDFPageObjectHelper::removeUnreferencedResourcesHelper. Another
potential solution is to recognize names that refer to fonts and
xobjects by only looking at names used with Tf and Do operators.
Fuzz Errors Fuzz Errors
=========== ===========

View File

@ -684,7 +684,7 @@ QPDFPageObjectHelper::removeUnreferencedResourcesHelper(
ResourceFinder rf; ResourceFinder rf;
try try
{ {
ph.filterContents(&rf); ph.parseContents(&rf);
} }
catch (std::exception& e) catch (std::exception& e)
{ {
@ -711,9 +711,9 @@ QPDFPageObjectHelper::removeUnreferencedResourcesHelper(
QPDFObjectHandle resources = ph.getAttribute("/Resources", true); QPDFObjectHandle resources = ph.getAttribute("/Resources", true);
std::vector<QPDFObjectHandle> rdicts; std::vector<QPDFObjectHandle> rdicts;
std::set<std::string> known_names; std::set<std::string> known_names;
std::vector<std::string> to_filter = {"/Font", "/XObject"};
if (resources.isDictionary()) if (resources.isDictionary())
{ {
std::vector<std::string> to_filter = {"/Font", "/XObject"};
for (auto const& iter: to_filter) for (auto const& iter: to_filter)
{ {
QPDFObjectHandle dict = resources.getKey(iter); QPDFObjectHandle dict = resources.getKey(iter);
@ -729,12 +729,17 @@ QPDFPageObjectHelper::removeUnreferencedResourcesHelper(
} }
std::set<std::string> local_unresolved; std::set<std::string> local_unresolved;
for (auto const& name: rf.getNames()) auto names_by_rtype = rf.getNamesByResourceType();
for (auto const& i1: to_filter)
{ {
if (! known_names.count(name)) for (auto const& n_iter: names_by_rtype[i1])
{ {
unresolved.insert(name); std::string const& name = n_iter.first;
local_unresolved.insert(name); if (! known_names.count(name))
{
unresolved.insert(name);
local_unresolved.insert(name);
}
} }
} }
// Older versions of the PDF spec allowed form XObjects to omit // Older versions of the PDF spec allowed form XObjects to omit
@ -754,11 +759,17 @@ QPDFPageObjectHelper::removeUnreferencedResourcesHelper(
if ((! local_unresolved.empty()) && resources.isDictionary()) if ((! local_unresolved.empty()) && resources.isDictionary())
{ {
// Don't issue a warning for this case. There are some cases // It's not worth issuing a warning for this case. From qpdf
// of names that aren't XObject references, for example, // 10.3, we are hopefully only looking at names that are
// /Artifact in tagged PDF. Until we are certain that we know // referencing fonts and XObjects, but until we're certain
// the meaning of every name in a content stream, we don't // that we know the meaning of every name in a content stream,
// want to give warnings because they will be false positives. // we don't want to give warnings that might be false
// positives. Also, this can happen in legitimate cases with
// older PDFs, and there's nothing to be done about it, so
// there's no good reason to issue a warning. The only sad
// thing is that it was a false positive that alerted me to a
// logic error in the code, and any future such errors would
// now be hidden.
QTC::TC("qpdf", "QPDFPageObjectHelper unresolved names"); QTC::TC("qpdf", "QPDFPageObjectHelper unresolved names");
return false; return false;
} }

View File

@ -1,28 +1,53 @@
#include <qpdf/ResourceFinder.hh> #include <qpdf/ResourceFinder.hh>
ResourceFinder::ResourceFinder() : ResourceFinder::ResourceFinder() :
last_name_offset(0),
saw_bad(false) saw_bad(false)
{ {
} }
void void
ResourceFinder::handleToken(QPDFTokenizer::Token const& token) ResourceFinder::handleObject(QPDFObjectHandle obj, size_t offset, size_t)
{ {
if ((token.getType() == QPDFTokenizer::tt_word) && if (obj.isOperator() && (! this->last_name.empty()))
(! this->last_name.empty()))
{ {
this->names.insert(this->last_name); static std::map<std::string, std::string> op_to_rtype = {
{"CS", "/ColorSpace"},
{"cs", "/ColorSpace"},
{"gs", "/ExtGState"},
{"Tf", "/Font"},
{"SCN", "/Pattern"},
{"scn", "/Pattern"},
{"BDC", "/Properties"},
{"DP", "/Properties"},
{"sh", "/Shading"},
{"Do", "/XObject"},
};
std::string op = obj.getOperatorValue();
std::string resource_type;
auto iter = op_to_rtype.find(op);
if (iter != op_to_rtype.end())
{
resource_type = iter->second;
}
if (! resource_type.empty())
{
this->names.insert(this->last_name);
this->names_by_resource_type[
resource_type][this->last_name].insert(this->last_name_offset);
}
} }
else if (token.getType() == QPDFTokenizer::tt_name) else if (obj.isName())
{ {
this->last_name = this->last_name = obj.getName();
QPDFObjectHandle::newName(token.getValue()).getName(); this->last_name_offset = offset;
} }
else if (token.getType() == QPDFTokenizer::tt_bad) }
{
saw_bad = true; void
} ResourceFinder::handleWarning()
writeToken(token); {
this->saw_bad = true;
} }
std::set<std::string> const& std::set<std::string> const&
@ -31,6 +56,12 @@ ResourceFinder::getNames() const
return this->names; return this->names;
} }
std::map<std::string, std::map<std::string, std::set<size_t>>> const&
ResourceFinder::getNamesByResourceType() const
{
return this->names_by_resource_type;
}
bool bool
ResourceFinder::sawBad() const ResourceFinder::sawBad() const
{ {

View File

@ -3,19 +3,26 @@
#include <qpdf/QPDFObjectHandle.hh> #include <qpdf/QPDFObjectHandle.hh>
class ResourceFinder: public QPDFObjectHandle::TokenFilter class ResourceFinder: public QPDFObjectHandle::ParserCallbacks
{ {
public: public:
ResourceFinder(); ResourceFinder();
virtual ~ResourceFinder() = default; virtual ~ResourceFinder() = default;
virtual void handleToken(QPDFTokenizer::Token const&) override; virtual void handleObject(QPDFObjectHandle, size_t, size_t) override;
virtual void handleWarning() override;
std::set<std::string> const& getNames() const; std::set<std::string> const& getNames() const;
std::map<std::string,
std::map<std::string,
std::set<size_t>>> const& getNamesByResourceType() const;
bool sawBad() const; bool sawBad() const;
private: private:
std::string last_name; std::string last_name;
size_t last_name_offset;
std::set<std::string> names; std::set<std::string> names;
std::map<std::string, std::set<std::string>> names_by_resource_type; std::map<std::string,
std::map<std::string,
std::set<size_t>>> names_by_resource_type;
bool saw_bad; bool saw_bad;
}; };

View File

@ -1,3 +1,4 @@
WARNING: page object 3 0 stream 5 0, stream 7 0, stream 9 0, stream 11 0 (content, offset 375): null character not allowed in name token
WARNING: split-tokens.pdf, object 3 0 at offset 181: Bad token found while scanning content stream; not attempting to remove unreferenced objects from this object WARNING: split-tokens.pdf, object 3 0 at offset 181: Bad token found while scanning content stream; not attempting to remove unreferenced objects from this object
WARNING: empty PDF: content normalization encountered bad tokens WARNING: empty PDF: content normalization encountered bad tokens
WARNING: empty PDF: normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents WARNING: empty PDF: normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents