Refactor the creation of unresolved objects

Create unresolved objects only for objects in the xref table (except during
parsing of the xref table). Do not add indirect nulls into the object
cache as the result of a cache miss during a call to getObject except
during parsing or creation/updating from JSON. To support this behaviour,
add new private methods getObjectForParser and getObjectForJSON.

As a result of this change, dangling references are treated as direct nulls
rather than indirect nulls.
This commit is contained in:
m-holger 2024-03-08 16:57:56 +00:00
parent 6ae439f180
commit 542cb91b7d
15 changed files with 177 additions and 195 deletions

View File

@ -814,7 +814,8 @@ class QPDF
}
};
// The ParseGuard class allows QPDFParser to detect re-entrant parsing.
// The ParseGuard class allows QPDFParser to detect re-entrant parsing. It also provides
// special access to allow the parser to create unresolved objects and dangling references.
class ParseGuard
{
friend class QPDFParser;
@ -827,6 +828,13 @@ class QPDF
qpdf->inParse(true);
}
}
// Accessor exposed through ParseGuard: forwards the object id and generation to
// QPDF::getObjectForParser on the given QPDF and returns its result unchanged.
// Note that qpdf is dereferenced without a null check here.
static std::shared_ptr<QPDFObject>
getObject(QPDF* qpdf, int id, int gen)
{
return qpdf->getObjectForParser(id, gen);
}
~ParseGuard()
{
if (qpdf) {
@ -1051,13 +1059,14 @@ class QPDF
void resolve(QPDFObjGen og);
void resolveObjectsInStream(int obj_stream_number);
void stopOnError(std::string const& message);
QPDFObjectHandle reserveObjectIfNotExists(QPDFObjGen const& og);
QPDFObjectHandle reserveStream(QPDFObjGen const& og);
QPDFObjGen nextObjGen();
QPDFObjectHandle newIndirect(QPDFObjGen const&, std::shared_ptr<QPDFObject> const&);
QPDFObjectHandle makeIndirectFromQPDFObject(std::shared_ptr<QPDFObject> const& obj);
bool isCached(QPDFObjGen const& og);
bool isUnresolved(QPDFObjGen const& og);
std::shared_ptr<QPDFObject> getObjectForParser(int id, int gen);
std::shared_ptr<QPDFObject> getObjectForJSON(int id, int gen);
void removeObject(QPDFObjGen og);
void updateCache(
QPDFObjGen const& og,

View File

@ -1932,31 +1932,50 @@ QPDF::newStream(std::string const& data)
return result;
}
// Return a handle for object `og`. If the object is already cached or is listed in the
// xref table, defer to getObject(); otherwise place a QPDF_Reserved placeholder in the
// object cache and return an indirect handle to that placeholder.
QPDFObjectHandle
QPDF::reserveObjectIfNotExists(QPDFObjGen const& og)
{
    // Known object: either already in the cache or present in the xref table.
    if (isCached(og) || m->xref_table.count(og) != 0) {
        return getObject(og);
    }

    // Unknown object: reserve a cache slot for it.
    updateCache(og, QPDF_Reserved::create(), -1, -1);
    return newIndirect(og, m->obj_cache[og].object);
}
// Return a handle wrapping a newly created QPDF_Stream for object `og`, associated with
// this QPDF and given a dictionary obtained from QPDFObjectHandle::newDictionary().
// NOTE(review): the two trailing zeros are presumably the stream offset and length --
// confirm against QPDF_Stream::create.
QPDFObjectHandle
QPDF::reserveStream(QPDFObjGen const& og)
{
return {QPDF_Stream::create(this, og, QPDFObjectHandle::newDictionary(), 0, 0)};
}
std::shared_ptr<QPDFObject>
QPDF::getObjectForParser(int id, int gen)
{
// This method is called by the parser and therefore must not resolve any objects.
auto og = QPDFObjGen(id, gen);
auto [it, inserted] = m->obj_cache.try_emplace(og);
auto& obj = it->second.object;
if (inserted) {
obj = (m->parsed && !m->xref_table.count(og)) ? QPDF_Null::create()
: QPDF_Unresolved::create(this, og);
}
return obj;
}
std::shared_ptr<QPDFObject>
QPDF::getObjectForJSON(int id, int gen)
{
auto og = QPDFObjGen(id, gen);
auto [it, inserted] = m->obj_cache.try_emplace(og);
auto& obj = it->second.object;
if (inserted) {
obj = (m->parsed && !m->xref_table.count(og)) ? QPDF_Null::create(this, og)
: QPDF_Unresolved::create(this, og);
}
return obj;
}
QPDFObjectHandle
QPDF::getObject(QPDFObjGen const& og)
{
// This method is called by the parser and therefore must not resolve any objects.
if (!isCached(og)) {
m->obj_cache[og] = ObjCache(QPDF_Unresolved::create(this, og), -1, -1);
if (auto it = m->obj_cache.find(og); it != m->obj_cache.end()) {
return {it->second.object};
} else if (m->parsed && !m->xref_table.count(og)) {
return QPDF_Null::create();
} else {
auto result = m->obj_cache.try_emplace(og, QPDF_Unresolved::create(this, og), -1, -1);
return {result.first->second.object};
}
return newIndirect(og, m->obj_cache[og].object);
}
QPDFObjectHandle

View File

@ -166,10 +166,7 @@ QPDFParser::parseRemainder(bool content_stream)
auto id = QIntC::to_int(int_buffer[(int_count - 1) % 2]);
auto gen = QIntC::to_int(int_buffer[(int_count) % 2]);
if (!(id < 1 || gen < 0 || gen >= 65535)) {
// This action has the desirable side effect of causing dangling references
// (references to indirect objects that don't appear in the PDF) in any parsed
// object to appear in the object cache.
add(std::move(context->getObject(id, gen).obj));
add(QPDF::ParseGuard::getObject(context, id, gen));
} else {
QTC::TC("qpdf", "QPDFParser invalid objgen");
addNull();

View File

@ -3,15 +3,15 @@
#include <qpdf/JSON_writer.hh>
#include <qpdf/QPDFObject_private.hh>
QPDF_Null::QPDF_Null() :
QPDFValue(::ot_null, "null")
QPDF_Null::QPDF_Null(QPDF* qpdf, QPDFObjGen og) :
QPDFValue(::ot_null, "null", qpdf, og)
{
}
std::shared_ptr<QPDFObject>
QPDF_Null::create()
QPDF_Null::create(QPDF* qpdf, QPDFObjGen og)
{
return do_create(new QPDF_Null());
return do_create(new QPDF_Null(qpdf, og));
}
std::shared_ptr<QPDFObject>

View File

@ -240,11 +240,6 @@ class QPDF::JSONReactor: public JSON::Reactor
descr(std::make_shared<QPDFValue::Description>(
QPDFValue::JSON_Descr(std::make_shared<std::string>(is->getName()), "")))
{
for (auto& oc: pdf.m->obj_cache) {
if (oc.second.object->getTypeCode() == ::ot_reserved) {
reserved.insert(oc.first);
}
}
}
~JSONReactor() override = default;
void dictionaryStart() override;
@ -305,7 +300,6 @@ class QPDF::JSONReactor: public JSON::Reactor
bool saw_data{false};
bool saw_datafile{false};
bool this_stream_needs_data{false};
std::set<QPDFObjGen> reserved;
std::vector<StackFrame> stack;
QPDFObjectHandle next_obj;
state_e next_state{st_top};
@ -420,12 +414,6 @@ QPDF::JSONReactor::containerEnd(JSON const& value)
// Handle dangling indirect object references which the PDF spec says to treat as nulls.
// It's tempting to make this an error, but that would be wrong since valid input files may
// have these.
for (auto& oc: pdf.m->obj_cache) {
if (oc.second.object->getTypeCode() == ::ot_reserved && reserved.count(oc.first) == 0) {
QTC::TC("qpdf", "QPDF_json non-trivial null reserved");
pdf.updateCache(oc.first, QPDF_Null::create(), -1, -1);
}
}
}
if (!stack.empty()) {
auto state = stack.back().state;
@ -565,7 +553,7 @@ QPDF::JSONReactor::dictionaryItem(std::string const& key, JSON const& value)
} else if (is_obj_key(key, obj, gen)) {
this->cur_object = key;
if (setNextStateIfDictionary(key, value, st_object_top)) {
next_obj = pdf.reserveObjectIfNotExists(QPDFObjGen(obj, gen));
next_obj = pdf.getObjectForJSON(obj, gen);
}
} else {
QTC::TC("qpdf", "QPDF_json bad object key");
@ -767,7 +755,7 @@ QPDF::JSONReactor::makeObject(JSON const& value)
int gen = 0;
std::string str;
if (is_indirect_object(str_v, obj, gen)) {
result = pdf.reserveObjectIfNotExists(QPDFObjGen(obj, gen));
result = pdf.getObjectForJSON(obj, gen);
} else if (is_unicode_string(str_v, str)) {
result = QPDFObjectHandle::newUnicodeString(str);
} else if (is_binary_string(str_v, str)) {

View File

@ -7,7 +7,7 @@ class QPDF_Null: public QPDFValue
{
public:
~QPDF_Null() override = default;
static std::shared_ptr<QPDFObject> create();
static std::shared_ptr<QPDFObject> create(QPDF* qpdf = nullptr, QPDFObjGen og = QPDFObjGen());
static std::shared_ptr<QPDFObject> create(
std::shared_ptr<QPDFObject> parent,
std::string_view const& static_descr,
@ -21,7 +21,7 @@ class QPDF_Null: public QPDFValue
void writeJSON(int json_version, JSON::Writer& p) override;
private:
QPDF_Null();
QPDF_Null(QPDF* qpdf = nullptr, QPDFObjGen og = QPDFObjGen());
};
#endif // QPDF_NULL_HH

View File

@ -90,17 +90,17 @@ main()
obj = QPDF_Array::create({10, "null"_qpdf.getObj()}, true);
QPDF_Array& b = *obj->as<QPDF_Array>();
b.setAt(5, pdf.getObject(5, 0));
b.setAt(5, pdf.newIndirectNull());
b.setAt(7, "[0 1 2 3]"_qpdf);
assert(b.at(3).isNull());
assert(b.at(8).isNull());
assert(b.at(5).isIndirect());
assert(b.unparse() == "[ null null null null null 5 0 R null [ 0 1 2 3 ] null null ]");
assert(b.unparse() == "[ null null null null null 3 0 R null [ 0 1 2 3 ] null null ]");
auto c = b.copy(true);
auto d = b.copy(false);
b.at(7).setArrayItem(2, "42"_qpdf);
assert(c->unparse() == "[ null null null null null 5 0 R null [ 0 1 42 3 ] null null ]");
assert(d->unparse() == "[ null null null null null 5 0 R null [ 0 1 2 3 ] null null ]");
assert(c->unparse() == "[ null null null null null 3 0 R null [ 0 1 42 3 ] null null ]");
assert(d->unparse() == "[ null null null null null 3 0 R null [ 0 1 2 3 ] null null ]");
try {
b.setAt(3, {});

View File

@ -673,7 +673,6 @@ QPDF_json ignore second-level key 0
QPDF_json ignore unknown key in object_top 0
QPDF_json ignore unknown key in trailer 0
QPDF_json ignore unknown key in stream 0
QPDF_json non-trivial null reserved 0
QPDF_json data and datafile 0
QPDF_json no stream data in update mode 0
QPDF_json updating existing stream 0

View File

@ -7,5 +7,5 @@
/nesting is direct
/strings is direct
unparse: 7 0 R
unparseResolved: << /dangling-ref-for-json-test [ 9 0 R ] /hex#20strings [ (Potato) <01020300040560> (AB) ] /indirect 8 0 R /names [ /nesting /hex#20strings /text#2fplain ] /nesting << /a [ 1 2 << /x (y) >> [ (z) ] ] /b << / (legal) /a [ 1 2 ] >> >> /strings [ (one) <24a2> () (\(\)) (\() (\)) (a\f\b\t\r\nb) (") ("") ("\("\)") <410042> (a\nb) (a b) <efbbbfcf80> <efbbbff09fa594> ] >>
unparseResolved: << /dangling-ref-for-json-test [ null ] /hex#20strings [ (Potato) <01020300040560> (AB) ] /indirect 8 0 R /names [ /nesting /hex#20strings /text#2fplain ] /nesting << /a [ 1 2 << /x (y) >> [ (z) ] ] /b << / (legal) /a [ 1 2 ] >> >> /strings [ (one) <24a2> () (\(\)) (\() (\)) (a\f\b\t\r\nb) (") ("") ("\("\)") <410042> (a\nb) (a b) <efbbbfcf80> <efbbbff09fa594> ] >>
test 1 done

View File

@ -14,14 +14,14 @@ endobj
2 0 obj
<<
/dangling-ref-for-json-test [
4 0 R
null
]
/hex#20strings [
(Potato)
<01020300040560>
(AB)
]
/indirect 5 0 R
/indirect 4 0 R
/names [
/nesting
/hex#20strings
@ -71,27 +71,22 @@ endobj
<<
/Count 1
/Kids [
6 0 R
5 0 R
]
/Type /Pages
>>
endobj
%% Original object ID: 9 0
4 0 obj
null
endobj
%% Original object ID: 8 0
5 0 obj
4 0 obj
(hello)
endobj
%% Page 1
%% Original object ID: 3 0
6 0 obj
5 0 obj
<<
/Contents 7 0 R
/Contents 6 0 R
/MediaBox [
0
0
@ -101,9 +96,9 @@ endobj
/Parent 3 0 R
/Resources <<
/Font <<
/F1 9 0 R
/F1 8 0 R
>>
/ProcSet 10 0 R
/ProcSet 9 0 R
>>
/Type /Page
>>
@ -111,9 +106,9 @@ endobj
%% Contents for page 1
%% Original object ID: 4 0
7 0 obj
6 0 obj
<<
/Length 8 0 R
/Length 7 0 R
>>
stream
BT
@ -124,12 +119,12 @@ ET
endstream
endobj
8 0 obj
7 0 obj
44
endobj
%% Original object ID: 6 0
9 0 obj
8 0 obj
<<
/BaseFont /Helvetica
/Encoding /WinAnsiEncoding
@ -140,7 +135,7 @@ endobj
endobj
%% Original object ID: 5 0
10 0 obj
9 0 obj
[
/PDF
/Text
@ -148,24 +143,23 @@ endobj
endobj
xref
0 11
0 10
0000000000 65535 f
0000000052 00000 n
0000000133 00000 n
0000000756 00000 n
0000000855 00000 n
0000000903 00000 n
0000000964 00000 n
0000001207 00000 n
0000001306 00000 n
0000001352 00000 n
0000001497 00000 n
0000000755 00000 n
0000000854 00000 n
0000000915 00000 n
0000001157 00000 n
0000001256 00000 n
0000001302 00000 n
0000001447 00000 n
trailer <<
/QTest 2 0 R
/Root 1 0 R
/Size 11
/Size 10
/ID [<31415926535897932384626433832795><31415926535897932384626433832795>]
>>
startxref
1533
1482
%%EOF

View File

@ -5,7 +5,7 @@
item 2 is direct
item 3 is indirect
item 4 is direct
item 5 is indirect
item 5 is direct
unparse: 9 0 R
unparseResolved: [ /literal null /indirect 8 0 R /undefined 10 0 R ]
unparseResolved: [ /literal null /indirect 8 0 R /undefined null ]
test 1 done

View File

@ -18,7 +18,7 @@ endobj
/indirect
4 0 R
/undefined
5 0 R
null
]
endobj
@ -27,7 +27,7 @@ endobj
<<
/Count 1
/Kids [
6 0 R
5 0 R
]
/Type /Pages
>>
@ -38,16 +38,11 @@ endobj
null
endobj
%% Original object ID: 10 0
5 0 obj
null
endobj
%% Page 1
%% Original object ID: 3 0
6 0 obj
5 0 obj
<<
/Contents 7 0 R
/Contents 6 0 R
/MediaBox [
0
0
@ -57,9 +52,9 @@ endobj
/Parent 3 0 R
/Resources <<
/Font <<
/F1 9 0 R
/F1 8 0 R
>>
/ProcSet 10 0 R
/ProcSet 9 0 R
>>
/Type /Page
>>
@ -67,9 +62,9 @@ endobj
%% Contents for page 1
%% Original object ID: 4 0
7 0 obj
6 0 obj
<<
/Length 8 0 R
/Length 7 0 R
>>
stream
BT
@ -80,12 +75,12 @@ ET
endstream
endobj
8 0 obj
7 0 obj
44
endobj
%% Original object ID: 6 0
9 0 obj
8 0 obj
<<
/BaseFont /Helvetica
/Encoding /WinAnsiEncoding
@ -96,7 +91,7 @@ endobj
endobj
%% Original object ID: 7 0
10 0 obj
9 0 obj
[
/PDF
/Text
@ -104,24 +99,23 @@ endobj
endobj
xref
0 11
0 10
0000000000 65535 f
0000000052 00000 n
0000000133 00000 n
0000000239 00000 n
0000000338 00000 n
0000000387 00000 n
0000000445 00000 n
0000000688 00000 n
0000000787 00000 n
0000000833 00000 n
0000000978 00000 n
0000000238 00000 n
0000000337 00000 n
0000000395 00000 n
0000000637 00000 n
0000000736 00000 n
0000000782 00000 n
0000000927 00000 n
trailer <<
/QTest 2 0 R
/Root 1 0 R
/Size 11
/Size 10
/ID [<06c2c8fc54c5f9cc9246898e1e1a7146><31415926535897932384626433832795>]
>>
startxref
1014
962
%%EOF

View File

@ -14,14 +14,14 @@ endobj
2 0 obj
<<
/dangling-ref-for-json-test [
4 0 R
null
]
/hex#20strings [
(Potato)
<01020300040560>
(AB)
]
/indirect 5 0 R
/indirect 4 0 R
/names [
/nesting
/hex#20strings
@ -71,37 +71,32 @@ endobj
<<
/Count 1
/Kids [
7 0 R
6 0 R
]
/Type /Pages
>>
endobj
%% Original object ID: 9 0
4 0 obj
null
endobj
%% Original object ID: 8 0
5 0 obj
4 0 obj
<<
/K /V
/Length 6 0 R
/Length 5 0 R
>>
stream
new-stream-here
endstream
endobj
6 0 obj
5 0 obj
16
endobj
%% Page 1
%% Original object ID: 3 0
7 0 obj
6 0 obj
<<
/Contents 8 0 R
/Contents 7 0 R
/MediaBox [
0
0
@ -111,9 +106,9 @@ endobj
/Parent 3 0 R
/Resources <<
/Font <<
/F1 10 0 R
/F1 9 0 R
>>
/ProcSet 11 0 R
/ProcSet 10 0 R
>>
/Type /Page
>>
@ -121,9 +116,9 @@ endobj
%% Contents for page 1
%% Original object ID: 4 0
8 0 obj
7 0 obj
<<
/Length 9 0 R
/Length 8 0 R
>>
stream
BT
@ -134,12 +129,12 @@ ET
endstream
endobj
9 0 obj
8 0 obj
44
endobj
%% Original object ID: 6 0
10 0 obj
9 0 obj
<<
/BaseFont /Helvetica
/Encoding /WinAnsiEncoding
@ -150,7 +145,7 @@ endobj
endobj
%% Original object ID: 5 0
11 0 obj
10 0 obj
[
/PDF
/Text
@ -158,25 +153,24 @@ endobj
endobj
xref
0 12
0 11
0000000000 65535 f
0000000052 00000 n
0000000133 00000 n
0000000756 00000 n
0000000855 00000 n
0000000903 00000 n
0000000982 00000 n
0000001038 00000 n
0000001282 00000 n
0000001381 00000 n
0000001427 00000 n
0000001573 00000 n
0000000755 00000 n
0000000854 00000 n
0000000933 00000 n
0000000989 00000 n
0000001232 00000 n
0000001331 00000 n
0000001377 00000 n
0000001522 00000 n
trailer <<
/QTest 2 0 R
/Root 1 0 R
/Size 12
/Size 11
/ID [<31415926535897932384626433832795><31415926535897932384626433832795>]
>>
startxref
1609
1558
%%EOF

View File

@ -14,14 +14,14 @@ endobj
2 0 obj
<<
/dangling-ref-for-json-test [
4 0 R
null
]
/hex#20strings [
(Potato)
<01020300040560>
(AB)
]
/indirect 5 0 R
/indirect 4 0 R
/names [
/nesting
/hex#20strings
@ -71,27 +71,22 @@ endobj
<<
/Count 1
/Kids [
6 0 R
5 0 R
]
/Type /Pages
>>
endobj
%% Original object ID: 9 0
4 0 obj
null
endobj
%% Original object ID: 8 0
5 0 obj
4 0 obj
(hello)
endobj
%% Page 1
%% Original object ID: 3 0
6 0 obj
5 0 obj
<<
/Contents 7 0 R
/Contents 6 0 R
/MediaBox [
0
0
@ -101,9 +96,9 @@ endobj
/Parent 3 0 R
/Resources <<
/Font <<
/F1 9 0 R
/F1 8 0 R
>>
/ProcSet 10 0 R
/ProcSet 9 0 R
>>
/Type /Page
>>
@ -111,9 +106,9 @@ endobj
%% Contents for page 1
%% Original object ID: 4 0
7 0 obj
6 0 obj
<<
/Length 8 0 R
/Length 7 0 R
>>
stream
BT
@ -124,12 +119,12 @@ ET
endstream
endobj
8 0 obj
7 0 obj
43
endobj
%% Original object ID: 6 0
9 0 obj
8 0 obj
<<
/BaseFont /Helvetica
/Encoding /WinAnsiEncoding
@ -140,7 +135,7 @@ endobj
endobj
%% Original object ID: 5 0
10 0 obj
9 0 obj
[
/PDF
/Text
@ -148,24 +143,23 @@ endobj
endobj
xref
0 11
0 10
0000000000 65535 f
0000000052 00000 n
0000000133 00000 n
0000000756 00000 n
0000000855 00000 n
0000000903 00000 n
0000000964 00000 n
0000001207 00000 n
0000001305 00000 n
0000001351 00000 n
0000001496 00000 n
0000000755 00000 n
0000000854 00000 n
0000000915 00000 n
0000001157 00000 n
0000001255 00000 n
0000001301 00000 n
0000001446 00000 n
trailer <<
/QTest 2 0 R
/Root 1 0 R
/Size 11
/Size 10
/ID [<31415926535897932384626433832795><31415926535897932384626433832795>]
>>
startxref
1532
1481
%%EOF

View File

@ -14,14 +14,14 @@ endobj
2 0 obj
<<
/dangling-ref-for-json-test [
4 0 R
null
]
/hex#20strings [
(Potato)
<01020300040560>
(AB)
]
/indirect 5 0 R
/indirect 4 0 R
/names [
/nesting
/hex#20strings
@ -71,27 +71,22 @@ endobj
<<
/Count 1
/Kids [
6 0 R
5 0 R
]
/Type /Pages
>>
endobj
%% Original object ID: 9 0
4 0 obj
null
endobj
%% Original object ID: 8 0
5 0 obj
4 0 obj
(hello)
endobj
%% Page 1
%% Original object ID: 3 0
6 0 obj
5 0 obj
<<
/Contents 7 0 R
/Contents 6 0 R
/MediaBox [
0
0
@ -101,9 +96,9 @@ endobj
/Parent 3 0 R
/Resources <<
/Font <<
/F1 9 0 R
/F1 8 0 R
>>
/ProcSet 10 0 R
/ProcSet 9 0 R
>>
/Type /Page
>>
@ -111,10 +106,10 @@ endobj
%% Contents for page 1
%% Original object ID: 4 0
7 0 obj
6 0 obj
<<
/Potato (salad)
/Length 8 0 R
/Length 7 0 R
>>
stream
BT
@ -125,12 +120,12 @@ ET
endstream
endobj
8 0 obj
7 0 obj
44
endobj
%% Original object ID: 6 0
9 0 obj
8 0 obj
<<
/BaseFont /Helvetica
/Encoding /WinAnsiEncoding
@ -141,7 +136,7 @@ endobj
endobj
%% Original object ID: 5 0
10 0 obj
9 0 obj
[
/PDF
/Text
@ -149,24 +144,23 @@ endobj
endobj
xref
0 11
0 10
0000000000 65535 f
0000000052 00000 n
0000000133 00000 n
0000000756 00000 n
0000000855 00000 n
0000000903 00000 n
0000000964 00000 n
0000001207 00000 n
0000001324 00000 n
0000001370 00000 n
0000001515 00000 n
0000000755 00000 n
0000000854 00000 n
0000000915 00000 n
0000001157 00000 n
0000001274 00000 n
0000001320 00000 n
0000001465 00000 n
trailer <<
/QTest 2 0 R
/Root 1 0 R
/Size 11
/Size 10
/ID [<31415926535897932384626433832795><31415926535897932384626433832795>]
>>
startxref
1551
1500
%%EOF