From a01359189b32c60c2d55b039f7aefd6c3ce0ebde Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Fri, 4 Jan 2019 10:17:33 -0500 Subject: [PATCH] Fix dangling references (fixes #240) On certain operations, such as iterating through all objects and adding new indirect objects, walk through the entire object structure and explicitly resolve any indirect references to non-existent objects. That prevents new objects from springing into existence and causing the previously dangling references to point to them. --- ChangeLog | 9 ++ include/qpdf/QPDF.hh | 15 ++- libqpdf/QPDF.cc | 127 +++++++++++++++--- libqpdf/QPDFWriter.cc | 1 + qpdf/qpdf.testcov | 2 +- qpdf/qtest/qpdf.test | 16 +++ .../qtest/qpdf/dangling-refs-dangling-out.pdf | Bin 0 -> 893 bytes qpdf/qtest/qpdf/dangling-refs-dangling.out | 13 ++ qpdf/qtest/qpdf/dangling-refs.pdf | 104 ++++++++++++++ qpdf/qtest/qpdf/issue-101.out | 26 ++++ qpdf/qtest/qpdf/issue-117.out | 8 ++ qpdf/qtest/qpdf/issue-120.out | 4 + qpdf/qtest/qpdf/issue-143.out | 3 + qpdf/qtest/qpdf/issue-51.out | 5 + qpdf/qtest/qpdf/minimal-dangling-out.pdf | Bin 0 -> 853 bytes qpdf/qtest/qpdf/minimal-dangling.out | 9 ++ qpdf/test_driver.cc | 19 +++ 17 files changed, 344 insertions(+), 17 deletions(-) create mode 100644 qpdf/qtest/qpdf/dangling-refs-dangling-out.pdf create mode 100644 qpdf/qtest/qpdf/dangling-refs-dangling.out create mode 100644 qpdf/qtest/qpdf/dangling-refs.pdf create mode 100644 qpdf/qtest/qpdf/minimal-dangling-out.pdf create mode 100644 qpdf/qtest/qpdf/minimal-dangling.out diff --git a/ChangeLog b/ChangeLog index 2c73731a..124227ea 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,12 @@ +2019-01-04 Jay Berkenbilt + + * Detect and recover from dangling references. If a PDF file + contained an indirect reference to a non-existent object (which is + valid), when adding a new object to the file, it was possible for + the new object to take the object ID of the dangling reference, + thereby causing the dangling reference to point to the new object. + This case is now prevented. Fixes #240. + 2019-01-03 Jay Berkenbilt * Fix behavior of form field value setting to handle the following diff --git a/include/qpdf/QPDF.hh b/include/qpdf/QPDF.hh index 2e27d831..34e0ad31 100644 --- a/include/qpdf/QPDF.hh +++ b/include/qpdf/QPDF.hh @@ -431,9 +431,21 @@ class QPDF QPDF_DLL void showXRefTable(); + // Detect all indirect references to objects that don't exist and + // resolve them by replacing them with null, which is how the PDF + // spec says to interpret such dangling references. This method is + // called automatically if you try to add any new objects, if you + // call getAllObjects, and before a file is written. The qpdf + // object caches whether it has run this to avoid running it + // multiple times. You can pass true to force it to run again if + // you have explicitly added new objects that may have additional + // dangling references. + QPDF_DLL + void fixDanglingReferences(bool force = false); + // Return the approximate number of indirect objects. It is // approximate because not all objects in the file are preserved - // in all cases. + // in all cases, and gaps in object numbering are not preserved. QPDF_DLL size_t getObjectCount(); @@ -1199,6 +1211,7 @@ class QPDF CopiedStreamDataProvider* copied_stream_data_provider; std::set attachment_streams; bool reconstructed_xref; + bool fixed_dangling_refs; // Linearization data qpdf_offset_t first_xref_item_offset; // actual value from file diff --git a/libqpdf/QPDF.cc b/libqpdf/QPDF.cc index 75ee1f13..caea6424 100644 --- a/libqpdf/QPDF.cc +++ b/libqpdf/QPDF.cc @@ -94,6 +94,7 @@ QPDF::Members::Members() : pushed_inherited_attributes_to_pages(false), copied_stream_data_provider(0), reconstructed_xref(false), + fixed_dangling_refs(false), first_xref_item_offset(0), uncompressed_after_compressed(false) { @@ -1218,33 +1219,129 @@ QPDF::showXRefTable() } } +void +QPDF::fixDanglingReferences(bool force) +{ + if (this->m->fixed_dangling_refs && (! force)) + { + return; + } + this->m->fixed_dangling_refs = true; + + // Create a set of all known indirect objects including those + // we've previously resolved and those that we have created. + std::set to_process; + for (std::map::iterator iter = + this->m->obj_cache.begin(); + iter != this->m->obj_cache.end(); ++iter) + { + to_process.insert((*iter).first); + } + for (std::map::iterator iter = + this->m->xref_table.begin(); + iter != this->m->xref_table.end(); ++iter) + { + to_process.insert((*iter).first); + } + + // For each non-scalar item to process, put it in the queue. + std::list queue; + queue.push_back(this->m->trailer); + for (std::set::iterator iter = to_process.begin(); + iter != to_process.end(); ++iter) + { + QPDFObjectHandle obj = QPDFObjectHandle::Factory::newIndirect( + this, (*iter).getObj(), (*iter).getGen()); + if (obj.isDictionary() || obj.isArray()) + { + queue.push_back(obj); + } + else if (obj.isStream()) + { + queue.push_back(obj.getDict()); + } + } + + // Process the queue by recursively resolving all object + // references. We don't need to do loop detection because we don't + // traverse known indirect objects when processing the queue. + while (! queue.empty()) + { + QPDFObjectHandle obj = queue.front(); + queue.pop_front(); + std::list to_check; + if (obj.isDictionary()) + { + std::map members = + obj.getDictAsMap(); + for (std::map::iterator iter = + members.begin(); + iter != members.end(); ++iter) + { + to_check.push_back((*iter).second); + } + } + else if (obj.isArray()) + { + std::vector elements = obj.getArrayAsVector(); + for (std::vector::iterator iter = + elements.begin(); + iter != elements.end(); ++iter) + { + to_check.push_back(*iter); + } + } + for (std::list::iterator iter = to_check.begin(); + iter != to_check.end(); ++iter) + { + QPDFObjectHandle sub = *iter; + if (sub.isIndirect()) + { + if (sub.getOwningQPDF() == this) + { + QPDFObjGen og(sub.getObjGen()); + if (this->m->obj_cache.count(og) == 0) + { + QTC::TC("qpdf", "QPDF detected dangling ref"); + queue.push_back(sub); + } + } + } + else + { + queue.push_back(sub); + } + } + + } +} + size_t QPDF::getObjectCount() { // This method returns the next available indirect object number. - // makeIndirectObject uses it for this purpose. - QPDFObjGen o1(0, 0); + // makeIndirectObject uses it for this purpose. After + // fixDanglingReferences is called, all objects in the xref table + // will also be in obj_cache. + fixDanglingReferences(); + QPDFObjGen og(0, 0); if (! this->m->obj_cache.empty()) { - o1 = (*(this->m->obj_cache.rbegin())).first; + og = (*(this->m->obj_cache.rbegin())).first; } - QPDFObjGen o2(0, 0); - if (! this->m->xref_table.empty()) - { - o2 = (*(this->m->xref_table.rbegin())).first; - } - QTC::TC("qpdf", "QPDF indirect last obj from xref", - (o2.getObj() > o1.getObj()) ? 1 : 0); - return std::max(o1.getObj(), o2.getObj()); + return og.getObj(); } std::vector QPDF::getAllObjects() { + // After fixDanglingReferences is called, all objects are in the + // object cache. + fixDanglingReferences(true); std::vector result; - for (std::map::iterator iter = - this->m->xref_table.begin(); - iter != this->m->xref_table.end(); ++iter) + for (std::map::iterator iter = + this->m->obj_cache.begin(); + iter != this->m->obj_cache.end(); ++iter) { QPDFObjGen const& og = (*iter).first; @@ -1752,7 +1849,6 @@ QPDF::resolve(int objid, int generation) } ResolveRecorder rr(this, og); - // PDF spec says unknown objects resolve to the null object. if ((! this->m->obj_cache.count(og)) && this->m->xref_table.count(og)) { QPDFXRefEntry const& entry = this->m->xref_table[og]; @@ -1800,6 +1896,7 @@ QPDF::resolve(int objid, int generation) } if (this->m->obj_cache.count(og) == 0) { + // PDF spec says unknown objects resolve to the null object. QTC::TC("qpdf", "QPDF resolve failure to null"); QPDFObjectHandle oh = QPDFObjectHandle::newNull(); this->m->obj_cache[og] = diff --git a/libqpdf/QPDFWriter.cc b/libqpdf/QPDFWriter.cc index 71c48333..22b990ff 100644 --- a/libqpdf/QPDFWriter.cc +++ b/libqpdf/QPDFWriter.cc @@ -2244,6 +2244,7 @@ QPDFWriter::prepareFileForWrite() // includes stream lengths, stream filtering parameters, and // document extension level information. + this->m->pdf.fixDanglingReferences(true); std::list queue; queue.push_back(getTrimmedTrailer()); std::set visited; diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov index 79537417..045f99cd 100644 --- a/qpdf/qpdf.testcov +++ b/qpdf/qpdf.testcov @@ -82,7 +82,6 @@ QPDFObjectHandle clone dictionary 0 QPDFObjectHandle makeDirect loop 0 QPDFObjectHandle ERR clone stream 0 QPDFTokenizer allow pound anywhere in name 0 -QPDF indirect last obj from xref 1 QPDF default for xref stream field 0 0 QPDF prev key in xref stream dictionary 0 QPDF prev key in trailer dictionary 0 @@ -402,3 +401,4 @@ QPDFFormFieldObjectHelper list not found 0 QPDFFormFieldObjectHelper list found 0 QPDFFormFieldObjectHelper list first too low 0 QPDFFormFieldObjectHelper list last too high 0 +QPDF detected dangling ref 0 diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test index cd1db365..0ef397c2 100644 --- a/qpdf/qtest/qpdf.test +++ b/qpdf/qtest/qpdf.test @@ -173,6 +173,22 @@ $td->runtest("\@file exists and file doesn't", {$td->FILE => "check-at-file.out", $td->EXIT_STATUS => 0}, $td->NORMALIZE_NEWLINES); +show_ntests(); +# ---------- +$td->notify("--- Dangling Refs ---"); +my @dangling = (qw(minimal dangling-refs)); +$n_tests += 2 * scalar(@dangling); + +foreach my $f (@dangling) +{ + $td->runtest("dangling refs: $f", + {$td->COMMAND => "test_driver 53 $f.pdf"}, + {$td->FILE => "$f-dangling.out", $td->EXIT_STATUS => 0}, + $td->NORMALIZE_NEWLINES); + $td->runtest("check output", + {$td->FILE => "a.pdf"}, + {$td->FILE => "$f-dangling-out.pdf"}); +} show_ntests(); # ---------- $td->notify("--- Form Tests ---"); diff --git a/qpdf/qtest/qpdf/dangling-refs-dangling-out.pdf b/qpdf/qtest/qpdf/dangling-refs-dangling-out.pdf new file mode 100644 index 0000000000000000000000000000000000000000..96f810689ec22b1301cb079e30eba2cda4dc59f4 GIT binary patch literal 893 zcmZWn%Z}496y+hL5{ZAfNK^s|#&I0yAyw5pIu#YKPFsmZ)rFfnB`9N$5*IqN;s^Kx zR(t|Kz_ufV#0u8KPq0H=r>{|Q7DUCl~hJ5Qv$EF$O}2i#G#&} zx{FUhJqTlOdF}!WS5`<*mU>ns=)`(#Yuu?mCbMX&?q)&2 zIpdJR2BSD6yFhI)N-@V>EX2%ru^9XBV9dvGcQBtv|6x41*4Gpw%P|p~b5Cn~5$&xe z{p>;l2G(i8un|(9iXcguH<5l2GcQU~H{~>ryfIIsxY=M>!DBB#hfy421&jQ^3w*{R Y=7lkDj+QZt!c?Q-F?Z_q<5z9xA1{LRJ^%m! literal 0 HcmV?d00001 diff --git a/qpdf/qtest/qpdf/dangling-refs-dangling.out b/qpdf/qtest/qpdf/dangling-refs-dangling.out new file mode 100644 index 00000000..cf1522c4 --- /dev/null +++ b/qpdf/qtest/qpdf/dangling-refs-dangling.out @@ -0,0 +1,13 @@ +all objects +1 0 R +2 0 R +3 0 R +4 0 R +5 0 R +6 0 R +7 0 R +8 0 R +9 0 R +10 0 R +11 0 R +test 53 done diff --git a/qpdf/qtest/qpdf/dangling-refs.pdf b/qpdf/qtest/qpdf/dangling-refs.pdf new file mode 100644 index 00000000..77c66616 --- /dev/null +++ b/qpdf/qtest/qpdf/dangling-refs.pdf @@ -0,0 +1,104 @@ +%PDF-1.3 +%¿÷¢þ +%QDF-1.0 + +1 0 obj +<< + /Pages 2 0 R + /Type /Catalog + /Dangling 8 0 R + /AlsoDangling [ + 9 0 R + << + /yes 2 0 R + /no 10 0 R + /nope 8 0 R + >> + ] +>> +endobj + +2 0 obj +<< + /Count 1 + /Kids [ + 3 0 R + ] + /Type /Pages +>> +endobj + +%% Page 1 +3 0 obj +<< + /Contents 4 0 R + /MediaBox [ + 0 + 0 + 612 + 792 + ] + /Parent 2 0 R + /Resources << + /Font << + /F1 6 0 R + >> + /ProcSet 7 0 R + >> + /Type /Page +>> +endobj + +%% Contents for page 1 +4 0 obj +<< + /Length 5 0 R +>> +stream +BT + /F1 24 Tf + 72 720 Td + (Potato) Tj +ET +endstream +endobj + +5 0 obj +44 +endobj + +6 0 obj +<< + /BaseFont /Helvetica + /Encoding /WinAnsiEncoding + /Name /F1 + /Subtype /Type1 + /Type /Font +>> +endobj + +7 0 obj +[ + /PDF + /Text +] +endobj + +xref +0 8 +0000000000 65535 f +0000000025 00000 n +0000000195 00000 n +0000000277 00000 n +0000000492 00000 n +0000000591 00000 n +0000000610 00000 n +0000000728 00000 n +trailer << + /Root 1 0 R + /Size 8 + /ID [<7141a6cf32de469328cf0f51982b5f89><7141a6cf32de469328cf0f51982b5f89>] +>> +startxref +763 +%%EOF diff --git a/qpdf/qtest/qpdf/issue-101.out b/qpdf/qtest/qpdf/issue-101.out index f1e4d03a..fdaa4d4d 100644 --- a/qpdf/qtest/qpdf/issue-101.out +++ b/qpdf/qtest/qpdf/issue-101.out @@ -122,6 +122,32 @@ WARNING: issue-101.pdf (object 11 0, offset 1357): unknown token while reading o WARNING: issue-101.pdf (object 11 0, offset 1359): unknown token while reading object; treating as string WARNING: issue-101.pdf (object 11 0, offset 1368): unexpected ) WARNING: issue-101.pdf (object 11 0, offset 1373): expected endobj +WARNING: issue-101.pdf (object 2 0, offset 244): unknown token while reading object; treating as string +WARNING: issue-101.pdf (object 7 0, offset 3855): unknown token while reading object; treating as string +WARNING: issue-101.pdf (object 7 0, offset 3863): treating unexpected brace token as null +WARNING: issue-101.pdf (object 7 0, offset 3864): unknown token while reading object; treating as string +WARNING: issue-101.pdf (object 7 0, offset 3866): unknown token while reading object; treating as string +WARNING: issue-101.pdf (object 7 0, offset 3873): unknown token while reading object; treating as string +WARNING: issue-101.pdf (object 7 0, offset 3879): unknown token while reading object; treating as string +WARNING: issue-101.pdf (object 7 0, offset 3888): unknown token while reading object; treating as string +WARNING: issue-101.pdf (object 7 0, offset 3901): unknown token while reading object; treating as string +WARNING: issue-101.pdf (object 7 0, offset 3905): unknown token while reading object; treating as string +WARNING: issue-101.pdf (object 7 0, offset 3913): unknown token while reading object; treating as string +WARNING: issue-101.pdf (object 7 0, offset 3847): expected dictionary key but found non-name object; inserting key /QPDFFake1 +WARNING: issue-101.pdf (object 7 0, offset 3847): expected dictionary key but found non-name object; inserting key /QPDFFake2 +WARNING: issue-101.pdf (object 7 0, offset 3847): expected dictionary key but found non-name object; inserting key /QPDFFake3 +WARNING: issue-101.pdf (object 7 0, offset 3847): expected dictionary key but found non-name object; inserting key /QPDFFake4 +WARNING: issue-101.pdf (object 7 0, offset 3847): expected dictionary key but found non-name object; inserting key /QPDFFake5 +WARNING: issue-101.pdf (object 7 0, offset 3847): expected dictionary key but found non-name object; inserting key /QPDFFake6 +WARNING: issue-101.pdf (object 7 0, offset 3847): expected dictionary key but found non-name object; inserting key /QPDFFake7 +WARNING: issue-101.pdf (object 7 0, offset 3847): expected dictionary key but found non-name object; inserting key /QPDFFake8 +WARNING: issue-101.pdf (object 7 0, offset 3847): expected dictionary key but found non-name object; inserting key /QPDFFake9 +WARNING: issue-101.pdf (object 7 0, offset 3847): expected dictionary key but found non-name object; inserting key /QPDFFake10 +WARNING: issue-101.pdf (object 7 0, offset 3844): stream dictionary lacks /Length key +WARNING: issue-101.pdf (object 7 0, offset 3962): attempting to recover stream length +WARNING: issue-101.pdf (object 7 0, offset 3962): recovered stream length: 12 WARNING: issue-101.pdf (object 8 0, offset 4067): invalid character ()) in hexstring WARNING: issue-101.pdf (object 8 0, offset 4069): expected endobj +WARNING: issue-101.pdf (object 9 0, offset 2832): unknown token while reading object; treating as string +WARNING: issue-101.pdf (object 9 0, offset 2834): expected endobj qpdf: operation succeeded with warnings; resulting file may have some problems diff --git a/qpdf/qtest/qpdf/issue-117.out b/qpdf/qtest/qpdf/issue-117.out index e831f289..18157749 100644 --- a/qpdf/qtest/qpdf/issue-117.out +++ b/qpdf/qtest/qpdf/issue-117.out @@ -5,4 +5,12 @@ WARNING: issue-117.pdf (offset 66): loop detected resolving object 2 0 WARNING: issue-117.pdf (object 2 0, offset 22): /Length key in stream dictionary is not an integer WARNING: issue-117.pdf (object 2 0, offset 67): attempting to recover stream length WARNING: issue-117.pdf (object 2 0, offset 67): recovered stream length: 91 +WARNING: issue-117.pdf (object 5 0, offset 1559): expected endstream +WARNING: issue-117.pdf (object 5 0, offset 349): attempting to recover stream length +WARNING: issue-117.pdf (object 5 0, offset 349): recovered stream length: 762 +WARNING: issue-117.pdf (object 5 0, offset 1121): expected endobj +WARNING: issue-117.pdf (object 7 0, offset 1791): unknown token while reading object; treating as string +WARNING: issue-117.pdf (object 7 0, offset 1267): /Length key in stream dictionary is not an integer +WARNING: issue-117.pdf (object 7 0, offset 1418): attempting to recover stream length +WARNING: issue-117.pdf (object 7 0, offset 1418): recovered stream length: 347 attempt to make a stream into a direct object diff --git a/qpdf/qtest/qpdf/issue-120.out b/qpdf/qtest/qpdf/issue-120.out index a54bae5b..ec32f9de 100644 --- a/qpdf/qtest/qpdf/issue-120.out +++ b/qpdf/qtest/qpdf/issue-120.out @@ -1,3 +1,7 @@ WARNING: issue-120.pdf (offset 85): loop detected resolving object 3 0 WARNING: issue-120.pdf (object 6 0, offset 85): supposed object stream 3 is not a stream +WARNING: issue-120.pdf: file is damaged +WARNING: issue-120.pdf (object 8 10, offset 26880): expected n n obj +WARNING: issue-120.pdf: Attempting to reconstruct cross-reference table +WARNING: issue-120.pdf: object 8 10 not found in file after regenerating cross reference table qpdf: operation succeeded with warnings; resulting file may have some problems diff --git a/qpdf/qtest/qpdf/issue-143.out b/qpdf/qtest/qpdf/issue-143.out index 307a726e..bded2e00 100644 --- a/qpdf/qtest/qpdf/issue-143.out +++ b/qpdf/qtest/qpdf/issue-143.out @@ -14,4 +14,7 @@ WARNING: issue-143.pdf (object 1 0, offset 21): stream dictionary lacks /Length WARNING: issue-143.pdf (object 1 0, offset 84): attempting to recover stream length WARNING: issue-143.pdf (object 1 0, offset 84): recovered stream length: 606 WARNING: issue-143.pdf object stream 1 (object 2 0, offset 33): expected dictionary key but found non-name object; inserting key /QPDFFake1 +WARNING: issue-143.pdf (object 2 0, offset 84): supposed object stream 12336 is not a stream +WARNING: issue-143.pdf (object 2 0, offset 84): supposed object stream 12336 is not a stream +WARNING: issue-143.pdf (object 2 0, offset 84): supposed object stream 12336 is not a stream qpdf: operation succeeded with warnings; resulting file may have some problems diff --git a/qpdf/qtest/qpdf/issue-51.out b/qpdf/qtest/qpdf/issue-51.out index e361b7d0..7c16e23a 100644 --- a/qpdf/qtest/qpdf/issue-51.out +++ b/qpdf/qtest/qpdf/issue-51.out @@ -8,3 +8,8 @@ WARNING: issue-51.pdf (object 2 0, offset 71): attempting to recover stream leng WARNING: issue-51.pdf (object 2 0, offset 71): unable to recover stream data; treating stream as empty WARNING: issue-51.pdf (object 2 0, offset 977): expected endobj WARNING: issue-51.pdf (object 2 0, offset 977): EOF after endobj +WARNING: issue-51.pdf (object 3 0): object has offset 0 +WARNING: issue-51.pdf (object 4 0): object has offset 0 +WARNING: issue-51.pdf (object 5 0): object has offset 0 +WARNING: issue-51.pdf (object 6 0): object has offset 0 +WARNING: issue-51.pdf (object 8 0): object has offset 0 diff --git a/qpdf/qtest/qpdf/minimal-dangling-out.pdf b/qpdf/qtest/qpdf/minimal-dangling-out.pdf new file mode 100644 index 0000000000000000000000000000000000000000..48dff413bcec9907f276e674491131e62a111889 GIT binary patch literal 853 zcmah{%Z}496y+hLVu^paNK^s|a2&^ZNL2+horsE8r>(@I>cZ1{N>C;qCGJdN#SicY ztoQ_efNe(zi50AepJ0c$PA1K$EN~WaPJGU}_a3(gz4!rp$VvOl?@xaS0|)fM8`9|j z9ms_$z(r&Tbe{pQcH_6l3h1shGSv&{c8SX7mSPXq)Lp&I3^1TilX(FX@Ji|wXhp7+ z8>&^d)Vs&pNTcu7l9tv;W=3VE08whu=W3qF9lb(z2Opog5JWDZ$3W(Iur7S43cbu{ z*oo~a*7&E~Fz`#EZWkH3o{f}&V1qPuUh2>~*X2A@*}}XAFGS5GH7dt+Dy4>-^f4uUBub zahey*i@Dk|Yu6oFsImq6M5QOnBr^$gFGI^DTL68PWRJ5VX@D2<2(yYYh|%)El)>4~ z@VjcT)M|3D{adREj-(gkA+<6vtX0}P5@)l(7lI2ogbl`UNOnQiU;*a1g)xqq zZ({=ew=j-M+nC3~e=y-*@-?|kQcT3=Jk;7=#PSZ0l6ML)uzn9F9nL%^BG all = pdf.getAllObjects(); + for (std::vector::iterator iter = all.begin(); + iter != all.end(); ++iter) + { + std::cout << (*iter).unparse() << std::endl; + } + + QPDFWriter w(pdf, "a.pdf"); + w.setStaticID(true); + w.write(); + } else { throw std::runtime_error(std::string("invalid test ") +