From 2994f9cf4cc45e33406de34d4bce45ca491df98e Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Sat, 6 Jan 2024 16:51:03 -0500 Subject: [PATCH] Attempt to find xref streams during recovery (fixes #1103) --- ChangeLog | 6 ++++ libqpdf/QPDF.cc | 32 ++++++++++++++++++ manual/release-notes.rst | 5 +++ qpdf/qpdf.testcov | 1 + qpdf/qtest/object-stream.test | 12 ++++++- qpdf/qtest/qpdf/bad7-recover.out | 2 ++ qpdf/qtest/qpdf/issue-146.out | 3 ++ qpdf/qtest/qpdf/issue-148.out | 7 +++- qpdf/qtest/qpdf/issue-150.out | 1 + qpdf/qtest/qpdf/issue-202.out | 2 ++ .../qpdf/recover-xref-stream-recovered.pdf | Bin 0 -> 968 bytes qpdf/qtest/qpdf/recover-xref-stream.out | 5 +++ qpdf/qtest/qpdf/recover-xref-stream.pdf | Bin 0 -> 3817 bytes 13 files changed, 74 insertions(+), 2 deletions(-) create mode 100644 qpdf/qtest/qpdf/recover-xref-stream-recovered.pdf create mode 100644 qpdf/qtest/qpdf/recover-xref-stream.out create mode 100644 qpdf/qtest/qpdf/recover-xref-stream.pdf diff --git a/ChangeLog b/ChangeLog index 62f1a541..0a57abea 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +2024-01-06 Jay Berkenbilt + + * When recovering a file's xref table, attempt to find xref + streams if a traditional trailer dictionary is not found. Fixes + #1103. + 2024-01-05 Jay Berkenbilt * Add --set-page-labels command-line argument and supporting API. diff --git a/libqpdf/QPDF.cc b/libqpdf/QPDF.cc index af3db080..67f5e2e0 100644 --- a/libqpdf/QPDF.cc +++ b/libqpdf/QPDF.cc @@ -579,6 +579,38 @@ QPDF::reconstruct_xref(QPDFExc& e) } m->deleted_objects.clear(); + if (!m->trailer.isInitialized()) { + qpdf_offset_t max_offset{0}; + // If there are any xref streams, take the last one to appear. + for (auto const& iter: m->xref_table) { + auto entry = iter.second; + if (entry.getType() != 1) { + continue; + } + auto oh = getObjectByObjGen(iter.first); + try { + if (!oh.isStreamOfType("/XRef")) { + continue; + } + } catch (std::exception&) { + continue; + } + auto offset = entry.getOffset(); + if (offset > max_offset) { + max_offset = offset; + setTrailer(oh.getDict()); + } + } + if (max_offset > 0) { + try { + read_xref(max_offset); + } catch (std::exception&) { + throw damagedPDF("", 0, "error decoding candidate xref stream while recovering damaged file"); + } + QTC::TC("qpdf", "QPDF recover xref stream"); + } + } + if (!m->trailer.isInitialized()) { // We could check the last encountered object to see if it was an xref stream. If so, we // could try to get the trailer from there. This may make it possible to recover files with diff --git a/manual/release-notes.rst b/manual/release-notes.rst index 6fb4758a..99f759ce 100644 --- a/manual/release-notes.rst +++ b/manual/release-notes.rst @@ -67,6 +67,11 @@ Planned changes for future 12.x (subject to change): - ``QPDFPageLabelDocumentHelper::pageLabelDict`` + - Improve file recovery logic to better handle files with + cross-reference streams. This should enable qpdf to recover some + files that it would previously have reported "unable to find + trailer dictionary." + 11.7.0: December 24, 2023 - Bug fixes: diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov index cbb4ac1d..df2555d6 100644 --- a/qpdf/qpdf.testcov +++ b/qpdf/qpdf.testcov @@ -689,3 +689,4 @@ QPDFPageObjectHelper used fallback without copying 0 QPDF skipping cache for known unchecked object 0 QPDF fix dangling triggered xref reconstruction 0 QPDFPageDocumentHelper flatten resources missing or invalid 0 +QPDF recover xref stream 0 diff --git a/qpdf/qtest/object-stream.test b/qpdf/qtest/object-stream.test index 04d1bb0d..22b35af4 100644 --- a/qpdf/qtest/object-stream.test +++ b/qpdf/qtest/object-stream.test @@ -16,7 +16,7 @@ cleanup(); my $td = new TestDriver('object-stream'); -my $n_tests = 3 + (36 * 4) + (12 * 2); +my $n_tests = 5 + (36 * 4) + (12 * 2); my $n_compare_pdfs = 36; for (my $n = 16; $n <= 19; ++$n) @@ -87,5 +87,15 @@ $td->runtest("check file", {$td->FILE => "gen1.qdf"}); +# Recover a file with xref streams +$td->runtest("recover file with xref stream", + {$td->COMMAND => "qpdf --static-id --compress-streams=n" . + " recover-xref-stream.pdf a.pdf"}, + {$td->FILE => "recover-xref-stream.out", $td->EXIT_STATUS => 3}, + $td->NORMALIZE_NEWLINES); +$td->runtest("check file", + {$td->FILE => "a.pdf"}, + {$td->FILE => "recover-xref-stream-recovered.pdf"}); + cleanup(); $td->report(calc_ntests($n_tests, $n_compare_pdfs)); diff --git a/qpdf/qtest/qpdf/bad7-recover.out b/qpdf/qtest/qpdf/bad7-recover.out index 1b39acd9..0e5d4a6c 100644 --- a/qpdf/qtest/qpdf/bad7-recover.out +++ b/qpdf/qtest/qpdf/bad7-recover.out @@ -1,4 +1,6 @@ WARNING: bad7.pdf: file is damaged WARNING: bad7.pdf (offset 698): expected trailer dictionary WARNING: bad7.pdf: Attempting to reconstruct cross-reference table +WARNING: bad7.pdf (object 2 0, offset 128): expected endobj +WARNING: bad7.pdf (object 4 0, offset 389): expected endobj bad7.pdf: unable to find trailer dictionary while recovering damaged file diff --git a/qpdf/qtest/qpdf/issue-146.out b/qpdf/qtest/qpdf/issue-146.out index 0aa23ed8..3e3b50da 100644 --- a/qpdf/qtest/qpdf/issue-146.out +++ b/qpdf/qtest/qpdf/issue-146.out @@ -2,4 +2,7 @@ WARNING: issue-146.pdf: file is damaged WARNING: issue-146.pdf: can't find startxref WARNING: issue-146.pdf: Attempting to reconstruct cross-reference table WARNING: issue-146.pdf (trailer, offset 695): ignoring excessively deeply nested data structure +WARNING: issue-146.pdf (object 1 0, offset 92): expected endobj +WARNING: issue-146.pdf (object 7 0, offset 146): unknown token while reading object; treating as string +WARNING: issue-146.pdf (object 7 0, offset 168): expected endobj qpdf: issue-146.pdf: unable to find trailer dictionary while recovering damaged file diff --git a/qpdf/qtest/qpdf/issue-148.out b/qpdf/qtest/qpdf/issue-148.out index a59c1343..dbc424f2 100644 --- a/qpdf/qtest/qpdf/issue-148.out +++ b/qpdf/qtest/qpdf/issue-148.out @@ -7,4 +7,9 @@ WARNING: issue-148.pdf (offset 73): error decoding stream data for object 8 0: s WARNING: issue-148.pdf: file is damaged WARNING: issue-148.pdf (offset 73): getStreamData called on unfilterable stream WARNING: issue-148.pdf: Attempting to reconstruct cross-reference table -qpdf: issue-148.pdf: unable to find trailer dictionary while recovering damaged file +WARNING: issue-148.pdf (xref stream: object 8 0, offset 26): stream dictionary lacks /Length key +WARNING: issue-148.pdf (xref stream: object 8 0, offset 73): attempting to recover stream length +WARNING: issue-148.pdf (xref stream: object 8 0, offset 73): recovered stream length: 2 +WARNING: issue-148.pdf (xref stream: object 8 0, offset 85): expected endobj +WARNING: issue-148.pdf (offset 73): error decoding stream data for object 8 0: stream inflate: inflate: data: incorrect header check +qpdf: issue-148.pdf: error decoding candidate xref stream while recovering damaged file diff --git a/qpdf/qtest/qpdf/issue-150.out b/qpdf/qtest/qpdf/issue-150.out index 3291f96f..9fe8b5ac 100644 --- a/qpdf/qtest/qpdf/issue-150.out +++ b/qpdf/qtest/qpdf/issue-150.out @@ -2,4 +2,5 @@ WARNING: issue-150.pdf: can't find PDF header WARNING: issue-150.pdf: file is damaged WARNING: issue-150.pdf: error reading xref: overflow/underflow converting 9900000000000000000 to 64-bit integer WARNING: issue-150.pdf: Attempting to reconstruct cross-reference table +WARNING: issue-150.pdf (object 8 0): object has offset 0 qpdf: issue-150.pdf: unable to find trailer dictionary while recovering damaged file diff --git a/qpdf/qtest/qpdf/issue-202.out b/qpdf/qtest/qpdf/issue-202.out index 8310c103..441b7087 100644 --- a/qpdf/qtest/qpdf/issue-202.out +++ b/qpdf/qtest/qpdf/issue-202.out @@ -3,4 +3,6 @@ WARNING: issue-202.pdf: file is damaged WARNING: issue-202.pdf (offset 54769): expected trailer dictionary WARNING: issue-202.pdf: Attempting to reconstruct cross-reference table WARNING: issue-202.pdf (trailer, offset 55770): ignoring excessively deeply nested data structure +WARNING: issue-202.pdf (object 222 0, offset 50101): dictionary has duplicated key /Creator; last occurrence overrides earlier ones +WARNING: issue-202.pdf (object 222 0, offset 50101): dictionary has duplicated key /Producer; last occurrence overrides earlier ones qpdf: issue-202.pdf: unable to find trailer dictionary while recovering damaged file diff --git a/qpdf/qtest/qpdf/recover-xref-stream-recovered.pdf b/qpdf/qtest/qpdf/recover-xref-stream-recovered.pdf new file mode 100644 index 0000000000000000000000000000000000000000..dfbfceedc2df755462008f21b6055ec5620ce739 GIT binary patch literal 968 zcmaJ<+iuf95KYQO!Y};7yhLgqg1t9iq{tF&Vu64*Bt{@bdDz68;HdFd_NEGa4nM*# z@HOy;n6>LRs0d4Ak9TLznVB=a$tW3+7sBa%|Ml&+L%;*Q+&aS{xapT`1@7f?J2N|Q z->PC`J_Gkca4#SLH_6M&0E-}woywF-?i`BQ3<3fi2oWHFat=N)=3@dA1VUiGSA$RF zMpYQ>!4%y0sY0kZ2H9WYH6L5c6jSVb(Ux;Ajl9wuOKNL8*ZaZ%0ryRwRfxd7)Fwau@cpgO5a!q83k!^WXRq1`X!g?)bqVcA72zF?J;uteZ zy_zY5-L@yq_-qAjv1hVUb+-FTUEL^?uOztRVx_aZ*ns;nFJ2Z^et&QwcPK!D-Oct( zQ>$S?@xQ^Yv9P&0(vK26RR;qL=Tw*ZW z*~){^H>xc2ObzHi;7Pki_hGCKvz+zxTk>|R_z}Tb>OkE#<&dt?4^S8CL7Jg|I?+ZN z-G_ARjMImm5xY_?dNZa^Q?^WO$jcBx=R6k z7#R%U0G9w>a|qy1R>5S2p6YK7>AO0+_0WbV`Iw@=Hv+effHx7W=jWLPPFCKgN!d7; zx2;#9PPHvt+k*U)KH9DGT+cIWGFt*CTB=;%nu5w1OfKLxAXb?KGknyOE`hK#r4L}}XW!vUaUl&W-h^Bg|Or?h2(N?Ij}Qs{t!EOzGbj?zV$Lzzc;gtCD08087d)9(CFJpG80cJ2V7v|(W{ z3rvu65g;F%J6nv?WVngK&dsr^T*_~|9xL@!mCm&&aiTq-@;H{qg(-?OF)LVow@ZY^ zA;of~hIKdLi`xHYq&N8D59O=U8y4mQ2N+^OC8hQJJg83Ho| KW(fTE2>b