diff --git a/ChangeLog b/ChangeLog index 62f1a541..0a57abea 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +2024-01-06 Jay Berkenbilt + + * When recovering a file's xref table, attempt to find xref + streams if a traditional trailer dictionary is not found. Fixes + #1103. + 2024-01-05 Jay Berkenbilt * Add --set-page-labels command-line argument and supporting API. diff --git a/libqpdf/QPDF.cc b/libqpdf/QPDF.cc index af3db080..67f5e2e0 100644 --- a/libqpdf/QPDF.cc +++ b/libqpdf/QPDF.cc @@ -579,6 +579,38 @@ QPDF::reconstruct_xref(QPDFExc& e) } m->deleted_objects.clear(); + if (!m->trailer.isInitialized()) { + qpdf_offset_t max_offset{0}; + // If there are any xref streams, take the last one to appear. + for (auto const& iter: m->xref_table) { + auto entry = iter.second; + if (entry.getType() != 1) { + continue; + } + auto oh = getObjectByObjGen(iter.first); + try { + if (!oh.isStreamOfType("/XRef")) { + continue; + } + } catch (std::exception&) { + continue; + } + auto offset = entry.getOffset(); + if (offset > max_offset) { + max_offset = offset; + setTrailer(oh.getDict()); + } + } + if (max_offset > 0) { + try { + read_xref(max_offset); + } catch (std::exception&) { + throw damagedPDF("", 0, "error decoding candidate xref stream while recovering damaged file"); + } + QTC::TC("qpdf", "QPDF recover xref stream"); + } + } + if (!m->trailer.isInitialized()) { // We could check the last encountered object to see if it was an xref stream. If so, we // could try to get the trailer from there. This may make it possible to recover files with diff --git a/manual/release-notes.rst b/manual/release-notes.rst index 6fb4758a..99f759ce 100644 --- a/manual/release-notes.rst +++ b/manual/release-notes.rst @@ -67,6 +67,11 @@ Planned changes for future 12.x (subject to change): - ``QPDFPageLabelDocumentHelper::pageLabelDict`` + - Improve file recovery logic to better handle files with + cross-reference streams. This should enable qpdf to recover some + files that it would previously have reported "unable to find + trailer dictionary." + 11.7.0: December 24, 2023 - Bug fixes: diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov index cbb4ac1d..df2555d6 100644 --- a/qpdf/qpdf.testcov +++ b/qpdf/qpdf.testcov @@ -689,3 +689,4 @@ QPDFPageObjectHelper used fallback without copying 0 QPDF skipping cache for known unchecked object 0 QPDF fix dangling triggered xref reconstruction 0 QPDFPageDocumentHelper flatten resources missing or invalid 0 +QPDF recover xref stream 0 diff --git a/qpdf/qtest/object-stream.test b/qpdf/qtest/object-stream.test index 04d1bb0d..22b35af4 100644 --- a/qpdf/qtest/object-stream.test +++ b/qpdf/qtest/object-stream.test @@ -16,7 +16,7 @@ cleanup(); my $td = new TestDriver('object-stream'); -my $n_tests = 3 + (36 * 4) + (12 * 2); +my $n_tests = 5 + (36 * 4) + (12 * 2); my $n_compare_pdfs = 36; for (my $n = 16; $n <= 19; ++$n) @@ -87,5 +87,15 @@ $td->runtest("check file", {$td->FILE => "gen1.qdf"}); +# Recover a file with xref streams +$td->runtest("recover file with xref stream", + {$td->COMMAND => "qpdf --static-id --compress-streams=n" . + " recover-xref-stream.pdf a.pdf"}, + {$td->FILE => "recover-xref-stream.out", $td->EXIT_STATUS => 3}, + $td->NORMALIZE_NEWLINES); +$td->runtest("check file", + {$td->FILE => "a.pdf"}, + {$td->FILE => "recover-xref-stream-recovered.pdf"}); + cleanup(); $td->report(calc_ntests($n_tests, $n_compare_pdfs)); diff --git a/qpdf/qtest/qpdf/bad7-recover.out b/qpdf/qtest/qpdf/bad7-recover.out index 1b39acd9..0e5d4a6c 100644 --- a/qpdf/qtest/qpdf/bad7-recover.out +++ b/qpdf/qtest/qpdf/bad7-recover.out @@ -1,4 +1,6 @@ WARNING: bad7.pdf: file is damaged WARNING: bad7.pdf (offset 698): expected trailer dictionary WARNING: bad7.pdf: Attempting to reconstruct cross-reference table +WARNING: bad7.pdf (object 2 0, offset 128): expected endobj +WARNING: bad7.pdf (object 4 0, offset 389): expected endobj bad7.pdf: unable to find trailer dictionary while recovering damaged file diff --git a/qpdf/qtest/qpdf/issue-146.out b/qpdf/qtest/qpdf/issue-146.out index 0aa23ed8..3e3b50da 100644 --- a/qpdf/qtest/qpdf/issue-146.out +++ b/qpdf/qtest/qpdf/issue-146.out @@ -2,4 +2,7 @@ WARNING: issue-146.pdf: file is damaged WARNING: issue-146.pdf: can't find startxref WARNING: issue-146.pdf: Attempting to reconstruct cross-reference table WARNING: issue-146.pdf (trailer, offset 695): ignoring excessively deeply nested data structure +WARNING: issue-146.pdf (object 1 0, offset 92): expected endobj +WARNING: issue-146.pdf (object 7 0, offset 146): unknown token while reading object; treating as string +WARNING: issue-146.pdf (object 7 0, offset 168): expected endobj qpdf: issue-146.pdf: unable to find trailer dictionary while recovering damaged file diff --git a/qpdf/qtest/qpdf/issue-148.out b/qpdf/qtest/qpdf/issue-148.out index a59c1343..dbc424f2 100644 --- a/qpdf/qtest/qpdf/issue-148.out +++ b/qpdf/qtest/qpdf/issue-148.out @@ -7,4 +7,9 @@ WARNING: issue-148.pdf (offset 73): error decoding stream data for object 8 0: s WARNING: issue-148.pdf: file is damaged WARNING: issue-148.pdf (offset 73): getStreamData called on unfilterable stream WARNING: issue-148.pdf: Attempting to reconstruct cross-reference table -qpdf: issue-148.pdf: unable to find trailer dictionary while recovering damaged file +WARNING: issue-148.pdf (xref stream: object 8 0, offset 26): stream dictionary lacks /Length key +WARNING: issue-148.pdf (xref stream: object 8 0, offset 73): attempting to recover stream length +WARNING: issue-148.pdf (xref stream: object 8 0, offset 73): recovered stream length: 2 +WARNING: issue-148.pdf (xref stream: object 8 0, offset 85): expected endobj +WARNING: issue-148.pdf (offset 73): error decoding stream data for object 8 0: stream inflate: inflate: data: incorrect header check +qpdf: issue-148.pdf: error decoding candidate xref stream while recovering damaged file diff --git a/qpdf/qtest/qpdf/issue-150.out b/qpdf/qtest/qpdf/issue-150.out index 3291f96f..9fe8b5ac 100644 --- a/qpdf/qtest/qpdf/issue-150.out +++ b/qpdf/qtest/qpdf/issue-150.out @@ -2,4 +2,5 @@ WARNING: issue-150.pdf: can't find PDF header WARNING: issue-150.pdf: file is damaged WARNING: issue-150.pdf: error reading xref: overflow/underflow converting 9900000000000000000 to 64-bit integer WARNING: issue-150.pdf: Attempting to reconstruct cross-reference table +WARNING: issue-150.pdf (object 8 0): object has offset 0 qpdf: issue-150.pdf: unable to find trailer dictionary while recovering damaged file diff --git a/qpdf/qtest/qpdf/issue-202.out b/qpdf/qtest/qpdf/issue-202.out index 8310c103..441b7087 100644 --- a/qpdf/qtest/qpdf/issue-202.out +++ b/qpdf/qtest/qpdf/issue-202.out @@ -3,4 +3,6 @@ WARNING: issue-202.pdf: file is damaged WARNING: issue-202.pdf (offset 54769): expected trailer dictionary WARNING: issue-202.pdf: Attempting to reconstruct cross-reference table WARNING: issue-202.pdf (trailer, offset 55770): ignoring excessively deeply nested data structure +WARNING: issue-202.pdf (object 222 0, offset 50101): dictionary has duplicated key /Creator; last occurrence overrides earlier ones +WARNING: issue-202.pdf (object 222 0, offset 50101): dictionary has duplicated key /Producer; last occurrence overrides earlier ones qpdf: issue-202.pdf: unable to find trailer dictionary while recovering damaged file diff --git a/qpdf/qtest/qpdf/recover-xref-stream-recovered.pdf b/qpdf/qtest/qpdf/recover-xref-stream-recovered.pdf new file mode 100644 index 00000000..dfbfceed Binary files /dev/null and b/qpdf/qtest/qpdf/recover-xref-stream-recovered.pdf differ diff --git a/qpdf/qtest/qpdf/recover-xref-stream.out b/qpdf/qtest/qpdf/recover-xref-stream.out new file mode 100644 index 00000000..ba0e1aa6 --- /dev/null +++ b/qpdf/qtest/qpdf/recover-xref-stream.out @@ -0,0 +1,5 @@ +WARNING: recover-xref-stream.pdf: file is damaged +WARNING: recover-xref-stream.pdf: can't find startxref +WARNING: recover-xref-stream.pdf: Attempting to reconstruct cross-reference table +WARNING: recover-xref-stream.pdf: reported number of objects (14) is not one plus the highest object number (15) +qpdf: operation succeeded with warnings; resulting file may have some problems diff --git a/qpdf/qtest/qpdf/recover-xref-stream.pdf b/qpdf/qtest/qpdf/recover-xref-stream.pdf new file mode 100644 index 00000000..f8da3f1b Binary files /dev/null and b/qpdf/qtest/qpdf/recover-xref-stream.pdf differ