From 6eb5d0d71a389fbaae02833ae6dc23c149c90357 Mon Sep 17 00:00:00 2001 From: m-holger Date: Fri, 1 Nov 2024 11:57:50 +0000 Subject: [PATCH] Refactor Objects::recover_stream_length Change how the maximum stream length is calculated. For streams not in the xref table, instead of returning 0 length return the maximum length that does not overlap with a known object or xref table. --- libqpdf/QPDF_objects.cc | 81 +++++++++++------------ libqpdf/qpdf/QPDF_objects.hh | 19 +++++- libtests/qtest/xref/incremental-1-bad.out | 16 ++--- libtests/qtest/xref/incremental-1.out | 16 ++--- qpdf/qpdf.testcov | 1 - qpdf/qtest/qpdf/bad39-recover.out | 19 +++++- qpdf/qtest/qpdf/issue-141a.out | 2 +- 7 files changed, 88 insertions(+), 66 deletions(-) diff --git a/libqpdf/QPDF_objects.cc b/libqpdf/QPDF_objects.cc index 100a68cd..c85abd74 100644 --- a/libqpdf/QPDF_objects.cc +++ b/libqpdf/QPDF_objects.cc @@ -93,13 +93,13 @@ namespace void Xref_table::test() { - std::cout << "id, gen, offset, length, next\n"; + std::cout << "id, gen, offset, length, next, upper_bound\n"; int i = 0; for (auto const& entry: table) { if (entry.type() == 1) { std::cout << i << ", " << entry.gen() << ", " << entry.type() << ", " << entry.offset() << ", " << entry.length() << ", " << (entry.offset() + toO(entry.length())) - << '\n'; + << ", " << upper_bound(entry.offset() + 1) << '\n'; } ++i; } @@ -149,7 +149,7 @@ Xref_table::initialize() // PDF spec says %%EOF must be found within the last 1024 bytes of the file. We add an extra // 30 characters to leave room for the startxref stuff. file->seek(0, SEEK_END); - qpdf_offset_t end_offset = file->tell(); + end_offset = file->tell(); // Sanity check on object ids. All objects must appear in xref table / stream. In all realistic // scenarios at least 3 bytes are required. if (max_id_ > end_offset / 3) { @@ -1129,26 +1129,6 @@ Xref_table::insert_free(QPDFObjGen og) } } -QPDFObjGen -Xref_table::at_offset(qpdf_offset_t offset) const noexcept -{ - int id = 0; - int gen = 0; - qpdf_offset_t start = 0; - - int i = 0; - for (auto const& item: table) { - auto o = item.offset(); - if (start < o && o <= offset) { - start = o; - id = i; - gen = item.gen(); - } - ++i; - } - return QPDFObjGen(id, gen); -} - std::map Xref_table::as_map() const { @@ -1409,6 +1389,30 @@ QPDF::findEndstream() return false; } +// Return the smallest offset that is known to belong to a different item(object/xre table) from the +// item at start. +qpdf_offset_t +Xref_table::upper_bound(qpdf_offset_t start) const noexcept +{ + auto upb = end_offset; + if (start >= end_offset) { + // Shouldn't be possible. + return start; + } + for (auto const& e: table) { + if (auto offset = e.offset(); offset > start) { + // Should never happen. + upb = std::min(upb, offset); + } + } + for (auto const& e: offsets) { + if (e.first > start) { + upb = std::min(upb, e.first); + } + } + return upb; +} + size_t Objects::recover_stream_length(QPDFObjGen og, qpdf_offset_t stream_offset) { @@ -1416,30 +1420,19 @@ Objects::recover_stream_length(QPDFObjGen og, qpdf_offset_t stream_offset) qpdf.warn(qpdf.damagedPDF(stream_offset, "attempting to recover stream length")); PatternFinder ef(qpdf, &QPDF::findEndstream); - size_t length = 0; - if (m->file->findFirst("end", stream_offset, 0, ef)) { + + auto length = xref.length(og); + length = length ? length - std::min(length, toS(stream_offset - xref.offset(og))) + : toS(xref.upper_bound(stream_offset) - stream_offset); + + if (m->file->findFirst("end", stream_offset, length, ef)) { length = toS(m->file->getLastOffset() - stream_offset); - } - - if (length) { - // Make sure this is inside this object - auto found = xref.at_offset(stream_offset + toO(length)); - if (found == QPDFObjGen() || found == og) { - // If we are trying to recover an XRef stream the xref table will not contain and - // won't contain any entries, therefore we cannot check the found length. Otherwise we - // found endstream\endobj within the space allowed for this object, so we're probably - // in good shape. - } else { - QTC::TC("qpdf", "QPDF found wrong endstream in recovery"); - length = 0; - } - } - - if (length == 0) { - qpdf.warn(qpdf.damagedPDF(stream_offset, "unable to recover stream data; treating stream as empty")); } else { - qpdf.warn(qpdf.damagedPDF(stream_offset, "recovered stream length: " + std::to_string(length))); + // NB findFirst ignores 'length' when reading data into the buffer and therefore leaves the + // file position beyond the end of the object if the target is not found. + m->file->seek(stream_offset + toO(length), SEEK_SET); } + qpdf.warn(qpdf.damagedPDF(stream_offset, "recovered stream length: " + std::to_string(length))); QTC::TC("qpdf", "QPDF recovered stream length"); return length; diff --git a/libqpdf/qpdf/QPDF_objects.hh b/libqpdf/qpdf/QPDF_objects.hh index 2ea59c65..f9f987b8 100644 --- a/libqpdf/qpdf/QPDF_objects.hh +++ b/libqpdf/qpdf/QPDF_objects.hh @@ -78,7 +78,7 @@ class QPDF::Objects return table[id].type(); } - // Returns 0 if og is not in table. + // Returns 0 if og is not in table or is not an uncompressed object. qpdf_offset_t offset(QPDFObjGen og) const noexcept { @@ -89,6 +89,18 @@ class QPDF::Objects return table[static_cast(id)].offset(); } + // (Maximum possible) size of object. Returns 0 if og is not in table or is not an + // uncompressed object. + size_t + length(QPDFObjGen og) const noexcept + { + int id = og.getObj(); + if (id < 1 || static_cast(id) >= table.size()) { + return 0; + } + return table[static_cast(id)].length(); + } + // Returns 0 if id is not in table. int stream_number(int id) const noexcept @@ -108,8 +120,6 @@ class QPDF::Objects return table[static_cast(id)].stream_index(); } - QPDFObjGen at_offset(qpdf_offset_t offset) const noexcept; - std::map as_map() const; bool @@ -228,6 +238,8 @@ class QPDF::Objects return first_item_offset_; } + qpdf_offset_t upper_bound(qpdf_offset_t start) const noexcept; + void test(); private: @@ -399,6 +411,7 @@ class QPDF::Objects // to the value of /Size. If the file is damaged, max_id_ becomes the maximum object id in // the xref table after reconstruction. int max_id_{std::numeric_limits::max() - 1}; + qpdf_offset_t end_offset{0}; // used for object length calc. // Linearization data bool uncompressed_after_compressed_{false}; diff --git a/libtests/qtest/xref/incremental-1-bad.out b/libtests/qtest/xref/incremental-1-bad.out index 0ba09b0d..ece7cd83 100644 --- a/libtests/qtest/xref/incremental-1-bad.out +++ b/libtests/qtest/xref/incremental-1-bad.out @@ -1,12 +1,12 @@ WARNING: incremental-1-bad.pdf: file is damaged WARNING: incremental-1-bad.pdf (offset 1241): xref not found WARNING: incremental-1-bad.pdf: Attempting to reconstruct cross-reference table -id, gen, offset, length, next -1, 0, 1, 9, 93, 102 -2, 0, 1, 102, 72, 174 -3, 0, 1, 1108, 172, 1280 -4, 1, 1, 987, 26, 1013 -5, 0, 1, 442, 35, 477 -6, 0, 1, 477, 118, 595 -7, 0, 1, 1013, 95, 1108 +id, gen, offset, length, next, upper_bound +1, 0, 1, 9, 93, 102, 102 +2, 0, 1, 102, 72, 174, 442 +3, 0, 1, 1108, 172, 1280, 1462 +4, 1, 1, 987, 26, 1013, 1013 +5, 0, 1, 442, 35, 477, 477 +6, 0, 1, 477, 118, 595, 987 +7, 0, 1, 1013, 95, 1108, 1108 xref done diff --git a/libtests/qtest/xref/incremental-1.out b/libtests/qtest/xref/incremental-1.out index dfddcc2c..9f707ffa 100644 --- a/libtests/qtest/xref/incremental-1.out +++ b/libtests/qtest/xref/incremental-1.out @@ -1,9 +1,9 @@ -id, gen, offset, length, next -1, 0, 1, 9, 54, 63 -2, 0, 1, 63, 72, 135 -3, 0, 1, 1069, 172, 1241 -4, 1, 1, 948, 26, 974 -5, 0, 1, 403, 35, 438 -6, 0, 1, 438, 118, 556 -7, 0, 1, 974, 95, 1069 +id, gen, offset, length, next, upper_bound +1, 0, 1, 9, 54, 63, 63 +2, 0, 1, 63, 72, 135, 403 +3, 0, 1, 1069, 172, 1241, 1423 +4, 1, 1, 948, 26, 974, 974 +5, 0, 1, 403, 35, 438, 438 +6, 0, 1, 438, 118, 556, 948 +7, 0, 1, 974, 95, 1069, 1069 xref done diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov index 7c587f06..5c6134e3 100644 --- a/qpdf/qpdf.testcov +++ b/qpdf/qpdf.testcov @@ -98,7 +98,6 @@ QPDF loop detected traversing objects 0 QPDF reconstructed xref table 0 QPDF recovered in readObjectAtOffset 0 QPDF recovered stream length 0 -QPDF found wrong endstream in recovery 0 QPDF_Stream pipeStreamData with null pipeline 0 QPDFWriter not recompressing /FlateDecode 0 QPDF_encryption xref stream from encrypted file 0 diff --git a/qpdf/qtest/qpdf/bad39-recover.out b/qpdf/qtest/qpdf/bad39-recover.out index bf5df4b7..01eb287c 100644 --- a/qpdf/qtest/qpdf/bad39-recover.out +++ b/qpdf/qtest/qpdf/bad39-recover.out @@ -1,11 +1,28 @@ WARNING: bad39.pdf (object 4 0, offset 385): expected endstream WARNING: bad39.pdf (object 4 0, offset 341): attempting to recover stream length -WARNING: bad39.pdf (object 4 0, offset 341): unable to recover stream data; treating stream as empty +WARNING: bad39.pdf (object 4 0, offset 341): recovered stream length: 62 +WARNING: bad39.pdf (object 4 0, offset 403): expected endobj /QTest is indirect and has type stream (10) /QTest is a stream. Dictionary: << /Length 44 >> Raw stream data: +BT + /F1 24 Tf + 72 720 Td + (Potato) Tj +ET +enxstream +enxobj + Uncompressed stream data: +BT + /F1 24 Tf + 72 720 Td + (Potato) Tj +ET +enxstream +enxobj + End of stream data unparse: 4 0 R diff --git a/qpdf/qtest/qpdf/issue-141a.out b/qpdf/qtest/qpdf/issue-141a.out index a644926e..28f73ff7 100644 --- a/qpdf/qtest/qpdf/issue-141a.out +++ b/qpdf/qtest/qpdf/issue-141a.out @@ -1,7 +1,7 @@ WARNING: issue-141a.pdf: can't find PDF header WARNING: issue-141a.pdf (xref stream: object 9 0, offset 10): stream dictionary lacks /Length key WARNING: issue-141a.pdf (xref stream: object 9 0, offset 47): attempting to recover stream length -WARNING: issue-141a.pdf (xref stream: object 9 0, offset 47): unable to recover stream data; treating stream as empty +WARNING: issue-141a.pdf (xref stream: object 9 0, offset 47): recovered stream length: 0 WARNING: issue-141a.pdf: file is damaged WARNING: issue-141a.pdf (xref stream, offset 3): Cross-reference stream's /W indicates entry size of 0 WARNING: issue-141a.pdf: Attempting to reconstruct cross-reference table