2
1
mirror of https://github.com/qpdf/qpdf.git synced 2025-01-05 08:02:11 +00:00

Refactor Objects::recover_stream_length

Change how the maximum stream length is calculated. For streams not in the
xref table, instead of returning 0 length return the maximum length that
does not overlap with a known object or xref table.
This commit is contained in:
m-holger 2024-11-01 11:57:50 +00:00
parent 780a05735c
commit 6eb5d0d71a
7 changed files with 88 additions and 66 deletions

View File

@ -93,13 +93,13 @@ namespace
void void
Xref_table::test() Xref_table::test()
{ {
std::cout << "id, gen, offset, length, next\n"; std::cout << "id, gen, offset, length, next, upper_bound\n";
int i = 0; int i = 0;
for (auto const& entry: table) { for (auto const& entry: table) {
if (entry.type() == 1) { if (entry.type() == 1) {
std::cout << i << ", " << entry.gen() << ", " << entry.type() << ", " << entry.offset() std::cout << i << ", " << entry.gen() << ", " << entry.type() << ", " << entry.offset()
<< ", " << entry.length() << ", " << (entry.offset() + toO(entry.length())) << ", " << entry.length() << ", " << (entry.offset() + toO(entry.length()))
<< '\n'; << ", " << upper_bound(entry.offset() + 1) << '\n';
} }
++i; ++i;
} }
@ -149,7 +149,7 @@ Xref_table::initialize()
// PDF spec says %%EOF must be found within the last 1024 bytes of the file. We add an extra // PDF spec says %%EOF must be found within the last 1024 bytes of the file. We add an extra
// 30 characters to leave room for the startxref stuff. // 30 characters to leave room for the startxref stuff.
file->seek(0, SEEK_END); file->seek(0, SEEK_END);
qpdf_offset_t end_offset = file->tell(); end_offset = file->tell();
// Sanity check on object ids. All objects must appear in xref table / stream. In all realistic // Sanity check on object ids. All objects must appear in xref table / stream. In all realistic
// scenarios at least 3 bytes are required. // scenarios at least 3 bytes are required.
if (max_id_ > end_offset / 3) { if (max_id_ > end_offset / 3) {
@ -1129,26 +1129,6 @@ Xref_table::insert_free(QPDFObjGen og)
} }
} }
QPDFObjGen
Xref_table::at_offset(qpdf_offset_t offset) const noexcept
{
int id = 0;
int gen = 0;
qpdf_offset_t start = 0;
int i = 0;
for (auto const& item: table) {
auto o = item.offset();
if (start < o && o <= offset) {
start = o;
id = i;
gen = item.gen();
}
++i;
}
return QPDFObjGen(id, gen);
}
std::map<QPDFObjGen, QPDFXRefEntry> std::map<QPDFObjGen, QPDFXRefEntry>
Xref_table::as_map() const Xref_table::as_map() const
{ {
@ -1409,6 +1389,30 @@ QPDF::findEndstream()
return false; return false;
} }
// Return the smallest offset that is known to belong to a different item(object/xre table) from the
// item at start.
qpdf_offset_t
Xref_table::upper_bound(qpdf_offset_t start) const noexcept
{
auto upb = end_offset;
if (start >= end_offset) {
// Shouldn't be possible.
return start;
}
for (auto const& e: table) {
if (auto offset = e.offset(); offset > start) {
// Should never happen.
upb = std::min(upb, offset);
}
}
for (auto const& e: offsets) {
if (e.first > start) {
upb = std::min(upb, e.first);
}
}
return upb;
}
size_t size_t
Objects::recover_stream_length(QPDFObjGen og, qpdf_offset_t stream_offset) Objects::recover_stream_length(QPDFObjGen og, qpdf_offset_t stream_offset)
{ {
@ -1416,30 +1420,19 @@ Objects::recover_stream_length(QPDFObjGen og, qpdf_offset_t stream_offset)
qpdf.warn(qpdf.damagedPDF(stream_offset, "attempting to recover stream length")); qpdf.warn(qpdf.damagedPDF(stream_offset, "attempting to recover stream length"));
PatternFinder ef(qpdf, &QPDF::findEndstream); PatternFinder ef(qpdf, &QPDF::findEndstream);
size_t length = 0;
if (m->file->findFirst("end", stream_offset, 0, ef)) { auto length = xref.length(og);
length = length ? length - std::min(length, toS(stream_offset - xref.offset(og)))
: toS(xref.upper_bound(stream_offset) - stream_offset);
if (m->file->findFirst("end", stream_offset, length, ef)) {
length = toS(m->file->getLastOffset() - stream_offset); length = toS(m->file->getLastOffset() - stream_offset);
}
if (length) {
// Make sure this is inside this object
auto found = xref.at_offset(stream_offset + toO(length));
if (found == QPDFObjGen() || found == og) {
// If we are trying to recover an XRef stream the xref table will not contain and
// won't contain any entries, therefore we cannot check the found length. Otherwise we
// found endstream\endobj within the space allowed for this object, so we're probably
// in good shape.
} else { } else {
QTC::TC("qpdf", "QPDF found wrong endstream in recovery"); // NB findFirst ignores 'length' when reading data into the buffer and therefore leaves the
length = 0; // file position beyond the end of the object if the target is not found.
m->file->seek(stream_offset + toO(length), SEEK_SET);
} }
}
if (length == 0) {
qpdf.warn(qpdf.damagedPDF(stream_offset, "unable to recover stream data; treating stream as empty"));
} else {
qpdf.warn(qpdf.damagedPDF(stream_offset, "recovered stream length: " + std::to_string(length))); qpdf.warn(qpdf.damagedPDF(stream_offset, "recovered stream length: " + std::to_string(length)));
}
QTC::TC("qpdf", "QPDF recovered stream length"); QTC::TC("qpdf", "QPDF recovered stream length");
return length; return length;

View File

@ -78,7 +78,7 @@ class QPDF::Objects
return table[id].type(); return table[id].type();
} }
// Returns 0 if og is not in table. // Returns 0 if og is not in table or is not an uncompressed object.
qpdf_offset_t qpdf_offset_t
offset(QPDFObjGen og) const noexcept offset(QPDFObjGen og) const noexcept
{ {
@ -89,6 +89,18 @@ class QPDF::Objects
return table[static_cast<size_t>(id)].offset(); return table[static_cast<size_t>(id)].offset();
} }
// (Maximum possible) size of object. Returns 0 if og is not in table or is not an
// uncompressed object.
size_t
length(QPDFObjGen og) const noexcept
{
int id = og.getObj();
if (id < 1 || static_cast<size_t>(id) >= table.size()) {
return 0;
}
return table[static_cast<size_t>(id)].length();
}
// Returns 0 if id is not in table. // Returns 0 if id is not in table.
int int
stream_number(int id) const noexcept stream_number(int id) const noexcept
@ -108,8 +120,6 @@ class QPDF::Objects
return table[static_cast<size_t>(id)].stream_index(); return table[static_cast<size_t>(id)].stream_index();
} }
QPDFObjGen at_offset(qpdf_offset_t offset) const noexcept;
std::map<QPDFObjGen, QPDFXRefEntry> as_map() const; std::map<QPDFObjGen, QPDFXRefEntry> as_map() const;
bool bool
@ -228,6 +238,8 @@ class QPDF::Objects
return first_item_offset_; return first_item_offset_;
} }
qpdf_offset_t upper_bound(qpdf_offset_t start) const noexcept;
void test(); void test();
private: private:
@ -399,6 +411,7 @@ class QPDF::Objects
// to the value of /Size. If the file is damaged, max_id_ becomes the maximum object id in // to the value of /Size. If the file is damaged, max_id_ becomes the maximum object id in
// the xref table after reconstruction. // the xref table after reconstruction.
int max_id_{std::numeric_limits<int>::max() - 1}; int max_id_{std::numeric_limits<int>::max() - 1};
qpdf_offset_t end_offset{0}; // used for object length calc.
// Linearization data // Linearization data
bool uncompressed_after_compressed_{false}; bool uncompressed_after_compressed_{false};

View File

@ -1,12 +1,12 @@
WARNING: incremental-1-bad.pdf: file is damaged WARNING: incremental-1-bad.pdf: file is damaged
WARNING: incremental-1-bad.pdf (offset 1241): xref not found WARNING: incremental-1-bad.pdf (offset 1241): xref not found
WARNING: incremental-1-bad.pdf: Attempting to reconstruct cross-reference table WARNING: incremental-1-bad.pdf: Attempting to reconstruct cross-reference table
id, gen, offset, length, next id, gen, offset, length, next, upper_bound
1, 0, 1, 9, 93, 102 1, 0, 1, 9, 93, 102, 102
2, 0, 1, 102, 72, 174 2, 0, 1, 102, 72, 174, 442
3, 0, 1, 1108, 172, 1280 3, 0, 1, 1108, 172, 1280, 1462
4, 1, 1, 987, 26, 1013 4, 1, 1, 987, 26, 1013, 1013
5, 0, 1, 442, 35, 477 5, 0, 1, 442, 35, 477, 477
6, 0, 1, 477, 118, 595 6, 0, 1, 477, 118, 595, 987
7, 0, 1, 1013, 95, 1108 7, 0, 1, 1013, 95, 1108, 1108
xref done xref done

View File

@ -1,9 +1,9 @@
id, gen, offset, length, next id, gen, offset, length, next, upper_bound
1, 0, 1, 9, 54, 63 1, 0, 1, 9, 54, 63, 63
2, 0, 1, 63, 72, 135 2, 0, 1, 63, 72, 135, 403
3, 0, 1, 1069, 172, 1241 3, 0, 1, 1069, 172, 1241, 1423
4, 1, 1, 948, 26, 974 4, 1, 1, 948, 26, 974, 974
5, 0, 1, 403, 35, 438 5, 0, 1, 403, 35, 438, 438
6, 0, 1, 438, 118, 556 6, 0, 1, 438, 118, 556, 948
7, 0, 1, 974, 95, 1069 7, 0, 1, 974, 95, 1069, 1069
xref done xref done

View File

@ -98,7 +98,6 @@ QPDF loop detected traversing objects 0
QPDF reconstructed xref table 0 QPDF reconstructed xref table 0
QPDF recovered in readObjectAtOffset 0 QPDF recovered in readObjectAtOffset 0
QPDF recovered stream length 0 QPDF recovered stream length 0
QPDF found wrong endstream in recovery 0
QPDF_Stream pipeStreamData with null pipeline 0 QPDF_Stream pipeStreamData with null pipeline 0
QPDFWriter not recompressing /FlateDecode 0 QPDFWriter not recompressing /FlateDecode 0
QPDF_encryption xref stream from encrypted file 0 QPDF_encryption xref stream from encrypted file 0

View File

@ -1,11 +1,28 @@
WARNING: bad39.pdf (object 4 0, offset 385): expected endstream WARNING: bad39.pdf (object 4 0, offset 385): expected endstream
WARNING: bad39.pdf (object 4 0, offset 341): attempting to recover stream length WARNING: bad39.pdf (object 4 0, offset 341): attempting to recover stream length
WARNING: bad39.pdf (object 4 0, offset 341): unable to recover stream data; treating stream as empty WARNING: bad39.pdf (object 4 0, offset 341): recovered stream length: 62
WARNING: bad39.pdf (object 4 0, offset 403): expected endobj
/QTest is indirect and has type stream (10) /QTest is indirect and has type stream (10)
/QTest is a stream. Dictionary: << /Length 44 >> /QTest is a stream. Dictionary: << /Length 44 >>
Raw stream data: Raw stream data:
BT
/F1 24 Tf
72 720 Td
(Potato) Tj
ET
enxstream
enxobj
Uncompressed stream data: Uncompressed stream data:
BT
/F1 24 Tf
72 720 Td
(Potato) Tj
ET
enxstream
enxobj
End of stream data End of stream data
unparse: 4 0 R unparse: 4 0 R

View File

@ -1,7 +1,7 @@
WARNING: issue-141a.pdf: can't find PDF header WARNING: issue-141a.pdf: can't find PDF header
WARNING: issue-141a.pdf (xref stream: object 9 0, offset 10): stream dictionary lacks /Length key WARNING: issue-141a.pdf (xref stream: object 9 0, offset 10): stream dictionary lacks /Length key
WARNING: issue-141a.pdf (xref stream: object 9 0, offset 47): attempting to recover stream length WARNING: issue-141a.pdf (xref stream: object 9 0, offset 47): attempting to recover stream length
WARNING: issue-141a.pdf (xref stream: object 9 0, offset 47): unable to recover stream data; treating stream as empty WARNING: issue-141a.pdf (xref stream: object 9 0, offset 47): recovered stream length: 0
WARNING: issue-141a.pdf: file is damaged WARNING: issue-141a.pdf: file is damaged
WARNING: issue-141a.pdf (xref stream, offset 3): Cross-reference stream's /W indicates entry size of 0 WARNING: issue-141a.pdf (xref stream, offset 3): Cross-reference stream's /W indicates entry size of 0
WARNING: issue-141a.pdf: Attempting to reconstruct cross-reference table WARNING: issue-141a.pdf: Attempting to reconstruct cross-reference table