diff --git a/include/qpdf/QPDF.hh b/include/qpdf/QPDF.hh index 472336c9..7650275b 100644 --- a/include/qpdf/QPDF.hh +++ b/include/qpdf/QPDF.hh @@ -762,7 +762,6 @@ class QPDF void setTrailer(QPDFObjectHandle obj); void read_xref(qpdf_offset_t offset); bool resolveXRefTable(); - void reconstruct_xref(QPDFExc& e); bool parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes); bool read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type); bool read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type); diff --git a/libqpdf/QPDF.cc b/libqpdf/QPDF.cc index db4e5022..e86b93e7 100644 --- a/libqpdf/QPDF.cc +++ b/libqpdf/QPDF.cc @@ -476,7 +476,7 @@ QPDF::parse(char const* password) } } catch (QPDFExc& e) { if (m->attempt_recovery) { - reconstruct_xref(e); + m->xref_table.reconstruct(e); QTC::TC("qpdf", "QPDF reconstructed xref table"); } else { throw; @@ -535,40 +535,42 @@ QPDF::setTrailer(QPDFObjectHandle obj) } void -QPDF::reconstruct_xref(QPDFExc& e) +QPDF::Xref_table::reconstruct(QPDFExc& e) { - if (m->xref_table.reconstructed) { + if (reconstructed) { // Avoid xref reconstruction infinite loops. This is getting very hard to reproduce because // qpdf is throwing many fewer exceptions while parsing. Most situations are warnings now. throw e; } + auto* m = qpdf.m.get(); + // If recovery generates more than 1000 warnings, the file is so severely damaged that there // probably is no point trying to continue. const auto max_warnings = m->warnings.size() + 1000U; auto check_warnings = [this, max_warnings]() { - if (m->warnings.size() > max_warnings) { - throw damagedPDF("", 0, "too many errors while reconstructing cross-reference table"); + if (qpdf.m->warnings.size() > max_warnings) { + throw damaged_pdf("too many errors while reconstructing cross-reference table"); } }; - m->xref_table.reconstructed = true; + reconstructed = true; // We may find more objects, which may contain dangling references. m->fixed_dangling_refs = false; - warn(damagedPDF("", 0, "file is damaged")); - warn(e); - warn(damagedPDF("", 0, "Attempting to reconstruct cross-reference table")); + warn_damaged("file is damaged"); + qpdf.warn(e); + warn_damaged("Attempting to reconstruct cross-reference table"); // Delete all references to type 1 (uncompressed) objects std::set to_delete; - for (auto const& iter: m->xref_table) { + for (auto const& iter: *this) { if (iter.second.getType() == 1) { to_delete.insert(iter.first); } } for (auto const& iter: to_delete) { - m->xref_table.erase(iter); + erase(iter); } m->file->seek(0, SEEK_END); @@ -577,46 +579,45 @@ QPDF::reconstruct_xref(QPDFExc& e) // Don't allow very long tokens here during recovery. All the interesting tokens are covered. static size_t const MAX_LEN = 10; while (m->file->tell() < eof) { - QPDFTokenizer::Token t1 = readToken(*m->file, MAX_LEN); + QPDFTokenizer::Token t1 = qpdf.readToken(*m->file, MAX_LEN); qpdf_offset_t token_start = m->file->tell() - toO(t1.getValue().length()); if (t1.isInteger()) { auto pos = m->file->tell(); - QPDFTokenizer::Token t2 = readToken(*m->file, MAX_LEN); - if ((t2.isInteger()) && (readToken(*m->file, MAX_LEN).isWord("obj"))) { + QPDFTokenizer::Token t2 = qpdf.readToken(*m->file, MAX_LEN); + if (t2.isInteger() && qpdf.readToken(*m->file, MAX_LEN).isWord("obj")) { int obj = QUtil::string_to_int(t1.getValue().c_str()); int gen = QUtil::string_to_int(t2.getValue().c_str()); - if (obj <= m->xref_table.max_id) { - m->xref_table.insert_reconstructed(obj, token_start, gen); + if (obj <= max_id) { + insert_reconstructed(obj, token_start, gen); } else { - warn(damagedPDF( - "", 0, "ignoring object with impossibly large id " + std::to_string(obj))); + warn_damaged("ignoring object with impossibly large id " + std::to_string(obj)); } } m->file->seek(pos, SEEK_SET); - } else if (!m->xref_table.trailer && t1.isWord("trailer")) { + } else if (!trailer && t1.isWord("trailer")) { auto pos = m->file->tell(); - QPDFObjectHandle t = readTrailer(); + QPDFObjectHandle t = qpdf.readTrailer(); if (!t.isDictionary()) { // Oh well. It was worth a try. } else { - setTrailer(t); + qpdf.setTrailer(t); } m->file->seek(pos, SEEK_SET); } check_warnings(); m->file->findAndSkipNextEOL(); } - m->xref_table.deleted_objects.clear(); + deleted_objects.clear(); - if (!m->xref_table.trailer) { + if (!trailer) { qpdf_offset_t max_offset{0}; // If there are any xref streams, take the last one to appear. - for (auto const& iter: m->xref_table) { + for (auto const& iter: *this) { auto entry = iter.second; if (entry.getType() != 1) { continue; } - auto oh = getObjectByObjGen(iter.first); + auto oh = qpdf.getObjectByObjGen(iter.first); try { if (!oh.isStreamOfType("/XRef")) { continue; @@ -627,41 +628,41 @@ QPDF::reconstruct_xref(QPDFExc& e) auto offset = entry.getOffset(); if (offset > max_offset) { max_offset = offset; - setTrailer(oh.getDict()); + trailer = oh.getDict(); } check_warnings(); } if (max_offset > 0) { try { - read_xref(max_offset); + qpdf.read_xref(max_offset); } catch (std::exception&) { - throw damagedPDF( - "", 0, "error decoding candidate xref stream while recovering damaged file"); + throw damaged_pdf( + "error decoding candidate xref stream while recovering damaged file"); } QTC::TC("qpdf", "QPDF recover xref stream"); } } - if (!m->xref_table.trailer) { + if (!trailer) { // We could check the last encountered object to see if it was an xref stream. If so, we // could try to get the trailer from there. This may make it possible to recover files with // bad startxref pointers even when they have object streams. - throw damagedPDF("", 0, "unable to find trailer dictionary while recovering damaged file"); + throw damaged_pdf("unable to find trailer dictionary while recovering damaged file"); } - if (m->xref_table.empty()) { + if (empty()) { // We cannot check for an empty xref table in parse because empty tables are valid when // creating QPDF objects from JSON. - throw damagedPDF("", 0, "unable to find objects while recovering damaged file"); + throw damaged_pdf("unable to find objects while recovering damaged file"); } check_warnings(); - if (!m->xref_table.parsed) { - m->xref_table.parsed = true; - getAllPages(); + if (!parsed) { + parsed = true; + qpdf.getAllPages(); check_warnings(); if (m->all_pages.empty()) { - m->xref_table.parsed = false; - throw damagedPDF("", 0, "unable to find any pages while recovering damaged file"); + parsed = false; + throw damaged_pdf("unable to find any pages while recovering damaged file"); } } // We could iterate through the objects looking for streams and try to find objects inside of @@ -1766,7 +1767,7 @@ QPDF::readObjectAtOffset( } catch (QPDFExc& e) { if (try_recovery) { // Try again after reconstructing xref table - reconstruct_xref(e); + m->xref_table.reconstruct(e); if (m->xref_table.count(exp_og) && (m->xref_table[exp_og].getType() == 1)) { qpdf_offset_t new_offset = m->xref_table[exp_og].getOffset(); QPDFObjectHandle result = diff --git a/libqpdf/qpdf/QPDF_private.hh b/libqpdf/qpdf/QPDF_private.hh index 7a19adcc..881d995f 100644 --- a/libqpdf/qpdf/QPDF_private.hh +++ b/libqpdf/qpdf/QPDF_private.hh @@ -16,6 +16,8 @@ class QPDF::Xref_table: public std::map void insert(int obj, int f0, qpdf_offset_t f1, int f2); void insert_free(QPDFObjGen); + void reconstruct(QPDFExc& e); + QPDFObjectHandle trailer; bool reconstructed{false}; // Various tables are indexed by object id, with potential size id + 1 @@ -31,6 +33,17 @@ class QPDF::Xref_table: public std::map qpdf_offset_t first_item_offset{0}; // actual value from file private: + QPDFExc + damaged_pdf(std::string const& msg) + { + return qpdf.damagedPDF("", 0, msg); + } + + void + warn_damaged(std::string const& msg) + { + qpdf.warn(damaged_pdf(msg)); + } QPDF& qpdf; }; diff --git a/qpdf/qtest/object-stream.test b/qpdf/qtest/object-stream.test index e9468670..bed3fefa 100644 --- a/qpdf/qtest/object-stream.test +++ b/qpdf/qtest/object-stream.test @@ -102,11 +102,10 @@ $td->runtest("recover file with xref stream", {$td->COMMAND => "qpdf --static-id --compress-streams=n" . " recover-xref-stream.pdf a.pdf"}, {$td->FILE => "recover-xref-stream.out", $td->EXIT_STATUS => 3}, - $td->EXPECT_FAILURE); + $td->NORMALIZE_NEWLINES); $td->runtest("check file", {$td->FILE => "a.pdf"}, - {$td->FILE => "recover-xref-stream-recovered.pdf"}, - $td->EXPECT_FAILURE); + {$td->FILE => "recover-xref-stream-recovered.pdf"}); # Self-referential object stream $td->runtest("self-referential object stream", diff --git a/qpdf/qtest/qpdf/recover-xref-stream.out b/qpdf/qtest/qpdf/recover-xref-stream.out index ba0e1aa6..ffc4cced 100644 --- a/qpdf/qtest/qpdf/recover-xref-stream.out +++ b/qpdf/qtest/qpdf/recover-xref-stream.out @@ -1,5 +1,4 @@ WARNING: recover-xref-stream.pdf: file is damaged WARNING: recover-xref-stream.pdf: can't find startxref WARNING: recover-xref-stream.pdf: Attempting to reconstruct cross-reference table -WARNING: recover-xref-stream.pdf: reported number of objects (14) is not one plus the highest object number (15) qpdf: operation succeeded with warnings; resulting file may have some problems