mirror of
https://github.com/qpdf/qpdf.git
synced 2024-12-22 02:49:00 +00:00
Move QPDF::reconstruct_xref to QPDF::Xref_table
Also, when recovering trailer from xref streams, pick the last valid trailer encountered rather than the first.
This commit is contained in:
parent
1e072e223a
commit
3fbff84594
@ -762,7 +762,6 @@ class QPDF
|
||||
void setTrailer(QPDFObjectHandle obj);
|
||||
void read_xref(qpdf_offset_t offset);
|
||||
bool resolveXRefTable();
|
||||
void reconstruct_xref(QPDFExc& e);
|
||||
bool parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes);
|
||||
bool read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type);
|
||||
bool read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type);
|
||||
|
@ -476,7 +476,7 @@ QPDF::parse(char const* password)
|
||||
}
|
||||
} catch (QPDFExc& e) {
|
||||
if (m->attempt_recovery) {
|
||||
reconstruct_xref(e);
|
||||
m->xref_table.reconstruct(e);
|
||||
QTC::TC("qpdf", "QPDF reconstructed xref table");
|
||||
} else {
|
||||
throw;
|
||||
@ -535,40 +535,42 @@ QPDF::setTrailer(QPDFObjectHandle obj)
|
||||
}
|
||||
|
||||
void
|
||||
QPDF::reconstruct_xref(QPDFExc& e)
|
||||
QPDF::Xref_table::reconstruct(QPDFExc& e)
|
||||
{
|
||||
if (m->xref_table.reconstructed) {
|
||||
if (reconstructed) {
|
||||
// Avoid xref reconstruction infinite loops. This is getting very hard to reproduce because
|
||||
// qpdf is throwing many fewer exceptions while parsing. Most situations are warnings now.
|
||||
throw e;
|
||||
}
|
||||
|
||||
auto* m = qpdf.m.get();
|
||||
|
||||
// If recovery generates more than 1000 warnings, the file is so severely damaged that there
|
||||
// probably is no point trying to continue.
|
||||
const auto max_warnings = m->warnings.size() + 1000U;
|
||||
auto check_warnings = [this, max_warnings]() {
|
||||
if (m->warnings.size() > max_warnings) {
|
||||
throw damagedPDF("", 0, "too many errors while reconstructing cross-reference table");
|
||||
if (qpdf.m->warnings.size() > max_warnings) {
|
||||
throw damaged_pdf("too many errors while reconstructing cross-reference table");
|
||||
}
|
||||
};
|
||||
|
||||
m->xref_table.reconstructed = true;
|
||||
reconstructed = true;
|
||||
// We may find more objects, which may contain dangling references.
|
||||
m->fixed_dangling_refs = false;
|
||||
|
||||
warn(damagedPDF("", 0, "file is damaged"));
|
||||
warn(e);
|
||||
warn(damagedPDF("", 0, "Attempting to reconstruct cross-reference table"));
|
||||
warn_damaged("file is damaged");
|
||||
qpdf.warn(e);
|
||||
warn_damaged("Attempting to reconstruct cross-reference table");
|
||||
|
||||
// Delete all references to type 1 (uncompressed) objects
|
||||
std::set<QPDFObjGen> to_delete;
|
||||
for (auto const& iter: m->xref_table) {
|
||||
for (auto const& iter: *this) {
|
||||
if (iter.second.getType() == 1) {
|
||||
to_delete.insert(iter.first);
|
||||
}
|
||||
}
|
||||
for (auto const& iter: to_delete) {
|
||||
m->xref_table.erase(iter);
|
||||
erase(iter);
|
||||
}
|
||||
|
||||
m->file->seek(0, SEEK_END);
|
||||
@ -577,46 +579,45 @@ QPDF::reconstruct_xref(QPDFExc& e)
|
||||
// Don't allow very long tokens here during recovery. All the interesting tokens are covered.
|
||||
static size_t const MAX_LEN = 10;
|
||||
while (m->file->tell() < eof) {
|
||||
QPDFTokenizer::Token t1 = readToken(*m->file, MAX_LEN);
|
||||
QPDFTokenizer::Token t1 = qpdf.readToken(*m->file, MAX_LEN);
|
||||
qpdf_offset_t token_start = m->file->tell() - toO(t1.getValue().length());
|
||||
if (t1.isInteger()) {
|
||||
auto pos = m->file->tell();
|
||||
QPDFTokenizer::Token t2 = readToken(*m->file, MAX_LEN);
|
||||
if ((t2.isInteger()) && (readToken(*m->file, MAX_LEN).isWord("obj"))) {
|
||||
QPDFTokenizer::Token t2 = qpdf.readToken(*m->file, MAX_LEN);
|
||||
if (t2.isInteger() && qpdf.readToken(*m->file, MAX_LEN).isWord("obj")) {
|
||||
int obj = QUtil::string_to_int(t1.getValue().c_str());
|
||||
int gen = QUtil::string_to_int(t2.getValue().c_str());
|
||||
if (obj <= m->xref_table.max_id) {
|
||||
m->xref_table.insert_reconstructed(obj, token_start, gen);
|
||||
if (obj <= max_id) {
|
||||
insert_reconstructed(obj, token_start, gen);
|
||||
} else {
|
||||
warn(damagedPDF(
|
||||
"", 0, "ignoring object with impossibly large id " + std::to_string(obj)));
|
||||
warn_damaged("ignoring object with impossibly large id " + std::to_string(obj));
|
||||
}
|
||||
}
|
||||
m->file->seek(pos, SEEK_SET);
|
||||
} else if (!m->xref_table.trailer && t1.isWord("trailer")) {
|
||||
} else if (!trailer && t1.isWord("trailer")) {
|
||||
auto pos = m->file->tell();
|
||||
QPDFObjectHandle t = readTrailer();
|
||||
QPDFObjectHandle t = qpdf.readTrailer();
|
||||
if (!t.isDictionary()) {
|
||||
// Oh well. It was worth a try.
|
||||
} else {
|
||||
setTrailer(t);
|
||||
qpdf.setTrailer(t);
|
||||
}
|
||||
m->file->seek(pos, SEEK_SET);
|
||||
}
|
||||
check_warnings();
|
||||
m->file->findAndSkipNextEOL();
|
||||
}
|
||||
m->xref_table.deleted_objects.clear();
|
||||
deleted_objects.clear();
|
||||
|
||||
if (!m->xref_table.trailer) {
|
||||
if (!trailer) {
|
||||
qpdf_offset_t max_offset{0};
|
||||
// If there are any xref streams, take the last one to appear.
|
||||
for (auto const& iter: m->xref_table) {
|
||||
for (auto const& iter: *this) {
|
||||
auto entry = iter.second;
|
||||
if (entry.getType() != 1) {
|
||||
continue;
|
||||
}
|
||||
auto oh = getObjectByObjGen(iter.first);
|
||||
auto oh = qpdf.getObjectByObjGen(iter.first);
|
||||
try {
|
||||
if (!oh.isStreamOfType("/XRef")) {
|
||||
continue;
|
||||
@ -627,41 +628,41 @@ QPDF::reconstruct_xref(QPDFExc& e)
|
||||
auto offset = entry.getOffset();
|
||||
if (offset > max_offset) {
|
||||
max_offset = offset;
|
||||
setTrailer(oh.getDict());
|
||||
trailer = oh.getDict();
|
||||
}
|
||||
check_warnings();
|
||||
}
|
||||
if (max_offset > 0) {
|
||||
try {
|
||||
read_xref(max_offset);
|
||||
qpdf.read_xref(max_offset);
|
||||
} catch (std::exception&) {
|
||||
throw damagedPDF(
|
||||
"", 0, "error decoding candidate xref stream while recovering damaged file");
|
||||
throw damaged_pdf(
|
||||
"error decoding candidate xref stream while recovering damaged file");
|
||||
}
|
||||
QTC::TC("qpdf", "QPDF recover xref stream");
|
||||
}
|
||||
}
|
||||
|
||||
if (!m->xref_table.trailer) {
|
||||
if (!trailer) {
|
||||
// We could check the last encountered object to see if it was an xref stream. If so, we
|
||||
// could try to get the trailer from there. This may make it possible to recover files with
|
||||
// bad startxref pointers even when they have object streams.
|
||||
|
||||
throw damagedPDF("", 0, "unable to find trailer dictionary while recovering damaged file");
|
||||
throw damaged_pdf("unable to find trailer dictionary while recovering damaged file");
|
||||
}
|
||||
if (m->xref_table.empty()) {
|
||||
if (empty()) {
|
||||
// We cannot check for an empty xref table in parse because empty tables are valid when
|
||||
// creating QPDF objects from JSON.
|
||||
throw damagedPDF("", 0, "unable to find objects while recovering damaged file");
|
||||
throw damaged_pdf("unable to find objects while recovering damaged file");
|
||||
}
|
||||
check_warnings();
|
||||
if (!m->xref_table.parsed) {
|
||||
m->xref_table.parsed = true;
|
||||
getAllPages();
|
||||
if (!parsed) {
|
||||
parsed = true;
|
||||
qpdf.getAllPages();
|
||||
check_warnings();
|
||||
if (m->all_pages.empty()) {
|
||||
m->xref_table.parsed = false;
|
||||
throw damagedPDF("", 0, "unable to find any pages while recovering damaged file");
|
||||
parsed = false;
|
||||
throw damaged_pdf("unable to find any pages while recovering damaged file");
|
||||
}
|
||||
}
|
||||
// We could iterate through the objects looking for streams and try to find objects inside of
|
||||
@ -1766,7 +1767,7 @@ QPDF::readObjectAtOffset(
|
||||
} catch (QPDFExc& e) {
|
||||
if (try_recovery) {
|
||||
// Try again after reconstructing xref table
|
||||
reconstruct_xref(e);
|
||||
m->xref_table.reconstruct(e);
|
||||
if (m->xref_table.count(exp_og) && (m->xref_table[exp_og].getType() == 1)) {
|
||||
qpdf_offset_t new_offset = m->xref_table[exp_og].getOffset();
|
||||
QPDFObjectHandle result =
|
||||
|
@ -16,6 +16,8 @@ class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry>
|
||||
void insert(int obj, int f0, qpdf_offset_t f1, int f2);
|
||||
void insert_free(QPDFObjGen);
|
||||
|
||||
void reconstruct(QPDFExc& e);
|
||||
|
||||
QPDFObjectHandle trailer;
|
||||
bool reconstructed{false};
|
||||
// Various tables are indexed by object id, with potential size id + 1
|
||||
@ -31,6 +33,17 @@ class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry>
|
||||
qpdf_offset_t first_item_offset{0}; // actual value from file
|
||||
|
||||
private:
|
||||
QPDFExc
|
||||
damaged_pdf(std::string const& msg)
|
||||
{
|
||||
return qpdf.damagedPDF("", 0, msg);
|
||||
}
|
||||
|
||||
void
|
||||
warn_damaged(std::string const& msg)
|
||||
{
|
||||
qpdf.warn(damaged_pdf(msg));
|
||||
}
|
||||
QPDF& qpdf;
|
||||
};
|
||||
|
||||
|
@ -102,11 +102,10 @@ $td->runtest("recover file with xref stream",
|
||||
{$td->COMMAND => "qpdf --static-id --compress-streams=n" .
|
||||
" recover-xref-stream.pdf a.pdf"},
|
||||
{$td->FILE => "recover-xref-stream.out", $td->EXIT_STATUS => 3},
|
||||
$td->EXPECT_FAILURE);
|
||||
$td->NORMALIZE_NEWLINES);
|
||||
$td->runtest("check file",
|
||||
{$td->FILE => "a.pdf"},
|
||||
{$td->FILE => "recover-xref-stream-recovered.pdf"},
|
||||
$td->EXPECT_FAILURE);
|
||||
{$td->FILE => "recover-xref-stream-recovered.pdf"});
|
||||
|
||||
# Self-referential object stream
|
||||
$td->runtest("self-referential object stream",
|
||||
|
@ -1,5 +1,4 @@
|
||||
WARNING: recover-xref-stream.pdf: file is damaged
|
||||
WARNING: recover-xref-stream.pdf: can't find startxref
|
||||
WARNING: recover-xref-stream.pdf: Attempting to reconstruct cross-reference table
|
||||
WARNING: recover-xref-stream.pdf: reported number of objects (14) is not one plus the highest object number (15)
|
||||
qpdf: operation succeeded with warnings; resulting file may have some problems
|
||||
|
Loading…
Reference in New Issue
Block a user