diff --git a/include/qpdf/QPDF.hh b/include/qpdf/QPDF.hh index 852c9e09..b94f83c6 100644 --- a/include/qpdf/QPDF.hh +++ b/include/qpdf/QPDF.hh @@ -737,6 +737,7 @@ class QPDF // For testing only -- do not add to DLL static bool test_json_validators(); + void test_xref(); private: // It has never been safe to copy QPDF objects as there is code in the library that assumes diff --git a/libqpdf/QPDF.cc b/libqpdf/QPDF.cc index 0248bf47..11cc499a 100644 --- a/libqpdf/QPDF.cc +++ b/libqpdf/QPDF.cc @@ -1133,3 +1133,9 @@ QPDF::removeSecurityRestrictions() acroform.replaceKey("/SigFlags", QPDFObjectHandle::newInteger(0)); } } + +void +QPDF::test_xref() +{ + objects().xref_table().test(); +} diff --git a/libqpdf/QPDF_objects.cc b/libqpdf/QPDF_objects.cc index 746c690d..70b8b645 100644 --- a/libqpdf/QPDF_objects.cc +++ b/libqpdf/QPDF_objects.cc @@ -2,6 +2,7 @@ #include +#include #include #include #include @@ -89,6 +90,21 @@ namespace }; } // namespace +void +Xref_table::test() +{ + std::cout << "id, gen, offset, length, next\n"; + int i = 0; + for (auto const& entry: table) { + if (entry.type() == 1) { + std::cout << i << ", " << entry.gen() << ", " << entry.type() << ", " << entry.offset() + << ", " << entry.length() << ", " << (entry.offset() + toO(entry.length())) + << '\n'; + } + ++i; + } +} + bool QPDF::findStartxref() { @@ -143,6 +159,7 @@ Xref_table::initialize() PatternFinder sf(qpdf, &QPDF::findStartxref); qpdf_offset_t xref_offset = 0; if (file->findLast("startxref", start_offset, 0, sf)) { + offsets.emplace_back(file->tell(), 0); xref_offset = QUtil::string_to_ll(read_token().getValue().c_str()); } @@ -167,10 +184,30 @@ Xref_table::initialize() } } + calc_lengths(); prepare_obj_table(); initialized_ = true; } +void +Xref_table::calc_lengths() +{ + if (offsets.size() > 1) { + std::sort(offsets.begin(), offsets.end()); + size_t id = 0; + auto end = table.size(); + qpdf_offset_t offset = 0; + for (auto const& item: offsets) { + if (id && id < end) { + table[id].length_ = toS(item.first - offset); + } + offset = item.first; + id = item.second; + } + } + offsets.clear(); +} + // Remove any dangling reference picked up while parsing or reconstructing the xref table from the // object table. void @@ -234,6 +271,7 @@ Xref_table::reconstruct(QPDFExc& e) file->seek(0, SEEK_END); qpdf_offset_t eof = file->tell(); + offsets.emplace_back(eof, 0); file->seek(0, SEEK_SET); // Don't allow very long tokens here during recovery. All the interesting tokens are covered. static size_t const MAX_LEN = 10; @@ -256,8 +294,13 @@ Xref_table::reconstruct(QPDFExc& e) } } file->seek(pos, SEEK_SET); - } else if (!trailer_ && t1.isWord("trailer")) { - trailers.emplace_back(file->tell()); + } else if (t1.isWord("trailer")) { + offsets.emplace_back(token_start, 0); + if (!trailer_) { + trailers.emplace_back(file->tell()); + } + } else if (t1.isWord("xref")) { + offsets.emplace_back(token_start, 0); } file->findAndSkipNextEOL(); } @@ -280,8 +323,9 @@ Xref_table::reconstruct(QPDFExc& e) for (auto it = found_objects.rbegin(); it != rend; it++) { auto [obj, gen, token_start] = *it; insert(obj, 1, token_start, gen); - check_warnings(); } + calc_lengths(); + check_warnings(); if (!trailer_) { qpdf_offset_t max_offset{0}; @@ -401,6 +445,7 @@ Xref_table::read(qpdf_offset_t xref_offset) while (QUtil::is_space(buf[skip])) { ++skip; } + offsets.emplace_back(xref_offset, 0); xref_offset = process_section(xref_offset + skip); } else { xref_offset = read_stream(xref_offset); @@ -1037,6 +1082,11 @@ Xref_table::insert(int obj, int f0, qpdf_offset_t f1, int f2) // entry. This will need to be revisited when we want to support incremental updates or more // comprehensive checking. QTC::TC("qpdf", "QPDF xref replaced / deleted object", old_type == 0 ? 0 : 1); + if (f0 == 1) { + // Save offset of deleted/replaced object to allow us to calculate object length once we + // are finished loading the xref table. + offsets.emplace_back(f1, 0); + } return; } @@ -1051,12 +1101,13 @@ Xref_table::insert(int obj, int f0, qpdf_offset_t f1, int f2) // f2 is generation QTC::TC("qpdf", "QPDF xref gen > 0", (f2 > 0) ? 1 : 0); entry = {f2, Uncompressed(f1)}; - break; + offsets.emplace_back(f1, static_cast(obj)); + return; case 2: entry = {0, Compressed(toI(f1), f2)}; object_streams_ = true; - break; + return; default: throw qpdf.damagedPDF( diff --git a/libqpdf/qpdf/QPDF_objects.hh b/libqpdf/qpdf/QPDF_objects.hh index 3a5c6358..5d3b26bc 100644 --- a/libqpdf/qpdf/QPDF_objects.hh +++ b/libqpdf/qpdf/QPDF_objects.hh @@ -228,6 +228,8 @@ class QPDF::Objects return first_item_offset_; } + void test(); + private: // Object, count, offset of first entry typedef std::tuple Subsection; @@ -294,7 +296,14 @@ class QPDF::Objects return type() == 2 ? std::get<2>(entry).stream_index : 0; } + size_t + length() const noexcept + { + return length_; + } + int gen_{0}; + size_t length_{0}; // For uncompressed objects. Xref entry; qpdf_offset_t end_before_space_{0}; qpdf_offset_t end_after_space_{0}; @@ -314,6 +323,7 @@ class QPDF::Objects } void read(qpdf_offset_t offset); + void calc_lengths(); void prepare_obj_table(); // Methods to parse tables @@ -376,6 +386,9 @@ class QPDF::Objects std::vector table; QPDFObjectHandle trailer_; + // Temporary offset table used to calculate uncompressed object length. + std::vector> offsets; + bool attempt_recovery_{true}; bool initialized_{false}; bool ignore_streams_{false}; diff --git a/libtests/CMakeLists.txt b/libtests/CMakeLists.txt index 7d2ecbd6..4c4b6120 100644 --- a/libtests/CMakeLists.txt +++ b/libtests/CMakeLists.txt @@ -34,7 +34,8 @@ set(TEST_PROGRAMS rc4 runlength sha2 - sparse_array) + sparse_array + xref) set(TEST_C_PROGRAMS logger_c) diff --git a/libtests/qtest/xref.test b/libtests/qtest/xref.test new file mode 100644 index 00000000..8237badc --- /dev/null +++ b/libtests/qtest/xref.test @@ -0,0 +1,34 @@ +#!/usr/bin/env perl +require 5.008; +use warnings; +use strict; + +unshift(@INC, '.'); + +chdir("xref") or die "chdir testdir failed: $!\n"; + +require TestDriver; + +my $td = new TestDriver('xref'); + +my @files = ("incremental-1", + "incremental-1-bad", + ); + +foreach my $file (@files) +{ + $td->runtest("xref $file", + {$td->COMMAND => "xref $file.pdf"}, + {$td->FILE => "$file.out", + $td->EXIT_STATUS => 0}, + $td->NORMALIZE_NEWLINES); +} + +cleanup(); + +$td->report(scalar(@files)); + +sub cleanup +{ + unlink "tmp"; +} diff --git a/libtests/qtest/xref/incremental-1-bad.out b/libtests/qtest/xref/incremental-1-bad.out new file mode 100644 index 00000000..0ba09b0d --- /dev/null +++ b/libtests/qtest/xref/incremental-1-bad.out @@ -0,0 +1,12 @@ +WARNING: incremental-1-bad.pdf: file is damaged +WARNING: incremental-1-bad.pdf (offset 1241): xref not found +WARNING: incremental-1-bad.pdf: Attempting to reconstruct cross-reference table +id, gen, offset, length, next +1, 0, 1, 9, 93, 102 +2, 0, 1, 102, 72, 174 +3, 0, 1, 1108, 172, 1280 +4, 1, 1, 987, 26, 1013 +5, 0, 1, 442, 35, 477 +6, 0, 1, 477, 118, 595 +7, 0, 1, 1013, 95, 1108 +xref done diff --git a/libtests/qtest/xref/incremental-1-bad.pdf b/libtests/qtest/xref/incremental-1-bad.pdf new file mode 100644 index 00000000..a6a93dd0 --- /dev/null +++ b/libtests/qtest/xref/incremental-1-bad.pdf @@ -0,0 +1,145 @@ +%PDF-1.3 +1 0 obj +<< + /Type /Catalog + /Pages 2 0 R +>> +endobj + +2 0 obj +<< + /Type /Pages + /Kids [ + 3 0 R + ] + /Count 1 +>> +endobj + +3 0 obj +<< + /Type /Page + /Parent 2 0 R + /MediaBox [0 0 612 792] + /Contents 4 0 R + /Resources << + /ProcSet 5 0 R + /Font << + /F1 6 0 R + >> + >> +>> +endobj + +4 0 obj +<< + /Length 44 +>> +stream +BT + /F1 24 Tf + 72 720 Td + (Potato) Tj +ET +endstream +endobj + +5 0 obj +[ + /PDF + /Text +] +endobj + +6 0 obj +<< + /Type /Font + /Subtype /Type1 + /Name /F1 + /BaseFont /Helvetica + /Encoding /WinAnsiEncoding +>> +endobj + +xref +0 7 +0000000000 65535 f +0000000009 00000 n +0000000063 00000 n +0000000135 00000 n +0000000307 00000 n +0000000403 00000 n +0000000438 00000 n +trailer << + /Size 7 + /Root 1 0 R +>> +startxref +556 +%%EOF + +% Delete object 4 and increment generation +xref +0 1 +0000000004 65535 f +4 1 +0000000000 00001 f +trailer << + /Size 7 + /Root 1 0 R + /Prev 556 +>> +startxref +807 +%%EOF + +% Reuse object 4 +4 1 obj +[ 7 0 R ] +endobj + +7 0 obj +<< + /Length 43 +>> +stream +BT + /F1 24 Tf + 72 720 Td + (Salad) Tj +ET +endstream +endobj + +3 0 obj +<< + /Type /Page + /Parent 2 0 R + /MediaBox [0 0 612 792] + /Contents 4 1 R + /Resources << + /ProcSet 5 0 R + /Font << + /F1 6 0 R + >> + >> +>> +endobj + +xref +0 1 +0000000000 65535 f +3 2 +0000001069 00000 n +0000000948 00001 n +7 1 +0000000974 00000 n +trailer << + /Size 8 + /Root 1 0 R + /Prev 807 + /Gone 4 0 R +>> +startxref +1241 +%%EOF diff --git a/libtests/qtest/xref/incremental-1.out b/libtests/qtest/xref/incremental-1.out new file mode 100644 index 00000000..dfddcc2c --- /dev/null +++ b/libtests/qtest/xref/incremental-1.out @@ -0,0 +1,9 @@ +id, gen, offset, length, next +1, 0, 1, 9, 54, 63 +2, 0, 1, 63, 72, 135 +3, 0, 1, 1069, 172, 1241 +4, 1, 1, 948, 26, 974 +5, 0, 1, 403, 35, 438 +6, 0, 1, 438, 118, 556 +7, 0, 1, 974, 95, 1069 +xref done diff --git a/libtests/qtest/xref/incremental-1.pdf b/libtests/qtest/xref/incremental-1.pdf new file mode 100644 index 00000000..eed2357f --- /dev/null +++ b/libtests/qtest/xref/incremental-1.pdf @@ -0,0 +1,145 @@ +%PDF-1.3 +1 0 obj +<< + /Type /Catalog + /Pages 2 0 R +>> +endobj + +2 0 obj +<< + /Type /Pages + /Kids [ + 3 0 R + ] + /Count 1 +>> +endobj + +3 0 obj +<< + /Type /Page + /Parent 2 0 R + /MediaBox [0 0 612 792] + /Contents 4 0 R + /Resources << + /ProcSet 5 0 R + /Font << + /F1 6 0 R + >> + >> +>> +endobj + +4 0 obj +<< + /Length 44 +>> +stream +BT + /F1 24 Tf + 72 720 Td + (Potato) Tj +ET +endstream +endobj + +5 0 obj +[ + /PDF + /Text +] +endobj + +6 0 obj +<< + /Type /Font + /Subtype /Type1 + /Name /F1 + /BaseFont /Helvetica + /Encoding /WinAnsiEncoding +>> +endobj + +xref +0 7 +0000000000 65535 f +0000000009 00000 n +0000000063 00000 n +0000000135 00000 n +0000000307 00000 n +0000000403 00000 n +0000000438 00000 n +trailer << + /Size 7 + /Root 1 0 R +>> +startxref +556 +%%EOF + +% Delete object 4 and increment generation +xref +0 1 +0000000004 65535 f +4 1 +0000000000 00001 f +trailer << + /Size 7 + /Root 1 0 R + /Prev 556 +>> +startxref +807 +%%EOF + +% Reuse object 4 +4 1 obj +[ 7 0 R ] +endobj + +7 0 obj +<< + /Length 43 +>> +stream +BT + /F1 24 Tf + 72 720 Td + (Salad) Tj +ET +endstream +endobj + +3 0 obj +<< + /Type /Page + /Parent 2 0 R + /MediaBox [0 0 612 792] + /Contents 4 1 R + /Resources << + /ProcSet 5 0 R + /Font << + /F1 6 0 R + >> + >> +>> +endobj + +xref +0 1 +0000000000 65535 f +3 2 +0000001069 00000 n +0000000948 00001 n +7 1 +0000000974 00000 n +trailer << + /Size 8 + /Root 1 0 R + /Prev 807 + /Gone 4 0 R +>> +startxref +1241 +%%EOF diff --git a/libtests/xref.cc b/libtests/xref.cc new file mode 100644 index 00000000..11db4013 --- /dev/null +++ b/libtests/xref.cc @@ -0,0 +1,25 @@ +#include + +#include +#include +#include + +int +main(int argc, char* argv[]) +{ + if (argc != 2) { + std::cerr << "usage: xref INFILE\n"; + std::exit(2); + } + + try { + QPDF qpdf; + qpdf.processFile(argv[1]); + qpdf.test_xref(); + } catch (std::exception& e) { + std::cerr << e.what() << '\n'; + std::exit(2); + } + std::cout << "xref done\n"; + return 0; +}