diff --git a/include/qpdf/QPDF.hh b/include/qpdf/QPDF.hh index aa4dabef..8dc0af29 100644 --- a/include/qpdf/QPDF.hh +++ b/include/qpdf/QPDF.hh @@ -733,6 +733,7 @@ class QPDF class ParseGuard; class Pipe; class JobSetter; + class Xref_table; // For testing only -- do not add to DLL static bool test_json_validators(); diff --git a/libqpdf/QPDF.cc b/libqpdf/QPDF.cc index 37affabb..c6a90892 100644 --- a/libqpdf/QPDF.cc +++ b/libqpdf/QPDF.cc @@ -303,7 +303,7 @@ QPDF::registerStreamFilter( void QPDF::setIgnoreXRefStreams(bool val) { - m->ignore_xref_streams = val; + m->xref_table.ignore_streams = val; } std::shared_ptr @@ -341,6 +341,7 @@ void QPDF::setAttemptRecovery(bool val) { m->attempt_recovery = val; + m->xref_table.attempt_recovery = val; } void @@ -447,11 +448,11 @@ QPDF::parse(char const* password) // 30 characters to leave room for the startxref stuff. m->file->seek(0, SEEK_END); qpdf_offset_t end_offset = m->file->tell(); - m->xref_table_max_offset = end_offset; + m->xref_table.max_offset = end_offset; // Sanity check on object ids. All objects must appear in xref table / stream. In all realistic // scenarios at least 3 bytes are required. - if (m->xref_table_max_id > m->xref_table_max_offset / 3) { - m->xref_table_max_id = static_cast(m->xref_table_max_offset / 3); + if (m->xref_table.max_id > m->xref_table.max_offset / 3) { + m->xref_table.max_id = static_cast(m->xref_table.max_offset / 3); } qpdf_offset_t start_offset = (end_offset > 1054 ? end_offset - 1054 : 0); PatternFinder sf(*this, &QPDF::findStartxref); @@ -482,7 +483,7 @@ QPDF::parse(char const* password) } initializeEncryption(); - m->parsed = true; + m->xref_table.parsed = true; if (m->xref_table.size() > 0 && !getRoot().getKey("/Pages").isDictionary()) { // QPDFs created from JSON have an empty xref table and no root object yet. throw damagedPDF("", 0, "unable to find page tree"); @@ -526,16 +527,16 @@ QPDF::warn( void QPDF::setTrailer(QPDFObjectHandle obj) { - if (m->trailer) { + if (m->xref_table.trailer) { return; } - m->trailer = obj; + m->xref_table.trailer = obj; } void QPDF::reconstruct_xref(QPDFExc& e) { - if (m->reconstructed_xref) { + if (m->xref_table.reconstructed) { // Avoid xref reconstruction infinite loops. This is getting very hard to reproduce because // qpdf is throwing many fewer exceptions while parsing. Most situations are warnings now. throw e; @@ -550,7 +551,7 @@ QPDF::reconstruct_xref(QPDFExc& e) } }; - m->reconstructed_xref = true; + m->xref_table.reconstructed = true; // We may find more objects, which may contain dangling references. m->fixed_dangling_refs = false; @@ -583,7 +584,7 @@ QPDF::reconstruct_xref(QPDFExc& e) if ((t2.isInteger()) && (readToken(*m->file, MAX_LEN).isWord("obj"))) { int obj = QUtil::string_to_int(t1.getValue().c_str()); int gen = QUtil::string_to_int(t2.getValue().c_str()); - if (obj <= m->xref_table_max_id) { + if (obj <= m->xref_table.max_id) { insertReconstructedXrefEntry(obj, token_start, gen); } else { warn(damagedPDF( @@ -591,7 +592,7 @@ QPDF::reconstruct_xref(QPDFExc& e) } } m->file->seek(pos, SEEK_SET); - } else if (!m->trailer && t1.isWord("trailer")) { + } else if (!m->xref_table.trailer && t1.isWord("trailer")) { auto pos = m->file->tell(); QPDFObjectHandle t = readTrailer(); if (!t.isDictionary()) { @@ -604,9 +605,9 @@ QPDF::reconstruct_xref(QPDFExc& e) check_warnings(); m->file->findAndSkipNextEOL(); } - m->deleted_objects.clear(); + m->xref_table.deleted_objects.clear(); - if (!m->trailer) { + if (!m->xref_table.trailer) { qpdf_offset_t max_offset{0}; // If there are any xref streams, take the last one to appear. for (auto const& iter: m->xref_table) { @@ -640,7 +641,7 @@ QPDF::reconstruct_xref(QPDFExc& e) } } - if (!m->trailer) { + if (!m->xref_table.trailer) { // We could check the last encountered object to see if it was an xref stream. If so, we // could try to get the trailer from there. This may make it possible to recover files with // bad startxref pointers even when they have object streams. @@ -653,12 +654,12 @@ QPDF::reconstruct_xref(QPDFExc& e) throw damagedPDF("", 0, "unable to find objects while recovering damaged file"); } check_warnings(); - if (!m->parsed) { - m->parsed = true; + if (!m->xref_table.parsed) { + m->xref_table.parsed = true; getAllPages(); check_warnings(); if (m->all_pages.empty()) { - m->parsed = false; + m->xref_table.parsed = false; throw damagedPDF("", 0, "unable to find any pages while recovering damaged file"); } } @@ -730,16 +731,16 @@ QPDF::read_xref(qpdf_offset_t xref_offset) } } - if (!m->trailer) { + if (!m->xref_table.trailer) { throw damagedPDF("", 0, "unable to find trailer while reading xref"); } - int size = m->trailer.getKey("/Size").getIntValueAsInt(); + int size = m->xref_table.trailer.getKey("/Size").getIntValueAsInt(); int max_obj = 0; if (!m->xref_table.empty()) { max_obj = m->xref_table.rbegin()->first.getObj(); } - if (!m->deleted_objects.empty()) { - max_obj = std::max(max_obj, *(m->deleted_objects.rbegin())); + if (!m->xref_table.deleted_objects.empty()) { + max_obj = std::max(max_obj, *(m->xref_table.deleted_objects.rbegin())); } if ((size < 1) || (size - 1 != max_obj)) { QTC::TC("qpdf", "QPDF xref size mismatch"); @@ -752,7 +753,7 @@ QPDF::read_xref(qpdf_offset_t xref_offset) // We no longer need the deleted_objects table, so go ahead and clear it out to make sure we // never depend on its being set. - m->deleted_objects.clear(); + m->xref_table.deleted_objects.clear(); // Make sure we keep only the highest generation for any object. QPDFObjGen last_og{-1, 0}; @@ -968,7 +969,7 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset) for (qpdf_offset_t i = obj; i - num < obj; ++i) { if (i == 0) { // This is needed by checkLinearization() - m->first_xref_item_offset = m->file->tell(); + m->xref_table.first_item_offset = m->file->tell(); } // For xref_table, these will always be small enough to be ints qpdf_offset_t f1 = 0; @@ -1000,21 +1001,21 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset) throw damagedPDF("", "expected trailer dictionary"); } - if (!m->trailer) { + if (!m->xref_table.trailer) { setTrailer(cur_trailer); - if (!m->trailer.hasKey("/Size")) { + if (!m->xref_table.trailer.hasKey("/Size")) { QTC::TC("qpdf", "QPDF trailer lacks size"); throw damagedPDF("trailer", "trailer dictionary lacks /Size key"); } - if (!m->trailer.getKey("/Size").isInteger()) { + if (!m->xref_table.trailer.getKey("/Size").isInteger()) { QTC::TC("qpdf", "QPDF trailer size not integer"); throw damagedPDF("trailer", "/Size key in trailer dictionary is not an integer"); } } if (cur_trailer.hasKey("/XRefStm")) { - if (m->ignore_xref_streams) { + if (m->xref_table.ignore_streams) { QTC::TC("qpdf", "QPDF ignoring XRefStm in trailer"); } else { if (cur_trailer.getKey("/XRefStm").isInteger()) { @@ -1043,7 +1044,7 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset) qpdf_offset_t QPDF::read_xrefStream(qpdf_offset_t xref_offset) { - if (!m->ignore_xref_streams) { + if (!m->xref_table.ignore_streams) { QPDFObjGen x_og; QPDFObjectHandle xref_obj; try { @@ -1238,14 +1239,14 @@ QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj) // object record, in which case the generation number appears as the third field. if (saw_first_compressed_object) { if (fields[0] != 2) { - m->uncompressed_after_compressed = true; + m->xref_table.uncompressed_after_compressed = true; } } else if (fields[0] == 2) { saw_first_compressed_object = true; } if (obj == 0) { // This is needed by checkLinearization() - m->first_xref_item_offset = xref_offset; + m->xref_table.first_item_offset = xref_offset; } else if (fields[0] == 0) { // Ignore fields[2], which we don't care about in this case. This works around the // issue of some PDF files that put invalid values, like -1, here for deleted @@ -1258,7 +1259,7 @@ QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj) } } - if (!m->trailer) { + if (!m->xref_table.trailer) { setTrailer(dict); } @@ -1284,12 +1285,12 @@ QPDF::insertXrefEntry(int obj, int f0, qpdf_offset_t f1, int f2) // If there is already an entry for this object and generation in the table, it means that a // later xref table has registered this object. Disregard this one. - if (obj > m->xref_table_max_id) { + if (obj > m->xref_table.max_id) { // ignore impossibly large object ids or object ids > Size. return; } - if (m->deleted_objects.count(obj)) { + if (m->xref_table.deleted_objects.count(obj)) { QTC::TC("qpdf", "QPDF xref deleted object"); return; } @@ -1326,7 +1327,7 @@ void QPDF::insertFreeXrefEntry(QPDFObjGen og) { if (!m->xref_table.count(og)) { - m->deleted_objects.insert(og.getObj()); + m->xref_table.deleted_objects.insert(og.getObj()); } } @@ -1335,13 +1336,13 @@ QPDF::insertFreeXrefEntry(QPDFObjGen og) void QPDF::insertReconstructedXrefEntry(int obj, qpdf_offset_t f1, int f2) { - if (!(obj > 0 && obj <= m->xref_table_max_id && 0 <= f2 && f2 < 65535)) { + if (!(obj > 0 && obj <= m->xref_table.max_id && 0 <= f2 && f2 < 65535)) { QTC::TC("qpdf", "QPDF xref overwrite invalid objgen"); return; } QPDFObjGen og(obj, f2); - if (!m->deleted_objects.count(obj)) { + if (!m->xref_table.deleted_objects.count(obj)) { // deleted_objects stores the uncompressed objects removed from the xref table at the start // of recovery. QTC::TC("qpdf", "QPDF xref overwrite object"); @@ -1381,11 +1382,11 @@ QPDF::showXRefTable() bool QPDF::resolveXRefTable() { - bool may_change = !m->reconstructed_xref; + bool may_change = !m->xref_table.reconstructed; for (auto& iter: m->xref_table) { if (isUnresolved(iter.first)) { resolve(iter.first); - if (may_change && m->reconstructed_xref) { + if (may_change && m->xref_table.reconstructed) { return false; } } @@ -1958,7 +1959,7 @@ QPDF::resolveObjectsInStream(int obj_stream_number) int num = QUtil::string_to_int(tnum.getValue().c_str()); long long offset = QUtil::string_to_int(toffset.getValue().c_str()); - if (num > m->xref_table_max_id) { + if (num > m->xref_table.max_id) { continue; } if (num == obj_stream_number) { @@ -2101,7 +2102,7 @@ QPDF::getObjectForParser(int id, int gen, bool parse_pdf) if (auto iter = m->obj_cache.find(og); iter != m->obj_cache.end()) { return iter->second.object; } - if (m->xref_table.count(og) || !m->parsed) { + if (m->xref_table.count(og) || !m->xref_table.parsed) { return m->obj_cache.insert({og, QPDF_Unresolved::create(this, og)}).first->second.object; } if (parse_pdf) { @@ -2117,8 +2118,9 @@ QPDF::getObjectForJSON(int id, int gen) auto [it, inserted] = m->obj_cache.try_emplace(og); auto& obj = it->second.object; if (inserted) { - obj = (m->parsed && !m->xref_table.count(og)) ? QPDF_Null::create(this, og) - : QPDF_Unresolved::create(this, og); + obj = (m->xref_table.parsed && !m->xref_table.count(og)) + ? QPDF_Null::create(this, og) + : QPDF_Unresolved::create(this, og); } return obj; } @@ -2128,7 +2130,7 @@ QPDF::getObject(QPDFObjGen const& og) { if (auto it = m->obj_cache.find(og); it != m->obj_cache.end()) { return {it->second.object}; - } else if (m->parsed && !m->xref_table.count(og)) { + } else if (m->xref_table.parsed && !m->xref_table.count(og)) { return QPDF_Null::create(); } else { auto result = m->obj_cache.try_emplace(og, QPDF_Unresolved::create(this, og), -1, -1); @@ -2526,13 +2528,13 @@ QPDF::getExtensionLevel() QPDFObjectHandle QPDF::getTrailer() { - return m->trailer; + return m->xref_table.trailer; } QPDFObjectHandle QPDF::getRoot() { - QPDFObjectHandle root = m->trailer.getKey("/Root"); + QPDFObjectHandle root = m->xref_table.trailer.getKey("/Root"); if (!root.isDictionary()) { throw damagedPDF("", 0, "unable to find /Root dictionary"); } else if ( @@ -2554,7 +2556,7 @@ QPDF::getXRefTable() std::map const& QPDF::getXRefTableInternal() { - if (!m->parsed) { + if (!m->xref_table.parsed) { throw std::logic_error("QPDF::getXRefTable called before parsing."); } @@ -2604,14 +2606,14 @@ QPDF::getCompressibleObjGens() // iterating through the xref table since it avoids preserving orphaned items. // Exclude encryption dictionary, if any - QPDFObjectHandle encryption_dict = m->trailer.getKey("/Encrypt"); + QPDFObjectHandle encryption_dict = m->xref_table.trailer.getKey("/Encrypt"); QPDFObjGen encryption_dict_og = encryption_dict.getObjGen(); const size_t max_obj = getObjectCount(); std::vector visited(max_obj, false); std::vector queue; queue.reserve(512); - queue.push_back(m->trailer); + queue.push_back(m->xref_table.trailer); std::vector result; if constexpr (std::is_same_v) { result.reserve(m->obj_cache.size()); diff --git a/libqpdf/QPDF_encryption.cc b/libqpdf/QPDF_encryption.cc index a2e817c4..2ea6e5eb 100644 --- a/libqpdf/QPDF_encryption.cc +++ b/libqpdf/QPDF_encryption.cc @@ -727,7 +727,7 @@ QPDF::initializeEncryption() // at /Encrypt again. Otherwise, things could go wrong if someone mutates the encryption // dictionary. - if (!m->trailer.hasKey("/Encrypt")) { + if (!m->xref_table.trailer.hasKey("/Encrypt")) { return; } @@ -736,7 +736,7 @@ QPDF::initializeEncryption() m->encp->encrypted = true; std::string id1; - QPDFObjectHandle id_obj = m->trailer.getKey("/ID"); + QPDFObjectHandle id_obj = m->xref_table.trailer.getKey("/ID"); if ((id_obj.isArray() && (id_obj.getArrayNItems() == 2) && id_obj.getArrayItem(0).isString())) { id1 = id_obj.getArrayItem(0).getStringValue(); } else { @@ -745,7 +745,7 @@ QPDF::initializeEncryption() warn(damagedPDF("trailer", "invalid /ID in trailer dictionary")); } - QPDFObjectHandle encryption_dict = m->trailer.getKey("/Encrypt"); + QPDFObjectHandle encryption_dict = m->xref_table.trailer.getKey("/Encrypt"); if (!encryption_dict.isDictionary()) { throw damagedPDF("/Encrypt in trailer dictionary is not a dictionary"); } diff --git a/libqpdf/QPDF_json.cc b/libqpdf/QPDF_json.cc index 8cbbcd1b..4390f081 100644 --- a/libqpdf/QPDF_json.cc +++ b/libqpdf/QPDF_json.cc @@ -593,8 +593,8 @@ QPDF::JSONReactor::dictionaryItem(std::string const& key, JSON const& value) this->saw_value = true; // The trailer must be a dictionary, so we can use setNextStateIfDictionary. if (setNextStateIfDictionary("trailer.value", value, st_object)) { - this->pdf.m->trailer = makeObject(value); - setObjectDescription(this->pdf.m->trailer, value); + pdf.m->xref_table.trailer = makeObject(value); + setObjectDescription(this->pdf.m->xref_table.trailer, value); } } else if (key == "stream") { // Don't need to set saw_stream here since there's already an error. diff --git a/libqpdf/QPDF_linearization.cc b/libqpdf/QPDF_linearization.cc index 4b525bb5..08f78579 100644 --- a/libqpdf/QPDF_linearization.cc +++ b/libqpdf/QPDF_linearization.cc @@ -461,12 +461,11 @@ QPDF::checkLinearizationInternal() break; } } - if (m->file->tell() != m->first_xref_item_offset) { + if (m->file->tell() != m->xref_table.first_item_offset) { QTC::TC("qpdf", "QPDF err /T mismatch"); linearizationWarning( - "space before first xref item (/T) mismatch " - "(computed = " + - std::to_string(m->first_xref_item_offset) + + "space before first xref item (/T) mismatch (computed = " + + std::to_string(m->xref_table.first_item_offset) + "; file = " + std::to_string(m->file->tell())); } @@ -477,7 +476,7 @@ QPDF::checkLinearizationInternal() // compressed objects are supposed to be at the end of the containing xref section if any object // streams are in use. - if (m->uncompressed_after_compressed) { + if (m->xref_table.uncompressed_after_compressed) { linearizationWarning("linearized file contains an uncompressed object after a compressed " "one in a cross-reference stream"); } diff --git a/libqpdf/QPDF_optimization.cc b/libqpdf/QPDF_optimization.cc index 03ba3c8a..f2e5e752 100644 --- a/libqpdf/QPDF_optimization.cc +++ b/libqpdf/QPDF_optimization.cc @@ -115,13 +115,13 @@ QPDF::optimize_internal( } // Traverse document-level items - for (auto const& key: m->trailer.getKeys()) { + for (auto const& key: m->xref_table.trailer.getKeys()) { if (key == "/Root") { // handled separately } else { updateObjectMaps( ObjUser(ObjUser::ou_trailer_key, key), - m->trailer.getKey(key), + m->xref_table.trailer.getKey(key), skip_stream_parameters); } } @@ -169,13 +169,13 @@ QPDF::pushInheritedAttributesToPage(bool allow_changes, bool warn_skipped_keys) // values for them. std::map> key_ancestors; pushInheritedAttributesToPageInternal( - m->trailer.getKey("/Root").getKey("/Pages"), + m->xref_table.trailer.getKey("/Root").getKey("/Pages"), key_ancestors, allow_changes, warn_skipped_keys); if (!key_ancestors.empty()) { - throw std::logic_error("key_ancestors not empty after" - " pushing inherited attributes to pages"); + throw std::logic_error( + "key_ancestors not empty after pushing inherited attributes to pages"); } m->pushed_inherited_attributes_to_pages = true; m->ever_pushed_inherited_attributes_to_pages = true; diff --git a/libqpdf/qpdf/QPDF_private.hh b/libqpdf/qpdf/QPDF_private.hh index de9cc5fd..a5613b92 100644 --- a/libqpdf/qpdf/QPDF_private.hh +++ b/libqpdf/qpdf/QPDF_private.hh @@ -3,6 +3,25 @@ #include +// Xref_table encapsulates the pdf's xref table and trailer. +class QPDF::Xref_table: public std::map +{ + public: + QPDFObjectHandle trailer; + bool reconstructed{false}; + // Various tables are indexed by object id, with potential size id + 1 + int max_id{std::numeric_limits::max() - 1}; + qpdf_offset_t max_offset{0}; + std::set deleted_objects; + bool ignore_streams{false}; + bool parsed{false}; + bool attempt_recovery{true}; + + // Linearization data + bool uncompressed_after_compressed{false}; + qpdf_offset_t first_item_offset{0}; // actual value from file +}; + // Writer class is restricted to QPDFWriter so that only it can call certain methods. class QPDF::Writer { @@ -459,21 +478,15 @@ class QPDF::Members std::shared_ptr file; std::string last_object_description; bool provided_password_is_hex_key{false}; - bool ignore_xref_streams{false}; bool suppress_warnings{false}; size_t max_warnings{0}; bool attempt_recovery{true}; bool check_mode{false}; std::shared_ptr encp; std::string pdf_version; - std::map xref_table; - // Various tables are indexed by object id, with potential size id + 1 - int xref_table_max_id{std::numeric_limits::max() - 1}; - qpdf_offset_t xref_table_max_offset{0}; - std::set deleted_objects; + Xref_table xref_table; std::map obj_cache; std::set resolving; - QPDFObjectHandle trailer; std::vector all_pages; bool invalid_page_found{false}; std::map pageobj_to_pages_pos; @@ -485,16 +498,12 @@ class QPDF::Members std::shared_ptr copied_streams; // copied_stream_data_provider is owned by copied_streams CopiedStreamDataProvider* copied_stream_data_provider{nullptr}; - bool reconstructed_xref{false}; bool fixed_dangling_refs{false}; bool immediate_copy_from{false}; bool in_parse{false}; - bool parsed{false}; std::set resolved_object_streams; // Linearization data - qpdf_offset_t first_xref_item_offset{0}; // actual value from file - bool uncompressed_after_compressed{false}; bool linearization_warnings{false}; // Linearization parameter dictionary and hint table data: may be read from file or computed