From 8791b5f8d0ea9287f458784dc27039a2f55637b7 Mon Sep 17 00:00:00 2001 From: m-holger Date: Sun, 18 Feb 2024 01:42:47 +0000 Subject: [PATCH] In QPDFWriter replace map obj_renumber with a new object table obj --- include/qpdf/QPDF.hh | 8 ++ include/qpdf/QPDFWriter.hh | 97 ++-------------- libqpdf/QPDF.cc | 13 +++ libqpdf/QPDFWriter.cc | 76 +++++++++---- libqpdf/qpdf/ObjTable.hh | 150 +++++++++++++++++++++++++ libqpdf/qpdf/QPDFWriter_private.hh | 112 ++++++++++++++++++ libtests/CMakeLists.txt | 1 + libtests/obj_table.cc | 39 +++++++ libtests/qtest/obj_table.test | 18 +++ libtests/qtest/obj_table/obj_table.out | 22 ++++ 10 files changed, 424 insertions(+), 112 deletions(-) create mode 100644 libqpdf/qpdf/ObjTable.hh create mode 100644 libqpdf/qpdf/QPDFWriter_private.hh create mode 100644 libtests/obj_table.cc create mode 100644 libtests/qtest/obj_table.test create mode 100644 libtests/qtest/obj_table/obj_table.out diff --git a/include/qpdf/QPDF.hh b/include/qpdf/QPDF.hh index 04b11cba..922a6ad7 100644 --- a/include/qpdf/QPDF.hh +++ b/include/qpdf/QPDF.hh @@ -765,6 +765,12 @@ class QPDF { return qpdf.getCompressibleObjGens(); } + + static size_t + tableSize(QPDF& qpdf) + { + return qpdf.tableSize(); + } }; // The Resolver class is restricted to QPDFObject so that only it can resolve indirect @@ -1083,6 +1089,8 @@ class QPDF // For QPDFWriter: + size_t tableSize(); + // Get lists of all objects in order according to the part of a linearized file that they belong // to. void getLinearizedParts( diff --git a/include/qpdf/QPDFWriter.hh b/include/qpdf/QPDFWriter.hh index 95291974..f6f18f77 100644 --- a/include/qpdf/QPDFWriter.hh +++ b/include/qpdf/QPDFWriter.hh @@ -437,6 +437,10 @@ class QPDFWriter QPDF_DLL std::map getWrittenXRefTable(); + // The following structs / classes are not part of the public API. 
+ struct Object; + class ObjTable; + private: // flags used by unparseObject static int const f_stream = 1 << 0; @@ -550,6 +554,7 @@ class QPDFWriter void writeLinearized(); void enqueuePart(std::vector& part); void writeEncryptionDictionary(); + void initializeTables(size_t extra = 0); void doWriteSetup(); void writeHeader(); void writeHintStream(int hint_id); @@ -605,97 +610,9 @@ class QPDFWriter void computeDeterministicIDData(); void discardGeneration(std::map const& in, std::map& out); + void discardGeneration(std::map& out); - class Members - { - friend class QPDFWriter; - - public: - QPDF_DLL - ~Members(); - - private: - Members(QPDF& pdf); - Members(Members const&) = delete; - - QPDF& pdf; - QPDFObjGen root_og{-1, 0}; - char const* filename{"unspecified"}; - FILE* file{nullptr}; - bool close_file{false}; - Pl_Buffer* buffer_pipeline{nullptr}; - Buffer* output_buffer{nullptr}; - bool normalize_content_set{false}; - bool normalize_content{false}; - bool compress_streams{true}; - bool compress_streams_set{false}; - qpdf_stream_decode_level_e stream_decode_level{qpdf_dl_none}; - bool stream_decode_level_set{false}; - bool recompress_flate{false}; - bool qdf_mode{false}; - bool preserve_unreferenced_objects{false}; - bool newline_before_endstream{false}; - bool static_id{false}; - bool suppress_original_object_ids{false}; - bool direct_stream_lengths{true}; - bool encrypted{false}; - bool preserve_encryption{true}; - bool linearized{false}; - bool pclm{false}; - qpdf_object_stream_e object_stream_mode{qpdf_o_preserve}; - std::string encryption_key; - bool encrypt_metadata{true}; - bool encrypt_use_aes{false}; - std::map encryption_dictionary; - int encryption_V{0}; - int encryption_R{0}; - - std::string id1; // for /ID key of - std::string id2; // trailer dictionary - std::string final_pdf_version; - int final_extension_level{0}; - std::string min_pdf_version; - int min_extension_level{0}; - std::string forced_pdf_version; - int forced_extension_level{0}; - 
std::string extra_header_text; - int encryption_dict_objid{0}; - std::string cur_data_key; - std::list> to_delete; - Pl_Count* pipeline{nullptr}; - std::vector object_queue; - size_t object_queue_front{0}; - std::map obj_renumber; - std::map xref; - std::map lengths; - int next_objid{1}; - int cur_stream_length_id{0}; - size_t cur_stream_length{0}; - bool added_newline{false}; - int max_ostream_index{0}; - std::set normalized_streams; - std::map page_object_to_seq; - std::map contents_to_page_seq; - std::map object_to_object_stream; - std::map> object_stream_to_objects; - std::list pipeline_stack; - unsigned long long next_stack_id{0}; - bool deterministic_id{false}; - Pl_MD5* md5_pipeline{nullptr}; - std::string deterministic_id_data; - bool did_write_setup{false}; - - // For linearization only - std::string lin_pass1_filename; - std::map obj_renumber_no_gen; - std::map object_to_object_stream_no_gen; - - // For progress reporting - std::shared_ptr progress_reporter; - int events_expected{0}; - int events_seen{0}; - int next_progress_report{0}; - }; + class Members; // Keep all member variables inside the Members object, which we dynamically allocate. This // makes it possible to add new private members without breaking binary compatibility. diff --git a/libqpdf/QPDF.cc b/libqpdf/QPDF.cc index fdd75359..8ca2501b 100644 --- a/libqpdf/QPDF.cc +++ b/libqpdf/QPDF.cc @@ -2377,6 +2377,19 @@ QPDF::getXRefTable() return m->xref_table; } +size_t +QPDF::tableSize() +{ + // If obj_cache is dense, accommodate all objects in tables, else accommodate only original + // objects. + auto max_xref = m->xref_table.size() ? m->xref_table.crbegin()->first.getObj() : 0; + auto max_obj = m->obj_cache.size() ? 
m->obj_cache.crbegin()->first.getObj() : 0; + if (max_obj < 1.1 * std::max(toI(m->obj_cache.size()), max_xref)) { + return toS(++max_obj); + } + return toS(++max_xref); +} + void QPDF::getObjectStreamData(std::map& omap) { diff --git a/libqpdf/QPDFWriter.cc b/libqpdf/QPDFWriter.cc index 981fc755..de0aa951 100644 --- a/libqpdf/QPDFWriter.cc +++ b/libqpdf/QPDFWriter.cc @@ -2,7 +2,7 @@ #include // include early for large file support -#include +#include #include #include @@ -1064,7 +1064,7 @@ QPDFWriter::assignCompressedObjectNumbers(QPDFObjGen const& og) // Reserve numbers for the objects that belong to this object stream. for (auto const& iter: m->object_stream_to_objects[objid]) { - m->obj_renumber[iter] = m->next_objid++; + m->obj[iter].renumber = m->next_objid++; } } @@ -1093,18 +1093,19 @@ QPDFWriter::enqueueObject(QPDFObjectHandle object) } QPDFObjGen og = object.getObjGen(); + auto& renumber = m->obj[og].renumber; - if (m->obj_renumber.count(og) == 0) { + if (renumber == 0) { if (m->object_to_object_stream.count(og)) { // This is in an object stream. Don't process it here. Instead, enqueue the object // stream. Object streams always have generation 0. int stream_id = m->object_to_object_stream[og]; // Detect loops by storing invalid object ID 0, which will get overwritten later. - m->obj_renumber[og] = 0; + renumber = -1; enqueueObject(m->pdf.getObjectByID(stream_id, 0)); } else { m->object_queue.push_back(object); - m->obj_renumber[og] = m->next_objid++; + renumber = m->next_objid++; if ((og.getGen() == 0) && m->object_stream_to_objects.count(og.getObj())) { // For linearized files, uncompressed objects go at end, and we take care of @@ -1117,7 +1118,7 @@ QPDFWriter::enqueueObject(QPDFObjectHandle object) ++m->next_objid; } } - } else if (m->obj_renumber[og] == 0) { + } else if (renumber == -1) { // This can happen if a specially constructed file indicates that an object stream is // inside itself. 
QTC::TC("qpdf", "QPDFWriter ignore self-referential object stream"); @@ -1147,9 +1148,7 @@ QPDFWriter::unparseChild(QPDFObjectHandle child, int level, int flags) enqueueObject(child); } if (child.isIndirect()) { - QPDFObjGen old_og = child.getObjGen(); - int new_id = m->obj_renumber[old_og]; - writeString(std::to_string(new_id)); + writeString(std::to_string(m->obj[child].renumber)); writeString(" 0 R"); } else { unparseObject(child, level, flags); @@ -1527,9 +1526,8 @@ QPDFWriter::unparseObject( writeString(">>"); } else if (tc == ::ot_stream) { // Write stream data to a buffer. - int new_id = m->obj_renumber[old_og]; if (!m->direct_stream_lengths) { - m->cur_stream_length_id = new_id + 1; + m->cur_stream_length_id = m->obj[old_og].renumber + 1; } flags |= f_stream; @@ -1626,7 +1624,7 @@ QPDFWriter::writeObjectStream(QPDFObjectHandle object) QPDFObjGen old_og = object.getObjGen(); qpdf_assert_debug(old_og.getGen() == 0); int old_id = old_og.getObj(); - int new_id = m->obj_renumber[old_og]; + int new_stream_id = m->obj[old_og].renumber; std::vector offsets; qpdf_offset_t first = 0; @@ -1670,7 +1668,7 @@ QPDFWriter::writeObjectStream(QPDFObjectHandle object) int count = -1; for (auto const& obj: m->object_stream_to_objects[old_id]) { ++count; - int new_obj = m->obj_renumber[obj]; + int new_obj = m->obj[obj].renumber; if (first_obj == -1) { first_obj = new_obj; } @@ -1706,13 +1704,13 @@ QPDFWriter::writeObjectStream(QPDFObjectHandle object) } writeObject(obj_to_write, count); - m->xref[new_obj] = QPDFXRefEntry(new_id, count); + m->xref[new_obj] = QPDFXRefEntry(new_stream_id, count); } } // Write the object - openObject(new_id); - setDataKey(new_id); + openObject(new_stream_id); + setDataKey(new_stream_id); writeString("<<"); writeStringQDF("\n "); writeString(" /Type /ObjStm"); @@ -1754,7 +1752,7 @@ QPDFWriter::writeObjectStream(QPDFObjectHandle object) } writeString("endstream"); m->cur_data_key.clear(); - closeObject(new_id); + closeObject(new_stream_id); } void @@ 
-1769,7 +1767,7 @@ QPDFWriter::writeObject(QPDFObjectHandle object, int object_stream_index) } indicateProgress(false, false); - int new_id = m->obj_renumber[old_og]; + auto new_id = m->obj[old_og].renumber; if (m->qdf_mode) { if (m->page_object_to_seq.count(old_og)) { writeString("%% Page "); @@ -1979,6 +1977,10 @@ QPDFWriter::generateObjectStreams() std::vector eligible = QPDF::Writer::getCompressibleObjGens(m->pdf); size_t n_object_streams = (eligible.size() + 99U) / 100U; + + // Initialize object table for all existing objects plus some headroom for objects created + // during writing. + initializeTables(2U * n_object_streams); if (n_object_streams == 0) { return; } @@ -2055,6 +2057,13 @@ QPDFWriter::prepareFileForWrite() } } +void +QPDFWriter::initializeTables(size_t extra) +{ + auto size = QIntC::to_size(QPDF::Writer::tableSize(m->pdf) + 100) + extra; + m->obj.initialize(size); +} + void QPDFWriter::doWriteSetup() { @@ -2124,10 +2133,13 @@ QPDFWriter::doWriteSetup() switch (m->object_stream_mode) { case qpdf_o_disable: - // no action required + // Initialize object table for all existing objects plus some headroom for objects created + // during writing. + initializeTables(); break; case qpdf_o_preserve: + initializeTables(); preserveObjectStreams(); break; @@ -2215,7 +2227,7 @@ QPDFWriter::write() QPDFObjGen QPDFWriter::getRenumberedObjGen(QPDFObjGen og) { - return QPDFObjGen(m->obj_renumber[og], 0); + return QPDFObjGen(m->obj[og].renumber, 0); } std::map @@ -2533,6 +2545,26 @@ QPDFWriter::discardGeneration(std::map const& in, std::map& out) +{ + // There are deep assumptions in the linearization code in QPDF that there is only one object + // with each object number; i.e., you can't have two objects with the same object number and + // different generations. This is a pretty safe assumption because Adobe Reader and Acrobat + // can't actually handle this case. 
There is not much if any code in QPDF outside linearization + // that assumes this, but the linearization code as currently implemented would do weird things + // if we found such a case. In order to avoid breaking ABI changes in QPDF, we will first + // assert that this condition holds. Then we can create new maps for QPDF that throw away + // generation numbers. + + out.clear(); + m->obj.forEach([&out](auto id, auto const& item) -> void { + if (item.renumber > 0) { + out[id] = item.renumber; + } + }); +} + void QPDFWriter::writeLinearized() { @@ -2690,7 +2722,7 @@ QPDFWriter::writeLinearized() writeString("<<"); if (pass == 2) { std::vector const& pages = m->pdf.getAllPages(); - int first_page_object = m->obj_renumber[pages.at(0).getObjGen()]; + int first_page_object = m->obj[pages.at(0)].renumber; int npages = QIntC::to_int(pages.size()); writeString(" /Linearized 1 /L "); @@ -2855,7 +2887,7 @@ QPDFWriter::writeLinearized() writeString(std::to_string(first_xref_offset)); writeString("\n%%EOF\n"); - discardGeneration(m->obj_renumber, m->obj_renumber_no_gen); + discardGeneration(m->obj_renumber_no_gen); if (pass == 1) { if (m->deterministic_id) { diff --git a/libqpdf/qpdf/ObjTable.hh b/libqpdf/qpdf/ObjTable.hh new file mode 100644 index 00000000..1f0f8a2b --- /dev/null +++ b/libqpdf/qpdf/ObjTable.hh @@ -0,0 +1,150 @@ +#ifndef OBJTABLE_HH +#define OBJTABLE_HH + +#include +#include + +#include "qpdf/QIntC.hh" +#include + +// A table of objects indexed by object id. This is intended as a more efficient replacement for +// std::map containers. +// +// The table is implemented as a std::vector, with the object id implicitly represented by the index +// of the object. This has a number of implications, including: +// - operations that change the index of existing elements such as insertion and deletions are not +// permitted. +// - operations that extend the table may invalidate iterators and references to objects. 
+// +// The provided overloads of the access operator[] are safe. For out of bounds access they will +// either extend the table or throw a runtime error. +// +// ObjTable has a map 'sparse_elements' to deal with very sparse / extremely large object tables +// (usually as the result of invalid dangling references). This map may contain objects not found in +// the xref table of the original pdf if there are dangling references with an id significantly +// larger than the largest valid object id found in original pdf. + +template +class ObjTable: public std::vector +{ + public: + ObjTable() = default; + ObjTable(const ObjTable&) = delete; + ObjTable(ObjTable&&) = delete; + ObjTable& operator[](const ObjTable&) = delete; + ObjTable& operator[](ObjTable&&) = delete; + + // Remove unchecked access. + T& operator[](unsigned long idx) = delete; + T const& operator[](unsigned long idx) const = delete; + + inline T const& + operator[](int idx) const + { + return element(static_cast(idx)); + } + + inline T const& + operator[](QPDFObjGen og) const + { + return element(static_cast(og.getObj())); + } + + inline T const& + operator[](QPDFObjectHandle oh) const + { + return element(static_cast(oh.getObjectID())); + } + + inline bool + contains(size_t idx) const + { + return idx < std::vector::size() || sparse_elements.count(idx); + } + + inline bool + contains(QPDFObjectHandle oh) const + { + return contains(static_cast(oh.getObjectID())); + } + + protected: + inline T& + operator[](int id) + { + return element(static_cast(id)); + } + + inline T& + operator[](QPDFObjGen og) + { + return element(static_cast(og.getObj())); + } + + inline T& + operator[](QPDFObjectHandle oh) + { + return element(static_cast(oh.getObjectID())); + } + + inline T& + operator[](unsigned int id) + { + return element(id); + } + + void + initialize(size_t idx) + { + if (std::vector::size() > 0 || sparse_elements.size() > 0) { + throw ::std::logic_error("ObjTable accessed before initialization"); + } else 
if ( + idx >= static_cast(std::numeric_limits::max()) || + idx >= std::vector::max_size()) { + throw std::runtime_error("Invalid maximum object id initializing ObjTable."); + } else { + std::vector::resize(++idx); + } + } + + inline void + forEach(std::function fn) + { + int i = 0; + for (auto const& item: *this) { + fn(i++, item); + } + for (auto const& [id, item]: sparse_elements) { + fn(QIntC::to_int(id), item); + } + } + + private: + std::map sparse_elements; + + inline T& + element(size_t idx) + { + if (idx < std::vector::size()) { + return std::vector::operator[](idx); + } else if (idx < static_cast(std::numeric_limits::max())) { + return sparse_elements[idx]; + } + throw std::runtime_error("Invalid object id accessing ObjTable."); + return element(0); // doesn't return + } + + inline T const& + element(size_t idx) const + { + if (idx < std::vector::size()) { + return std::vector::operator[](idx); + } else if (idx < static_cast(std::numeric_limits::max())) { + return sparse_elements.at(idx); + } + throw std::runtime_error("Invalid object id accessing ObjTable."); + return element(0); // doesn't return + } +}; + +#endif // OBJTABLE_HH diff --git a/libqpdf/qpdf/QPDFWriter_private.hh b/libqpdf/qpdf/QPDFWriter_private.hh new file mode 100644 index 00000000..3ec15f09 --- /dev/null +++ b/libqpdf/qpdf/QPDFWriter_private.hh @@ -0,0 +1,112 @@ +#ifndef QPDFWRITER_PRIVATE_HH +#define QPDFWRITER_PRIVATE_HH + +#include + +#include + +// This file is intended for inclusion by QPDFWriter, QPDF, QPDF_optimization and QPDF_linearization +// only. 
+ +struct QPDFWriter::Object +{ + int renumber{0}; +}; + +class QPDFWriter::ObjTable: public ::ObjTable +{ + friend class QPDFWriter; +}; + +class QPDFWriter::Members +{ + friend class QPDFWriter; + + public: + QPDF_DLL + ~Members(); + + private: + Members(QPDF& pdf); + Members(Members const&) = delete; + + QPDF& pdf; + QPDFObjGen root_og{-1, 0}; + char const* filename{"unspecified"}; + FILE* file{nullptr}; + bool close_file{false}; + Pl_Buffer* buffer_pipeline{nullptr}; + Buffer* output_buffer{nullptr}; + bool normalize_content_set{false}; + bool normalize_content{false}; + bool compress_streams{true}; + bool compress_streams_set{false}; + qpdf_stream_decode_level_e stream_decode_level{qpdf_dl_none}; + bool stream_decode_level_set{false}; + bool recompress_flate{false}; + bool qdf_mode{false}; + bool preserve_unreferenced_objects{false}; + bool newline_before_endstream{false}; + bool static_id{false}; + bool suppress_original_object_ids{false}; + bool direct_stream_lengths{true}; + bool encrypted{false}; + bool preserve_encryption{true}; + bool linearized{false}; + bool pclm{false}; + qpdf_object_stream_e object_stream_mode{qpdf_o_preserve}; + std::string encryption_key; + bool encrypt_metadata{true}; + bool encrypt_use_aes{false}; + std::map encryption_dictionary; + int encryption_V{0}; + int encryption_R{0}; + + std::string id1; // for /ID key of + std::string id2; // trailer dictionary + std::string final_pdf_version; + int final_extension_level{0}; + std::string min_pdf_version; + int min_extension_level{0}; + std::string forced_pdf_version; + int forced_extension_level{0}; + std::string extra_header_text; + int encryption_dict_objid{0}; + std::string cur_data_key; + std::list> to_delete; + Pl_Count* pipeline{nullptr}; + std::vector object_queue; + size_t object_queue_front{0}; + QPDFWriter::ObjTable obj; + std::map xref; + std::map lengths; + int next_objid{1}; + int cur_stream_length_id{0}; + size_t cur_stream_length{0}; + bool added_newline{false}; + int 
max_ostream_index{0}; + std::set normalized_streams; + std::map page_object_to_seq; + std::map contents_to_page_seq; + std::map object_to_object_stream; + std::map> object_stream_to_objects; + std::list pipeline_stack; + unsigned long long next_stack_id{0}; + bool deterministic_id{false}; + Pl_MD5* md5_pipeline{nullptr}; + std::string deterministic_id_data; + bool did_write_setup{false}; + + // For linearization only + std::string lin_pass1_filename; + std::map obj_renumber_no_gen; + std::map object_to_object_stream_no_gen; + + // For progress reporting + std::shared_ptr progress_reporter; + int events_expected{0}; + int events_seen{0}; + int next_progress_report{0}; +}; + +#endif // QPDFWRITER_PRIVATE_HH diff --git a/libtests/CMakeLists.txt b/libtests/CMakeLists.txt index ea4dc7cd..7d2ecbd6 100644 --- a/libtests/CMakeLists.txt +++ b/libtests/CMakeLists.txt @@ -23,6 +23,7 @@ set(TEST_PROGRAMS md5 nntree numrange + obj_table pdf_version pl_function pointer_holder diff --git a/libtests/obj_table.cc b/libtests/obj_table.cc new file mode 100644 index 00000000..5e83beb6 --- /dev/null +++ b/libtests/obj_table.cc @@ -0,0 +1,39 @@ +#include + +struct Test +{ + int value{0}; +}; + +class Table: public ObjTable +{ + public: + Table() + { + initialize(5); + } + + void + test() + { + for (int i = 0; i < 10; ++i) { + (*this)[i].value = 2 * i; + (*this)[1000 + i].value = 2 * (1000 + i); + } + + forEach([](auto i, auto const& item) -> void { + std::cout << std::to_string(i) << " : " << std::to_string(item.value) << "\n"; + }); + + std::cout << "2000 : " << std::to_string((*this)[2000].value) << "\n"; + } +}; + +int +main() +{ + Table().test(); + + std::cout << "object table tests done\n"; + return 0; +} diff --git a/libtests/qtest/obj_table.test b/libtests/qtest/obj_table.test new file mode 100644 index 00000000..4d07162a --- /dev/null +++ b/libtests/qtest/obj_table.test @@ -0,0 +1,18 @@ +#!/usr/bin/env perl +require 5.008; +use warnings; +use strict; + +chdir("obj_table") or die 
"chdir testdir failed: $!\n"; + +require TestDriver; + +my $td = new TestDriver('object table'); + +$td->runtest("obj_table", + {$td->COMMAND => "obj_table"}, + {$td->FILE => "obj_table.out", + $td->EXIT_STATUS => 0}, + $td->NORMALIZE_NEWLINES); + +$td->report(1); diff --git a/libtests/qtest/obj_table/obj_table.out b/libtests/qtest/obj_table/obj_table.out new file mode 100644 index 00000000..617e3411 --- /dev/null +++ b/libtests/qtest/obj_table/obj_table.out @@ -0,0 +1,22 @@ +0 : 0 +1 : 2 +2 : 4 +3 : 6 +4 : 8 +5 : 10 +6 : 12 +7 : 14 +8 : 16 +9 : 18 +1000 : 2000 +1001 : 2002 +1002 : 2004 +1003 : 2006 +1004 : 2008 +1005 : 2010 +1006 : 2012 +1007 : 2014 +1008 : 2016 +1009 : 2018 +2000 : 0 +object table tests done