In QPDFWriter replace map obj_renumber with a new object table obj

This commit is contained in:
m-holger 2024-02-18 01:42:47 +00:00
parent 3b97c9bd26
commit 8791b5f8d0
10 changed files with 424 additions and 112 deletions

View File

@ -765,6 +765,12 @@ class QPDF
{
return qpdf.getCompressibleObjGens();
}
// Forwarder that lets QPDFWriter ask the given QPDF for the size its object tables should have;
// it simply returns qpdf.tableSize().
static size_t
tableSize(QPDF& qpdf)
{
return qpdf.tableSize();
}
};
// The Resolver class is restricted to QPDFObject so that only it can resolve indirect
@ -1083,6 +1089,8 @@ class QPDF
// For QPDFWriter:
size_t tableSize();
// Get lists of all objects in order according to the part of a linearized file that they belong
// to.
void getLinearizedParts(

View File

@ -437,6 +437,10 @@ class QPDFWriter
QPDF_DLL
std::map<QPDFObjGen, QPDFXRefEntry> getWrittenXRefTable();
// The following structs / classes are not part of the public API.
struct Object;
class ObjTable;
private:
// flags used by unparseObject
static int const f_stream = 1 << 0;
@ -550,6 +554,7 @@ class QPDFWriter
void writeLinearized();
void enqueuePart(std::vector<QPDFObjectHandle>& part);
void writeEncryptionDictionary();
void initializeTables(size_t extra = 0);
void doWriteSetup();
void writeHeader();
void writeHintStream(int hint_id);
@ -605,97 +610,9 @@ class QPDFWriter
void computeDeterministicIDData();
void discardGeneration(std::map<QPDFObjGen, int> const& in, std::map<int, int>& out);
void discardGeneration(std::map<int, int>& out);
class Members
{
friend class QPDFWriter;
public:
QPDF_DLL
~Members();
private:
Members(QPDF& pdf);
Members(Members const&) = delete;
QPDF& pdf;
QPDFObjGen root_og{-1, 0};
char const* filename{"unspecified"};
FILE* file{nullptr};
bool close_file{false};
Pl_Buffer* buffer_pipeline{nullptr};
Buffer* output_buffer{nullptr};
bool normalize_content_set{false};
bool normalize_content{false};
bool compress_streams{true};
bool compress_streams_set{false};
qpdf_stream_decode_level_e stream_decode_level{qpdf_dl_none};
bool stream_decode_level_set{false};
bool recompress_flate{false};
bool qdf_mode{false};
bool preserve_unreferenced_objects{false};
bool newline_before_endstream{false};
bool static_id{false};
bool suppress_original_object_ids{false};
bool direct_stream_lengths{true};
bool encrypted{false};
bool preserve_encryption{true};
bool linearized{false};
bool pclm{false};
qpdf_object_stream_e object_stream_mode{qpdf_o_preserve};
std::string encryption_key;
bool encrypt_metadata{true};
bool encrypt_use_aes{false};
std::map<std::string, std::string> encryption_dictionary;
int encryption_V{0};
int encryption_R{0};
std::string id1; // for /ID key of
std::string id2; // trailer dictionary
std::string final_pdf_version;
int final_extension_level{0};
std::string min_pdf_version;
int min_extension_level{0};
std::string forced_pdf_version;
int forced_extension_level{0};
std::string extra_header_text;
int encryption_dict_objid{0};
std::string cur_data_key;
std::list<std::shared_ptr<Pipeline>> to_delete;
Pl_Count* pipeline{nullptr};
std::vector<QPDFObjectHandle> object_queue;
size_t object_queue_front{0};
std::map<QPDFObjGen, int> obj_renumber;
std::map<int, QPDFXRefEntry> xref;
std::map<int, qpdf_offset_t> lengths;
int next_objid{1};
int cur_stream_length_id{0};
size_t cur_stream_length{0};
bool added_newline{false};
int max_ostream_index{0};
std::set<QPDFObjGen> normalized_streams;
std::map<QPDFObjGen, int> page_object_to_seq;
std::map<QPDFObjGen, int> contents_to_page_seq;
std::map<QPDFObjGen, int> object_to_object_stream;
std::map<int, std::set<QPDFObjGen>> object_stream_to_objects;
std::list<Pipeline*> pipeline_stack;
unsigned long long next_stack_id{0};
bool deterministic_id{false};
Pl_MD5* md5_pipeline{nullptr};
std::string deterministic_id_data;
bool did_write_setup{false};
// For linearization only
std::string lin_pass1_filename;
std::map<int, int> obj_renumber_no_gen;
std::map<int, int> object_to_object_stream_no_gen;
// For progress reporting
std::shared_ptr<ProgressReporter> progress_reporter;
int events_expected{0};
int events_seen{0};
int next_progress_report{0};
};
class Members;
// Keep all member variables inside the Members object, which we dynamically allocate. This
// makes it possible to add new private members without breaking binary compatibility.

View File

@ -2377,6 +2377,19 @@ QPDF::getXRefTable()
return m->xref_table;
}
// Return the size an object table indexed by object id should have for this QPDF: one more than
// the largest object id the table has to accommodate.
size_t
QPDF::tableSize()
{
    // Object id of the last entry of the xref table and of the object cache (0 when empty).
    int const last_xref_id = m->xref_table.empty() ? 0 : m->xref_table.crbegin()->first.getObj();
    int const last_cache_id = m->obj_cache.empty() ? 0 : m->obj_cache.crbegin()->first.getObj();

    // If obj_cache is dense, accommodate all objects in tables; else accommodate only the
    // original objects. "Dense" means the largest cached id is less than 10% above the larger of
    // the number of cached objects and the largest xref id.
    int const reference = std::max(toI(m->obj_cache.size()), last_xref_id);
    bool const cache_is_dense = last_cache_id < 1.1 * reference;

    return toS(cache_is_dense ? last_cache_id + 1 : last_xref_id + 1);
}
void
QPDF::getObjectStreamData(std::map<int, int>& omap)
{

View File

@ -2,7 +2,7 @@
#include <qpdf/qpdf-config.h> // include early for large file support
#include <qpdf/QPDFWriter.hh>
#include <qpdf/QPDFWriter_private.hh>
#include <qpdf/MD5.hh>
#include <qpdf/Pl_AES_PDF.hh>
@ -1064,7 +1064,7 @@ QPDFWriter::assignCompressedObjectNumbers(QPDFObjGen const& og)
// Reserve numbers for the objects that belong to this object stream.
for (auto const& iter: m->object_stream_to_objects[objid]) {
m->obj_renumber[iter] = m->next_objid++;
m->obj[iter].renumber = m->next_objid++;
}
}
@ -1093,18 +1093,19 @@ QPDFWriter::enqueueObject(QPDFObjectHandle object)
}
QPDFObjGen og = object.getObjGen();
auto& renumber = m->obj[og].renumber;
if (m->obj_renumber.count(og) == 0) {
if (renumber == 0) {
if (m->object_to_object_stream.count(og)) {
// This is in an object stream. Don't process it here. Instead, enqueue the object
// stream. Object streams always have generation 0.
int stream_id = m->object_to_object_stream[og];
// Detect loops by storing invalid object ID 0, which will get overwritten later.
m->obj_renumber[og] = 0;
renumber = -1;
enqueueObject(m->pdf.getObjectByID(stream_id, 0));
} else {
m->object_queue.push_back(object);
m->obj_renumber[og] = m->next_objid++;
renumber = m->next_objid++;
if ((og.getGen() == 0) && m->object_stream_to_objects.count(og.getObj())) {
// For linearized files, uncompressed objects go at end, and we take care of
@ -1117,7 +1118,7 @@ QPDFWriter::enqueueObject(QPDFObjectHandle object)
++m->next_objid;
}
}
} else if (m->obj_renumber[og] == 0) {
} else if (renumber == -1) {
// This can happen if a specially constructed file indicates that an object stream is
// inside itself.
QTC::TC("qpdf", "QPDFWriter ignore self-referential object stream");
@ -1147,9 +1148,7 @@ QPDFWriter::unparseChild(QPDFObjectHandle child, int level, int flags)
enqueueObject(child);
}
if (child.isIndirect()) {
QPDFObjGen old_og = child.getObjGen();
int new_id = m->obj_renumber[old_og];
writeString(std::to_string(new_id));
writeString(std::to_string(m->obj[child].renumber));
writeString(" 0 R");
} else {
unparseObject(child, level, flags);
@ -1527,9 +1526,8 @@ QPDFWriter::unparseObject(
writeString(">>");
} else if (tc == ::ot_stream) {
// Write stream data to a buffer.
int new_id = m->obj_renumber[old_og];
if (!m->direct_stream_lengths) {
m->cur_stream_length_id = new_id + 1;
m->cur_stream_length_id = m->obj[old_og].renumber + 1;
}
flags |= f_stream;
@ -1626,7 +1624,7 @@ QPDFWriter::writeObjectStream(QPDFObjectHandle object)
QPDFObjGen old_og = object.getObjGen();
qpdf_assert_debug(old_og.getGen() == 0);
int old_id = old_og.getObj();
int new_id = m->obj_renumber[old_og];
int new_stream_id = m->obj[old_og].renumber;
std::vector<qpdf_offset_t> offsets;
qpdf_offset_t first = 0;
@ -1670,7 +1668,7 @@ QPDFWriter::writeObjectStream(QPDFObjectHandle object)
int count = -1;
for (auto const& obj: m->object_stream_to_objects[old_id]) {
++count;
int new_obj = m->obj_renumber[obj];
int new_obj = m->obj[obj].renumber;
if (first_obj == -1) {
first_obj = new_obj;
}
@ -1706,13 +1704,13 @@ QPDFWriter::writeObjectStream(QPDFObjectHandle object)
}
writeObject(obj_to_write, count);
m->xref[new_obj] = QPDFXRefEntry(new_id, count);
m->xref[new_obj] = QPDFXRefEntry(new_stream_id, count);
}
}
// Write the object
openObject(new_id);
setDataKey(new_id);
openObject(new_stream_id);
setDataKey(new_stream_id);
writeString("<<");
writeStringQDF("\n ");
writeString(" /Type /ObjStm");
@ -1754,7 +1752,7 @@ QPDFWriter::writeObjectStream(QPDFObjectHandle object)
}
writeString("endstream");
m->cur_data_key.clear();
closeObject(new_id);
closeObject(new_stream_id);
}
void
@ -1769,7 +1767,7 @@ QPDFWriter::writeObject(QPDFObjectHandle object, int object_stream_index)
}
indicateProgress(false, false);
int new_id = m->obj_renumber[old_og];
auto new_id = m->obj[old_og].renumber;
if (m->qdf_mode) {
if (m->page_object_to_seq.count(old_og)) {
writeString("%% Page ");
@ -1979,6 +1977,10 @@ QPDFWriter::generateObjectStreams()
std::vector<QPDFObjGen> eligible = QPDF::Writer::getCompressibleObjGens(m->pdf);
size_t n_object_streams = (eligible.size() + 99U) / 100U;
// Initialize object table for all existing objects plus some headroom for objects created
// during writing.
initializeTables(2U * n_object_streams);
if (n_object_streams == 0) {
return;
}
@ -2055,6 +2057,13 @@ QPDFWriter::prepareFileForWrite()
}
}
// Size the writer's object table. The table gets the size reported by the QPDF object plus a
// fixed headroom of 100 entries, plus 'extra' additional entries requested by the caller.
void
QPDFWriter::initializeTables(size_t extra)
{
    auto const base_size = QIntC::to_size(QPDF::Writer::tableSize(m->pdf) + 100);
    m->obj.initialize(base_size + extra);
}
void
QPDFWriter::doWriteSetup()
{
@ -2124,10 +2133,13 @@ QPDFWriter::doWriteSetup()
switch (m->object_stream_mode) {
case qpdf_o_disable:
// no action required
// Initialize object table for all existing objects plus some headroom for objects created
// during writing.
initializeTables();
break;
case qpdf_o_preserve:
initializeTables();
preserveObjectStreams();
break;
@ -2215,7 +2227,7 @@ QPDFWriter::write()
QPDFObjGen
QPDFWriter::getRenumberedObjGen(QPDFObjGen og)
{
return QPDFObjGen(m->obj_renumber[og], 0);
return QPDFObjGen(m->obj[og].renumber, 0);
}
std::map<QPDFObjGen, QPDFXRefEntry>
@ -2533,6 +2545,26 @@ QPDFWriter::discardGeneration(std::map<QPDFObjGen, int> const& in, std::map<int,
}
}
// Rebuild 'out' as a map from original object id to renumbered object id, using the writer's
// object table as the source.
void
QPDFWriter::discardGeneration(std::map<int, int>& out)
{
    // The linearization code in QPDF deeply assumes that there is only one object with each
    // object number, i.e. that two objects never share an object number while differing in
    // generation. This is a pretty safe assumption because Adobe Reader and Acrobat can't
    // actually handle that case. The object table is indexed by object id alone, so the map
    // produced here carries no generation numbers.
    out.clear();

    // Copy only entries that have been given a positive new object id; entries whose renumber
    // value is zero or negative are skipped.
    auto collect = [&out](auto old_id, auto const& entry) -> void {
        if (entry.renumber <= 0) {
            return;
        }
        out[old_id] = entry.renumber;
    };
    m->obj.forEach(collect);
}
void
QPDFWriter::writeLinearized()
{
@ -2690,7 +2722,7 @@ QPDFWriter::writeLinearized()
writeString("<<");
if (pass == 2) {
std::vector<QPDFObjectHandle> const& pages = m->pdf.getAllPages();
int first_page_object = m->obj_renumber[pages.at(0).getObjGen()];
int first_page_object = m->obj[pages.at(0)].renumber;
int npages = QIntC::to_int(pages.size());
writeString(" /Linearized 1 /L ");
@ -2855,7 +2887,7 @@ QPDFWriter::writeLinearized()
writeString(std::to_string(first_xref_offset));
writeString("\n%%EOF\n");
discardGeneration(m->obj_renumber, m->obj_renumber_no_gen);
discardGeneration(m->obj_renumber_no_gen);
if (pass == 1) {
if (m->deterministic_id) {

150
libqpdf/qpdf/ObjTable.hh Normal file
View File

@ -0,0 +1,150 @@
#ifndef OBJTABLE_HH
#define OBJTABLE_HH
#include <qpdf/QPDFObjGen.hh>
#include <qpdf/QPDFObjectHandle.hh>
#include "qpdf/QIntC.hh"
#include <limits>
// A table of objects indexed by object id. This is intended as a more efficient replacement for
// std::map<QPDFObjGen, T> containers.
//
// The table is implemented as a std::vector, with the object id implicitly represented by the index
// of the object. This has a number of implications, including:
// - operations that change the index of existing elements such as insertion and deletions are not
// permitted.
// - operations that extend the table may invalidate iterators and references to objects.
//
// The provided overloads of the access operator[] are safe. For out of bounds access they will
// either extend the table or throw a runtime error.
//
// ObjTable has a map 'sparse_elements' to deal with very sparse / extremely large object tables
// (usually as the result of invalid dangling references). This map may contain objects not found in
// the xref table of the original pdf if there are dangling references with an id significantly
// larger than the largest valid object id found in the original pdf.
// Table of T indexed by object id. Ids below the size fixed by initialize() live in the
// underlying std::vector; larger ids (up to, but excluding, the maximum int value) live in the
// 'sparse_elements' map. Read access is public; mutating access, initialization and iteration are
// protected and intended for derived classes (and their friends).
template <class T>
class ObjTable: public std::vector<T>
{
  public:
    ObjTable() = default;
    // The table is neither copyable nor movable.
    ObjTable(const ObjTable&) = delete;
    ObjTable(ObjTable&&) = delete;
    ObjTable& operator=(const ObjTable&) = delete;
    ObjTable& operator=(ObjTable&&) = delete;

    // Remove unchecked access.
    T& operator[](unsigned long idx) = delete;
    T const& operator[](unsigned long idx) const = delete;

    // Checked read access by object id. A negative id converts to a very large index and is
    // rejected with std::runtime_error; an id that is beyond the vector part and has no sparse
    // entry makes the underlying map lookup throw std::out_of_range.
    inline T const&
    operator[](int idx) const
    {
        return element(static_cast<size_t>(idx));
    }

    // Checked read access using the object id of 'og'; the generation is ignored.
    inline T const&
    operator[](QPDFObjGen og) const
    {
        return element(static_cast<size_t>(og.getObj()));
    }

    // Checked read access using the object id of 'oh'.
    inline T const&
    operator[](QPDFObjectHandle oh) const
    {
        return element(static_cast<size_t>(oh.getObjectID()));
    }

    // Return true if 'idx' lies within the vector part of the table or has a sparse entry.
    inline bool
    contains(size_t idx) const
    {
        return idx < std::vector<T>::size() || sparse_elements.count(idx) != 0;
    }

    // Same as contains(size_t), using the object id of 'oh'.
    inline bool
    contains(QPDFObjectHandle oh) const
    {
        return contains(static_cast<size_t>(oh.getObjectID()));
    }

  protected:
    // Checked read/write access by object id. Ids beyond the vector part get a
    // default-constructed sparse entry on first access; ids that are negative or not below the
    // maximum int value are rejected with std::runtime_error.
    inline T&
    operator[](int id)
    {
        return element(static_cast<size_t>(id));
    }

    // Checked read/write access using the object id of 'og'; the generation is ignored.
    inline T&
    operator[](QPDFObjGen og)
    {
        return element(static_cast<size_t>(og.getObj()));
    }

    // Checked read/write access using the object id of 'oh'.
    inline T&
    operator[](QPDFObjectHandle oh)
    {
        return element(static_cast<size_t>(oh.getObjectID()));
    }

    // Checked read/write access by unsigned object id.
    inline T&
    operator[](unsigned int id)
    {
        return element(id);
    }

    // Size the vector part of the table so that ids 0..idx are stored in it. Must be called
    // while the table is still completely empty.
    //
    // Throws std::logic_error if the table already holds elements (vector or sparse), and
    // std::runtime_error if 'idx' is not below both the maximum int value and the vector's
    // max_size().
    void
    initialize(size_t idx)
    {
        if (std::vector<T>::size() > 0 || sparse_elements.size() > 0) {
            throw ::std::logic_error("ObjTable accessed before initialization");
        } else if (
            idx >= static_cast<size_t>(std::numeric_limits<int>::max()) ||
            idx >= std::vector<T>::max_size()) {
            throw std::runtime_error("Invalid maximum object id initializing ObjTable.");
        } else {
            std::vector<T>::resize(++idx);
        }
    }

    // Call fn(id, element) for every element: first the vector part in ascending id order
    // (including untouched default-constructed elements), then the sparse entries in ascending
    // id order.
    inline void
    forEach(std::function<void(int, const T&)> fn) const
    {
        int i = 0;
        for (auto const& item: *this) {
            fn(i++, item);
        }
        for (auto const& [id, item]: sparse_elements) {
            fn(QIntC::to_int(id), item);
        }
    }

  private:
    // Elements whose id lies beyond the vector part of the table.
    std::map<size_t, T> sparse_elements;

    // Mutable lookup: vector part first, then the sparse map (creating the entry if needed).
    inline T&
    element(size_t idx)
    {
        if (idx < std::vector<T>::size()) {
            return std::vector<T>::operator[](idx);
        } else if (idx < static_cast<size_t>(std::numeric_limits<int>::max())) {
            return sparse_elements[idx];
        }
        throw std::runtime_error("Invalid object id accessing ObjTable.");
    }

    // Read-only lookup: vector part first, then the sparse map. A missing sparse entry is
    // reported by std::map::at via std::out_of_range.
    inline T const&
    element(size_t idx) const
    {
        if (idx < std::vector<T>::size()) {
            return std::vector<T>::operator[](idx);
        } else if (idx < static_cast<size_t>(std::numeric_limits<int>::max())) {
            return sparse_elements.at(idx);
        }
        throw std::runtime_error("Invalid object id accessing ObjTable.");
    }
};
#endif // OBJTABLE_HH

View File

@ -0,0 +1,112 @@
#ifndef QPDFWRITER_PRIVATE_HH
#define QPDFWRITER_PRIVATE_HH
#include <qpdf/QPDFWriter.hh>
#include <qpdf/ObjTable.hh>
// This file is intended for inclusion by QPDFWriter, QPDF, QPDF_optimization and QPDF_linearization
// only.
// Per-object data kept by QPDFWriter in its object table.
struct QPDFWriter::Object
{
// Object id assigned to the object in the output file. 0 means no id has been assigned yet; -1
// is stored temporarily by QPDFWriter::enqueueObject for an object that lives in an object stream
// so that self-referential object streams can be detected; positive values are the new id.
int renumber{0};
};
// Object table of QPDFWriter::Object entries indexed by object id. The friend declaration gives
// QPDFWriter access to the protected interface of ::ObjTable (mutable operator[], initialize and
// forEach).
class QPDFWriter::ObjTable: public ::ObjTable<QPDFWriter::Object>
{
friend class QPDFWriter;
};
// Holder for all member variables of QPDFWriter. Only QPDFWriter (a friend) can construct it or
// reach its data members; copying is disabled.
class QPDFWriter::Members
{
friend class QPDFWriter;
public:
QPDF_DLL
~Members();
private:
Members(QPDF& pdf);
Members(Members const&) = delete;
// The QPDF object being written.
QPDF& pdf;
QPDFObjGen root_og{-1, 0};
char const* filename{"unspecified"};
FILE* file{nullptr};
bool close_file{false};
Pl_Buffer* buffer_pipeline{nullptr};
Buffer* output_buffer{nullptr};
bool normalize_content_set{false};
bool normalize_content{false};
bool compress_streams{true};
bool compress_streams_set{false};
qpdf_stream_decode_level_e stream_decode_level{qpdf_dl_none};
bool stream_decode_level_set{false};
bool recompress_flate{false};
bool qdf_mode{false};
bool preserve_unreferenced_objects{false};
bool newline_before_endstream{false};
bool static_id{false};
bool suppress_original_object_ids{false};
bool direct_stream_lengths{true};
bool encrypted{false};
bool preserve_encryption{true};
bool linearized{false};
bool pclm{false};
qpdf_object_stream_e object_stream_mode{qpdf_o_preserve};
std::string encryption_key;
bool encrypt_metadata{true};
bool encrypt_use_aes{false};
std::map<std::string, std::string> encryption_dictionary;
int encryption_V{0};
int encryption_R{0};
std::string id1; // for /ID key of
std::string id2; // trailer dictionary
std::string final_pdf_version;
int final_extension_level{0};
std::string min_pdf_version;
int min_extension_level{0};
std::string forced_pdf_version;
int forced_extension_level{0};
std::string extra_header_text;
int encryption_dict_objid{0};
std::string cur_data_key;
std::list<std::shared_ptr<Pipeline>> to_delete;
Pl_Count* pipeline{nullptr};
std::vector<QPDFObjectHandle> object_queue;
size_t object_queue_front{0};
// Per-object table indexed by original object id; its 'renumber' field holds the object id
// assigned in the output file.
QPDFWriter::ObjTable obj;
std::map<int, QPDFXRefEntry> xref;
std::map<int, qpdf_offset_t> lengths;
// Next object id to hand out when renumbering.
int next_objid{1};
int cur_stream_length_id{0};
size_t cur_stream_length{0};
bool added_newline{false};
int max_ostream_index{0};
std::set<QPDFObjGen> normalized_streams;
std::map<QPDFObjGen, int> page_object_to_seq;
std::map<QPDFObjGen, int> contents_to_page_seq;
std::map<QPDFObjGen, int> object_to_object_stream;
std::map<int, std::set<QPDFObjGen>> object_stream_to_objects;
std::list<Pipeline*> pipeline_stack;
unsigned long long next_stack_id{0};
bool deterministic_id{false};
Pl_MD5* md5_pipeline{nullptr};
std::string deterministic_id_data;
bool did_write_setup{false};
// For linearization only
std::string lin_pass1_filename;
// Filled from 'obj' by QPDFWriter::discardGeneration (original object id -> new object id).
std::map<int, int> obj_renumber_no_gen;
std::map<int, int> object_to_object_stream_no_gen;
// For progress reporting
std::shared_ptr<QPDFWriter::ProgressReporter> progress_reporter;
int events_expected{0};
int events_seen{0};
int next_progress_report{0};
};
#endif // QPDFWRITER_PRIVATE_HH

View File

@ -23,6 +23,7 @@ set(TEST_PROGRAMS
md5
nntree
numrange
obj_table
pdf_version
pl_function
pointer_holder

39
libtests/obj_table.cc Normal file
View File

@ -0,0 +1,39 @@
#include <qpdf/ObjTable.hh>
// Minimal element type stored in the table under test; 'value' starts at 0.
struct Test
{
int value{0};
};
// Test fixture: an ObjTable<Test> whose vector part is initialized for ids 0..5. Deriving from
// ObjTable gives the fixture access to the protected mutating interface.
class Table: public ObjTable<Test>
{
  public:
    Table()
    {
        initialize(5);
    }

    // Fill ids 0..9 and 1000..1009 with twice their id (ids above 5 fall outside the vector
    // part), print every element via forEach, then print an id that was never written.
    void
    test()
    {
        for (int low_id = 0; low_id < 10; ++low_id) {
            int const high_id = 1000 + low_id;
            (*this)[low_id].value = 2 * low_id;
            (*this)[high_id].value = 2 * high_id;
        }

        auto print_entry = [](auto id, auto const& entry) -> void {
            std::cout << std::to_string(id) << " : " << std::to_string(entry.value) << "\n";
        };
        forEach(print_entry);

        // Mutable access to an unknown id creates a default-constructed element.
        std::cout << "2000 : " << std::to_string((*this)[2000].value) << "\n";
    }
};
int
main()
{
Table().test();
std::cout << "object table tests done\n";
return 0;
}

View File

@ -0,0 +1,18 @@
#!/usr/bin/env perl
# Test driver for the obj_table test program: runs the program and compares its output with the
# expected output stored in obj_table.out.
require 5.008;
use warnings;
use strict;
# Work inside the test's own directory, which holds the expected output file.
chdir("obj_table") or die "chdir testdir failed: $!\n";
require TestDriver;
my $td = new TestDriver('object table');
# Run the obj_table command; it must exit with status 0 and its output must match obj_table.out
# after newline normalization.
$td->runtest("obj_table",
{$td->COMMAND => "obj_table"},
{$td->FILE => "obj_table.out",
$td->EXIT_STATUS => 0},
$td->NORMALIZE_NEWLINES);
# Presumably the argument is the number of tests expected to have run -- confirm against
# TestDriver's documentation.
$td->report(1);

View File

@ -0,0 +1,22 @@
0 : 0
1 : 2
2 : 4
3 : 6
4 : 8
5 : 10
6 : 12
7 : 14
8 : 16
9 : 18
1000 : 2000
1001 : 2002
1002 : 2004
1003 : 2006
1004 : 2008
1005 : 2010
1006 : 2012
1007 : 2014
1008 : 2016
1009 : 2018
2000 : 0
object table tests done