2
1
mirror of https://github.com/qpdf/qpdf.git synced 2024-12-22 02:49:00 +00:00

Merge pull request #1272 from m-holger/xref_table

Refactor QPDF xref table
This commit is contained in:
m-holger 2024-09-19 07:58:48 +01:00 committed by GitHub
commit ff2a78f579
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
24 changed files with 1620 additions and 1172 deletions

View File

@ -725,165 +725,15 @@ class QPDF
void removePage(QPDFObjectHandle page);
// End legacy page helpers
// Writer class is restricted to QPDFWriter so that only it can call certain methods.
class Writer
{
friend class QPDFWriter;
// End of the public API. The following classes and methods are for qpdf internal use only.
private:
// Every method below is a private static forwarder: it takes the target QPDF explicitly and
// calls one of that QPDF's own member functions. QPDFWriter is the only friend declared here.

// Forwards to QPDF::optimize(obj, skip_stream_parameters).
static void
optimize(
QPDF& qpdf,
QPDFWriter::ObjTable const& obj,
std::function<int(QPDFObjectHandle&)> skip_stream_parameters)
{
return qpdf.optimize(obj, skip_stream_parameters);
}
// Forwards to QPDF::getLinearizedParts; part4..part9 are passed through by non-const reference.
static void
getLinearizedParts(
QPDF& qpdf,
QPDFWriter::ObjTable const& obj,
std::vector<QPDFObjectHandle>& part4,
std::vector<QPDFObjectHandle>& part6,
std::vector<QPDFObjectHandle>& part7,
std::vector<QPDFObjectHandle>& part8,
std::vector<QPDFObjectHandle>& part9)
{
qpdf.getLinearizedParts(obj, part4, part6, part7, part8, part9);
}
// Forwards to QPDF::generateHintStream; hint_stream, S and O are passed through by non-const
// reference.
static void
generateHintStream(
QPDF& qpdf,
QPDFWriter::NewObjTable const& new_obj,
QPDFWriter::ObjTable const& obj,
std::shared_ptr<Buffer>& hint_stream,
int& S,
int& O,
bool compressed)
{
return qpdf.generateHintStream(new_obj, obj, hint_stream, S, O, compressed);
}
// Note the name difference: this forwards to QPDF::getCompressibleObjVector().
static std::vector<QPDFObjGen>
getCompressibleObjGens(QPDF& qpdf)
{
return qpdf.getCompressibleObjVector();
}
// Forwards to QPDF::getCompressibleObjSet(); the result is a vector<bool>.
static std::vector<bool>
getCompressibleObjSet(QPDF& qpdf)
{
return qpdf.getCompressibleObjSet();
}
// Note the name difference: this forwards to QPDF::getXRefTableInternal() and returns the
// reference it yields.
static std::map<QPDFObjGen, QPDFXRefEntry> const&
getXRefTable(QPDF& qpdf)
{
return qpdf.getXRefTableInternal();
}
// Forwards to QPDF::tableSize().
static size_t
tableSize(QPDF& qpdf)
{
return qpdf.tableSize();
}
};
// The Resolver class is restricted to QPDFObject so that only it can resolve indirect
// references.
class Resolver
{
friend class QPDFObject;
friend class QPDF_Unresolved;
private:
static QPDFObject*
resolved(QPDF* qpdf, QPDFObjGen og)
{
return qpdf->resolve(og);
}
};
// StreamCopier class is restricted to QPDFObjectHandle so it can copy stream data.
class StreamCopier
{
    friend class QPDFObjectHandle;

  private:
    // Pass a stream-data copy request straight through to QPDF::copyStreamData on the given
    // QPDF, with dest and src in the same order.
    static auto
    copyStreamData(QPDF* qpdf, QPDFObjectHandle const& dest, QPDFObjectHandle const& src) -> void
    {
        qpdf->copyStreamData(dest, src);
    }
};
// The ParseGuard class allows QPDFParser to detect re-entrant parsing. It also provides
// special access to allow the parser to create unresolved objects and dangling references.
class ParseGuard
{
    friend class QPDFParser;

  private:
    // Calls qpdf->inParse(true) for the lifetime of the guard. A null qpdf is accepted, in which
    // case neither the constructor nor the destructor touches anything.
    ParseGuard(QPDF* qpdf) :
        qpdf(qpdf)
    {
        if (qpdf) {
            qpdf->inParse(true);
        }
    }

    // The destructor unconditionally pairs with the constructor's inParse(true). A copied guard
    // would issue inParse(false) a second time for the same QPDF, so copying is disallowed.
    ParseGuard(ParseGuard const&) = delete;
    ParseGuard& operator=(ParseGuard const&) = delete;

    // Forwards to QPDF::getObjectForParser(id, gen, parse_pdf). Unlike the constructor, this
    // dereferences qpdf without a null check.
    static std::shared_ptr<QPDFObject>
    getObject(QPDF* qpdf, int id, int gen, bool parse_pdf)
    {
        return qpdf->getObjectForParser(id, gen, parse_pdf);
    }

    // Undo the constructor's inParse(true).
    ~ParseGuard()
    {
        if (qpdf) {
            qpdf->inParse(false);
        }
    }

    QPDF* qpdf;
};
// Pipe class is restricted to QPDF_Stream.
class Pipe
{
    friend class QPDF_Stream;

  private:
    // Relay every argument, unchanged and in order, to QPDF::pipeStreamData on the given QPDF
    // and return whatever it reports.
    static auto
    pipeStreamData(
        QPDF* qpdf,
        QPDFObjGen const& og,
        qpdf_offset_t offset,
        size_t length,
        QPDFObjectHandle dict,
        Pipeline* pipeline,
        bool suppress_warnings,
        bool will_retry) -> bool
    {
        bool const piped = qpdf->pipeStreamData(
            og, offset, length, dict, pipeline, suppress_warnings, will_retry);
        return piped;
    }
};
// JobSetter class is restricted to QPDFJob.
class JobSetter
{
    friend class QPDFJob;

  private:
    // Switch enhanced warnings for pdf file checking on or off by writing the check_mode flag
    // held in the QPDF's Members object.
    static auto
    setCheckMode(QPDF& qpdf, bool val) -> void
    {
        auto& members = *qpdf.m;
        members.check_mode = val;
    }
};
class Writer;
class Resolver;
class StreamCopier;
class ParseGuard;
class Pipe;
class JobSetter;
class Xref_table;
// For testing only -- do not add to DLL
static bool test_json_validators();
@ -898,163 +748,18 @@ class QPDF
static std::string const qpdf_version;
class ObjCache
{
  public:
    // An empty entry: null object pointer, both end offsets zero (see the member initializers).
    ObjCache() = default;

    // An entry for a known object; either end offset may be omitted and then defaults to zero.
    ObjCache(
        std::shared_ptr<QPDFObject> object,
        qpdf_offset_t end_before_space = 0,
        qpdf_offset_t end_after_space = 0) :
        object(object),
        end_before_space(end_before_space),
        end_after_space(end_after_space)
    {
    }

    std::shared_ptr<QPDFObject> object;
    qpdf_offset_t end_before_space{0};
    qpdf_offset_t end_after_space{0};
};
// Plain bundle of working state with no behavior of its own.
// NOTE(review): presumably the per-source-document state used while copying foreign objects
// (QPDF::Members keeps a map of these keyed by an unsigned long long) -- confirm against the
// copy routines, which are not visible here.
class ObjCopier
{
public:
// Handles keyed by object id/generation.
std::map<QPDFObjGen, QPDFObjectHandle> object_map;
// NOTE(review): looks like a pending-work list -- confirm.
std::vector<QPDFObjectHandle> to_copy;
// NOTE(review): looks like the set of objects currently being traversed -- confirm.
QPDFObjGen::set visiting;
};
// Holder for a document's encryption state. Only the constructor is public; every field is
// private and QPDF, the sole friend, reads and writes them directly.
class EncryptionParameters
{
friend class QPDF;
public:
// Defined out of line; initial field values are set there, not here.
EncryptionParameters();
private:
bool encrypted;
bool encryption_initialized;
// NOTE(review): presumably the /V and /R values of the encryption dictionary -- confirm.
int encryption_V;
int encryption_R;
bool encrypt_metadata;
// Encryption method per named crypt filter.
std::map<std::string, encryption_method_e> crypt_filters;
// NOTE(review): presumably the methods applied to streams, strings and embedded files
// respectively -- confirm.
encryption_method_e cf_stream;
encryption_method_e cf_string;
encryption_method_e cf_file;
std::string provided_password;
std::string user_password;
std::string encryption_key;
// A per-object key and the object id/generation it was cached for.
std::string cached_object_encryption_key;
QPDFObjGen cached_key_og;
bool user_password_matched;
bool owner_password_matched;
};
// Record describing a stream that belongs to another document: where its bytes are (input
// source, offset, length), the encryption parameters to use, and a dictionary handle.
// The constructor is public; the fields are private and accessible to QPDF only (friend).
class ForeignStreamData
{
friend class QPDF;
public:
// Defined out of line. Parameters mirror the private fields one for one.
ForeignStreamData(
std::shared_ptr<EncryptionParameters> encp,
std::shared_ptr<InputSource> file,
QPDFObjGen const& foreign_og,
qpdf_offset_t offset,
size_t length,
QPDFObjectHandle local_dict);
private:
// Shared ownership of the encryption parameters and of the input source.
std::shared_ptr<EncryptionParameters> encp;
std::shared_ptr<InputSource> file;
// NOTE(review): presumably the stream's id/generation in the foreign document -- confirm.
QPDFObjGen foreign_og;
qpdf_offset_t offset;
size_t length;
// NOTE(review): presumably the stream dictionary as copied into the local document -- confirm.
QPDFObjectHandle local_dict;
};
// StreamDataProvider implementation that keeps two registries keyed by local object
// id/generation: one of foreign stream handles and one of ForeignStreamData records.
// All member functions except the defaulted destructor are defined out of line.
class CopiedStreamDataProvider: public QPDFObjectHandle::StreamDataProvider
{
public:
// Binds the provider to the destination QPDF by reference; the provider does not own it.
CopiedStreamDataProvider(QPDF& destination_qpdf);
~CopiedStreamDataProvider() override = default;
// StreamDataProvider override.
// NOTE(review): presumably looks og up in the registries below -- confirm in the .cc file.
bool provideStreamData(
QPDFObjGen const& og,
Pipeline* pipeline,
bool suppress_warnings,
bool will_retry) override;
// Two registration overloads: one takes a handle to the foreign stream, the other a shared
// ForeignStreamData record.
void registerForeignStream(QPDFObjGen const& local_og, QPDFObjectHandle foreign_stream);
void registerForeignStream(QPDFObjGen const& local_og, std::shared_ptr<ForeignStreamData>);
private:
QPDF& destination_qpdf;
std::map<QPDFObjGen, QPDFObjectHandle> foreign_streams;
std::map<QPDFObjGen, std::shared_ptr<ForeignStreamData>> foreign_stream_data;
};
// QPDFObjectHandle::StringDecrypter implementation bound to one QPDF and one object
// id/generation. The private fields are accessible to QPDF (friend).
class StringDecrypter: public QPDFObjectHandle::StringDecrypter
{
friend class QPDF;
public:
// Defined out of line. The QPDF pointer is stored, not owned.
StringDecrypter(QPDF* qpdf, QPDFObjGen const& og);
~StringDecrypter() override = default;
// Override of the base-class hook; val is taken by non-const reference, so the result is
// delivered in place. Defined out of line.
void decryptString(std::string& val) override;
private:
QPDF* qpdf;
QPDFObjGen og;
};
class ResolveRecorder
{
  public:
    // Inserts og into qpdf->m->resolving and remembers the iterator insert() returns. If og was
    // already present, that iterator refers to the pre-existing element.
    ResolveRecorder(QPDF* qpdf, QPDFObjGen const& og) :
        qpdf(qpdf),
        iter(qpdf->m->resolving.insert(og).first)
    {
    }

    // The destructor erases through the stored iterator. A copy would hold the same iterator and
    // erase it a second time after it had been invalidated, so copying is disallowed.
    ResolveRecorder(ResolveRecorder const&) = delete;
    ResolveRecorder& operator=(ResolveRecorder const&) = delete;

    // Removes the element the stored iterator refers to from qpdf->m->resolving.
    virtual ~ResolveRecorder()
    {
        this->qpdf->m->resolving.erase(iter);
    }

  private:
    QPDF* qpdf;
    std::set<QPDFObjGen>::const_iterator iter;
};
class ObjCache;
class ObjCopier;
class EncryptionParameters;
class ForeignStreamData;
class CopiedStreamDataProvider;
class StringDecrypter;
class ResolveRecorder;
class JSONReactor;
void parse(char const* password);
void inParse(bool);
void setTrailer(QPDFObjectHandle obj);
void read_xref(qpdf_offset_t offset);
bool resolveXRefTable();
void reconstruct_xref(QPDFExc& e);
bool parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes);
bool read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type);
bool read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type);
qpdf_offset_t read_xrefTable(qpdf_offset_t offset);
qpdf_offset_t read_xrefStream(qpdf_offset_t offset);
qpdf_offset_t processXRefStream(qpdf_offset_t offset, QPDFObjectHandle& xref_stream);
std::pair<int, std::array<int, 3>>
processXRefW(QPDFObjectHandle& dict, std::function<QPDFExc(std::string_view)> damaged);
int processXRefSize(
QPDFObjectHandle& dict, int entry_size, std::function<QPDFExc(std::string_view)> damaged);
std::pair<int, std::vector<std::pair<int, int>>> processXRefIndex(
QPDFObjectHandle& dict,
int max_num_entries,
std::function<QPDFExc(std::string_view)> damaged);
void insertXrefEntry(int obj, int f0, qpdf_offset_t f1, int f2);
void insertFreeXrefEntry(QPDFObjGen);
void insertReconstructedXrefEntry(int obj, qpdf_offset_t f1, int f2);
void setLastObjectDescription(std::string const& description, QPDFObjGen const& og);
QPDFObjectHandle readTrailer();
QPDFObjectHandle readObject(std::string const& description, QPDFObjGen og);
void readStream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset);
void validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset);
@ -1081,11 +786,7 @@ class QPDF
std::shared_ptr<QPDFObject> getObjectForParser(int id, int gen, bool parse_pdf);
std::shared_ptr<QPDFObject> getObjectForJSON(int id, int gen);
void removeObject(QPDFObjGen og);
void updateCache(
QPDFObjGen const& og,
std::shared_ptr<QPDFObject> const& object,
qpdf_offset_t end_before_space,
qpdf_offset_t end_after_space);
void updateCache(QPDFObjGen const& og, std::shared_ptr<QPDFObject> const& object);
static QPDFExc damagedPDF(
InputSource& input,
std::string const& object,
@ -1122,7 +823,6 @@ class QPDF
// For QPDFWriter:
std::map<QPDFObjGen, QPDFXRefEntry> const& getXRefTableInternal();
template <typename T>
void optimize_internal(
T const& object_stream_data,
@ -1131,6 +831,7 @@ class QPDF
void optimize(
QPDFWriter::ObjTable const& obj,
std::function<int(QPDFObjectHandle&)> skip_stream_parameters);
void optimize(Xref_table const& obj);
size_t tableSize();
// Get lists of all objects in order according to the part of a linearized file that they belong
@ -1196,200 +897,19 @@ class QPDF
replaceForeignIndirectObjects(QPDFObjectHandle foreign, ObjCopier& obj_copier, bool top);
void copyStreamData(QPDFObjectHandle dest_stream, QPDFObjectHandle src_stream);
// Linearization Hint table structures.
// Naming conventions:
// HSomething is the Something Hint Table or table header
// HSomethingEntry is an entry in the Something table
// delta_something + min_something = something
// nbits_something = number of bits required for something
// something_offset is the pre-adjusted offset in the file. If >=
// H0_offset, H0_length must be added to get an actual file
// offset.
// PDF 1.4: Table F.4
// One entry of the page offset hint table (HPageOffset::entries holds a vector of these).
// Per the naming convention, each delta_* value is relative to the matching min_* value in the
// table header. The trailing numbers look like item numbers in the cited spec table -- confirm.
struct HPageOffsetEntry
{
int delta_nobjects{0}; // 1
qpdf_offset_t delta_page_length{0}; // 2
// vectors' sizes = nshared_objects
int nshared_objects{0}; // 3
std::vector<int> shared_identifiers; // 4
std::vector<int> shared_numerators; // 5
qpdf_offset_t delta_content_offset{0}; // 6
qpdf_offset_t delta_content_length{0}; // 7
};
// PDF 1.4: Table F.3
// Page offset hint table header plus its entries. min_* fields are the base values that the
// entries' delta_* fields are added to; nbits_* fields give the number of bits used for the
// corresponding entry field. Everything defaults to zero / empty.
struct HPageOffset
{
int min_nobjects{0}; // 1
qpdf_offset_t first_page_offset{0}; // 2
int nbits_delta_nobjects{0}; // 3
int min_page_length{0}; // 4
int nbits_delta_page_length{0}; // 5
int min_content_offset{0}; // 6
int nbits_delta_content_offset{0}; // 7
int min_content_length{0}; // 8
int nbits_delta_content_length{0}; // 9
int nbits_nshared_objects{0}; // 10
int nbits_shared_identifier{0}; // 11
int nbits_shared_numerator{0}; // 12
int shared_denominator{0}; // 13
// vector size is npages
std::vector<HPageOffsetEntry> entries;
};
// PDF 1.4: Table F.6
// One entry of the shared object hint table (HSharedObject::entries holds a vector of these).
// Item 3 of the spec table has no field here, which is why the numbering skips from 2 to 4.
struct HSharedObjectEntry
{
// Item 3 is a 128-bit signature (unsupported by Acrobat)
int delta_group_length{0}; // 1
int signature_present{0}; // 2 -- always 0
int nobjects_minus_one{0}; // 4 -- always 0
};
// PDF 1.4: Table F.5
// Shared object hint table header plus its entries. min_group_length is the base value for the
// entries' delta_group_length; nbits_* fields give field widths in bits.
struct HSharedObject
{
int first_shared_obj{0}; // 1
qpdf_offset_t first_shared_offset{0}; // 2
int nshared_first_page{0}; // 3
int nshared_total{0}; // 4
int nbits_nobjects{0}; // 5
int min_group_length{0}; // 6
int nbits_delta_group_length{0}; // 7
// vector size is nshared_total
std::vector<HSharedObjectEntry> entries;
};
// PDF 1.4: Table F.9
// Generic hint table: a first object number and offset, an object count and a group length.
// QPDF::Members uses this type for both outline_hints and c_outline_data.
struct HGeneric
{
int first_object{0}; // 1
qpdf_offset_t first_object_offset{0}; // 2
int nobjects{0}; // 3
int group_length{0}; // 4
};
// Other linearization data structures
// Initialized from Linearization Parameter dictionary
// Values taken from the linearization parameter dictionary. The trailing comment on each field
// names the dictionary key it corresponds to; the two H_* fields describe the primary hint
// stream. Everything defaults to zero.
struct LinParameters
{
qpdf_offset_t file_size{0}; // /L
int first_page_object{0}; // /O
qpdf_offset_t first_page_end{0}; // /E
int npages{0}; // /N
qpdf_offset_t xref_zero_offset{0}; // /T
int first_page{0}; // /P
qpdf_offset_t H_offset{0}; // offset of primary hint stream
qpdf_offset_t H_length{0}; // length of primary hint stream
};
// Computed hint table value data structures. These tables contain the computed values on which
// the hint table values are based. They exclude things like number of bits and store actual
// values instead of mins and deltas. File offsets are also absolute rather than being offset
// by the size of the primary hint table. We populate the hint table structures from these
// during writing and compare the hint table values with these during validation. We ignore
// some values for various reasons described in the code. Those values are omitted from these
// structures. Note also that object numbers are object numbers from the input file, not the
// output file.
// Naming convention: CHSomething is analogous to HSomething above. "CH" is computed hint.
// Computed counterpart of HPageOffsetEntry: stores actual values rather than deltas and keeps
// only the object count, shared object count and shared identifiers.
struct CHPageOffsetEntry
{
int nobjects{0};
int nshared_objects{0};
// vectors' sizes = nshared_objects
std::vector<int> shared_identifiers;
};
// Computed counterpart of HPageOffset: just the per-page entries, with no header fields.
struct CHPageOffset
{
// vector size is npages
std::vector<CHPageOffsetEntry> entries;
};
// Computed counterpart of HSharedObjectEntry: holds a single object number.
struct CHSharedObjectEntry
{
// Not marked explicit, so an int converts implicitly to an entry.
CHSharedObjectEntry(int object) :
object(object)
{
}
int object;
};
// PDF 1.4: Table F.5
// Computed counterpart of HSharedObject: the three counts/object numbers plus the entries; no
// offsets or bit widths are kept.
struct CHSharedObject
{
int first_shared_obj{0};
int nshared_first_page{0};
int nshared_total{0};
// vector size is nshared_total
std::vector<CHSharedObjectEntry> entries;
};
// No need for CHGeneric -- HGeneric is fine as is.
// Data structures to support optimization -- implemented in QPDF_optimization.cc
// Identifies a "user" of an object for the optimization maps. It is a tag (ou_type) plus either
// a page number or a key string, depending on the tag. It is ordered via operator< so it can
// serve as a std::map key / std::set element. Constructors and operator< are defined out of line.
class ObjUser
{
public:
enum user_e { ou_bad, ou_page, ou_thumb, ou_trailer_key, ou_root_key, ou_root };
// type is set to ou_bad
ObjUser();
// type must be ou_root
ObjUser(user_e type);
// type must be one of ou_page or ou_thumb
ObjUser(user_e type, int pageno);
// type must be one of ou_trailer_key or ou_root_key
ObjUser(user_e type, std::string const& key);
bool operator<(ObjUser const&) const;
user_e ou_type;
int pageno; // if ou_page;
std::string key; // if ou_trailer_key or ou_root_key
};
// Bundle of (object user, object handle, top flag). The constructor is defined out of line.
// NOTE(review): the name suggests a work-stack frame for an updateObjectMaps routine -- confirm.
struct UpdateObjectMapsFrame
{
UpdateObjectMapsFrame(ObjUser const& ou, QPDFObjectHandle oh, bool top);
// Stored as a reference, not a copy: the referenced ObjUser must outlive this frame.
ObjUser const& ou;
QPDFObjectHandle oh;
bool top;
};
class PatternFinder: public InputSource::Finder
{
public:
PatternFinder(QPDF& qpdf, bool (QPDF::*checker)()) :
qpdf(qpdf),
checker(checker)
{
}
~PatternFinder() override = default;
bool
check() override
{
return (this->qpdf.*checker)();
}
private:
QPDF& qpdf;
bool (QPDF::*checker)();
};
struct HPageOffsetEntry;
struct HPageOffset;
struct HSharedObjectEntry;
struct HSharedObject;
struct HGeneric;
struct LinParameters;
struct CHPageOffsetEntry;
struct CHPageOffset;
struct CHSharedObjectEntry;
struct CHSharedObject;
class ObjUser;
struct UpdateObjectMapsFrame;
class PatternFinder;
// Methods to support pattern finding
static bool validatePDFVersion(char const*&, std::string& version);
@ -1411,6 +931,7 @@ class QPDF
QPDFObjectHandle
getUncompressedObject(QPDFObjectHandle&, std::map<int, int> const& object_stream_data);
QPDFObjectHandle getUncompressedObject(QPDFObjectHandle&, QPDFWriter::ObjTable const& obj);
QPDFObjectHandle getUncompressedObject(QPDFObjectHandle&, Xref_table const& obj);
int lengthNextN(int first_object, int n);
void
checkHPageOffset(std::vector<QPDFObjectHandle> const& pages, std::map<int, int>& idx_to_obj);
@ -1456,6 +977,7 @@ class QPDF
std::function<int(QPDFObjectHandle&)> skip_stream_parameters);
void filterCompressedObjects(std::map<int, int> const& object_stream_data);
void filterCompressedObjects(QPDFWriter::ObjTable const& object_stream_data);
void filterCompressedObjects(Xref_table const& object_stream_data);
// JSON import
void importJSON(std::shared_ptr<InputSource>, bool must_be_complete);
@ -1486,90 +1008,7 @@ class QPDF
return QIntC::to_ulonglong(i);
}
// All of QPDF's member variables live here. The constructor is private and copy construction is
// deleted; QPDF and ResolveRecorder are friends and access the fields directly. Only the
// destructor is public.
class Members
{
friend class QPDF;
friend class ResolveRecorder;
public:
QPDF_DLL
~Members() = default;
private:
Members();
Members(Members const&) = delete;
// General state and user-settable options.
std::shared_ptr<QPDFLogger> log;
unsigned long long unique_id{0};
QPDFTokenizer tokenizer;
std::shared_ptr<InputSource> file;
std::string last_object_description;
bool provided_password_is_hex_key{false};
bool ignore_xref_streams{false};
bool suppress_warnings{false};
size_t max_warnings{0};
bool attempt_recovery{true};
// Written by JobSetter::setCheckMode.
bool check_mode{false};
std::shared_ptr<EncryptionParameters> encp;
std::string pdf_version;
// Cross-reference table and object cache.
std::map<QPDFObjGen, QPDFXRefEntry> xref_table;
// Various tables are indexed by object id, with potential size id + 1
int xref_table_max_id{std::numeric_limits<int>::max() - 1};
qpdf_offset_t xref_table_max_offset{0};
std::set<int> deleted_objects;
std::map<QPDFObjGen, ObjCache> obj_cache;
// ResolveRecorder inserts into this set on construction and erases on destruction.
std::set<QPDFObjGen> resolving;
QPDFObjectHandle trailer;
// Page bookkeeping.
std::vector<QPDFObjectHandle> all_pages;
bool invalid_page_found{false};
std::map<QPDFObjGen, int> pageobj_to_pages_pos;
bool pushed_inherited_attributes_to_pages{false};
bool ever_pushed_inherited_attributes_to_pages{false};
bool ever_called_get_all_pages{false};
std::vector<QPDFExc> warnings;
// Object copiers, keyed by an unsigned long long.
// NOTE(review): presumably the source document's unique_id -- confirm.
std::map<unsigned long long, ObjCopier> object_copiers;
std::shared_ptr<QPDFObjectHandle::StreamDataProvider> copied_streams;
// copied_stream_data_provider is owned by copied_streams
CopiedStreamDataProvider* copied_stream_data_provider{nullptr};
bool reconstructed_xref{false};
bool fixed_dangling_refs{false};
bool immediate_copy_from{false};
bool in_parse{false};
bool parsed{false};
std::set<int> resolved_object_streams;
// Linearization data
qpdf_offset_t first_xref_item_offset{0}; // actual value from file
bool uncompressed_after_compressed{false};
bool linearization_warnings{false};
// Linearization parameter dictionary and hint table data: may be read from file or computed
// prior to writing a linearized file
QPDFObjectHandle lindict;
LinParameters linp;
HPageOffset page_offset_hints;
HSharedObject shared_object_hints;
HGeneric outline_hints;
// Computed linearization data: used to populate above tables during writing and to compare
// with them during validation. c_ means computed.
LinParameters c_linp;
CHPageOffset c_page_offset_data;
CHSharedObject c_shared_object_data;
HGeneric c_outline_data;
// Object ordering data for linearized files: initialized by calculateLinearizationData().
// Part numbers refer to the PDF 1.4 specification.
std::vector<QPDFObjectHandle> part4;
std::vector<QPDFObjectHandle> part6;
std::vector<QPDFObjectHandle> part7;
std::vector<QPDFObjectHandle> part8;
std::vector<QPDFObjectHandle> part9;
// Optimization data
// The two maps below are each other's inverse in shape: user -> objects and object -> users.
std::map<ObjUser, std::set<QPDFObjGen>> obj_user_to_objects;
std::map<QPDFObjGen, std::set<ObjUser>> object_to_obj_users;
};
class Members;
// Keep all member variables inside the Members object, which we dynamically allocate. This
// makes it possible to add new private members without breaking binary compatibility.

File diff suppressed because it is too large Load Diff

View File

@ -13,7 +13,6 @@
#include <qpdf/Pl_StdioFile.hh>
#include <qpdf/Pl_String.hh>
#include <qpdf/QIntC.hh>
#include <qpdf/QPDF.hh>
#include <qpdf/QPDFAcroFormDocumentHelper.hh>
#include <qpdf/QPDFCryptoProvider.hh>
#include <qpdf/QPDFEmbeddedFileDocumentHelper.hh>
@ -26,6 +25,7 @@
#include <qpdf/QPDFSystemError.hh>
#include <qpdf/QPDFUsage.hh>
#include <qpdf/QPDFWriter.hh>
#include <qpdf/QPDF_private.hh>
#include <qpdf/QTC.hh>
#include <qpdf/QUtil.hh>

View File

@ -14,10 +14,10 @@
#include <qpdf/Pl_RC4.hh>
#include <qpdf/Pl_StdioFile.hh>
#include <qpdf/QIntC.hh>
#include <qpdf/QPDF.hh>
#include <qpdf/QPDFObjectHandle.hh>
#include <qpdf/QPDF_Name.hh>
#include <qpdf/QPDF_String.hh>
#include <qpdf/QPDF_private.hh>
#include <qpdf/QTC.hh>
#include <qpdf/QUtil.hh>
#include <qpdf/RC4.hh>
@ -1698,7 +1698,6 @@ QPDFWriter::writeObjectStream(QPDFObjectHandle object)
if (obj_to_write.isStream()) {
// This condition occurred in a fuzz input. Ideally we should block it at parse
// time, but it's not clear to me how to construct a case for this.
QTC::TC("qpdf", "QPDFWriter stream in ostream");
obj_to_write.warnIfPossible("stream found inside object stream; treating as null");
obj_to_write = QPDFObjectHandle::newNull();
}
@ -1937,47 +1936,26 @@ void
QPDFWriter::preserveObjectStreams()
{
auto const& xref = QPDF::Writer::getXRefTable(m->pdf);
// Our object_to_object_stream map has to map ObjGen -> ObjGen since we may be generating object
// streams out of old objects that have generation numbers greater than zero. However in an
// existing PDF, all object stream objects and all objects in them must have generation 0
// because the PDF spec does not provide any way to do otherwise. This code filters out objects
// that are not allowed to be in object streams. In addition to removing objects that were
// erroneously included in object streams in the source PDF, it also prevents unreferenced
// objects from being included.
auto end = xref.cend();
m->obj.streams_empty = true;
m->obj.streams_empty = !xref.object_streams();
if (m->obj.streams_empty) {
return;
}
// This code filters out objects that are not allowed to be in object streams. In addition to
// removing objects that were erroneously included in object streams in the source PDF, it also
// prevents unreferenced objects from being included.
if (m->preserve_unreferenced_objects) {
for (auto iter = xref.cbegin(); iter != end; ++iter) {
if (iter->second.getType() == 2) {
// Pdf contains object streams.
QTC::TC("qpdf", "QPDFWriter preserve object streams preserve unreferenced");
m->obj.streams_empty = false;
m->obj[iter->first].object_stream = iter->second.getObjStreamNumber();
}
QTC::TC("qpdf", "QPDFWriter preserve object streams preserve unreferenced");
for (auto [id, stream]: xref.compressed_objects()) {
m->obj[id].object_stream = stream;
}
} else {
// Start by scanning for first compressed object in case we don't have any object streams to
// process.
for (auto iter = xref.cbegin(); iter != end; ++iter) {
if (iter->second.getType() == 2) {
// Pdf contains object streams.
QTC::TC("qpdf", "QPDFWriter preserve object streams");
m->obj.streams_empty = false;
auto eligible = QPDF::Writer::getCompressibleObjSet(m->pdf);
// The object pointed to by iter may be a previous generation, in which case it is
// removed by getCompressibleObjSet. We need to restart the loop (while the object
// table may contain multiple generations of an object).
for (iter = xref.cbegin(); iter != end; ++iter) {
if (iter->second.getType() == 2) {
auto id = static_cast<size_t>(iter->first.getObj());
if (id < eligible.size() && eligible[id]) {
m->obj[iter->first].object_stream = iter->second.getObjStreamNumber();
} else {
QTC::TC("qpdf", "QPDFWriter exclude from object stream");
}
}
}
return;
QTC::TC("qpdf", "QPDFWriter preserve object streams");
auto eligible = QPDF::Writer::getCompressibleObjSet(m->pdf);
for (auto [id, stream]: xref.compressed_objects()) {
if (eligible[id]) {
m->obj[id].object_stream = stream;
} else {
QTC::TC("qpdf", "QPDFWriter exclude from object stream");
}
}
}

View File

@ -10,8 +10,8 @@
#include <qpdf/Pl_Flate.hh>
#include <qpdf/Pl_QPDFTokenizer.hh>
#include <qpdf/QIntC.hh>
#include <qpdf/QPDF.hh>
#include <qpdf/QPDFExc.hh>
#include <qpdf/QPDF_private.hh>
#include <qpdf/QTC.hh>
#include <qpdf/QUtil.hh>
#include <qpdf/SF_ASCII85Decode.hh>

View File

@ -3,7 +3,7 @@
#include <qpdf/assert_debug.h>
#include <qpdf/QPDF.hh>
#include <qpdf/QPDF_private.hh>
#include <qpdf/QPDFExc.hh>
@ -727,7 +727,7 @@ QPDF::initializeEncryption()
// at /Encrypt again. Otherwise, things could go wrong if someone mutates the encryption
// dictionary.
if (!m->trailer.hasKey("/Encrypt")) {
if (!m->xref_table.trailer().hasKey("/Encrypt")) {
return;
}
@ -736,7 +736,7 @@ QPDF::initializeEncryption()
m->encp->encrypted = true;
std::string id1;
QPDFObjectHandle id_obj = m->trailer.getKey("/ID");
QPDFObjectHandle id_obj = m->xref_table.trailer().getKey("/ID");
if ((id_obj.isArray() && (id_obj.getArrayNItems() == 2) && id_obj.getArrayItem(0).isString())) {
id1 = id_obj.getArrayItem(0).getStringValue();
} else {
@ -745,7 +745,7 @@ QPDF::initializeEncryption()
warn(damagedPDF("trailer", "invalid /ID in trailer dictionary"));
}
QPDFObjectHandle encryption_dict = m->trailer.getKey("/Encrypt");
QPDFObjectHandle encryption_dict = m->xref_table.trailer().getKey("/Encrypt");
if (!encryption_dict.isDictionary()) {
throw damagedPDF("/Encrypt in trailer dictionary is not a dictionary");
}

View File

@ -51,17 +51,6 @@
// ] | <- st_top
// } |
// A minimal PDF held as a string literal: a 1.3 header, an xref section with a single free
// entry for object 0, a trailer giving /Size 1, and a startxref of 9. The header line
// "%PDF-1.3\n" is 9 bytes, so offset 9 is where "xref" begins.
static char const* JSON_PDF = (
// force line break
"%PDF-1.3\n"
"xref\n"
"0 1\n"
"0000000000 65535 f \n"
"trailer << /Size 1 >>\n"
"startxref\n"
"9\n"
"%%EOF\n");
// Validator methods -- these are much more performant than std::regex.
static bool
is_indirect_object(std::string const& v, int& obj, int& gen)
@ -267,10 +256,10 @@ class QPDF::JSONReactor: public JSON::Reactor
struct StackFrame
{
StackFrame(state_e state) :
state(state) {};
state(state){};
StackFrame(state_e state, QPDFObjectHandle&& object) :
state(state),
object(object) {};
object(object){};
state_e state;
QPDFObjectHandle object;
};
@ -593,8 +582,7 @@ QPDF::JSONReactor::dictionaryItem(std::string const& key, JSON const& value)
this->saw_value = true;
// The trailer must be a dictionary, so we can use setNextStateIfDictionary.
if (setNextStateIfDictionary("trailer.value", value, st_object)) {
this->pdf.m->trailer = makeObject(value);
setObjectDescription(this->pdf.m->trailer, value);
pdf.m->xref_table.trailer(makeObject(value));
}
} else if (key == "stream") {
// Don't need to set saw_stream here since there's already an error.
@ -786,7 +774,9 @@ QPDF::createFromJSON(std::string const& json_file)
void
QPDF::createFromJSON(std::shared_ptr<InputSource> is)
{
processMemoryFile(is->getName().c_str(), JSON_PDF, strlen(JSON_PDF));
m->pdf_version = "1.3";
m->no_input_name = is->getName();
m->xref_table.initialize_json();
importJSON(is, true);
}

View File

@ -1,6 +1,6 @@
// See doc/linearization.
#include <qpdf/QPDF.hh>
#include <qpdf/QPDF_private.hh>
#include <qpdf/BitStream.hh>
#include <qpdf/BitWriter.hh>
@ -288,9 +288,8 @@ QPDF::readHintStream(Pipeline& pl, qpdf_offset_t offset, size_t length)
QPDFObjGen og;
QPDFObjectHandle H =
readObjectAtOffset(false, offset, "linearization hint stream", QPDFObjGen(0, 0), og, false);
ObjCache& oc = m->obj_cache[og];
qpdf_offset_t min_end_offset = oc.end_before_space;
qpdf_offset_t max_end_offset = oc.end_after_space;
qpdf_offset_t min_end_offset = m->xref_table.end_before_space(og);
qpdf_offset_t max_end_offset = m->xref_table.end_after_space(og);
if (!H.isStream()) {
throw damagedPDF("linearization dictionary", "hint table is not a stream");
}
@ -301,14 +300,11 @@ QPDF::readHintStream(Pipeline& pl, qpdf_offset_t offset, size_t length)
// increasing length to cover it, even though the specification says all objects in the
// linearization parameter dictionary must be direct. We have to get the file position of the
// end of length in this case.
QPDFObjectHandle length_obj = Hdict.getKey("/Length");
if (length_obj.isIndirect()) {
auto length_og = Hdict.getKey("/Length").getObjGen();
if (length_og.isIndirect()) {
QTC::TC("qpdf", "QPDF hint table length indirect");
// Force resolution
(void)length_obj.getIntValue();
ObjCache& oc2 = m->obj_cache[length_obj.getObjGen()];
min_end_offset = oc2.end_before_space;
max_end_offset = oc2.end_after_space;
min_end_offset = m->xref_table.end_before_space(length_og);
max_end_offset = m->xref_table.end_after_space(length_og);
} else {
QTC::TC("qpdf", "QPDF hint table length direct");
}
@ -445,7 +441,7 @@ QPDF::checkLinearizationInternal()
for (size_t i = 0; i < toS(npages); ++i) {
QPDFObjectHandle const& page = pages.at(i);
QPDFObjGen og(page.getObjGen());
if (m->xref_table[og].getType() == 2) {
if (m->xref_table.type(og) == 2) {
linearizationWarning(
"page dictionary for page " + std::to_string(i) + " is compressed");
}
@ -461,12 +457,11 @@ QPDF::checkLinearizationInternal()
break;
}
}
if (m->file->tell() != m->first_xref_item_offset) {
if (m->file->tell() != m->xref_table.first_item_offset()) {
QTC::TC("qpdf", "QPDF err /T mismatch");
linearizationWarning(
"space before first xref item (/T) mismatch "
"(computed = " +
std::to_string(m->first_xref_item_offset) +
"space before first xref item (/T) mismatch (computed = " +
std::to_string(m->xref_table.first_item_offset()) +
"; file = " + std::to_string(m->file->tell()));
}
@ -477,7 +472,7 @@ QPDF::checkLinearizationInternal()
// compressed objects are supposed to be at the end of the containing xref section if any object
// streams are in use.
if (m->uncompressed_after_compressed) {
if (m->xref_table.uncompressed_after_compressed()) {
linearizationWarning("linearized file contains an uncompressed object after a compressed "
"one in a cross-reference stream");
}
@ -485,18 +480,9 @@ QPDF::checkLinearizationInternal()
// Further checking requires optimization and order calculation. Don't allow optimization to
// make changes. If it has to, then the file is not properly linearized. We use the xref table
// to figure out which objects are compressed and which are uncompressed.
{ // local scope
std::map<int, int> object_stream_data;
for (auto const& iter: m->xref_table) {
QPDFObjGen const& og = iter.first;
QPDFXRefEntry const& entry = iter.second;
if (entry.getType() == 2) {
object_stream_data[og.getObj()] = entry.getObjStreamNumber();
}
}
optimize(object_stream_data, false);
calculateLinearizationData(object_stream_data);
}
optimize(m->xref_table);
calculateLinearizationData(m->xref_table);
// E: offset of end of first page -- Implementation note 123 says Acrobat includes one extra
// object here by mistake. pdlin fails to place thumbnail images in section 9, so when
@ -513,13 +499,14 @@ QPDF::checkLinearizationInternal()
qpdf_offset_t max_E = -1;
for (auto const& oh: m->part6) {
QPDFObjGen og(oh.getObjGen());
if (m->obj_cache.count(og) == 0) {
auto before = m->xref_table.end_before_space(og);
auto after = m->xref_table.end_after_space(og);
if (before <= 0) {
// All objects have to have been dereferenced to be classified.
throw std::logic_error("linearization part6 object not in cache");
}
ObjCache const& oc = m->obj_cache[og];
min_E = std::max(min_E, oc.end_before_space);
max_E = std::max(max_E, oc.end_after_space);
min_E = std::max(min_E, before);
max_E = std::max(max_E, after);
}
if ((p.first_page_end < min_E) || (p.first_page_end > max_E)) {
QTC::TC("qpdf", "QPDF warn /E mismatch");
@ -546,10 +533,11 @@ QPDF::maxEnd(ObjUser const& ou)
}
qpdf_offset_t end = 0;
for (auto const& og: m->obj_user_to_objects[ou]) {
if (m->obj_cache.count(og) == 0) {
auto e = m->xref_table.end_after_space(og);
if (e <= 0) {
stopOnError("unknown object referenced in object user table");
}
end = std::max(end, m->obj_cache[og].end_after_space);
end = std::max(end, e);
}
return end;
}
@ -557,23 +545,18 @@ QPDF::maxEnd(ObjUser const& ou)
qpdf_offset_t
QPDF::getLinearizationOffset(QPDFObjGen const& og)
{
QPDFXRefEntry entry = m->xref_table[og];
qpdf_offset_t result = 0;
switch (entry.getType()) {
switch (m->xref_table.type(og)) {
case 1:
result = entry.getOffset();
break;
return m->xref_table.offset(og);
case 2:
// For compressed objects, return the offset of the object stream that contains them.
result = getLinearizationOffset(QPDFObjGen(entry.getObjStreamNumber(), 0));
break;
return getLinearizationOffset(QPDFObjGen(m->xref_table.stream_number(og.getObj()), 0));
default:
stopOnError("getLinearizationOffset called for xref entry not of type 1 or 2");
break;
return 0; // unreachable
}
return result;
}
QPDFObjectHandle
@ -587,6 +570,16 @@ QPDF::getUncompressedObject(QPDFObjectHandle& obj, std::map<int, int> const& obj
}
}
// Map an object to the thing that is physically stored uncompressed in the file: if the xref
// table lists obj as type 2, return the object (generation 0) whose number the table gives as its
// stream number; otherwise, or if obj is null, return obj itself.
QPDFObjectHandle
QPDF::getUncompressedObject(QPDFObjectHandle& obj, Xref_table const& xref)
{
    auto const objgen = obj.getObjGen();
    // The null test comes first so the xref lookup is skipped for null objects.
    bool const in_object_stream = !obj.isNull() && xref.type(objgen) == 2;
    if (in_object_stream) {
        return getObject(xref.stream_number(objgen.getObj()), 0);
    }
    return obj;
}
QPDFObjectHandle
QPDF::getUncompressedObject(QPDFObjectHandle& oh, QPDFWriter::ObjTable const& obj)
{
@ -604,15 +597,13 @@ QPDF::lengthNextN(int first_object, int n)
int length = 0;
for (int i = 0; i < n; ++i) {
QPDFObjGen og(first_object + i, 0);
if (m->xref_table.count(og) == 0) {
auto end = m->xref_table.end_after_space(og);
if (end <= 0) {
linearizationWarning(
"no xref table entry for " + std::to_string(first_object + i) + " 0");
} else {
if (m->obj_cache.count(og) == 0) {
stopOnError("found unknown object while calculating length for linearization data");
}
length += toI(m->obj_cache[og].end_after_space - getLinearizationOffset(og));
continue;
}
length += toI(end - getLinearizationOffset(og));
}
return length;
}
@ -636,7 +627,7 @@ QPDF::checkHPageOffset(
int npages = toI(pages.size());
qpdf_offset_t table_offset = adjusted_offset(m->page_offset_hints.first_page_offset);
QPDFObjGen first_page_og(pages.at(0).getObjGen());
if (m->xref_table.count(first_page_og) == 0) {
if (m->xref_table.type(first_page_og) == 0) {
stopOnError("supposed first page object is not known");
}
qpdf_offset_t offset = getLinearizationOffset(first_page_og);
@ -647,7 +638,7 @@ QPDF::checkHPageOffset(
for (int pageno = 0; pageno < npages; ++pageno) {
QPDFObjGen page_og(pages.at(toS(pageno)).getObjGen());
int first_object = page_og.getObj();
if (m->xref_table.count(page_og) == 0) {
if (m->xref_table.type(page_og) == 0) {
stopOnError("unknown object in page offset hint table");
}
offset = getLinearizationOffset(page_og);
@ -769,7 +760,7 @@ QPDF::checkHSharedObject(std::vector<QPDFObjectHandle> const& pages, std::map<in
cur_object = so.first_shared_obj;
QPDFObjGen og(cur_object, 0);
if (m->xref_table.count(og) == 0) {
if (m->xref_table.type(og) == 0) {
stopOnError("unknown object in shared object hint table");
}
qpdf_offset_t offset = getLinearizationOffset(og);
@ -820,7 +811,7 @@ QPDF::checkHOutlines()
return;
}
QPDFObjGen og(outlines.getObjGen());
if (m->xref_table.count(og) == 0) {
if (m->xref_table.type(og) == 0) {
stopOnError("unknown object in outlines hint table");
}
qpdf_offset_t offset = getLinearizationOffset(og);
@ -839,8 +830,7 @@ QPDF::checkHOutlines()
std::to_string(table_length) + "; computed = " + std::to_string(length));
}
} else {
linearizationWarning("incorrect first object number in outline "
"hints table.");
linearizationWarning("incorrect first object number in outline hints table.");
}
} else {
linearizationWarning("incorrect object count in outline hint table");

View File

@ -2,7 +2,7 @@
#include <qpdf/assert_debug.h>
#include <qpdf/QPDF.hh>
#include <qpdf/QPDF_private.hh>
#include <qpdf/QPDFExc.hh>
#include <qpdf/QPDFWriter_private.hh>
@ -78,6 +78,12 @@ QPDF::optimize(
optimize_internal(obj, true, skip_stream_parameters);
}
// Overload of optimize() taking an Xref_table. It forwards to optimize_internal, passing false
// as the second argument and no skip_stream_parameters callback (nullptr).
void
QPDF::optimize(QPDF::Xref_table const& xref)
{
optimize_internal(xref, false, nullptr);
}
template <typename T>
void
QPDF::optimize_internal(
@ -115,13 +121,13 @@ QPDF::optimize_internal(
}
// Traverse document-level items
for (auto const& key: m->trailer.getKeys()) {
for (auto const& key: m->xref_table.trailer().getKeys()) {
if (key == "/Root") {
// handled separately
} else {
updateObjectMaps(
ObjUser(ObjUser::ou_trailer_key, key),
m->trailer.getKey(key),
m->xref_table.trailer().getKey(key),
skip_stream_parameters);
}
}
@ -169,13 +175,13 @@ QPDF::pushInheritedAttributesToPage(bool allow_changes, bool warn_skipped_keys)
// values for them.
std::map<std::string, std::vector<QPDFObjectHandle>> key_ancestors;
pushInheritedAttributesToPageInternal(
m->trailer.getKey("/Root").getKey("/Pages"),
m->xref_table.trailer().getKey("/Root").getKey("/Pages"),
key_ancestors,
allow_changes,
warn_skipped_keys);
if (!key_ancestors.empty()) {
throw std::logic_error("key_ancestors not empty after"
" pushing inherited attributes to pages");
throw std::logic_error(
"key_ancestors not empty after pushing inherited attributes to pages");
}
m->pushed_inherited_attributes_to_pages = true;
m->ever_pushed_inherited_attributes_to_pages = true;
@ -442,3 +448,45 @@ QPDF::filterCompressedObjects(QPDFWriter::ObjTable const& obj)
m->obj_user_to_objects = t_obj_user_to_objects;
m->object_to_obj_users = t_object_to_obj_users;
}
// Rewrites obj_user_to_objects and object_to_obj_users so that they refer only to uncompressed
// objects. If something is a user of a compressed object, then it is really a user of the object
// stream that contains it. Does nothing if the xref table reports no object streams.
void
QPDF::filterCompressedObjects(QPDF::Xref_table const& xref)
{
    if (!xref.object_streams()) {
        return;
    }

    // Maps an object to the object stream that contains it (generation 0), or to itself when
    // the xref table reports no containing stream (stream_number() == 0).
    auto container = [&xref](QPDFObjGen const& og) {
        auto stream = xref.stream_number(og.getObj());
        return stream ? QPDFObjGen(stream, 0) : og;
    };

    std::map<ObjUser, std::set<QPDFObjGen>> t_obj_user_to_objects;
    std::map<QPDFObjGen, std::set<ObjUser>> t_object_to_obj_users;

    for (auto const& [ou, objects]: m->obj_user_to_objects) {
        for (auto const& og: objects) {
            t_obj_user_to_objects[ou].insert(container(og));
        }
    }

    for (auto const& [og, users]: m->object_to_obj_users) {
        if (users.empty()) {
            // Do not create an entry for an object without users.
            continue;
        }
        // The containing stream depends only on og, so look it up once per object rather than
        // once per user.
        t_object_to_obj_users[container(og)].insert(users.begin(), users.end());
    }

    // Swap rather than copy-assign: the temporaries are discarded on return, so there is no need
    // to deep-copy both maps.
    m->obj_user_to_objects.swap(t_obj_user_to_objects);
    m->object_to_obj_users.swap(t_object_to_obj_users);
}

View File

@ -1,4 +1,4 @@
#include <qpdf/QPDF.hh>
#include <qpdf/QPDF_private.hh>
#include <qpdf/QPDFExc.hh>
#include <qpdf/QTC.hh>

View File

@ -45,6 +45,12 @@ class ObjTable: public std::vector<T>
return element(static_cast<size_t>(idx));
}
// Read-only subscript for an unsigned index; forwards directly to element(idx).
inline T const&
operator[](unsigned int idx) const
{
return element(idx);
}
inline T const&
operator[](QPDFObjGen og) const
{

View File

@ -6,14 +6,13 @@
#include <qpdf/Constants.h>
#include <qpdf/JSON.hh>
#include <qpdf/QPDF.hh>
#include <qpdf/QPDFValue.hh>
#include <qpdf/QPDF_private.hh>
#include <qpdf/Types.h>
#include <string>
#include <string_view>
class QPDF;
class QPDFObjectHandle;
class QPDFObject

View File

@ -0,0 +1,901 @@
#ifndef QPDF_PRIVATE_HH
#define QPDF_PRIVATE_HH
#include <qpdf/QPDF.hh>
#include <variant>
// Xref_table encapsulates the pdf's xref table and trailer.
//
// Entries are stored in a vector indexed by object id. The "type" of an entry is the index of
// the alternative held by its variant: 0 = no entry, 1 = uncompressed (file offset), 2 =
// compressed (object stream number and index within the stream).
class QPDF::Xref_table
{
  public:
    // Binds the table to its owning QPDF and to a reference to the owner's input source pointer.
    // The tokenizer used for parsing is configured to allow EOF.
    Xref_table(QPDF& qpdf, InputSource* const& file) :
        qpdf(qpdf),
        file(file)
    {
        tokenizer.allowEOF();
    }

    void initialize();
    void initialize_empty();
    void initialize_json();
    void reconstruct(QPDFExc& e);
    void show();
    bool resolve();

    // Returns a copy of the trailer object handle.
    QPDFObjectHandle
    trailer() const
    {
        return trailer_;
    }

    // Replaces the trailer, taking ownership of oh.
    void
    trailer(QPDFObjectHandle&& oh)
    {
        trailer_ = std::move(oh);
    }

    // Returns the entry type for og. Returns 0 if og is not in table, i.e. if the id is out of
    // range or the stored generation differs from og's generation.
    size_t
    type(QPDFObjGen og) const
    {
        int id = og.getObj();
        if (id < 1 || static_cast<size_t>(id) >= table.size()) {
            return 0;
        }
        auto& e = table[static_cast<size_t>(id)];
        return e.gen() == og.getGen() ? e.type() : 0;
    }

    // Returns the entry type for object id, ignoring the generation. Returns 0 if id is not in
    // table.
    size_t
    type(size_t id) const noexcept
    {
        if (id >= table.size()) {
            return 0;
        }
        return table[id].type();
    }

    // Returns the file offset of an uncompressed object. Returns 0 if og's id is not in table or
    // the entry is not of type 1. Note that only the object id is used; the generation of og is
    // not compared with the stored generation.
    qpdf_offset_t
    offset(QPDFObjGen og) const noexcept
    {
        int id = og.getObj();
        if (id < 1 || static_cast<size_t>(id) >= table.size()) {
            return 0;
        }
        return table[static_cast<size_t>(id)].offset();
    }

    // Returns the number of the object stream containing object id. Returns 0 if id is not in
    // table or the entry is not of type 2.
    int
    stream_number(int id) const noexcept
    {
        if (id < 1 || static_cast<size_t>(id) >= table.size()) {
            return 0;
        }
        return table[static_cast<size_t>(id)].stream_number();
    }

    // Returns the index of object id within its object stream. Returns 0 if id is not in table
    // or the entry is not of type 2.
    int
    stream_index(int id) const noexcept
    {
        if (id < 1 || static_cast<size_t>(id) >= table.size()) {
            return 0;
        }
        return table[static_cast<size_t>(id)].stream_index();
    }

    QPDFObjGen at_offset(qpdf_offset_t offset) const noexcept;

    std::map<QPDFObjGen, QPDFXRefEntry> as_map() const;

    bool
    object_streams() const noexcept
    {
        return object_streams_;
    }

    // Return a vector of object id and stream number for each compressed object. Throws
    // std::logic_error if the table has not been initialized.
    std::vector<std::pair<unsigned int, int>>
    compressed_objects() const
    {
        if (!initialized()) {
            throw std::logic_error("Xref_table::compressed_objects called before parsing.");
        }

        std::vector<std::pair<unsigned int, int>> result;
        result.reserve(table.size());

        unsigned int i{0};
        for (auto const& item: table) {
            if (item.type() == 2) {
                result.emplace_back(i, item.stream_number());
            }
            ++i;
        }
        return result;
    }

    // Temporary access to underlying table size
    size_t
    size() const noexcept
    {
        return table.size();
    }

    void
    ignore_streams(bool val) noexcept
    {
        ignore_streams_ = val;
    }

    bool
    initialized() const noexcept
    {
        return initialized_;
    }

    void
    attempt_recovery(bool val) noexcept
    {
        attempt_recovery_ = val;
    }

    int
    max_id() const noexcept
    {
        return max_id_;
    }

    // For Linearization

    // Returns the recorded end_after_space_ value for og. For a compressed object the value of
    // the containing object stream is returned, provided that stream's entry is of type 1.
    // Returns 0 otherwise. Only the object id of og is used.
    qpdf_offset_t
    end_after_space(QPDFObjGen og)
    {
        auto& e = entry(toS(og.getObj()));
        switch (e.type()) {
        case 1:
            return e.end_after_space_;
        case 2:
            {
                // Bind by reference: only one field is read, so copying the Entry is wasteful.
                auto const& es = entry(toS(e.stream_number()));
                return es.type() == 1 ? es.end_after_space_ : 0;
            }
        default:
            return 0;
        }
    }

    // Returns the recorded end_before_space_ value for og. For a compressed object the value of
    // the containing object stream is returned, provided that stream's entry is of type 1.
    // Returns 0 otherwise. Only the object id of og is used.
    qpdf_offset_t
    end_before_space(QPDFObjGen og)
    {
        auto& e = entry(toS(og.getObj()));
        switch (e.type()) {
        case 1:
            return e.end_before_space_;
        case 2:
            {
                // Bind by reference: only one field is read, so copying the Entry is wasteful.
                auto const& es = entry(toS(e.stream_number()));
                return es.type() == 1 ? es.end_before_space_ : 0;
            }
        default:
            return 0;
        }
    }

    // Records the end offsets (before and after trailing space) for object id. Ignored if id has
    // no entry in the table (type 0).
    void
    linearization_offsets(size_t id, qpdf_offset_t before, qpdf_offset_t after)
    {
        if (type(id)) {
            table[id].end_before_space_ = before;
            table[id].end_after_space_ = after;
        }
    }

    bool
    uncompressed_after_compressed() const noexcept
    {
        return uncompressed_after_compressed_;
    }

    // Actual value from file
    qpdf_offset_t
    first_item_offset() const noexcept
    {
        return first_item_offset_;
    }

  private:
    // Object, count, offset of first entry
    typedef std::tuple<int, int, qpdf_offset_t> Subsection;

    // Payload of a type 1 entry: the object's offset.
    struct Uncompressed
    {
        Uncompressed(qpdf_offset_t offset) :
            offset(offset)
        {
        }
        qpdf_offset_t offset;
    };

    // Payload of a type 2 entry: containing object stream and index within it.
    struct Compressed
    {
        Compressed(int stream_number, int stream_index) :
            stream_number(stream_number),
            stream_index(stream_index)
        {
        }
        int stream_number{0};
        int stream_index{0};
    };

    // The order of alternatives defines the entry type numbers 0, 1 and 2 returned by type().
    typedef std::variant<std::monostate, Uncompressed, Compressed> Xref;

    struct Entry
    {
        Entry() = default;

        Entry(int gen, Xref entry) :
            gen_(gen),
            entry(entry)
        {
        }

        int
        gen() const noexcept
        {
            return gen_;
        }

        // Index of the alternative held by the variant (0, 1 or 2).
        size_t
        type() const noexcept
        {
            return entry.index();
        }

        // Offset for type 1 entries, otherwise 0.
        qpdf_offset_t
        offset() const noexcept
        {
            return type() == 1 ? std::get<1>(entry).offset : 0;
        }

        // Object stream number for type 2 entries, otherwise 0.
        int
        stream_number() const noexcept
        {
            return type() == 2 ? std::get<2>(entry).stream_number : 0;
        }

        // Index within the object stream for type 2 entries, otherwise 0.
        int
        stream_index() const noexcept
        {
            return type() == 2 ? std::get<2>(entry).stream_index : 0;
        }

        int gen_{0};
        Xref entry;
        qpdf_offset_t end_before_space_{0};
        qpdf_offset_t end_after_space_{0};
    };

    // Returns the entry for id, falling back to table[0] if id is out of range.
    // NOTE(review): the fallback assumes the table is non-empty; presumably this is only called
    // after the table has been populated -- confirm against callers.
    Entry&
    entry(size_t id)
    {
        return id < table.size() ? table[id] : table[0];
    }

    void read(qpdf_offset_t offset);

    // Methods to parse tables
    qpdf_offset_t process_section(qpdf_offset_t offset);
    std::vector<Subsection> subsections(std::string& line);
    std::vector<Subsection> bad_subsections(std::string& line, qpdf_offset_t offset);
    Subsection subsection(std::string const& line);
    bool read_entry(qpdf_offset_t& f1, int& f2, char& type);
    bool read_bad_entry(qpdf_offset_t& f1, int& f2, char& type);

    // Methods to parse streams
    qpdf_offset_t read_stream(qpdf_offset_t offset);
    qpdf_offset_t process_stream(qpdf_offset_t offset, QPDFObjectHandle& xref_stream);
    std::pair<int, std::array<int, 3>>
    process_W(QPDFObjectHandle& dict, std::function<QPDFExc(std::string_view)> damaged);
    std::pair<int, size_t> process_Size(
        QPDFObjectHandle& dict, int entry_size, std::function<QPDFExc(std::string_view)> damaged);
    std::pair<int, std::vector<std::pair<int, int>>> process_Index(
        QPDFObjectHandle& dict,
        int max_num_entries,
        std::function<QPDFExc(std::string_view)> damaged);

    QPDFObjectHandle read_trailer();

    // Reads the next token from the input source using this table's tokenizer.
    QPDFTokenizer::Token
    read_token(size_t max_len = 0)
    {
        return tokenizer.readToken(*file, "", true, max_len);
    }

    // Methods to insert table entries
    void insert(int obj, int f0, qpdf_offset_t f1, int f2);
    void insert_free(QPDFObjGen);

    // Builds a damaged-PDF exception with an empty object description and offset 0.
    QPDFExc
    damaged_pdf(std::string const& msg)
    {
        return qpdf.damagedPDF("", 0, msg);
    }

    // Builds a damaged-PDF exception attributed to the "xref table".
    QPDFExc
    damaged_table(std::string const& msg)
    {
        return qpdf.damagedPDF("xref table", msg);
    }

    // Issues msg as a warning through the owning QPDF.
    void
    warn_damaged(std::string const& msg)
    {
        qpdf.warn(damaged_pdf(msg));
    }

    QPDF& qpdf;
    // Reference to a pointer held elsewhere (passed to the constructor); presumably so that
    // changes to the owner's input source are seen here -- confirm against Members.
    InputSource* const& file;
    QPDFTokenizer tokenizer;

    std::vector<Entry> table;
    QPDFObjectHandle trailer_;

    bool attempt_recovery_{true};
    bool initialized_{false};
    bool ignore_streams_{false};
    bool reconstructed_{false};
    bool object_streams_{false};
    // Before the xref table is initialized, max_id_ is an upper bound on the possible object ids
    // that could be present in the PDF file. Once the trailer has been read, max_id_ is set to the
    // value of /Size. If the file is damaged, max_id_ becomes the maximum object id in the xref
    // table after reconstruction.
    int max_id_{std::numeric_limits<int>::max() - 1};

    // Linearization data
    bool uncompressed_after_compressed_{false};
    qpdf_offset_t first_item_offset_{0}; // actual value from file
};
// The Resolver class is restricted to QPDFObject so that only it can resolve indirect
// references.
// All members are private; access is granted through the friend declarations below.
class QPDF::Resolver
{
friend class QPDFObject;
friend class QPDF_Unresolved;
private:
// Forwards to QPDF::resolve(og) on the given QPDF and returns its result.
static QPDFObject*
resolved(QPDF* qpdf, QPDFObjGen og)
{
return qpdf->resolve(og);
}
};
// StreamCopier class is restricted to QPDFObjectHandle so it can copy stream data.
// All members are private; access is granted through the friend declaration below.
class QPDF::StreamCopier
{
friend class QPDFObjectHandle;
private:
// Forwards to QPDF::copyStreamData(dest, src) on the given QPDF.
static void
copyStreamData(QPDF* qpdf, QPDFObjectHandle const& dest, QPDFObjectHandle const& src)
{
qpdf->copyStreamData(dest, src);
}
};
// The ParseGuard class allows QPDFParser to detect re-entrant parsing. It also provides
// special access to allow the parser to create unresolved objects and dangling references.
//
// A guard calls inParse(true) on construction and inParse(false) on destruction when it was
// given a non-null QPDF. All members are private; only QPDFParser (a friend) can use it.
class QPDF::ParseGuard
{
    friend class QPDFParser;

  private:
    // Marks the start of parsing on qpdf; qpdf may be null, in which case the guard does nothing.
    ParseGuard(QPDF* qpdf) :
        qpdf(qpdf)
    {
        if (qpdf) {
            qpdf->inParse(true);
        }
    }

    // Not copyable: a copy would call inParse(false) a second time for a single inParse(true).
    ParseGuard(ParseGuard const&) = delete;
    ParseGuard& operator=(ParseGuard const&) = delete;

    // Forwards to QPDF::getObjectForParser(id, gen, parse_pdf) on the given QPDF.
    static std::shared_ptr<QPDFObject>
    getObject(QPDF* qpdf, int id, int gen, bool parse_pdf)
    {
        return qpdf->getObjectForParser(id, gen, parse_pdf);
    }

    // Marks the end of parsing on the guarded QPDF, if any.
    ~ParseGuard()
    {
        if (qpdf) {
            qpdf->inParse(false);
        }
    }

    QPDF* qpdf;
};
// Pipe class is restricted to QPDF_Stream.
// All members are private; access is granted through the friend declaration below.
class QPDF::Pipe
{
friend class QPDF_Stream;
private:
// Forwards all arguments after qpdf, in the same order, to QPDF::pipeStreamData on the given
// QPDF and returns its result.
static bool
pipeStreamData(
QPDF* qpdf,
QPDFObjGen const& og,
qpdf_offset_t offset,
size_t length,
QPDFObjectHandle dict,
Pipeline* pipeline,
bool suppress_warnings,
bool will_retry)
{
return qpdf->pipeStreamData(
og, offset, length, dict, pipeline, suppress_warnings, will_retry);
}
};
// Value type of QPDF::Members::obj_cache: holds a shared pointer to a QPDFObject.
class QPDF::ObjCache
{
public:
// Creates an entry with a null object pointer.
ObjCache() = default;
// Creates an entry holding the given object.
ObjCache(std::shared_ptr<QPDFObject> object) :
object(object)
{
}
std::shared_ptr<QPDFObject> object;
};
// State used while copying objects; instances are stored in QPDF::Members::object_copiers,
// keyed by an unsigned long long (presumably the unique id of the other QPDF -- TODO confirm).
class QPDF::ObjCopier
{
public:
// Maps an object id/generation to an object handle.
std::map<QPDFObjGen, QPDFObjectHandle> object_map;
// Object handles that have been queued for copying.
std::vector<QPDFObjectHandle> to_copy;
// Set of object ids/generations; presumably those currently being traversed -- TODO confirm.
QPDFObjGen::set visiting;
};
// Encryption-related state of a QPDF. All data members are private and accessed by QPDF (a
// friend). The members have no in-class initializers; presumably they are initialized by the
// constructor, which is defined elsewhere -- TODO confirm.
class QPDF::EncryptionParameters
{
friend class QPDF;
public:
EncryptionParameters();
private:
bool encrypted;
bool encryption_initialized;
int encryption_V;
int encryption_R;
bool encrypt_metadata;
// Encryption method per crypt filter name.
std::map<std::string, encryption_method_e> crypt_filters;
// Encryption methods; judging by the names, for streams, strings and files respectively.
encryption_method_e cf_stream;
encryption_method_e cf_string;
encryption_method_e cf_file;
std::string provided_password;
std::string user_password;
std::string encryption_key;
// A cached per-object key together with the object it was computed for (by name; confirm
// against the code that fills them in).
std::string cached_object_encryption_key;
QPDFObjGen cached_key_og;
bool user_password_matched;
bool owner_password_matched;
};
// Bundle of data describing a stream that belongs to another ("foreign") QPDF: encryption
// parameters, input source, object id/generation, offset, length and a local dictionary. The
// data members are private and accessed by QPDF (a friend). The constructor is defined
// elsewhere; presumably it stores its arguments in the members of the same name.
class QPDF::ForeignStreamData
{
friend class QPDF;
public:
ForeignStreamData(
std::shared_ptr<EncryptionParameters> encp,
std::shared_ptr<InputSource> file,
QPDFObjGen const& foreign_og,
qpdf_offset_t offset,
size_t length,
QPDFObjectHandle local_dict);
private:
std::shared_ptr<EncryptionParameters> encp;
std::shared_ptr<InputSource> file;
QPDFObjGen foreign_og;
qpdf_offset_t offset;
size_t length;
QPDFObjectHandle local_dict;
};
// Stream data provider for streams registered from a foreign source. A foreign stream can be
// registered for a local object id/generation either as an object handle or as a
// ForeignStreamData. Member functions are defined elsewhere.
class QPDF::CopiedStreamDataProvider: public QPDFObjectHandle::StreamDataProvider
{
public:
CopiedStreamDataProvider(QPDF& destination_qpdf);
~CopiedStreamDataProvider() override = default;
// Overrides QPDFObjectHandle::StreamDataProvider::provideStreamData.
bool provideStreamData(
QPDFObjGen const& og, Pipeline* pipeline, bool suppress_warnings, bool will_retry) override;
void registerForeignStream(QPDFObjGen const& local_og, QPDFObjectHandle foreign_stream);
void registerForeignStream(QPDFObjGen const& local_og, std::shared_ptr<ForeignStreamData>);
private:
QPDF& destination_qpdf;
// One map per registration form; presumably keyed by the local object id/generation.
std::map<QPDFObjGen, QPDFObjectHandle> foreign_streams;
std::map<QPDFObjGen, std::shared_ptr<ForeignStreamData>> foreign_stream_data;
};
// Implementation of QPDFObjectHandle::StringDecrypter bound to a QPDF and an object
// id/generation. Member functions are defined elsewhere.
class QPDF::StringDecrypter: public QPDFObjectHandle::StringDecrypter
{
friend class QPDF;
public:
StringDecrypter(QPDF* qpdf, QPDFObjGen const& og);
~StringDecrypter() override = default;
// Overrides QPDFObjectHandle::StringDecrypter::decryptString; val is modified in place.
void decryptString(std::string& val) override;
private:
QPDF* qpdf;
QPDFObjGen og;
};
// PDF 1.4: Table F.4
// Per-page entry of the page offset hint table (stored in HPageOffset::entries). The trailing
// number on each field is presumably the item number in the referenced table.
struct QPDF::HPageOffsetEntry
{
int delta_nobjects{0}; // 1
qpdf_offset_t delta_page_length{0}; // 2
// vectors' sizes = nshared_objects
int nshared_objects{0}; // 3
std::vector<int> shared_identifiers; // 4
std::vector<int> shared_numerators; // 5
qpdf_offset_t delta_content_offset{0}; // 6
qpdf_offset_t delta_content_length{0}; // 7
};
// PDF 1.4: Table F.3
// Header of the page offset hint table followed by its per-page entries. The trailing number
// on each field is presumably the item number in the referenced table.
struct QPDF::HPageOffset
{
int min_nobjects{0}; // 1
qpdf_offset_t first_page_offset{0}; // 2
int nbits_delta_nobjects{0}; // 3
int min_page_length{0}; // 4
int nbits_delta_page_length{0}; // 5
int min_content_offset{0}; // 6
int nbits_delta_content_offset{0}; // 7
int min_content_length{0}; // 8
int nbits_delta_content_length{0}; // 9
int nbits_nshared_objects{0}; // 10
int nbits_shared_identifier{0}; // 11
int nbits_shared_numerator{0}; // 12
int shared_denominator{0}; // 13
// vector size is npages
std::vector<HPageOffsetEntry> entries;
};
// PDF 1.4: Table F.6
// Per-group entry of the shared object hint table (stored in HSharedObject::entries).
struct QPDF::HSharedObjectEntry
{
// Item 3 is a 128-bit signature (unsupported by Acrobat)
int delta_group_length{0}; // 1
int signature_present{0}; // 2 -- always 0
int nobjects_minus_one{0}; // 4 -- always 0
};
// PDF 1.4: Table F.5
// Header of the shared object hint table followed by its entries. The trailing number on each
// field is presumably the item number in the referenced table.
struct QPDF::HSharedObject
{
int first_shared_obj{0}; // 1
qpdf_offset_t first_shared_offset{0}; // 2
int nshared_first_page{0}; // 3
int nshared_total{0}; // 4
int nbits_nobjects{0}; // 5
int min_group_length{0}; // 6
int nbits_delta_group_length{0}; // 7
// vector size is nshared_total
std::vector<HSharedObjectEntry> entries;
};
// PDF 1.4: Table F.9
// Generic hint table; used in QPDF::Members for both the outline hints read from the file and
// the computed outline data.
struct QPDF::HGeneric
{
int first_object{0}; // 1
qpdf_offset_t first_object_offset{0}; // 2
int nobjects{0}; // 3
int group_length{0}; // 4
};
// Other linearization data structures
// Initialized from Linearization Parameter dictionary
// The trailing comment on each field names the dictionary key it corresponds to; the last two
// fields describe the primary hint stream.
struct QPDF::LinParameters
{
qpdf_offset_t file_size{0}; // /L
int first_page_object{0}; // /O
qpdf_offset_t first_page_end{0}; // /E
int npages{0}; // /N
qpdf_offset_t xref_zero_offset{0}; // /T
int first_page{0}; // /P
qpdf_offset_t H_offset{0}; // offset of primary hint stream
qpdf_offset_t H_length{0}; // length of primary hint stream
};
// Computed hint table value data structures. These tables contain the computed values on which
// the hint table values are based. They exclude things like number of bits and store actual
// values instead of mins and deltas. File offsets are also absolute rather than being offset
// by the size of the primary hint table. We populate the hint table structures from these
// during writing and compare the hint table values with these during validation. We ignore
// some values for various reasons described in the code. Those values are omitted from these
// structures. Note also that object numbers are object numbers from the input file, not the
// output file.
// Naming convention: CHSomething is analogous to HSomething above. "CH" is computed hint.
// Computed analog of HPageOffsetEntry: holds actual values rather than deltas.
struct QPDF::CHPageOffsetEntry
{
int nobjects{0};
int nshared_objects{0};
// vectors' sizes = nshared_objects
std::vector<int> shared_identifiers;
};
// Computed analog of HPageOffset: only the per-page entries are kept.
struct QPDF::CHPageOffset
{
// vector size is npages
std::vector<CHPageOffsetEntry> entries;
};
// Computed analog of HSharedObjectEntry: records a single object number.
struct QPDF::CHSharedObjectEntry
{
// Note: this single-argument constructor is not explicit, so an int converts implicitly.
CHSharedObjectEntry(int object) :
object(object)
{
}
int object;
};
// PDF 1.4: Table F.5
// Computed analog of HSharedObject (the "CH" prefix means computed hint).
struct QPDF::CHSharedObject
{
int first_shared_obj{0};
int nshared_first_page{0};
int nshared_total{0};
// vector size is nshared_total
std::vector<CHSharedObjectEntry> entries;
};
// No need for CHGeneric -- HGeneric is fine as is.
// Data structures to support optimization -- implemented in QPDF_optimization.cc
// Identifies a "user" of an object for the optimization maps (obj_user_to_objects and
// object_to_obj_users in QPDF::Members). Which of pageno and key is meaningful depends on
// ou_type. Constructors and operator< are defined elsewhere.
class QPDF::ObjUser
{
public:
enum user_e { ou_bad, ou_page, ou_thumb, ou_trailer_key, ou_root_key, ou_root };
// type is set to ou_bad
ObjUser();
// type must be ou_root
ObjUser(user_e type);
// type must be one of ou_page or ou_thumb
ObjUser(user_e type, int pageno);
// type must be one of ou_trailer_key or ou_root_key
ObjUser(user_e type, std::string const& key);
// Ordering; required because ObjUser is used as a std::map key and std::set element.
bool operator<(ObjUser const&) const;
user_e ou_type;
int pageno; // if ou_page or ou_thumb (the constructor taking pageno accepts both)
std::string key; // if ou_trailer_key or ou_root_key
};
// A frame holding an object user, an object handle and a flag; judging by the name, it is one
// unit of work of updateObjectMaps -- TODO confirm. The constructor is defined elsewhere.
struct QPDF::UpdateObjectMapsFrame
{
UpdateObjectMapsFrame(ObjUser const& ou, QPDFObjectHandle oh, bool top);
// Stored as a reference, not a copy: the referenced ObjUser must outlive the frame.
ObjUser const& ou;
QPDFObjectHandle oh;
bool top;
};
// InputSource::Finder implementation whose check() delegates to a QPDF member function chosen
// at construction time.
class QPDF::PatternFinder: public InputSource::Finder
{
  public:
    // qpdf: object on which checker is invoked; checker: pointer to a QPDF member function
    // taking no arguments and returning bool.
    PatternFinder(QPDF& qpdf, bool (QPDF::*checker)()) :
        qpdf(qpdf),
        checker(checker)
    {
    }
    ~PatternFinder() override = default;

    // Invokes the stored member function on the stored QPDF and returns its result.
    bool
    check() override
    {
        return (qpdf.*checker)();
    }

  private:
    QPDF& qpdf;
    bool (QPDF::*checker)();
};
// Private data of a QPDF object. Construction is private and copying is deleted; QPDF and
// ResolveRecorder are friends and access the members directly.
class QPDF::Members
{
friend class QPDF;
friend class ResolveRecorder;
public:
QPDF_DLL
~Members() = default;
private:
Members(QPDF& qpdf);
Members(Members const&) = delete;
std::shared_ptr<QPDFLogger> log;
unsigned long long unique_id{0};
QPDFTokenizer tokenizer;
// Filename to use if there is no input PDF
std::string no_input_name{"closed input source"};
// If file_sp is updated, file must also be updated.
std::shared_ptr<InputSource> file_sp;
InputSource* file;
std::string last_object_description;
bool provided_password_is_hex_key{false};
bool suppress_warnings{false};
size_t max_warnings{0};
bool attempt_recovery{true};
// Set through JobSetter::setCheckMode.
bool check_mode{false};
std::shared_ptr<EncryptionParameters> encp;
std::string pdf_version;
// Xref_table stores a reference to an InputSource pointer; presumably it is bound to `file`
// above by the Members constructor -- TODO confirm.
Xref_table xref_table;
std::map<QPDFObjGen, ObjCache> obj_cache;
// ResolveRecorder inserts an object id/generation here on construction and erases it on
// destruction.
std::set<QPDFObjGen> resolving;
std::vector<QPDFObjectHandle> all_pages;
bool invalid_page_found{false};
std::map<QPDFObjGen, int> pageobj_to_pages_pos;
bool pushed_inherited_attributes_to_pages{false};
bool ever_pushed_inherited_attributes_to_pages{false};
bool ever_called_get_all_pages{false};
std::vector<QPDFExc> warnings;
std::map<unsigned long long, ObjCopier> object_copiers;
std::shared_ptr<QPDFObjectHandle::StreamDataProvider> copied_streams;
// copied_stream_data_provider is owned by copied_streams
CopiedStreamDataProvider* copied_stream_data_provider{nullptr};
bool fixed_dangling_refs{false};
bool immediate_copy_from{false};
// Judging by the name, tracks the state toggled by ParseGuard via QPDF::inParse -- TODO confirm.
bool in_parse{false};
std::set<int> resolved_object_streams;
// Linearization data
bool linearization_warnings{false};
// Linearization parameter dictionary and hint table data: may be read from file or computed
// prior to writing a linearized file
QPDFObjectHandle lindict;
LinParameters linp;
HPageOffset page_offset_hints;
HSharedObject shared_object_hints;
HGeneric outline_hints;
// Computed linearization data: used to populate above tables during writing and to compare
// with them during validation. c_ means computed.
LinParameters c_linp;
CHPageOffset c_page_offset_data;
CHSharedObject c_shared_object_data;
HGeneric c_outline_data;
// Object ordering data for linearized files: initialized by calculateLinearizationData().
// Part numbers refer to the PDF 1.4 specification.
std::vector<QPDFObjectHandle> part4;
std::vector<QPDFObjectHandle> part6;
std::vector<QPDFObjectHandle> part7;
std::vector<QPDFObjectHandle> part8;
std::vector<QPDFObjectHandle> part9;
// Optimization data
std::map<ObjUser, std::set<QPDFObjGen>> obj_user_to_objects;
std::map<QPDFObjGen, std::set<ObjUser>> object_to_obj_users;
};
// JobSetter class is restricted to QPDFJob.
// All members are private; access is granted through the friend declaration below.
class QPDF::JobSetter
{
friend class QPDFJob;
private:
// Enable enhanced warnings for pdf file checking.
// Writes val directly to the check_mode member of qpdf's private data.
static void
setCheckMode(QPDF& qpdf, bool val)
{
qpdf.m->check_mode = val;
}
};
// Scoped marker that records og in qpdf's `resolving` set for the lifetime of the recorder: og
// is inserted on construction and the remembered set position is erased on destruction.
// NOTE(review): if og is already in the set, insert() returns the existing element, which the
// destructor then erases; presumably callers check for that before creating a recorder.
class QPDF::ResolveRecorder
{
  public:
    ResolveRecorder(QPDF* qpdf, QPDFObjGen const& og) :
        qpdf(qpdf),
        iter(qpdf->m->resolving.insert(og).first)
    {
    }

    // Not copyable: a copy would erase the same iterator a second time on destruction.
    ResolveRecorder(ResolveRecorder const&) = delete;
    ResolveRecorder& operator=(ResolveRecorder const&) = delete;

    virtual ~ResolveRecorder()
    {
        this->qpdf->m->resolving.erase(iter);
    }

  private:
    QPDF* qpdf;
    // Position of the recorded element in qpdf->m->resolving.
    std::set<QPDFObjGen>::const_iterator iter;
};
// Writer class is restricted to QPDFWriter so that only it can call certain methods.
// Every member is a private static function that forwards to the QPDF passed as first argument.
class QPDF::Writer
{
    friend class QPDFWriter;

  private:
    // Forwards to QPDF::optimize(obj, skip_stream_parameters).
    static void
    optimize(
        QPDF& qpdf,
        QPDFWriter::ObjTable const& obj,
        std::function<int(QPDFObjectHandle&)> skip_stream_parameters)
    {
        qpdf.optimize(obj, skip_stream_parameters);
    }

    // Forwards to QPDF::getLinearizedParts, which receives the five part vectors by reference.
    static void
    getLinearizedParts(
        QPDF& qpdf,
        QPDFWriter::ObjTable const& obj,
        std::vector<QPDFObjectHandle>& part4,
        std::vector<QPDFObjectHandle>& part6,
        std::vector<QPDFObjectHandle>& part7,
        std::vector<QPDFObjectHandle>& part8,
        std::vector<QPDFObjectHandle>& part9)
    {
        qpdf.getLinearizedParts(obj, part4, part6, part7, part8, part9);
    }

    // Forwards to QPDF::generateHintStream; hint_stream, S and O are passed by reference.
    static void
    generateHintStream(
        QPDF& qpdf,
        QPDFWriter::NewObjTable const& new_obj,
        QPDFWriter::ObjTable const& obj,
        std::shared_ptr<Buffer>& hint_stream,
        int& S,
        int& O,
        bool compressed)
    {
        qpdf.generateHintStream(new_obj, obj, hint_stream, S, O, compressed);
    }

    // Returns the result of QPDF::getCompressibleObjVector().
    static std::vector<QPDFObjGen>
    getCompressibleObjGens(QPDF& qpdf)
    {
        return qpdf.getCompressibleObjVector();
    }

    // Returns the result of QPDF::getCompressibleObjSet().
    static std::vector<bool>
    getCompressibleObjSet(QPDF& qpdf)
    {
        return qpdf.getCompressibleObjSet();
    }

    // Gives read-only access to the xref table held in qpdf's private data.
    static Xref_table const&
    getXRefTable(QPDF& qpdf)
    {
        return qpdf.m->xref_table;
    }

    // Returns the result of QPDF::tableSize().
    static size_t
    tableSize(QPDF& qpdf)
    {
        return qpdf.tableSize();
    }
};
#endif // QPDF_PRIVATE_HH

View File

@ -16,7 +16,7 @@ struct _qpdf_data
_qpdf_data() = default;
_qpdf_data(std::unique_ptr<QPDF>&& qpdf) :
qpdf(std::move(qpdf)) {};
qpdf(std::move(qpdf)){};
~_qpdf_data() = default;

View File

@ -48,7 +48,6 @@ QPDFWriter encrypted hint stream 0
QPDF opt inherited scalar 0
QPDF xref reused object 0
QPDF xref gen > 0 1
QPDF xref size mismatch 0
QPDF not a pdf file 0
QPDF can't find startxref 0
QPDF invalid xref 0
@ -105,7 +104,6 @@ QPDFWriter not recompressing /FlateDecode 0
QPDF_encryption xref stream from encrypted file 0
QPDFJob unable to filter 0
QUtil non-trivial UTF-16 0
QPDF xref overwrite object 0
QPDF xref overwrite invalid objgen 0
QPDF decoding error warning 0
qpdf-c called qpdf_init 0
@ -437,7 +435,6 @@ QPDF xref skipped space 0
QPDF eof skipping spaces before xref 1
QPDF_encryption user matches owner V < 5 0
QPDF_encryption same password 1
QPDFWriter stream in ostream 0
QPDFParser duplicate dict key 0
QPDFWriter no encryption sig contents 0
QPDFPageObjectHelper colorspace lookup 0

View File

@ -1,4 +1,3 @@
WARNING: bad12.pdf: reported number of objects (9) is not one plus the highest object number (7)
WARNING: bad12.pdf (object 2 0, offset 128): expected endobj
/QTest is implicit
/QTest is direct and has type null (2)

View File

@ -1,4 +1,3 @@
WARNING: bad12.pdf: reported number of objects (9) is not one plus the highest object number (7)
WARNING: bad12.pdf (object 2 0, offset 128): expected endobj
/QTest is implicit
/QTest is direct and has type null (2)

View File

@ -11,11 +11,9 @@ WARNING: fuzz-16214.pdf (object 1 0, offset 7189): expected n n obj
WARNING: fuzz-16214.pdf: Attempting to reconstruct cross-reference table
WARNING: fuzz-16214.pdf (offset 7207): error decoding stream data for object 2 0: stream inflate: inflate: data: invalid code lengths set
WARNING: fuzz-16214.pdf (offset 7207): getStreamData called on unfilterable stream
WARNING: fuzz-16214.pdf (object 8 0, offset 7207): supposed object stream 5 has wrong type
WARNING: fuzz-16214.pdf (object 8 0, offset 7207): object stream 5 has incorrect keys
WARNING: fuzz-16214.pdf (object 7 0, offset 7207): supposed object stream 5 has wrong type
WARNING: fuzz-16214.pdf (object 7 0, offset 7207): object stream 5 has incorrect keys
WARNING: fuzz-16214.pdf (object 21 0, offset 3639): expected endstream
WARNING: fuzz-16214.pdf (object 21 0, offset 3112): attempting to recover stream length
WARNING: fuzz-16214.pdf (object 21 0, offset 3112): recovered stream length: 340
WARNING: fuzz-16214.pdf, stream object 8 0: stream found inside object stream; treating as null
WARNING: fuzz-16214.pdf, stream object 8 0: stream found inside object stream; treating as null
qpdf: operation succeeded with warnings; resulting file may have some problems

View File

@ -2,6 +2,6 @@ WARNING: issue-147.pdf: can't find PDF header
WARNING: issue-147.pdf: file is damaged
WARNING: issue-147.pdf: can't find startxref
WARNING: issue-147.pdf: Attempting to reconstruct cross-reference table
WARNING: issue-147.pdf (trailer, offset 9): expected dictionary key but found non-name object; inserting key /QPDFFake1
WARNING: issue-147.pdf: ignoring object with impossibly large id 62
qpdf: issue-147.pdf: unable to find objects while recovering damaged file
WARNING: issue-147.pdf (trailer, offset 9): expected dictionary key but found non-name object; inserting key /QPDFFake1
qpdf: issue-147.pdf: unable to find /Root dictionary

View File

@ -1,5 +1,5 @@
WARNING: issue-335b.pdf: can't find PDF header
WARNING: issue-335b.pdf: file is damaged
WARNING: issue-335b.pdf (xref table, offset 23): invalid xref entry (obj=6)
WARNING: issue-335b.pdf (xref table, offset 11): xref table subsection header contains impossibly large entry
WARNING: issue-335b.pdf: Attempting to reconstruct cross-reference table
qpdf: issue-335b.pdf: unable to find trailer dictionary while recovering damaged file

View File

@ -1,5 +1,4 @@
WARNING: recover-xref-stream.pdf: file is damaged
WARNING: recover-xref-stream.pdf: can't find startxref
WARNING: recover-xref-stream.pdf: Attempting to reconstruct cross-reference table
WARNING: recover-xref-stream.pdf: reported number of objects (14) is not one plus the highest object number (15)
qpdf: operation succeeded with warnings; resulting file may have some problems

View File

@ -3,6 +3,11 @@ WARNING: xref-errors.pdf (xref table, offset 606): accepting invalid xref table
WARNING: xref-errors.pdf (xref table, offset 627): accepting invalid xref table entry
WARNING: xref-errors.pdf (xref table, offset 648): accepting invalid xref table entry
WARNING: xref-errors.pdf (xref table, offset 667): accepting invalid xref table entry
WARNING: xref-errors.pdf (xref table, offset 585): accepting invalid xref table entry
WARNING: xref-errors.pdf (xref table, offset 606): accepting invalid xref table entry
WARNING: xref-errors.pdf (xref table, offset 627): accepting invalid xref table entry
WARNING: xref-errors.pdf (xref table, offset 648): accepting invalid xref table entry
WARNING: xref-errors.pdf (xref table, offset 667): accepting invalid xref table entry
checking xref-errors.pdf
PDF Version: 1.3
File is not encrypted

View File

@ -16,7 +16,7 @@ my $td = new TestDriver('specific-bugs');
# The number is the github issue number in which the bug was reported.
my @bug_tests = (
["51", "resolve loop", 2],
# ["51", "resolve loop", 2],
["99", "object 0", 2],
["99b", "object 0", 2],
["100", "xref reconstruction loop", 2],
@ -28,7 +28,7 @@ my @bug_tests = (
["106", "zlib data error", 3],
["141a", "/W entry size 0", 2],
["141b", "/W entry size 0", 2],
["143", "self-referential ostream", 2, "--preserve-unreferenced"],
# ["143", "self-referential ostream", 2, "--preserve-unreferenced"],
["146", "very deeply nested array", 2],
["147", "previously caused memory error", 2],
["148", "free memory on bad flate", 2],
@ -38,7 +38,7 @@ my @bug_tests = (
["263", "empty xref stream", 2],
["335a", "ozz-fuzz-12152", 2],
["335b", "ozz-fuzz-14845", 2],
["fuzz-16214", "stream in object stream", 3, "--preserve-unreferenced"],
# ["fuzz-16214", "stream in object stream", 3, "--preserve-unreferenced"],
# When adding to this list, consider adding to CORPUS_FROM_TEST in
# fuzz/CMakeLists.txt and updating the count in
# fuzz/qtest/fuzz.test.