2
1
mirror of https://github.com/qpdf/qpdf.git synced 2024-12-22 10:58:58 +00:00

Add QPDF::Xref_table members file and tokenizer

This commit is contained in:
m-holger 2024-08-12 12:22:58 +01:00
parent b1d845e708
commit 1e2dcbf03e
2 changed files with 48 additions and 49 deletions

View File

@ -201,7 +201,7 @@ QPDF::Members::Members(QPDF& qpdf) :
file_sp(new InvalidInputSource()), file_sp(new InvalidInputSource()),
file(file_sp.get()), file(file_sp.get()),
encp(new EncryptionParameters), encp(new EncryptionParameters),
xref_table(qpdf) xref_table(qpdf, file)
{ {
} }
@ -495,12 +495,10 @@ QPDF::warn(
void void
QPDF::Xref_table::initialize() QPDF::Xref_table::initialize()
{ {
auto* m = qpdf.m.get();
// PDF spec says %%EOF must be found within the last 1024 bytes of the file. We add an extra // PDF spec says %%EOF must be found within the last 1024 bytes of the file. We add an extra
// 30 characters to leave room for the startxref stuff. // 30 characters to leave room for the startxref stuff.
m->file->seek(0, SEEK_END); file->seek(0, SEEK_END);
qpdf_offset_t end_offset = m->file->tell(); qpdf_offset_t end_offset = file->tell();
max_offset = end_offset; max_offset = end_offset;
// Sanity check on object ids. All objects must appear in xref table / stream. In all realistic // Sanity check on object ids. All objects must appear in xref table / stream. In all realistic
// scenarios at least 3 bytes are required. // scenarios at least 3 bytes are required.
@ -510,8 +508,8 @@ QPDF::Xref_table::initialize()
qpdf_offset_t start_offset = (end_offset > 1054 ? end_offset - 1054 : 0); qpdf_offset_t start_offset = (end_offset > 1054 ? end_offset - 1054 : 0);
PatternFinder sf(qpdf, &QPDF::findStartxref); PatternFinder sf(qpdf, &QPDF::findStartxref);
qpdf_offset_t xref_offset = 0; qpdf_offset_t xref_offset = 0;
if (m->file->findLast("startxref", start_offset, 0, sf)) { if (file->findLast("startxref", start_offset, 0, sf)) {
xref_offset = QUtil::string_to_ll(qpdf.readToken(*m->file).getValue().c_str()); xref_offset = QUtil::string_to_ll(read_token().getValue().c_str());
} }
try { try {
@ -547,11 +545,9 @@ QPDF::Xref_table::reconstruct(QPDFExc& e)
throw e; throw e;
} }
auto* m = qpdf.m.get();
// If recovery generates more than 1000 warnings, the file is so severely damaged that there // If recovery generates more than 1000 warnings, the file is so severely damaged that there
// probably is no point trying to continue. // probably is no point trying to continue.
const auto max_warnings = m->warnings.size() + 1000U; const auto max_warnings = qpdf.m->warnings.size() + 1000U;
auto check_warnings = [this, max_warnings]() { auto check_warnings = [this, max_warnings]() {
if (qpdf.m->warnings.size() > max_warnings) { if (qpdf.m->warnings.size() > max_warnings) {
throw damaged_pdf("too many errors while reconstructing cross-reference table"); throw damaged_pdf("too many errors while reconstructing cross-reference table");
@ -560,7 +556,7 @@ QPDF::Xref_table::reconstruct(QPDFExc& e)
reconstructed = true; reconstructed = true;
// We may find more objects, which may contain dangling references. // We may find more objects, which may contain dangling references.
m->fixed_dangling_refs = false; qpdf.m->fixed_dangling_refs = false;
warn_damaged("file is damaged"); warn_damaged("file is damaged");
qpdf.warn(e); qpdf.warn(e);
@ -577,18 +573,18 @@ QPDF::Xref_table::reconstruct(QPDFExc& e)
erase(iter); erase(iter);
} }
m->file->seek(0, SEEK_END); file->seek(0, SEEK_END);
qpdf_offset_t eof = m->file->tell(); qpdf_offset_t eof = file->tell();
m->file->seek(0, SEEK_SET); file->seek(0, SEEK_SET);
// Don't allow very long tokens here during recovery. All the interesting tokens are covered. // Don't allow very long tokens here during recovery. All the interesting tokens are covered.
static size_t const MAX_LEN = 10; static size_t const MAX_LEN = 10;
while (m->file->tell() < eof) { while (file->tell() < eof) {
QPDFTokenizer::Token t1 = qpdf.readToken(*m->file, MAX_LEN); QPDFTokenizer::Token t1 = read_token(MAX_LEN);
qpdf_offset_t token_start = m->file->tell() - toO(t1.getValue().length()); qpdf_offset_t token_start = file->tell() - toO(t1.getValue().length());
if (t1.isInteger()) { if (t1.isInteger()) {
auto pos = m->file->tell(); auto pos = file->tell();
QPDFTokenizer::Token t2 = qpdf.readToken(*m->file, MAX_LEN); QPDFTokenizer::Token t2 = read_token(MAX_LEN);
if (t2.isInteger() && qpdf.readToken(*m->file, MAX_LEN).isWord("obj")) { if (t2.isInteger() && read_token(MAX_LEN).isWord("obj")) {
int obj = QUtil::string_to_int(t1.getValue().c_str()); int obj = QUtil::string_to_int(t1.getValue().c_str());
int gen = QUtil::string_to_int(t2.getValue().c_str()); int gen = QUtil::string_to_int(t2.getValue().c_str());
if (obj <= max_id) { if (obj <= max_id) {
@ -597,19 +593,19 @@ QPDF::Xref_table::reconstruct(QPDFExc& e)
warn_damaged("ignoring object with impossibly large id " + std::to_string(obj)); warn_damaged("ignoring object with impossibly large id " + std::to_string(obj));
} }
} }
m->file->seek(pos, SEEK_SET); file->seek(pos, SEEK_SET);
} else if (!trailer && t1.isWord("trailer")) { } else if (!trailer && t1.isWord("trailer")) {
auto pos = m->file->tell(); auto pos = file->tell();
QPDFObjectHandle t = qpdf.readTrailer(); QPDFObjectHandle t = qpdf.readTrailer();
if (!t.isDictionary()) { if (!t.isDictionary()) {
// Oh well. It was worth a try. // Oh well. It was worth a try.
} else { } else {
trailer = t; trailer = t;
} }
m->file->seek(pos, SEEK_SET); file->seek(pos, SEEK_SET);
} }
check_warnings(); check_warnings();
m->file->findAndSkipNextEOL(); file->findAndSkipNextEOL();
} }
deleted_objects.clear(); deleted_objects.clear();
@ -664,7 +660,7 @@ QPDF::Xref_table::reconstruct(QPDFExc& e)
parsed = true; parsed = true;
qpdf.getAllPages(); qpdf.getAllPages();
check_warnings(); check_warnings();
if (m->all_pages.empty()) { if (qpdf.m->all_pages.empty()) {
parsed = false; parsed = false;
throw damaged_pdf("unable to find any pages while recovering damaged file"); throw damaged_pdf("unable to find any pages while recovering damaged file");
} }
@ -679,15 +675,13 @@ QPDF::Xref_table::reconstruct(QPDFExc& e)
void void
QPDF::Xref_table::read(qpdf_offset_t xref_offset) QPDF::Xref_table::read(qpdf_offset_t xref_offset)
{ {
auto* m = qpdf.m.get();
std::map<int, int> free_table; std::map<int, int> free_table;
std::set<qpdf_offset_t> visited; std::set<qpdf_offset_t> visited;
while (xref_offset) { while (xref_offset) {
visited.insert(xref_offset); visited.insert(xref_offset);
char buf[7]; char buf[7];
memset(buf, 0, sizeof(buf)); memset(buf, 0, sizeof(buf));
m->file->seek(xref_offset, SEEK_SET); file->seek(xref_offset, SEEK_SET);
// Some files miss the mark a little with startxref. We could do a better job of searching // Some files miss the mark a little with startxref. We could do a better job of searching
// in the neighborhood for something that looks like either an xref table or stream, but the // in the neighborhood for something that looks like either an xref table or stream, but the
// simple heuristic of skipping whitespace can help with the xref table case and is harmless // simple heuristic of skipping whitespace can help with the xref table case and is harmless
@ -696,11 +690,11 @@ QPDF::Xref_table::read(qpdf_offset_t xref_offset)
bool skipped_space = false; bool skipped_space = false;
while (!done) { while (!done) {
char ch; char ch;
if (1 == m->file->read(&ch, 1)) { if (1 == file->read(&ch, 1)) {
if (QUtil::is_space(ch)) { if (QUtil::is_space(ch)) {
skipped_space = true; skipped_space = true;
} else { } else {
m->file->unreadCh(ch); file->unreadCh(ch);
done = true; done = true;
} }
} else { } else {
@ -709,7 +703,7 @@ QPDF::Xref_table::read(qpdf_offset_t xref_offset)
} }
} }
m->file->read(buf, sizeof(buf) - 1); file->read(buf, sizeof(buf) - 1);
// The PDF spec says xref must be followed by a line terminator, but files exist in the wild // The PDF spec says xref must be followed by a line terminator, but files exist in the wild
// where it is terminated by arbitrary whitespace. // where it is terminated by arbitrary whitespace.
if ((strncmp(buf, "xref", 4) == 0) && QUtil::is_space(buf[4])) { if ((strncmp(buf, "xref", 4) == 0) && QUtil::is_space(buf[4])) {
@ -823,11 +817,9 @@ QPDF::Xref_table::parse_first(std::string const& line, int& obj, int& num, int&
bool bool
QPDF::Xref_table::read_bad_entry(qpdf_offset_t& f1, int& f2, char& type) QPDF::Xref_table::read_bad_entry(qpdf_offset_t& f1, int& f2, char& type)
{ {
auto* m = qpdf.m.get();
// Reposition after initial read attempt and reread. // Reposition after initial read attempt and reread.
m->file->seek(m->file->getLastOffset(), SEEK_SET); file->seek(file->getLastOffset(), SEEK_SET);
auto line = m->file->readLine(30); auto line = file->readLine(30);
// is_space and is_digit both return false on '\0', so this will not overrun the null-terminated // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
// buffer. // buffer.
@ -907,10 +899,8 @@ QPDF::Xref_table::read_bad_entry(qpdf_offset_t& f1, int& f2, char& type)
bool bool
QPDF::Xref_table::read_entry(qpdf_offset_t& f1, int& f2, char& type) QPDF::Xref_table::read_entry(qpdf_offset_t& f1, int& f2, char& type)
{ {
auto* m = qpdf.m.get();
std::array<char, 21> line; std::array<char, 21> line;
if (m->file->read(line.data(), 20) != 20) { if (file->read(line.data(), 20) != 20) {
// C++20: [[unlikely]] // C++20: [[unlikely]]
return false; return false;
} }
@ -963,13 +953,11 @@ QPDF::Xref_table::read_entry(qpdf_offset_t& f1, int& f2, char& type)
qpdf_offset_t qpdf_offset_t
QPDF::Xref_table::read_table(qpdf_offset_t xref_offset) QPDF::Xref_table::read_table(qpdf_offset_t xref_offset)
{ {
auto* m = qpdf.m.get(); file->seek(xref_offset, SEEK_SET);
m->file->seek(xref_offset, SEEK_SET);
std::string line; std::string line;
while (true) { while (true) {
line.assign(50, '\0'); line.assign(50, '\0');
m->file->read(line.data(), line.size()); file->read(line.data(), line.size());
int obj = 0; int obj = 0;
int num = 0; int num = 0;
int bytes = 0; int bytes = 0;
@ -977,11 +965,11 @@ QPDF::Xref_table::read_table(qpdf_offset_t xref_offset)
QTC::TC("qpdf", "QPDF invalid xref"); QTC::TC("qpdf", "QPDF invalid xref");
throw damaged_table("xref syntax invalid"); throw damaged_table("xref syntax invalid");
} }
m->file->seek(m->file->getLastOffset() + bytes, SEEK_SET); file->seek(file->getLastOffset() + bytes, SEEK_SET);
for (qpdf_offset_t i = obj; i - num < obj; ++i) { for (qpdf_offset_t i = obj; i - num < obj; ++i) {
if (i == 0) { if (i == 0) {
// This is needed by checkLinearization() // This is needed by checkLinearization()
first_item_offset = m->file->tell(); first_item_offset = file->tell();
} }
// For xref_table, these will always be small enough to be ints // For xref_table, these will always be small enough to be ints
qpdf_offset_t f1 = 0; qpdf_offset_t f1 = 0;
@ -997,11 +985,11 @@ QPDF::Xref_table::read_table(qpdf_offset_t xref_offset)
insert(toI(i), 1, f1, f2); insert(toI(i), 1, f1, f2);
} }
} }
qpdf_offset_t pos = m->file->tell(); qpdf_offset_t pos = file->tell();
if (qpdf.readToken(*m->file).isWord("trailer")) { if (read_token().isWord("trailer")) {
break; break;
} else { } else {
m->file->seek(pos, SEEK_SET); file->seek(pos, SEEK_SET);
} }
} }

View File

@ -7,9 +7,11 @@
class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry> class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry>
{ {
public: public:
Xref_table(QPDF& qpdf) : Xref_table(QPDF& qpdf, InputSource* const& file) :
qpdf(qpdf) qpdf(qpdf),
file(file)
{ {
tokenizer.allowEOF();
} }
void initialize(); void initialize();
@ -50,6 +52,12 @@ class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry>
int max_num_entries, int max_num_entries,
std::function<QPDFExc(std::string_view)> damaged); std::function<QPDFExc(std::string_view)> damaged);
// Read the next token from `file` using this table's own `tokenizer` member.
// `max_len` is forwarded unchanged to QPDFTokenizer::readToken; it defaults to 0 for callers
// that pass no limit.
// NOTE(review): the empty string and `true` arguments are presumably the error-context string
// and an "allow bad tokens" flag -- confirm against QPDFTokenizer::readToken.
QPDFTokenizer::Token
read_token(size_t max_len = 0)
{
return tokenizer.readToken(*file, "", true, max_len);
}
// Methods to insert table entries // Methods to insert table entries
void insert_reconstructed(int obj, qpdf_offset_t f1, int f2); void insert_reconstructed(int obj, qpdf_offset_t f1, int f2);
void insert(int obj, int f0, qpdf_offset_t f1, int f2); void insert(int obj, int f0, qpdf_offset_t f1, int f2);
@ -72,7 +80,10 @@ class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry>
{ {
qpdf.warn(damaged_pdf(msg)); qpdf.warn(damaged_pdf(msg));
} }
QPDF& qpdf; QPDF& qpdf;
InputSource* const& file;
QPDFTokenizer tokenizer;
}; };
// Writer class is restricted to QPDFWriter so that only it can call certain methods. // Writer class is restricted to QPDFWriter so that only it can call certain methods.