mirror of
https://github.com/qpdf/qpdf.git
synced 2024-12-22 02:49:00 +00:00
Add QPDF::Xref_table members file and tokenizer
This commit is contained in:
parent
b1d845e708
commit
1e2dcbf03e
@ -201,7 +201,7 @@ QPDF::Members::Members(QPDF& qpdf) :
|
||||
file_sp(new InvalidInputSource()),
|
||||
file(file_sp.get()),
|
||||
encp(new EncryptionParameters),
|
||||
xref_table(qpdf)
|
||||
xref_table(qpdf, file)
|
||||
{
|
||||
}
|
||||
|
||||
@ -495,12 +495,10 @@ QPDF::warn(
|
||||
void
|
||||
QPDF::Xref_table::initialize()
|
||||
{
|
||||
auto* m = qpdf.m.get();
|
||||
|
||||
// PDF spec says %%EOF must be found within the last 1024 bytes of the file. We add an extra
|
||||
// 30 characters to leave room for the startxref stuff.
|
||||
m->file->seek(0, SEEK_END);
|
||||
qpdf_offset_t end_offset = m->file->tell();
|
||||
file->seek(0, SEEK_END);
|
||||
qpdf_offset_t end_offset = file->tell();
|
||||
max_offset = end_offset;
|
||||
// Sanity check on object ids. All objects must appear in xref table / stream. In all realistic
|
||||
// scenarios at least 3 bytes are required.
|
||||
@ -510,8 +508,8 @@ QPDF::Xref_table::initialize()
|
||||
qpdf_offset_t start_offset = (end_offset > 1054 ? end_offset - 1054 : 0);
|
||||
PatternFinder sf(qpdf, &QPDF::findStartxref);
|
||||
qpdf_offset_t xref_offset = 0;
|
||||
if (m->file->findLast("startxref", start_offset, 0, sf)) {
|
||||
xref_offset = QUtil::string_to_ll(qpdf.readToken(*m->file).getValue().c_str());
|
||||
if (file->findLast("startxref", start_offset, 0, sf)) {
|
||||
xref_offset = QUtil::string_to_ll(read_token().getValue().c_str());
|
||||
}
|
||||
|
||||
try {
|
||||
@ -547,11 +545,9 @@ QPDF::Xref_table::reconstruct(QPDFExc& e)
|
||||
throw e;
|
||||
}
|
||||
|
||||
auto* m = qpdf.m.get();
|
||||
|
||||
// If recovery generates more than 1000 warnings, the file is so severely damaged that there
|
||||
// probably is no point trying to continue.
|
||||
const auto max_warnings = m->warnings.size() + 1000U;
|
||||
const auto max_warnings = qpdf.m->warnings.size() + 1000U;
|
||||
auto check_warnings = [this, max_warnings]() {
|
||||
if (qpdf.m->warnings.size() > max_warnings) {
|
||||
throw damaged_pdf("too many errors while reconstructing cross-reference table");
|
||||
@ -560,7 +556,7 @@ QPDF::Xref_table::reconstruct(QPDFExc& e)
|
||||
|
||||
reconstructed = true;
|
||||
// We may find more objects, which may contain dangling references.
|
||||
m->fixed_dangling_refs = false;
|
||||
qpdf.m->fixed_dangling_refs = false;
|
||||
|
||||
warn_damaged("file is damaged");
|
||||
qpdf.warn(e);
|
||||
@ -577,18 +573,18 @@ QPDF::Xref_table::reconstruct(QPDFExc& e)
|
||||
erase(iter);
|
||||
}
|
||||
|
||||
m->file->seek(0, SEEK_END);
|
||||
qpdf_offset_t eof = m->file->tell();
|
||||
m->file->seek(0, SEEK_SET);
|
||||
file->seek(0, SEEK_END);
|
||||
qpdf_offset_t eof = file->tell();
|
||||
file->seek(0, SEEK_SET);
|
||||
// Don't allow very long tokens here during recovery. All the interesting tokens are covered.
|
||||
static size_t const MAX_LEN = 10;
|
||||
while (m->file->tell() < eof) {
|
||||
QPDFTokenizer::Token t1 = qpdf.readToken(*m->file, MAX_LEN);
|
||||
qpdf_offset_t token_start = m->file->tell() - toO(t1.getValue().length());
|
||||
while (file->tell() < eof) {
|
||||
QPDFTokenizer::Token t1 = read_token(MAX_LEN);
|
||||
qpdf_offset_t token_start = file->tell() - toO(t1.getValue().length());
|
||||
if (t1.isInteger()) {
|
||||
auto pos = m->file->tell();
|
||||
QPDFTokenizer::Token t2 = qpdf.readToken(*m->file, MAX_LEN);
|
||||
if (t2.isInteger() && qpdf.readToken(*m->file, MAX_LEN).isWord("obj")) {
|
||||
auto pos = file->tell();
|
||||
QPDFTokenizer::Token t2 = read_token(MAX_LEN);
|
||||
if (t2.isInteger() && read_token(MAX_LEN).isWord("obj")) {
|
||||
int obj = QUtil::string_to_int(t1.getValue().c_str());
|
||||
int gen = QUtil::string_to_int(t2.getValue().c_str());
|
||||
if (obj <= max_id) {
|
||||
@ -597,19 +593,19 @@ QPDF::Xref_table::reconstruct(QPDFExc& e)
|
||||
warn_damaged("ignoring object with impossibly large id " + std::to_string(obj));
|
||||
}
|
||||
}
|
||||
m->file->seek(pos, SEEK_SET);
|
||||
file->seek(pos, SEEK_SET);
|
||||
} else if (!trailer && t1.isWord("trailer")) {
|
||||
auto pos = m->file->tell();
|
||||
auto pos = file->tell();
|
||||
QPDFObjectHandle t = qpdf.readTrailer();
|
||||
if (!t.isDictionary()) {
|
||||
// Oh well. It was worth a try.
|
||||
} else {
|
||||
trailer = t;
|
||||
}
|
||||
m->file->seek(pos, SEEK_SET);
|
||||
file->seek(pos, SEEK_SET);
|
||||
}
|
||||
check_warnings();
|
||||
m->file->findAndSkipNextEOL();
|
||||
file->findAndSkipNextEOL();
|
||||
}
|
||||
deleted_objects.clear();
|
||||
|
||||
@ -664,7 +660,7 @@ QPDF::Xref_table::reconstruct(QPDFExc& e)
|
||||
parsed = true;
|
||||
qpdf.getAllPages();
|
||||
check_warnings();
|
||||
if (m->all_pages.empty()) {
|
||||
if (qpdf.m->all_pages.empty()) {
|
||||
parsed = false;
|
||||
throw damaged_pdf("unable to find any pages while recovering damaged file");
|
||||
}
|
||||
@ -679,15 +675,13 @@ QPDF::Xref_table::reconstruct(QPDFExc& e)
|
||||
void
|
||||
QPDF::Xref_table::read(qpdf_offset_t xref_offset)
|
||||
{
|
||||
auto* m = qpdf.m.get();
|
||||
|
||||
std::map<int, int> free_table;
|
||||
std::set<qpdf_offset_t> visited;
|
||||
while (xref_offset) {
|
||||
visited.insert(xref_offset);
|
||||
char buf[7];
|
||||
memset(buf, 0, sizeof(buf));
|
||||
m->file->seek(xref_offset, SEEK_SET);
|
||||
file->seek(xref_offset, SEEK_SET);
|
||||
// Some files miss the mark a little with startxref. We could do a better job of searching
|
||||
// in the neighborhood for something that looks like either an xref table or stream, but the
|
||||
// simple heuristic of skipping whitespace can help with the xref table case and is harmless
|
||||
@ -696,11 +690,11 @@ QPDF::Xref_table::read(qpdf_offset_t xref_offset)
|
||||
bool skipped_space = false;
|
||||
while (!done) {
|
||||
char ch;
|
||||
if (1 == m->file->read(&ch, 1)) {
|
||||
if (1 == file->read(&ch, 1)) {
|
||||
if (QUtil::is_space(ch)) {
|
||||
skipped_space = true;
|
||||
} else {
|
||||
m->file->unreadCh(ch);
|
||||
file->unreadCh(ch);
|
||||
done = true;
|
||||
}
|
||||
} else {
|
||||
@ -709,7 +703,7 @@ QPDF::Xref_table::read(qpdf_offset_t xref_offset)
|
||||
}
|
||||
}
|
||||
|
||||
m->file->read(buf, sizeof(buf) - 1);
|
||||
file->read(buf, sizeof(buf) - 1);
|
||||
// The PDF spec says xref must be followed by a line terminator, but files exist in the wild
|
||||
// where it is terminated by arbitrary whitespace.
|
||||
if ((strncmp(buf, "xref", 4) == 0) && QUtil::is_space(buf[4])) {
|
||||
@ -823,11 +817,9 @@ QPDF::Xref_table::parse_first(std::string const& line, int& obj, int& num, int&
|
||||
bool
|
||||
QPDF::Xref_table::read_bad_entry(qpdf_offset_t& f1, int& f2, char& type)
|
||||
{
|
||||
auto* m = qpdf.m.get();
|
||||
|
||||
// Reposition after initial read attempt and reread.
|
||||
m->file->seek(m->file->getLastOffset(), SEEK_SET);
|
||||
auto line = m->file->readLine(30);
|
||||
file->seek(file->getLastOffset(), SEEK_SET);
|
||||
auto line = file->readLine(30);
|
||||
|
||||
// is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
|
||||
// buffer.
|
||||
@ -907,10 +899,8 @@ QPDF::Xref_table::read_bad_entry(qpdf_offset_t& f1, int& f2, char& type)
|
||||
bool
|
||||
QPDF::Xref_table::read_entry(qpdf_offset_t& f1, int& f2, char& type)
|
||||
{
|
||||
auto* m = qpdf.m.get();
|
||||
|
||||
std::array<char, 21> line;
|
||||
if (m->file->read(line.data(), 20) != 20) {
|
||||
if (file->read(line.data(), 20) != 20) {
|
||||
// C++20: [[unlikely]]
|
||||
return false;
|
||||
}
|
||||
@ -963,13 +953,11 @@ QPDF::Xref_table::read_entry(qpdf_offset_t& f1, int& f2, char& type)
|
||||
qpdf_offset_t
|
||||
QPDF::Xref_table::read_table(qpdf_offset_t xref_offset)
|
||||
{
|
||||
auto* m = qpdf.m.get();
|
||||
|
||||
m->file->seek(xref_offset, SEEK_SET);
|
||||
file->seek(xref_offset, SEEK_SET);
|
||||
std::string line;
|
||||
while (true) {
|
||||
line.assign(50, '\0');
|
||||
m->file->read(line.data(), line.size());
|
||||
file->read(line.data(), line.size());
|
||||
int obj = 0;
|
||||
int num = 0;
|
||||
int bytes = 0;
|
||||
@ -977,11 +965,11 @@ QPDF::Xref_table::read_table(qpdf_offset_t xref_offset)
|
||||
QTC::TC("qpdf", "QPDF invalid xref");
|
||||
throw damaged_table("xref syntax invalid");
|
||||
}
|
||||
m->file->seek(m->file->getLastOffset() + bytes, SEEK_SET);
|
||||
file->seek(file->getLastOffset() + bytes, SEEK_SET);
|
||||
for (qpdf_offset_t i = obj; i - num < obj; ++i) {
|
||||
if (i == 0) {
|
||||
// This is needed by checkLinearization()
|
||||
first_item_offset = m->file->tell();
|
||||
first_item_offset = file->tell();
|
||||
}
|
||||
// For xref_table, these will always be small enough to be ints
|
||||
qpdf_offset_t f1 = 0;
|
||||
@ -997,11 +985,11 @@ QPDF::Xref_table::read_table(qpdf_offset_t xref_offset)
|
||||
insert(toI(i), 1, f1, f2);
|
||||
}
|
||||
}
|
||||
qpdf_offset_t pos = m->file->tell();
|
||||
if (qpdf.readToken(*m->file).isWord("trailer")) {
|
||||
qpdf_offset_t pos = file->tell();
|
||||
if (read_token().isWord("trailer")) {
|
||||
break;
|
||||
} else {
|
||||
m->file->seek(pos, SEEK_SET);
|
||||
file->seek(pos, SEEK_SET);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -7,9 +7,11 @@
|
||||
class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry>
|
||||
{
|
||||
public:
|
||||
Xref_table(QPDF& qpdf) :
|
||||
qpdf(qpdf)
|
||||
Xref_table(QPDF& qpdf, InputSource* const& file) :
|
||||
qpdf(qpdf),
|
||||
file(file)
|
||||
{
|
||||
tokenizer.allowEOF();
|
||||
}
|
||||
|
||||
void initialize();
|
||||
@ -50,6 +52,12 @@ class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry>
|
||||
int max_num_entries,
|
||||
std::function<QPDFExc(std::string_view)> damaged);
|
||||
|
||||
QPDFTokenizer::Token
|
||||
read_token(size_t max_len = 0)
|
||||
{
|
||||
return tokenizer.readToken(*file, "", true, max_len);
|
||||
}
|
||||
|
||||
// Methods to insert table entries
|
||||
void insert_reconstructed(int obj, qpdf_offset_t f1, int f2);
|
||||
void insert(int obj, int f0, qpdf_offset_t f1, int f2);
|
||||
@ -72,7 +80,10 @@ class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry>
|
||||
{
|
||||
qpdf.warn(damaged_pdf(msg));
|
||||
}
|
||||
|
||||
QPDF& qpdf;
|
||||
InputSource* const& file;
|
||||
QPDFTokenizer tokenizer;
|
||||
};
|
||||
|
||||
// Writer class is restricted to QPDFWriter so that only it can call certain methods.
|
||||
|
Loading…
Reference in New Issue
Block a user