mirror of
https://github.com/qpdf/qpdf.git
synced 2024-12-22 10:58:58 +00:00
Add QPDF::Xref_table members file and tokenizer
This commit is contained in:
parent
b1d845e708
commit
1e2dcbf03e
@ -201,7 +201,7 @@ QPDF::Members::Members(QPDF& qpdf) :
|
|||||||
file_sp(new InvalidInputSource()),
|
file_sp(new InvalidInputSource()),
|
||||||
file(file_sp.get()),
|
file(file_sp.get()),
|
||||||
encp(new EncryptionParameters),
|
encp(new EncryptionParameters),
|
||||||
xref_table(qpdf)
|
xref_table(qpdf, file)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -495,12 +495,10 @@ QPDF::warn(
|
|||||||
void
|
void
|
||||||
QPDF::Xref_table::initialize()
|
QPDF::Xref_table::initialize()
|
||||||
{
|
{
|
||||||
auto* m = qpdf.m.get();
|
|
||||||
|
|
||||||
// PDF spec says %%EOF must be found within the last 1024 bytes of the file. We add an extra
|
// PDF spec says %%EOF must be found within the last 1024 bytes of the file. We add an extra
|
||||||
// 30 characters to leave room for the startxref stuff.
|
// 30 characters to leave room for the startxref stuff.
|
||||||
m->file->seek(0, SEEK_END);
|
file->seek(0, SEEK_END);
|
||||||
qpdf_offset_t end_offset = m->file->tell();
|
qpdf_offset_t end_offset = file->tell();
|
||||||
max_offset = end_offset;
|
max_offset = end_offset;
|
||||||
// Sanity check on object ids. All objects must appear in xref table / stream. In all realistic
|
// Sanity check on object ids. All objects must appear in xref table / stream. In all realistic
|
||||||
// scenarios at least 3 bytes are required.
|
// scenarios at least 3 bytes are required.
|
||||||
@ -510,8 +508,8 @@ QPDF::Xref_table::initialize()
|
|||||||
qpdf_offset_t start_offset = (end_offset > 1054 ? end_offset - 1054 : 0);
|
qpdf_offset_t start_offset = (end_offset > 1054 ? end_offset - 1054 : 0);
|
||||||
PatternFinder sf(qpdf, &QPDF::findStartxref);
|
PatternFinder sf(qpdf, &QPDF::findStartxref);
|
||||||
qpdf_offset_t xref_offset = 0;
|
qpdf_offset_t xref_offset = 0;
|
||||||
if (m->file->findLast("startxref", start_offset, 0, sf)) {
|
if (file->findLast("startxref", start_offset, 0, sf)) {
|
||||||
xref_offset = QUtil::string_to_ll(qpdf.readToken(*m->file).getValue().c_str());
|
xref_offset = QUtil::string_to_ll(read_token().getValue().c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@ -547,11 +545,9 @@ QPDF::Xref_table::reconstruct(QPDFExc& e)
|
|||||||
throw e;
|
throw e;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto* m = qpdf.m.get();
|
|
||||||
|
|
||||||
// If recovery generates more than 1000 warnings, the file is so severely damaged that there
|
// If recovery generates more than 1000 warnings, the file is so severely damaged that there
|
||||||
// probably is no point trying to continue.
|
// probably is no point trying to continue.
|
||||||
const auto max_warnings = m->warnings.size() + 1000U;
|
const auto max_warnings = qpdf.m->warnings.size() + 1000U;
|
||||||
auto check_warnings = [this, max_warnings]() {
|
auto check_warnings = [this, max_warnings]() {
|
||||||
if (qpdf.m->warnings.size() > max_warnings) {
|
if (qpdf.m->warnings.size() > max_warnings) {
|
||||||
throw damaged_pdf("too many errors while reconstructing cross-reference table");
|
throw damaged_pdf("too many errors while reconstructing cross-reference table");
|
||||||
@ -560,7 +556,7 @@ QPDF::Xref_table::reconstruct(QPDFExc& e)
|
|||||||
|
|
||||||
reconstructed = true;
|
reconstructed = true;
|
||||||
// We may find more objects, which may contain dangling references.
|
// We may find more objects, which may contain dangling references.
|
||||||
m->fixed_dangling_refs = false;
|
qpdf.m->fixed_dangling_refs = false;
|
||||||
|
|
||||||
warn_damaged("file is damaged");
|
warn_damaged("file is damaged");
|
||||||
qpdf.warn(e);
|
qpdf.warn(e);
|
||||||
@ -577,18 +573,18 @@ QPDF::Xref_table::reconstruct(QPDFExc& e)
|
|||||||
erase(iter);
|
erase(iter);
|
||||||
}
|
}
|
||||||
|
|
||||||
m->file->seek(0, SEEK_END);
|
file->seek(0, SEEK_END);
|
||||||
qpdf_offset_t eof = m->file->tell();
|
qpdf_offset_t eof = file->tell();
|
||||||
m->file->seek(0, SEEK_SET);
|
file->seek(0, SEEK_SET);
|
||||||
// Don't allow very long tokens here during recovery. All the interesting tokens are covered.
|
// Don't allow very long tokens here during recovery. All the interesting tokens are covered.
|
||||||
static size_t const MAX_LEN = 10;
|
static size_t const MAX_LEN = 10;
|
||||||
while (m->file->tell() < eof) {
|
while (file->tell() < eof) {
|
||||||
QPDFTokenizer::Token t1 = qpdf.readToken(*m->file, MAX_LEN);
|
QPDFTokenizer::Token t1 = read_token(MAX_LEN);
|
||||||
qpdf_offset_t token_start = m->file->tell() - toO(t1.getValue().length());
|
qpdf_offset_t token_start = file->tell() - toO(t1.getValue().length());
|
||||||
if (t1.isInteger()) {
|
if (t1.isInteger()) {
|
||||||
auto pos = m->file->tell();
|
auto pos = file->tell();
|
||||||
QPDFTokenizer::Token t2 = qpdf.readToken(*m->file, MAX_LEN);
|
QPDFTokenizer::Token t2 = read_token(MAX_LEN);
|
||||||
if (t2.isInteger() && qpdf.readToken(*m->file, MAX_LEN).isWord("obj")) {
|
if (t2.isInteger() && read_token(MAX_LEN).isWord("obj")) {
|
||||||
int obj = QUtil::string_to_int(t1.getValue().c_str());
|
int obj = QUtil::string_to_int(t1.getValue().c_str());
|
||||||
int gen = QUtil::string_to_int(t2.getValue().c_str());
|
int gen = QUtil::string_to_int(t2.getValue().c_str());
|
||||||
if (obj <= max_id) {
|
if (obj <= max_id) {
|
||||||
@ -597,19 +593,19 @@ QPDF::Xref_table::reconstruct(QPDFExc& e)
|
|||||||
warn_damaged("ignoring object with impossibly large id " + std::to_string(obj));
|
warn_damaged("ignoring object with impossibly large id " + std::to_string(obj));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
m->file->seek(pos, SEEK_SET);
|
file->seek(pos, SEEK_SET);
|
||||||
} else if (!trailer && t1.isWord("trailer")) {
|
} else if (!trailer && t1.isWord("trailer")) {
|
||||||
auto pos = m->file->tell();
|
auto pos = file->tell();
|
||||||
QPDFObjectHandle t = qpdf.readTrailer();
|
QPDFObjectHandle t = qpdf.readTrailer();
|
||||||
if (!t.isDictionary()) {
|
if (!t.isDictionary()) {
|
||||||
// Oh well. It was worth a try.
|
// Oh well. It was worth a try.
|
||||||
} else {
|
} else {
|
||||||
trailer = t;
|
trailer = t;
|
||||||
}
|
}
|
||||||
m->file->seek(pos, SEEK_SET);
|
file->seek(pos, SEEK_SET);
|
||||||
}
|
}
|
||||||
check_warnings();
|
check_warnings();
|
||||||
m->file->findAndSkipNextEOL();
|
file->findAndSkipNextEOL();
|
||||||
}
|
}
|
||||||
deleted_objects.clear();
|
deleted_objects.clear();
|
||||||
|
|
||||||
@ -664,7 +660,7 @@ QPDF::Xref_table::reconstruct(QPDFExc& e)
|
|||||||
parsed = true;
|
parsed = true;
|
||||||
qpdf.getAllPages();
|
qpdf.getAllPages();
|
||||||
check_warnings();
|
check_warnings();
|
||||||
if (m->all_pages.empty()) {
|
if (qpdf.m->all_pages.empty()) {
|
||||||
parsed = false;
|
parsed = false;
|
||||||
throw damaged_pdf("unable to find any pages while recovering damaged file");
|
throw damaged_pdf("unable to find any pages while recovering damaged file");
|
||||||
}
|
}
|
||||||
@ -679,15 +675,13 @@ QPDF::Xref_table::reconstruct(QPDFExc& e)
|
|||||||
void
|
void
|
||||||
QPDF::Xref_table::read(qpdf_offset_t xref_offset)
|
QPDF::Xref_table::read(qpdf_offset_t xref_offset)
|
||||||
{
|
{
|
||||||
auto* m = qpdf.m.get();
|
|
||||||
|
|
||||||
std::map<int, int> free_table;
|
std::map<int, int> free_table;
|
||||||
std::set<qpdf_offset_t> visited;
|
std::set<qpdf_offset_t> visited;
|
||||||
while (xref_offset) {
|
while (xref_offset) {
|
||||||
visited.insert(xref_offset);
|
visited.insert(xref_offset);
|
||||||
char buf[7];
|
char buf[7];
|
||||||
memset(buf, 0, sizeof(buf));
|
memset(buf, 0, sizeof(buf));
|
||||||
m->file->seek(xref_offset, SEEK_SET);
|
file->seek(xref_offset, SEEK_SET);
|
||||||
// Some files miss the mark a little with startxref. We could do a better job of searching
|
// Some files miss the mark a little with startxref. We could do a better job of searching
|
||||||
// in the neighborhood for something that looks like either an xref table or stream, but the
|
// in the neighborhood for something that looks like either an xref table or stream, but the
|
||||||
// simple heuristic of skipping whitespace can help with the xref table case and is harmless
|
// simple heuristic of skipping whitespace can help with the xref table case and is harmless
|
||||||
@ -696,11 +690,11 @@ QPDF::Xref_table::read(qpdf_offset_t xref_offset)
|
|||||||
bool skipped_space = false;
|
bool skipped_space = false;
|
||||||
while (!done) {
|
while (!done) {
|
||||||
char ch;
|
char ch;
|
||||||
if (1 == m->file->read(&ch, 1)) {
|
if (1 == file->read(&ch, 1)) {
|
||||||
if (QUtil::is_space(ch)) {
|
if (QUtil::is_space(ch)) {
|
||||||
skipped_space = true;
|
skipped_space = true;
|
||||||
} else {
|
} else {
|
||||||
m->file->unreadCh(ch);
|
file->unreadCh(ch);
|
||||||
done = true;
|
done = true;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -709,7 +703,7 @@ QPDF::Xref_table::read(qpdf_offset_t xref_offset)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
m->file->read(buf, sizeof(buf) - 1);
|
file->read(buf, sizeof(buf) - 1);
|
||||||
// The PDF spec says xref must be followed by a line terminator, but files exist in the wild
|
// The PDF spec says xref must be followed by a line terminator, but files exist in the wild
|
||||||
// where it is terminated by arbitrary whitespace.
|
// where it is terminated by arbitrary whitespace.
|
||||||
if ((strncmp(buf, "xref", 4) == 0) && QUtil::is_space(buf[4])) {
|
if ((strncmp(buf, "xref", 4) == 0) && QUtil::is_space(buf[4])) {
|
||||||
@ -823,11 +817,9 @@ QPDF::Xref_table::parse_first(std::string const& line, int& obj, int& num, int&
|
|||||||
bool
|
bool
|
||||||
QPDF::Xref_table::read_bad_entry(qpdf_offset_t& f1, int& f2, char& type)
|
QPDF::Xref_table::read_bad_entry(qpdf_offset_t& f1, int& f2, char& type)
|
||||||
{
|
{
|
||||||
auto* m = qpdf.m.get();
|
|
||||||
|
|
||||||
// Reposition after initial read attempt and reread.
|
// Reposition after initial read attempt and reread.
|
||||||
m->file->seek(m->file->getLastOffset(), SEEK_SET);
|
file->seek(file->getLastOffset(), SEEK_SET);
|
||||||
auto line = m->file->readLine(30);
|
auto line = file->readLine(30);
|
||||||
|
|
||||||
// is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
|
// is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
|
||||||
// buffer.
|
// buffer.
|
||||||
@ -907,10 +899,8 @@ QPDF::Xref_table::read_bad_entry(qpdf_offset_t& f1, int& f2, char& type)
|
|||||||
bool
|
bool
|
||||||
QPDF::Xref_table::read_entry(qpdf_offset_t& f1, int& f2, char& type)
|
QPDF::Xref_table::read_entry(qpdf_offset_t& f1, int& f2, char& type)
|
||||||
{
|
{
|
||||||
auto* m = qpdf.m.get();
|
|
||||||
|
|
||||||
std::array<char, 21> line;
|
std::array<char, 21> line;
|
||||||
if (m->file->read(line.data(), 20) != 20) {
|
if (file->read(line.data(), 20) != 20) {
|
||||||
// C++20: [[unlikely]]
|
// C++20: [[unlikely]]
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -963,13 +953,11 @@ QPDF::Xref_table::read_entry(qpdf_offset_t& f1, int& f2, char& type)
|
|||||||
qpdf_offset_t
|
qpdf_offset_t
|
||||||
QPDF::Xref_table::read_table(qpdf_offset_t xref_offset)
|
QPDF::Xref_table::read_table(qpdf_offset_t xref_offset)
|
||||||
{
|
{
|
||||||
auto* m = qpdf.m.get();
|
file->seek(xref_offset, SEEK_SET);
|
||||||
|
|
||||||
m->file->seek(xref_offset, SEEK_SET);
|
|
||||||
std::string line;
|
std::string line;
|
||||||
while (true) {
|
while (true) {
|
||||||
line.assign(50, '\0');
|
line.assign(50, '\0');
|
||||||
m->file->read(line.data(), line.size());
|
file->read(line.data(), line.size());
|
||||||
int obj = 0;
|
int obj = 0;
|
||||||
int num = 0;
|
int num = 0;
|
||||||
int bytes = 0;
|
int bytes = 0;
|
||||||
@ -977,11 +965,11 @@ QPDF::Xref_table::read_table(qpdf_offset_t xref_offset)
|
|||||||
QTC::TC("qpdf", "QPDF invalid xref");
|
QTC::TC("qpdf", "QPDF invalid xref");
|
||||||
throw damaged_table("xref syntax invalid");
|
throw damaged_table("xref syntax invalid");
|
||||||
}
|
}
|
||||||
m->file->seek(m->file->getLastOffset() + bytes, SEEK_SET);
|
file->seek(file->getLastOffset() + bytes, SEEK_SET);
|
||||||
for (qpdf_offset_t i = obj; i - num < obj; ++i) {
|
for (qpdf_offset_t i = obj; i - num < obj; ++i) {
|
||||||
if (i == 0) {
|
if (i == 0) {
|
||||||
// This is needed by checkLinearization()
|
// This is needed by checkLinearization()
|
||||||
first_item_offset = m->file->tell();
|
first_item_offset = file->tell();
|
||||||
}
|
}
|
||||||
// For xref_table, these will always be small enough to be ints
|
// For xref_table, these will always be small enough to be ints
|
||||||
qpdf_offset_t f1 = 0;
|
qpdf_offset_t f1 = 0;
|
||||||
@ -997,11 +985,11 @@ QPDF::Xref_table::read_table(qpdf_offset_t xref_offset)
|
|||||||
insert(toI(i), 1, f1, f2);
|
insert(toI(i), 1, f1, f2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
qpdf_offset_t pos = m->file->tell();
|
qpdf_offset_t pos = file->tell();
|
||||||
if (qpdf.readToken(*m->file).isWord("trailer")) {
|
if (read_token().isWord("trailer")) {
|
||||||
break;
|
break;
|
||||||
} else {
|
} else {
|
||||||
m->file->seek(pos, SEEK_SET);
|
file->seek(pos, SEEK_SET);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -7,9 +7,11 @@
|
|||||||
class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry>
|
class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry>
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
Xref_table(QPDF& qpdf) :
|
Xref_table(QPDF& qpdf, InputSource* const& file) :
|
||||||
qpdf(qpdf)
|
qpdf(qpdf),
|
||||||
|
file(file)
|
||||||
{
|
{
|
||||||
|
tokenizer.allowEOF();
|
||||||
}
|
}
|
||||||
|
|
||||||
void initialize();
|
void initialize();
|
||||||
@ -50,6 +52,12 @@ class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry>
|
|||||||
int max_num_entries,
|
int max_num_entries,
|
||||||
std::function<QPDFExc(std::string_view)> damaged);
|
std::function<QPDFExc(std::string_view)> damaged);
|
||||||
|
|
||||||
|
// Read the next token from `file` using this table's own `tokenizer` member.
// Forwards to QPDFTokenizer::readToken with an empty context string, `true` as the third
// argument (presumably allow_bad -- confirm against QPDFTokenizer.hh), and `max_len`.
// `max_len` defaults to 0; reconstruct() passes MAX_LEN (10) so that very long tokens are not
// allowed during recovery.
QPDFTokenizer::Token
|
||||||
|
read_token(size_t max_len = 0)
|
||||||
|
{
|
||||||
|
return tokenizer.readToken(*file, "", true, max_len);
|
||||||
|
}
|
||||||
|
|
||||||
// Methods to insert table entries
|
// Methods to insert table entries
|
||||||
void insert_reconstructed(int obj, qpdf_offset_t f1, int f2);
|
void insert_reconstructed(int obj, qpdf_offset_t f1, int f2);
|
||||||
void insert(int obj, int f0, qpdf_offset_t f1, int f2);
|
void insert(int obj, int f0, qpdf_offset_t f1, int f2);
|
||||||
@ -72,7 +80,10 @@ class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry>
|
|||||||
{
|
{
|
||||||
qpdf.warn(damaged_pdf(msg));
|
qpdf.warn(damaged_pdf(msg));
|
||||||
}
|
}
|
||||||
|
|
||||||
QPDF& qpdf;
|
QPDF& qpdf;
|
||||||
|
InputSource* const& file;
|
||||||
|
QPDFTokenizer tokenizer;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Writer class is restricted to QPDFWriter so that only it can call certain methods.
|
// Writer class is restricted to QPDFWriter so that only it can call certain methods.
|
||||||
|
Loading…
Reference in New Issue
Block a user