Read xref table without PCRE

Also accept more errors than before.
This commit is contained in:
Jay Berkenbilt 2017-08-10 19:37:05 -04:00
parent 98a843c2a2
commit 30f109e244
7 changed files with 276 additions and 18 deletions

View File

@ -1,5 +1,8 @@
2017-08-10 Jay Berkenbilt <ejb@ql.org>
* Be more forgiving of certain types of errors in the xref table
that don't interfere with interpreting the table.
* Remove unused "tracing" parameter from PointerHolder's
(T*, bool) constructor. This change breaks source code
compatibility, but since this argument to PointerHolder has not

View File

@ -652,6 +652,10 @@ class QPDF
void setTrailer(QPDFObjectHandle obj);
void read_xref(qpdf_offset_t offset);
void reconstruct_xref(QPDFExc& e);
// Parse the header line of an xref table subsection: two unsigned
// decimal numbers separated by whitespace, with optional leading and
// trailing whitespace. On success, stores the first number in obj,
// the second in num, and the count of characters consumed from the
// start of line (trailing whitespace included) in bytes, and returns
// true. Returns false if the line does not have that form.
bool parse_xrefFirst(std::string const& line,
int& obj, int& num, int& bytes);
// Parse a single xref table entry: a digit field, a second digit
// field, and a type character ('f' or 'n'), separated by whitespace.
// On success, stores the fields in f1, f2, and type and returns
// true. Entries that deviate from the strict layout (leading or
// extra spaces, fields that are not 10 and 5 digits long) are still
// accepted, but a warning is issued. Returns false if the entry
// cannot be interpreted at all.
bool parse_xrefEntry(std::string const& line,
qpdf_offset_t& f1, int& f2, char& type);
qpdf_offset_t read_xrefTable(qpdf_offset_t offset);
qpdf_offset_t read_xrefStream(qpdf_offset_t offset);
qpdf_offset_t processXRefStream(

View File

@ -9,7 +9,6 @@
#include <qpdf/QTC.hh>
#include <qpdf/QUtil.hh>
#include <qpdf/PCRE.hh>
#include <qpdf/Pipeline.hh>
#include <qpdf/Pl_Discard.hh>
#include <qpdf/FileInputSource.hh>
@ -537,12 +536,162 @@ QPDF::read_xref(qpdf_offset_t xref_offset)
this->deleted_objects.clear();
}
// Parse the header line of an xref table subsection, which consists
// of two unsigned decimal numbers separated by whitespace. Leading
// whitespace is permitted, and any whitespace following the second
// number (including line terminators) is consumed as well.
//
// line:  text read from the file starting at the subsection header
// obj:   on success, receives the first number on the line
// num:   on success, receives the second number on the line
// bytes: on success, receives the number of characters consumed from
//        the start of line, including trailing whitespace
//
// Returns true if the line has the expected form. Returns false,
// leaving obj, num, and bytes untouched, if the syntax is wrong or if
// either number does not fit in an int.
bool
QPDF::parse_xrefFirst(std::string const& line,
                      int& obj, int& num, int& bytes)
{
    // is_space and is_digit both return false on '\0', so this will
    // not overrun the null-terminated buffer.
    char const* const start = line.c_str();
    char const* p = start;

    // Skip zero or more leading spaces
    while (QUtil::is_space(*p))
    {
        ++p;
    }

    // First number: at least one digit is required
    if (! QUtil::is_digit(*p))
    {
        return false;
    }
    std::string obj_str;
    while (QUtil::is_digit(*p))
    {
        obj_str.append(1, *p++);
    }

    // At least one space must separate the two numbers
    if (! QUtil::is_space(*p))
    {
        return false;
    }
    while (QUtil::is_space(*p))
    {
        ++p;
    }

    // Second number: at least one digit is required
    if (! QUtil::is_digit(*p))
    {
        return false;
    }
    std::string num_str;
    while (QUtil::is_digit(*p))
    {
        num_str.append(1, *p++);
    }

    // Skip any trailing space including line terminators
    while (QUtil::is_space(*p))
    {
        ++p;
    }

    // The digit strings come straight from the file and may be
    // arbitrarily long. atoi has undefined behavior when the value
    // does not fit in an int, so convert through long long and reject
    // anything that does not survive the round trip to int.
    // NOTE(review): this relies on QUtil::string_to_ll saturating
    // (strtoll-style) rather than misbehaving on overflow -- confirm.
    long long const obj_ll = QUtil::string_to_ll(obj_str.c_str());
    long long const num_ll = QUtil::string_to_ll(num_str.c_str());
    int const obj_val = static_cast<int>(obj_ll);
    int const num_val = static_cast<int>(num_ll);
    if ((obj_val != obj_ll) || (num_val != num_ll))
    {
        return false;
    }

    bytes = static_cast<int>(p - start);
    obj = obj_val;
    num = num_val;
    return true;
}
// Parse one entry of an xref table. The strict layout is a 10-digit
// field, a single space, a 5-digit field, a single space, and a type
// character that is either 'f' or 'n'. This parser is deliberately
// lenient: leading spaces, runs of more than one space between
// fields, and fields with a different number of digits are accepted,
// but each such entry causes a "damaged PDF" warning to be issued.
//
// line: text of the entry as read from the file
// f1:   on success, receives the value of the first digit field
// f2:   on success, receives the value of the second digit field
// type: on success, receives the type character ('f' or 'n')
//
// Returns true if the entry could be interpreted. Returns false,
// leaving f1, f2, and type untouched, if a required field or
// separator is missing, the type character is not 'f' or 'n', or the
// second field does not fit in an int.
bool
QPDF::parse_xrefEntry(std::string const& line,
                      qpdf_offset_t& f1, int& f2, char& type)
{
    // is_space and is_digit both return false on '\0', so this will
    // not overrun the null-terminated buffer.
    char const* p = line.c_str();

    // Set when the entry is usable but deviates from the strict layout
    bool invalid = false;

    // Skip zero or more spaces. There aren't supposed to be any.
    while (QUtil::is_space(*p))
    {
        ++p;
        QTC::TC("qpdf", "QPDF ignore first space in xref entry");
        invalid = true;
    }

    // First field: at least one digit is required
    if (! QUtil::is_digit(*p))
    {
        return false;
    }
    std::string f1_str;
    while (QUtil::is_digit(*p))
    {
        f1_str.append(1, *p++);
    }

    // Exactly one space is expected here; more than one is tolerated
    if (! QUtil::is_space(*p))
    {
        return false;
    }
    if (QUtil::is_space(*(p+1)))
    {
        QTC::TC("qpdf", "QPDF ignore first extra space in xref entry");
        invalid = true;
    }
    while (QUtil::is_space(*p))
    {
        ++p;
    }

    // Second field: at least one digit is required
    if (! QUtil::is_digit(*p))
    {
        return false;
    }
    std::string f2_str;
    while (QUtil::is_digit(*p))
    {
        f2_str.append(1, *p++);
    }

    // Exactly one space is expected here; more than one is tolerated
    if (! QUtil::is_space(*p))
    {
        return false;
    }
    if (QUtil::is_space(*(p+1)))
    {
        QTC::TC("qpdf", "QPDF ignore second extra space in xref entry");
        invalid = true;
    }
    while (QUtil::is_space(*p))
    {
        ++p;
    }

    // Type character. Anything after it is not examined.
    if ((*p != 'f') && (*p != 'n'))
    {
        return false;
    }
    char const entry_type = *p;

    // The second field comes straight from the file and may be
    // arbitrarily long. atoi has undefined behavior when the value
    // does not fit in an int, so convert through long long and reject
    // anything that does not survive the round trip to int. Do this
    // before warning so that an entry is never reported as accepted
    // and then rejected.
    // NOTE(review): this relies on QUtil::string_to_ll saturating
    // (strtoll-style) rather than misbehaving on overflow -- confirm.
    long long const f2_ll = QUtil::string_to_ll(f2_str.c_str());
    int const f2_val = static_cast<int>(f2_ll);
    if (f2_val != f2_ll)
    {
        return false;
    }

    if ((f1_str.length() != 10) || (f2_str.length() != 5))
    {
        QTC::TC("qpdf", "QPDF ignore length error xref entry");
        invalid = true;
    }

    if (invalid)
    {
        warn(QPDFExc(qpdf_e_damaged_pdf, this->file->getName(),
                     "xref table",
                     this->file->getLastOffset(),
                     "accepting invalid xref table entry"));
    }

    f1 = QUtil::string_to_ll(f1_str.c_str());
    f2 = f2_val;
    type = entry_type;
    return true;
}
qpdf_offset_t
QPDF::read_xrefTable(qpdf_offset_t xref_offset)
{
PCRE xref_first_re("^\\s*(\\d+)\\s+(\\d+)\\s*");
PCRE xref_entry_re("(?s:(^\\d{10}) (\\d{5}) ([fn])\\s*$)");
std::vector<QPDFObjGen> deleted_items;
this->file->seek(xref_offset, SEEK_SET);
@ -553,18 +702,17 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset)
memset(linebuf, 0, sizeof(linebuf));
this->file->read(linebuf, sizeof(linebuf) - 1);
std::string line = linebuf;
PCRE::Match m1 = xref_first_re.match(line.c_str());
if (! m1)
int obj = 0;
int num = 0;
int bytes = 0;
if (! parse_xrefFirst(line, obj, num, bytes))
{
QTC::TC("qpdf", "QPDF invalid xref");
throw QPDFExc(qpdf_e_damaged_pdf, this->file->getName(),
"xref table", this->file->getLastOffset(),
"xref syntax invalid");
}
file->seek(this->file->getLastOffset() + m1.getMatch(0).length(),
SEEK_SET);
int obj = atoi(m1.getMatch(1).c_str());
int num = atoi(m1.getMatch(2).c_str());
this->file->seek(this->file->getLastOffset() + bytes, SEEK_SET);
for (int i = obj; i < obj + num; ++i)
{
if (i == 0)
@ -573,8 +721,11 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset)
this->first_xref_item_offset = this->file->tell();
}
std::string xref_entry = this->file->readLine(30);
PCRE::Match m2 = xref_entry_re.match(xref_entry.c_str());
if (! m2)
// For xref_table, these will always be small enough to be ints
qpdf_offset_t f1 = 0;
int f2 = 0;
char type = '\0';
if (! parse_xrefEntry(xref_entry, f1, f2, type))
{
QTC::TC("qpdf", "QPDF invalid xref entry");
throw QPDFExc(
@ -583,11 +734,6 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset)
"invalid xref entry (obj=" +
QUtil::int_to_string(i) + ")");
}
// For xref_table, these will always be small enough to be ints
qpdf_offset_t f1 = QUtil::string_to_ll(m2.getMatch(1).c_str());
int f2 = atoi(m2.getMatch(2).c_str());
char type = m2.getMatch(3).at(0);
if (type == 'f')
{
// Save deleted items until after we've checked the

View File

@ -289,3 +289,7 @@ qpdf single-pages %d 0
qpdf single-pages .pdf 0
qpdf single-pages other 0
QPDFTokenizer allowing bad token 0
QPDF ignore first space in xref entry 0
QPDF ignore first extra space in xref entry 0
QPDF ignore second extra space in xref entry 0
QPDF ignore length error xref entry 0

View File

@ -232,7 +232,7 @@ foreach my $d (@bug_tests)
show_ntests();
# ----------
$td->notify("--- Miscellaneous Tests ---");
$n_tests += 86;
$n_tests += 87;
$td->runtest("qpdf version",
{$td->COMMAND => "qpdf --version"},
@ -669,6 +669,13 @@ $td->runtest("ignore bad token",
$td->EXIT_STATUS => 0},
$td->NORMALIZE_NEWLINES);
# Run "qpdf --check --show-xref" on xref-errors.pdf and require that
# its output matches xref-errors.out (which includes the "accepting
# invalid xref table entry" warnings) and that it exits with status 3.
$td->runtest("recoverable xref errors",
{$td->COMMAND =>
"qpdf --check --show-xref xref-errors.pdf"},
{$td->FILE => "xref-errors.out",
$td->EXIT_STATUS => 3},
$td->NORMALIZE_NEWLINES);
show_ntests();
# ----------
$td->notify("--- Single Page ---");

View File

@ -0,0 +1,15 @@
WARNING: xref-errors.pdf (xref table, file position 585): accepting invalid xref table entry
WARNING: xref-errors.pdf (xref table, file position 606): accepting invalid xref table entry
WARNING: xref-errors.pdf (xref table, file position 627): accepting invalid xref table entry
WARNING: xref-errors.pdf (xref table, file position 648): accepting invalid xref table entry
WARNING: xref-errors.pdf (xref table, file position 667): accepting invalid xref table entry
checking xref-errors.pdf
PDF Version: 1.3
File is not encrypted
File is not linearized
1/0: uncompressed; offset = 9
2/0: uncompressed; offset = 63
3/0: uncompressed; offset = 135
4/0: uncompressed; offset = 307
5/0: uncompressed; offset = 403
6/0: uncompressed; offset = 438

View File

@ -0,0 +1,79 @@
%PDF-1.3
1 0 obj
<<
/Type /Catalog
/Pages 2 0 R
>>
endobj
2 0 obj
<<
/Type /Pages
/Kids [
3 0 R
]
/Count 1
>>
endobj
3 0 obj
<<
/Type /Page
/Parent 2 0 R
/MediaBox [0 0 612 792]
/Contents 4 0 R
/Resources <<
/ProcSet 5 0 R
/Font <<
/F1 6 0 R
>>
>>
>>
endobj
4 0 obj
<<
/Length 44
>>
stream
BT
/F1 24 Tf
72 720 Td
(Potato) Tj
ET
endstream
endobj
5 0 obj
[
/PDF
/Text
]
endobj
6 0 obj
<<
/Type /Font
/Subtype /Type1
/Name /F1
/BaseFont /Helvetica
/Encoding /WinAnsiEncoding
>>
endobj
xref
0 7
0000000000 65535 f
0000000009 00000 n
0000000063 00000 n
0000000135 00000 n
000000307 00000 n
0000000403 0000 n
0000000438 00000 n
trailer <<
/Size 7
/Root 1 0 R
>>
startxref
556
%%EOF