2
1
mirror of https://github.com/qpdf/qpdf.git synced 2025-01-03 15:17:29 +00:00

Find startxref without PCRE

This commit is contained in:
Jay Berkenbilt 2017-08-05 14:54:07 -04:00
parent 1765c6ec20
commit 03aa9679ac
5 changed files with 29 additions and 34 deletions

View File

@ -1029,6 +1029,7 @@ class QPDF
// Methods to support pattern finding
bool findHeader();
bool findStartxref();
// methods to support linearization checking -- implemented in
// QPDF_linearization.cc

View File

@ -254,11 +254,26 @@ QPDF::findHeader()
return valid;
}
bool
QPDF::findStartxref()
{
QPDFTokenizer::Token t = readToken(this->file, true);
if (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "startxref"))
{
t = readToken(this->file, true);
if (t.getType() == QPDFTokenizer::tt_integer)
{
// Position in front of offset token
this->file->seek(this->file->getLastOffset(), SEEK_SET);
return true;
}
}
return false;
}
void
QPDF::parse(char const* password)
{
PCRE eof_re("(?s:startxref\\s+(\\d+)\\s+%%EOF\\b)");
if (password)
{
this->provided_password = password;
@ -283,47 +298,25 @@ QPDF::parse(char const* password)
// PDF spec says %%EOF must be found within the last 1024 bytes of
// the file. We add an extra 30 characters to leave room for the
// startxref stuff.
static int const tbuf_size = 1054;
this->file->seek(0, SEEK_END);
if (this->file->tell() > tbuf_size)
qpdf_offset_t end_offset = this->file->tell();
qpdf_offset_t start_offset = (end_offset > 1054 ? end_offset - 1054 : 0);
PatternFinder sf(*this, &QPDF::findStartxref);
qpdf_offset_t xref_offset = 0;
if (this->file->findLast("startxref", start_offset, 0, sf))
{
this->file->seek(-tbuf_size, SEEK_END);
}
else
{
this->file->rewind();
}
char* buf = new char[tbuf_size + 1];
// Put buf in an array-style PointerHolder to guarantee deletion
// of buf.
PointerHolder<char> b(true, buf);
memset(buf, '\0', tbuf_size + 1);
this->file->read(buf, tbuf_size);
// Since buf may contain null characters, we can't do a regexp
// search on buf directly. Find the last occurrence within buf
// where the regexp matches.
char* p = buf;
char const* candidate = "";
while ((p = static_cast<char*>(memchr(p, 's', tbuf_size - (p - buf)))) != 0)
{
if (eof_re.match(p))
{
candidate = p;
}
++p;
xref_offset = QUtil::string_to_ll(
readToken(this->file).getValue().c_str());
}
try
{
PCRE::Match m2 = eof_re.match(candidate);
if (! m2)
if (xref_offset == 0)
{
QTC::TC("qpdf", "QPDF can't find startxref");
throw QPDFExc(qpdf_e_damaged_pdf, this->file->getName(), "", 0,
"can't find startxref");
}
qpdf_offset_t xref_offset = QUtil::string_to_ll(m2.getMatch(1).c_str());
read_xref(xref_offset);
}
catch (QPDFExc& e)

View File

@ -521,7 +521,7 @@ QPDFTokenizer::readToken(PointerHolder<InputSource> input,
{
if (allow_bad)
{
// QTC::TC("qpdf", "QPDFTokenizer allowing bad token");
QTC::TC("qpdf", "QPDFTokenizer allowing bad token");
}
else
{

View File

@ -290,3 +290,4 @@ qpdf read args from file 0
qpdf single-pages %d 0
qpdf single-pages .pdf 0
qpdf single-pages other 0
QPDFTokenizer allowing bad token 0

View File

@ -1,5 +1,5 @@
WARNING: issue-117.pdf: file is damaged
WARNING: issue-117.pdf: can't find startxref
WARNING: issue-117.pdf (file position 3526): xref not found
WARNING: issue-117.pdf: Attempting to reconstruct cross-reference table
WARNING: issue-117.pdf (file position 66): loop detected resolving object 2 0
WARNING: issue-117.pdf (object 2 0, file position 22): /Length key in stream dictionary is not an integer