Improve stream length recovery

Eliminate PCRE and find endobj not preceded by endstream. Be more lax
about placement of endstream and endobj.
This commit is contained in:
Jay Berkenbilt 2017-08-09 21:14:48 -04:00
parent 3082e4e606
commit ca5b1d267a
5 changed files with 87 additions and 68 deletions

View File

@ -11,6 +11,12 @@
the (bool, T*) version of the constructor instead. If not, just
remove the second parameter.
2017-08-09 Jay Berkenbilt <ejb@ql.org>
* When recovering stream length, find endobj without endstream as
well as just looking for endstream. Be a little more lax about
where we allow it to be found.
2017-08-05 Jay Berkenbilt <ejb@ql.org>
* Add --single-pages option to cause output to be written to a

View File

@ -1030,6 +1030,7 @@ class QPDF
// Methods to support pattern finding
bool findHeader();
bool findStartxref();
bool findEndstream();
// methods to support linearization checking -- implemented in
// QPDF_linearization.cc

View File

@ -1231,76 +1231,43 @@ QPDF::readObject(PointerHolder<InputSource> input,
return object;
}
bool
QPDF::findEndstream()
{
    // Pattern-finder callback (see recoverStreamLength, which passes
    // this method to a PatternFinder). Reads one token from
    // this->file and checks whether it is the word "endobj" or
    // "endstream".
    //
    // Returns true, with the input positioned at that token, when
    // the token matches; returns false otherwise.
    QPDFTokenizer::Token t = readToken(this->file, true);
    // The condition must directly govern the block below: a ';'
    // after the closing parenthesis would turn the "if" into an empty
    // statement and make every token count as a match.
    if ((t.getType() == QPDFTokenizer::tt_word) &&
        ((t.getValue() == "endobj") ||
         (t.getValue() == "endstream")))
    {
        // Position the input at the matched token.
        this->file->seek(this->file->getLastOffset(), SEEK_SET);
        return true;
    }
    return false;
}
size_t
QPDF::recoverStreamLength(PointerHolder<InputSource> input,
int objid, int generation,
qpdf_offset_t stream_offset)
{
PCRE endobj_re("^\\s*endobj\\b");
// Try to reconstruct stream length by looking for
// endstream(\r\n?|\n)endobj
// endstream or endobj
warn(QPDFExc(qpdf_e_damaged_pdf, input->getName(),
this->last_object_description, stream_offset,
"attempting to recover stream length"));
input->seek(0, SEEK_END);
qpdf_offset_t eof = input->tell();
input->seek(stream_offset, SEEK_SET);
qpdf_offset_t last_line_offset = 0;
PatternFinder ef(*this, &QPDF::findEndstream);
size_t length = 0;
static int const line_end_length = 12; // room for endstream\r\n\0
char last_line_end[line_end_length];
while (input->tell() < eof)
if (this->file->findFirst("end", stream_offset, 0, ef))
{
std::string line = input->readLine(50);
qpdf_offset_t line_offset = input->getLastOffset();
if (endobj_re.match(line.c_str()))
length = this->file->tell() - stream_offset;
// Reread endstream but, if it was endobj, don't skip that.
QPDFTokenizer::Token t = readToken(this->file);
if (t.getValue() == "endobj")
{
qpdf_offset_t endstream_offset = 0;
if (last_line_offset >= line_end_length)
{
qpdf_offset_t cur_offset = input->tell();
// Read from the end of the last line, guaranteeing
// null termination
qpdf_offset_t search_offset =
line_offset - (line_end_length - 1);
input->seek(search_offset, SEEK_SET);
memset(last_line_end, '\0', line_end_length);
input->read(last_line_end, line_end_length - 1);
input->seek(cur_offset, SEEK_SET);
// if endstream[\r\n] will fit in last_line_end, the
// 'e' has to be in one of the first three spots.
// Check explicitly rather than using strstr directly
// in case there are nulls right before endstream.
char* p = ((last_line_end[0] == 'e') ? last_line_end :
(last_line_end[1] == 'e') ? last_line_end + 1 :
(last_line_end[2] == 'e') ? last_line_end + 2 :
0);
char* endstream_p = 0;
if (p)
{
char* p1 = strstr(p, "endstream\n");
char* p2 = strstr(p, "endstream\r");
endstream_p = (p1 ? p1 : p2);
}
if (endstream_p)
{
endstream_offset =
search_offset + (endstream_p - last_line_end);
}
}
if (endstream_offset > 0)
{
// Stream probably ends right before "endstream"
length = endstream_offset - stream_offset;
// Go back to where we would have been if we had just
// read the endstream.
input->seek(line_offset, SEEK_SET);
break;
}
}
last_line_offset = line_offset;
this->file->seek(this->file->getLastOffset(), SEEK_SET);
}
}
if (length)

View File

@ -1,10 +1,25 @@
WARNING: bad24.pdf (object 4 0, file position 385): expected endstream
WARNING: bad24.pdf (object 4 0, file position 341): attempting to recover stream length
WARNING: bad24.pdf (object 4 0, file position 341): unable to recover stream data; treating stream as empty
WARNING: bad24.pdf (object 4 0, file position 778): EOF while reading token
/QTest is implicit
/QTest is indirect and has type null (2)
/QTest is null
WARNING: bad24.pdf (object 4 0, file position 341): recovered stream length: 54
/QTest is indirect and has type stream (10)
/QTest is a stream. Dictionary: << /Length 44 >>
Raw stream data:
BT
/F1 24 Tf
72 720 Td
(Potato) Tj
ET
enxstream
Uncompressed stream data:
BT
/F1 24 Tf
72 720 Td
(Potato) Tj
ET
enxstream
End of stream data
unparse: 4 0 R
unparseResolved: null
unparseResolved: 4 0 R
test 1 done

View File

@ -5,10 +5,16 @@ WARNING: issue-101.pdf (file position 1242): expected dictionary key but found n
WARNING: issue-101.pdf (file position 1242): dictionary ended prematurely; using null as value for last key
WARNING: issue-101.pdf (object 5 0, file position 1438): /Length key in stream dictionary is not an integer
WARNING: issue-101.pdf (object 5 0, file position 1509): attempting to recover stream length
WARNING: issue-101.pdf (object 5 0, file position 1509): recovered stream length: 205
WARNING: issue-101.pdf (object 5 0, file position 1509): recovered stream length: 8
WARNING: issue-101.pdf (trailer, file position 1631): /Length key in stream dictionary is not an integer
WARNING: issue-101.pdf (trailer, file position 1702): attempting to recover stream length
WARNING: issue-101.pdf (trailer, file position 1702): recovered stream length: 12
WARNING: issue-101.pdf (trailer, file position 2026): /Length key in stream dictionary is not an integer
WARNING: issue-101.pdf (trailer, file position 2097): attempting to recover stream length
WARNING: issue-101.pdf (trailer, file position 2097): recovered stream length: 709
WARNING: issue-101.pdf (trailer, file position 2097): recovered stream length: 12
WARNING: issue-101.pdf (trailer, file position 2613): /Length key in stream dictionary is not an integer
WARNING: issue-101.pdf (trailer, file position 2684): attempting to recover stream length
WARNING: issue-101.pdf (trailer, file position 2684): recovered stream length: 74
WARNING: issue-101.pdf (trailer, file position 2928): unknown token while reading object; treating as string
WARNING: issue-101.pdf (trailer, file position 2929): unknown token while reading object; treating as string
WARNING: issue-101.pdf (trailer, file position 2928): expected dictionary key but found non-name object; inserting key /QPDFFake1
@ -22,8 +28,32 @@ WARNING: issue-101.pdf (trailer, file position 3410): attempting to recover stre
WARNING: issue-101.pdf (trailer, file position 3410): recovered stream length: 12
WARNING: issue-101.pdf (trailer, file position 3560): /Length key in stream dictionary is not an integer
WARNING: issue-101.pdf (trailer, file position 3631): attempting to recover stream length
WARNING: issue-101.pdf (trailer, file position 3631): recovered stream length: 167
WARNING: issue-101.pdf (trailer, file position 3631): recovered stream length: 8
WARNING: issue-101.pdf (trailer, file position 4113): /Length key in stream dictionary is not an integer
WARNING: issue-101.pdf (trailer, file position 4184): attempting to recover stream length
WARNING: issue-101.pdf (trailer, file position 4184): unable to recover stream data; treating stream as empty
issue-101.pdf: unable to find trailer dictionary while recovering damaged file
WARNING: issue-101.pdf (trailer, file position 4184): recovered stream length: 8
WARNING: issue-101.pdf (file position 591): unknown token while reading object; treating as string
WARNING: issue-101.pdf (file position 625): treating unexpected brace token as null
WARNING: issue-101.pdf (file position 626): unknown token while reading object; treating as string
WARNING: issue-101.pdf (file position 637): unknown token while reading object; treating as string
WARNING: issue-101.pdf (file position 639): unknown token while reading object; treating as string
WARNING: issue-101.pdf (file position 644): unknown token while reading object; treating as string
WARNING: issue-101.pdf (file position 647): unknown token while reading object; treating as string
WARNING: issue-101.pdf (file position 687): unknown token while reading object; treating as string
WARNING: issue-101.pdf (file position 691): unknown token while reading object; treating as string
WARNING: issue-101.pdf (file position 696): unknown token while reading object; treating as string
WARNING: issue-101.pdf (file position 698): unknown token while reading object; treating as string
WARNING: issue-101.pdf (file position 701): unknown token while reading object; treating as string
WARNING: issue-101.pdf (file position 711): unknown token while reading object; treating as string
WARNING: issue-101.pdf (file position 742): unknown token while reading object; treating as string
WARNING: issue-101.pdf (file position 745): unknown token while reading object; treating as string
WARNING: issue-101.pdf (file position 747): unknown token while reading object; treating as string
WARNING: issue-101.pdf (file position 777): unknown token while reading object; treating as string
WARNING: issue-101.pdf (file position 790): unknown token while reading object; treating as string
WARNING: issue-101.pdf (file position 800): treating unexpected brace token as null
WARNING: issue-101.pdf (file position 801): unknown token while reading object; treating as string
WARNING: issue-101.pdf (file position 811): unknown token while reading object; treating as string
WARNING: issue-101.pdf (file position 819): unknown token while reading object; treating as string
WARNING: issue-101.pdf (file position 832): unknown token while reading object; treating as string
WARNING: issue-101.pdf (file position 856): unexpected >
issue-101.pdf (file position 856): unable to find /Root dictionary