mirror of
https://github.com/qpdf/qpdf.git
synced 2025-01-31 02:48:31 +00:00
Improve stream length recovery
Eliminate PCRE and find endobj not preceded by endstream. Be more lax about placement of endstream and endobj.
This commit is contained in:
parent
3082e4e606
commit
ca5b1d267a
@ -11,6 +11,12 @@
|
||||
the (bool, T*) version of the constructor instead. If not, just
|
||||
remove the second parameter.
|
||||
|
||||
2017-08-09 Jay Berkenbilt <ejb@ql.org>
|
||||
|
||||
* When recovering stream length, find endobj without endstream as
|
||||
well as just looking for endstream. Be a little more lax about
|
||||
where we allow it to be found.
|
||||
|
||||
2017-08-05 Jay Berkenbilt <ejb@ql.org>
|
||||
|
||||
* Add --single-pages option to cause output to be written to a
|
||||
|
@ -1030,6 +1030,7 @@ class QPDF
|
||||
// Methods to support pattern finding
|
||||
bool findHeader();
|
||||
bool findStartxref();
|
||||
bool findEndstream();
|
||||
|
||||
// methods to support linearization checking -- implemented in
|
||||
// QPDF_linearization.cc
|
||||
|
@ -1231,76 +1231,43 @@ QPDF::readObject(PointerHolder<InputSource> input,
|
||||
return object;
|
||||
}
|
||||
|
||||
bool
|
||||
QPDF::findEndstream()
|
||||
{
|
||||
// Find endstream or endobj. Position the input at that token.
|
||||
QPDFTokenizer::Token t = readToken(this->file, true);
|
||||
if ((t.getType() == QPDFTokenizer::tt_word) &&
|
||||
((t.getValue() == "endobj") ||
|
||||
(t.getValue() == "endstream")));
|
||||
{
|
||||
this->file->seek(this->file->getLastOffset(), SEEK_SET);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
size_t
|
||||
QPDF::recoverStreamLength(PointerHolder<InputSource> input,
|
||||
int objid, int generation,
|
||||
qpdf_offset_t stream_offset)
|
||||
{
|
||||
PCRE endobj_re("^\\s*endobj\\b");
|
||||
|
||||
// Try to reconstruct stream length by looking for
|
||||
// endstream(\r\n?|\n)endobj
|
||||
// endstream or endobj
|
||||
warn(QPDFExc(qpdf_e_damaged_pdf, input->getName(),
|
||||
this->last_object_description, stream_offset,
|
||||
"attempting to recover stream length"));
|
||||
|
||||
input->seek(0, SEEK_END);
|
||||
qpdf_offset_t eof = input->tell();
|
||||
input->seek(stream_offset, SEEK_SET);
|
||||
qpdf_offset_t last_line_offset = 0;
|
||||
PatternFinder ef(*this, &QPDF::findEndstream);
|
||||
size_t length = 0;
|
||||
static int const line_end_length = 12; // room for endstream\r\n\0
|
||||
char last_line_end[line_end_length];
|
||||
while (input->tell() < eof)
|
||||
if (this->file->findFirst("end", stream_offset, 0, ef))
|
||||
{
|
||||
std::string line = input->readLine(50);
|
||||
qpdf_offset_t line_offset = input->getLastOffset();
|
||||
if (endobj_re.match(line.c_str()))
|
||||
length = this->file->tell() - stream_offset;
|
||||
// Reread endstream but, if it was endobj, don't skip that.
|
||||
QPDFTokenizer::Token t = readToken(this->file);
|
||||
if (t.getValue() == "endobj")
|
||||
{
|
||||
qpdf_offset_t endstream_offset = 0;
|
||||
if (last_line_offset >= line_end_length)
|
||||
{
|
||||
qpdf_offset_t cur_offset = input->tell();
|
||||
// Read from the end of the last line, guaranteeing
|
||||
// null termination
|
||||
qpdf_offset_t search_offset =
|
||||
line_offset - (line_end_length - 1);
|
||||
input->seek(search_offset, SEEK_SET);
|
||||
memset(last_line_end, '\0', line_end_length);
|
||||
input->read(last_line_end, line_end_length - 1);
|
||||
input->seek(cur_offset, SEEK_SET);
|
||||
// if endstream[\r\n] will fit in last_line_end, the
|
||||
// 'e' has to be in one of the first three spots.
|
||||
// Check explicitly rather than using strstr directly
|
||||
// in case there are nulls right before endstream.
|
||||
char* p = ((last_line_end[0] == 'e') ? last_line_end :
|
||||
(last_line_end[1] == 'e') ? last_line_end + 1 :
|
||||
(last_line_end[2] == 'e') ? last_line_end + 2 :
|
||||
0);
|
||||
char* endstream_p = 0;
|
||||
if (p)
|
||||
{
|
||||
char* p1 = strstr(p, "endstream\n");
|
||||
char* p2 = strstr(p, "endstream\r");
|
||||
endstream_p = (p1 ? p1 : p2);
|
||||
}
|
||||
if (endstream_p)
|
||||
{
|
||||
endstream_offset =
|
||||
search_offset + (endstream_p - last_line_end);
|
||||
}
|
||||
}
|
||||
if (endstream_offset > 0)
|
||||
{
|
||||
// Stream probably ends right before "endstream"
|
||||
length = endstream_offset - stream_offset;
|
||||
// Go back to where we would have been if we had just
|
||||
// read the endstream.
|
||||
input->seek(line_offset, SEEK_SET);
|
||||
break;
|
||||
}
|
||||
}
|
||||
last_line_offset = line_offset;
|
||||
this->file->seek(this->file->getLastOffset(), SEEK_SET);
|
||||
}
|
||||
}
|
||||
|
||||
if (length)
|
||||
|
@ -1,10 +1,25 @@
|
||||
WARNING: bad24.pdf (object 4 0, file position 385): expected endstream
|
||||
WARNING: bad24.pdf (object 4 0, file position 341): attempting to recover stream length
|
||||
WARNING: bad24.pdf (object 4 0, file position 341): unable to recover stream data; treating stream as empty
|
||||
WARNING: bad24.pdf (object 4 0, file position 778): EOF while reading token
|
||||
/QTest is implicit
|
||||
/QTest is indirect and has type null (2)
|
||||
/QTest is null
|
||||
WARNING: bad24.pdf (object 4 0, file position 341): recovered stream length: 54
|
||||
/QTest is indirect and has type stream (10)
|
||||
/QTest is a stream. Dictionary: << /Length 44 >>
|
||||
Raw stream data:
|
||||
BT
|
||||
/F1 24 Tf
|
||||
72 720 Td
|
||||
(Potato) Tj
|
||||
ET
|
||||
enxstream
|
||||
|
||||
Uncompressed stream data:
|
||||
BT
|
||||
/F1 24 Tf
|
||||
72 720 Td
|
||||
(Potato) Tj
|
||||
ET
|
||||
enxstream
|
||||
|
||||
End of stream data
|
||||
unparse: 4 0 R
|
||||
unparseResolved: null
|
||||
unparseResolved: 4 0 R
|
||||
test 1 done
|
||||
|
@ -5,10 +5,16 @@ WARNING: issue-101.pdf (file position 1242): expected dictionary key but found n
|
||||
WARNING: issue-101.pdf (file position 1242): dictionary ended prematurely; using null as value for last key
|
||||
WARNING: issue-101.pdf (object 5 0, file position 1438): /Length key in stream dictionary is not an integer
|
||||
WARNING: issue-101.pdf (object 5 0, file position 1509): attempting to recover stream length
|
||||
WARNING: issue-101.pdf (object 5 0, file position 1509): recovered stream length: 205
|
||||
WARNING: issue-101.pdf (object 5 0, file position 1509): recovered stream length: 8
|
||||
WARNING: issue-101.pdf (trailer, file position 1631): /Length key in stream dictionary is not an integer
|
||||
WARNING: issue-101.pdf (trailer, file position 1702): attempting to recover stream length
|
||||
WARNING: issue-101.pdf (trailer, file position 1702): recovered stream length: 12
|
||||
WARNING: issue-101.pdf (trailer, file position 2026): /Length key in stream dictionary is not an integer
|
||||
WARNING: issue-101.pdf (trailer, file position 2097): attempting to recover stream length
|
||||
WARNING: issue-101.pdf (trailer, file position 2097): recovered stream length: 709
|
||||
WARNING: issue-101.pdf (trailer, file position 2097): recovered stream length: 12
|
||||
WARNING: issue-101.pdf (trailer, file position 2613): /Length key in stream dictionary is not an integer
|
||||
WARNING: issue-101.pdf (trailer, file position 2684): attempting to recover stream length
|
||||
WARNING: issue-101.pdf (trailer, file position 2684): recovered stream length: 74
|
||||
WARNING: issue-101.pdf (trailer, file position 2928): unknown token while reading object; treating as string
|
||||
WARNING: issue-101.pdf (trailer, file position 2929): unknown token while reading object; treating as string
|
||||
WARNING: issue-101.pdf (trailer, file position 2928): expected dictionary key but found non-name object; inserting key /QPDFFake1
|
||||
@ -22,8 +28,32 @@ WARNING: issue-101.pdf (trailer, file position 3410): attempting to recover stre
|
||||
WARNING: issue-101.pdf (trailer, file position 3410): recovered stream length: 12
|
||||
WARNING: issue-101.pdf (trailer, file position 3560): /Length key in stream dictionary is not an integer
|
||||
WARNING: issue-101.pdf (trailer, file position 3631): attempting to recover stream length
|
||||
WARNING: issue-101.pdf (trailer, file position 3631): recovered stream length: 167
|
||||
WARNING: issue-101.pdf (trailer, file position 3631): recovered stream length: 8
|
||||
WARNING: issue-101.pdf (trailer, file position 4113): /Length key in stream dictionary is not an integer
|
||||
WARNING: issue-101.pdf (trailer, file position 4184): attempting to recover stream length
|
||||
WARNING: issue-101.pdf (trailer, file position 4184): unable to recover stream data; treating stream as empty
|
||||
issue-101.pdf: unable to find trailer dictionary while recovering damaged file
|
||||
WARNING: issue-101.pdf (trailer, file position 4184): recovered stream length: 8
|
||||
WARNING: issue-101.pdf (file position 591): unknown token while reading object; treating as string
|
||||
WARNING: issue-101.pdf (file position 625): treating unexpected brace token as null
|
||||
WARNING: issue-101.pdf (file position 626): unknown token while reading object; treating as string
|
||||
WARNING: issue-101.pdf (file position 637): unknown token while reading object; treating as string
|
||||
WARNING: issue-101.pdf (file position 639): unknown token while reading object; treating as string
|
||||
WARNING: issue-101.pdf (file position 644): unknown token while reading object; treating as string
|
||||
WARNING: issue-101.pdf (file position 647): unknown token while reading object; treating as string
|
||||
WARNING: issue-101.pdf (file position 687): unknown token while reading object; treating as string
|
||||
WARNING: issue-101.pdf (file position 691): unknown token while reading object; treating as string
|
||||
WARNING: issue-101.pdf (file position 696): unknown token while reading object; treating as string
|
||||
WARNING: issue-101.pdf (file position 698): unknown token while reading object; treating as string
|
||||
WARNING: issue-101.pdf (file position 701): unknown token while reading object; treating as string
|
||||
WARNING: issue-101.pdf (file position 711): unknown token while reading object; treating as string
|
||||
WARNING: issue-101.pdf (file position 742): unknown token while reading object; treating as string
|
||||
WARNING: issue-101.pdf (file position 745): unknown token while reading object; treating as string
|
||||
WARNING: issue-101.pdf (file position 747): unknown token while reading object; treating as string
|
||||
WARNING: issue-101.pdf (file position 777): unknown token while reading object; treating as string
|
||||
WARNING: issue-101.pdf (file position 790): unknown token while reading object; treating as string
|
||||
WARNING: issue-101.pdf (file position 800): treating unexpected brace token as null
|
||||
WARNING: issue-101.pdf (file position 801): unknown token while reading object; treating as string
|
||||
WARNING: issue-101.pdf (file position 811): unknown token while reading object; treating as string
|
||||
WARNING: issue-101.pdf (file position 819): unknown token while reading object; treating as string
|
||||
WARNING: issue-101.pdf (file position 832): unknown token while reading object; treating as string
|
||||
WARNING: issue-101.pdf (file position 856): unexpected >
|
||||
issue-101.pdf (file position 856): unable to find /Root dictionary
|
||||
|
Loading…
x
Reference in New Issue
Block a user