diff --git a/ChangeLog b/ChangeLog index e04bdd8d..6b7454eb 100644 --- a/ChangeLog +++ b/ChangeLog @@ -11,6 +11,12 @@ the (bool, T*) version of the constructor instead. If not, just remove the second parameter. +2017-08-09 Jay Berkenbilt + + * When recovering stream length, find endobj without endstream as + well as just looking for endstream. Be a little more lax about + where we allow it to be found. + 2017-08-05 Jay Berkenbilt * Add --single-pages option to cause output to be written to a diff --git a/include/qpdf/QPDF.hh b/include/qpdf/QPDF.hh index 68525eba..92a66a34 100644 --- a/include/qpdf/QPDF.hh +++ b/include/qpdf/QPDF.hh @@ -1030,6 +1030,7 @@ class QPDF // Methods to support pattern finding bool findHeader(); bool findStartxref(); + bool findEndstream(); // methods to support linearization checking -- implemented in // QPDF_linearization.cc diff --git a/libqpdf/QPDF.cc b/libqpdf/QPDF.cc index fc3120bb..4cda1545 100644 --- a/libqpdf/QPDF.cc +++ b/libqpdf/QPDF.cc @@ -1231,76 +1231,43 @@ QPDF::readObject(PointerHolder input, return object; } +bool +QPDF::findEndstream() +{ + // Find endstream or endobj. Position the input at that token. Returns true only when the next token read from this->file is the word endobj or endstream, after seeking back to that token's getLastOffset(); for any other token returns false without seeking back. 
+ QPDFTokenizer::Token t = readToken(this->file, true); + if ((t.getType() == QPDFTokenizer::tt_word) && + ((t.getValue() == "endobj") || + (t.getValue() == "endstream"))) + { + this->file->seek(this->file->getLastOffset(), SEEK_SET); + return true; + } + return false; +} + size_t QPDF::recoverStreamLength(PointerHolder input, int objid, int generation, qpdf_offset_t stream_offset) { - PCRE endobj_re("^\\s*endobj\\b"); - // Try to reconstruct stream length by looking for - // endstream(\r\n?|\n)endobj + // endstream or endobj warn(QPDFExc(qpdf_e_damaged_pdf, input->getName(), this->last_object_description, stream_offset, "attempting to recover stream length")); - input->seek(0, SEEK_END); - qpdf_offset_t eof = input->tell(); - input->seek(stream_offset, SEEK_SET); - qpdf_offset_t last_line_offset = 0; + PatternFinder ef(*this, &QPDF::findEndstream); size_t length = 0; - static int const line_end_length = 12; // room for endstream\r\n\0 - char last_line_end[line_end_length]; - while (input->tell() < eof) + if (this->file->findFirst("end", stream_offset, 0, ef)) { - std::string line = input->readLine(50); - qpdf_offset_t line_offset = input->getLastOffset(); - if (endobj_re.match(line.c_str())) + length = this->file->tell() - stream_offset; + // Reread endstream but, if it was endobj, don't skip that. + QPDFTokenizer::Token t = readToken(this->file); + if (t.getValue() == "endobj") { - qpdf_offset_t endstream_offset = 0; - if (last_line_offset >= line_end_length) - { - qpdf_offset_t cur_offset = input->tell(); - // Read from the end of the last line, guaranteeing - // null termination - qpdf_offset_t search_offset = - line_offset - (line_end_length - 1); - input->seek(search_offset, SEEK_SET); - memset(last_line_end, '\0', line_end_length); - input->read(last_line_end, line_end_length - 1); - input->seek(cur_offset, SEEK_SET); - // if endstream[\r\n] will fit in last_line_end, the - // 'e' has to be in one of the first three spots. 
- // Check explicitly rather than using strstr directly - // in case there are nulls right before endstream. - char* p = ((last_line_end[0] == 'e') ? last_line_end : - (last_line_end[1] == 'e') ? last_line_end + 1 : - (last_line_end[2] == 'e') ? last_line_end + 2 : - 0); - char* endstream_p = 0; - if (p) - { - char* p1 = strstr(p, "endstream\n"); - char* p2 = strstr(p, "endstream\r"); - endstream_p = (p1 ? p1 : p2); - } - if (endstream_p) - { - endstream_offset = - search_offset + (endstream_p - last_line_end); - } - } - if (endstream_offset > 0) - { - // Stream probably ends right before "endstream" - length = endstream_offset - stream_offset; - // Go back to where we would have been if we had just - // read the endstream. - input->seek(line_offset, SEEK_SET); - break; - } - } - last_line_offset = line_offset; + this->file->seek(this->file->getLastOffset(), SEEK_SET); + } } if (length) diff --git a/qpdf/qtest/qpdf/bad24-recover.out b/qpdf/qtest/qpdf/bad24-recover.out index d1bbe736..8d7fd87c 100644 --- a/qpdf/qtest/qpdf/bad24-recover.out +++ b/qpdf/qtest/qpdf/bad24-recover.out @@ -1,10 +1,25 @@ WARNING: bad24.pdf (object 4 0, file position 385): expected endstream WARNING: bad24.pdf (object 4 0, file position 341): attempting to recover stream length -WARNING: bad24.pdf (object 4 0, file position 341): unable to recover stream data; treating stream as empty -WARNING: bad24.pdf (object 4 0, file position 778): EOF while reading token -/QTest is implicit -/QTest is indirect and has type null (2) -/QTest is null +WARNING: bad24.pdf (object 4 0, file position 341): recovered stream length: 54 +/QTest is indirect and has type stream (10) +/QTest is a stream. 
Dictionary: << /Length 44 >> +Raw stream data: +BT + /F1 24 Tf + 72 720 Td + (Potato) Tj +ET +enxstream + +Uncompressed stream data: +BT + /F1 24 Tf + 72 720 Td + (Potato) Tj +ET +enxstream + +End of stream data unparse: 4 0 R -unparseResolved: null +unparseResolved: 4 0 R test 1 done diff --git a/qpdf/qtest/qpdf/issue-101.out b/qpdf/qtest/qpdf/issue-101.out index 7010cdda..bd457c3a 100644 --- a/qpdf/qtest/qpdf/issue-101.out +++ b/qpdf/qtest/qpdf/issue-101.out @@ -5,10 +5,16 @@ WARNING: issue-101.pdf (file position 1242): expected dictionary key but found n WARNING: issue-101.pdf (file position 1242): dictionary ended prematurely; using null as value for last key WARNING: issue-101.pdf (object 5 0, file position 1438): /Length key in stream dictionary is not an integer WARNING: issue-101.pdf (object 5 0, file position 1509): attempting to recover stream length -WARNING: issue-101.pdf (object 5 0, file position 1509): recovered stream length: 205 +WARNING: issue-101.pdf (object 5 0, file position 1509): recovered stream length: 8 +WARNING: issue-101.pdf (trailer, file position 1631): /Length key in stream dictionary is not an integer +WARNING: issue-101.pdf (trailer, file position 1702): attempting to recover stream length +WARNING: issue-101.pdf (trailer, file position 1702): recovered stream length: 12 WARNING: issue-101.pdf (trailer, file position 2026): /Length key in stream dictionary is not an integer WARNING: issue-101.pdf (trailer, file position 2097): attempting to recover stream length -WARNING: issue-101.pdf (trailer, file position 2097): recovered stream length: 709 +WARNING: issue-101.pdf (trailer, file position 2097): recovered stream length: 12 +WARNING: issue-101.pdf (trailer, file position 2613): /Length key in stream dictionary is not an integer +WARNING: issue-101.pdf (trailer, file position 2684): attempting to recover stream length +WARNING: issue-101.pdf (trailer, file position 2684): recovered stream length: 74 WARNING: issue-101.pdf (trailer, 
file position 2928): unknown token while reading object; treating as string WARNING: issue-101.pdf (trailer, file position 2929): unknown token while reading object; treating as string WARNING: issue-101.pdf (trailer, file position 2928): expected dictionary key but found non-name object; inserting key /QPDFFake1 @@ -22,8 +28,32 @@ WARNING: issue-101.pdf (trailer, file position 3410): attempting to recover stre WARNING: issue-101.pdf (trailer, file position 3410): recovered stream length: 12 WARNING: issue-101.pdf (trailer, file position 3560): /Length key in stream dictionary is not an integer WARNING: issue-101.pdf (trailer, file position 3631): attempting to recover stream length -WARNING: issue-101.pdf (trailer, file position 3631): recovered stream length: 167 +WARNING: issue-101.pdf (trailer, file position 3631): recovered stream length: 8 WARNING: issue-101.pdf (trailer, file position 4113): /Length key in stream dictionary is not an integer WARNING: issue-101.pdf (trailer, file position 4184): attempting to recover stream length -WARNING: issue-101.pdf (trailer, file position 4184): unable to recover stream data; treating stream as empty -issue-101.pdf: unable to find trailer dictionary while recovering damaged file +WARNING: issue-101.pdf (trailer, file position 4184): recovered stream length: 8 +WARNING: issue-101.pdf (file position 591): unknown token while reading object; treating as string +WARNING: issue-101.pdf (file position 625): treating unexpected brace token as null +WARNING: issue-101.pdf (file position 626): unknown token while reading object; treating as string +WARNING: issue-101.pdf (file position 637): unknown token while reading object; treating as string +WARNING: issue-101.pdf (file position 639): unknown token while reading object; treating as string +WARNING: issue-101.pdf (file position 644): unknown token while reading object; treating as string +WARNING: issue-101.pdf (file position 647): unknown token while reading object; 
treating as string +WARNING: issue-101.pdf (file position 687): unknown token while reading object; treating as string +WARNING: issue-101.pdf (file position 691): unknown token while reading object; treating as string +WARNING: issue-101.pdf (file position 696): unknown token while reading object; treating as string +WARNING: issue-101.pdf (file position 698): unknown token while reading object; treating as string +WARNING: issue-101.pdf (file position 701): unknown token while reading object; treating as string +WARNING: issue-101.pdf (file position 711): unknown token while reading object; treating as string +WARNING: issue-101.pdf (file position 742): unknown token while reading object; treating as string +WARNING: issue-101.pdf (file position 745): unknown token while reading object; treating as string +WARNING: issue-101.pdf (file position 747): unknown token while reading object; treating as string +WARNING: issue-101.pdf (file position 777): unknown token while reading object; treating as string +WARNING: issue-101.pdf (file position 790): unknown token while reading object; treating as string +WARNING: issue-101.pdf (file position 800): treating unexpected brace token as null +WARNING: issue-101.pdf (file position 801): unknown token while reading object; treating as string +WARNING: issue-101.pdf (file position 811): unknown token while reading object; treating as string +WARNING: issue-101.pdf (file position 819): unknown token while reading object; treating as string +WARNING: issue-101.pdf (file position 832): unknown token while reading object; treating as string +WARNING: issue-101.pdf (file position 856): unexpected > +issue-101.pdf (file position 856): unable to find /Root dictionary