From aeb892f99bad9f6c24aef94a2d93d573c6de0382 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Sat, 30 Apr 2011 21:46:09 +0000 Subject: [PATCH] accept stream keyword with CR only git-svn-id: svn+q:///qpdf/trunk@1052 71b93d88-0707-0410-a8cf-f5a4172ac649 --- ChangeLog | 5 + libqpdf/QPDF.cc | 68 +++++++++--- manual/qpdf-manual.xml | 6 ++ qpdf/qpdf.testcov | 4 + qpdf/qtest/qpdf.test | 13 ++- qpdf/qtest/qpdf/stream-line-enders.out | 3 + qpdf/qtest/qpdf/stream-line-enders.pdf | 50 +++++++++ qpdf/qtest/qpdf/stream-line-enders.qdf | 137 +++++++++++++++++++++++++ 8 files changed, 272 insertions(+), 14 deletions(-) create mode 100644 qpdf/qtest/qpdf/stream-line-enders.out create mode 100644 qpdf/qtest/qpdf/stream-line-enders.pdf create mode 100644 qpdf/qtest/qpdf/stream-line-enders.qdf diff --git a/ChangeLog b/ChangeLog index 01e43879..a81124bf 100644 --- a/ChangeLog +++ b/ChangeLog @@ -2,6 +2,11 @@ * 2.2.3: release + * libqpdf/QPDF.cc (readObjectInternal): Accept the case of the + stream keyword being followed by carriage return by itself. While + this is not permitted by the specification, there are PDF files + that do this, and other readers can read them. + * libqpdf/Pl_QPDFTokenizer.cc (processChar): When an inline image is detected, suspend normalization only up to the end of the inline image rather than for the remainder of the content stream. diff --git a/libqpdf/QPDF.cc b/libqpdf/QPDF.cc index c8146eff..f6157287 100644 --- a/libqpdf/QPDF.cc +++ b/libqpdf/QPDF.cc @@ -1331,24 +1331,66 @@ QPDF::readObjectInternal(PointerHolder input, if (readToken(input) == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "stream")) { - // Kill to next actual newline. Do not use readLine() - // here -- streams are a special case. The next - // single newline character marks the end of the - // stream token. It is incorrect to strip subsequent - // carriage returns or newlines as they may be part of - // the stream. 
+ // The PDF specification states that the word "stream" + // should be followed by either a carriage return and + // a newline or by a newline alone. It specifically + // disallows following it by a carriage return alone + // since, in that case, there would be no way to tell + // whether the NL in a CR NL sequence was part of the + // stream data. However, some readers, including + // Adobe Reader, accept a carriage return by itself + // when followed by a non-newline character, so that's + // what we do here. { char ch; - do + if (input->read(&ch, 1) == 0) { - if (input->read(&ch, 1) == 0) + // A premature EOF here will result in some + // other problem that will get reported at + // another time. + } + else if (ch == '\n') + { + // ready to read stream data + QTC::TC("qpdf", "QPDF stream with NL only"); + } + else if (ch == '\r') + { + // Read another character + if (input->read(&ch, 1) != 0) { - // A premature EOF here will result in - // some other problem that will get - // reported at another time. - ch = '\n'; + if (ch == '\n') + { + // Ready to read stream data + QTC::TC("qpdf", "QPDF stream with CRNL"); + } + else + { + // Treat the \r by itself as the + // whitespace after stream and + // start reading stream data in spite + // of not having seen a newline. 
+ QTC::TC("qpdf", "QPDF stream with CR only"); + input->unreadCh(ch); + warn(QPDFExc( + qpdf_e_damaged_pdf, + input->getName(), + this->last_object_description, + input->tell(), + "stream keyword followed" + " by carriage return only")); + } } - } while (ch != '\n'); + } + else + { + QTC::TC("qpdf", "QPDF stream without newline"); + warn(QPDFExc(qpdf_e_damaged_pdf, input->getName(), + this->last_object_description, + input->tell(), + "stream keyword not followed" + " by proper line terminator")); + } } // Must get offset before accessing any additional diff --git a/manual/qpdf-manual.xml b/manual/qpdf-manual.xml index 9bdb95b3..02b69a1d 100644 --- a/manual/qpdf-manual.xml +++ b/manual/qpdf-manual.xml @@ -2078,6 +2078,12 @@ print "\n"; 2.2.3: April 30, 2011 + + + Handle some damaged streams with incorrect characters + following the stream keyword. + + Improve handling of inline images when normalizing content diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov index 51a6f0e8..4e7e292a 100644 --- a/qpdf/qpdf.testcov +++ b/qpdf/qpdf.testcov @@ -188,3 +188,7 @@ QPDF_Stream getStreamData 0 QPDF_Stream expand filter abbreviation 0 qpdf-c called qpdf_read_memory 0 Pl_QPDFTokenizer found EI 0 +QPDF stream without newline 0 +QPDF stream with CR only 0 +QPDF stream with CRNL 0 +QPDF stream with NL only 0 diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test index 62eed000..75a92aa4 100644 --- a/qpdf/qtest/qpdf.test +++ b/qpdf/qtest/qpdf.test @@ -111,7 +111,7 @@ $td->runtest("new stream", show_ntests(); # ---------- $td->notify("--- Miscellaneous Tests ---"); -$n_tests += 29; +$n_tests += 31; $td->runtest("qpdf version", {$td->COMMAND => "qpdf --version"}, @@ -265,6 +265,17 @@ $td->runtest("error/output redirection to strings", $td->EXIT_STATUS => 0}, $td->NORMALIZE_NEWLINES); +$td->runtest("odd terminators for stream keyword", + {$td->COMMAND => + "qpdf --qdf --static-id" . 
+ " stream-line-enders.pdf a.qdf"}, + {$td->FILE => "stream-line-enders.out", + $td->EXIT_STATUS => 3}, + $td->NORMALIZE_NEWLINES); +$td->runtest("check output", + {$td->FILE => "a.qdf"}, + {$td->FILE => "stream-line-enders.qdf"}); + show_ntests(); # ---------- $td->notify("--- Error Condition Tests ---"); diff --git a/qpdf/qtest/qpdf/stream-line-enders.out b/qpdf/qtest/qpdf/stream-line-enders.out new file mode 100644 index 00000000..b7a7513c --- /dev/null +++ b/qpdf/qtest/qpdf/stream-line-enders.out @@ -0,0 +1,3 @@ +WARNING: stream-line-enders.pdf (object 5 0, file position 378): stream keyword followed by carriage return only +WARNING: stream-line-enders.pdf (object 6 0, file position 437): stream keyword not followed by proper line terminator +qpdf: operation succeeded with warnings; resulting file may have some problems diff --git a/qpdf/qtest/qpdf/stream-line-enders.pdf b/qpdf/qtest/qpdf/stream-line-enders.pdf new file mode 100644 index 00000000..e6238889 --- /dev/null +++ b/qpdf/qtest/qpdf/stream-line-enders.pdf @@ -0,0 +1,50 @@ +%PDF-1.3 +%¿÷¢þ +1 0 obj +<< /Pages 2 0 R /Type /Catalog >> +endobj +2 0 obj +<< /Count 1 /Kids [ 3 0 R ] /Type /Pages >> +endobj +3 0 obj +<< /Contents [ 4 0 R 5 0 R 6 0 R ] /MediaBox [ 0 0 612 792 ] /Parent 2 0 R /Resources << /Font << /F1 7 0 R >> /ProcSet 8 0 R >> /Type /Page >> +endobj +4 0 obj +<< /Length 14 >> +stream +BT + /F1 24 Tf +endstream +endobj +5 0 obj +<< /Length 10 >> +stream 72 720 Td +endstream +endobj +6 0 obj +<< /Length 15 >> +stream (Potato) Tj +ET +endstream +endobj +7 0 obj +<< /BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font >> +endobj +8 0 obj +[ /PDF /Text ] +endobj +xref +0 9 +0000000000 65535 f +0000000015 00000 n +0000000064 00000 n +0000000123 00000 n +0000000282 00000 n +0000000346 00000 n +0000000405 00000 n +0000000469 00000 n +0000000576 00000 n +trailer << /Root 1 0 R /Size 9 /ID [<08aa98c73f8a7262d77c8328772c3989><7b1f32865e2165debe277f27ee790092>] >> 
+startxref +606 +%%EOF diff --git a/qpdf/qtest/qpdf/stream-line-enders.qdf b/qpdf/qtest/qpdf/stream-line-enders.qdf new file mode 100644 index 00000000..33536028 --- /dev/null +++ b/qpdf/qtest/qpdf/stream-line-enders.qdf @@ -0,0 +1,137 @@ +%PDF-1.3 +%¿÷¢þ +%QDF-1.0 + +%% Original object ID: 1 0 +1 0 obj +<< + /Pages 2 0 R + /Type /Catalog +>> +endobj + +%% Original object ID: 2 0 +2 0 obj +<< + /Count 1 + /Kids [ + 3 0 R + ] + /Type /Pages +>> +endobj + +%% Page 1 +%% Original object ID: 3 0 +3 0 obj +<< + /Contents [ + 4 0 R + 6 0 R + 8 0 R + ] + /MediaBox [ + 0 + 0 + 612 + 792 + ] + /Parent 2 0 R + /Resources << + /Font << + /F1 10 0 R + >> + /ProcSet 11 0 R + >> + /Type /Page +>> +endobj + +%% Contents for page 1 +%% Original object ID: 4 0 +4 0 obj +<< + /Length 5 0 R +>> +stream +BT + /F1 24 Tf +endstream +endobj + +5 0 obj +14 +endobj + +%% Contents for page 1 +%% Original object ID: 5 0 +6 0 obj +<< + /Length 7 0 R +>> +stream +72 720 Td +endstream +endobj + +7 0 obj +10 +endobj + +%% Contents for page 1 +%% Original object ID: 6 0 +8 0 obj +<< + /Length 9 0 R +>> +stream +(Potato) Tj +ET +endstream +endobj + +9 0 obj +15 +endobj + +%% Original object ID: 7 0 +10 0 obj +<< + /BaseFont /Helvetica + /Encoding /WinAnsiEncoding + /Name /F1 + /Subtype /Type1 + /Type /Font +>> +endobj + +%% Original object ID: 8 0 +11 0 obj +[ + /PDF + /Text +] +endobj + +xref +0 12 +0000000000 65535 f +0000000052 00000 n +0000000133 00000 n +0000000242 00000 n +0000000516 00000 n +0000000585 00000 n +0000000654 00000 n +0000000719 00000 n +0000000788 00000 n +0000000858 00000 n +0000000904 00000 n +0000001050 00000 n +trailer << + /Root 1 0 R + /Size 12 + /ID [<08aa98c73f8a7262d77c8328772c3989><31415926535897932384626433832795>] +>> +startxref +1086 +%%EOF