Better handle split content streams (fixes #73)

When parsing content streams, allow content to be split arbitrarily
across stream boundaries.
This commit is contained in:
Jay Berkenbilt 2017-07-29 10:40:31 -04:00
parent a136824243
commit b389268f16
11 changed files with 8793 additions and 20 deletions

View File

@ -1,3 +1,8 @@
2017-07-29 Jay Berkenbilt <ejb@ql.org>
* Fix content stream parsing to handle structures that are split
across content stream boundaries. Fixes #73.
2017-07-28 Jay Berkenbilt <ejb@ql.org>
* Add --preserve-unreferenced command-line option and

View File

@ -623,7 +623,9 @@ class QPDFObjectHandle
bool in_array, bool in_dictionary,
bool content_stream);
static void parseContentStream_internal(
QPDFObjectHandle stream, ParserCallbacks* callbacks);
PointerHolder<Buffer> stream_data,
std::string const& description,
ParserCallbacks* callbacks);
// Other methods
static void warn(QPDF*, QPDFExc const&);

View File

@ -13,6 +13,7 @@
#include <qpdf/QPDF_Dictionary.hh>
#include <qpdf/QPDF_Stream.hh>
#include <qpdf/QPDF_Reserved.hh>
#include <qpdf/Pl_Buffer.hh>
#include <qpdf/BufferInputSource.hh>
#include <qpdf/QPDFExc.hh>
@ -739,37 +740,63 @@ QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array,
{
streams.push_back(stream_or_array);
}
Pl_Buffer buf("concatenated stream data buffer");
std::string all_description = "content stream objects";
bool first = true;
for (std::vector<QPDFObjectHandle>::iterator iter = streams.begin();
iter != streams.end(); ++iter)
{
QPDFObjectHandle stream = *iter;
if (! stream.isStream())
{
throw std::logic_error(
"QPDFObjectHandle: parseContentStream called on non-stream");
QTC::TC("qpdf", "QPDFObjectHandle non-stream in parsecontent");
warn(stream.getOwningQPDF(),
QPDFExc(qpdf_e_damaged_pdf, "content stream",
"", 0,
"ignoring non-stream while parsing content streams"));
}
try
else
{
parseContentStream_internal(stream, callbacks);
}
catch (TerminateParsing&)
{
return;
std::string og = QUtil::int_to_string(stream.getObjectID()) + " " +
QUtil::int_to_string(stream.getGeneration());
std::string description = "content stream object " + og;
if (first)
{
first = false;
}
else
{
all_description += ",";
}
all_description += " " + og;
if (! stream.pipeStreamData(&buf, true, false, false, false))
{
QTC::TC("qpdf", "QPDFObjectHandle errors in parsecontent");
warn(stream.getOwningQPDF(),
QPDFExc(qpdf_e_damaged_pdf, "content stream",
description, 0,
"errors while decoding content stream"));
}
}
}
PointerHolder<Buffer> stream_data = buf.getBuffer();
try
{
parseContentStream_internal(stream_data, all_description, callbacks);
}
catch (TerminateParsing&)
{
return;
}
callbacks->handleEOF();
}
void
QPDFObjectHandle::parseContentStream_internal(QPDFObjectHandle stream,
QPDFObjectHandle::parseContentStream_internal(PointerHolder<Buffer> stream_data,
std::string const& description,
ParserCallbacks* callbacks)
{
stream.assertStream();
PointerHolder<Buffer> stream_data = stream.getStreamData();
size_t length = stream_data->getSize();
std::string description = "content stream object " +
QUtil::int_to_string(stream.getObjectID()) + " " +
QUtil::int_to_string(stream.getGeneration());
PointerHolder<InputSource> input =
new BufferInputSource(description, stream_data.getPointer());
QPDFTokenizer tokenizer;

View File

@ -281,3 +281,5 @@ QPDFObjectHandle no val for last key 0
QPDF resolve failure to null 0
QPDFWriter precheck stream 0
QPDFWriter preserve unreferenced standard 0
QPDFObjectHandle non-stream in parsecontent 0
QPDFObjectHandle errors in parsecontent 0

View File

@ -206,7 +206,7 @@ $td->runtest("remove page we don't have",
show_ntests();
# ----------
$td->notify("--- Miscellaneous Tests ---");
$n_tests += 86;
$n_tests += 88;
$td->runtest("qpdf version",
{$td->COMMAND => "qpdf --version"},
@ -604,6 +604,20 @@ $td->runtest("no trailing space in xref table",
{$td->FILE => "no-space-in-xref.out", $td->EXIT_STATUS => 0},
$td->NORMALIZE_NEWLINES);
# An array is split across multiple content streams starting at
# object 42. This was reported in GitHub issue 73. The file is
# modified from that example.
$td->runtest("parse split content stream",
{$td->COMMAND => "qpdf --check split-content-stream.pdf"},
{$td->FILE => "split-content-stream.out", $td->EXIT_STATUS => 0},
$td->NORMALIZE_NEWLINES);
# The page's /Contents array in this file lists one object that is
# not a stream and one stream whose data does not decode with its
# declared LZWDecode filter. The expected output has a warning for
# each, and the expected exit status is 3.
$td->runtest("split content stream errors",
{$td->COMMAND => "qpdf --check split-content-stream-errors.pdf"},
{$td->FILE => "split-content-stream-errors.out",
$td->EXIT_STATUS => 3},
$td->NORMALIZE_NEWLINES);
show_ntests();
# ----------
$td->notify("--- Numeric range parsing tests ---");

View File

@ -2,6 +2,6 @@ checking content-stream-errors.pdf
PDF Version: 1.3
File is not encrypted
File is not linearized
page 1: content stream object 7 0 (content, file position 52): parse error while reading object
page 3: content stream object 15 0 (stream data, file position 117): EOF found while reading inline image
page 4: content stream object 19 0 (content, file position 53): parse error while reading object
page 1: content stream objects 7 0 (content, file position 52): parse error while reading object
page 3: content stream objects 15 0 (stream data, file position 117): EOF found while reading inline image
page 4: content stream objects 19 0 (content, file position 53): parse error while reading object

View File

@ -22,4 +22,4 @@ name: /Fl
name: /DP
dictionary: << /Columns 1 /Predictor 15 >>
operator: ID
content stream object 4 0 (stream data, file position 139): EOF found while reading inline image
content stream objects 4 0 (stream data, file position 139): EOF found while reading inline image

View File

@ -0,0 +1,11 @@
WARNING: split-content-stream-errors.pdf: file is damaged
WARNING: split-content-stream-errors.pdf (file position 802): xref not found
WARNING: split-content-stream-errors.pdf: Attempting to reconstruct cross-reference table
checking split-content-stream-errors.pdf
PDF Version: 1.3
File is not encrypted
File is not linearized
WARNING: split-content-stream-errors.pdf (file position 557): error decoding stream data for object 6 0: LZWDecoder: bad code received
WARNING: content stream: ignoring non-stream while parsing content streams
WARNING: split-content-stream-errors.pdf (file position 557): error decoding stream data for object 6 0: LZWDecoder: bad code received
WARNING: content stream (content stream object 6 0): errors while decoding content stream

View File

@ -0,0 +1,113 @@
%PDF-1.3
%¿÷¢þ
%QDF-1.0
1 0 obj
<<
/Pages 2 0 R
/Type /Catalog
>>
endobj
2 0 obj
<<
/Count 1
/Kids [
3 0 R
]
/Type /Pages
>>
endobj
%% Page 1
3 0 obj
<<
/Contents [
4 0 R
6 0 R
]
/MediaBox [
0
0
612
792
]
/Parent 2 0 R
/Resources <<
/Font <<
/F1 8 0 R
>>
/ProcSet 9 0 R
>>
/Type /Page
>>
endobj
%% Contents for page 1
4 0 obj
<<
/Length 5 0 R
/Oops (Not a stream)
>>
endobj
5 0 obj
44
endobj
%% Contents for page 1
6 0 obj
<<
/Length 7 0 R
/Filter /LZWDecode
>>
stream
BT
/F1 24 Tf
72 720 Td
(Encoding errors) Tj
ET
endstream
endobj
7 0 obj
53
endobj
8 0 obj
<<
/BaseFont /Helvetica
/Encoding /WinAnsiEncoding
/Name /F1
/Subtype /Type1
/Type /Font
>>
endobj
9 0 obj
[
/PDF
/Text
]
endobj
xref
0 10
0000000000 65535 f
0000000025 00000 n
0000000079 00000 n
0000000161 00000 n
0000000396 00000 n
0000000457 00000 n
0000000499 00000 n
0000000630 00000 n
0000000649 00000 n
0000000767 00000 n
trailer <<
/Root 1 0 R
/Size 10
/ID [<cbdd966f9b7b2bb31ad606c532d7cce5><e5f7cff7a542641606230aadd53106a4>]
>>
startxref
802
%%EOF

View File

@ -0,0 +1,6 @@
checking split-content-stream.pdf
PDF Version: 1.4
File is not encrypted
File is not linearized
No syntax or stream encoding errors found; the file may still contain
errors that qpdf cannot detect

File diff suppressed because one or more lines are too long