Pass offset and length to ParserCallbacks::handleObject

This commit is contained in:
Jay Berkenbilt 2019-08-22 19:16:25 -04:00
parent 4b2e72c4cd
commit 3f1ab64066
11 changed files with 326 additions and 219 deletions

View File

@ -1,5 +1,19 @@
2019-08-22 Jay Berkenbilt <ejb@ql.org>
* In QPDFObjectHandle::ParserCallbacks, in addition to
handleObject(QPDFObjectHandle), allow developers to override
handleObject(QPDFObjectHandle, size_t offset, size_t length). If
this method is overridden instead, it is called with the offset of
the object in the content stream (which may be concatenated from
an array of streams) and the length of the object. Intervening
whitespace and comments are not included in offset and length.
* Add method
QPDFObjectHandle::ParserCallbacks::contentSize(size_t). If
defined, it is called by the content stream parser before the
first call to handleObject, and the argument is the total size in
bytes of the content streams.
* Add QPDFObjectHandle::isDirectNull() -- a const method that
allows determining whether an object is a literal null without
attempting to resolve it.

View File

@ -26,14 +26,23 @@ class ParserCallbacks: public QPDFObjectHandle::ParserCallbacks
{
}
virtual void handleObject(QPDFObjectHandle);
virtual void contentSize(size_t);
virtual void handleObject(QPDFObjectHandle, size_t offset, size_t length);
virtual void handleEOF();
};
void
ParserCallbacks::handleObject(QPDFObjectHandle obj)
ParserCallbacks::contentSize(size_t size)
{
std::cout << obj.getTypeName() << ": ";
std::cout << "content size: " << size << std::endl;
}
void
ParserCallbacks::handleObject(QPDFObjectHandle obj,
size_t offset, size_t length)
{
std::cout << obj.getTypeName() << ", offset=" << offset
<< ", length=" << length << ": ";
if (obj.isInlineImage())
{
std::cout << QUtil::hex_encode(obj.getInlineImageValue()) << std::endl;

View File

@ -1,11 +1,12 @@
operator: BT
name: /F1
integer: 24
operator: Tf
integer: 72
integer: 720
operator: Td
string: (Potato)
operator: Tj
operator: ET
content size: 44
operator, offset=0, length=2: BT
name, offset=5, length=3: /F1
integer, offset=9, length=2: 24
operator, offset=12, length=2: Tf
integer, offset=17, length=2: 72
integer, offset=20, length=3: 720
operator, offset=24, length=2: Td
string, offset=29, length=8: (Potato)
operator, offset=38, length=2: Tj
operator, offset=41, length=2: ET
-EOF-

View File

@ -159,16 +159,28 @@ class QPDFObjectHandle
// This class is used by parsePageContents. Callers must
// instantiate a subclass of this with handlers defined to accept
// QPDFObjectHandles that are parsed from the stream.
class ParserCallbacks
class QPDF_DLL_CLASS ParserCallbacks
{
public:
QPDF_DLL
virtual ~ParserCallbacks()
{
}
virtual void handleObject(QPDFObjectHandle) = 0;
// One of the handleObject methods must be overridden.
QPDF_DLL
virtual void handleObject(QPDFObjectHandle);
QPDF_DLL
virtual void handleObject(
QPDFObjectHandle, size_t offset, size_t length);
virtual void handleEOF() = 0;
// Override this if you want to know the full size of the
// contents, possibly after concatenation of multiple streams.
// This is called before the first call to handleObject.
QPDF_DLL
virtual void contentSize(size_t);
protected:
// Implementors may call this method during parsing to
// terminate parsing early. This method throws an exception

View File

@ -105,6 +105,29 @@ QPDFObjectHandle::TokenFilter::writeToken(QPDFTokenizer::Token const& token)
write(value.c_str(), value.length());
}
// Default implementation of the single-argument callback. It
// unconditionally throws std::logic_error, signalling that a
// subclass is expected to override one of the handleObject
// overloads.
void
QPDFObjectHandle::ParserCallbacks::handleObject(QPDFObjectHandle)
{
    char const* const message =
        "You must override one of the handleObject methods"
        " in ParserCallbacks";
    throw std::logic_error(message);
}
// Default implementation of the three-argument callback, which was
// added in qpdf 9. The offset and length arguments are ignored and
// the object is forwarded to the single-argument handleObject, so
// a subclass that only overrides the older interface still
// receives every object.
void
QPDFObjectHandle::ParserCallbacks::handleObject(
    QPDFObjectHandle oh, size_t /*offset*/, size_t /*length*/)
{
    this->handleObject(oh);
}
// Default no-op: overriding contentSize is optional, so the size
// passed in is simply discarded.
void
QPDFObjectHandle::ParserCallbacks::contentSize(size_t /*size*/)
{
}
void
QPDFObjectHandle::ParserCallbacks::terminateParsing()
{
@ -1615,6 +1638,7 @@ QPDFObjectHandle::parseContentStream_internal(
std::string all_description;
pipeContentStreams(&buf, description, all_description);
PointerHolder<Buffer> stream_data = buf.getBuffer();
callbacks->contentSize(stream_data->getSize());
try
{
parseContentStream_data(stream_data, all_description,
@ -1642,6 +1666,13 @@ QPDFObjectHandle::parseContentStream_data(
bool empty = false;
while (QIntC::to_size(input->tell()) < length)
{
// Read a token and seek to the beginning. The offset we get
// from this process is the beginning of the next
// non-ignorable (space, comment) token. This way, the offset
// and length don't include ignorable content.
tokenizer.readToken(input, "content", true);
qpdf_offset_t offset = input->getLastOffset();
input->seek(offset, SEEK_SET);
QPDFObjectHandle obj =
parseInternal(input, "content", tokenizer,
empty, 0, context, true);
@ -1650,8 +1681,9 @@ QPDFObjectHandle::parseContentStream_data(
// EOF
break;
}
size_t length = QIntC::to_size(input->tell() - offset);
callbacks->handleObject(obj);
callbacks->handleObject(obj, QIntC::to_size(offset), length);
if (obj.isOperator() && (obj.getOperatorValue() == "ID"))
{
// Discard next character; it is the space after ID that
@ -1661,6 +1693,8 @@ QPDFObjectHandle::parseContentStream_data(
tokenizer.expectInlineImage(input);
QPDFTokenizer::Token t =
tokenizer.readToken(input, description, true);
offset = input->getLastOffset();
length = QIntC::to_size(input->tell() - offset);
if (t.getType() == QPDFTokenizer::tt_bad)
{
QTC::TC("qpdf", "QPDFObjectHandle EOF in inline image");
@ -1674,7 +1708,8 @@ QPDFObjectHandle::parseContentStream_data(
std::string inline_image = t.getValue();
QTC::TC("qpdf", "QPDFObjectHandle inline image token");
callbacks->handleObject(
QPDFObjectHandle::newInlineImage(inline_image));
QPDFObjectHandle::newInlineImage(inline_image),
QIntC::to_size(offset), length);
}
}
}

View File

@ -4489,6 +4489,25 @@ print "\n";
<function>getUIntValueAsUInt</function>.
</para>
</listitem>
<listitem>
<para>
When parsing content streams with
<classname>QPDFObjectHandle::ParserCallbacks</classname>, in
place of the method
<function>handleObject(QPDFObjectHandle)</function>, the
developer may override
<function>handleObject(QPDFObjectHandle, size_t offset,
size_t length)</function>. If this method is defined, it
will be invoked with the object along with its offset and
length within the overall contents being parsed. Intervening
spaces and comments are not included in offset and length.
Additionally, a new method
<function>contentSize(size_t)</function> may be implemented.
If present, it will be called prior to the first call to
<function>handleObject</function> with the total size in
bytes of the combined contents.
</para>
</listitem>
<listitem>
<para>
The underlying implementation of QPDF arrays has been

View File

@ -1,27 +1,28 @@
operator: BT
name: /F1
integer: 24
operator: Tf
integer: 72
integer: 720
operator: Td
string: (Potato)
operator: Tj
operator: ET
operator: BI
name: /CS
name: /G
name: /W
integer: 1
name: /H
integer: 1
name: /BPC
integer: 8
name: /F
name: /Fl
name: /DP
dictionary: << /Columns 1 /Predictor 15 >>
operator: ID
content size: 139
operator, offset=0, length=2: BT
name, offset=5, length=3: /F1
integer, offset=9, length=2: 24
operator, offset=12, length=2: Tf
integer, offset=17, length=2: 72
integer, offset=20, length=3: 720
operator, offset=24, length=2: Td
string, offset=29, length=8: (Potato)
operator, offset=38, length=2: Tj
operator, offset=41, length=2: ET
operator, offset=66, length=2: BI
name, offset=69, length=3: /CS
name, offset=73, length=2: /G
name, offset=75, length=2: /W
integer, offset=78, length=1: 1
name, offset=79, length=2: /H
integer, offset=82, length=1: 1
name, offset=83, length=4: /BPC
integer, offset=88, length=1: 8
name, offset=89, length=2: /F
name, offset=91, length=3: /Fl
name, offset=94, length=3: /DP
dictionary, offset=97, length=27: << /Columns 1 /Predictor 15 >>
operator, offset=125, length=2: ID
WARNING: page object 3 0 stream 4 0 (stream data, offset 139): EOF found while reading inline image
-EOF-
test 37 done

View File

@ -1,86 +1,88 @@
name: /potato
content size: 44
name, offset=0, length=7: /potato
test suite: terminating parsing
real: 0.1
integer: 0
integer: 0
real: 0.1
integer: 0
integer: 0
operator: cm
operator: q
integer: 0
real: 1.1999
real: -1.1999
integer: 0
real: 121.19
real: 150.009
operator: cm
operator: BI
name: /CS
name: /G
name: /W
integer: 1
name: /H
integer: 1
name: /BPC
integer: 8
name: /F
name: /Fl
name: /DP
dictionary: << /Columns 1 /Predictor 15 >>
operator: ID
inline-image: 789c63fc0f00010301010a
operator: EI
operator: Q
operator: q
integer: 0
real: 35.997
real: -128.389
integer: 0
real: 431.964
real: 7269.02
operator: cm
operator: BI
name: /CS
name: /G
name: /W
integer: 30
name: /H
integer: 107
name: /BPC
integer: 8
name: /F
name: /Fl
name: /DP
dictionary: << /Columns 30 /Predictor 15 >>
operator: ID
inline-image: 789cedd1a11100300800b1b2ffd06503148283bc8dfcf8af2a306ee352eff2e06318638c31c63b3801627b620a0a
operator: EI
operator: Q
operator: q
integer: 0
real: 38.3968
real: -93.5922
integer: 0
real: 431.964
real: 7567.79
operator: cm
operator: BI
name: /CS
name: /G
name: /W
integer: 32
name: /H
integer: 78
name: /BPC
integer: 8
name: /F
name: /Fl
name: /DP
dictionary: << /Columns 32 /Predictor 15 >>
operator: ID
inline-image: 789c63fccf801f308e2a185530aa60882a20203faa605401890a0643aa1e5530aa6054010d140000bdd03c130a
operator: EI
operator: Q
content size: 454
real, offset=0, length=3: 0.1
integer, offset=4, length=1: 0
integer, offset=6, length=1: 0
real, offset=8, length=3: 0.1
integer, offset=12, length=1: 0
integer, offset=14, length=1: 0
operator, offset=16, length=2: cm
operator, offset=19, length=1: q
integer, offset=21, length=1: 0
real, offset=23, length=6: 1.1999
real, offset=30, length=7: -1.1999
integer, offset=38, length=2: 0
real, offset=41, length=6: 121.19
real, offset=48, length=7: 150.009
operator, offset=56, length=2: cm
operator, offset=59, length=2: BI
name, offset=62, length=3: /CS
name, offset=66, length=2: /G
name, offset=68, length=2: /W
integer, offset=71, length=1: 1
name, offset=72, length=2: /H
integer, offset=75, length=1: 1
name, offset=76, length=4: /BPC
integer, offset=81, length=1: 8
name, offset=82, length=2: /F
name, offset=84, length=3: /Fl
name, offset=87, length=3: /DP
dictionary, offset=90, length=27: << /Columns 1 /Predictor 15 >>
operator, offset=118, length=2: ID
inline-image, offset=121, length=11: 789c63fc0f00010301010a
operator, offset=132, length=2: EI
operator, offset=135, length=1: Q
operator, offset=137, length=1: q
integer, offset=139, length=1: 0
real, offset=141, length=6: 35.997
real, offset=148, length=8: -128.389
integer, offset=157, length=2: 0
real, offset=160, length=7: 431.964
real, offset=168, length=7: 7269.02
operator, offset=176, length=2: cm
operator, offset=179, length=2: BI
name, offset=182, length=3: /CS
name, offset=186, length=2: /G
name, offset=188, length=2: /W
integer, offset=191, length=2: 30
name, offset=193, length=2: /H
integer, offset=196, length=3: 107
name, offset=199, length=4: /BPC
integer, offset=204, length=1: 8
name, offset=205, length=2: /F
name, offset=207, length=3: /Fl
name, offset=210, length=3: /DP
dictionary, offset=213, length=28: << /Columns 30 /Predictor 15 >>
operator, offset=242, length=2: ID
inline-image, offset=245, length=46: 789cedd1a11100300800b1b2ffd06503148283bc8dfcf8af2a306ee352eff2e06318638c31c63b3801627b620a0a
operator, offset=291, length=2: EI
operator, offset=294, length=1: Q
operator, offset=296, length=1: q
integer, offset=298, length=1: 0
real, offset=300, length=7: 38.3968
real, offset=308, length=8: -93.5922
integer, offset=317, length=2: 0
real, offset=320, length=7: 431.964
real, offset=328, length=7: 7567.79
operator, offset=336, length=2: cm
operator, offset=339, length=2: BI
name, offset=342, length=3: /CS
name, offset=346, length=2: /G
name, offset=348, length=2: /W
integer, offset=351, length=2: 32
name, offset=353, length=2: /H
integer, offset=356, length=2: 78
name, offset=358, length=4: /BPC
integer, offset=363, length=1: 8
name, offset=364, length=2: /F
name, offset=366, length=3: /Fl
name, offset=369, length=3: /DP
dictionary, offset=372, length=28: << /Columns 32 /Predictor 15 >>
operator, offset=401, length=2: ID
inline-image, offset=404, length=45: 789c63fccf801f308e2a185530aa60882a20203faa605401890a0643aa1e5530aa6054010d140000bdd03c130a
operator, offset=449, length=2: EI
operator, offset=452, length=1: Q
-EOF-
test 37 done

View File

@ -1,95 +1,100 @@
operator: BT
name: /F1
integer: 24
operator: Tf
integer: 72
integer: 720
operator: Td
string: (Potato)
operator: Tj
operator: ET
content size: 44
operator, offset=0, length=2: BT
name, offset=5, length=3: /F1
integer, offset=9, length=2: 24
operator, offset=12, length=2: Tf
integer, offset=17, length=2: 72
integer, offset=20, length=3: 720
operator, offset=24, length=2: Td
string, offset=29, length=8: (Potato)
operator, offset=38, length=2: Tj
operator, offset=41, length=2: ET
-EOF-
real: 0.1
integer: 0
integer: 0
real: 0.1
integer: 0
integer: 0
operator: cm
operator: q
integer: 0
real: 1.1999
real: -1.1999
integer: 0
real: 121.19
real: 150.009
operator: cm
operator: BI
name: /CS
name: /G
name: /W
integer: 1
name: /H
integer: 1
name: /BPC
integer: 8
name: /F
name: /Fl
name: /DP
dictionary: << /Columns 1 /Predictor 15 >>
operator: ID
inline-image: 789c63fc0f00010301010a
operator: EI
operator: Q
operator: q
integer: 0
real: 35.997
real: -128.389
integer: 0
real: 431.964
real: 7269.02
operator: cm
operator: BI
name: /CS
name: /G
name: /W
integer: 30
name: /H
integer: 107
name: /BPC
integer: 8
name: /F
name: /Fl
name: /DP
dictionary: << /Columns 30 /Predictor 15 >>
operator: ID
inline-image: 789cedd1a11100300800b1b2ffd06503148283bc8dfcf8af2a306ee352eff2e06318638c31c63b3801627b620a0a
operator: EI
operator: Q
operator: q
integer: 0
real: 38.3968
real: -93.5922
integer: 0
real: 431.964
real: 7567.79
operator: cm
operator: BI
name: /CS
name: /G
name: /W
integer: 32
name: /H
integer: 78
name: /BPC
integer: 8
name: /F
name: /Fl
name: /DP
dictionary: << /Columns 32 /Predictor 15 >>
operator: ID
inline-image: 789c63fccf801f308e2a185530aa60882a20203faa605401890a0643aa1e5530aa6054010d140000bdd03c130a
operator: EI
operator: Q
content size: 490
real, offset=0, length=3: 0.1
integer, offset=4, length=1: 0
integer, offset=6, length=1: 0
real, offset=8, length=3: 0.1
integer, offset=12, length=1: 0
integer, offset=14, length=1: 0
operator, offset=16, length=2: cm
operator, offset=19, length=1: q
integer, offset=21, length=1: 0
real, offset=23, length=6: 1.1999
real, offset=30, length=7: -1.1999
integer, offset=38, length=2: 0
real, offset=41, length=6: 121.19
real, offset=48, length=7: 150.009
operator, offset=56, length=2: cm
operator, offset=59, length=2: BI
name, offset=62, length=3: /CS
name, offset=66, length=2: /G
name, offset=68, length=2: /W
integer, offset=71, length=1: 1
name, offset=72, length=2: /H
integer, offset=75, length=1: 1
name, offset=76, length=4: /BPC
integer, offset=81, length=1: 8
name, offset=82, length=2: /F
name, offset=84, length=3: /Fl
name, offset=87, length=3: /DP
dictionary, offset=90, length=27: << /Columns 1 /Predictor 15 >>
operator, offset=118, length=2: ID
inline-image, offset=121, length=11: 789c63fc0f00010301010a
operator, offset=132, length=2: EI
operator, offset=135, length=1: Q
operator, offset=137, length=1: q
integer, offset=139, length=1: 0
real, offset=141, length=6: 35.997
real, offset=148, length=8: -128.389
integer, offset=157, length=2: 0
real, offset=160, length=7: 431.964
real, offset=168, length=7: 7269.02
operator, offset=176, length=2: cm
operator, offset=179, length=2: BI
name, offset=182, length=3: /CS
name, offset=186, length=2: /G
name, offset=188, length=2: /W
integer, offset=191, length=2: 30
name, offset=193, length=2: /H
integer, offset=196, length=3: 107
name, offset=199, length=4: /BPC
integer, offset=204, length=1: 8
name, offset=205, length=2: /F
name, offset=207, length=3: /Fl
name, offset=210, length=3: /DP
dictionary, offset=214, length=28: << /Columns 30 /Predictor 15 >>
operator, offset=243, length=2: ID
inline-image, offset=246, length=46: 789cedd1a11100300800b1b2ffd06503148283bc8dfcf8af2a306ee352eff2e06318638c31c63b3801627b620a0a
operator, offset=292, length=2: EI
operator, offset=295, length=1: Q
operator, offset=297, length=1: q
array, offset=299, length=30: [ 1 /two (three) << /four 5 >> ]
operator, offset=330, length=1: Q
operator, offset=332, length=1: q
integer, offset=334, length=1: 0
real, offset=336, length=7: 38.3968
real, offset=344, length=8: -93.5922
integer, offset=353, length=2: 0
real, offset=356, length=7: 431.964
real, offset=364, length=7: 7567.79
operator, offset=372, length=2: cm
operator, offset=375, length=2: BI
name, offset=378, length=3: /CS
name, offset=382, length=2: /G
name, offset=384, length=2: /W
integer, offset=387, length=2: 32
name, offset=389, length=2: /H
integer, offset=392, length=2: 78
name, offset=394, length=4: /BPC
integer, offset=399, length=1: 8
name, offset=400, length=2: /F
name, offset=402, length=3: /Fl
name, offset=405, length=3: /DP
dictionary, offset=408, length=28: << /Columns 32 /Predictor 15 >>
operator, offset=437, length=2: ID
inline-image, offset=440, length=45: 789c63fccf801f308e2a185530aa60882a20203faa605401890a0643aa1e5530aa6054010d140000bdd03c130a
operator, offset=485, length=2: EI
operator, offset=488, length=1: Q
-EOF-
test 37 done

View File

@ -76,19 +76,28 @@ class ParserCallbacks: public QPDFObjectHandle::ParserCallbacks
{
}
virtual void handleObject(QPDFObjectHandle);
virtual void contentSize(size_t size);
virtual void handleObject(QPDFObjectHandle, size_t, size_t);
virtual void handleEOF();
};
void
ParserCallbacks::handleObject(QPDFObjectHandle obj)
ParserCallbacks::contentSize(size_t size)
{
std::cout << "content size: " << size << std::endl;
}
void
ParserCallbacks::handleObject(QPDFObjectHandle obj,
size_t offset, size_t length)
{
if (obj.isName() && (obj.getName() == "/Abort"))
{
std::cout << "test suite: terminating parsing" << std::endl;
terminateParsing();
}
std::cout << obj.getTypeName() << ": ";
std::cout << obj.getTypeName() << ", offset=" << offset
<< ", length=" << length << ": ";
if (obj.isInlineImage())
{
// Exercise getTypeCode