Add QPDFObjectHandle::parseContentStream method

This method allows parsing of the PDF objects in a content stream or
array of content streams.
This commit is contained in:
Jay Berkenbilt 2013-01-20 15:26:45 -05:00
parent 1d88955fa6
commit f81152311e
17 changed files with 494 additions and 12 deletions

View File

@ -1,5 +1,9 @@
2013-01-20 Jay Berkenbilt <ejb@ql.org>
* Added QPDFObjectHandle::parseContentStream, which parses the
objects in a content stream and calls handlers in a callback
class. The example pdf-parse-content illustrates its use.
* Added QPDF_Keyword and QPDF_InlineImage types along with
appropriate wrapper methods in QPDFObjectHandle. These new object
types are to facilitate content stream parsing.

View File

@ -4,7 +4,8 @@ BINS_examples = \
pdf-npages \
pdf-double-page-size \
pdf-invert-images \
pdf-create
pdf-create \
pdf-parse-content
CBINS_examples = pdf-linearize
TARGETS_examples = $(foreach B,$(BINS_examples) $(CBINS_examples),examples/$(OUTPUT_DIR)/$(call binname,$(B)))

View File

@ -0,0 +1,97 @@
#include <iostream>
#include <string.h>
#include <stdlib.h>
#include <qpdf/QPDF.hh>
#include <qpdf/QUtil.hh>
// Program name used in diagnostics; assigned in main().
static char const* whoami = 0;

// Write a usage summary to standard error and terminate the program
// with exit status 2. This function does not return.
void usage()
{
    std::cerr << "Usage: " << whoami << " filename page-number" << std::endl;
    std::cerr << "Prints a dump of the objects in the content streams"
              << " of the given page." << std::endl;
    std::cerr << "Pages are numbered from 1." << std::endl;
    exit(2);
}
// Callback implementation for QPDFObjectHandle::parseContentStream
// whose handlers print what they receive to standard output.
class ParserCallbacks: public QPDFObjectHandle::ParserCallbacks
{
  public:
    virtual ~ParserCallbacks() {}

    // Receives each object parsed from the content stream.
    virtual void handleObject(QPDFObjectHandle obj);

    // Receives the end-of-input notification.
    virtual void handleEOF();
};
// Print one parsed object to standard output, followed by a newline.
// Inline image data is printed as "inline image: " followed by two
// lowercase hex digits per byte; any other object is printed using
// its unparse() representation.
void
ParserCallbacks::handleObject(QPDFObjectHandle obj)
{
    if (obj.isInlineImage())
    {
        // Hex-encode with a lookup table instead of sprintf: this
        // file does not include <stdio.h>, so sprintf was only
        // available through transitive includes.
        static char const hex_digits[] = "0123456789abcdef";
        std::string val = obj.getInlineImageValue();
        std::cout << "inline image: ";
        for (size_t i = 0; i < val.length(); ++i)
        {
            unsigned char const byte = static_cast<unsigned char>(val[i]);
            std::cout << hex_digits[byte >> 4] << hex_digits[byte & 0x0f];
        }
        std::cout << std::endl;
    }
    else
    {
        std::cout << obj.unparse() << std::endl;
    }
}
// Print the end-of-input marker line to standard output.
void
ParserCallbacks::handleEOF()
{
    std::cout << "-EOF-";
    std::cout << std::endl;
}
// Entry point. Expects a PDF file name and a 1-based page number,
// parses the content stream(s) of that page, and lets ParserCallbacks
// print every object found. Bad arguments lead to usage(); a
// std::exception is reported on standard error with exit status 2.
int main(int argc, char* argv[])
{
    whoami = QUtil::getWhoami(argv[0]);
    // For libtool's sake, skip a leading "lt-" in the program name.
    if (strncmp(whoami, "lt-", 3) == 0)
    {
        whoami += 3;
    }

    if (argc != 3)
    {
        usage();
    }

    char const* const filename = argv[1];
    int const pageno = atoi(argv[2]);

    try
    {
        QPDF pdf;
        pdf.processFile(filename);
        std::vector<QPDFObjectHandle> pages = pdf.getAllPages();

        // Page numbers given on the command line start at 1.
        int const npages = static_cast<int>(pages.size());
        bool const in_range = (pageno >= 1) && (pageno <= npages);
        if (! in_range)
        {
            usage();
        }

        QPDFObjectHandle page = pages[pageno - 1];
        QPDFObjectHandle contents = page.getKey("/Contents");
        ParserCallbacks cb;
        QPDFObjectHandle::parseContentStream(contents, &cb);
    }
    catch (std::exception& e)
    {
        std::cerr << whoami << ": " << e.what() << std::endl;
        exit(2);
    }

    return 0;
}

View File

@ -0,0 +1,17 @@
#!/usr/bin/env perl
require 5.008;
BEGIN { $^W = 1; }
use strict;
chdir("parse-content");
require TestDriver;
my $td = new TestDriver('pdf-parse-content');
$td->runtest("parse content",
{$td->COMMAND => "pdf-parse-content input.pdf 1"},
{$td->FILE => "content.out", $td->EXIT_STATUS => 0},
$td->NORMALIZE_NEWLINES);
$td->report(1);

View File

@ -0,0 +1,11 @@
BT
/F1
24
Tf
72
720
Td
(Potato)
Tj
ET
-EOF-

Binary file not shown.

View File

@ -71,6 +71,21 @@ class QPDFObjectHandle
virtual void decryptString(std::string& val) = 0;
};
// Callback interface used by parseContentStream. Callers must
// instantiate a subclass of this with handlers defined to accept
// QPDFObjectHandles that are parsed from the stream.
class ParserCallbacks
{
public:
QPDF_DLL
virtual ~ParserCallbacks()
{
}
// Called by parseContentStream for each object parsed from the
// content stream(s), including inline image data objects.
virtual void handleObject(QPDFObjectHandle) = 0;
// Called by parseContentStream once, after every stream it was
// given has been parsed.
virtual void handleEOF() = 0;
};
QPDF_DLL
QPDFObjectHandle();
QPDF_DLL
@ -138,6 +153,11 @@ class QPDFObjectHandle
StringDecrypter* decrypter,
QPDF* context);
// Helpers for parsing content streams
QPDF_DLL
static void parseContentStream(QPDFObjectHandle stream_or_array,
ParserCallbacks* callbacks);
// Type-specific factories
QPDF_DLL
static QPDFObjectHandle newNull();
@ -571,7 +591,10 @@ class QPDFObjectHandle
std::string const& object_description,
QPDFTokenizer& tokenizer, bool& empty,
StringDecrypter* decrypter, QPDF* context,
bool in_array, bool in_dictionary);
bool in_array, bool in_dictionary,
bool content_stream);
static void parseContentStream_internal(
QPDFObjectHandle stream, ParserCallbacks* callbacks);
bool initialized;

View File

@ -18,6 +18,8 @@
class QPDFTokenizer
{
public:
// Token type tt_eof is only returned if allowEOF() is called on
// the tokenizer. tt_eof was introduced in QPDF version 4.1.
enum token_type_e
{
tt_bad,
@ -34,6 +36,7 @@ class QPDFTokenizer
tt_null,
tt_bool,
tt_word,
tt_eof,
};
class Token
@ -97,6 +100,12 @@ class QPDFTokenizer
QPDF_DLL
void allowPoundAnywhereInName();
// If called, treat EOF as a separate token type instead of an
// error. This was introduced in QPDF 4.1 to facilitate
// tokenizing content streams.
QPDF_DLL
void allowEOF();
// Mode of operation:
// Keep presenting characters and calling getToken() until
@ -140,6 +149,7 @@ class QPDFTokenizer
st_literal, st_in_hexstring, st_token_ready } state;
bool pound_special_in_name;
bool allow_eof;
// Current token accumulation
token_type_e type;

View File

@ -680,6 +680,106 @@ QPDFObjectHandle::parse(std::string const& object_str,
return result;
}
// Parse a content stream, or each stream of an array of content
// streams in order, passing every parsed object to
// callbacks->handleObject(). callbacks->handleEOF() is called once
// after the last stream. Each entry is checked just before it is
// parsed; a std::logic_error is thrown on reaching a non-stream.
void
QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array,
                                     ParserCallbacks* callbacks)
{
    std::vector<QPDFObjectHandle> to_parse;
    if (! stream_or_array.isArray())
    {
        // A single object is handled as a one-element list.
        to_parse.push_back(stream_or_array);
    }
    else
    {
        to_parse = stream_or_array.getArrayAsVector();
    }

    for (size_t i = 0; i < to_parse.size(); ++i)
    {
        QPDFObjectHandle stream = to_parse[i];
        if (stream.isStream())
        {
            parseContentStream_internal(stream, callbacks);
        }
        else
        {
            throw std::logic_error(
                "QPDFObjectHandle: parseContentStream called on non-stream");
        }
    }

    callbacks->handleEOF();
}
// Parse a single content stream and report its contents through
// callbacks. Every object read from the stream data is passed to
// callbacks->handleObject(). When the keyword "ID" is seen, the bytes
// that follow are collected up to a delimited "EI" and passed to
// handleObject() as an inline image object. Throws QPDFExc
// (qpdf_e_damaged_pdf) if the data ends inside an inline image.
void
QPDFObjectHandle::parseContentStream_internal(QPDFObjectHandle stream,
ParserCallbacks* callbacks)
{
stream.assertStream();
PointerHolder<Buffer> stream_data = stream.getStreamData();
size_t length = stream_data->getSize();
// Label used as the input source's name; it appears in error messages.
std::string description = "content stream object " +
QUtil::int_to_string(stream.getObjectID()) + " " +
QUtil::int_to_string(stream.getGeneration());
PointerHolder<InputSource> input =
new BufferInputSource(description, stream_data.getPointer());
QPDFTokenizer tokenizer;
// Have the tokenizer report end of input as tt_eof instead of an error.
tokenizer.allowEOF();
bool empty = false;
// Keep parsing until the read position reaches the end of the data.
while ((size_t) input->tell() < length)
{
// No decrypter or context; not inside an array or dictionary;
// content stream mode enabled.
QPDFObjectHandle obj =
parseInternal(input, "content", tokenizer, empty,
0, 0, false, false, true);
if (! obj.isInitialized())
{
// In content stream mode, parseInternal returns an
// uninitialized object to indicate EOF.
break;
}
callbacks->handleObject(obj);
if (obj.isKeyword() && (obj.getKeywordValue() == "ID"))
{
// Discard next character; it is the space after ID that
// terminated the token. Read until end of inline image.
char ch;
input->read(&ch, 1);
// buf holds the four most recently read bytes, oldest first.
char buf[4];
memset(buf, '\0', sizeof(buf));
bool done = false;
std::string inline_image;
while (! done)
{
if (input->read(&ch, 1) == 0)
{
QTC::TC("qpdf", "QPDFObjectHandle EOF in inline image");
throw QPDFExc(qpdf_e_damaged_pdf, input->getName(),
"stream data", input->tell(),
"EOF found while reading inline image");
}
inline_image += ch;
// Shift the window left by one byte and append the new byte.
memmove(buf, buf + 1, sizeof(buf) - 1);
buf[sizeof(buf) - 1] = ch;
// NOTE(review): strchr also matches the terminating '\0' of
// the delimiter string, so a zero byte (including the initial
// contents of buf) counts as a delimiter here -- confirm that
// this is intended.
if (strchr(" \t\n\v\f\r", buf[0]) &&
(buf[1] == 'E') &&
(buf[2] == 'I') &&
strchr(" \t\n\v\f\r", buf[3]))
{
// We've found an EI operator.
done = true;
// Back up to the 'E' so that EI itself is read as the next
// token.
input->seek(-3, SEEK_CUR);
// Remove the delimiter, "EI", and delimiter bytes from the
// end of the collected image data.
for (int i = 0; i < 4; ++i)
{
if (inline_image.length() > 0)
{
inline_image.erase(inline_image.length() - 1);
}
}
}
}
QTC::TC("qpdf", "QPDFObjectHandle inline image token");
callbacks->handleObject(
QPDFObjectHandle::newInlineImage(inline_image));
}
}
}
QPDFObjectHandle
QPDFObjectHandle::parse(PointerHolder<InputSource> input,
std::string const& object_description,
@ -687,7 +787,7 @@ QPDFObjectHandle::parse(PointerHolder<InputSource> input,
StringDecrypter* decrypter, QPDF* context)
{
return parseInternal(input, object_description, tokenizer, empty,
decrypter, context, false, false);
decrypter, context, false, false, false);
}
QPDFObjectHandle
@ -695,7 +795,8 @@ QPDFObjectHandle::parseInternal(PointerHolder<InputSource> input,
std::string const& object_description,
QPDFTokenizer& tokenizer, bool& empty,
StringDecrypter* decrypter, QPDF* context,
bool in_array, bool in_dictionary)
bool in_array, bool in_dictionary,
bool content_stream)
{
empty = false;
if (in_dictionary && in_array)
@ -721,6 +822,21 @@ QPDFObjectHandle::parseInternal(PointerHolder<InputSource> input,
switch (token.getType())
{
case QPDFTokenizer::tt_eof:
if (content_stream)
{
// Return uninitialized object to indicate EOF
return object;
}
else
{
// When not in content stream mode, EOF is tt_bad and
// throws an exception before we get here.
throw std::logic_error(
"EOF received while not in content stream mode");
}
break;
case QPDFTokenizer::tt_brace_open:
case QPDFTokenizer::tt_brace_close:
// Don't know what to do with these for now
@ -764,13 +880,13 @@ QPDFObjectHandle::parseInternal(PointerHolder<InputSource> input,
case QPDFTokenizer::tt_array_open:
object = parseInternal(
input, object_description, tokenizer, empty,
decrypter, context, true, false);
decrypter, context, true, false, content_stream);
break;
case QPDFTokenizer::tt_dict_open:
object = parseInternal(
input, object_description, tokenizer, empty,
decrypter, context, false, true);
decrypter, context, false, true, content_stream);
break;
case QPDFTokenizer::tt_bool:
@ -826,6 +942,10 @@ QPDFObjectHandle::parseInternal(PointerHolder<InputSource> input,
input->seek(input->getLastOffset(), SEEK_SET);
empty = true;
}
else if (content_stream)
{
object = QPDFObjectHandle::newKeyword(token.getValue());
}
else
{
throw QPDFExc(qpdf_e_damaged_pdf, input->getName(),

View File

@ -22,7 +22,8 @@ static bool is_space(char ch)
}
QPDFTokenizer::QPDFTokenizer() :
pound_special_in_name(true)
pound_special_in_name(true),
allow_eof(false)
{
reset();
}
@ -34,6 +35,12 @@ QPDFTokenizer::allowPoundAnywhereInName()
this->pound_special_in_name = false;
}
void
QPDFTokenizer::allowEOF()
{
this->allow_eof = true;
}
void
QPDFTokenizer::reset()
{
@ -441,9 +448,17 @@ QPDFTokenizer::presentEOF()
}
else if (state != st_token_ready)
{
QTC::TC("qpdf", "QPDF_Tokenizer EOF reading token");
type = tt_bad;
error_message = "EOF while reading token";
QTC::TC("qpdf", "QPDF_Tokenizer EOF reading token",
this->allow_eof ? 1 : 0);
if (this->allow_eof)
{
type = tt_eof;
}
else
{
type = tt_bad;
error_message = "EOF while reading token";
}
}
state = st_token_ready;

View File

@ -236,7 +236,7 @@ QPDFWriter copy use_aes 1
QPDFObjectHandle indirect without context 0
QPDFObjectHandle trailing data in parse 0
qpdf pages encryption password 0
QPDF_Tokenizer EOF reading token 0
QPDF_Tokenizer EOF reading token 1
QPDF_Tokenizer EOF reading appendable token 0
QPDFWriter extra header text no newline 0
QPDFWriter extra header text add newline 0
@ -259,3 +259,5 @@ QPDFWriter remove Crypt 0
qpdf-c called qpdf_get_pdf_extension_level 0
qpdf-c called qpdf_set_r5_encryption_parameters 0
qpdf-c called qpdf_set_r6_encryption_parameters 0
QPDFObjectHandle EOF in inline image 0
QPDFObjectHandle inline image token 0

View File

@ -199,7 +199,7 @@ $td->runtest("remove page we don't have",
show_ntests();
# ----------
$td->notify("--- Miscellaneous Tests ---");
$n_tests += 57;
$n_tests += 59;
$td->runtest("qpdf version",
{$td->COMMAND => "qpdf --version"},
@ -468,6 +468,16 @@ $td->runtest("check file with leading junk",
{$td->COMMAND => "qpdf --check leading-junk.pdf"},
{$td->FILE => "leading-junk.out", $td->EXIT_STATUS => 0},
$td->NORMALIZE_NEWLINES);
$td->runtest("EOF inside inline image",
{$td->COMMAND => "test_driver 37 eof-in-inline-image.pdf"},
{$td->FILE => "eof-in-inline-image.out",
$td->EXIT_STATUS => 2},
$td->NORMALIZE_NEWLINES);
$td->runtest("tokenize content streams",
{$td->COMMAND => "test_driver 37 tokenize-content-streams.pdf"},
{$td->FILE => "tokenize-content-streams.out",
$td->EXIT_STATUS => 0},
$td->NORMALIZE_NEWLINES);
show_ntests();
# ----------

View File

@ -0,0 +1,25 @@
BT
/F1
24
Tf
72
720
Td
(Potato)
Tj
ET
BI
/CS
/G
/W
1
/H
1
/BPC
8
/F
/Fl
/DP
<< /Columns 1 /Predictor 15 >>
ID
content stream object 4 0 (stream data, file position 139): EOF found while reading inline image

Binary file not shown.

View File

@ -0,0 +1,95 @@
BT
/F1
24
Tf
72
720
Td
(Potato)
Tj
ET
-EOF-
0.1
0
0
0.1
0
0
cm
q
0
1.1999
-1.1999
0
121.19
150.009
cm
BI
/CS
/G
/W
1
/H
1
/BPC
8
/F
/Fl
/DP
<< /Columns 1 /Predictor 15 >>
ID
inline image: 789c63fc0f0001030101
EI
Q
q
0
35.997
-128.389
0
431.964
7269.02
cm
BI
/CS
/G
/W
30
/H
107
/BPC
8
/F
/Fl
/DP
<< /Columns 30 /Predictor 15 >>
ID
inline image: 789cedd1a11100300800b1b2ffd06503148283bc8dfcf8af2a306ee352eff2e06318638c31c63b3801627b620a
EI
Q
q
0
38.3968
-93.5922
0
431.964
7567.79
cm
BI
/CS
/G
/W
32
/H
78
/BPC
8
/F
/Fl
/DP
<< /Columns 32 /Predictor 15 >>
ID
inline image: 789c63fccf801f308e2a185530aa60882a20203faa605401890a0643aa1e5530aa6054010d140000bdd03c13
EI
Q
-EOF-
test 37 done

Binary file not shown.

View File

@ -58,6 +58,45 @@ class Provider: public QPDFObjectHandle::StreamDataProvider
bool bad_length;
};
// Callback implementation for QPDFObjectHandle::parseContentStream
// whose handlers print what they receive to standard output.
class ParserCallbacks: public QPDFObjectHandle::ParserCallbacks
{
  public:
    virtual ~ParserCallbacks() {}

    // Receives each object parsed from the content stream.
    virtual void handleObject(QPDFObjectHandle obj);

    // Receives the end-of-input notification.
    virtual void handleEOF();
};
// Print one parsed object to standard output, followed by a newline.
// Inline image data is printed as "inline image: " followed by two
// lowercase hex digits per byte; any other object is printed using
// its unparse() representation.
void
ParserCallbacks::handleObject(QPDFObjectHandle obj)
{
    if (! obj.isInlineImage())
    {
        std::cout << obj.unparse() << std::endl;
        return;
    }

    std::string const data = obj.getInlineImageValue();
    std::cout << "inline image: ";
    char hex[3] = { '\0', '\0', '\0' };
    for (std::string::const_iterator it = data.begin();
         it != data.end(); ++it)
    {
        sprintf(hex, "%02x", (unsigned char)(*it));
        std::cout << hex;
    }
    std::cout << std::endl;
}
// Print the end-of-input marker line to standard output.
void
ParserCallbacks::handleEOF()
{
    std::cout << "-EOF-";
    std::cout << std::endl;
}
static std::string getPageContents(QPDFObjectHandle page)
{
PointerHolder<Buffer> b1 =
@ -1245,6 +1284,19 @@ void runtest(int n, char const* filename1, char const* arg2)
}
}
}
else if (n == 37)
{
// Parse content streams of all pages
std::vector<QPDFObjectHandle> pages = pdf.getAllPages();
for (std::vector<QPDFObjectHandle>::iterator iter = pages.begin();
iter != pages.end(); ++iter)
{
QPDFObjectHandle page = *iter;
QPDFObjectHandle contents = page.getKey("/Contents");
ParserCallbacks cb;
QPDFObjectHandle::parseContentStream(contents, &cb);
}
}
else
{
throw std::runtime_error(std::string("invalid test ") +