2
1
mirror of https://github.com/qpdf/qpdf.git synced 2024-05-31 17:30:54 +00:00

Add QPDFObjectHandle::parseContentStream method

This method allows parsing of the PDF objects in a content stream or
array of content streams.
This commit is contained in:
Jay Berkenbilt 2013-01-20 15:26:45 -05:00
parent 1d88955fa6
commit f81152311e
17 changed files with 494 additions and 12 deletions

View File

@ -1,5 +1,9 @@
2013-01-20 Jay Berkenbilt <ejb@ql.org> 2013-01-20 Jay Berkenbilt <ejb@ql.org>
* Added QPDFObjectHandle::parseContentStream, which parses the
objects in a content stream and calls handlers in a callback
class. The example pdf-parse-content illustrates its use.
* Added QPDF_Keyword and QPDF_InlineImage types along with * Added QPDF_Keyword and QPDF_InlineImage types along with
appropriate wrapper methods in QPDFObjectHandle. These new object appropriate wrapper methods in QPDFObjectHandle. These new object
types are to facilitate content stream parsing. types are to facilitate content stream parsing.

View File

@ -4,7 +4,8 @@ BINS_examples = \
pdf-npages \ pdf-npages \
pdf-double-page-size \ pdf-double-page-size \
pdf-invert-images \ pdf-invert-images \
pdf-create pdf-create \
pdf-parse-content
CBINS_examples = pdf-linearize CBINS_examples = pdf-linearize
TARGETS_examples = $(foreach B,$(BINS_examples) $(CBINS_examples),examples/$(OUTPUT_DIR)/$(call binname,$(B))) TARGETS_examples = $(foreach B,$(BINS_examples) $(CBINS_examples),examples/$(OUTPUT_DIR)/$(call binname,$(B)))

View File

@ -0,0 +1,97 @@
#include <iostream>
#include <string.h>
#include <stdlib.h>
#include <qpdf/QPDF.hh>
#include <qpdf/QUtil.hh>
static char const* whoami = 0;
// Write the command-line synopsis to standard error and terminate
// the process with exit status 2. Does not return.
void usage()
{
    std::ostream& err = std::cerr;
    err << "Usage: " << whoami << " filename page-number" << std::endl;
    err << "Prints a dump of the objects in the content streams"
        << " of the given page." << std::endl;
    err << "Pages are numbered from 1." << std::endl;
    exit(2);
}
// Concrete handler passed to QPDFObjectHandle::parseContentStream by
// this example. Its two handlers write a textual dump of what the
// parser reports to standard output.
class ParserCallbacks: public QPDFObjectHandle::ParserCallbacks
{
  public:
    virtual ~ParserCallbacks() {}

    // Invoked with each object read from the content stream.
    virtual void handleObject(QPDFObjectHandle obj);

    // Invoked when the parser reports the end of the content.
    virtual void handleEOF();
};
// Print one line to standard output for the given object. Inline
// image objects are printed as "inline image: " followed by their
// bytes as two lowercase hexadecimal digits each; every other object
// is printed using its unparse() representation.
//
// The hex digits come from a lookup table rather than sprintf: this
// file does not include <stdio.h>, so calling sprintf relied on
// another header happening to declare it.
void
ParserCallbacks::handleObject(QPDFObjectHandle obj)
{
    if (! obj.isInlineImage())
    {
        std::cout << obj.unparse() << std::endl;
        return;
    }

    static char const hex_digits[] = "0123456789abcdef";
    std::string const val = obj.getInlineImageValue();
    std::string hex;
    hex.reserve(2 * val.length());
    for (size_t i = 0; i < val.length(); ++i)
    {
        // Go through unsigned char so bytes >= 0x80 are not
        // sign-extended before being split into nibbles.
        unsigned char const byte = static_cast<unsigned char>(val[i]);
        hex += hex_digits[byte >> 4];
        hex += hex_digits[byte & 0x0f];
    }
    std::cout << "inline image: " << hex << std::endl;
}
// Print the end-of-content marker line "-EOF-" to standard output.
void
ParserCallbacks::handleEOF()
{
    std::cout << "-EOF-";
    std::cout << std::endl;
}
int main(int argc, char* argv[])
{
whoami = QUtil::getWhoami(argv[0]);
// For libtool's sake....
if (strncmp(whoami, "lt-", 3) == 0)
{
whoami += 3;
}
if (argc != 3)
{
usage();
}
char const* filename = argv[1];
int pageno = atoi(argv[2]);
try
{
QPDF pdf;
pdf.processFile(filename);
std::vector<QPDFObjectHandle> pages = pdf.getAllPages();
if ((pageno < 1) || (pageno > (int)pages.size()))
{
usage();
}
QPDFObjectHandle page = pages[pageno-1];
QPDFObjectHandle contents = page.getKey("/Contents");
ParserCallbacks cb;
QPDFObjectHandle::parseContentStream(contents, &cb);
}
catch (std::exception& e)
{
std::cerr << whoami << ": " << e.what() << std::endl;
exit(2);
}
return 0;
}

View File

@ -0,0 +1,17 @@
#!/usr/bin/env perl
# Test script for the pdf-parse-content example: runs the example on
# page 1 of input.pdf and compares its output with content.out.
require 5.008;
BEGIN { $^W = 1; }
use strict;

# The input and expected-output files are named relative to this
# directory. Fail immediately if it cannot be entered instead of
# running the test from the wrong place.
chdir("parse-content") or die "chdir testdir failed: $!\n";

require TestDriver;

my $td = new TestDriver('pdf-parse-content');

$td->runtest("parse content",
	     {$td->COMMAND => "pdf-parse-content input.pdf 1"},
	     {$td->FILE => "content.out", $td->EXIT_STATUS => 0},
	     $td->NORMALIZE_NEWLINES);

# NOTE(review): the argument is presumably the number of tests this
# script is expected to have run -- confirm against TestDriver.
$td->report(1);

View File

@ -0,0 +1,11 @@
BT
/F1
24
Tf
72
720
Td
(Potato)
Tj
ET
-EOF-

Binary file not shown.

View File

@ -71,6 +71,21 @@ class QPDFObjectHandle
virtual void decryptString(std::string& val) = 0; virtual void decryptString(std::string& val) = 0;
}; };
// Callback interface for parseContentStream. Callers derive from
// this class and implement both handlers; parseContentStream hands
// each QPDFObjectHandle it parses from the stream to handleObject.
class ParserCallbacks
{
  public:
    QPDF_DLL
    virtual ~ParserCallbacks() {}

    // Receives one object parsed from the content stream.
    virtual void handleObject(QPDFObjectHandle obj) = 0;

    // Called to signal that there is no more content to parse.
    virtual void handleEOF() = 0;
};
QPDF_DLL QPDF_DLL
QPDFObjectHandle(); QPDFObjectHandle();
QPDF_DLL QPDF_DLL
@ -138,6 +153,11 @@ class QPDFObjectHandle
StringDecrypter* decrypter, StringDecrypter* decrypter,
QPDF* context); QPDF* context);
// Helpers for parsing content streams

// Parse the objects in a content stream or in an array of content
// streams. stream_or_array may be a single stream or an array whose
// items are all streams; std::logic_error is thrown when an item
// that is not a stream is encountered. Each parsed object is passed
// to callbacks->handleObject(), and callbacks->handleEOF() is called
// once after every stream has been processed.
QPDF_DLL
static void parseContentStream(QPDFObjectHandle stream_or_array,
ParserCallbacks* callbacks);
// Type-specific factories // Type-specific factories
QPDF_DLL QPDF_DLL
static QPDFObjectHandle newNull(); static QPDFObjectHandle newNull();
@ -571,7 +591,10 @@ class QPDFObjectHandle
std::string const& object_description, std::string const& object_description,
QPDFTokenizer& tokenizer, bool& empty, QPDFTokenizer& tokenizer, bool& empty,
StringDecrypter* decrypter, QPDF* context, StringDecrypter* decrypter, QPDF* context,
bool in_array, bool in_dictionary); bool in_array, bool in_dictionary,
bool content_stream);
static void parseContentStream_internal(
QPDFObjectHandle stream, ParserCallbacks* callbacks);
bool initialized; bool initialized;

View File

@ -18,6 +18,8 @@
class QPDFTokenizer class QPDFTokenizer
{ {
public: public:
// Token type tt_eof is only returned if allowEOF() is called on
// the tokenizer. tt_eof was introduced in QPDF version 4.1.
enum token_type_e enum token_type_e
{ {
tt_bad, tt_bad,
@ -34,6 +36,7 @@ class QPDFTokenizer
tt_null, tt_null,
tt_bool, tt_bool,
tt_word, tt_word,
tt_eof,
}; };
class Token class Token
@ -97,6 +100,12 @@ class QPDFTokenizer
QPDF_DLL QPDF_DLL
void allowPoundAnywhereInName(); void allowPoundAnywhereInName();
// If called, treat EOF as a separate token type instead of an
// error. This was introduced in QPDF 4.1 to facilitate
// tokenizing content streams.
QPDF_DLL
void allowEOF();
// Mode of operation: // Mode of operation:
// Keep presenting characters and calling getToken() until // Keep presenting characters and calling getToken() until
@ -140,6 +149,7 @@ class QPDFTokenizer
st_literal, st_in_hexstring, st_token_ready } state; st_literal, st_in_hexstring, st_token_ready } state;
bool pound_special_in_name; bool pound_special_in_name;
bool allow_eof;
// Current token accumulation // Current token accumulation
token_type_e type; token_type_e type;

View File

@ -680,6 +680,106 @@ QPDFObjectHandle::parse(std::string const& object_str,
return result; return result;
} }
// Parse a page's content, given either as one stream or as an array
// of streams. Streams are handled in order; each one is checked and
// then parsed before the next is looked at, so objects from earlier
// streams have already been delivered to the callbacks if a later
// item turns out not to be a stream (std::logic_error). After the
// last stream, callbacks->handleEOF() is called exactly once.
void
QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array,
                                     ParserCallbacks* callbacks)
{
    // Normalize the argument to a list so a lone stream and an array
    // of streams share the same loop.
    std::vector<QPDFObjectHandle> to_parse;
    if (! stream_or_array.isArray())
    {
        to_parse.push_back(stream_or_array);
    }
    else
    {
        to_parse = stream_or_array.getArrayAsVector();
    }

    for (size_t i = 0; i < to_parse.size(); ++i)
    {
        QPDFObjectHandle item = to_parse[i];
        if (item.isStream())
        {
            parseContentStream_internal(item, callbacks);
        }
        else
        {
            throw std::logic_error(
                "QPDFObjectHandle: parseContentStream called on non-stream");
        }
    }

    callbacks->handleEOF();
}
// Parse the data of a single content stream and pass every object
// found to callbacks->handleObject(). When the keyword "ID" is seen,
// the bytes that follow are not tokenized: they are collected up to
// an "EI" that has a delimiter character on both sides and are
// delivered to the callbacks as one inline image object. Throws
// QPDFExc (qpdf_e_damaged_pdf) if the data ends before such an "EI"
// is found. This function does not call handleEOF().
void
QPDFObjectHandle::parseContentStream_internal(QPDFObjectHandle stream,
ParserCallbacks* callbacks)
{
stream.assertStream();
PointerHolder<Buffer> stream_data = stream.getStreamData();
size_t length = stream_data->getSize();
// The description names the stream by object ID and generation; it
// is the name given to the input source and is what the exception
// below reports through input->getName().
std::string description = "content stream object " +
QUtil::int_to_string(stream.getObjectID()) + " " +
QUtil::int_to_string(stream.getGeneration());
PointerHolder<InputSource> input =
new BufferInputSource(description, stream_data.getPointer());
QPDFTokenizer tokenizer;
// Ask the tokenizer to report end of input as a token of its own
// instead of as an error.
tokenizer.allowEOF();
bool empty = false;
while ((size_t) input->tell() < length)
{
// The last argument selects content stream mode. No string
// decrypter and no QPDF context are supplied.
QPDFObjectHandle obj =
parseInternal(input, "content", tokenizer, empty,
0, 0, false, false, true);
if (! obj.isInitialized())
{
// EOF
break;
}
callbacks->handleObject(obj);
if (obj.isKeyword() && (obj.getKeywordValue() == "ID"))
{
// Discard next character; it is the space after ID that
// terminated the token. Read until end of inline image.
char ch;
input->read(&ch, 1);
// buf is a sliding window over the last four bytes read, used to
// recognize <delimiter> 'E' 'I' <delimiter>. It starts out filled
// with zero bytes.
char buf[4];
memset(buf, '\0', sizeof(buf));
bool done = false;
std::string inline_image;
while (! done)
{
if (input->read(&ch, 1) == 0)
{
QTC::TC("qpdf", "QPDFObjectHandle EOF in inline image");
throw QPDFExc(qpdf_e_damaged_pdf, input->getName(),
"stream data", input->tell(),
"EOF found while reading inline image");
}
inline_image += ch;
// Shift the window left by one byte and append the new byte.
memmove(buf, buf + 1, sizeof(buf) - 1);
buf[sizeof(buf) - 1] = ch;
// strchr also matches the terminating NUL of the delimiter set, so
// a zero byte counts as a delimiter here. That includes the zero
// bytes the window was initialized with, which lets "EI" be
// recognized when it comes right at the start of the image data.
if (strchr(" \t\n\v\f\r", buf[0]) &&
(buf[1] == 'E') &&
(buf[2] == 'I') &&
strchr(" \t\n\v\f\r", buf[3]))
{
// We've found an EI operator.
done = true;
// Step back over 'E', 'I' and the delimiter after them so that the
// next parseInternal call reads EI as an ordinary token.
input->seek(-3, SEEK_CUR);
// Drop the matched bytes (delimiter, 'E', 'I', delimiter) from the
// collected image data. The length check covers the case where
// fewer than four bytes were collected because the match used the
// window's initial zero bytes.
for (int i = 0; i < 4; ++i)
{
if (inline_image.length() > 0)
{
inline_image.erase(inline_image.length() - 1);
}
}
}
}
QTC::TC("qpdf", "QPDFObjectHandle inline image token");
callbacks->handleObject(
QPDFObjectHandle::newInlineImage(inline_image));
}
}
}
QPDFObjectHandle QPDFObjectHandle
QPDFObjectHandle::parse(PointerHolder<InputSource> input, QPDFObjectHandle::parse(PointerHolder<InputSource> input,
std::string const& object_description, std::string const& object_description,
@ -687,7 +787,7 @@ QPDFObjectHandle::parse(PointerHolder<InputSource> input,
StringDecrypter* decrypter, QPDF* context) StringDecrypter* decrypter, QPDF* context)
{ {
return parseInternal(input, object_description, tokenizer, empty, return parseInternal(input, object_description, tokenizer, empty,
decrypter, context, false, false); decrypter, context, false, false, false);
} }
QPDFObjectHandle QPDFObjectHandle
@ -695,7 +795,8 @@ QPDFObjectHandle::parseInternal(PointerHolder<InputSource> input,
std::string const& object_description, std::string const& object_description,
QPDFTokenizer& tokenizer, bool& empty, QPDFTokenizer& tokenizer, bool& empty,
StringDecrypter* decrypter, QPDF* context, StringDecrypter* decrypter, QPDF* context,
bool in_array, bool in_dictionary) bool in_array, bool in_dictionary,
bool content_stream)
{ {
empty = false; empty = false;
if (in_dictionary && in_array) if (in_dictionary && in_array)
@ -721,6 +822,21 @@ QPDFObjectHandle::parseInternal(PointerHolder<InputSource> input,
switch (token.getType()) switch (token.getType())
{ {
case QPDFTokenizer::tt_eof:
if (content_stream)
{
// Return uninitialized object to indicate EOF
return object;
}
else
{
// When not in content stream mode, EOF is tt_bad and
// throws an exception before we get here.
throw std::logic_error(
"EOF received while not in content stream mode");
}
break;
case QPDFTokenizer::tt_brace_open: case QPDFTokenizer::tt_brace_open:
case QPDFTokenizer::tt_brace_close: case QPDFTokenizer::tt_brace_close:
// Don't know what to do with these for now // Don't know what to do with these for now
@ -764,13 +880,13 @@ QPDFObjectHandle::parseInternal(PointerHolder<InputSource> input,
case QPDFTokenizer::tt_array_open: case QPDFTokenizer::tt_array_open:
object = parseInternal( object = parseInternal(
input, object_description, tokenizer, empty, input, object_description, tokenizer, empty,
decrypter, context, true, false); decrypter, context, true, false, content_stream);
break; break;
case QPDFTokenizer::tt_dict_open: case QPDFTokenizer::tt_dict_open:
object = parseInternal( object = parseInternal(
input, object_description, tokenizer, empty, input, object_description, tokenizer, empty,
decrypter, context, false, true); decrypter, context, false, true, content_stream);
break; break;
case QPDFTokenizer::tt_bool: case QPDFTokenizer::tt_bool:
@ -826,6 +942,10 @@ QPDFObjectHandle::parseInternal(PointerHolder<InputSource> input,
input->seek(input->getLastOffset(), SEEK_SET); input->seek(input->getLastOffset(), SEEK_SET);
empty = true; empty = true;
} }
else if (content_stream)
{
object = QPDFObjectHandle::newKeyword(token.getValue());
}
else else
{ {
throw QPDFExc(qpdf_e_damaged_pdf, input->getName(), throw QPDFExc(qpdf_e_damaged_pdf, input->getName(),

View File

@ -22,7 +22,8 @@ static bool is_space(char ch)
} }
QPDFTokenizer::QPDFTokenizer() : QPDFTokenizer::QPDFTokenizer() :
pound_special_in_name(true) pound_special_in_name(true),
allow_eof(false)
{ {
reset(); reset();
} }
@ -34,6 +35,12 @@ QPDFTokenizer::allowPoundAnywhereInName()
this->pound_special_in_name = false; this->pound_special_in_name = false;
} }
void
QPDFTokenizer::allowEOF()
{
this->allow_eof = true;
}
void void
QPDFTokenizer::reset() QPDFTokenizer::reset()
{ {
@ -441,9 +448,17 @@ QPDFTokenizer::presentEOF()
} }
else if (state != st_token_ready) else if (state != st_token_ready)
{ {
QTC::TC("qpdf", "QPDF_Tokenizer EOF reading token"); QTC::TC("qpdf", "QPDF_Tokenizer EOF reading token",
type = tt_bad; this->allow_eof ? 1 : 0);
error_message = "EOF while reading token"; if (this->allow_eof)
{
type = tt_eof;
}
else
{
type = tt_bad;
error_message = "EOF while reading token";
}
} }
state = st_token_ready; state = st_token_ready;

View File

@ -236,7 +236,7 @@ QPDFWriter copy use_aes 1
QPDFObjectHandle indirect without context 0 QPDFObjectHandle indirect without context 0
QPDFObjectHandle trailing data in parse 0 QPDFObjectHandle trailing data in parse 0
qpdf pages encryption password 0 qpdf pages encryption password 0
QPDF_Tokenizer EOF reading token 0 QPDF_Tokenizer EOF reading token 1
QPDF_Tokenizer EOF reading appendable token 0 QPDF_Tokenizer EOF reading appendable token 0
QPDFWriter extra header text no newline 0 QPDFWriter extra header text no newline 0
QPDFWriter extra header text add newline 0 QPDFWriter extra header text add newline 0
@ -259,3 +259,5 @@ QPDFWriter remove Crypt 0
qpdf-c called qpdf_get_pdf_extension_level 0 qpdf-c called qpdf_get_pdf_extension_level 0
qpdf-c called qpdf_set_r5_encryption_parameters 0 qpdf-c called qpdf_set_r5_encryption_parameters 0
qpdf-c called qpdf_set_r6_encryption_parameters 0 qpdf-c called qpdf_set_r6_encryption_parameters 0
QPDFObjectHandle EOF in inline image 0
QPDFObjectHandle inline image token 0

View File

@ -199,7 +199,7 @@ $td->runtest("remove page we don't have",
show_ntests(); show_ntests();
# ---------- # ----------
$td->notify("--- Miscellaneous Tests ---"); $td->notify("--- Miscellaneous Tests ---");
$n_tests += 57; $n_tests += 59;
$td->runtest("qpdf version", $td->runtest("qpdf version",
{$td->COMMAND => "qpdf --version"}, {$td->COMMAND => "qpdf --version"},
@ -468,6 +468,16 @@ $td->runtest("check file with leading junk",
{$td->COMMAND => "qpdf --check leading-junk.pdf"}, {$td->COMMAND => "qpdf --check leading-junk.pdf"},
{$td->FILE => "leading-junk.out", $td->EXIT_STATUS => 0}, {$td->FILE => "leading-junk.out", $td->EXIT_STATUS => 0},
$td->NORMALIZE_NEWLINES); $td->NORMALIZE_NEWLINES);
$td->runtest("EOF inside inline image",
{$td->COMMAND => "test_driver 37 eof-in-inline-image.pdf"},
{$td->FILE => "eof-in-inline-image.out",
$td->EXIT_STATUS => 2},
$td->NORMALIZE_NEWLINES);
$td->runtest("tokenize content streams",
{$td->COMMAND => "test_driver 37 tokenize-content-streams.pdf"},
{$td->FILE => "tokenize-content-streams.out",
$td->EXIT_STATUS => 0},
$td->NORMALIZE_NEWLINES);
show_ntests(); show_ntests();
# ---------- # ----------

View File

@ -0,0 +1,25 @@
BT
/F1
24
Tf
72
720
Td
(Potato)
Tj
ET
BI
/CS
/G
/W
1
/H
1
/BPC
8
/F
/Fl
/DP
<< /Columns 1 /Predictor 15 >>
ID
content stream object 4 0 (stream data, file position 139): EOF found while reading inline image

Binary file not shown.

View File

@ -0,0 +1,95 @@
BT
/F1
24
Tf
72
720
Td
(Potato)
Tj
ET
-EOF-
0.1
0
0
0.1
0
0
cm
q
0
1.1999
-1.1999
0
121.19
150.009
cm
BI
/CS
/G
/W
1
/H
1
/BPC
8
/F
/Fl
/DP
<< /Columns 1 /Predictor 15 >>
ID
inline image: 789c63fc0f0001030101
EI
Q
q
0
35.997
-128.389
0
431.964
7269.02
cm
BI
/CS
/G
/W
30
/H
107
/BPC
8
/F
/Fl
/DP
<< /Columns 30 /Predictor 15 >>
ID
inline image: 789cedd1a11100300800b1b2ffd06503148283bc8dfcf8af2a306ee352eff2e06318638c31c63b3801627b620a
EI
Q
q
0
38.3968
-93.5922
0
431.964
7567.79
cm
BI
/CS
/G
/W
32
/H
78
/BPC
8
/F
/Fl
/DP
<< /Columns 32 /Predictor 15 >>
ID
inline image: 789c63fccf801f308e2a185530aa60882a20203faa605401890a0643aa1e5530aa6054010d140000bdd03c13
EI
Q
-EOF-
test 37 done

Binary file not shown.

View File

@ -58,6 +58,45 @@ class Provider: public QPDFObjectHandle::StreamDataProvider
bool bad_length; bool bad_length;
}; };
// Handler used by the content stream parsing test. Its two handlers
// write a textual dump of what the parser reports to standard
// output.
class ParserCallbacks: public QPDFObjectHandle::ParserCallbacks
{
  public:
    virtual ~ParserCallbacks() {}

    // Invoked with each object read from the content stream.
    virtual void handleObject(QPDFObjectHandle obj);

    // Invoked when the parser reports the end of the content.
    virtual void handleEOF();
};
// Print one line to standard output for the given object. Inline
// image objects are printed as "inline image: " followed by their
// bytes as two lowercase hexadecimal digits each; every other object
// is printed using its unparse() representation.
void
ParserCallbacks::handleObject(QPDFObjectHandle obj)
{
    if (! obj.isInlineImage())
    {
        std::cout << obj.unparse() << std::endl;
        return;
    }

    std::string image_bytes = obj.getInlineImageValue();
    std::cout << "inline image: ";
    // Room for two hex digits plus the terminating null.
    char hex_pair[3];
    hex_pair[2] = '\0';
    for (std::string::const_iterator p = image_bytes.begin();
         p != image_bytes.end(); ++p)
    {
        // The cast keeps bytes >= 0x80 from being sign-extended when
        // they are promoted for the variadic call.
        sprintf(hex_pair, "%02x", (unsigned char)(*p));
        std::cout << hex_pair;
    }
    std::cout << std::endl;
}
// Print the end-of-content marker line "-EOF-" to standard output.
void
ParserCallbacks::handleEOF()
{
    std::cout << "-EOF-";
    std::cout << std::endl;
}
static std::string getPageContents(QPDFObjectHandle page) static std::string getPageContents(QPDFObjectHandle page)
{ {
PointerHolder<Buffer> b1 = PointerHolder<Buffer> b1 =
@ -1245,6 +1284,19 @@ void runtest(int n, char const* filename1, char const* arg2)
} }
} }
} }
else if (n == 37)
{
// Parse content streams of all pages
std::vector<QPDFObjectHandle> pages = pdf.getAllPages();
for (std::vector<QPDFObjectHandle>::iterator iter = pages.begin();
iter != pages.end(); ++iter)
{
QPDFObjectHandle page = *iter;
QPDFObjectHandle contents = page.getKey("/Contents");
ParserCallbacks cb;
QPDFObjectHandle::parseContentStream(contents, &cb);
}
}
else else
{ {
throw std::runtime_error(std::string("invalid test ") + throw std::runtime_error(std::string("invalid test ") +