diff --git a/ChangeLog b/ChangeLog index e4919ca6..0a3b3d81 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,9 @@ 2013-01-20 Jay Berkenbilt + * Added QPDFObjectHandle::parseContentStream, which parses the + objects in a content stream and calls handlers in a callback + class. The example pdf-parse-content illustrates its use. + * Added QPDF_Keyword and QPDF_InlineImage types along with appropriate wrapper methods in QPDFObjectHandle. These new object types are to facilitate content stream parsing. diff --git a/examples/build.mk b/examples/build.mk index 12734b1b..bcb4440e 100644 --- a/examples/build.mk +++ b/examples/build.mk @@ -4,7 +4,8 @@ BINS_examples = \ pdf-npages \ pdf-double-page-size \ pdf-invert-images \ - pdf-create + pdf-create \ + pdf-parse-content CBINS_examples = pdf-linearize TARGETS_examples = $(foreach B,$(BINS_examples) $(CBINS_examples),examples/$(OUTPUT_DIR)/$(call binname,$(B))) diff --git a/examples/pdf-parse-content.cc b/examples/pdf-parse-content.cc new file mode 100644 index 00000000..1c3cae16 --- /dev/null +++ b/examples/pdf-parse-content.cc @@ -0,0 +1,97 @@ +#include +#include +#include + +#include +#include + +static char const* whoami = 0; + +void usage() +{ + std::cerr << "Usage: " << whoami << " filename page-number" << std::endl + << "Prints a dump of the objects in the content streams" + << " of the given page." << std::endl + << "Pages are numbered from 1." 
<< std::endl; + exit(2); +} + +class ParserCallbacks: public QPDFObjectHandle::ParserCallbacks +{ + public: + virtual ~ParserCallbacks() + { + } + + virtual void handleObject(QPDFObjectHandle); + virtual void handleEOF(); +}; + +void +ParserCallbacks::handleObject(QPDFObjectHandle obj) +{ + if (obj.isInlineImage()) + { + std::string val = obj.getInlineImageValue(); + std::cout << "inline image: "; + char buf[3]; + buf[2] = '\0'; + for (size_t i = 0; i < val.length(); ++i) + { + sprintf(buf, "%02x", (unsigned char)(val[i])); + std::cout << buf; + } + std::cout << std::endl; + } + else + { + std::cout << obj.unparse() << std::endl; + } +} + +void +ParserCallbacks::handleEOF() +{ + std::cout << "-EOF-" << std::endl; +} + +int main(int argc, char* argv[]) +{ + whoami = QUtil::getWhoami(argv[0]); + + // For libtool's sake.... + if (strncmp(whoami, "lt-", 3) == 0) + { + whoami += 3; + } + + if (argc != 3) + { + usage(); + } + char const* filename = argv[1]; + int pageno = atoi(argv[2]); + + try + { + QPDF pdf; + pdf.processFile(filename); + std::vector pages = pdf.getAllPages(); + if ((pageno < 1) || (pageno > (int)pages.size())) + { + usage(); + } + + QPDFObjectHandle page = pages[pageno-1]; + QPDFObjectHandle contents = page.getKey("/Contents"); + ParserCallbacks cb; + QPDFObjectHandle::parseContentStream(contents, &cb); + } + catch (std::exception& e) + { + std::cerr << whoami << ": " << e.what() << std::endl; + exit(2); + } + + return 0; +} diff --git a/examples/qtest/parse-content.test b/examples/qtest/parse-content.test new file mode 100644 index 00000000..a73566f8 --- /dev/null +++ b/examples/qtest/parse-content.test @@ -0,0 +1,17 @@ +#!/usr/bin/env perl +require 5.008; +BEGIN { $^W = 1; } +use strict; + +chdir("parse-content"); + +require TestDriver; + +my $td = new TestDriver('pdf-parse-content'); + +$td->runtest("parse content", + {$td->COMMAND => "pdf-parse-content input.pdf 1"}, + {$td->FILE => "content.out", $td->EXIT_STATUS => 0}, + 
$td->NORMALIZE_NEWLINES); + +$td->report(1); diff --git a/examples/qtest/parse-content/content.out b/examples/qtest/parse-content/content.out new file mode 100644 index 00000000..9c07edc2 --- /dev/null +++ b/examples/qtest/parse-content/content.out @@ -0,0 +1,11 @@ +BT +/F1 +24 +Tf +72 +720 +Td +(Potato) +Tj +ET +-EOF- diff --git a/examples/qtest/parse-content/input.pdf b/examples/qtest/parse-content/input.pdf new file mode 100644 index 00000000..cd319591 Binary files /dev/null and b/examples/qtest/parse-content/input.pdf differ diff --git a/include/qpdf/QPDFObjectHandle.hh b/include/qpdf/QPDFObjectHandle.hh index 932a6678..c4a922d1 100644 --- a/include/qpdf/QPDFObjectHandle.hh +++ b/include/qpdf/QPDFObjectHandle.hh @@ -71,6 +71,21 @@ class QPDFObjectHandle virtual void decryptString(std::string& val) = 0; }; + // This class is used by parseContentStream. Callers must + // instantiate a subclass of this with handlers defined to accept + // QPDFObjectHandles that are parsed from the stream. 
+ class ParserCallbacks + { + public: + QPDF_DLL + virtual ~ParserCallbacks() + { + } + virtual void handleObject(QPDFObjectHandle) = 0; + virtual void handleEOF() = 0; + }; + + QPDF_DLL QPDFObjectHandle(); QPDF_DLL @@ -138,6 +153,11 @@ class QPDFObjectHandle StringDecrypter* decrypter, QPDF* context); + // Helpers for parsing content streams + QPDF_DLL + static void parseContentStream(QPDFObjectHandle stream_or_array, + ParserCallbacks* callbacks); + // Type-specific factories QPDF_DLL static QPDFObjectHandle newNull(); @@ -571,7 +591,10 @@ class QPDFObjectHandle std::string const& object_description, QPDFTokenizer& tokenizer, bool& empty, StringDecrypter* decrypter, QPDF* context, - bool in_array, bool in_dictionary); + bool in_array, bool in_dictionary, + bool content_stream); + static void parseContentStream_internal( + QPDFObjectHandle stream, ParserCallbacks* callbacks); bool initialized; diff --git a/include/qpdf/QPDFTokenizer.hh b/include/qpdf/QPDFTokenizer.hh index 1835fcb1..081e12d3 100644 --- a/include/qpdf/QPDFTokenizer.hh +++ b/include/qpdf/QPDFTokenizer.hh @@ -18,6 +18,8 @@ class QPDFTokenizer { public: + // Token type tt_eof is only returned if allowEOF() is called on + // the tokenizer. tt_eof was introduced in QPDF version 4.1. enum token_type_e { tt_bad, @@ -34,6 +36,7 @@ class QPDFTokenizer tt_null, tt_bool, tt_word, + tt_eof, }; class Token @@ -97,6 +100,12 @@ class QPDFTokenizer QPDF_DLL void allowPoundAnywhereInName(); + // If called, treat EOF as a separate token type instead of an + // error. This was introduced in QPDF 4.1 to facilitate + // tokenizing content streams. 
+ QPDF_DLL + void allowEOF(); + // Mode of operation: // Keep presenting characters and calling getToken() until @@ -140,6 +149,7 @@ class QPDFTokenizer st_literal, st_in_hexstring, st_token_ready } state; bool pound_special_in_name; + bool allow_eof; // Current token accumulation token_type_e type; diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc index 9b51a0cb..bfca3f08 100644 --- a/libqpdf/QPDFObjectHandle.cc +++ b/libqpdf/QPDFObjectHandle.cc @@ -680,6 +680,106 @@ QPDFObjectHandle::parse(std::string const& object_str, return result; } +void +QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array, + ParserCallbacks* callbacks) +{ + std::vector streams; + if (stream_or_array.isArray()) + { + streams = stream_or_array.getArrayAsVector(); + } + else + { + streams.push_back(stream_or_array); + } + for (std::vector::iterator iter = streams.begin(); + iter != streams.end(); ++iter) + { + QPDFObjectHandle stream = *iter; + if (! stream.isStream()) + { + throw std::logic_error( + "QPDFObjectHandle: parseContentStream called on non-stream"); + } + parseContentStream_internal(stream, callbacks); + } + callbacks->handleEOF(); +} + +void +QPDFObjectHandle::parseContentStream_internal(QPDFObjectHandle stream, + ParserCallbacks* callbacks) +{ + stream.assertStream(); + PointerHolder stream_data = stream.getStreamData(); + size_t length = stream_data->getSize(); + std::string description = "content stream object " + + QUtil::int_to_string(stream.getObjectID()) + " " + + QUtil::int_to_string(stream.getGeneration()); + PointerHolder input = + new BufferInputSource(description, stream_data.getPointer()); + QPDFTokenizer tokenizer; + tokenizer.allowEOF(); + bool empty = false; + while ((size_t) input->tell() < length) + { + QPDFObjectHandle obj = + parseInternal(input, "content", tokenizer, empty, + 0, 0, false, false, true); + if (! 
obj.isInitialized()) + { + // EOF + break; + } + + callbacks->handleObject(obj); + if (obj.isKeyword() && (obj.getKeywordValue() == "ID")) + { + // Discard next character; it is the space after ID that + // terminated the token. Read until end of inline image. + char ch; + input->read(&ch, 1); + char buf[4]; + memset(buf, '\0', sizeof(buf)); + bool done = false; + std::string inline_image; + while (! done) + { + if (input->read(&ch, 1) == 0) + { + QTC::TC("qpdf", "QPDFObjectHandle EOF in inline image"); + throw QPDFExc(qpdf_e_damaged_pdf, input->getName(), + "stream data", input->tell(), + "EOF found while reading inline image"); + } + inline_image += ch; + memmove(buf, buf + 1, sizeof(buf) - 1); + buf[sizeof(buf) - 1] = ch; + if (strchr(" \t\n\v\f\r", buf[0]) && + (buf[1] == 'E') && + (buf[2] == 'I') && + strchr(" \t\n\v\f\r", buf[3])) + { + // We've found an EI operator. + done = true; + input->seek(-3, SEEK_CUR); + for (int i = 0; i < 4; ++i) + { + if (inline_image.length() > 0) + { + inline_image.erase(inline_image.length() - 1); + } + } + } + } + QTC::TC("qpdf", "QPDFObjectHandle inline image token"); + callbacks->handleObject( + QPDFObjectHandle::newInlineImage(inline_image)); + } + } +} + QPDFObjectHandle QPDFObjectHandle::parse(PointerHolder input, std::string const& object_description, @@ -687,7 +787,7 @@ QPDFObjectHandle::parse(PointerHolder input, StringDecrypter* decrypter, QPDF* context) { return parseInternal(input, object_description, tokenizer, empty, - decrypter, context, false, false); + decrypter, context, false, false, false); } QPDFObjectHandle @@ -695,7 +795,8 @@ QPDFObjectHandle::parseInternal(PointerHolder input, std::string const& object_description, QPDFTokenizer& tokenizer, bool& empty, StringDecrypter* decrypter, QPDF* context, - bool in_array, bool in_dictionary) + bool in_array, bool in_dictionary, + bool content_stream) { empty = false; if (in_dictionary && in_array) @@ -721,6 +822,21 @@ QPDFObjectHandle::parseInternal(PointerHolder 
input, switch (token.getType()) { + case QPDFTokenizer::tt_eof: + if (content_stream) + { + // Return uninitialized object to indicate EOF + return object; + } + else + { + // When not in content stream mode, EOF is tt_bad and + // throws an exception before we get here. + throw std::logic_error( + "EOF received while not in content stream mode"); + } + break; + case QPDFTokenizer::tt_brace_open: case QPDFTokenizer::tt_brace_close: // Don't know what to do with these for now @@ -764,13 +880,13 @@ QPDFObjectHandle::parseInternal(PointerHolder input, case QPDFTokenizer::tt_array_open: object = parseInternal( input, object_description, tokenizer, empty, - decrypter, context, true, false); + decrypter, context, true, false, content_stream); break; case QPDFTokenizer::tt_dict_open: object = parseInternal( input, object_description, tokenizer, empty, - decrypter, context, false, true); + decrypter, context, false, true, content_stream); break; case QPDFTokenizer::tt_bool: @@ -826,6 +942,10 @@ QPDFObjectHandle::parseInternal(PointerHolder input, input->seek(input->getLastOffset(), SEEK_SET); empty = true; } + else if (content_stream) + { + object = QPDFObjectHandle::newKeyword(token.getValue()); + } else { throw QPDFExc(qpdf_e_damaged_pdf, input->getName(), diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc index 1a20bb5a..a6333b73 100644 --- a/libqpdf/QPDFTokenizer.cc +++ b/libqpdf/QPDFTokenizer.cc @@ -22,7 +22,8 @@ static bool is_space(char ch) } QPDFTokenizer::QPDFTokenizer() : - pound_special_in_name(true) + pound_special_in_name(true), + allow_eof(false) { reset(); } @@ -34,6 +35,12 @@ QPDFTokenizer::allowPoundAnywhereInName() this->pound_special_in_name = false; } +void +QPDFTokenizer::allowEOF() +{ + this->allow_eof = true; +} + void QPDFTokenizer::reset() { @@ -441,9 +448,17 @@ QPDFTokenizer::presentEOF() } else if (state != st_token_ready) { - QTC::TC("qpdf", "QPDF_Tokenizer EOF reading token"); - type = tt_bad; - error_message = "EOF while reading 
token"; + QTC::TC("qpdf", "QPDF_Tokenizer EOF reading token", + this->allow_eof ? 1 : 0); + if (this->allow_eof) + { + type = tt_eof; + } + else + { + type = tt_bad; + error_message = "EOF while reading token"; + } } state = st_token_ready; diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov index a0578f28..b09e966c 100644 --- a/qpdf/qpdf.testcov +++ b/qpdf/qpdf.testcov @@ -236,7 +236,7 @@ QPDFWriter copy use_aes 1 QPDFObjectHandle indirect without context 0 QPDFObjectHandle trailing data in parse 0 qpdf pages encryption password 0 -QPDF_Tokenizer EOF reading token 0 +QPDF_Tokenizer EOF reading token 1 QPDF_Tokenizer EOF reading appendable token 0 QPDFWriter extra header text no newline 0 QPDFWriter extra header text add newline 0 @@ -259,3 +259,5 @@ QPDFWriter remove Crypt 0 qpdf-c called qpdf_get_pdf_extension_level 0 qpdf-c called qpdf_set_r5_encryption_parameters 0 qpdf-c called qpdf_set_r6_encryption_parameters 0 +QPDFObjectHandle EOF in inline image 0 +QPDFObjectHandle inline image token 0 diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test index bf62ceea..8d2b5cfc 100644 --- a/qpdf/qtest/qpdf.test +++ b/qpdf/qtest/qpdf.test @@ -199,7 +199,7 @@ $td->runtest("remove page we don't have", show_ntests(); # ---------- $td->notify("--- Miscellaneous Tests ---"); -$n_tests += 57; +$n_tests += 59; $td->runtest("qpdf version", {$td->COMMAND => "qpdf --version"}, @@ -468,6 +468,16 @@ $td->runtest("check file with leading junk", {$td->COMMAND => "qpdf --check leading-junk.pdf"}, {$td->FILE => "leading-junk.out", $td->EXIT_STATUS => 0}, $td->NORMALIZE_NEWLINES); +$td->runtest("EOF inside inline image", + {$td->COMMAND => "test_driver 37 eof-in-inline-image.pdf"}, + {$td->FILE => "eof-in-inline-image.out", + $td->EXIT_STATUS => 2}, + $td->NORMALIZE_NEWLINES); +$td->runtest("tokenize content streams", + {$td->COMMAND => "test_driver 37 tokenize-content-streams.pdf"}, + {$td->FILE => "tokenize-content-streams.out", + $td->EXIT_STATUS => 0}, + $td->NORMALIZE_NEWLINES); 
show_ntests(); # ---------- diff --git a/qpdf/qtest/qpdf/eof-in-inline-image.out b/qpdf/qtest/qpdf/eof-in-inline-image.out new file mode 100644 index 00000000..8ac365c4 --- /dev/null +++ b/qpdf/qtest/qpdf/eof-in-inline-image.out @@ -0,0 +1,25 @@ +BT +/F1 +24 +Tf +72 +720 +Td +(Potato) +Tj +ET +BI +/CS +/G +/W +1 +/H +1 +/BPC +8 +/F +/Fl +/DP +<< /Columns 1 /Predictor 15 >> +ID +content stream object 4 0 (stream data, file position 139): EOF found while reading inline image diff --git a/qpdf/qtest/qpdf/eof-in-inline-image.pdf b/qpdf/qtest/qpdf/eof-in-inline-image.pdf new file mode 100644 index 00000000..e970b77d Binary files /dev/null and b/qpdf/qtest/qpdf/eof-in-inline-image.pdf differ diff --git a/qpdf/qtest/qpdf/tokenize-content-streams.out b/qpdf/qtest/qpdf/tokenize-content-streams.out new file mode 100644 index 00000000..9bc933dc --- /dev/null +++ b/qpdf/qtest/qpdf/tokenize-content-streams.out @@ -0,0 +1,95 @@ +BT +/F1 +24 +Tf +72 +720 +Td +(Potato) +Tj +ET +-EOF- +0.1 +0 +0 +0.1 +0 +0 +cm +q +0 +1.1999 +-1.1999 +0 +121.19 +150.009 +cm +BI +/CS +/G +/W +1 +/H +1 +/BPC +8 +/F +/Fl +/DP +<< /Columns 1 /Predictor 15 >> +ID +inline image: 789c63fc0f0001030101 +EI +Q +q +0 +35.997 +-128.389 +0 +431.964 +7269.02 +cm +BI +/CS +/G +/W +30 +/H +107 +/BPC +8 +/F +/Fl +/DP +<< /Columns 30 /Predictor 15 >> +ID +inline image: 789cedd1a11100300800b1b2ffd06503148283bc8dfcf8af2a306ee352eff2e06318638c31c63b3801627b620a +EI +Q +q +0 +38.3968 +-93.5922 +0 +431.964 +7567.79 +cm +BI +/CS +/G +/W +32 +/H +78 +/BPC +8 +/F +/Fl +/DP +<< /Columns 32 /Predictor 15 >> +ID +inline image: 789c63fccf801f308e2a185530aa60882a20203faa605401890a0643aa1e5530aa6054010d140000bdd03c13 +EI +Q +-EOF- +test 37 done diff --git a/qpdf/qtest/qpdf/tokenize-content-streams.pdf b/qpdf/qtest/qpdf/tokenize-content-streams.pdf new file mode 100644 index 00000000..ea97a6e2 Binary files /dev/null and b/qpdf/qtest/qpdf/tokenize-content-streams.pdf differ diff --git a/qpdf/test_driver.cc b/qpdf/test_driver.cc 
index 48017908..cd6aa991 100644 --- a/qpdf/test_driver.cc +++ b/qpdf/test_driver.cc @@ -58,6 +58,45 @@ class Provider: public QPDFObjectHandle::StreamDataProvider bool bad_length; }; +class ParserCallbacks: public QPDFObjectHandle::ParserCallbacks +{ + public: + virtual ~ParserCallbacks() + { + } + + virtual void handleObject(QPDFObjectHandle); + virtual void handleEOF(); +}; + +void +ParserCallbacks::handleObject(QPDFObjectHandle obj) +{ + if (obj.isInlineImage()) + { + std::string val = obj.getInlineImageValue(); + std::cout << "inline image: "; + char buf[3]; + buf[2] = '\0'; + for (size_t i = 0; i < val.length(); ++i) + { + sprintf(buf, "%02x", (unsigned char)(val[i])); + std::cout << buf; + } + std::cout << std::endl; + } + else + { + std::cout << obj.unparse() << std::endl; + } +} + +void +ParserCallbacks::handleEOF() +{ + std::cout << "-EOF-" << std::endl; +} + static std::string getPageContents(QPDFObjectHandle page) { PointerHolder b1 = @@ -1245,6 +1284,19 @@ void runtest(int n, char const* filename1, char const* arg2) } } } + else if (n == 37) + { + // Parse content streams of all pages + std::vector pages = pdf.getAllPages(); + for (std::vector::iterator iter = pages.begin(); + iter != pages.end(); ++iter) + { + QPDFObjectHandle page = *iter; + QPDFObjectHandle contents = page.getKey("/Contents"); + ParserCallbacks cb; + QPDFObjectHandle::parseContentStream(contents, &cb); + } + } else { throw std::runtime_error(std::string("invalid test ") +