From f81152311e5737e5e0de9dd9462311f306c6921b Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Sun, 20 Jan 2013 15:26:45 -0500 Subject: [PATCH] Add QPDFObjectHandle::parseContentStream method This method allows parsing of the PDF objects in a content stream or array of content streams. --- ChangeLog | 4 + examples/build.mk | 3 +- examples/pdf-parse-content.cc | 97 ++++++++++++++ examples/qtest/parse-content.test | 17 +++ examples/qtest/parse-content/content.out | 11 ++ examples/qtest/parse-content/input.pdf | Bin 0 -> 799 bytes include/qpdf/QPDFObjectHandle.hh | 25 +++- include/qpdf/QPDFTokenizer.hh | 10 ++ libqpdf/QPDFObjectHandle.cc | 128 ++++++++++++++++++- libqpdf/QPDFTokenizer.cc | 23 +++- qpdf/qpdf.testcov | 4 +- qpdf/qtest/qpdf.test | 12 +- qpdf/qtest/qpdf/eof-in-inline-image.out | 25 ++++ qpdf/qtest/qpdf/eof-in-inline-image.pdf | Bin 0 -> 870 bytes qpdf/qtest/qpdf/tokenize-content-streams.out | 95 ++++++++++++++ qpdf/qtest/qpdf/tokenize-content-streams.pdf | Bin 0 -> 1539 bytes qpdf/test_driver.cc | 52 ++++++++ 17 files changed, 494 insertions(+), 12 deletions(-) create mode 100644 examples/pdf-parse-content.cc create mode 100644 examples/qtest/parse-content.test create mode 100644 examples/qtest/parse-content/content.out create mode 100644 examples/qtest/parse-content/input.pdf create mode 100644 qpdf/qtest/qpdf/eof-in-inline-image.out create mode 100644 qpdf/qtest/qpdf/eof-in-inline-image.pdf create mode 100644 qpdf/qtest/qpdf/tokenize-content-streams.out create mode 100644 qpdf/qtest/qpdf/tokenize-content-streams.pdf diff --git a/ChangeLog b/ChangeLog index e4919ca6..0a3b3d81 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,9 @@ 2013-01-20 Jay Berkenbilt + * Added QPDFObjectHandle::parseContentStream, which parses the + objects in a content stream and calls handlers in a callback + class. The example pdf-parse-content illustrates its use. + * Added QPDF_Keyword and QPDF_InlineImage types along with appropriate wrapper methods in QPDFObjectHandle. 
These new object types are to facilitate content stream parsing. diff --git a/examples/build.mk b/examples/build.mk index 12734b1b..bcb4440e 100644 --- a/examples/build.mk +++ b/examples/build.mk @@ -4,7 +4,8 @@ BINS_examples = \ pdf-npages \ pdf-double-page-size \ pdf-invert-images \ - pdf-create + pdf-create \ + pdf-parse-content CBINS_examples = pdf-linearize TARGETS_examples = $(foreach B,$(BINS_examples) $(CBINS_examples),examples/$(OUTPUT_DIR)/$(call binname,$(B))) diff --git a/examples/pdf-parse-content.cc b/examples/pdf-parse-content.cc new file mode 100644 index 00000000..1c3cae16 --- /dev/null +++ b/examples/pdf-parse-content.cc @@ -0,0 +1,97 @@ +#include +#include +#include + +#include +#include + +static char const* whoami = 0; + +void usage() +{ + std::cerr << "Usage: " << whoami << " filename page-number" << std::endl + << "Prints a dump of the objects in the content streams" + << " of the given page." << std::endl + << "Pages are numbered from 1." << std::endl; + exit(2); +} + +class ParserCallbacks: public QPDFObjectHandle::ParserCallbacks +{ + public: + virtual ~ParserCallbacks() + { + } + + virtual void handleObject(QPDFObjectHandle); + virtual void handleEOF(); +}; + +void +ParserCallbacks::handleObject(QPDFObjectHandle obj) +{ + if (obj.isInlineImage()) + { + std::string val = obj.getInlineImageValue(); + std::cout << "inline image: "; + char buf[3]; + buf[2] = '\0'; + for (size_t i = 0; i < val.length(); ++i) + { + sprintf(buf, "%02x", (unsigned char)(val[i])); + std::cout << buf; + } + std::cout << std::endl; + } + else + { + std::cout << obj.unparse() << std::endl; + } +} + +void +ParserCallbacks::handleEOF() +{ + std::cout << "-EOF-" << std::endl; +} + +int main(int argc, char* argv[]) +{ + whoami = QUtil::getWhoami(argv[0]); + + // For libtool's sake.... 
+ if (strncmp(whoami, "lt-", 3) == 0) + { + whoami += 3; + } + + if (argc != 3) + { + usage(); + } + char const* filename = argv[1]; + int pageno = atoi(argv[2]); + + try + { + QPDF pdf; + pdf.processFile(filename); + std::vector pages = pdf.getAllPages(); + if ((pageno < 1) || (pageno > (int)pages.size())) + { + usage(); + } + + QPDFObjectHandle page = pages[pageno-1]; + QPDFObjectHandle contents = page.getKey("/Contents"); + ParserCallbacks cb; + QPDFObjectHandle::parseContentStream(contents, &cb); + } + catch (std::exception& e) + { + std::cerr << whoami << ": " << e.what() << std::endl; + exit(2); + } + + return 0; +} diff --git a/examples/qtest/parse-content.test b/examples/qtest/parse-content.test new file mode 100644 index 00000000..a73566f8 --- /dev/null +++ b/examples/qtest/parse-content.test @@ -0,0 +1,17 @@ +#!/usr/bin/env perl +require 5.008; +BEGIN { $^W = 1; } +use strict; + +chdir("parse-content"); + +require TestDriver; + +my $td = new TestDriver('pdf-parse-content'); + +$td->runtest("parse content", + {$td->COMMAND => "pdf-parse-content input.pdf 1"}, + {$td->FILE => "content.out", $td->EXIT_STATUS => 0}, + $td->NORMALIZE_NEWLINES); + +$td->report(1); diff --git a/examples/qtest/parse-content/content.out b/examples/qtest/parse-content/content.out new file mode 100644 index 00000000..9c07edc2 --- /dev/null +++ b/examples/qtest/parse-content/content.out @@ -0,0 +1,11 @@ +BT +/F1 +24 +Tf +72 +720 +Td +(Potato) +Tj +ET +-EOF- diff --git a/examples/qtest/parse-content/input.pdf b/examples/qtest/parse-content/input.pdf new file mode 100644 index 0000000000000000000000000000000000000000..cd3195910bc5352cbf1f673a70f6c4bdcb0a2961 GIT binary patch literal 799 zcmah{&2G~`5C$rwuoCYuNaP0CUjHPKA`9BoYQ$eiB5{av*w~X`a-$n^3_yxT~nn- zsSEHn<(^}bihZ@BEH*uX!=csTK)D7Yr@%_q5DT@;W7269M~ZIB4+s1*Q5X^s}x6j-)k9bxtk zbs7t>4keXLu`aPI*eeT*b5pxtBfq}>Ub| streams; + if (stream_or_array.isArray()) + { + streams = stream_or_array.getArrayAsVector(); + } + else + { + 
streams.push_back(stream_or_array); + } + for (std::vector::iterator iter = streams.begin(); + iter != streams.end(); ++iter) + { + QPDFObjectHandle stream = *iter; + if (! stream.isStream()) + { + throw std::logic_error( + "QPDFObjectHandle: parseContentStream called on non-stream"); + } + parseContentStream_internal(stream, callbacks); + } + callbacks->handleEOF(); +} + +void +QPDFObjectHandle::parseContentStream_internal(QPDFObjectHandle stream, + ParserCallbacks* callbacks) +{ + stream.assertStream(); + PointerHolder stream_data = stream.getStreamData(); + size_t length = stream_data->getSize(); + std::string description = "content stream object " + + QUtil::int_to_string(stream.getObjectID()) + " " + + QUtil::int_to_string(stream.getGeneration()); + PointerHolder input = + new BufferInputSource(description, stream_data.getPointer()); + QPDFTokenizer tokenizer; + tokenizer.allowEOF(); + bool empty = false; + while ((size_t) input->tell() < length) + { + QPDFObjectHandle obj = + parseInternal(input, "content", tokenizer, empty, + 0, 0, false, false, true); + if (! obj.isInitialized()) + { + // EOF + break; + } + + callbacks->handleObject(obj); + if (obj.isKeyword() && (obj.getKeywordValue() == "ID")) + { + // Discard next character; it is the space after ID that + // terminated the token. Read until end of inline image. + char ch; + input->read(&ch, 1); + char buf[4]; + memset(buf, '\0', sizeof(buf)); + bool done = false; + std::string inline_image; + while (! done) + { + if (input->read(&ch, 1) == 0) + { + QTC::TC("qpdf", "QPDFObjectHandle EOF in inline image"); + throw QPDFExc(qpdf_e_damaged_pdf, input->getName(), + "stream data", input->tell(), + "EOF found while reading inline image"); + } + inline_image += ch; + memmove(buf, buf + 1, sizeof(buf) - 1); + buf[sizeof(buf) - 1] = ch; + if (strchr(" \t\n\v\f\r", buf[0]) && + (buf[1] == 'E') && + (buf[2] == 'I') && + strchr(" \t\n\v\f\r", buf[3])) + { + // We've found an EI operator. 
+ done = true; + input->seek(-3, SEEK_CUR); + for (int i = 0; i < 4; ++i) + { + if (inline_image.length() > 0) + { + inline_image.erase(inline_image.length() - 1); + } + } + } + } + QTC::TC("qpdf", "QPDFObjectHandle inline image token"); + callbacks->handleObject( + QPDFObjectHandle::newInlineImage(inline_image)); + } + } +} + QPDFObjectHandle QPDFObjectHandle::parse(PointerHolder input, std::string const& object_description, @@ -687,7 +787,7 @@ QPDFObjectHandle::parse(PointerHolder input, StringDecrypter* decrypter, QPDF* context) { return parseInternal(input, object_description, tokenizer, empty, - decrypter, context, false, false); + decrypter, context, false, false, false); } QPDFObjectHandle @@ -695,7 +795,8 @@ QPDFObjectHandle::parseInternal(PointerHolder input, std::string const& object_description, QPDFTokenizer& tokenizer, bool& empty, StringDecrypter* decrypter, QPDF* context, - bool in_array, bool in_dictionary) + bool in_array, bool in_dictionary, + bool content_stream) { empty = false; if (in_dictionary && in_array) @@ -721,6 +822,21 @@ QPDFObjectHandle::parseInternal(PointerHolder input, switch (token.getType()) { + case QPDFTokenizer::tt_eof: + if (content_stream) + { + // Return uninitialized object to indicate EOF + return object; + } + else + { + // When not in content stream mode, EOF is tt_bad and + // throws an exception before we get here. 
+ throw std::logic_error( + "EOF received while not in content stream mode"); + } + break; + case QPDFTokenizer::tt_brace_open: case QPDFTokenizer::tt_brace_close: // Don't know what to do with these for now @@ -764,13 +880,13 @@ QPDFObjectHandle::parseInternal(PointerHolder input, case QPDFTokenizer::tt_array_open: object = parseInternal( input, object_description, tokenizer, empty, - decrypter, context, true, false); + decrypter, context, true, false, content_stream); break; case QPDFTokenizer::tt_dict_open: object = parseInternal( input, object_description, tokenizer, empty, - decrypter, context, false, true); + decrypter, context, false, true, content_stream); break; case QPDFTokenizer::tt_bool: @@ -826,6 +942,10 @@ QPDFObjectHandle::parseInternal(PointerHolder input, input->seek(input->getLastOffset(), SEEK_SET); empty = true; } + else if (content_stream) + { + object = QPDFObjectHandle::newKeyword(token.getValue()); + } else { throw QPDFExc(qpdf_e_damaged_pdf, input->getName(), diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc index 1a20bb5a..a6333b73 100644 --- a/libqpdf/QPDFTokenizer.cc +++ b/libqpdf/QPDFTokenizer.cc @@ -22,7 +22,8 @@ static bool is_space(char ch) } QPDFTokenizer::QPDFTokenizer() : - pound_special_in_name(true) + pound_special_in_name(true), + allow_eof(false) { reset(); } @@ -34,6 +35,12 @@ QPDFTokenizer::allowPoundAnywhereInName() this->pound_special_in_name = false; } +void +QPDFTokenizer::allowEOF() +{ + this->allow_eof = true; +} + void QPDFTokenizer::reset() { @@ -441,9 +448,17 @@ QPDFTokenizer::presentEOF() } else if (state != st_token_ready) { - QTC::TC("qpdf", "QPDF_Tokenizer EOF reading token"); - type = tt_bad; - error_message = "EOF while reading token"; + QTC::TC("qpdf", "QPDF_Tokenizer EOF reading token", + this->allow_eof ? 
1 : 0); + if (this->allow_eof) + { + type = tt_eof; + } + else + { + type = tt_bad; + error_message = "EOF while reading token"; + } } state = st_token_ready; diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov index a0578f28..b09e966c 100644 --- a/qpdf/qpdf.testcov +++ b/qpdf/qpdf.testcov @@ -236,7 +236,7 @@ QPDFWriter copy use_aes 1 QPDFObjectHandle indirect without context 0 QPDFObjectHandle trailing data in parse 0 qpdf pages encryption password 0 -QPDF_Tokenizer EOF reading token 0 +QPDF_Tokenizer EOF reading token 1 QPDF_Tokenizer EOF reading appendable token 0 QPDFWriter extra header text no newline 0 QPDFWriter extra header text add newline 0 @@ -259,3 +259,5 @@ QPDFWriter remove Crypt 0 qpdf-c called qpdf_get_pdf_extension_level 0 qpdf-c called qpdf_set_r5_encryption_parameters 0 qpdf-c called qpdf_set_r6_encryption_parameters 0 +QPDFObjectHandle EOF in inline image 0 +QPDFObjectHandle inline image token 0 diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test index bf62ceea..8d2b5cfc 100644 --- a/qpdf/qtest/qpdf.test +++ b/qpdf/qtest/qpdf.test @@ -199,7 +199,7 @@ $td->runtest("remove page we don't have", show_ntests(); # ---------- $td->notify("--- Miscellaneous Tests ---"); -$n_tests += 57; +$n_tests += 59; $td->runtest("qpdf version", {$td->COMMAND => "qpdf --version"}, @@ -468,6 +468,16 @@ $td->runtest("check file with leading junk", {$td->COMMAND => "qpdf --check leading-junk.pdf"}, {$td->FILE => "leading-junk.out", $td->EXIT_STATUS => 0}, $td->NORMALIZE_NEWLINES); +$td->runtest("EOF inside inline image", + {$td->COMMAND => "test_driver 37 eof-in-inline-image.pdf"}, + {$td->FILE => "eof-in-inline-image.out", + $td->EXIT_STATUS => 2}, + $td->NORMALIZE_NEWLINES); +$td->runtest("tokenize content streams", + {$td->COMMAND => "test_driver 37 tokenize-content-streams.pdf"}, + {$td->FILE => "tokenize-content-streams.out", + $td->EXIT_STATUS => 0}, + $td->NORMALIZE_NEWLINES); show_ntests(); # ---------- diff --git a/qpdf/qtest/qpdf/eof-in-inline-image.out 
b/qpdf/qtest/qpdf/eof-in-inline-image.out new file mode 100644 index 00000000..8ac365c4 --- /dev/null +++ b/qpdf/qtest/qpdf/eof-in-inline-image.out @@ -0,0 +1,25 @@ +BT +/F1 +24 +Tf +72 +720 +Td +(Potato) +Tj +ET +BI +/CS +/G +/W +1 +/H +1 +/BPC +8 +/F +/Fl +/DP +<< /Columns 1 /Predictor 15 >> +ID +content stream object 4 0 (stream data, file position 139): EOF found while reading inline image diff --git a/qpdf/qtest/qpdf/eof-in-inline-image.pdf b/qpdf/qtest/qpdf/eof-in-inline-image.pdf new file mode 100644 index 0000000000000000000000000000000000000000..e970b77d669828af17a17ac74814411840c638c6 GIT binary patch literal 870 zcmZWn%Z}496s2P44Zmxk(}fEIQRIBvqk!nzw*fV=dT}s2nPnHcpz~MG?P2kfk0#lH2?Zp0i8=L z8?%EXAxc*+&>as6b92xZIMBCs)xjFPK6NuBpXyyw-SH(=TcvFW{;A!2Rn>B4j#!1^ z6LJBQNC4K#WQ&0Vt)=SBp)HZ8W1Jd1={FpLzEF~2Mr+EIvJjq;(=d+DKa;r51D$tD z@9ZbwUgQEg+bX#yvz!3-;2?a+x454mDZwDGaQ{3rR$B7{@&|I0lQGwrUf3*xoHl2-2q1qcMCXvFMmIK zGI}~1Iq@G!Jzj&Gb|62Ko$8gKx2pM~Y+XvAH@Y-ct#?50YyDbx^#%AK_jt897hN3+ z+uySj2>(wy^|db7=45oO)~M_v#UORGuo?VtY}J-9m=Jbe5C(x4z!uIJN4jo-zF;Bh ze}!>@7P-Pii2biHKls015TY}OINQp)QEhLHr7_Ng# zD#k+TRZ48Rub7fW$ygGr2@gX*WOC~HB8sMd6p1L5MYv^*D_h*}7UsGD literal 0 HcmV?d00001 diff --git a/qpdf/qtest/qpdf/tokenize-content-streams.out b/qpdf/qtest/qpdf/tokenize-content-streams.out new file mode 100644 index 00000000..9bc933dc --- /dev/null +++ b/qpdf/qtest/qpdf/tokenize-content-streams.out @@ -0,0 +1,95 @@ +BT +/F1 +24 +Tf +72 +720 +Td +(Potato) +Tj +ET +-EOF- +0.1 +0 +0 +0.1 +0 +0 +cm +q +0 +1.1999 +-1.1999 +0 +121.19 +150.009 +cm +BI +/CS +/G +/W +1 +/H +1 +/BPC +8 +/F +/Fl +/DP +<< /Columns 1 /Predictor 15 >> +ID +inline image: 789c63fc0f0001030101 +EI +Q +q +0 +35.997 +-128.389 +0 +431.964 +7269.02 +cm +BI +/CS +/G +/W +30 +/H +107 +/BPC +8 +/F +/Fl +/DP +<< /Columns 30 /Predictor 15 >> +ID +inline image: 789cedd1a11100300800b1b2ffd06503148283bc8dfcf8af2a306ee352eff2e06318638c31c63b3801627b620a +EI +Q +q +0 +38.3968 +-93.5922 +0 +431.964 +7567.79 +cm +BI +/CS +/G +/W +32 +/H +78 
+/BPC +8 +/F +/Fl +/DP +<< /Columns 32 /Predictor 15 >> +ID +inline image: 789c63fccf801f308e2a185530aa60882a20203faa605401890a0643aa1e5530aa6054010d140000bdd03c13 +EI +Q +-EOF- +test 37 done diff --git a/qpdf/qtest/qpdf/tokenize-content-streams.pdf b/qpdf/qtest/qpdf/tokenize-content-streams.pdf new file mode 100644 index 0000000000000000000000000000000000000000..ea97a6e2bdf724142d49634a0f99a7ca1ae799d9 GIT binary patch literal 1539 zcmbW1&1(}u6u=V%kpU60ho1CZ3R>vS%+7udhKNZTTl|XciiaY&+053IWLGy^sd^E; zD5!sd7d`eOdRDx7wf+Z!XA!*Up{Q>*-KM4$#9gvGdHdd*-}{(1)oL_nxFN{YyYDZ4 z5)KTK_6n&~fVP5VH2@PO3qbwbHx$r%kOjSD8LCyH;*M6-p9_O}vKnV-K+kqN16Tr) zn_BtvYT;t&b>u)RjU2?8in9UOxeYz9I^Cd_tf4Q4x5G_vr3vV(6{M(Gz%8gjvYLiS ztGSyAe#sXcoZL~hiWX@SE~*UNZOJes&9fsOO}h(c3B24&=1<)I|G)}_C8TLIfzJ1F z?;3`q`SzsIc_?VksdzcN4wj{92U)6uKB@TxFklX*1%8BU7c*os@H@CZ-b%6{OHP2l zLS}tkuq~^t(&^k3&TGLy<$0rXs&`9e-7o+;8;41!8!rRB(v45YgYHOhA?RZ};Kp=u zwVma?p##AG!#HwZqay31>#d5&eYKXs)uM-Vatn~HD7kQVgvkuumw0cz!#=r57;uA2 zDFNGe_(qAT*T8LNFeU-@YV(BF7lEFkSAf$w{MA}@@Mx1Zd$iH2RIo9y%fc*4(TvuU z-fBM{07v%u2CO{`HxHG{`^sf9J5Mg{&Y|vzF?7LqqH9iJ8&bN+X?lk65SW1_xFH=2 zi{(gznd78_X;kL^8?0aj?U#?w50}`1(yQ0MHq^d}`w!kc+5GWxn#G?NzJB`@PKJ;9 z`;%U|eYZ`90p)l|B^?iDq%dr0nt$-wj%&EPGAT^tbG?7WGYhD0j>kq{5RYV- z>v0)3v3g^ACLW-1nZdGu@R(Mo2N2aD`FE+_<9;BDl literal 0 HcmV?d00001 diff --git a/qpdf/test_driver.cc b/qpdf/test_driver.cc index 48017908..cd6aa991 100644 --- a/qpdf/test_driver.cc +++ b/qpdf/test_driver.cc @@ -58,6 +58,45 @@ class Provider: public QPDFObjectHandle::StreamDataProvider bool bad_length; }; +class ParserCallbacks: public QPDFObjectHandle::ParserCallbacks +{ + public: + virtual ~ParserCallbacks() + { + } + + virtual void handleObject(QPDFObjectHandle); + virtual void handleEOF(); +}; + +void +ParserCallbacks::handleObject(QPDFObjectHandle obj) +{ + if (obj.isInlineImage()) + { + std::string val = obj.getInlineImageValue(); + std::cout << "inline image: "; + char buf[3]; + buf[2] = '\0'; + for (size_t i = 0; i < val.length(); ++i) + { + 
sprintf(buf, "%02x", (unsigned char)(val[i])); + std::cout << buf; + } + std::cout << std::endl; + } + else + { + std::cout << obj.unparse() << std::endl; + } +} + +void +ParserCallbacks::handleEOF() +{ + std::cout << "-EOF-" << std::endl; +} + static std::string getPageContents(QPDFObjectHandle page) { PointerHolder b1 = @@ -1245,6 +1284,19 @@ void runtest(int n, char const* filename1, char const* arg2) } } } + else if (n == 37) + { + // Parse content streams of all pages + std::vector pages = pdf.getAllPages(); + for (std::vector::iterator iter = pages.begin(); + iter != pages.end(); ++iter) + { + QPDFObjectHandle page = *iter; + QPDFObjectHandle contents = page.getKey("/Contents"); + ParserCallbacks cb; + QPDFObjectHandle::parseContentStream(contents, &cb); + } + } else { throw std::runtime_error(std::string("invalid test ") +