From 5708b5d0aa9c94ab663509fbb865aa27a134aeb3 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Sun, 11 Feb 2018 15:41:02 -0500 Subject: [PATCH] Add additional interface for filtering page contents --- ChangeLog | 6 ++ examples/build.mk | 3 +- examples/pdf-count-strings.cc | 131 ++++++++++++++++++++++++++++ examples/pdf-filter-tokens.cc | 6 +- examples/qtest/count-strings.test | 17 ++++ examples/qtest/count-strings/in.pdf | Bin 0 -> 1348 bytes examples/qtest/count-strings/out | 16 ++++ include/qpdf/QPDFObjectHandle.hh | 24 +++-- libqpdf/QPDFObjectHandle.cc | 20 +++++ 9 files changed, 215 insertions(+), 8 deletions(-) create mode 100644 examples/pdf-count-strings.cc create mode 100644 examples/qtest/count-strings.test create mode 100644 examples/qtest/count-strings/in.pdf create mode 100644 examples/qtest/count-strings/out diff --git a/ChangeLog b/ChangeLog index 97d65238..0c298abb 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +2018-02-11 Jay Berkenbilt + + * Add QPDFObjectHandle::filterPageContents method to provide a + different interface for applying token filters to page contents + without modifying the ultimate output. + 2018-02-04 Jay Berkenbilt * Changes listed on today's date are numerous and reflect diff --git a/examples/build.mk b/examples/build.mk index f5b44669..b5748c11 100644 --- a/examples/build.mk +++ b/examples/build.mk @@ -7,7 +7,8 @@ BINS_examples = \ pdf-create \ pdf-parse-content \ pdf-split-pages \ - pdf-filter-tokens + pdf-filter-tokens \ + pdf-count-strings CBINS_examples = pdf-linearize TARGETS_examples = $(foreach B,$(BINS_examples) $(CBINS_examples),examples/$(OUTPUT_DIR)/$(call binname,$(B))) diff --git a/examples/pdf-count-strings.cc b/examples/pdf-count-strings.cc new file mode 100644 index 00000000..81718298 --- /dev/null +++ b/examples/pdf-count-strings.cc @@ -0,0 +1,131 @@ +// +// This example illustrates the use of QPDFObjectHandle::TokenFilter +// with filterPageContents. See also pdf-filter-tokens.cc for an +// example that uses QPDFObjectHandle::TokenFilter with +// addContentTokenFilter. +// + +#include +#include +#include + +#include +#include +#include +#include + +static char const* whoami = 0; + +void usage() +{ + std::cerr << "Usage: " << whoami << " infile" << std::endl + << "Applies token filters to infile" + << std::endl; + exit(2); +} + +class StringCounter: public QPDFObjectHandle::TokenFilter +{ + public: + StringCounter() : + count(0) + { + } + virtual ~StringCounter() + { + } + virtual void handleToken(QPDFTokenizer::Token const&); + virtual void handleEOF(); + int getCount() const; + + private: + int count; +}; + +void +StringCounter::handleToken(QPDFTokenizer::Token const& token) +{ + // Count string tokens + if (token.getType() == QPDFTokenizer::tt_string) + { + ++this->count; + } + // Preserve input verbatim by passing each token to any specified + // downstream filter. + writeToken(token); +} + +void +StringCounter::handleEOF() +{ + // Write a comment at the end of the stream just to show how we + // can enhance the output if we want. + write("\n% strings found: "); + write(QUtil::int_to_string(this->count)); + // If you override handleEOF, you must always remember to call finish(). + finish(); +} + +int +StringCounter::getCount() const +{ + return this->count; +} + +int main(int argc, char* argv[]) +{ + whoami = QUtil::getWhoami(argv[0]); + + // For libtool's sake.... + if (strncmp(whoami, "lt-", 3) == 0) + { + whoami += 3; + } + + if (argc != 2) + { + usage(); + } + char const* infilename = argv[1]; + + try + { + QPDF pdf; + pdf.processFile(infilename); + std::vector pages = pdf.getAllPages(); + int pageno = 0; + for (std::vector::iterator iter = pages.begin(); + iter != pages.end(); ++iter) + { + QPDFObjectHandle page = *iter; + ++pageno; + // Pass the contents of a page through our string counter. + // If it's an even page, capture the output. This + // illustrates that you may capture any output generated + // by the filter, or you may ignore it. + StringCounter counter; + if (pageno % 2) + { + // Ignore output for odd pages. + page.filterPageContents(&counter); + } + else + { + // Write output to stdout for even pages. + Pl_StdioFile out("stdout", stdout); + std::cout << "% Contents of page " << pageno << std::endl; + page.filterPageContents(&counter, &out); + std::cout << "\n% end " << pageno << std::endl; + } + std::cout << "Page " << pageno + << ": strings = " << counter.getCount() << std::endl; + } + } + catch (std::exception& e) + { + std::cerr << whoami << ": " << e.what() << std::endl; + exit(2); + } + + return 0; +} diff --git a/examples/pdf-filter-tokens.cc b/examples/pdf-filter-tokens.cc index 2566f72c..809c160b 100644 --- a/examples/pdf-filter-tokens.cc +++ b/examples/pdf-filter-tokens.cc @@ -1,6 +1,8 @@ // -// This example illustrates the use of QPDFObjectHandle::TokenFilter. -// Please see comments inline for details. +// This example illustrates the use of QPDFObjectHandle::TokenFilter +// with addContentTokenFilter. Please see comments inline for details. +// See also pdf-count-strings.cc for a use of +// QPDFObjectHandle::TokenFilter with filterPageContents. // #include diff --git a/examples/qtest/count-strings.test b/examples/qtest/count-strings.test new file mode 100644 index 00000000..ba3f835b --- /dev/null +++ b/examples/qtest/count-strings.test @@ -0,0 +1,17 @@ +#!/usr/bin/env perl +require 5.008; +BEGIN { $^W = 1; } +use strict; + +chdir("count-strings"); + +require TestDriver; + +my $td = new TestDriver('pdf-count-strings'); + +$td->runtest("filter tokens", + {$td->COMMAND => "pdf-count-strings in.pdf"}, + {$td->FILE => "out", $td->EXIT_STATUS => 0}, + $td->NORMALIZE_NEWLINES); + +$td->report(1); diff --git a/examples/qtest/count-strings/in.pdf b/examples/qtest/count-strings/in.pdf new file mode 100644 index 0000000000000000000000000000000000000000..591614c40643a3c5c7a5468fc46feca4aecdab86 GIT binary patch literal 1348 zcmcIkO=#3m5H6Nt2_E#K6g<3Q7gT7Im!FpemPOm`VyV?NyT!w{LVn(^(QQ(aS6w}c z(3@V0C_+!gt74_cf)@|fRxG7>wt5v2EO-;=rOj^H#fxYbvdNd3FW-DKlT0yR7{o)0 zka_)c>6d_!ghJ0waH#5q&JbhlDM7&^>DQva*QPBAILGAFHhxQUYG>} z4*GLI4|gKfbCCu$b{**iYV_-jM(|&^8O>8S^pYZC7EyafXe`FUj#b>et&}DCO7B?r ztM?D@t)1&1=vn*t{loa=X1Vw6q2-ZxKaPEUu(VyCf42Q*Yhhz$>*W0Bg{6(vU1#*= z>doF0U8U2;#EI5AH;qOxMa`3 z6csQMOLGoOMv8Xz&G9sLx%V_48B+;|1O2vkvNt|m8@yXw(ICv z(HxVgW|d&o^<<5>*fCAG%4HQI%d(;xnx;#-;TlBu9Le>nuA!NZ>fzipU(1-;5lf6U M4MWIeMlTnH-&;(EbN~PV literal 0 HcmV?d00001 diff --git a/examples/qtest/count-strings/out b/examples/qtest/count-strings/out new file mode 100644 index 00000000..87b024fc --- /dev/null +++ b/examples/qtest/count-strings/out @@ -0,0 +1,16 @@ +Page 1: strings = 3 +% Contents of page 2 +BT + /F1 24 Tf + 72 720 Td + (Four ) Tj + (Five ) Tj + (Six ) + (beautiful ) Tj + (strings) Tj + (!) Tj +ET + +% strings found: 6 +% end 2 +Page 2: strings = 6 diff --git a/include/qpdf/QPDFObjectHandle.hh b/include/qpdf/QPDFObjectHandle.hh index f0b8f2af..1f0d550a 100644 --- a/include/qpdf/QPDFObjectHandle.hh +++ b/include/qpdf/QPDFObjectHandle.hh @@ -80,9 +80,10 @@ class QPDFObjectHandle // The TokenFilter class provides a way to filter content streams // in a lexically aware fashion. TokenFilters can be attached to // streams using the addTokenFilter or addContentTokenFilter - // methods. The handleToken method is called for each token, - // including the eof token, and then handleEOF is called at the - // very end. Handlers may call write (or writeToken) to pass data + // methods or can be applied on the spot by filterPageContents. + // The handleToken method is called for each token, including the + // eof token, and then handleEOF is called at the very end. + // Handlers may call write (or writeToken) to pass data // downstream. The finish() method must be called exactly one time // to ensure that any written data is flushed out. The default // handleEOF calls finish. If you override handleEOF, you must @@ -91,8 +92,9 @@ class QPDFObjectHandle // Failure to call finish() may result in some of the data you // have written being lost. You should not rely on a destructor // for calling finish() since the destructor call may occur later - // than you expect. Please see examples/token-filters.cc for - // examples of using TokenFilters. + // than you expect. Please see examples/pdf-filter-tokens.cc and + // examples/pdf-count-strings.cc for examples of using + // TokenFilters. // // Please note that when you call token.getValue() on a token of // type tt_string, you get the string value without any @@ -255,6 +257,18 @@ class QPDFObjectHandle QPDF_DLL void parsePageContents(ParserCallbacks* callbacks); + // Pass a page's contents through the given TokenFilter. If a + // pipeline is also provided, it will be the target of the write + // methods from the token filter. If a pipeline is not specified, + // any output generated by the token filter will be discarded. Use + // this interface if you need to pass a page's contents through + // filter for work purposes without having that filter + // automatically applied to the page's contents, as happens with + // addContentTokenFilter. See examples/pdf-count-strings.cc for an + // example. + QPDF_DLL + void filterPageContents(TokenFilter* filter, Pipeline* next = 0); + // Pipe a page's contents through the given pipeline. This method // works whether the contents are a single stream or an array of // streams. Call on a page object. diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc index bba95938..5d7b0bb9 100644 --- a/libqpdf/QPDFObjectHandle.cc +++ b/libqpdf/QPDFObjectHandle.cc @@ -15,6 +15,8 @@ #include #include #include +#include +#include #include #include @@ -998,6 +1000,24 @@ QPDFObjectHandle::parsePageContents(ParserCallbacks* callbacks) description, callbacks); } +void +QPDFObjectHandle::filterPageContents(TokenFilter* filter, Pipeline* next) +{ + assertPageObject(); + std::string description = "token filter for page object " + + QUtil::int_to_string(this->objid) + " " + + QUtil::int_to_string(this->generation); + Pl_QPDFTokenizer token_pipeline(description.c_str(), filter); + PointerHolder next_p; + if (next == 0) + { + next_p = new Pl_Discard(); + next = next_p.getPointer(); + } + filter->setPipeline(next); + this->pipePageContents(&token_pipeline); +} + void QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array, ParserCallbacks* callbacks)