diff --git a/ChangeLog b/ChangeLog
index 20cb0e80..b061c584 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -150,6 +150,9 @@
 	QPDFObjectHandle::pipeStreamData, you don't need to worry about
 	this at all.
 
+	* Provide heavily annotated examples/pdf-filter-tokens.cc example
+	that illustrates use of some simple token filters.
+
 2018-02-04  Jay Berkenbilt
 
 	* Add QPDFWriter::setLinearizationPass1Filename method and
diff --git a/examples/build.mk b/examples/build.mk
index 518f4d55..f5b44669 100644
--- a/examples/build.mk
+++ b/examples/build.mk
@@ -6,7 +6,8 @@ BINS_examples = \
 	pdf-invert-images \
 	pdf-create \
 	pdf-parse-content \
-	pdf-split-pages
+	pdf-split-pages \
+	pdf-filter-tokens
 CBINS_examples = pdf-linearize
 
 TARGETS_examples = $(foreach B,$(BINS_examples) $(CBINS_examples),examples/$(OUTPUT_DIR)/$(call binname,$(B)))
diff --git a/examples/pdf-filter-tokens.cc b/examples/pdf-filter-tokens.cc
new file mode 100644
index 00000000..2566f72c
--- /dev/null
+++ b/examples/pdf-filter-tokens.cc
@@ -0,0 +1,241 @@
+//
+// This example illustrates the use of QPDFObjectHandle::TokenFilter.
+// Please see comments inline for details.
+//
+
+#include <iostream>
+#include <string.h>
+#include <stdlib.h>
+#include <algorithm>
+#include <deque>
+
+#include <qpdf/QPDF.hh>
+#include <qpdf/QUtil.hh>
+#include <qpdf/QPDFWriter.hh>
+#include <qpdf/QPDFObjectHandle.hh>
+
+static char const* whoami = 0;
+
+void usage()
+{
+    std::cerr << "Usage: " << whoami << " infile outfile" << std::endl
+              << "Applies token filters to infile and writes outfile"
+              << std::endl;
+    exit(2);
+}
+
+// The StringReverser class is a trivial example of using a token
+// filter. This class only overrides the pure virtual handleToken
+// function and preserves the default handleEOF function.
+class StringReverser: public QPDFObjectHandle::TokenFilter
+{
+  public:
+    virtual ~StringReverser()
+    {
+    }
+    virtual void handleToken(QPDFTokenizer::Token const&);
+};
+
+void
+StringReverser::handleToken(QPDFTokenizer::Token const& token)
+{
+    // For string tokens, reverse the characters. For other tokens,
+    // just pass them through. Notice that we construct a new string
+    // token and write that, thus allowing the library to handle any
+    // subtleties about properly encoding unprintable characters. This
+    // function doesn't handle multibyte characters at all. It's not
+    // intended to be an example of the correct way to reverse
+    // strings. It's just intended to give a simple example of a
+    // pretty minimal filter and to show an example of writing a
+    // constructed token.
+    if (token.getType() == QPDFTokenizer::tt_string)
+    {
+        std::string value = token.getValue();
+        std::reverse(value.begin(), value.end());
+        writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_string, value));
+    }
+    else
+    {
+        writeToken(token);
+    }
+}
+
+// The ColorToGray filter finds all "rg" operators in the content
+// stream and replaces them with "g" operators, thus mapping color to
+// grayscale. Note that it only applies to content streams, not
+// images, so this will not replace color images with grayscale
+// images.
+class ColorToGray: public QPDFObjectHandle::TokenFilter
+{
+  public:
+    virtual ~ColorToGray()
+    {
+    }
+    virtual void handleToken(QPDFTokenizer::Token const&);
+    virtual void handleEOF();
+
+  private:
+    bool isNumeric(QPDFTokenizer::token_type_e);
+    bool isIgnorable(QPDFTokenizer::token_type_e);
+    double numericValue(QPDFTokenizer::Token const&);
+
+    std::deque<QPDFTokenizer::Token> all_stack;
+    std::deque<QPDFTokenizer::Token> stack;
+};
+
+bool
+ColorToGray::isNumeric(QPDFTokenizer::token_type_e token_type)
+{
+    return ((token_type == QPDFTokenizer::tt_integer) ||
+            (token_type == QPDFTokenizer::tt_real));
+}
+
+bool
+ColorToGray::isIgnorable(QPDFTokenizer::token_type_e token_type)
+{
+    return ((token_type == QPDFTokenizer::tt_space) ||
+            (token_type == QPDFTokenizer::tt_comment));
+}
+
+double
+ColorToGray::numericValue(QPDFTokenizer::Token const& token)
+{
+    return QPDFObjectHandle::parse(token.getValue()).getNumericValue();
+}
+
+void
+ColorToGray::handleToken(QPDFTokenizer::Token const& token)
+{
+    // Track the number of non-ignorable tokens we've seen. If we see
+    // an "rg" following three numbers, convert it to a grayscale
+    // value. Keep writing tokens to the output as we can.
+
+    // There are several things to notice here. We keep two stacks:
+    // one of "meaningful" tokens, and one of all tokens. This way we
+    // can preserve whitespace or comments that we encounter in the
+    // stream and thereby preserve layout. As we receive tokens, we keep
+    // the last four meaningful tokens. If we see three numbers
+    // followed by rg, we use the three numbers to calculate a gray
+    // value that is perceptually similar to the color value and then
+    // write the "g" operator to the output, discarding any spaces or
+    // comments encountered embedded in the "rg" operator.
+
+    // The stack and all_stack members are updated in such a way that
+    // they always contain exactly the same non-ignorable tokens. The
+    // stack member contains the tokens that would be left if you
+    // removed all space and comment tokens from all_stack.
+
+    // On each new token, flush out any space or comment tokens. Store
+    // the incoming token. If we just got an rg preceded by the right
+    // kinds of operands, replace the command. Flush any additional
+    // accumulated tokens to keep the stack only four tokens deep.
+
+    while ((! this->all_stack.empty()) &&
+           isIgnorable(this->all_stack.at(0).getType()))
+    {
+        writeToken(this->all_stack.at(0));
+        this->all_stack.pop_front();
+    }
+    this->all_stack.push_back(token);
+    QPDFTokenizer::token_type_e token_type = token.getType();
+    if (! isIgnorable(token_type))
+    {
+        this->stack.push_back(token);
+        if ((this->stack.size() == 4) &&
+            (token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "rg")) &&
+            (isNumeric(this->stack.at(0).getType())) &&
+            (isNumeric(this->stack.at(1).getType())) &&
+            (isNumeric(this->stack.at(2).getType())))
+        {
+            double r = numericValue(this->stack.at(0));
+            double g = numericValue(this->stack.at(1));
+            double b = numericValue(this->stack.at(2));
+            // Weight by perceived brightness: green contributes the
+            // most and blue the least.
+            double gray = ((0.3 * r) + (0.59 * g) + (0.11 * b));
+            if (gray > 1.0)
+            {
+                gray = 1.0;
+            }
+            if (gray < 0.0)
+            {
+                gray = 0.0;
+            }
+            write(QUtil::double_to_string(gray, 3));
+            write(" g");
+            this->stack.clear();
+            this->all_stack.clear();
+        }
+    }
+    if (this->stack.size() == 4)
+    {
+        writeToken(this->all_stack.at(0));
+        this->all_stack.pop_front();
+        this->stack.pop_front();
+    }
+}
+
+void
+ColorToGray::handleEOF()
+{
+    // Flush out any remaining accumulated tokens.
+    while (! this->all_stack.empty())
+    {
+        writeToken(this->all_stack.at(0));
+        this->all_stack.pop_front();
+    }
+    // Remember to call finish(). If you override handleEOF, it is
+    // essential that you call finish() or else you are likely to lose
+    // some data in buffers of downstream pipelines that are not
+    // flushed out. This is also mentioned in comments in
+    // QPDFObjectHandle.hh.
+    finish();
+}
+
+int main(int argc, char* argv[])
+{
+    whoami = QUtil::getWhoami(argv[0]);
+
+    // For libtool's sake....
+    if (strncmp(whoami, "lt-", 3) == 0)
+    {
+        whoami += 3;
+    }
+
+    if (argc != 3)
+    {
+        usage();
+    }
+    char const* infilename = argv[1];
+    char const* outfilename = argv[2];
+
+    try
+    {
+        QPDF pdf;
+        pdf.processFile(infilename);
+        std::vector<QPDFObjectHandle> pages = pdf.getAllPages();
+        for (std::vector<QPDFObjectHandle>::iterator iter = pages.begin();
+             iter != pages.end(); ++iter)
+        {
+            // Attach two token filters to each page of this file.
+            // When the file is written, or when the pages' contents
+            // are retrieved in any other way, the filters will be
+            // applied. See comments on the filters for additional
+            // details.
+            QPDFObjectHandle page = *iter;
+            page.addContentTokenFilter(new StringReverser);
+            page.addContentTokenFilter(new ColorToGray);
+        }
+
+        QPDFWriter w(pdf, outfilename);
+        w.setStaticID(true); // for testing only
+        w.write();
+    }
+    catch (std::exception& e)
+    {
+        std::cerr << whoami << ": " << e.what() << std::endl;
+        exit(2);
+    }
+
+    return 0;
+}
diff --git a/examples/qtest/filter-tokens.test b/examples/qtest/filter-tokens.test
new file mode 100644
index 00000000..6b93eb8f
--- /dev/null
+++ b/examples/qtest/filter-tokens.test
@@ -0,0 +1,20 @@
+#!/usr/bin/env perl
+require 5.008;
+BEGIN { $^W = 1; }
+use strict;
+
+chdir("filter-tokens");
+
+require TestDriver;
+
+my $td = new TestDriver('pdf-filter-tokens');
+
+$td->runtest("filter tokens",
+             {$td->COMMAND => "pdf-filter-tokens in.pdf a.pdf"},
+             {$td->STRING => "", $td->EXIT_STATUS => 0});
+
+$td->runtest("check output",
+             {$td->FILE => "a.pdf"},
+             {$td->FILE => "out.pdf"});
+
+$td->report(2);
diff --git a/examples/qtest/filter-tokens/a.pdf b/examples/qtest/filter-tokens/a.pdf
new file mode 100644
index 00000000..ef7cdbce
Binary files /dev/null and b/examples/qtest/filter-tokens/a.pdf differ
diff --git a/examples/qtest/filter-tokens/in.pdf b/examples/qtest/filter-tokens/in.pdf
new file mode 100644
index 00000000..f60a30d6
Binary files /dev/null and b/examples/qtest/filter-tokens/in.pdf differ
diff --git a/examples/qtest/filter-tokens/out.pdf b/examples/qtest/filter-tokens/out.pdf
new file mode 100644
index 00000000..ef7cdbce
Binary files /dev/null and b/examples/qtest/filter-tokens/out.pdf differ