Add additional interface for filtering page contents

This commit is contained in:
Jay Berkenbilt 2018-02-11 15:41:02 -05:00
parent fd02944e19
commit 5708b5d0aa
9 changed files with 215 additions and 8 deletions

View File

@ -1,3 +1,9 @@
2018-02-11 Jay Berkenbilt <ejb@ql.org>
* Add QPDFObjectHandle::filterPageContents method to provide a
different interface for applying token filters to page contents
without modifying the ultimate output.
2018-02-04 Jay Berkenbilt <ejb@ql.org>
* Changes listed on today's date are numerous and reflect

View File

@ -7,7 +7,8 @@ BINS_examples = \
pdf-create \
pdf-parse-content \
pdf-split-pages \
pdf-filter-tokens
pdf-filter-tokens \
pdf-count-strings
CBINS_examples = pdf-linearize
TARGETS_examples = $(foreach B,$(BINS_examples) $(CBINS_examples),examples/$(OUTPUT_DIR)/$(call binname,$(B)))

View File

@ -0,0 +1,131 @@
//
// This example illustrates the use of QPDFObjectHandle::TokenFilter
// with filterPageContents. See also pdf-filter-tokens.cc for an
// example that uses QPDFObjectHandle::TokenFilter with
// addContentTokenFilter.
//
#include <iostream>
#include <string.h>
#include <stdlib.h>
#include <qpdf/QPDF.hh>
#include <qpdf/QUtil.hh>
#include <qpdf/QPDFObjectHandle.hh>
#include <qpdf/Pl_StdioFile.hh>
static char const* whoami = 0;
// Print a short usage message to standard error and terminate the
// program with exit status 2. This function does not return.
void usage()
{
    std::cerr << "Usage: " << whoami << " infile" << std::endl;
    std::cerr << "Applies token filters to infile" << std::endl;
    exit(2);
}
// Token filter that tallies the string tokens passing through it.
// Tokens are forwarded unchanged (see handleToken), and a summary
// comment is appended when the end of input is reached (see
// handleEOF).
class StringCounter: public QPDFObjectHandle::TokenFilter
{
  public:
    // Start with nothing counted.
    StringCounter() : count(0) {}
    virtual ~StringCounter() {}

    // Called for each token in the content being filtered.
    virtual void handleToken(QPDFTokenizer::Token const&);
    // Called once after the last token.
    virtual void handleEOF();

    // Number of string tokens seen so far.
    int getCount() const;

  private:
    int count;
};
// Inspect one token: bump the tally if it is a string, then pass the
// token along untouched.
void
StringCounter::handleToken(QPDFTokenizer::Token const& token)
{
    bool const is_string =
        (token.getType() == QPDFTokenizer::tt_string);
    if (is_string)
    {
        ++count;
    }
    // Forwarding every token as-is keeps the filtered output
    // identical to the input.
    this->writeToken(token);
}
// End-of-input hook: append a trailing comment reporting the number
// of strings counted, then finish the filter.
void
StringCounter::handleEOF()
{
    // The extra comment demonstrates that a filter may add to the
    // output as well as pass it through.
    std::string const total = QUtil::int_to_string(this->count);
    write("\n% strings found: ");
    write(total);
    // An overriding handleEOF is responsible for calling finish().
    finish();
}
int
StringCounter::getCount() const
{
return this->count;
}
int main(int argc, char* argv[])
{
whoami = QUtil::getWhoami(argv[0]);
// For libtool's sake....
if (strncmp(whoami, "lt-", 3) == 0)
{
whoami += 3;
}
if (argc != 2)
{
usage();
}
char const* infilename = argv[1];
try
{
QPDF pdf;
pdf.processFile(infilename);
std::vector<QPDFObjectHandle> pages = pdf.getAllPages();
int pageno = 0;
for (std::vector<QPDFObjectHandle>::iterator iter = pages.begin();
iter != pages.end(); ++iter)
{
QPDFObjectHandle page = *iter;
++pageno;
// Pass the contents of a page through our string counter.
// If it's an even page, capture the output. This
// illustrates that you may capture any output generated
// by the filter, or you may ignore it.
StringCounter counter;
if (pageno % 2)
{
// Ignore output for odd pages.
page.filterPageContents(&counter);
}
else
{
// Write output to stdout for even pages.
Pl_StdioFile out("stdout", stdout);
std::cout << "% Contents of page " << pageno << std::endl;
page.filterPageContents(&counter, &out);
std::cout << "\n% end " << pageno << std::endl;
}
std::cout << "Page " << pageno
<< ": strings = " << counter.getCount() << std::endl;
}
}
catch (std::exception& e)
{
std::cerr << whoami << ": " << e.what() << std::endl;
exit(2);
}
return 0;
}

View File

@ -1,6 +1,8 @@
//
// This example illustrates the use of QPDFObjectHandle::TokenFilter.
// Please see comments inline for details.
// This example illustrates the use of QPDFObjectHandle::TokenFilter
// with addContentTokenFilter. Please see comments inline for details.
// See also pdf-count-strings.cc for a use of
// QPDFObjectHandle::TokenFilter with filterPageContents.
//
#include <iostream>

View File

@ -0,0 +1,17 @@
#!/usr/bin/env perl
# Test script for the pdf-count-strings example program.
require 5.008;
BEGIN { $^W = 1; }
use strict;

# Relative paths used below (in.pdf, out) are resolved from this
# directory.
chdir("count-strings");

require TestDriver;

my $td = TestDriver->new('pdf-count-strings');

# Run the example on in.pdf and compare its output with the file
# "out", expecting exit status 0 and normalizing newlines.
my %command = ($td->COMMAND => "pdf-count-strings in.pdf");
my %expected = ($td->FILE => "out", $td->EXIT_STATUS => 0);
$td->runtest("filter tokens",
             \%command,
             \%expected,
             $td->NORMALIZE_NEWLINES);

# NOTE(review): the argument is presumably the expected number of
# tests in this script -- confirm against TestDriver.
$td->report(1);

Binary file not shown.

View File

@ -0,0 +1,16 @@
Page 1: strings = 3
% Contents of page 2
BT
/F1 24 Tf
72 720 Td
(Four ) Tj
(Five ) Tj
(Six )
(beautiful ) Tj
(strings) Tj
(!) Tj
ET
% strings found: 6
% end 2
Page 2: strings = 6

View File

@ -80,9 +80,10 @@ class QPDFObjectHandle
// The TokenFilter class provides a way to filter content streams
// in a lexically aware fashion. TokenFilters can be attached to
// streams using the addTokenFilter or addContentTokenFilter
// methods. The handleToken method is called for each token,
// including the eof token, and then handleEOF is called at the
// very end. Handlers may call write (or writeToken) to pass data
// methods or can be applied on the spot by filterPageContents.
// The handleToken method is called for each token, including the
// eof token, and then handleEOF is called at the very end.
// Handlers may call write (or writeToken) to pass data
// downstream. The finish() method must be called exactly one time
// to ensure that any written data is flushed out. The default
// handleEOF calls finish. If you override handleEOF, you must
@ -91,8 +92,9 @@ class QPDFObjectHandle
// Failure to call finish() may result in some of the data you
// have written being lost. You should not rely on a destructor
// for calling finish() since the destructor call may occur later
// than you expect. Please see examples/token-filters.cc for
// examples of using TokenFilters.
// than you expect. Please see examples/pdf-filter-tokens.cc and
// examples/pdf-count-strings.cc for examples of using
// TokenFilters.
//
// Please note that when you call token.getValue() on a token of
// type tt_string, you get the string value without any
@ -255,6 +257,18 @@ class QPDFObjectHandle
QPDF_DLL
void parsePageContents(ParserCallbacks* callbacks);
// Pass a page's contents through the given TokenFilter. If a
// pipeline is also provided, it will be the target of the write
// methods from the token filter. If a pipeline is not specified,
// any output generated by the token filter will be discarded. Use
// this interface if you need to pass a page's contents through a
// filter for work purposes without having that filter
// automatically applied to the page's contents, as happens with
// addContentTokenFilter. See examples/pdf-count-strings.cc for an
// example.
QPDF_DLL
void filterPageContents(TokenFilter* filter, Pipeline* next = 0);
// Pipe a page's contents through the given pipeline. This method
// works whether the contents are a single stream or an array of
// streams. Call on a page object.

View File

@ -15,6 +15,8 @@
#include <qpdf/QPDF_Reserved.hh>
#include <qpdf/Pl_Buffer.hh>
#include <qpdf/Pl_Concatenate.hh>
#include <qpdf/Pl_QPDFTokenizer.hh>
#include <qpdf/Pl_Discard.hh>
#include <qpdf/BufferInputSource.hh>
#include <qpdf/QPDFExc.hh>
@ -998,6 +1000,24 @@ QPDFObjectHandle::parsePageContents(ParserCallbacks* callbacks)
description, callbacks);
}
// Run this page's contents through the given token filter. Anything
// the filter writes goes to "next" when one is supplied; otherwise it
// is sent to a Pl_Discard pipeline created for the duration of this
// call. Must be called on a page object (checked by
// assertPageObject).
void
QPDFObjectHandle::filterPageContents(TokenFilter* filter, Pipeline* next)
{
    assertPageObject();

    // Identify the page by object id and generation in the
    // tokenizer pipeline's description.
    std::string description = "token filter for page object ";
    description += QUtil::int_to_string(this->objid);
    description += " ";
    description += QUtil::int_to_string(this->generation);
    Pl_QPDFTokenizer token_pipeline(description.c_str(), filter);

    // With no downstream pipeline supplied, fall back to a discard
    // pipeline held by a PointerHolder local to this call.
    PointerHolder<Pipeline> discard_holder;
    Pipeline* target = next;
    if (target == 0)
    {
        discard_holder = new Pl_Discard();
        target = discard_holder.getPointer();
    }

    filter->setPipeline(target);
    this->pipePageContents(&token_pipeline);
}
void
QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array,
ParserCallbacks* callbacks)