mirror of
https://github.com/qpdf/qpdf.git
synced 2025-04-02 14:41:50 +00:00
Add additional interface for filtering page contents
This commit is contained in:
parent
fd02944e19
commit
5708b5d0aa
@ -1,3 +1,9 @@
|
||||
2018-02-11 Jay Berkenbilt <ejb@ql.org>
|
||||
|
||||
* Add QPDFObjectHandle::filterPageContents method to provide a
|
||||
different interface for applying token filters to page contents
|
||||
without modifying the ultimate output.
|
||||
|
||||
2018-02-04 Jay Berkenbilt <ejb@ql.org>
|
||||
|
||||
* Changes listed on today's date are numerous and reflect
|
||||
|
@ -7,7 +7,8 @@ BINS_examples = \
|
||||
pdf-create \
|
||||
pdf-parse-content \
|
||||
pdf-split-pages \
|
||||
pdf-filter-tokens
|
||||
pdf-filter-tokens \
|
||||
pdf-count-strings
|
||||
CBINS_examples = pdf-linearize
|
||||
|
||||
TARGETS_examples = $(foreach B,$(BINS_examples) $(CBINS_examples),examples/$(OUTPUT_DIR)/$(call binname,$(B)))
|
||||
|
131
examples/pdf-count-strings.cc
Normal file
131
examples/pdf-count-strings.cc
Normal file
@ -0,0 +1,131 @@
|
||||
//
|
||||
// This example illustrates the use of QPDFObjectHandle::TokenFilter
|
||||
// with filterPageContents. See also pdf-filter-tokens.cc for an
|
||||
// example that uses QPDFObjectHandle::TokenFilter with
|
||||
// addContentTokenFilter.
|
||||
//
|
||||
|
||||
#include <iostream>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <qpdf/QPDF.hh>
|
||||
#include <qpdf/QUtil.hh>
|
||||
#include <qpdf/QPDFObjectHandle.hh>
|
||||
#include <qpdf/Pl_StdioFile.hh>
|
||||
|
||||
static char const* whoami = 0;
|
||||
|
||||
void usage()
|
||||
{
|
||||
std::cerr << "Usage: " << whoami << " infile" << std::endl
|
||||
<< "Applies token filters to infile"
|
||||
<< std::endl;
|
||||
exit(2);
|
||||
}
|
||||
|
||||
class StringCounter: public QPDFObjectHandle::TokenFilter
|
||||
{
|
||||
public:
|
||||
StringCounter() :
|
||||
count(0)
|
||||
{
|
||||
}
|
||||
virtual ~StringCounter()
|
||||
{
|
||||
}
|
||||
virtual void handleToken(QPDFTokenizer::Token const&);
|
||||
virtual void handleEOF();
|
||||
int getCount() const;
|
||||
|
||||
private:
|
||||
int count;
|
||||
};
|
||||
|
||||
void
|
||||
StringCounter::handleToken(QPDFTokenizer::Token const& token)
|
||||
{
|
||||
// Count string tokens
|
||||
if (token.getType() == QPDFTokenizer::tt_string)
|
||||
{
|
||||
++this->count;
|
||||
}
|
||||
// Preserve input verbatim by passing each token to any specified
|
||||
// downstream filter.
|
||||
writeToken(token);
|
||||
}
|
||||
|
||||
void
|
||||
StringCounter::handleEOF()
|
||||
{
|
||||
// Write a comment at the end of the stream just to show how we
|
||||
// can enhance the output if we want.
|
||||
write("\n% strings found: ");
|
||||
write(QUtil::int_to_string(this->count));
|
||||
// If you override handleEOF, you must always remember to call finish().
|
||||
finish();
|
||||
}
|
||||
|
||||
int
|
||||
StringCounter::getCount() const
|
||||
{
|
||||
return this->count;
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
whoami = QUtil::getWhoami(argv[0]);
|
||||
|
||||
// For libtool's sake....
|
||||
if (strncmp(whoami, "lt-", 3) == 0)
|
||||
{
|
||||
whoami += 3;
|
||||
}
|
||||
|
||||
if (argc != 2)
|
||||
{
|
||||
usage();
|
||||
}
|
||||
char const* infilename = argv[1];
|
||||
|
||||
try
|
||||
{
|
||||
QPDF pdf;
|
||||
pdf.processFile(infilename);
|
||||
std::vector<QPDFObjectHandle> pages = pdf.getAllPages();
|
||||
int pageno = 0;
|
||||
for (std::vector<QPDFObjectHandle>::iterator iter = pages.begin();
|
||||
iter != pages.end(); ++iter)
|
||||
{
|
||||
QPDFObjectHandle page = *iter;
|
||||
++pageno;
|
||||
// Pass the contents of a page through our string counter.
|
||||
// If it's an even page, capture the output. This
|
||||
// illustrates that you may capture any output generated
|
||||
// by the filter, or you may ignore it.
|
||||
StringCounter counter;
|
||||
if (pageno % 2)
|
||||
{
|
||||
// Ignore output for odd pages.
|
||||
page.filterPageContents(&counter);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Write output to stdout for even pages.
|
||||
Pl_StdioFile out("stdout", stdout);
|
||||
std::cout << "% Contents of page " << pageno << std::endl;
|
||||
page.filterPageContents(&counter, &out);
|
||||
std::cout << "\n% end " << pageno << std::endl;
|
||||
}
|
||||
std::cout << "Page " << pageno
|
||||
<< ": strings = " << counter.getCount() << std::endl;
|
||||
}
|
||||
}
|
||||
catch (std::exception& e)
|
||||
{
|
||||
std::cerr << whoami << ": " << e.what() << std::endl;
|
||||
exit(2);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
@ -1,6 +1,8 @@
|
||||
//
|
||||
// This example illustrates the use of QPDFObjectHandle::TokenFilter.
|
||||
// Please see comments inline for details.
|
||||
// This example illustrates the use of QPDFObjectHandle::TokenFilter
|
||||
// with addContentTokenFilter. Please see comments inline for details.
|
||||
// See also pdf-count-strings.cc for a use of
|
||||
// QPDFObjectHandle::TokenFilter with filterPageContents.
|
||||
//
|
||||
|
||||
#include <iostream>
|
||||
|
17
examples/qtest/count-strings.test
Normal file
17
examples/qtest/count-strings.test
Normal file
@ -0,0 +1,17 @@
|
||||
#!/usr/bin/env perl
require 5.008;
BEGIN { $^W = 1; }
use strict;

# Work from the directory that holds this test's input and expected
# output. Stop immediately if it is missing; otherwise the test would
# fail later with a misleading "file not found" style error.
chdir("count-strings") or die "chdir testdir failed: $!\n";

require TestDriver;

my $td = new TestDriver('pdf-count-strings');

# Run the example on in.pdf: its output must match the file "out"
# (with newlines normalized) and it must exit with status 0.
$td->runtest("count strings",
	     {$td->COMMAND => "pdf-count-strings in.pdf"},
	     {$td->FILE => "out", $td->EXIT_STATUS => 0},
	     $td->NORMALIZE_NEWLINES);

# NOTE(review): the argument is presumably the expected number of
# test cases -- confirm against the TestDriver documentation.
$td->report(1);
|
BIN
examples/qtest/count-strings/in.pdf
Normal file
BIN
examples/qtest/count-strings/in.pdf
Normal file
Binary file not shown.
16
examples/qtest/count-strings/out
Normal file
16
examples/qtest/count-strings/out
Normal file
@ -0,0 +1,16 @@
|
||||
Page 1: strings = 3
|
||||
% Contents of page 2
|
||||
BT
|
||||
/F1 24 Tf
|
||||
72 720 Td
|
||||
(Four ) Tj
|
||||
(Five ) Tj
|
||||
(Six )
|
||||
(beautiful ) Tj
|
||||
(strings) Tj
|
||||
(!) Tj
|
||||
ET
|
||||
|
||||
% strings found: 6
|
||||
% end 2
|
||||
Page 2: strings = 6
|
@ -80,9 +80,10 @@ class QPDFObjectHandle
|
||||
// The TokenFilter class provides a way to filter content streams
|
||||
// in a lexically aware fashion. TokenFilters can be attached to
|
||||
// streams using the addTokenFilter or addContentTokenFilter
|
||||
// methods. The handleToken method is called for each token,
|
||||
// including the eof token, and then handleEOF is called at the
|
||||
// very end. Handlers may call write (or writeToken) to pass data
|
||||
// methods or can be applied on the spot by filterPageContents.
|
||||
// The handleToken method is called for each token, including the
|
||||
// eof token, and then handleEOF is called at the very end.
|
||||
// Handlers may call write (or writeToken) to pass data
|
||||
// downstream. The finish() method must be called exactly one time
|
||||
// to ensure that any written data is flushed out. The default
|
||||
// handleEOF calls finish. If you override handleEOF, you must
|
||||
@ -91,8 +92,9 @@ class QPDFObjectHandle
|
||||
// Failure to call finish() may result in some of the data you
|
||||
// have written being lost. You should not rely on a destructor
|
||||
// for calling finish() since the destructor call may occur later
|
||||
// than you expect. Please see examples/token-filters.cc for
|
||||
// examples of using TokenFilters.
|
||||
// than you expect. Please see examples/pdf-filter-tokens.cc and
|
||||
// examples/pdf-count-strings.cc for examples of using
|
||||
// TokenFilters.
|
||||
//
|
||||
// Please note that when you call token.getValue() on a token of
|
||||
// type tt_string, you get the string value without any
|
||||
@ -255,6 +257,18 @@ class QPDFObjectHandle
|
||||
QPDF_DLL
|
||||
void parsePageContents(ParserCallbacks* callbacks);
|
||||
|
||||
// Pass a page's contents through the given TokenFilter. If a
|
||||
// pipeline is also provided, it will be the target of the write
|
||||
// methods from the token filter. If a pipeline is not specified,
|
||||
// any output generated by the token filter will be discarded. Use
|
||||
// this interface if you need to pass a page's contents through a
|
||||
// filter for work purposes without having that filter
|
||||
// automatically applied to the page's contents, as happens with
|
||||
// addContentTokenFilter. See examples/pdf-count-strings.cc for an
|
||||
// example.
|
||||
QPDF_DLL
|
||||
void filterPageContents(TokenFilter* filter, Pipeline* next = 0);
|
||||
|
||||
// Pipe a page's contents through the given pipeline. This method
|
||||
// works whether the contents are a single stream or an array of
|
||||
// streams. Call on a page object.
|
||||
|
@ -15,6 +15,8 @@
|
||||
#include <qpdf/QPDF_Reserved.hh>
|
||||
#include <qpdf/Pl_Buffer.hh>
|
||||
#include <qpdf/Pl_Concatenate.hh>
|
||||
#include <qpdf/Pl_QPDFTokenizer.hh>
|
||||
#include <qpdf/Pl_Discard.hh>
|
||||
#include <qpdf/BufferInputSource.hh>
|
||||
#include <qpdf/QPDFExc.hh>
|
||||
|
||||
@ -998,6 +1000,24 @@ QPDFObjectHandle::parsePageContents(ParserCallbacks* callbacks)
|
||||
description, callbacks);
|
||||
}
|
||||
|
||||
void
|
||||
QPDFObjectHandle::filterPageContents(TokenFilter* filter, Pipeline* next)
|
||||
{
|
||||
assertPageObject();
|
||||
std::string description = "token filter for page object " +
|
||||
QUtil::int_to_string(this->objid) + " " +
|
||||
QUtil::int_to_string(this->generation);
|
||||
Pl_QPDFTokenizer token_pipeline(description.c_str(), filter);
|
||||
PointerHolder<Pipeline> next_p;
|
||||
if (next == 0)
|
||||
{
|
||||
next_p = new Pl_Discard();
|
||||
next = next_p.getPointer();
|
||||
}
|
||||
filter->setPipeline(next);
|
||||
this->pipePageContents(&token_pipeline);
|
||||
}
|
||||
|
||||
void
|
||||
QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array,
|
||||
ParserCallbacks* callbacks)
|
||||
|
Loading…
x
Reference in New Issue
Block a user