#include <qpdf/BufferInputSource.hh>
#include <qpdf/FileInputSource.hh>
#include <qpdf/Pl_Buffer.hh>
#include <qpdf/QIntC.hh>
#include <qpdf/QPDF.hh>
#include <qpdf/QPDFPageDocumentHelper.hh>
#include <qpdf/QPDFPageObjectHelper.hh>
#include <qpdf/QPDFTokenizer.hh>
#include <qpdf/QUtil.hh>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
static char const* whoami = 0;
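
// Print a brief usage message and exit with status 2.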
void
usage()
{
    std::cerr << "Usage: " << whoami
              << " [-maxlen len | -no-ignorable] filename" << std::endl;
    exit(2);
}
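
// Finder is used with InputSource::findFirst to locate a specific
// word token in the input.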
class Finder: public InputSource::Finder
{
  public:
    Finder(std::shared_ptr<InputSource> is, std::string const& str) :
        is(is),
        str(str)
    {
    }
    virtual ~Finder() = default;
    virtual bool check();

  private:
    std::shared_ptr<InputSource> is;
    std::string str;
};
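
// Read a token at the current position, report whether it is the word
// being searched for, and back the input up by the length of the
// search string.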
bool
Finder::check()
{
    QPDFTokenizer tokenizer;
    QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true);
    qpdf_offset_t offset = this->is->tell();
    bool result = (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str));
    this->is->seek(offset - QIntC::to_offset(this->str.length()), SEEK_SET);
    return result;
}
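
// Map a token type to a short name for display.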
static char const*
tokenTypeName(QPDFTokenizer::token_type_e ttype)
{
    // Do this in a case statement instead of a lookup so the compiler
    // will warn if we miss any.
    switch (ttype) {
    case QPDFTokenizer::tt_bad:
        return "bad";
    case QPDFTokenizer::tt_array_close:
        return "array_close";
    case QPDFTokenizer::tt_array_open:
        return "array_open";
    case QPDFTokenizer::tt_brace_close:
        return "brace_close";
    case QPDFTokenizer::tt_brace_open:
        return "brace_open";
    case QPDFTokenizer::tt_dict_close:
        return "dict_close";
    case QPDFTokenizer::tt_dict_open:
        return "dict_open";
    case QPDFTokenizer::tt_integer:
        return "integer";
    case QPDFTokenizer::tt_name:
        return "name";
    case QPDFTokenizer::tt_real:
        return "real";
    case QPDFTokenizer::tt_string:
        return "string";
    case QPDFTokenizer::tt_null:
        return "null";
    case QPDFTokenizer::tt_bool:
        return "bool";
    case QPDFTokenizer::tt_word:
        return "word";
    case QPDFTokenizer::tt_eof:
        return "eof";
    case QPDFTokenizer::tt_space:
        return "space";
    case QPDFTokenizer::tt_comment:
        return "comment";
    case QPDFTokenizer::tt_inline_image:
        return "inline-image";
    }
    return 0;
}
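
// Replace non-printable characters in a token value with \xNN escapes
// so the output is printable ASCII.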
static std::string
sanitize(std::string const& value)
{
    std::string result;
    for (auto const& iter: value) {
        if ((iter >= 32) && (iter <= 126)) {
            result.append(1, iter);
        } else {
            result += "\\x" +
                QUtil::int_to_string_base(
                    static_cast<unsigned char>(iter), 16, 2);
        }
    }
    return result;
}
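
// Seek forward to the next occurrence of the given word using the
// finder; if it is not found, restore the original position.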
static void
try_skipping(
    QPDFTokenizer& tokenizer,
    std::shared_ptr<InputSource> is,
    size_t max_len,
    char const* what,
    Finder& f)
{
    std::cout << "skipping to " << what << std::endl;
    qpdf_offset_t offset = is->tell();
    if (!is->findFirst(what, offset, 0, f)) {
        std::cout << what << " not found" << std::endl;
        is->seek(offset, SEEK_SET);
    }
}
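
// Read tokens from the input source until EOF, printing the offset,
// type, value, and any error message for each one. Optionally skip
// over stream data and inline image data.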
static void
dump_tokens(
    std::shared_ptr<InputSource> is,
    std::string const& label,
    size_t max_len,
    bool include_ignorable,
    bool skip_streams,
    bool skip_inline_images)
{
    Finder f1(is, "endstream");
    std::cout << "--- BEGIN " << label << " ---" << std::endl;
    bool done = false;
    QPDFTokenizer tokenizer;
    tokenizer.allowEOF();
    if (include_ignorable) {
        tokenizer.includeIgnorable();
    }
    qpdf_offset_t inline_image_offset = 0;
    while (!done) {
        QPDFTokenizer::Token token = tokenizer.readToken(
            is, "test", true, inline_image_offset ? 0 : max_len);
        if (inline_image_offset && (token.getType() == QPDFTokenizer::tt_bad)) {
            std::cout << "EI not found; resuming normal scanning" << std::endl;
            is->seek(inline_image_offset, SEEK_SET);
            inline_image_offset = 0;
            continue;
        }
        inline_image_offset = 0;
        qpdf_offset_t offset = is->getLastOffset();
        std::cout << offset << ": " << tokenTypeName(token.getType());
        if (token.getType() != QPDFTokenizer::tt_eof) {
            std::cout << ": " << sanitize(token.getValue());
            if (token.getValue() != token.getRawValue()) {
                std::cout << " (raw: " << sanitize(token.getRawValue()) << ")";
            }
        }
        if (!token.getErrorMessage().empty()) {
            std::cout << " (" << token.getErrorMessage() << ")";
        }
        std::cout << std::endl;
        if (skip_streams &&
            (token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "stream"))) {
            try_skipping(tokenizer, is, max_len, "endstream", f1);
        } else if (
            skip_inline_images &&
            (token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "ID"))) {
            char ch;
            is->read(&ch, 1);
            tokenizer.expectInlineImage(is);
            inline_image_offset = is->tell();
        } else if (token.getType() == QPDFTokenizer::tt_eof) {
            done = true;
        }
    }
    std::cout << "--- END " << label << " ---" << std::endl;
}
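
// Tokenize the file itself (skipping stream data), then each page's
// content streams (skipping inline image data), then the data of each
// object stream.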
static void
process(char const* filename, bool include_ignorable, size_t max_len)
{
    std::shared_ptr<InputSource> is;

    // Tokenize file, skipping streams
    FileInputSource* fis = new FileInputSource();
    fis->setFilename(filename);
    is = std::shared_ptr<InputSource>(fis);
    dump_tokens(is, "FILE", max_len, include_ignorable, true, false);

    // Tokenize content streams, skipping inline images
    QPDF qpdf;
    qpdf.processFile(filename);
    std::vector<QPDFPageObjectHelper> pages =
        QPDFPageDocumentHelper(qpdf).getAllPages();
    int pageno = 0;
    for (auto& page: pages) {
        ++pageno;
        Pl_Buffer plb("buffer");
        page.pipeContents(&plb);
        auto content_data = plb.getBufferSharedPointer();
        BufferInputSource* bis =
            new BufferInputSource("content data", content_data.get());
        is = std::shared_ptr<InputSource>(bis);
        dump_tokens(
            is,
            "PAGE " + QUtil::int_to_string(pageno),
            max_len,
            include_ignorable,
            false,
            true);
    }

    // Tokenize object streams
    for (auto& obj: qpdf.getAllObjects()) {
        if (obj.isStream() && obj.getDict().getKey("/Type").isName() &&
            obj.getDict().getKey("/Type").getName() == "/ObjStm") {
            std::shared_ptr<Buffer> b = obj.getStreamData(qpdf_dl_specialized);
            BufferInputSource* bis =
                new BufferInputSource("object stream data", b.get());
            is = std::shared_ptr<InputSource>(bis);
            dump_tokens(
                is,
                "OBJECT STREAM " + QUtil::int_to_string(obj.getObjectID()),
                max_len,
                include_ignorable,
                false,
                false);
        }
    }
}
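
// Parse the command line ([-maxlen len | -no-ignorable] filename) and
// tokenize the given file.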
int
main(int argc, char* argv[])
{
    QUtil::setLineBuf(stdout);
    if ((whoami = strrchr(argv[0], '/')) == NULL) {
        whoami = argv[0];
    } else {
        ++whoami;
    }

    char const* filename = 0;
    size_t max_len = 0;
    bool include_ignorable = true;
    for (int i = 1; i < argc; ++i) {
        if (argv[i][0] == '-') {
            if (strcmp(argv[i], "-maxlen") == 0) {
                if (++i >= argc) {
                    usage();
                }
                max_len = QUtil::string_to_uint(argv[i]);
            } else if (strcmp(argv[i], "-no-ignorable") == 0) {
                include_ignorable = false;
            } else {
                usage();
            }
        } else if (filename) {
            usage();
        } else {
            filename = argv[i];
        }
    }
    if (filename == 0) {
        usage();
    }

    try {
        process(filename, include_ignorable, max_len);
    } catch (std::exception& e) {
        std::cerr << whoami << ": exception: " << e.what() << std::endl;
        exit(2);
    }
    return 0;
}