qpdf/qpdf/test_tokenizer.cc

268 lines
7.8 KiB
C++

#include <qpdf/BufferInputSource.hh>
#include <qpdf/FileInputSource.hh>
#include <qpdf/Pl_Buffer.hh>
#include <qpdf/QIntC.hh>
#include <qpdf/QPDF.hh>
#include <qpdf/QPDFPageDocumentHelper.hh>
#include <qpdf/QPDFTokenizer.hh>
#include <qpdf/QUtil.hh>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
// Program name shown in usage and error messages. main() points this at the
// part of argv[0] that follows the last '/', or at argv[0] itself when there
// is no '/'.
static char const* whoami = nullptr;
// Write the command-line synopsis to standard error and terminate the
// process with exit status 2. This function does not return.
void
usage()
{
    std::cerr << "Usage: " << whoami;
    std::cerr << " [-maxlen len | -no-ignorable] filename" << std::endl;
    std::exit(2);
}
// InputSource::Finder implementation that remembers an input source and a
// target string. The matching logic lives in check(), which is defined
// out of line.
class Finder: public InputSource::Finder
{
  public:
    // source: the input that check() reads tokens from.
    // target: the word that check() compares the next token against.
    Finder(std::shared_ptr<InputSource> source, std::string const& target) :
        is(source),
        str(target)
    {
    }
    ~Finder() override = default;

    bool check() override;

  private:
    std::shared_ptr<InputSource> is; // input source examined by check()
    std::string str;                 // word being searched for
};
bool
Finder::check()
{
QPDFTokenizer tokenizer;
QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true);
qpdf_offset_t offset = this->is->tell();
bool result = (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str));
this->is->seek(offset - QIntC::to_offset(this->str.length()), SEEK_SET);
return result;
}
// Map a tokenizer token type to the short label printed in the dump.
// Returns nullptr if the value matches none of the enumerators below.
static char const*
tokenTypeName(QPDFTokenizer::token_type_e ttype)
{
    // This is deliberately a switch without a default label rather than a
    // lookup table, so the compiler can warn when an enumerator is missing.
    char const* name = nullptr;
    switch (ttype) {
    case QPDFTokenizer::tt_bad:
        name = "bad";
        break;
    case QPDFTokenizer::tt_array_close:
        name = "array_close";
        break;
    case QPDFTokenizer::tt_array_open:
        name = "array_open";
        break;
    case QPDFTokenizer::tt_brace_close:
        name = "brace_close";
        break;
    case QPDFTokenizer::tt_brace_open:
        name = "brace_open";
        break;
    case QPDFTokenizer::tt_dict_close:
        name = "dict_close";
        break;
    case QPDFTokenizer::tt_dict_open:
        name = "dict_open";
        break;
    case QPDFTokenizer::tt_integer:
        name = "integer";
        break;
    case QPDFTokenizer::tt_name:
        name = "name";
        break;
    case QPDFTokenizer::tt_real:
        name = "real";
        break;
    case QPDFTokenizer::tt_string:
        name = "string";
        break;
    case QPDFTokenizer::tt_null:
        name = "null";
        break;
    case QPDFTokenizer::tt_bool:
        name = "bool";
        break;
    case QPDFTokenizer::tt_word:
        name = "word";
        break;
    case QPDFTokenizer::tt_eof:
        name = "eof";
        break;
    case QPDFTokenizer::tt_space:
        name = "space";
        break;
    case QPDFTokenizer::tt_comment:
        name = "comment";
        break;
    case QPDFTokenizer::tt_inline_image:
        name = "inline-image";
        break;
    }
    return name;
}
// Produce a copy of value that is safe to print. Characters in the range
// 32..126 are copied unchanged. Every other character is replaced by "\x"
// followed by its unsigned byte value formatted with
// QUtil::int_to_string_base(..., 16, 2) (presumably two hex digits).
static std::string
sanitize(std::string const& value)
{
    std::string out;
    for (char ch: value) {
        bool const printable = (ch >= 32) && (ch <= 126);
        if (!printable) {
            out += "\\x" + QUtil::int_to_string_base(static_cast<unsigned char>(ch), 16, 2);
            continue;
        }
        out.push_back(ch);
    }
    return out;
}
// Announce a skip and ask the input source to locate `what`, starting from
// the current position and using finder `f` to validate candidates. If the
// search fails, report that and seek back to the position held on entry.
//
// tokenizer and max_len are accepted but not used by this function.
static void
try_skipping(
    QPDFTokenizer& tokenizer,
    std::shared_ptr<InputSource> is,
    size_t max_len,
    char const* what,
    Finder& f)
{
    std::cout << "skipping to " << what << std::endl;

    qpdf_offset_t const start = is->tell();
    if (is->findFirst(what, start, 0, f)) {
        return;
    }

    std::cout << what << " not found" << std::endl;
    is->seek(start, SEEK_SET);
}
static void
dump_tokens(
std::shared_ptr<InputSource> is,
std::string const& label,
size_t max_len,
bool include_ignorable,
bool skip_streams,
bool skip_inline_images)
{
Finder f1(is, "endstream");
std::cout << "--- BEGIN " << label << " ---" << std::endl;
bool done = false;
QPDFTokenizer tokenizer;
tokenizer.allowEOF();
if (include_ignorable) {
tokenizer.includeIgnorable();
}
qpdf_offset_t inline_image_offset = 0;
while (!done) {
QPDFTokenizer::Token token =
tokenizer.readToken(is, "test", true, inline_image_offset ? 0 : max_len);
if (inline_image_offset && (token.getType() == QPDFTokenizer::tt_bad)) {
std::cout << "EI not found; resuming normal scanning" << std::endl;
is->seek(inline_image_offset, SEEK_SET);
inline_image_offset = 0;
continue;
}
inline_image_offset = 0;
qpdf_offset_t offset = is->getLastOffset();
std::cout << offset << ": " << tokenTypeName(token.getType());
if (token.getType() != QPDFTokenizer::tt_eof) {
std::cout << ": " << sanitize(token.getValue());
if (token.getValue() != token.getRawValue()) {
std::cout << " (raw: " << sanitize(token.getRawValue()) << ")";
}
}
if (!token.getErrorMessage().empty()) {
std::cout << " (" << token.getErrorMessage() << ")";
}
std::cout << std::endl;
if (skip_streams && (token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "stream"))) {
try_skipping(tokenizer, is, max_len, "endstream", f1);
} else if (
skip_inline_images && (token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "ID"))) {
char ch;
is->read(&ch, 1);
tokenizer.expectInlineImage(is);
inline_image_offset = is->tell();
} else if (token.getType() == QPDFTokenizer::tt_eof) {
done = true;
}
}
std::cout << "--- END " << label << " ---" << std::endl;
}
static void
process(char const* filename, bool include_ignorable, size_t max_len)
{
std::shared_ptr<InputSource> is;
// Tokenize file, skipping streams
auto* fis = new FileInputSource(filename);
is = std::shared_ptr<InputSource>(fis);
dump_tokens(is, "FILE", max_len, include_ignorable, true, false);
// Tokenize content streams, skipping inline images
QPDF qpdf;
qpdf.processFile(filename);
int pageno = 0;
for (auto& page: QPDFPageDocumentHelper(qpdf).getAllPages()) {
++pageno;
Pl_Buffer plb("buffer");
page.pipeContents(&plb);
auto content_data = plb.getBufferSharedPointer();
auto* bis = new BufferInputSource("content data", content_data.get());
is = std::shared_ptr<InputSource>(bis);
dump_tokens(
is, "PAGE " + QUtil::int_to_string(pageno), max_len, include_ignorable, false, true);
}
// Tokenize object streams
for (auto& obj: qpdf.getAllObjects()) {
if (obj.isStream() && obj.getDict().getKey("/Type").isName() &&
obj.getDict().getKey("/Type").getName() == "/ObjStm") {
std::shared_ptr<Buffer> b = obj.getStreamData(qpdf_dl_specialized);
auto* bis = new BufferInputSource("object stream data", b.get());
is = std::shared_ptr<InputSource>(bis);
dump_tokens(
is,
"OBJECT STREAM " + QUtil::int_to_string(obj.getObjectID()),
max_len,
include_ignorable,
false,
false);
}
}
}
// Entry point. Accepted arguments:
//   -maxlen len     value converted with QUtil::string_to_uint and passed on
//                   as max_len
//   -no-ignorable   turn off include_ignorable (it is on by default)
//   filename        exactly one is required
// Any other option, a missing value for -maxlen, a second filename or no
// filename at all ends in usage(). An exception thrown while processing is
// reported on standard error and the program exits with status 2.
int
main(int argc, char* argv[])
{
    QUtil::setLineBuf(stdout);

    // Use the part of argv[0] after the last '/', or all of it if none.
    char const* last_slash = strrchr(argv[0], '/');
    whoami = (last_slash == nullptr) ? argv[0] : last_slash + 1;

    char const* filename = nullptr;
    size_t max_len = 0;
    bool include_ignorable = true;

    for (int i = 1; i < argc; ++i) {
        char const* arg = argv[i];
        if (arg[0] != '-') {
            // Positional argument: only a single filename is allowed.
            if (filename != nullptr) {
                usage();
            } else {
                filename = arg;
            }
            continue;
        }

        if (strcmp(arg, "-maxlen") == 0) {
            // The length is the next argument; it must be present.
            ++i;
            if (i >= argc) {
                usage();
            }
            max_len = QUtil::string_to_uint(argv[i]);
        } else if (strcmp(arg, "-no-ignorable") == 0) {
            include_ignorable = false;
        } else {
            usage();
        }
    }

    if (filename == nullptr) {
        usage();
    }

    try {
        process(filename, include_ignorable, max_len);
    } catch (std::exception const& e) {
        std::cerr << whoami << ": exception: " << e.what();
        exit(2);
    }
    return 0;
}