mirror of
https://github.com/qpdf/qpdf.git
synced 2024-11-17 01:55:09 +00:00
ec9e310c9e
Add a version of expectInlineImage that takes an input source and searches for EI. This is in preparation for improving the way EI is found. This commit just refactors the code without changing the functionality and adds tests to make sure the old and new code behave identically.
327 lines
8.9 KiB
C++
327 lines
8.9 KiB
C++
#include <qpdf/QPDFTokenizer.hh>
|
|
#include <qpdf/QPDFPageDocumentHelper.hh>
|
|
#include <qpdf/QPDFPageObjectHelper.hh>
|
|
#include <qpdf/QUtil.hh>
|
|
#include <qpdf/FileInputSource.hh>
|
|
#include <qpdf/BufferInputSource.hh>
|
|
#include <qpdf/QPDF.hh>
|
|
#include <qpdf/Pl_Buffer.hh>
|
|
#include <stdlib.h>
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include <iostream>
|
|
|
|
// Program name shown in the usage text and in error messages. It stays
// null until main() derives it from argv[0].
static char const* whoami = 0;
|
|
|
|
void usage()
|
|
{
|
|
std::cerr << "Usage: " << whoami
|
|
<< " [-maxlen len | -no-ignorable | -old-ei] filename"
|
|
<< std::endl;
|
|
exit(2);
|
|
}
|
|
|
|
class Finder: public InputSource::Finder
|
|
{
|
|
public:
|
|
Finder(PointerHolder<InputSource> is, std::string const& str) :
|
|
is(is),
|
|
str(str)
|
|
{
|
|
}
|
|
virtual ~Finder()
|
|
{
|
|
}
|
|
virtual bool check();
|
|
|
|
private:
|
|
PointerHolder<InputSource> is;
|
|
std::string str;
|
|
};
|
|
|
|
bool
|
|
Finder::check()
|
|
{
|
|
QPDFTokenizer tokenizer;
|
|
QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true);
|
|
qpdf_offset_t offset = this->is->tell();
|
|
bool result = (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str));
|
|
this->is->seek(offset - this->str.length(), SEEK_SET);
|
|
return result;
|
|
}
|
|
|
|
// Translate a tokenizer token type into the short label printed in the
// token dump. The result is a null pointer if ttype matches none of the
// enumerators handled below.
static char const* tokenTypeName(QPDFTokenizer::token_type_e ttype)
{
    // Do this as a switch statement without a default label instead of
    // a lookup table so the compiler will warn if we miss any
    // enumerator.
    char const* name = 0;
    switch (ttype)
    {
      case QPDFTokenizer::tt_bad:
        name = "bad";
        break;
      case QPDFTokenizer::tt_array_close:
        name = "array_close";
        break;
      case QPDFTokenizer::tt_array_open:
        name = "array_open";
        break;
      case QPDFTokenizer::tt_brace_close:
        name = "brace_close";
        break;
      case QPDFTokenizer::tt_brace_open:
        name = "brace_open";
        break;
      case QPDFTokenizer::tt_dict_close:
        name = "dict_close";
        break;
      case QPDFTokenizer::tt_dict_open:
        name = "dict_open";
        break;
      case QPDFTokenizer::tt_integer:
        name = "integer";
        break;
      case QPDFTokenizer::tt_name:
        name = "name";
        break;
      case QPDFTokenizer::tt_real:
        name = "real";
        break;
      case QPDFTokenizer::tt_string:
        name = "string";
        break;
      case QPDFTokenizer::tt_null:
        name = "null";
        break;
      case QPDFTokenizer::tt_bool:
        name = "bool";
        break;
      case QPDFTokenizer::tt_word:
        name = "word";
        break;
      case QPDFTokenizer::tt_eof:
        name = "eof";
        break;
      case QPDFTokenizer::tt_space:
        name = "space";
        break;
      case QPDFTokenizer::tt_comment:
        name = "comment";
        break;
      case QPDFTokenizer::tt_inline_image:
        name = "inline-image";
        break;
    }
    return name;
}
|
|
|
|
static std::string
|
|
sanitize(std::string const& value)
|
|
{
|
|
std::string result;
|
|
for (std::string::const_iterator iter = value.begin(); iter != value.end();
|
|
++iter)
|
|
{
|
|
if ((*iter >= 32) && (*iter <= 126))
|
|
{
|
|
result.append(1, *iter);
|
|
}
|
|
else
|
|
{
|
|
result += "\\x" + QUtil::int_to_string_base(
|
|
static_cast<unsigned char>(*iter), 16, 2);
|
|
}
|
|
}
|
|
return result;
|
|
}
|
|
|
|
static void
|
|
try_skipping(QPDFTokenizer& tokenizer, PointerHolder<InputSource> is,
|
|
size_t max_len, char const* what, Finder& f)
|
|
{
|
|
std::cout << "skipping to " << what << std::endl;
|
|
qpdf_offset_t offset = is->tell();
|
|
if (! is->findFirst(what, offset, 0, f))
|
|
{
|
|
std::cout << what << " not found" << std::endl;
|
|
is->seek(offset, SEEK_SET);
|
|
}
|
|
}
|
|
|
|
static void
|
|
dump_tokens(PointerHolder<InputSource> is, std::string const& label,
|
|
size_t max_len, bool include_ignorable,
|
|
bool skip_streams, bool skip_inline_images, bool old_ei)
|
|
{
|
|
Finder f1(is, "endstream");
|
|
std::cout << "--- BEGIN " << label << " ---" << std::endl;
|
|
bool done = false;
|
|
QPDFTokenizer tokenizer;
|
|
tokenizer.allowEOF();
|
|
if (include_ignorable)
|
|
{
|
|
tokenizer.includeIgnorable();
|
|
}
|
|
qpdf_offset_t inline_image_offset = 0;
|
|
while (! done)
|
|
{
|
|
QPDFTokenizer::Token token =
|
|
tokenizer.readToken(is, "test", true,
|
|
inline_image_offset ? 0 : max_len);
|
|
if (inline_image_offset && (token.getType() == QPDFTokenizer::tt_bad))
|
|
{
|
|
std::cout << "EI not found; resuming normal scanning" << std::endl;
|
|
is->seek(inline_image_offset, SEEK_SET);
|
|
inline_image_offset = 0;
|
|
continue;
|
|
}
|
|
inline_image_offset = 0;
|
|
|
|
qpdf_offset_t offset = is->getLastOffset();
|
|
std::cout << offset << ": "
|
|
<< tokenTypeName(token.getType());
|
|
if (token.getType() != QPDFTokenizer::tt_eof)
|
|
{
|
|
std::cout << ": "
|
|
<< sanitize(token.getValue());
|
|
if (token.getValue() != token.getRawValue())
|
|
{
|
|
std::cout << " (raw: " << sanitize(token.getRawValue()) << ")";
|
|
}
|
|
}
|
|
if (token.getType() == QPDFTokenizer::tt_bad)
|
|
{
|
|
std::cout << " (" << token.getErrorMessage() << ")";
|
|
}
|
|
std::cout << std::endl;
|
|
if (skip_streams &&
|
|
(token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "stream")))
|
|
{
|
|
try_skipping(tokenizer, is, max_len, "endstream", f1);
|
|
}
|
|
else if (skip_inline_images &&
|
|
(token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "ID")))
|
|
{
|
|
if (old_ei)
|
|
{
|
|
tokenizer.expectInlineImage();
|
|
}
|
|
else
|
|
{
|
|
tokenizer.expectInlineImage(is);
|
|
}
|
|
inline_image_offset = is->tell();
|
|
}
|
|
else if (token.getType() == QPDFTokenizer::tt_eof)
|
|
{
|
|
done = true;
|
|
}
|
|
}
|
|
std::cout << "--- END " << label << " ---" << std::endl;
|
|
}
|
|
|
|
static void process(char const* filename, bool include_ignorable,
|
|
size_t max_len, bool old_ei)
|
|
{
|
|
PointerHolder<InputSource> is;
|
|
|
|
// Tokenize file, skipping streams
|
|
FileInputSource* fis = new FileInputSource();
|
|
fis->setFilename(filename);
|
|
is = fis;
|
|
dump_tokens(is, "FILE", max_len, include_ignorable, true, false, false);
|
|
|
|
// Tokenize content streams, skipping inline images
|
|
QPDF qpdf;
|
|
qpdf.processFile(filename);
|
|
std::vector<QPDFPageObjectHelper> pages =
|
|
QPDFPageDocumentHelper(qpdf).getAllPages();
|
|
int pageno = 0;
|
|
for (std::vector<QPDFPageObjectHelper>::iterator iter = pages.begin();
|
|
iter != pages.end(); ++iter)
|
|
{
|
|
++pageno;
|
|
Pl_Buffer plb("buffer");
|
|
(*iter).pipePageContents(&plb);
|
|
PointerHolder<Buffer> content_data = plb.getBuffer();
|
|
BufferInputSource* bis = new BufferInputSource(
|
|
"content data", content_data.getPointer());
|
|
is = bis;
|
|
dump_tokens(is, "PAGE " + QUtil::int_to_string(pageno),
|
|
max_len, include_ignorable, false, true, old_ei);
|
|
}
|
|
|
|
// Tokenize object streams
|
|
std::vector<QPDFObjectHandle> all = qpdf.getAllObjects();
|
|
for (std::vector<QPDFObjectHandle>::iterator iter = all.begin();
|
|
iter != all.end(); ++iter)
|
|
{
|
|
if ((*iter).isStream() &&
|
|
(*iter).getDict().getKey("/Type").isName() &&
|
|
(*iter).getDict().getKey("/Type").getName() == "/ObjStm")
|
|
{
|
|
PointerHolder<Buffer> b =
|
|
(*iter).getStreamData(qpdf_dl_specialized);
|
|
BufferInputSource* bis = new BufferInputSource(
|
|
"object stream data", b.getPointer());
|
|
is = bis;
|
|
dump_tokens(is, "OBJECT STREAM " +
|
|
QUtil::int_to_string((*iter).getObjectID()),
|
|
max_len, include_ignorable, false, false, false);
|
|
}
|
|
}
|
|
}
|
|
|
|
int main(int argc, char* argv[])
|
|
{
|
|
QUtil::setLineBuf(stdout);
|
|
if ((whoami = strrchr(argv[0], '/')) == NULL)
|
|
{
|
|
whoami = argv[0];
|
|
}
|
|
else
|
|
{
|
|
++whoami;
|
|
}
|
|
// For libtool's sake....
|
|
if (strncmp(whoami, "lt-", 3) == 0)
|
|
{
|
|
whoami += 3;
|
|
}
|
|
|
|
char const* filename = 0;
|
|
size_t max_len = 0;
|
|
bool include_ignorable = true;
|
|
bool old_ei = false;
|
|
for (int i = 1; i < argc; ++i)
|
|
{
|
|
if (argv[i][0] == '-')
|
|
{
|
|
if (strcmp(argv[i], "-maxlen") == 0)
|
|
{
|
|
if (++i >= argc)
|
|
{
|
|
usage();
|
|
}
|
|
max_len = QUtil::string_to_int(argv[i]);
|
|
}
|
|
else if (strcmp(argv[i], "-no-ignorable") == 0)
|
|
{
|
|
include_ignorable = false;
|
|
}
|
|
else if (strcmp(argv[i], "-old-ei") == 0)
|
|
{
|
|
old_ei = true;
|
|
}
|
|
else
|
|
{
|
|
usage();
|
|
}
|
|
}
|
|
else if (filename)
|
|
{
|
|
usage();
|
|
}
|
|
else
|
|
{
|
|
filename = argv[i];
|
|
}
|
|
}
|
|
if (filename == 0)
|
|
{
|
|
usage();
|
|
}
|
|
|
|
try
|
|
{
|
|
process(filename, include_ignorable, max_len, old_ei);
|
|
}
|
|
catch (std::exception& e)
|
|
{
|
|
std::cerr << whoami << ": exception: " << e.what();
|
|
exit(2);
|
|
}
|
|
return 0;
|
|
}
|