#include #include #include #include #include #include #include #include #include #include #include #include #include static char const* whoami = 0; void usage() { std::cerr << "Usage: " << whoami << " [-maxlen len | -no-ignorable] filename" << std::endl; exit(2); } class Finder: public InputSource::Finder { public: Finder(PointerHolder is, std::string const& str) : is(is), str(str) { } virtual ~Finder() { } virtual bool check(); private: PointerHolder is; std::string str; }; bool Finder::check() { QPDFTokenizer tokenizer; QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true); qpdf_offset_t offset = this->is->tell(); bool result = (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str)); this->is->seek(offset - QIntC::to_offset(this->str.length()), SEEK_SET); return result; } static char const* tokenTypeName(QPDFTokenizer::token_type_e ttype) { // Do this is a case statement instead of a lookup so the compiler // will warn if we miss any. switch (ttype) { case QPDFTokenizer::tt_bad: return "bad"; case QPDFTokenizer::tt_array_close: return "array_close"; case QPDFTokenizer::tt_array_open: return "array_open"; case QPDFTokenizer::tt_brace_close: return "brace_close"; case QPDFTokenizer::tt_brace_open: return "brace_open"; case QPDFTokenizer::tt_dict_close: return "dict_close"; case QPDFTokenizer::tt_dict_open: return "dict_open"; case QPDFTokenizer::tt_integer: return "integer"; case QPDFTokenizer::tt_name: return "name"; case QPDFTokenizer::tt_real: return "real"; case QPDFTokenizer::tt_string: return "string"; case QPDFTokenizer::tt_null: return "null"; case QPDFTokenizer::tt_bool: return "bool"; case QPDFTokenizer::tt_word: return "word"; case QPDFTokenizer::tt_eof: return "eof"; case QPDFTokenizer::tt_space: return "space"; case QPDFTokenizer::tt_comment: return "comment"; case QPDFTokenizer::tt_inline_image: return "inline-image"; } return 0; } static std::string sanitize(std::string const& value) { std::string result; for (std::string::const_iterator iter = value.begin(); iter != value.end(); ++iter) { if ((*iter >= 32) && (*iter <= 126)) { result.append(1, *iter); } else { result += "\\x" + QUtil::int_to_string_base( static_cast(*iter), 16, 2); } } return result; } static void try_skipping(QPDFTokenizer& tokenizer, PointerHolder is, size_t max_len, char const* what, Finder& f) { std::cout << "skipping to " << what << std::endl; qpdf_offset_t offset = is->tell(); if (! is->findFirst(what, offset, 0, f)) { std::cout << what << " not found" << std::endl; is->seek(offset, SEEK_SET); } } static void dump_tokens(PointerHolder is, std::string const& label, size_t max_len, bool include_ignorable, bool skip_streams, bool skip_inline_images) { Finder f1(is, "endstream"); std::cout << "--- BEGIN " << label << " ---" << std::endl; bool done = false; QPDFTokenizer tokenizer; tokenizer.allowEOF(); if (include_ignorable) { tokenizer.includeIgnorable(); } qpdf_offset_t inline_image_offset = 0; while (! done) { QPDFTokenizer::Token token = tokenizer.readToken(is, "test", true, inline_image_offset ? 0 : max_len); if (inline_image_offset && (token.getType() == QPDFTokenizer::tt_bad)) { std::cout << "EI not found; resuming normal scanning" << std::endl; is->seek(inline_image_offset, SEEK_SET); inline_image_offset = 0; continue; } inline_image_offset = 0; qpdf_offset_t offset = is->getLastOffset(); std::cout << offset << ": " << tokenTypeName(token.getType()); if (token.getType() != QPDFTokenizer::tt_eof) { std::cout << ": " << sanitize(token.getValue()); if (token.getValue() != token.getRawValue()) { std::cout << " (raw: " << sanitize(token.getRawValue()) << ")"; } } if (! token.getErrorMessage().empty()) { std::cout << " (" << token.getErrorMessage() << ")"; } std::cout << std::endl; if (skip_streams && (token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "stream"))) { try_skipping(tokenizer, is, max_len, "endstream", f1); } else if (skip_inline_images && (token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "ID"))) { char ch; is->read(&ch, 1); tokenizer.expectInlineImage(is); inline_image_offset = is->tell(); } else if (token.getType() == QPDFTokenizer::tt_eof) { done = true; } } std::cout << "--- END " << label << " ---" << std::endl; } static void process(char const* filename, bool include_ignorable, size_t max_len) { PointerHolder is; // Tokenize file, skipping streams FileInputSource* fis = new FileInputSource(); fis->setFilename(filename); is = fis; dump_tokens(is, "FILE", max_len, include_ignorable, true, false); // Tokenize content streams, skipping inline images QPDF qpdf; qpdf.processFile(filename); std::vector pages = QPDFPageDocumentHelper(qpdf).getAllPages(); int pageno = 0; for (std::vector::iterator iter = pages.begin(); iter != pages.end(); ++iter) { ++pageno; Pl_Buffer plb("buffer"); (*iter).pipeContents(&plb); PointerHolder content_data = plb.getBuffer(); BufferInputSource* bis = new BufferInputSource( "content data", content_data.getPointer()); is = bis; dump_tokens(is, "PAGE " + QUtil::int_to_string(pageno), max_len, include_ignorable, false, true); } // Tokenize object streams std::vector all = qpdf.getAllObjects(); for (std::vector::iterator iter = all.begin(); iter != all.end(); ++iter) { if ((*iter).isStream() && (*iter).getDict().getKey("/Type").isName() && (*iter).getDict().getKey("/Type").getName() == "/ObjStm") { PointerHolder b = (*iter).getStreamData(qpdf_dl_specialized); BufferInputSource* bis = new BufferInputSource( "object stream data", b.getPointer()); is = bis; dump_tokens(is, "OBJECT STREAM " + QUtil::int_to_string((*iter).getObjectID()), max_len, include_ignorable, false, false); } } } int main(int argc, char* argv[]) { QUtil::setLineBuf(stdout); if ((whoami = strrchr(argv[0], '/')) == NULL) { whoami = argv[0]; } else { ++whoami; } // For libtool's sake.... if (strncmp(whoami, "lt-", 3) == 0) { whoami += 3; } char const* filename = 0; size_t max_len = 0; bool include_ignorable = true; for (int i = 1; i < argc; ++i) { if (argv[i][0] == '-') { if (strcmp(argv[i], "-maxlen") == 0) { if (++i >= argc) { usage(); } max_len = QUtil::string_to_uint(argv[i]); } else if (strcmp(argv[i], "-no-ignorable") == 0) { include_ignorable = false; } else { usage(); } } else if (filename) { usage(); } else { filename = argv[i]; } } if (filename == 0) { usage(); } try { process(filename, include_ignorable, max_len); } catch (std::exception& e) { std::cerr << whoami << ": exception: " << e.what(); exit(2); } return 0; }