#include #include #include #include #include #include #include #include #include #include #include #include static char const* whoami = nullptr; void usage() { std::cerr << "Usage: " << whoami << " [-maxlen len | -no-ignorable] filename" << std::endl; exit(2); } class Finder: public InputSource::Finder { public: Finder(std::shared_ptr is, std::string const& str) : is(is), str(str) { } ~Finder() override = default; bool check() override; private: std::shared_ptr is; std::string str; }; bool Finder::check() { QPDFTokenizer tokenizer; QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true); qpdf_offset_t offset = this->is->tell(); bool result = (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str)); this->is->seek(offset - QIntC::to_offset(this->str.length()), SEEK_SET); return result; } static char const* tokenTypeName(QPDFTokenizer::token_type_e ttype) { // Do this is a case statement instead of a lookup so the compiler // will warn if we miss any. switch (ttype) { case QPDFTokenizer::tt_bad: return "bad"; case QPDFTokenizer::tt_array_close: return "array_close"; case QPDFTokenizer::tt_array_open: return "array_open"; case QPDFTokenizer::tt_brace_close: return "brace_close"; case QPDFTokenizer::tt_brace_open: return "brace_open"; case QPDFTokenizer::tt_dict_close: return "dict_close"; case QPDFTokenizer::tt_dict_open: return "dict_open"; case QPDFTokenizer::tt_integer: return "integer"; case QPDFTokenizer::tt_name: return "name"; case QPDFTokenizer::tt_real: return "real"; case QPDFTokenizer::tt_string: return "string"; case QPDFTokenizer::tt_null: return "null"; case QPDFTokenizer::tt_bool: return "bool"; case QPDFTokenizer::tt_word: return "word"; case QPDFTokenizer::tt_eof: return "eof"; case QPDFTokenizer::tt_space: return "space"; case QPDFTokenizer::tt_comment: return "comment"; case QPDFTokenizer::tt_inline_image: return "inline-image"; } return nullptr; } static std::string sanitize(std::string const& value) { std::string result; for (auto const& iter: value) { if ((iter >= 32) && (iter <= 126)) { result.append(1, iter); } else { result += "\\x" + QUtil::int_to_string_base(static_cast(iter), 16, 2); } } return result; } static void try_skipping( QPDFTokenizer& tokenizer, std::shared_ptr is, size_t max_len, char const* what, Finder& f) { std::cout << "skipping to " << what << std::endl; qpdf_offset_t offset = is->tell(); if (!is->findFirst(what, offset, 0, f)) { std::cout << what << " not found" << std::endl; is->seek(offset, SEEK_SET); } } static void dump_tokens( std::shared_ptr is, std::string const& label, size_t max_len, bool include_ignorable, bool skip_streams, bool skip_inline_images) { Finder f1(is, "endstream"); std::cout << "--- BEGIN " << label << " ---" << std::endl; bool done = false; QPDFTokenizer tokenizer; tokenizer.allowEOF(); if (include_ignorable) { tokenizer.includeIgnorable(); } qpdf_offset_t inline_image_offset = 0; while (!done) { QPDFTokenizer::Token token = tokenizer.readToken(is, "test", true, inline_image_offset ? 0 : max_len); if (inline_image_offset && (token.getType() == QPDFTokenizer::tt_bad)) { std::cout << "EI not found; resuming normal scanning" << std::endl; is->seek(inline_image_offset, SEEK_SET); inline_image_offset = 0; continue; } inline_image_offset = 0; qpdf_offset_t offset = is->getLastOffset(); std::cout << offset << ": " << tokenTypeName(token.getType()); if (token.getType() != QPDFTokenizer::tt_eof) { std::cout << ": " << sanitize(token.getValue()); if (token.getValue() != token.getRawValue()) { std::cout << " (raw: " << sanitize(token.getRawValue()) << ")"; } } if (!token.getErrorMessage().empty()) { std::cout << " (" << token.getErrorMessage() << ")"; } std::cout << std::endl; if (skip_streams && (token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "stream"))) { try_skipping(tokenizer, is, max_len, "endstream", f1); } else if ( skip_inline_images && (token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "ID"))) { char ch; is->read(&ch, 1); tokenizer.expectInlineImage(is); inline_image_offset = is->tell(); } else if (token.getType() == QPDFTokenizer::tt_eof) { done = true; } } std::cout << "--- END " << label << " ---" << std::endl; } static void process(char const* filename, bool include_ignorable, size_t max_len) { std::shared_ptr is; // Tokenize file, skipping streams auto* fis = new FileInputSource(filename); is = std::shared_ptr(fis); dump_tokens(is, "FILE", max_len, include_ignorable, true, false); // Tokenize content streams, skipping inline images QPDF qpdf; qpdf.processFile(filename); int pageno = 0; for (auto& page: QPDFPageDocumentHelper(qpdf).getAllPages()) { ++pageno; Pl_Buffer plb("buffer"); page.pipeContents(&plb); auto content_data = plb.getBufferSharedPointer(); auto* bis = new BufferInputSource("content data", content_data.get()); is = std::shared_ptr(bis); dump_tokens( is, "PAGE " + QUtil::int_to_string(pageno), max_len, include_ignorable, false, true); } // Tokenize object streams for (auto& obj: qpdf.getAllObjects()) { if (obj.isStream() && obj.getDict().getKey("/Type").isName() && obj.getDict().getKey("/Type").getName() == "/ObjStm") { std::shared_ptr b = obj.getStreamData(qpdf_dl_specialized); auto* bis = new BufferInputSource("object stream data", b.get()); is = std::shared_ptr(bis); dump_tokens( is, "OBJECT STREAM " + QUtil::int_to_string(obj.getObjectID()), max_len, include_ignorable, false, false); } } } int main(int argc, char* argv[]) { QUtil::setLineBuf(stdout); if ((whoami = strrchr(argv[0], '/')) == nullptr) { whoami = argv[0]; } else { ++whoami; } char const* filename = nullptr; size_t max_len = 0; bool include_ignorable = true; for (int i = 1; i < argc; ++i) { if (argv[i][0] == '-') { if (strcmp(argv[i], "-maxlen") == 0) { if (++i >= argc) { usage(); } max_len = QUtil::string_to_uint(argv[i]); } else if (strcmp(argv[i], "-no-ignorable") == 0) { include_ignorable = false; } else { usage(); } } else if (filename) { usage(); } else { filename = argv[i]; } } if (filename == nullptr) { usage(); } try { process(filename, include_ignorable, max_len); } catch (std::exception& e) { std::cerr << whoami << ": exception: " << e.what(); exit(2); } return 0; }