From 6670c685ab9f929121c5498115b278c95574e461 Mon Sep 17 00:00:00 2001 From: m-holger Date: Tue, 16 Aug 2022 13:59:32 +0100 Subject: [PATCH] Move QPDFObjectHandle::parseInternal to new class QPDFParser Part of #729 --- include/qpdf/QPDF.hh | 3 +- include/qpdf/QPDFObjectHandle.hh | 21 +- libqpdf/CMakeLists.txt | 1 + libqpdf/QPDFObjectHandle.cc | 498 +----------------------------- libqpdf/QPDFParser.cc | 503 +++++++++++++++++++++++++++++++ libqpdf/qpdf/QPDFParser.hh | 50 +++ qpdf/qpdf.testcov | 22 +- 7 files changed, 576 insertions(+), 522 deletions(-) create mode 100644 libqpdf/QPDFParser.cc create mode 100644 libqpdf/qpdf/QPDFParser.hh diff --git a/include/qpdf/QPDF.hh b/include/qpdf/QPDF.hh index 7c389c1d..12d41eff 100644 --- a/include/qpdf/QPDF.hh +++ b/include/qpdf/QPDF.hh @@ -49,6 +49,7 @@ class QPDF_Stream; class BitStream; class BitWriter; class QPDFLogger; +class QPDFParser; class QPDF { @@ -881,7 +882,7 @@ class QPDF // resolution class ParseGuard { - friend class QPDFObjectHandle; + friend class QPDFParser; private: ParseGuard(QPDF* qpdf) : diff --git a/include/qpdf/QPDFObjectHandle.hh b/include/qpdf/QPDFObjectHandle.hh index 4b054928..8f38cb24 100644 --- a/include/qpdf/QPDFObjectHandle.hh +++ b/include/qpdf/QPDFObjectHandle.hh @@ -49,9 +49,12 @@ class QPDFTokenizer; class QPDFExc; class Pl_QPDFTokenizer; class QPDFMatrix; +class QPDFParser; class QPDFObjectHandle { + friend class QPDFParser; + public: // This class is used by replaceStreamData. It provides an // alternative way of associating stream data with a stream. See @@ -1563,15 +1566,6 @@ class QPDFObjectHandle QPDFObjectHandle(QPDF*, QPDFObjGen const& og); QPDFObjectHandle(std::shared_ptr const&); - enum parser_state_e { - st_top, - st_start, - st_stop, - st_eof, - st_dictionary, - st_array - }; - // Private object factory methods static QPDFObjectHandle newIndirect(QPDF*, QPDFObjGen const& og); static QPDFObjectHandle newStream( @@ -1599,14 +1593,7 @@ class QPDFObjectHandle std::string const&, std::shared_ptr, qpdf_offset_t); - static QPDFObjectHandle parseInternal( - std::shared_ptr input, - std::string const& object_description, - QPDFTokenizer& tokenizer, - bool& empty, - StringDecrypter* decrypter, - QPDF* context, - bool content_stream); + void setParsedOffset(qpdf_offset_t offset); void parseContentStream_internal( std::string const& description, ParserCallbacks* callbacks); diff --git a/libqpdf/CMakeLists.txt b/libqpdf/CMakeLists.txt index cf807f6d..46d35959 100644 --- a/libqpdf/CMakeLists.txt +++ b/libqpdf/CMakeLists.txt @@ -80,6 +80,7 @@ set(libqpdf_SOURCES QPDFPageDocumentHelper.cc QPDFPageLabelDocumentHelper.cc QPDFPageObjectHelper.cc + QPDFParser.cc QPDFStreamFilter.cc QPDFSystemError.cc QPDFTokenizer.cc diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc index 8a2d59e3..377a1cbb 100644 --- a/libqpdf/QPDFObjectHandle.cc +++ b/libqpdf/QPDFObjectHandle.cc @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -1879,8 +1880,8 @@ QPDFObjectHandle::parseContentStream_data( tokenizer.readToken(input, "content", true); qpdf_offset_t offset = input->getLastOffset(); input->seek(offset, SEEK_SET); - QPDFObjectHandle obj = parseInternal( - input, "content", tokenizer, empty, nullptr, context, true); + auto obj = QPDFParser(input, "content", tokenizer, nullptr, context) + .parse(empty, true); if (!obj.isInitialized()) { // EOF break; @@ -1943,497 +1944,8 @@ QPDFObjectHandle::parse( StringDecrypter* decrypter, QPDF* context) { - return parseInternal( - input, object_description, tokenizer, empty, decrypter, context, false); -} - -QPDFObjectHandle -QPDFObjectHandle::parseInternal( - std::shared_ptr input, - std::string const& object_description, - QPDFTokenizer& tokenizer, - bool& empty, - StringDecrypter* decrypter, - QPDF* context, - bool content_stream) -{ - // This method must take care not to resolve any objects. Don't - // check the type of any object without first ensuring that it is - // a direct object. Otherwise, doing so may have the side effect - // of reading the object and changing the file pointer. If you do - // this, it will cause a logic error to be thrown from - // QPDF::inParse(). - - QPDF::ParseGuard pg(context); - - empty = false; - - QPDFObjectHandle object; - bool set_offset = false; - - std::vector olist_stack; - olist_stack.push_back(SparseOHArray()); - std::vector state_stack; - state_stack.push_back(st_top); - std::vector offset_stack; - qpdf_offset_t offset = input->tell(); - offset_stack.push_back(offset); - bool done = false; - int bad_count = 0; - int good_count = 0; - bool b_contents = false; - std::vector contents_string_stack; - contents_string_stack.push_back(""); - std::vector contents_offset_stack; - contents_offset_stack.push_back(-1); - while (!done) { - bool bad = false; - SparseOHArray& olist = olist_stack.back(); - parser_state_e state = state_stack.back(); - offset = offset_stack.back(); - std::string& contents_string = contents_string_stack.back(); - qpdf_offset_t& contents_offset = contents_offset_stack.back(); - - object = QPDFObjectHandle(); - set_offset = false; - - QPDFTokenizer::Token token = - tokenizer.readToken(input, object_description, true); - std::string const& token_error_message = token.getErrorMessage(); - if (!token_error_message.empty()) { - // Tokens other than tt_bad can still generate warnings. - warn( - context, - QPDFExc( - qpdf_e_damaged_pdf, - input->getName(), - object_description, - input->getLastOffset(), - token_error_message)); - } - - switch (token.getType()) { - case QPDFTokenizer::tt_eof: - if (!content_stream) { - QTC::TC("qpdf", "QPDFObjectHandle eof in parseInternal"); - warn( - context, - QPDFExc( - qpdf_e_damaged_pdf, - input->getName(), - object_description, - input->getLastOffset(), - "unexpected EOF")); - } - bad = true; - state = st_eof; - break; - - case QPDFTokenizer::tt_bad: - QTC::TC("qpdf", "QPDFObjectHandle bad token in parse"); - bad = true; - object = newNull(); - break; - - case QPDFTokenizer::tt_brace_open: - case QPDFTokenizer::tt_brace_close: - QTC::TC("qpdf", "QPDFObjectHandle bad brace"); - warn( - context, - QPDFExc( - qpdf_e_damaged_pdf, - input->getName(), - object_description, - input->getLastOffset(), - "treating unexpected brace token as null")); - bad = true; - object = newNull(); - break; - - case QPDFTokenizer::tt_array_close: - if (state == st_array) { - state = st_stop; - } else { - QTC::TC("qpdf", "QPDFObjectHandle bad array close"); - warn( - context, - QPDFExc( - qpdf_e_damaged_pdf, - input->getName(), - object_description, - input->getLastOffset(), - "treating unexpected array close token as null")); - bad = true; - object = newNull(); - } - break; - - case QPDFTokenizer::tt_dict_close: - if (state == st_dictionary) { - state = st_stop; - } else { - QTC::TC("qpdf", "QPDFObjectHandle bad dictionary close"); - warn( - context, - QPDFExc( - qpdf_e_damaged_pdf, - input->getName(), - object_description, - input->getLastOffset(), - "unexpected dictionary close token")); - bad = true; - object = newNull(); - } - break; - - case QPDFTokenizer::tt_array_open: - case QPDFTokenizer::tt_dict_open: - if (olist_stack.size() > 500) { - QTC::TC("qpdf", "QPDFObjectHandle too deep"); - warn( - context, - QPDFExc( - qpdf_e_damaged_pdf, - input->getName(), - object_description, - input->getLastOffset(), - "ignoring excessively deeply nested data structure")); - bad = true; - object = newNull(); - state = st_top; - } else { - olist_stack.push_back(SparseOHArray()); - state = st_start; - offset_stack.push_back(input->tell()); - state_stack.push_back( - (token.getType() == QPDFTokenizer::tt_array_open) - ? st_array - : st_dictionary); - b_contents = false; - contents_string_stack.push_back(""); - contents_offset_stack.push_back(-1); - } - break; - - case QPDFTokenizer::tt_bool: - object = newBool((token.getValue() == "true")); - break; - - case QPDFTokenizer::tt_null: - object = newNull(); - break; - - case QPDFTokenizer::tt_integer: - object = newInteger(QUtil::string_to_ll(token.getValue().c_str())); - break; - - case QPDFTokenizer::tt_real: - object = newReal(token.getValue()); - break; - - case QPDFTokenizer::tt_name: - { - std::string name = token.getValue(); - object = newName(name); - - if (name == "/Contents") { - b_contents = true; - } else { - b_contents = false; - } - } - break; - - case QPDFTokenizer::tt_word: - { - std::string const& value = token.getValue(); - if (content_stream) { - object = QPDFObjectHandle::newOperator(value); - } else if ( - (value == "R") && (state != st_top) && - (olist.size() >= 2) && - (!olist.at(olist.size() - 1).isIndirect()) && - (olist.at(olist.size() - 1).isInteger()) && - (!olist.at(olist.size() - 2).isIndirect()) && - (olist.at(olist.size() - 2).isInteger())) { - if (context == nullptr) { - QTC::TC( - "qpdf", - "QPDFObjectHandle indirect without context"); - throw std::logic_error( - "QPDFObjectHandle::parse called without context" - " on an object with indirect references"); - } - // Try to resolve indirect objects - object = newIndirect( - context, - QPDFObjGen( - olist.at(olist.size() - 2).getIntValueAsInt(), - olist.at(olist.size() - 1).getIntValueAsInt())); - olist.remove_last(); - olist.remove_last(); - } else if ((value == "endobj") && (state == st_top)) { - // We just saw endobj without having read - // anything. Treat this as a null and do not move - // the input source's offset. - object = newNull(); - input->seek(input->getLastOffset(), SEEK_SET); - empty = true; - } else { - QTC::TC("qpdf", "QPDFObjectHandle treat word as string"); - warn( - context, - QPDFExc( - qpdf_e_damaged_pdf, - input->getName(), - object_description, - input->getLastOffset(), - "unknown token while reading object;" - " treating as string")); - bad = true; - object = newString(value); - } - } - break; - - case QPDFTokenizer::tt_string: - { - std::string val = token.getValue(); - if (decrypter) { - if (b_contents) { - contents_string = val; - contents_offset = input->getLastOffset(); - b_contents = false; - } - decrypter->decryptString(val); - } - object = QPDFObjectHandle::newString(val); - } - - break; - - default: - warn( - context, - QPDFExc( - qpdf_e_damaged_pdf, - input->getName(), - object_description, - input->getLastOffset(), - "treating unknown token type as null while " - "reading object")); - bad = true; - object = newNull(); - break; - } - - if ((!object.isInitialized()) && - (!((state == st_start) || (state == st_stop) || - (state == st_eof)))) { - throw std::logic_error("QPDFObjectHandle::parseInternal: " - "unexpected uninitialized object"); - object = newNull(); - } - - if (bad) { - ++bad_count; - good_count = 0; - } else { - ++good_count; - if (good_count > 3) { - bad_count = 0; - } - } - if (bad_count > 5) { - // We had too many consecutive errors without enough - // intervening successful objects. Give up. - warn( - context, - QPDFExc( - qpdf_e_damaged_pdf, - input->getName(), - object_description, - input->getLastOffset(), - "too many errors; giving up on reading object")); - state = st_top; - object = newNull(); - } - - switch (state) { - case st_eof: - if (state_stack.size() > 1) { - warn( - context, - QPDFExc( - qpdf_e_damaged_pdf, - input->getName(), - object_description, - input->getLastOffset(), - "parse error while reading object")); - } - done = true; - // In content stream mode, leave object uninitialized to - // indicate EOF - if (!content_stream) { - object = newNull(); - } - break; - - case st_dictionary: - case st_array: - setObjectDescriptionFromInput( - object, - context, - object_description, - input, - input->getLastOffset()); - object.setParsedOffset(input->getLastOffset()); - set_offset = true; - olist.append(object); - break; - - case st_top: - done = true; - break; - - case st_start: - break; - - case st_stop: - if ((state_stack.size() < 2) || (olist_stack.size() < 2)) { - throw std::logic_error( - "QPDFObjectHandle::parseInternal: st_stop encountered" - " with insufficient elements in stack"); - } - parser_state_e old_state = state_stack.back(); - state_stack.pop_back(); - if (old_state == st_array) { - // There's no newArray(SparseOHArray) since - // SparseOHArray is not part of the public API. - object = QPDFObjectHandle(QPDF_Array::create(olist)); - setObjectDescriptionFromInput( - object, context, object_description, input, offset); - // The `offset` points to the next of "[". Set the - // rewind offset to point to the beginning of "[". - // This has been explicitly tested with whitespace - // surrounding the array start delimiter. - // getLastOffset points to the array end token and - // therefore can't be used here. - object.setParsedOffset(offset - 1); - set_offset = true; - } else if (old_state == st_dictionary) { - // Convert list to map. Alternating elements are keys. - // Attempt to recover more or less gracefully from - // invalid dictionaries. - std::set names; - size_t n_elements = olist.size(); - for (size_t i = 0; i < n_elements; ++i) { - QPDFObjectHandle oh = olist.at(i); - if ((!oh.isIndirect()) && oh.isName()) { - names.insert(oh.getName()); - } - } - - std::map dict; - int next_fake_key = 1; - for (unsigned int i = 0; i < olist.size(); ++i) { - QPDFObjectHandle key_obj = olist.at(i); - QPDFObjectHandle val; - if (key_obj.isIndirect() || (!key_obj.isName())) { - bool found_fake = false; - std::string candidate; - while (!found_fake) { - candidate = "/QPDFFake" + - QUtil::int_to_string(next_fake_key++); - found_fake = (names.count(candidate) == 0); - QTC::TC( - "qpdf", - "QPDFObjectHandle found fake", - (found_fake ? 0 : 1)); - } - warn( - context, - QPDFExc( - qpdf_e_damaged_pdf, - input->getName(), - object_description, - offset, - "expected dictionary key but found" - " non-name object; inserting key " + - candidate)); - val = key_obj; - key_obj = newName(candidate); - } else if (i + 1 >= olist.size()) { - QTC::TC("qpdf", "QPDFObjectHandle no val for last key"); - warn( - context, - QPDFExc( - qpdf_e_damaged_pdf, - input->getName(), - object_description, - offset, - "dictionary ended prematurely; " - "using null as value for last key")); - val = newNull(); - setObjectDescriptionFromInput( - val, context, object_description, input, offset); - } else { - val = olist.at(++i); - } - std::string key = key_obj.getName(); - if (dict.count(key) > 0) { - QTC::TC("qpdf", "QPDFObjectHandle duplicate dict key"); - warn( - context, - QPDFExc( - qpdf_e_damaged_pdf, - input->getName(), - object_description, - offset, - "dictionary has duplicated key " + key + - "; last occurrence overrides earlier " - "ones")); - } - dict[key] = val; - } - if (!contents_string.empty() && dict.count("/Type") && - dict["/Type"].isNameAndEquals("/Sig") && - dict.count("/ByteRange") && dict.count("/Contents") && - dict["/Contents"].isString()) { - dict["/Contents"] = - QPDFObjectHandle::newString(contents_string); - dict["/Contents"].setParsedOffset(contents_offset); - } - object = newDictionary(dict); - setObjectDescriptionFromInput( - object, context, object_description, input, offset); - // The `offset` points to the next of "<<". Set the - // rewind offset to point to the beginning of "<<". - // This has been explicitly tested with whitespace - // surrounding the dictionary start delimiter. - // getLastOffset points to the dictionary end token - // and therefore can't be used here. - object.setParsedOffset(offset - 2); - set_offset = true; - } - olist_stack.pop_back(); - offset_stack.pop_back(); - if (state_stack.back() == st_top) { - done = true; - } else { - olist_stack.back().append(object); - } - contents_string_stack.pop_back(); - contents_offset_stack.pop_back(); - } - } - - if (!set_offset) { - setObjectDescriptionFromInput( - object, context, object_description, input, offset); - object.setParsedOffset(offset); - } - return object; + return QPDFParser(input, object_description, tokenizer, decrypter, context) + .parse(empty, false); } qpdf_offset_t diff --git a/libqpdf/QPDFParser.cc b/libqpdf/QPDFParser.cc new file mode 100644 index 00000000..e86a44bd --- /dev/null +++ b/libqpdf/QPDFParser.cc @@ -0,0 +1,503 @@ +#include + +#include +#include +#include +#include +#include +#include + +QPDFObjectHandle +QPDFParser::parse(bool& empty, bool content_stream) +{ + // This method must take care not to resolve any objects. Don't + // check the type of any object without first ensuring that it is + // a direct object. Otherwise, doing so may have the side effect + // of reading the object and changing the file pointer. If you do + // this, it will cause a logic error to be thrown from + // QPDF::inParse(). + + QPDF::ParseGuard pg(context); + + empty = false; + + QPDFObjectHandle object; + bool set_offset = false; + + std::vector olist_stack; + olist_stack.push_back(SparseOHArray()); + std::vector state_stack; + state_stack.push_back(st_top); + std::vector offset_stack; + qpdf_offset_t offset = input->tell(); + offset_stack.push_back(offset); + bool done = false; + int bad_count = 0; + int good_count = 0; + bool b_contents = false; + std::vector contents_string_stack; + contents_string_stack.push_back(""); + std::vector contents_offset_stack; + contents_offset_stack.push_back(-1); + while (!done) { + bool bad = false; + SparseOHArray& olist = olist_stack.back(); + parser_state_e state = state_stack.back(); + offset = offset_stack.back(); + std::string& contents_string = contents_string_stack.back(); + qpdf_offset_t& contents_offset = contents_offset_stack.back(); + + object = QPDFObjectHandle(); + set_offset = false; + + QPDFTokenizer::Token token = + tokenizer.readToken(input, object_description, true); + std::string const& token_error_message = token.getErrorMessage(); + if (!token_error_message.empty()) { + // Tokens other than tt_bad can still generate warnings. + warn( + context, + QPDFExc( + qpdf_e_damaged_pdf, + input->getName(), + object_description, + input->getLastOffset(), + token_error_message)); + } + + switch (token.getType()) { + case QPDFTokenizer::tt_eof: + if (!content_stream) { + QTC::TC("qpdf", "QPDFParser eof in parse"); + warn( + context, + QPDFExc( + qpdf_e_damaged_pdf, + input->getName(), + object_description, + input->getLastOffset(), + "unexpected EOF")); + } + bad = true; + state = st_eof; + break; + + case QPDFTokenizer::tt_bad: + QTC::TC("qpdf", "QPDFParser bad token in parse"); + bad = true; + object = QPDFObjectHandle::newNull(); + break; + + case QPDFTokenizer::tt_brace_open: + case QPDFTokenizer::tt_brace_close: + QTC::TC("qpdf", "QPDFParser bad brace"); + warn( + context, + QPDFExc( + qpdf_e_damaged_pdf, + input->getName(), + object_description, + input->getLastOffset(), + "treating unexpected brace token as null")); + bad = true; + object = QPDFObjectHandle::newNull(); + break; + + case QPDFTokenizer::tt_array_close: + if (state == st_array) { + state = st_stop; + } else { + QTC::TC("qpdf", "QPDFParser bad array close"); + warn( + context, + QPDFExc( + qpdf_e_damaged_pdf, + input->getName(), + object_description, + input->getLastOffset(), + "treating unexpected array close token as null")); + bad = true; + object = QPDFObjectHandle::newNull(); + } + break; + + case QPDFTokenizer::tt_dict_close: + if (state == st_dictionary) { + state = st_stop; + } else { + QTC::TC("qpdf", "QPDFParser bad dictionary close"); + warn( + context, + QPDFExc( + qpdf_e_damaged_pdf, + input->getName(), + object_description, + input->getLastOffset(), + "unexpected dictionary close token")); + bad = true; + object = QPDFObjectHandle::newNull(); + } + break; + + case QPDFTokenizer::tt_array_open: + case QPDFTokenizer::tt_dict_open: + if (olist_stack.size() > 500) { + QTC::TC("qpdf", "QPDFParser too deep"); + warn( + context, + QPDFExc( + qpdf_e_damaged_pdf, + input->getName(), + object_description, + input->getLastOffset(), + "ignoring excessively deeply nested data structure")); + bad = true; + object = QPDFObjectHandle::newNull(); + state = st_top; + } else { + olist_stack.push_back(SparseOHArray()); + state = st_start; + offset_stack.push_back(input->tell()); + state_stack.push_back( + (token.getType() == QPDFTokenizer::tt_array_open) + ? st_array + : st_dictionary); + b_contents = false; + contents_string_stack.push_back(""); + contents_offset_stack.push_back(-1); + } + break; + + case QPDFTokenizer::tt_bool: + object = QPDFObjectHandle::newBool((token.getValue() == "true")); + break; + + case QPDFTokenizer::tt_null: + object = QPDFObjectHandle::newNull(); + break; + + case QPDFTokenizer::tt_integer: + object = QPDFObjectHandle::newInteger( + QUtil::string_to_ll(token.getValue().c_str())); + break; + + case QPDFTokenizer::tt_real: + object = QPDFObjectHandle::newReal(token.getValue()); + break; + + case QPDFTokenizer::tt_name: + { + std::string name = token.getValue(); + object = QPDFObjectHandle::newName(name); + + if (name == "/Contents") { + b_contents = true; + } else { + b_contents = false; + } + } + break; + + case QPDFTokenizer::tt_word: + { + std::string const& value = token.getValue(); + if (content_stream) { + object = QPDFObjectHandle::newOperator(value); + } else if ( + (value == "R") && (state != st_top) && + (olist.size() >= 2) && + (!olist.at(olist.size() - 1).isIndirect()) && + (olist.at(olist.size() - 1).isInteger()) && + (!olist.at(olist.size() - 2).isIndirect()) && + (olist.at(olist.size() - 2).isInteger())) { + if (context == nullptr) { + QTC::TC("qpdf", "QPDFParser indirect without context"); + throw std::logic_error( + "QPDFObjectHandle::parse called without context" + " on an object with indirect references"); + } + // Try to resolve indirect objects + object = QPDFObjectHandle::newIndirect( + context, + QPDFObjGen( + olist.at(olist.size() - 2).getIntValueAsInt(), + olist.at(olist.size() - 1).getIntValueAsInt())); + olist.remove_last(); + olist.remove_last(); + } else if ((value == "endobj") && (state == st_top)) { + // We just saw endobj without having read + // anything. Treat this as a null and do not move + // the input source's offset. + object = QPDFObjectHandle::newNull(); + input->seek(input->getLastOffset(), SEEK_SET); + empty = true; + } else { + QTC::TC("qpdf", "QPDFParser treat word as string"); + warn( + context, + QPDFExc( + qpdf_e_damaged_pdf, + input->getName(), + object_description, + input->getLastOffset(), + "unknown token while reading object;" + " treating as string")); + bad = true; + object = QPDFObjectHandle::newString(value); + } + } + break; + + case QPDFTokenizer::tt_string: + { + std::string val = token.getValue(); + if (decrypter) { + if (b_contents) { + contents_string = val; + contents_offset = input->getLastOffset(); + b_contents = false; + } + decrypter->decryptString(val); + } + object = QPDFObjectHandle::newString(val); + } + + break; + + default: + warn( + context, + QPDFExc( + qpdf_e_damaged_pdf, + input->getName(), + object_description, + input->getLastOffset(), + "treating unknown token type as null while " + "reading object")); + bad = true; + object = QPDFObjectHandle::newNull(); + break; + } + + if ((!object.isInitialized()) && + (!((state == st_start) || (state == st_stop) || + (state == st_eof)))) { + throw std::logic_error("QPDFObjectHandle::parseInternal: " + "unexpected uninitialized object"); + object = QPDFObjectHandle::newNull(); + } + + if (bad) { + ++bad_count; + good_count = 0; + } else { + ++good_count; + if (good_count > 3) { + bad_count = 0; + } + } + if (bad_count > 5) { + // We had too many consecutive errors without enough + // intervening successful objects. Give up. + warn( + context, + QPDFExc( + qpdf_e_damaged_pdf, + input->getName(), + object_description, + input->getLastOffset(), + "too many errors; giving up on reading object")); + state = st_top; + object = QPDFObjectHandle::newNull(); + } + + switch (state) { + case st_eof: + if (state_stack.size() > 1) { + warn( + context, + QPDFExc( + qpdf_e_damaged_pdf, + input->getName(), + object_description, + input->getLastOffset(), + "parse error while reading object")); + } + done = true; + // In content stream mode, leave object uninitialized to + // indicate EOF + if (!content_stream) { + object = QPDFObjectHandle::newNull(); + } + break; + + case st_dictionary: + case st_array: + QPDFObjectHandle::setObjectDescriptionFromInput( + object, + context, + object_description, + input, + input->getLastOffset()); + object.setParsedOffset(input->getLastOffset()); + set_offset = true; + olist.append(object); + break; + + case st_top: + done = true; + break; + + case st_start: + break; + + case st_stop: + if ((state_stack.size() < 2) || (olist_stack.size() < 2)) { + throw std::logic_error( + "QPDFObjectHandle::parseInternal: st_stop encountered" + " with insufficient elements in stack"); + } + parser_state_e old_state = state_stack.back(); + state_stack.pop_back(); + if (old_state == st_array) { + // There's no newArray(SparseOHArray) since + // SparseOHArray is not part of the public API. + object = QPDFObjectHandle(QPDF_Array::create(olist)); + QPDFObjectHandle::setObjectDescriptionFromInput( + object, context, object_description, input, offset); + // The `offset` points to the next of "[". Set the + // rewind offset to point to the beginning of "[". + // This has been explicitly tested with whitespace + // surrounding the array start delimiter. + // getLastOffset points to the array end token and + // therefore can't be used here. + object.setParsedOffset(offset - 1); + set_offset = true; + } else if (old_state == st_dictionary) { + // Convert list to map. Alternating elements are keys. + // Attempt to recover more or less gracefully from + // invalid dictionaries. + std::set names; + size_t n_elements = olist.size(); + for (size_t i = 0; i < n_elements; ++i) { + QPDFObjectHandle oh = olist.at(i); + if ((!oh.isIndirect()) && oh.isName()) { + names.insert(oh.getName()); + } + } + + std::map dict; + int next_fake_key = 1; + for (unsigned int i = 0; i < olist.size(); ++i) { + QPDFObjectHandle key_obj = olist.at(i); + QPDFObjectHandle val; + if (key_obj.isIndirect() || (!key_obj.isName())) { + bool found_fake = false; + std::string candidate; + while (!found_fake) { + candidate = "/QPDFFake" + + QUtil::int_to_string(next_fake_key++); + found_fake = (names.count(candidate) == 0); + QTC::TC( + "qpdf", + "QPDFParser found fake", + (found_fake ? 0 : 1)); + } + warn( + context, + QPDFExc( + qpdf_e_damaged_pdf, + input->getName(), + object_description, + offset, + "expected dictionary key but found" + " non-name object; inserting key " + + candidate)); + val = key_obj; + key_obj = QPDFObjectHandle::newName(candidate); + } else if (i + 1 >= olist.size()) { + QTC::TC("qpdf", "QPDFParser no val for last key"); + warn( + context, + QPDFExc( + qpdf_e_damaged_pdf, + input->getName(), + object_description, + offset, + "dictionary ended prematurely; " + "using null as value for last key")); + val = QPDFObjectHandle::newNull(); + QPDFObjectHandle::setObjectDescriptionFromInput( + val, context, object_description, input, offset); + } else { + val = olist.at(++i); + } + std::string key = key_obj.getName(); + if (dict.count(key) > 0) { + QTC::TC("qpdf", "QPDFParser duplicate dict key"); + warn( + context, + QPDFExc( + qpdf_e_damaged_pdf, + input->getName(), + object_description, + offset, + "dictionary has duplicated key " + key + + "; last occurrence overrides earlier " + "ones")); + } + dict[key] = val; + } + if (!contents_string.empty() && dict.count("/Type") && + dict["/Type"].isNameAndEquals("/Sig") && + dict.count("/ByteRange") && dict.count("/Contents") && + dict["/Contents"].isString()) { + dict["/Contents"] = + QPDFObjectHandle::newString(contents_string); + dict["/Contents"].setParsedOffset(contents_offset); + } + object = QPDFObjectHandle::newDictionary(dict); + QPDFObjectHandle::setObjectDescriptionFromInput( + object, context, object_description, input, offset); + // The `offset` points to the next of "<<". Set the + // rewind offset to point to the beginning of "<<". + // This has been explicitly tested with whitespace + // surrounding the dictionary start delimiter. + // getLastOffset points to the dictionary end token + // and therefore can't be used here. + object.setParsedOffset(offset - 2); + set_offset = true; + } + olist_stack.pop_back(); + offset_stack.pop_back(); + if (state_stack.back() == st_top) { + done = true; + } else { + olist_stack.back().append(object); + } + contents_string_stack.pop_back(); + contents_offset_stack.pop_back(); + } + } + + if (!set_offset) { + QPDFObjectHandle::setObjectDescriptionFromInput( + object, context, object_description, input, offset); + object.setParsedOffset(offset); + } + return object; +} + +void +QPDFParser::warn(QPDF* qpdf, QPDFExc const& e) +{ + // If parsing on behalf of a QPDF object and want to give a + // warning, we can warn through the object. If parsing for some + // other reason, such as an explicit creation of an object from a + // string, then just throw the exception. + if (qpdf) { + qpdf->warn(e); + } else { + throw e; + } +} diff --git a/libqpdf/qpdf/QPDFParser.hh b/libqpdf/qpdf/QPDFParser.hh new file mode 100644 index 00000000..e929c3f2 --- /dev/null +++ b/libqpdf/qpdf/QPDFParser.hh @@ -0,0 +1,50 @@ +#ifndef QPDFPARSER_HH +#define QPDFPARSER_HH + +#include + +#include +#include + +class QPDFParser +{ + public: + QPDFParser() = delete; + QPDFParser( + std::shared_ptr input, + std::string const& object_description, + QPDFTokenizer& tokenizer, + QPDFObjectHandle::StringDecrypter* decrypter, + QPDF* context) : + input(input), + object_description(object_description), + tokenizer(tokenizer), + decrypter(decrypter), + context(context) + { + } + virtual ~QPDFParser() = default; + + QPDFObjectHandle parse(bool& empty, bool content_stream); + + private: + enum parser_state_e { + st_top, + st_start, + st_stop, + st_eof, + st_dictionary, + st_array + }; + + static void warn(QPDF*, QPDFExc const&); + void setParsedOffset(qpdf_offset_t offset); + + std::shared_ptr input; + std::string const& object_description; + QPDFTokenizer& tokenizer; + QPDFObjectHandle::StringDecrypter* decrypter; + QPDF* context; +}; + +#endif // QPDFPARSER_HH diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov index 9e106902..f13385e4 100644 --- a/qpdf/qpdf.testcov +++ b/qpdf/qpdf.testcov @@ -56,12 +56,12 @@ QPDF missing trailer 0 QPDF trailer lacks size 0 QPDF trailer size not integer 0 QPDF trailer prev not integer 0 -QPDFObjectHandle bad brace 0 -QPDFObjectHandle bad array close 0 +QPDFParser bad brace 0 +QPDFParser bad array close 0 QPDF stream without length 0 QPDF stream length not integer 0 QPDF missing endstream 0 -QPDFObjectHandle bad dictionary close 0 +QPDFParser bad dictionary close 0 QPDF can't find xref 0 QPDFTokenizer bad ) 0 QPDFTokenizer bad > 0 @@ -215,7 +215,7 @@ QPDF not copying pages object 0 QPDF insert foreign page 0 QPDFWriter foreign object 0 QPDFWriter copy use_aes 1 -QPDFObjectHandle indirect without context 0 +QPDFParser indirect without context 0 QPDFObjectHandle trailing data in parse 0 QPDFJob pages encryption password 0 QPDFTokenizer EOF reading token 0 @@ -257,9 +257,9 @@ qpdf-c called qpdf_set_deterministic_ID 0 QPDFObjectHandle indirect with 0 objid 0 QPDF object id 0 0 QPDF recursion loop in resolve 0 -QPDFObjectHandle treat word as string 0 -QPDFObjectHandle found fake 1 -QPDFObjectHandle no val for last key 0 +QPDFParser treat word as string 0 +QPDFParser found fake 1 +QPDFParser no val for last key 0 QPDF resolve failure to null 0 QPDFWriter preserve unreferenced standard 0 QPDFObjectHandle errors in parsecontent 0 @@ -288,8 +288,8 @@ QPDFObjectHandle non-stream in stream array 0 QPDFObjectHandle coalesce called on stream 0 QPDFObjectHandle coalesce provide stream data 0 QPDF_Stream bad token at end during normalize 0 -QPDFObjectHandle bad token in parse 0 -QPDFObjectHandle eof in parseInternal 0 +QPDFParser bad token in parse 0 +QPDFParser eof in parse 0 QPDFObjectHandle array bounds 0 QPDFObjectHandle boolean returning false 0 QPDFObjectHandle integer returning 0 0 @@ -317,7 +317,7 @@ QPDFObjectHandle numeric non-numeric 0 QPDFObjectHandle erase array bounds 0 qpdf-c called qpdf_check_pdf 0 QPDF xref loop 0 -QPDFObjectHandle too deep 0 +QPDFParser too deep 0 QPDFFormFieldObjectHelper non-trivial inheritance 0 QPDFFormFieldObjectHelper non-trivial qualified name 0 QPDFFormFieldObjectHelper TU present 0 @@ -428,7 +428,7 @@ QPDF eof skipping spaces before xref 1 QPDF_encryption user matches owner V < 5 0 QPDF_encryption same password 1 QPDFWriter stream in ostream 0 -QPDFObjectHandle duplicate dict key 0 +QPDFParser duplicate dict key 0 QPDFWriter no encryption sig contents 0 QPDFPageObjectHelper colorspace lookup 0 QPDFWriter ignore XRef in qdf mode 0