From 29cd8f4f53dcad517ecba37978d6073daf43d819 Mon Sep 17 00:00:00 2001 From: m-holger Date: Fri, 27 Oct 2023 14:07:01 +0100 Subject: [PATCH 01/15] Avoid unnecessary string copies in QPDFParser::parse Fixes #864. QPDFTokenizer::getValue originally had a std::string_view return type, which was changed to std::string without removing some unnecessary string creation. --- libqpdf/QPDFParser.cc | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/libqpdf/QPDFParser.cc b/libqpdf/QPDFParser.cc index 6dcbddb5..2716c34d 100644 --- a/libqpdf/QPDFParser.cc +++ b/libqpdf/QPDFParser.cc @@ -156,8 +156,7 @@ QPDFParser::parse(bool& empty, bool content_stream) break; case QPDFTokenizer::tt_integer: - object = QPDF_Integer::create( - QUtil::string_to_ll(std::string(tokenizer.getValue()).c_str())); + object = QPDF_Integer::create(QUtil::string_to_ll(tokenizer.getValue().c_str())); break; case QPDFTokenizer::tt_real: @@ -166,7 +165,7 @@ QPDFParser::parse(bool& empty, bool content_stream) case QPDFTokenizer::tt_name: { - auto name = tokenizer.getValue(); + auto const& name = tokenizer.getValue(); object = QPDF_Name::create(name); if (name == "/Contents") { @@ -179,7 +178,7 @@ QPDFParser::parse(bool& empty, bool content_stream) case QPDFTokenizer::tt_word: { - auto value = tokenizer.getValue(); + auto const& value = tokenizer.getValue(); auto size = olist.size(); if (content_stream) { object = QPDF_Operator::create(value); @@ -226,7 +225,7 @@ QPDFParser::parse(bool& empty, bool content_stream) case QPDFTokenizer::tt_string: { - auto val = tokenizer.getValue(); + auto const& val = tokenizer.getValue(); if (decrypter) { if (b_contents) { frame.contents_string = val; From 37f7a734885f0d3c9dce64fbdb9a57192170686b Mon Sep 17 00:00:00 2001 From: m-holger Date: Fri, 27 Oct 2023 14:40:30 +0100 Subject: [PATCH 02/15] In QPDFParser::parse refactor handling of bad tokens --- libqpdf/QPDFParser.cc | 66 +++++++++++++++++++++----------------- libqpdf/qpdf/QPDFParser.hh | 5 +++ 2 files changed, 42 insertions(+), 29 deletions(-) diff --git a/libqpdf/QPDFParser.cc b/libqpdf/QPDFParser.cc index 2716c34d..18b60d53 100644 --- a/libqpdf/QPDFParser.cc +++ b/libqpdf/QPDFParser.cc @@ -60,13 +60,10 @@ QPDFParser::parse(bool& empty, bool content_stream) state_stack.push_back(st_top); qpdf_offset_t offset; bool done = false; - int bad_count = 0; - int good_count = 0; bool b_contents = false; bool is_null = false; while (!done) { - bool bad = false; bool indirect_ref = false; is_null = false; auto& frame = stack.back(); @@ -80,6 +77,7 @@ QPDFParser::parse(bool& empty, bool content_stream) if (!tokenizer.nextToken(*input, object_description)) { warn(tokenizer.getErrorMessage()); } + ++good_count; // optimistically switch (tokenizer.getType()) { case QPDFTokenizer::tt_eof: @@ -87,13 +85,14 @@ QPDFParser::parse(bool& empty, bool content_stream) QTC::TC("qpdf", "QPDFParser eof in parse"); warn("unexpected EOF"); } - bad = true; state = st_eof; break; case QPDFTokenizer::tt_bad: QTC::TC("qpdf", "QPDFParser bad token in parse"); - bad = true; + if (tooManyBadTokens()) { + return {QPDF_Null::create()}; + } is_null = true; break; @@ -101,7 +100,9 @@ QPDFParser::parse(bool& empty, bool content_stream) case QPDFTokenizer::tt_brace_close: QTC::TC("qpdf", "QPDFParser bad brace"); warn("treating unexpected brace token as null"); - bad = true; + if (tooManyBadTokens()) { + return {QPDF_Null::create()}; + } is_null = true; break; @@ -111,7 +112,9 @@ QPDFParser::parse(bool& empty, bool content_stream) } else { QTC::TC("qpdf", "QPDFParser bad array close"); warn("treating unexpected array close token as null"); - bad = true; + if (tooManyBadTokens()) { + return {QPDF_Null::create()}; + } is_null = true; } break; @@ -122,7 +125,9 @@ QPDFParser::parse(bool& empty, bool content_stream) } else { QTC::TC("qpdf", "QPDFParser bad dictionary close"); warn("unexpected dictionary close token"); - bad = true; + if (tooManyBadTokens()) { + return {QPDF_Null::create()}; + } is_null = true; } break; @@ -132,7 +137,9 @@ QPDFParser::parse(bool& empty, bool content_stream) if (stack.size() > 500) { QTC::TC("qpdf", "QPDFParser too deep"); warn("ignoring excessively deeply nested data structure"); - bad = true; + if (tooManyBadTokens()) { + return {QPDF_Null::create()}; + } is_null = true; state = st_top; } else { @@ -217,7 +224,9 @@ QPDFParser::parse(bool& empty, bool content_stream) } else { QTC::TC("qpdf", "QPDFParser treat word as string"); warn("unknown token while reading object; treating as string"); - bad = true; + if (tooManyBadTokens()) { + return {QPDF_Null::create()}; + } object = QPDF_String::create(value); } } @@ -239,12 +248,13 @@ QPDFParser::parse(bool& empty, bool content_stream) object = QPDF_String::create(val); } } - break; default: warn("treating unknown token type as null while reading object"); - bad = true; + if (tooManyBadTokens()) { + return {QPDF_Null::create()}; + } is_null = true; break; } @@ -255,23 +265,6 @@ QPDFParser::parse(bool& empty, bool content_stream) is_null = true; } - if (bad) { - ++bad_count; - good_count = 0; - } else { - ++good_count; - if (good_count > 3) { - bad_count = 0; - } - } - if (bad_count > 5) { - // We had too many consecutive errors without enough intervening successful objects. - // Give up. - warn("too many errors; giving up on reading object"); - state = st_top; - is_null = true; - } - switch (state) { case st_eof: if (state_stack.size() > 1) { @@ -412,6 +405,21 @@ QPDFParser::setDescription(std::shared_ptr& obj, qpdf_offset_t parse } } +bool +QPDFParser::tooManyBadTokens() +{ + if (good_count <= 4) { + if (++bad_count > 5) { + warn("too many errors; giving up on reading object"); + return true; + } + } else { + bad_count = 1; + } + good_count = 0; + return false; +} + void QPDFParser::warn(QPDFExc const& e) const { diff --git a/libqpdf/qpdf/QPDFParser.hh b/libqpdf/qpdf/QPDFParser.hh index 35f9f603..2107a9b8 100644 --- a/libqpdf/qpdf/QPDFParser.hh +++ b/libqpdf/qpdf/QPDFParser.hh @@ -33,6 +33,7 @@ class QPDFParser private: enum parser_state_e { st_top, st_start, st_stop, st_eof, st_dictionary, st_array }; + bool tooManyBadTokens(); void warn(qpdf_offset_t offset, std::string const& msg) const; void warn(std::string const& msg) const; void warn(QPDFExc const&) const; @@ -43,6 +44,10 @@ class QPDFParser QPDFObjectHandle::StringDecrypter* decrypter; QPDF* context; std::shared_ptr description; + // Number of recent bad tokens. + int bad_count = 0; + // Number of good tokens since last bad token. Irrelevant if bad_count == 0. + int good_count = 0; }; #endif // QPDFPARSER_HH From 26e0bf461041856ea8159c86556524e4b158efe7 Mon Sep 17 00:00:00 2001 From: m-holger Date: Fri, 27 Oct 2023 15:37:46 +0100 Subject: [PATCH 03/15] In QPDFParser::parse refactor eof handling --- libqpdf/QPDFParser.cc | 29 ++++++++++------------------- libqpdf/qpdf/QPDFParser.hh | 2 +- qpdf/qtest/qpdf/bad16-recover.out | 4 ++-- qpdf/qtest/qpdf/bad16.out | 2 +- 4 files changed, 14 insertions(+), 23 deletions(-) diff --git a/libqpdf/QPDFParser.cc b/libqpdf/QPDFParser.cc index 18b60d53..2a7598c9 100644 --- a/libqpdf/QPDFParser.cc +++ b/libqpdf/QPDFParser.cc @@ -81,12 +81,16 @@ QPDFParser::parse(bool& empty, bool content_stream) switch (tokenizer.getType()) { case QPDFTokenizer::tt_eof: - if (!content_stream) { - QTC::TC("qpdf", "QPDFParser eof in parse"); - warn("unexpected EOF"); + if (state_stack.size() > 1) { + warn("parse error while reading object"); } - state = st_eof; - break; + if (content_stream) { + // In content stream mode, leave object uninitialized to indicate EOF + return {}; + } + QTC::TC("qpdf", "QPDFParser eof in parse"); + warn("unexpected EOF"); + return {QPDF_Null::create()}; case QPDFTokenizer::tt_bad: QTC::TC("qpdf", "QPDFParser bad token in parse"); @@ -259,24 +263,11 @@ QPDFParser::parse(bool& empty, bool content_stream) break; } - if (object == nullptr && !is_null && - (!((state == st_start) || (state == st_stop) || (state == st_eof)))) { + if (object == nullptr && !is_null && (!(state == st_start || state == st_stop))) { throw std::logic_error("QPDFParser:parseInternal: unexpected uninitialized object"); - is_null = true; } switch (state) { - case st_eof: - if (state_stack.size() > 1) { - warn("parse error while reading object"); - } - done = true; - // In content stream mode, leave object uninitialized to indicate EOF - if (!content_stream) { - is_null = true; - } - break; - case st_dictionary: case st_array: if (is_null) { diff --git a/libqpdf/qpdf/QPDFParser.hh b/libqpdf/qpdf/QPDFParser.hh index 2107a9b8..5697e7d8 100644 --- a/libqpdf/qpdf/QPDFParser.hh +++ b/libqpdf/qpdf/QPDFParser.hh @@ -31,7 +31,7 @@ class QPDFParser QPDFObjectHandle parse(bool& empty, bool content_stream); private: - enum parser_state_e { st_top, st_start, st_stop, st_eof, st_dictionary, st_array }; + enum parser_state_e { st_top, st_start, st_stop, st_dictionary, st_array }; bool tooManyBadTokens(); void warn(qpdf_offset_t offset, std::string const& msg) const; diff --git a/qpdf/qtest/qpdf/bad16-recover.out b/qpdf/qtest/qpdf/bad16-recover.out index adddb4f7..0bedd64d 100644 --- a/qpdf/qtest/qpdf/bad16-recover.out +++ b/qpdf/qtest/qpdf/bad16-recover.out @@ -1,14 +1,14 @@ WARNING: bad16.pdf (trailer, offset 753): unexpected dictionary close token WARNING: bad16.pdf (trailer, offset 756): unexpected dictionary close token WARNING: bad16.pdf (trailer, offset 759): unknown token while reading object; treating as string -WARNING: bad16.pdf (trailer, offset 779): unexpected EOF WARNING: bad16.pdf (trailer, offset 779): parse error while reading object +WARNING: bad16.pdf (trailer, offset 779): unexpected EOF WARNING: bad16.pdf: file is damaged WARNING: bad16.pdf (offset 712): expected trailer dictionary WARNING: bad16.pdf: Attempting to reconstruct cross-reference table WARNING: bad16.pdf (trailer, offset 753): unexpected dictionary close token WARNING: bad16.pdf (trailer, offset 756): unexpected dictionary close token WARNING: bad16.pdf (trailer, offset 759): unknown token while reading object; treating as string -WARNING: bad16.pdf (trailer, offset 779): unexpected EOF WARNING: bad16.pdf (trailer, offset 779): parse error while reading object +WARNING: bad16.pdf (trailer, offset 779): unexpected EOF bad16.pdf: unable to find trailer dictionary while recovering damaged file diff --git a/qpdf/qtest/qpdf/bad16.out b/qpdf/qtest/qpdf/bad16.out index bcc37f35..ffba090a 100644 --- a/qpdf/qtest/qpdf/bad16.out +++ b/qpdf/qtest/qpdf/bad16.out @@ -1,6 +1,6 @@ WARNING: bad16.pdf (trailer, offset 753): unexpected dictionary close token WARNING: bad16.pdf (trailer, offset 756): unexpected dictionary close token WARNING: bad16.pdf (trailer, offset 759): unknown token while reading object; treating as string -WARNING: bad16.pdf (trailer, offset 779): unexpected EOF WARNING: bad16.pdf (trailer, offset 779): parse error while reading object +WARNING: bad16.pdf (trailer, offset 779): unexpected EOF bad16.pdf (offset 712): expected trailer dictionary From 12837f14b6c793313778ca13ca3f8e615d41117b Mon Sep 17 00:00:00 2001 From: m-holger Date: Fri, 27 Oct 2023 17:20:54 +0100 Subject: [PATCH 04/15] In QPDFParser::parse refactor handling of array_close tokens --- libqpdf/QPDFParser.cc | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/libqpdf/QPDFParser.cc b/libqpdf/QPDFParser.cc index 2a7598c9..2a77c731 100644 --- a/libqpdf/QPDFParser.cc +++ b/libqpdf/QPDFParser.cc @@ -112,7 +112,20 @@ QPDFParser::parse(bool& empty, bool content_stream) case QPDFTokenizer::tt_array_close: if (state == st_array) { - state = st_stop; + if ((state_stack.size() < 2) || (stack.size() < 2)) { + throw std::logic_error("QPDFParser::parseInternal: st_stop encountered with " + "insufficient elements in stack"); + } + object = QPDF_Array::create(std::move(olist), frame.null_count > 100); + setDescription(object, offset - 1); + // The `offset` points to the next of "[". Set the rewind offset to point to the + // beginning of "[". This has been explicitly tested with whitespace surrounding the + // array start delimiter. getLastOffset points to the array end token and therefore + // can't be used here. + set_offset = true; + state_stack.pop_back(); + state = state_stack.back(); + stack.pop_back(); } else { QTC::TC("qpdf", "QPDFParser bad array close"); warn("treating unexpected array close token as null"); @@ -273,11 +286,11 @@ QPDFParser::parse(bool& empty, bool content_stream) if (is_null) { object = null_oh; // No need to set description for direct nulls - they probably will become implicit. - } else if (!indirect_ref) { + } else if (!indirect_ref && !set_offset) { setDescription(object, input->getLastOffset()); } set_offset = true; - olist.push_back(object); + stack.back().olist.push_back(object); break; case st_top: @@ -294,15 +307,7 @@ QPDFParser::parse(bool& empty, bool content_stream) } parser_state_e old_state = state_stack.back(); state_stack.pop_back(); - if (old_state == st_array) { - object = QPDF_Array::create(std::move(olist), frame.null_count > 100); - setDescription(object, offset - 1); - // The `offset` points to the next of "[". Set the rewind offset to point to the - // beginning of "[". This has been explicitly tested with whitespace surrounding the - // array start delimiter. getLastOffset points to the array end token and therefore - // can't be used here. - set_offset = true; - } else if (old_state == st_dictionary) { + if (old_state == st_dictionary) { // Convert list to map. Alternating elements are keys. Attempt to recover more or // less gracefully from invalid dictionaries. std::set names; From 90829228b814c6fe3ea3192da34db90dc1e36843 Mon Sep 17 00:00:00 2001 From: m-holger Date: Fri, 27 Oct 2023 17:41:36 +0100 Subject: [PATCH 05/15] In QPDFParser::parse refactor handling of dict_close tokens --- libqpdf/QPDFParser.cc | 159 +++++++++++++++++-------------------- libqpdf/qpdf/QPDFParser.hh | 2 +- 2 files changed, 76 insertions(+), 85 deletions(-) diff --git a/libqpdf/QPDFParser.cc b/libqpdf/QPDFParser.cc index 2a77c731..5bff6af3 100644 --- a/libqpdf/QPDFParser.cc +++ b/libqpdf/QPDFParser.cc @@ -138,7 +138,80 @@ QPDFParser::parse(bool& empty, bool content_stream) case QPDFTokenizer::tt_dict_close: if (state == st_dictionary) { - state = st_stop; + if ((state_stack.size() < 2) || (stack.size() < 2)) { + throw std::logic_error("QPDFParser::parseInternal: st_stop encountered with " + "insufficient elements in stack"); + } + + // Convert list to map. Alternating elements are keys. Attempt to recover more or + // less gracefully from invalid dictionaries. + std::set names; + for (auto& obj: olist) { + if (obj) { + if (obj->getTypeCode() == ::ot_name) { + names.insert(obj->getStringValue()); + } + } + } + + std::map dict; + int next_fake_key = 1; + for (auto iter = olist.begin(); iter != olist.end();) { + // Calculate key. + std::string key; + if (*iter && (*iter)->getTypeCode() == ::ot_name) { + key = (*iter)->getStringValue(); + ++iter; + } else { + for (bool found_fake = false; !found_fake;) { + key = "/QPDFFake" + std::to_string(next_fake_key++); + found_fake = (names.count(key) == 0); + QTC::TC("qpdf", "QPDFParser found fake", (found_fake ? 0 : 1)); + } + warn( + offset, + "expected dictionary key but found non-name object; inserting key " + + key); + } + if (dict.count(key) > 0) { + QTC::TC("qpdf", "QPDFParser duplicate dict key"); + warn( + offset, + "dictionary has duplicated key " + key + + "; last occurrence overrides earlier ones"); + } + + // Calculate value. + std::shared_ptr val; + if (iter != olist.end()) { + val = *iter; + ++iter; + } else { + QTC::TC("qpdf", "QPDFParser no val for last key"); + warn( + offset, + "dictionary ended prematurely; using null as value for last key"); + val = QPDF_Null::create(); + } + + dict[std::move(key)] = std::move(val); + } + if (!frame.contents_string.empty() && dict.count("/Type") && + dict["/Type"].isNameAndEquals("/Sig") && dict.count("/ByteRange") && + dict.count("/Contents") && dict["/Contents"].isString()) { + dict["/Contents"] = QPDFObjectHandle::newString(frame.contents_string); + dict["/Contents"].setParsedOffset(frame.contents_offset); + } + object = QPDF_Dictionary::create(std::move(dict)); + setDescription(object, offset - 2); + // The `offset` points to the next of "<<". Set the rewind offset to point to the + // beginning of "<<". This has been explicitly tested with whitespace surrounding + // the dictionary start delimiter. getLastOffset points to the dictionary end token + // and therefore can't be used here. + set_offset = true; + state_stack.pop_back(); + state = state_stack.back(); + stack.pop_back(); } else { QTC::TC("qpdf", "QPDFParser bad dictionary close"); warn("unexpected dictionary close token"); @@ -276,7 +349,7 @@ QPDFParser::parse(bool& empty, bool content_stream) break; } - if (object == nullptr && !is_null && (!(state == st_start || state == st_stop))) { + if (object == nullptr && !is_null && state != st_start) { throw std::logic_error("QPDFParser:parseInternal: unexpected uninitialized object"); } @@ -299,88 +372,6 @@ QPDFParser::parse(bool& empty, bool content_stream) case st_start: break; - - case st_stop: - if ((state_stack.size() < 2) || (stack.size() < 2)) { - throw std::logic_error("QPDFParser::parseInternal: st_stop encountered with " - "insufficient elements in stack"); - } - parser_state_e old_state = state_stack.back(); - state_stack.pop_back(); - if (old_state == st_dictionary) { - // Convert list to map. Alternating elements are keys. Attempt to recover more or - // less gracefully from invalid dictionaries. - std::set names; - for (auto& obj: olist) { - if (obj) { - if (obj->getTypeCode() == ::ot_name) { - names.insert(obj->getStringValue()); - } - } - } - - std::map dict; - int next_fake_key = 1; - for (auto iter = olist.begin(); iter != olist.end();) { - // Calculate key. - std::string key; - if (*iter && (*iter)->getTypeCode() == ::ot_name) { - key = (*iter)->getStringValue(); - ++iter; - } else { - for (bool found_fake = false; !found_fake;) { - key = "/QPDFFake" + std::to_string(next_fake_key++); - found_fake = (names.count(key) == 0); - QTC::TC("qpdf", "QPDFParser found fake", (found_fake ? 0 : 1)); - } - warn( - offset, - "expected dictionary key but found non-name object; inserting key " + - key); - } - if (dict.count(key) > 0) { - QTC::TC("qpdf", "QPDFParser duplicate dict key"); - warn( - offset, - "dictionary has duplicated key " + key + - "; last occurrence overrides earlier ones"); - } - - // Calculate value. - std::shared_ptr val; - if (iter != olist.end()) { - val = *iter; - ++iter; - } else { - QTC::TC("qpdf", "QPDFParser no val for last key"); - warn( - offset, - "dictionary ended prematurely; using null as value for last key"); - val = QPDF_Null::create(); - } - - dict[std::move(key)] = std::move(val); - } - if (!frame.contents_string.empty() && dict.count("/Type") && - dict["/Type"].isNameAndEquals("/Sig") && dict.count("/ByteRange") && - dict.count("/Contents") && dict["/Contents"].isString()) { - dict["/Contents"] = QPDFObjectHandle::newString(frame.contents_string); - dict["/Contents"].setParsedOffset(frame.contents_offset); - } - object = QPDF_Dictionary::create(std::move(dict)); - setDescription(object, offset - 2); - // The `offset` points to the next of "<<". Set the rewind offset to point to the - // beginning of "<<". This has been explicitly tested with whitespace surrounding - // the dictionary start delimiter. getLastOffset points to the dictionary end token - // and therefore can't be used here. - set_offset = true; - } - stack.pop_back(); - if (state_stack.back() == st_top) { - done = true; - } else { - stack.back().olist.push_back(object); - } } } diff --git a/libqpdf/qpdf/QPDFParser.hh b/libqpdf/qpdf/QPDFParser.hh index 5697e7d8..48bd594e 100644 --- a/libqpdf/qpdf/QPDFParser.hh +++ b/libqpdf/qpdf/QPDFParser.hh @@ -31,7 +31,7 @@ class QPDFParser QPDFObjectHandle parse(bool& empty, bool content_stream); private: - enum parser_state_e { st_top, st_start, st_stop, st_dictionary, st_array }; + enum parser_state_e { st_top, st_start, st_dictionary, st_array }; bool tooManyBadTokens(); void warn(qpdf_offset_t offset, std::string const& msg) const; From d904eab84c008325289bfb7ddcf60256b36a1b67 Mon Sep 17 00:00:00 2001 From: m-holger Date: Fri, 27 Oct 2023 17:55:09 +0100 Subject: [PATCH 06/15] In QPDFParser::parse refactor handling of array_open and dict_open tokens --- libqpdf/QPDFParser.cc | 14 +++----------- libqpdf/qpdf/QPDFParser.hh | 2 +- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/libqpdf/QPDFParser.cc b/libqpdf/QPDFParser.cc index 5bff6af3..12a53851 100644 --- a/libqpdf/QPDFParser.cc +++ b/libqpdf/QPDFParser.cc @@ -227,20 +227,15 @@ QPDFParser::parse(bool& empty, bool content_stream) if (stack.size() > 500) { QTC::TC("qpdf", "QPDFParser too deep"); warn("ignoring excessively deeply nested data structure"); - if (tooManyBadTokens()) { - return {QPDF_Null::create()}; - } - is_null = true; - state = st_top; + return {QPDF_Null::create()}; } else { - state = st_start; state_stack.push_back( (tokenizer.getType() == QPDFTokenizer::tt_array_open) ? st_array : st_dictionary); b_contents = false; stack.emplace_back(input); + continue; } - break; case QPDFTokenizer::tt_bool: object = QPDF_Bool::create((tokenizer.getValue() == "true")); @@ -349,7 +344,7 @@ QPDFParser::parse(bool& empty, bool content_stream) break; } - if (object == nullptr && !is_null && state != st_start) { + if (object == nullptr && !is_null) { throw std::logic_error("QPDFParser:parseInternal: unexpected uninitialized object"); } @@ -369,9 +364,6 @@ QPDFParser::parse(bool& empty, bool content_stream) case st_top: done = true; break; - - case st_start: - break; } } diff --git a/libqpdf/qpdf/QPDFParser.hh b/libqpdf/qpdf/QPDFParser.hh index 48bd594e..b9274824 100644 --- a/libqpdf/qpdf/QPDFParser.hh +++ b/libqpdf/qpdf/QPDFParser.hh @@ -31,7 +31,7 @@ class QPDFParser QPDFObjectHandle parse(bool& empty, bool content_stream); private: - enum parser_state_e { st_top, st_start, st_dictionary, st_array }; + enum parser_state_e { st_top, st_dictionary, st_array }; bool tooManyBadTokens(); void warn(qpdf_offset_t offset, std::string const& msg) const; From db6ab9cbfabe9be32b7386ac92dbc2a3fabd83e5 Mon Sep 17 00:00:00 2001 From: m-holger Date: Sat, 28 Oct 2023 00:46:29 +0100 Subject: [PATCH 07/15] In QPDFParser::parse merge state and object stacks --- libqpdf/QPDFParser.cc | 109 ++++++++++++++----------------------- libqpdf/qpdf/QPDFParser.hh | 17 ++++++ 2 files changed, 59 insertions(+), 67 deletions(-) diff --git a/libqpdf/QPDFParser.cc b/libqpdf/QPDFParser.cc index 12a53851..125fe762 100644 --- a/libqpdf/QPDFParser.cc +++ b/libqpdf/QPDFParser.cc @@ -21,22 +21,6 @@ #include -namespace -{ - struct StackFrame - { - StackFrame(std::shared_ptr input) : - offset(input->tell()) - { - } - - std::vector> olist; - qpdf_offset_t offset; - std::string contents_string{""}; - qpdf_offset_t contents_offset{-1}; - int null_count{0}; - }; -} // namespace QPDFObjectHandle QPDFParser::parse(bool& empty, bool content_stream) @@ -54,23 +38,15 @@ QPDFParser::parse(bool& empty, bool content_stream) std::shared_ptr object; bool set_offset = false; - std::vector stack; - stack.emplace_back(input); - std::vector state_stack; - state_stack.push_back(st_top); - qpdf_offset_t offset; + std::vector stack{{input, st_top}}; bool done = false; bool b_contents = false; bool is_null = false; + auto* frame = &stack.back(); while (!done) { bool indirect_ref = false; is_null = false; - auto& frame = stack.back(); - auto& olist = frame.olist; - parser_state_e state = state_stack.back(); - offset = frame.offset; - object = nullptr; set_offset = false; @@ -81,7 +57,7 @@ QPDFParser::parse(bool& empty, bool content_stream) switch (tokenizer.getType()) { case QPDFTokenizer::tt_eof: - if (state_stack.size() > 1) { + if (stack.size() > 1) { warn("parse error while reading object"); } if (content_stream) { @@ -111,21 +87,20 @@ QPDFParser::parse(bool& empty, bool content_stream) break; case QPDFTokenizer::tt_array_close: - if (state == st_array) { - if ((state_stack.size() < 2) || (stack.size() < 2)) { + if (frame->state == st_array) { + if (stack.size() < 2) { throw std::logic_error("QPDFParser::parseInternal: st_stop encountered with " "insufficient elements in stack"); } - object = QPDF_Array::create(std::move(olist), frame.null_count > 100); - setDescription(object, offset - 1); + object = QPDF_Array::create(std::move(frame->olist), frame->null_count > 100); + setDescription(object, frame->offset - 1); // The `offset` points to the next of "[". Set the rewind offset to point to the // beginning of "[". This has been explicitly tested with whitespace surrounding the // array start delimiter. getLastOffset points to the array end token and therefore // can't be used here. set_offset = true; - state_stack.pop_back(); - state = state_stack.back(); stack.pop_back(); + frame = &stack.back(); } else { QTC::TC("qpdf", "QPDFParser bad array close"); warn("treating unexpected array close token as null"); @@ -137,8 +112,8 @@ QPDFParser::parse(bool& empty, bool content_stream) break; case QPDFTokenizer::tt_dict_close: - if (state == st_dictionary) { - if ((state_stack.size() < 2) || (stack.size() < 2)) { + if (frame->state == st_dictionary) { + if (stack.size() < 2) { throw std::logic_error("QPDFParser::parseInternal: st_stop encountered with " "insufficient elements in stack"); } @@ -146,7 +121,7 @@ QPDFParser::parse(bool& empty, bool content_stream) // Convert list to map. Alternating elements are keys. Attempt to recover more or // less gracefully from invalid dictionaries. std::set names; - for (auto& obj: olist) { + for (auto& obj: frame->olist) { if (obj) { if (obj->getTypeCode() == ::ot_name) { names.insert(obj->getStringValue()); @@ -156,7 +131,7 @@ QPDFParser::parse(bool& empty, bool content_stream) std::map dict; int next_fake_key = 1; - for (auto iter = olist.begin(); iter != olist.end();) { + for (auto iter = frame->olist.begin(); iter != frame->olist.end();) { // Calculate key. std::string key; if (*iter && (*iter)->getTypeCode() == ::ot_name) { @@ -169,49 +144,48 @@ QPDFParser::parse(bool& empty, bool content_stream) QTC::TC("qpdf", "QPDFParser found fake", (found_fake ? 0 : 1)); } warn( - offset, + frame->offset, "expected dictionary key but found non-name object; inserting key " + key); } if (dict.count(key) > 0) { QTC::TC("qpdf", "QPDFParser duplicate dict key"); warn( - offset, + frame->offset, "dictionary has duplicated key " + key + "; last occurrence overrides earlier ones"); } // Calculate value. std::shared_ptr val; - if (iter != olist.end()) { + if (iter != frame->olist.end()) { val = *iter; ++iter; } else { QTC::TC("qpdf", "QPDFParser no val for last key"); warn( - offset, + frame->offset, "dictionary ended prematurely; using null as value for last key"); val = QPDF_Null::create(); } dict[std::move(key)] = std::move(val); } - if (!frame.contents_string.empty() && dict.count("/Type") && + if (!frame->contents_string.empty() && dict.count("/Type") && dict["/Type"].isNameAndEquals("/Sig") && dict.count("/ByteRange") && dict.count("/Contents") && dict["/Contents"].isString()) { - dict["/Contents"] = QPDFObjectHandle::newString(frame.contents_string); - dict["/Contents"].setParsedOffset(frame.contents_offset); + dict["/Contents"] = QPDFObjectHandle::newString(frame->contents_string); + dict["/Contents"].setParsedOffset(frame->contents_offset); } object = QPDF_Dictionary::create(std::move(dict)); - setDescription(object, offset - 2); + setDescription(object, frame->offset - 2); // The `offset` points to the next of "<<". Set the rewind offset to point to the // beginning of "<<". This has been explicitly tested with whitespace surrounding // the dictionary start delimiter. getLastOffset points to the dictionary end token // and therefore can't be used here. set_offset = true; - state_stack.pop_back(); - state = state_stack.back(); stack.pop_back(); + frame = &stack.back(); } else { QTC::TC("qpdf", "QPDFParser bad dictionary close"); warn("unexpected dictionary close token"); @@ -229,11 +203,12 @@ QPDFParser::parse(bool& empty, bool content_stream) warn("ignoring excessively deeply nested data structure"); return {QPDF_Null::create()}; } else { - state_stack.push_back( + b_contents = false; + stack.emplace_back( + input, (tokenizer.getType() == QPDFTokenizer::tt_array_open) ? st_array : st_dictionary); - b_contents = false; - stack.emplace_back(input); + frame = &stack.back(); continue; } @@ -243,7 +218,7 @@ QPDFParser::parse(bool& empty, bool content_stream) case QPDFTokenizer::tt_null: is_null = true; - ++frame.null_count; + ++frame->null_count; break; @@ -271,23 +246,23 @@ QPDFParser::parse(bool& empty, bool content_stream) case QPDFTokenizer::tt_word: { auto const& value = tokenizer.getValue(); - auto size = olist.size(); + auto size = frame->olist.size(); if (content_stream) { object = QPDF_Operator::create(value); } else if ( - value == "R" && state != st_top && size >= 2 && olist.back() && - olist.back()->getTypeCode() == ::ot_integer && - !olist.back()->getObjGen().isIndirect() && olist.at(size - 2) && - olist.at(size - 2)->getTypeCode() == ::ot_integer && - !olist.at(size - 2)->getObjGen().isIndirect()) { + value == "R" && frame->state != st_top && size >= 2 && frame->olist.back() && + frame->olist.back()->getTypeCode() == ::ot_integer && + !frame->olist.back()->getObjGen().isIndirect() && frame->olist.at(size - 2) && + frame->olist.at(size - 2)->getTypeCode() == ::ot_integer && + !frame->olist.at(size - 2)->getObjGen().isIndirect()) { if (context == nullptr) { QTC::TC("qpdf", "QPDFParser indirect without context"); throw std::logic_error("QPDFObjectHandle::parse called without context on " "an object with indirect references"); } auto ref_og = QPDFObjGen( - QPDFObjectHandle(olist.at(size - 2)).getIntValueAsInt(), - QPDFObjectHandle(olist.back()).getIntValueAsInt()); + QPDFObjectHandle(frame->olist.at(size - 2)).getIntValueAsInt(), + QPDFObjectHandle(frame->olist.back()).getIntValueAsInt()); if (ref_og.isIndirect()) { // This action has the desirable side effect of causing dangling references // (references to indirect objects that don't appear in the PDF) in any @@ -298,9 +273,9 @@ QPDFParser::parse(bool& empty, bool content_stream) QTC::TC("qpdf", "QPDFParser indirect with 0 objid"); is_null = true; } - olist.pop_back(); - olist.pop_back(); - } else if ((value == "endobj") && (state == st_top)) { + frame->olist.pop_back(); + frame->olist.pop_back(); + } else if ((value == "endobj") && (frame->state == st_top)) { // We just saw endobj without having read anything. Treat this as a null and do // not move the input source's offset. is_null = true; @@ -322,8 +297,8 @@ QPDFParser::parse(bool& empty, bool content_stream) auto const& val = tokenizer.getValue(); if (decrypter) { if (b_contents) { - frame.contents_string = val; - frame.contents_offset = input->getLastOffset(); + frame->contents_string = val; + frame->contents_offset = input->getLastOffset(); b_contents = false; } std::string s{val}; @@ -348,7 +323,7 @@ QPDFParser::parse(bool& empty, bool content_stream) throw std::logic_error("QPDFParser:parseInternal: unexpected uninitialized object"); } - switch (state) { + switch (frame->state) { case st_dictionary: case st_array: if (is_null) { @@ -358,7 +333,7 @@ QPDFParser::parse(bool& empty, bool content_stream) setDescription(object, input->getLastOffset()); } set_offset = true; - stack.back().olist.push_back(object); + frame->olist.push_back(object); break; case st_top: @@ -371,7 +346,7 @@ QPDFParser::parse(bool& empty, bool content_stream) object = QPDF_Null::create(); } if (!set_offset) { - setDescription(object, offset); + setDescription(object, frame->offset); } return object; } diff --git a/libqpdf/qpdf/QPDFParser.hh b/libqpdf/qpdf/QPDFParser.hh index b9274824..099fcd9c 100644 --- a/libqpdf/qpdf/QPDFParser.hh +++ b/libqpdf/qpdf/QPDFParser.hh @@ -31,8 +31,25 @@ class QPDFParser QPDFObjectHandle parse(bool& empty, bool content_stream); private: + struct StackFrame; enum parser_state_e { st_top, st_dictionary, st_array }; + struct StackFrame + { + StackFrame(std::shared_ptr const& input, parser_state_e state) : + state(state), + offset(input->tell()) + { + } + + std::vector> olist; + parser_state_e state; + qpdf_offset_t offset; + std::string contents_string{""}; + qpdf_offset_t contents_offset{-1}; + int null_count{0}; + }; + bool tooManyBadTokens(); void warn(qpdf_offset_t offset, std::string const& msg) const; void warn(std::string const& msg) const; From 5a1bf035f91156d8fdc351fb18b34177ea5822e0 Mon Sep 17 00:00:00 2001 From: m-holger Date: Mon, 30 Oct 2023 12:21:34 +0000 Subject: [PATCH 08/15] Add new method QPDFParser::parseRemainder The new method is temporarily an (almost) complete copy of parse, which is temporarily (almost) unchanged. --- libqpdf/QPDFParser.cc | 338 ++++++++++++++++++++++++++++++++++++- libqpdf/qpdf/QPDFParser.hh | 5 + 2 files changed, 340 insertions(+), 3 deletions(-) diff --git a/libqpdf/QPDFParser.cc b/libqpdf/QPDFParser.cc index 125fe762..8e3d0019 100644 --- a/libqpdf/QPDFParser.cc +++ b/libqpdf/QPDFParser.cc @@ -38,11 +38,343 @@ QPDFParser::parse(bool& empty, bool content_stream) std::shared_ptr object; bool set_offset = false; - std::vector stack{{input, st_top}}; +// std::vector stack{{input, st_top}}; + stack.clear(); // NEW + stack.emplace_back(input, st_top); // NEW bool done = false; bool b_contents = false; bool is_null = false; - auto* frame = &stack.back(); + frame = &stack.back(); // CHANGED + + while (!done) { + bool indirect_ref = false; + is_null = false; + object = nullptr; + set_offset = false; + + if (!tokenizer.nextToken(*input, object_description)) { + warn(tokenizer.getErrorMessage()); + } + ++good_count; // optimistically + + switch (tokenizer.getType()) { + case QPDFTokenizer::tt_eof: + if (stack.size() > 1) { + warn("parse error while reading object"); + } + if (content_stream) { + // In content stream mode, leave object uninitialized to indicate EOF + return {}; + } +// QTC::TC("qpdf", "QPDFParser eof in parse"); + warn("unexpected EOF"); + return {QPDF_Null::create()}; + + case QPDFTokenizer::tt_bad: +// QTC::TC("qpdf", "QPDFParser bad token in parse"); + if (tooManyBadTokens()) { + return {QPDF_Null::create()}; + } + is_null = true; + break; + + case QPDFTokenizer::tt_brace_open: + case QPDFTokenizer::tt_brace_close: +// QTC::TC("qpdf", "QPDFParser bad brace"); + warn("treating unexpected brace token as null"); + if (tooManyBadTokens()) { + return {QPDF_Null::create()}; + } + is_null = true; + break; + + case QPDFTokenizer::tt_array_close: + if (frame->state == st_array) { + if (stack.size() < 2) { + throw std::logic_error("QPDFParser::parseInternal: st_stop encountered with " + "insufficient elements in stack"); + } + object = QPDF_Array::create(std::move(frame->olist), frame->null_count > 100); + setDescription(object, frame->offset - 1); + // The `offset` points to the next of "[". Set the rewind offset to point to the + // beginning of "[". This has been explicitly tested with whitespace surrounding the + // array start delimiter. getLastOffset points to the array end token and therefore + // can't be used here. + set_offset = true; + stack.pop_back(); + frame = &stack.back(); + } else { +// QTC::TC("qpdf", "QPDFParser bad array close"); + warn("treating unexpected array close token as null"); + if (tooManyBadTokens()) { + return {QPDF_Null::create()}; + } + is_null = true; + } + break; + + case QPDFTokenizer::tt_dict_close: + if (frame->state == st_dictionary) { + if (stack.size() < 2) { + throw std::logic_error("QPDFParser::parseInternal: st_stop encountered with " + "insufficient elements in stack"); + } + + // Convert list to map. Alternating elements are keys. Attempt to recover more or + // less gracefully from invalid dictionaries. + std::set names; + for (auto& obj: frame->olist) { + if (obj) { + if (obj->getTypeCode() == ::ot_name) { + names.insert(obj->getStringValue()); + } + } + } + + std::map dict; + int next_fake_key = 1; + for (auto iter = frame->olist.begin(); iter != frame->olist.end();) { + // Calculate key. + std::string key; + if (*iter && (*iter)->getTypeCode() == ::ot_name) { + key = (*iter)->getStringValue(); + ++iter; + } else { + for (bool found_fake = false; !found_fake;) { + key = "/QPDFFake" + std::to_string(next_fake_key++); + found_fake = (names.count(key) == 0); +// QTC::TC("qpdf", "QPDFParser found fake", (found_fake ? 0 : 1)); + } + warn( + frame->offset, + "expected dictionary key but found non-name object; inserting key " + + key); + } + if (dict.count(key) > 0) { +// QTC::TC("qpdf", "QPDFParser duplicate dict key"); + warn( + frame->offset, + "dictionary has duplicated key " + key + + "; last occurrence overrides earlier ones"); + } + + // Calculate value. + std::shared_ptr val; + if (iter != frame->olist.end()) { + val = *iter; + ++iter; + } else { +// QTC::TC("qpdf", "QPDFParser no val for last key"); + warn( + frame->offset, + "dictionary ended prematurely; using null as value for last key"); + val = QPDF_Null::create(); + } + + dict[std::move(key)] = std::move(val); + } + if (!frame->contents_string.empty() && dict.count("/Type") && + dict["/Type"].isNameAndEquals("/Sig") && dict.count("/ByteRange") && + dict.count("/Contents") && dict["/Contents"].isString()) { + dict["/Contents"] = QPDFObjectHandle::newString(frame->contents_string); + dict["/Contents"].setParsedOffset(frame->contents_offset); + } + object = QPDF_Dictionary::create(std::move(dict)); + setDescription(object, frame->offset - 2); + // The `offset` points to the next of "<<". Set the rewind offset to point to the + // beginning of "<<". This has been explicitly tested with whitespace surrounding + // the dictionary start delimiter. getLastOffset points to the dictionary end token + // and therefore can't be used here. + set_offset = true; + stack.pop_back(); + frame = &stack.back(); + } else { +// QTC::TC("qpdf", "QPDFParser bad dictionary close"); + warn("unexpected dictionary close token"); + if (tooManyBadTokens()) { + return {QPDF_Null::create()}; + } + is_null = true; + } + break; + + case QPDFTokenizer::tt_array_open: + case QPDFTokenizer::tt_dict_open: + if (stack.size() > 500) { +// QTC::TC("qpdf", "QPDFParser too deep"); + warn("ignoring excessively deeply nested data structure"); + return {QPDF_Null::create()}; + } else { + b_contents = false; + stack.emplace_back( + input, + (tokenizer.getType() == QPDFTokenizer::tt_array_open) ? st_array + : st_dictionary); + frame = &stack.back(); + return parseRemainder(content_stream); // NEW + continue; + } + + case QPDFTokenizer::tt_bool: + object = QPDF_Bool::create((tokenizer.getValue() == "true")); + break; + + case QPDFTokenizer::tt_null: + is_null = true; + ++frame->null_count; + + break; + + case QPDFTokenizer::tt_integer: + object = QPDF_Integer::create(QUtil::string_to_ll(tokenizer.getValue().c_str())); + break; + + case QPDFTokenizer::tt_real: + object = QPDF_Real::create(tokenizer.getValue()); + break; + + case QPDFTokenizer::tt_name: + { + auto const& name = tokenizer.getValue(); + object = QPDF_Name::create(name); + + if (name == "/Contents") { + b_contents = true; + } else { + b_contents = false; + } + } + break; + + case QPDFTokenizer::tt_word: + { + auto const& value = tokenizer.getValue(); + auto size = frame->olist.size(); + if (content_stream) { + object = QPDF_Operator::create(value); + } else if ( + value == "R" && frame->state != st_top && size >= 2 && frame->olist.back() && + frame->olist.back()->getTypeCode() == ::ot_integer && + !frame->olist.back()->getObjGen().isIndirect() && frame->olist.at(size - 2) && + frame->olist.at(size - 2)->getTypeCode() == ::ot_integer && + !frame->olist.at(size - 2)->getObjGen().isIndirect()) { + if (context == nullptr) { +// QTC::TC("qpdf", "QPDFParser indirect without context"); + throw std::logic_error("QPDFObjectHandle::parse called without context on " + "an object with indirect references"); + } + auto ref_og = QPDFObjGen( + QPDFObjectHandle(frame->olist.at(size - 2)).getIntValueAsInt(), + QPDFObjectHandle(frame->olist.back()).getIntValueAsInt()); + if (ref_og.isIndirect()) { + // This action has the desirable side effect of causing dangling references + // (references to indirect objects that don't appear in the PDF) in any + // parsed object to appear in the object cache. + object = context->getObject(ref_og).obj; + indirect_ref = true; + } else { +// QTC::TC("qpdf", "QPDFParser indirect with 0 objid"); + is_null = true; + } + frame->olist.pop_back(); + frame->olist.pop_back(); + } else if ((value == "endobj") && (frame->state == st_top)) { + // We just saw endobj without having read anything. Treat this as a null and do + // not move the input source's offset. + is_null = true; + input->seek(input->getLastOffset(), SEEK_SET); + empty = true; + } else { +// QTC::TC("qpdf", "QPDFParser treat word as string"); + warn("unknown token while reading object; treating as string"); + if (tooManyBadTokens()) { + return {QPDF_Null::create()}; + } + object = QPDF_String::create(value); + } + } + break; + + case QPDFTokenizer::tt_string: + { + auto const& val = tokenizer.getValue(); + if (decrypter) { + if (b_contents) { + frame->contents_string = val; + frame->contents_offset = input->getLastOffset(); + b_contents = false; + } + std::string s{val}; + decrypter->decryptString(s); + object = QPDF_String::create(s); + } else { + object = QPDF_String::create(val); + } + } + break; + + default: + warn("treating unknown token type as null while reading object"); + if (tooManyBadTokens()) { + return {QPDF_Null::create()}; + } + is_null = true; + break; + } + + if (object == nullptr && !is_null) { + throw std::logic_error("QPDFParser:parseInternal: unexpected uninitialized object"); + } + + switch (frame->state) { + case st_dictionary: + case st_array: + if (is_null) { + object = null_oh; + // No need to set description for direct nulls - they probably will become implicit. + } else if (!indirect_ref && !set_offset) { + setDescription(object, input->getLastOffset()); + } + set_offset = true; + frame->olist.push_back(object); + break; + + case st_top: + done = true; + break; + } + } + + if (is_null) { + object = QPDF_Null::create(); + } + if (!set_offset) { + setDescription(object, frame->offset); + } + return object; +} + +QPDFObjectHandle +QPDFParser::parseRemainder(bool content_stream) +{ + // This method must take care not to resolve any objects. Don't check the type of any object + // without first ensuring that it is a direct object. Otherwise, doing so may have the side + // effect of reading the object and changing the file pointer. If you do this, it will cause a + // logic error to be thrown from QPDF::inParse(). + + const static std::shared_ptr null_oh = QPDF_Null::create(); +// QPDF::ParseGuard pg(context); + +// empty = false; + + std::shared_ptr object; + bool set_offset = false; + +// std::vector stack{{input, st_top},}; + bool done = false; + bool b_contents = false; + bool is_null = false; + frame = &stack.back(); // CHANGED while (!done) { bool indirect_ref = false; @@ -280,7 +612,7 @@ QPDFParser::parse(bool& empty, bool content_stream) // not move the input source's offset. is_null = true; input->seek(input->getLastOffset(), SEEK_SET); - empty = true; +// empty = true; } else { QTC::TC("qpdf", "QPDFParser treat word as string"); warn("unknown token while reading object; treating as string"); diff --git a/libqpdf/qpdf/QPDFParser.hh b/libqpdf/qpdf/QPDFParser.hh index 099fcd9c..fcf40eeb 100644 --- a/libqpdf/qpdf/QPDFParser.hh +++ b/libqpdf/qpdf/QPDFParser.hh @@ -50,6 +50,9 @@ class QPDFParser int null_count{0}; }; + + QPDFObjectHandle + parseRemainder(bool content_stream); bool tooManyBadTokens(); void warn(qpdf_offset_t offset, std::string const& msg) const; void warn(std::string const& msg) const; @@ -61,6 +64,8 @@ class QPDFParser QPDFObjectHandle::StringDecrypter* decrypter; QPDF* context; std::shared_ptr description; + std::vector stack; + StackFrame* frame; // Number of recent bad tokens. int bad_count = 0; // Number of good tokens since last bad token. Irrelevant if bad_count == 0. From 172cc6130583d3c30df3fcea22528afca4b12e5f Mon Sep 17 00:00:00 2001 From: m-holger Date: Mon, 30 Oct 2023 13:42:00 +0000 Subject: [PATCH 09/15] Remove redundant code in QPDFParser::parse and parseRemainder Also, fix test cases. --- libqpdf/QPDFParser.cc | 419 +++++++------------------------ qpdf/qpdf.testcov | 6 + qpdf/qtest/parsing.test | 2 +- qpdf/qtest/qpdf/bad39.qdf | 102 ++++++++ qpdf/qtest/qpdf/parse-object.out | 6 + qpdf/test_driver.cc | 7 + 6 files changed, 217 insertions(+), 325 deletions(-) create mode 100644 qpdf/qtest/qpdf/bad39.qdf diff --git a/libqpdf/QPDFParser.cc b/libqpdf/QPDFParser.cc index 8e3d0019..1758c7b8 100644 --- a/libqpdf/QPDFParser.cc +++ b/libqpdf/QPDFParser.cc @@ -21,7 +21,6 @@ #include - QPDFObjectHandle QPDFParser::parse(bool& empty, bool content_stream) { @@ -30,327 +29,110 @@ QPDFParser::parse(bool& empty, bool content_stream) // effect of reading the object and changing the file pointer. If you do this, it will cause a // logic error to be thrown from QPDF::inParse(). - const static std::shared_ptr null_oh = QPDF_Null::create(); QPDF::ParseGuard pg(context); - empty = false; std::shared_ptr object; - bool set_offset = false; + stack.clear(); + stack.emplace_back(input, st_top); + frame = &stack.back(); + object = nullptr; -// std::vector stack{{input, st_top}}; - stack.clear(); // NEW - stack.emplace_back(input, st_top); // NEW - bool done = false; - bool b_contents = false; - bool is_null = false; - frame = &stack.back(); // CHANGED + if (!tokenizer.nextToken(*input, object_description)) { + warn(tokenizer.getErrorMessage()); + } - while (!done) { - bool indirect_ref = false; - is_null = false; - object = nullptr; - set_offset = false; - - if (!tokenizer.nextToken(*input, object_description)) { - warn(tokenizer.getErrorMessage()); + switch (tokenizer.getType()) { + case QPDFTokenizer::tt_eof: + if (content_stream) { + // In content stream mode, leave object uninitialized to indicate EOF + return {}; } - ++good_count; // optimistically + QTC::TC("qpdf", "QPDFParser eof in parse"); + warn("unexpected EOF"); + return {QPDF_Null::create()}; - switch (tokenizer.getType()) { - case QPDFTokenizer::tt_eof: - if (stack.size() > 1) { - warn("parse error while reading object"); - } + case QPDFTokenizer::tt_bad: + QTC::TC("qpdf", "QPDFParser bad token in parse"); + return {QPDF_Null::create()}; + + case QPDFTokenizer::tt_brace_open: + case QPDFTokenizer::tt_brace_close: + QTC::TC("qpdf", "QPDFParser bad brace"); + warn("treating unexpected brace token as null"); + return {QPDF_Null::create()}; + + case QPDFTokenizer::tt_array_close: + QTC::TC("qpdf", "QPDFParser bad array close"); + warn("treating unexpected array close token as null"); + return {QPDF_Null::create()}; + + case QPDFTokenizer::tt_dict_close: + QTC::TC("qpdf", "QPDFParser bad dictionary close"); + warn("unexpected dictionary close token"); + return {QPDF_Null::create()}; + + case QPDFTokenizer::tt_array_open: + case QPDFTokenizer::tt_dict_open: + stack.emplace_back( + input, + (tokenizer.getType() == QPDFTokenizer::tt_array_open) ? st_array : st_dictionary); + return parseRemainder(content_stream); + + case QPDFTokenizer::tt_bool: + object = QPDF_Bool::create((tokenizer.getValue() == "true")); + break; + + case QPDFTokenizer::tt_null: + return {QPDF_Null::create()}; + + case QPDFTokenizer::tt_integer: + object = QPDF_Integer::create(QUtil::string_to_ll(tokenizer.getValue().c_str())); + break; + + case QPDFTokenizer::tt_real: + object = QPDF_Real::create(tokenizer.getValue()); + break; + + case QPDFTokenizer::tt_name: + object = QPDF_Name::create(tokenizer.getValue()); + break; + + case QPDFTokenizer::tt_word: + { + auto const& value = tokenizer.getValue(); if (content_stream) { - // In content stream mode, leave object uninitialized to indicate EOF - return {}; - } -// QTC::TC("qpdf", "QPDFParser eof in parse"); - warn("unexpected EOF"); - return {QPDF_Null::create()}; - - case QPDFTokenizer::tt_bad: -// QTC::TC("qpdf", "QPDFParser bad token in parse"); - if (tooManyBadTokens()) { - return {QPDF_Null::create()}; - } - is_null = true; - break; - - case QPDFTokenizer::tt_brace_open: - case QPDFTokenizer::tt_brace_close: -// QTC::TC("qpdf", "QPDFParser bad brace"); - warn("treating unexpected brace token as null"); - if (tooManyBadTokens()) { - return {QPDF_Null::create()}; - } - is_null = true; - break; - - case QPDFTokenizer::tt_array_close: - if (frame->state == st_array) { - if (stack.size() < 2) { - throw std::logic_error("QPDFParser::parseInternal: st_stop encountered with " - "insufficient elements in stack"); - } - object = QPDF_Array::create(std::move(frame->olist), frame->null_count > 100); - setDescription(object, frame->offset - 1); - // The `offset` points to the next of "[". Set the rewind offset to point to the - // beginning of "[". This has been explicitly tested with whitespace surrounding the - // array start delimiter. getLastOffset points to the array end token and therefore - // can't be used here. - set_offset = true; - stack.pop_back(); - frame = &stack.back(); - } else { -// QTC::TC("qpdf", "QPDFParser bad array close"); - warn("treating unexpected array close token as null"); - if (tooManyBadTokens()) { - return {QPDF_Null::create()}; - } - is_null = true; - } - break; - - case QPDFTokenizer::tt_dict_close: - if (frame->state == st_dictionary) { - if (stack.size() < 2) { - throw std::logic_error("QPDFParser::parseInternal: st_stop encountered with " - "insufficient elements in stack"); - } - - // Convert list to map. Alternating elements are keys. Attempt to recover more or - // less gracefully from invalid dictionaries. - std::set names; - for (auto& obj: frame->olist) { - if (obj) { - if (obj->getTypeCode() == ::ot_name) { - names.insert(obj->getStringValue()); - } - } - } - - std::map dict; - int next_fake_key = 1; - for (auto iter = frame->olist.begin(); iter != frame->olist.end();) { - // Calculate key. - std::string key; - if (*iter && (*iter)->getTypeCode() == ::ot_name) { - key = (*iter)->getStringValue(); - ++iter; - } else { - for (bool found_fake = false; !found_fake;) { - key = "/QPDFFake" + std::to_string(next_fake_key++); - found_fake = (names.count(key) == 0); -// QTC::TC("qpdf", "QPDFParser found fake", (found_fake ? 0 : 1)); - } - warn( - frame->offset, - "expected dictionary key but found non-name object; inserting key " + - key); - } - if (dict.count(key) > 0) { -// QTC::TC("qpdf", "QPDFParser duplicate dict key"); - warn( - frame->offset, - "dictionary has duplicated key " + key + - "; last occurrence overrides earlier ones"); - } - - // Calculate value. - std::shared_ptr val; - if (iter != frame->olist.end()) { - val = *iter; - ++iter; - } else { -// QTC::TC("qpdf", "QPDFParser no val for last key"); - warn( - frame->offset, - "dictionary ended prematurely; using null as value for last key"); - val = QPDF_Null::create(); - } - - dict[std::move(key)] = std::move(val); - } - if (!frame->contents_string.empty() && dict.count("/Type") && - dict["/Type"].isNameAndEquals("/Sig") && dict.count("/ByteRange") && - dict.count("/Contents") && dict["/Contents"].isString()) { - dict["/Contents"] = QPDFObjectHandle::newString(frame->contents_string); - dict["/Contents"].setParsedOffset(frame->contents_offset); - } - object = QPDF_Dictionary::create(std::move(dict)); - setDescription(object, frame->offset - 2); - // The `offset` points to the next of "<<". Set the rewind offset to point to the - // beginning of "<<". This has been explicitly tested with whitespace surrounding - // the dictionary start delimiter. getLastOffset points to the dictionary end token - // and therefore can't be used here. - set_offset = true; - stack.pop_back(); - frame = &stack.back(); - } else { -// QTC::TC("qpdf", "QPDFParser bad dictionary close"); - warn("unexpected dictionary close token"); - if (tooManyBadTokens()) { - return {QPDF_Null::create()}; - } - is_null = true; - } - break; - - case QPDFTokenizer::tt_array_open: - case QPDFTokenizer::tt_dict_open: - if (stack.size() > 500) { -// QTC::TC("qpdf", "QPDFParser too deep"); - warn("ignoring excessively deeply nested data structure"); + object = QPDF_Operator::create(value); + } else if (value == "endobj") { + // We just saw endobj without having read anything. Treat this as a null and do + // not move the input source's offset. + input->seek(input->getLastOffset(), SEEK_SET); + empty = true; return {QPDF_Null::create()}; } else { - b_contents = false; - stack.emplace_back( - input, - (tokenizer.getType() == QPDFTokenizer::tt_array_open) ? st_array - : st_dictionary); - frame = &stack.back(); - return parseRemainder(content_stream); // NEW - continue; + QTC::TC("qpdf", "QPDFParser treat word as string"); + warn("unknown token while reading object; treating as string"); + object = QPDF_String::create(value); } - - case QPDFTokenizer::tt_bool: - object = QPDF_Bool::create((tokenizer.getValue() == "true")); - break; - - case QPDFTokenizer::tt_null: - is_null = true; - ++frame->null_count; - - break; - - case QPDFTokenizer::tt_integer: - object = QPDF_Integer::create(QUtil::string_to_ll(tokenizer.getValue().c_str())); - break; - - case QPDFTokenizer::tt_real: - object = QPDF_Real::create(tokenizer.getValue()); - break; - - case QPDFTokenizer::tt_name: - { - auto const& name = tokenizer.getValue(); - object = QPDF_Name::create(name); - - if (name == "/Contents") { - b_contents = true; - } else { - b_contents = false; - } - } - break; - - case QPDFTokenizer::tt_word: - { - auto const& value = tokenizer.getValue(); - auto size = frame->olist.size(); - if (content_stream) { - object = QPDF_Operator::create(value); - } else if ( - value == "R" && frame->state != st_top && size >= 2 && frame->olist.back() && - frame->olist.back()->getTypeCode() == ::ot_integer && - !frame->olist.back()->getObjGen().isIndirect() && frame->olist.at(size - 2) && - frame->olist.at(size - 2)->getTypeCode() == ::ot_integer && - !frame->olist.at(size - 2)->getObjGen().isIndirect()) { - if (context == nullptr) { -// QTC::TC("qpdf", "QPDFParser indirect without context"); - throw std::logic_error("QPDFObjectHandle::parse called without context on " - "an object with indirect references"); - } - auto ref_og = QPDFObjGen( - QPDFObjectHandle(frame->olist.at(size - 2)).getIntValueAsInt(), - QPDFObjectHandle(frame->olist.back()).getIntValueAsInt()); - if (ref_og.isIndirect()) { - // This action has the desirable side effect of causing dangling references - // (references to indirect objects that don't appear in the PDF) in any - // parsed object to appear in the object cache. - object = context->getObject(ref_og).obj; - indirect_ref = true; - } else { -// QTC::TC("qpdf", "QPDFParser indirect with 0 objid"); - is_null = true; - } - frame->olist.pop_back(); - frame->olist.pop_back(); - } else if ((value == "endobj") && (frame->state == st_top)) { - // We just saw endobj without having read anything. Treat this as a null and do - // not move the input source's offset. - is_null = true; - input->seek(input->getLastOffset(), SEEK_SET); - empty = true; - } else { -// QTC::TC("qpdf", "QPDFParser treat word as string"); - warn("unknown token while reading object; treating as string"); - if (tooManyBadTokens()) { - return {QPDF_Null::create()}; - } - object = QPDF_String::create(value); - } - } - break; - - case QPDFTokenizer::tt_string: - { - auto const& val = tokenizer.getValue(); - if (decrypter) { - if (b_contents) { - frame->contents_string = val; - frame->contents_offset = input->getLastOffset(); - b_contents = false; - } - std::string s{val}; - decrypter->decryptString(s); - object = QPDF_String::create(s); - } else { - object = QPDF_String::create(val); - } - } - break; - - default: - warn("treating unknown token type as null while reading object"); - if (tooManyBadTokens()) { - return {QPDF_Null::create()}; - } - is_null = true; - break; } + break; - if (object == nullptr && !is_null) { - throw std::logic_error("QPDFParser:parseInternal: unexpected uninitialized object"); + case QPDFTokenizer::tt_string: + if (decrypter) { + std::string s{tokenizer.getValue()}; + decrypter->decryptString(s); + object = QPDF_String::create(s); + } else { + object = QPDF_String::create(tokenizer.getValue()); } + break; - switch (frame->state) { - case st_dictionary: - case st_array: - if (is_null) { - object = null_oh; - // No need to set description for direct nulls - they probably will become implicit. - } else if (!indirect_ref && !set_offset) { - setDescription(object, input->getLastOffset()); - } - set_offset = true; - frame->olist.push_back(object); - break; - - case st_top: - done = true; - break; - } + default: + warn("treating unknown token type as null while reading object"); + return {QPDF_Null::create()}; } - if (is_null) { - object = QPDF_Null::create(); - } - if (!set_offset) { - setDescription(object, frame->offset); - } + setDescription(object, frame->offset); return object; } @@ -363,18 +145,15 @@ QPDFParser::parseRemainder(bool content_stream) // logic error to be thrown from QPDF::inParse(). const static std::shared_ptr null_oh = QPDF_Null::create(); -// QPDF::ParseGuard pg(context); - -// empty = false; std::shared_ptr object; bool set_offset = false; -// std::vector stack{{input, st_top},}; bool done = false; bool b_contents = false; bool is_null = false; frame = &stack.back(); // CHANGED + bad_count = 0; while (!done) { bool indirect_ref = false; @@ -389,19 +168,17 @@ QPDFParser::parseRemainder(bool content_stream) switch (tokenizer.getType()) { case QPDFTokenizer::tt_eof: - if (stack.size() > 1) { - warn("parse error while reading object"); - } + warn("parse error while reading object"); if (content_stream) { // In content stream mode, leave object uninitialized to indicate EOF return {}; } - QTC::TC("qpdf", "QPDFParser eof in parse"); + QTC::TC("qpdf", "QPDFParser eof in parseRemainder"); warn("unexpected EOF"); return {QPDF_Null::create()}; case QPDFTokenizer::tt_bad: - QTC::TC("qpdf", "QPDFParser bad token in parse"); + QTC::TC("qpdf", "QPDFParser bad token in parseRemainder"); if (tooManyBadTokens()) { return {QPDF_Null::create()}; } @@ -410,7 +187,7 @@ QPDFParser::parseRemainder(bool content_stream) case QPDFTokenizer::tt_brace_open: case QPDFTokenizer::tt_brace_close: - QTC::TC("qpdf", "QPDFParser bad brace"); + QTC::TC("qpdf", "QPDFParser bad brace in parseRemainder"); warn("treating unexpected brace token as null"); if (tooManyBadTokens()) { return {QPDF_Null::create()}; @@ -434,7 +211,7 @@ QPDFParser::parseRemainder(bool content_stream) stack.pop_back(); frame = &stack.back(); } else { - QTC::TC("qpdf", "QPDFParser bad array close"); + QTC::TC("qpdf", "QPDFParser bad array close in parseRemainder"); warn("treating unexpected array close token as null"); if (tooManyBadTokens()) { return {QPDF_Null::create()}; @@ -519,7 +296,7 @@ QPDFParser::parseRemainder(bool content_stream) stack.pop_back(); frame = &stack.back(); } else { - QTC::TC("qpdf", "QPDFParser bad dictionary close"); + QTC::TC("qpdf", "QPDFParser bad dictionary close in parseRemainder"); warn("unexpected dictionary close token"); if (tooManyBadTokens()) { return {QPDF_Null::create()}; @@ -582,7 +359,7 @@ QPDFParser::parseRemainder(bool content_stream) if (content_stream) { object = QPDF_Operator::create(value); } else if ( - value == "R" && frame->state != st_top && size >= 2 && frame->olist.back() && + value == "R" && size >= 2 && frame->olist.back() && frame->olist.back()->getTypeCode() == ::ot_integer && !frame->olist.back()->getObjGen().isIndirect() && frame->olist.at(size - 2) && frame->olist.at(size - 2)->getTypeCode() == ::ot_integer && @@ -607,14 +384,8 @@ QPDFParser::parseRemainder(bool content_stream) } frame->olist.pop_back(); frame->olist.pop_back(); - } else if ((value == "endobj") && (frame->state == st_top)) { - // We just saw endobj without having read anything. Treat this as a null and do - // not move the input source's offset. - is_null = true; - input->seek(input->getLastOffset(), SEEK_SET); -// empty = true; } else { - QTC::TC("qpdf", "QPDFParser treat word as string"); + QTC::TC("qpdf", "QPDFParser treat word as string in parseRemainder"); warn("unknown token while reading object; treating as string"); if (tooManyBadTokens()) { return {QPDF_Null::create()}; diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov index ec11c57b..cbb4ac1d 100644 --- a/qpdf/qpdf.testcov +++ b/qpdf/qpdf.testcov @@ -57,11 +57,14 @@ QPDF trailer lacks size 0 QPDF trailer size not integer 0 QPDF trailer prev not integer 0 QPDFParser bad brace 0 +QPDFParser bad brace in parseRemainder 0 QPDFParser bad array close 0 +QPDFParser bad array close in parseRemainder 0 QPDF stream without length 0 QPDF stream length not integer 0 QPDF missing endstream 0 QPDFParser bad dictionary close 0 +QPDFParser bad dictionary close in parseRemainder 0 QPDF can't find xref 0 QPDFTokenizer bad ) 0 QPDFTokenizer bad > 0 @@ -258,6 +261,7 @@ QPDFParser indirect with 0 objid 0 QPDF object id 0 0 QPDF recursion loop in resolve 0 QPDFParser treat word as string 0 +QPDFParser treat word as string in parseRemainder 0 QPDFParser found fake 1 QPDFParser no val for last key 0 QPDF resolve failure to null 0 @@ -289,7 +293,9 @@ QPDFObjectHandle coalesce called on stream 0 QPDFObjectHandle coalesce provide stream data 0 QPDF_Stream bad token at end during normalize 0 QPDFParser bad token in parse 0 +QPDFParser bad token in parseRemainder 0 QPDFParser eof in parse 0 +QPDFParser eof in parseRemainder 0 QPDFObjectHandle array bounds 0 QPDFObjectHandle boolean returning false 0 QPDFObjectHandle integer returning 0 0 diff --git a/qpdf/qtest/parsing.test b/qpdf/qtest/parsing.test index 23edcac4..97cf9edf 100644 --- a/qpdf/qtest/parsing.test +++ b/qpdf/qtest/parsing.test @@ -17,7 +17,7 @@ my $td = new TestDriver('parsing'); my $n_tests = 17; $td->runtest("parse objects from string", - {$td->COMMAND => "test_driver 31 good1.qdf"}, + {$td->COMMAND => "test_driver 31 bad39.qdf"}, {$td->FILE => "parse-object.out", $td->EXIT_STATUS => 0}, $td->NORMALIZE_NEWLINES); $td->runtest("EOF terminating literal tokens", diff --git a/qpdf/qtest/qpdf/bad39.qdf b/qpdf/qtest/qpdf/bad39.qdf new file mode 100644 index 00000000..1da316e6 --- /dev/null +++ b/qpdf/qtest/qpdf/bad39.qdf @@ -0,0 +1,102 @@ +%PDF-1.3 +%¿÷¢þ +%QDF-1.0 + +%% Original object ID: 1 0 +1 0 obj +<< + /Pages 2 0 R + /Type /Catalog +>> +endobj + +%% Original object ID: 2 0 +2 0 obj +<< + /Count 1 + /Kids [ + 3 0 R + ] + /Type /Pages +>> +endobj + +%% Page 1 +%% Original object ID: 3 0 +3 0 obj +<< + /Contents 4 0 R + /MediaBox [ + 0 + 0 + 612 + 792 + ] + /Parent 2 0 R + /Resources << + /Font << + /F1 6 0 R + >> + /ProcSet 7 0 R + >> + /Type /Page +>> +endobj + +%% Contents for page 1 +%% Original object ID: 4 0 +4 0 obj +<< + /Length 5 0 R +>> +stream +BT + /F1 24 Tf + 72 720 Td + (Potato) Tj +ET +endstream +endobj + +5 0 obj +44 +endobj + +%% Original object ID: 6 0 +6 0 obj +<< + /BaseFont /Helvetica + /Encoding /WinAnsiEncoding + /Name /F1 + /Subtype /Type1 + /Type /Font +>> +endobj + +%% Original object ID: 5 0 +7 0 obj +[ + /PDF + /Text +] +endobj + +xref +0 8 +0000000000 65535 f +0000000052 00000 n +0000000133 00000 n +0000000242 00000 n +0000000484 00000 n +0000000583 00000 n +0000000629 00000 n +0000001113 00000 n +trailer << + /Root 1 0 R + /Size 8 + /ID [<31415926535897932384626433832795><31415926535897932384626433832795>] +>> +startxref +809 +%%EOF +7 0 obj diff --git a/qpdf/qtest/qpdf/parse-object.out b/qpdf/qtest/qpdf/parse-object.out index 2e09f6ad..cb3cb742 100644 --- a/qpdf/qtest/qpdf/parse-object.out +++ b/qpdf/qtest/qpdf/parse-object.out @@ -2,4 +2,10 @@ logic error parsing indirect: QPDFObjectHandle::parse called without context on an object with indirect references trailing data: parsed object (trailing test): trailing data found parsing object from string WARNING: parsed object (offset 9): unknown token while reading object; treating as string +WARNING: parsed object: treating unexpected brace token as null +WARNING: parsed object: treating unexpected brace token as null +WARNING: parsed object: unexpected dictionary close token +WARNING: bad39.qdf (object 7 0, offset 1121): unexpected EOF +WARNING: bad39.qdf (object 7 0, offset 1121): expected endobj +WARNING: bad39.qdf (object 7 0, offset 1121): EOF after endobj test 31 done diff --git a/qpdf/test_driver.cc b/qpdf/test_driver.cc index 03631eb2..319c80d2 100644 --- a/qpdf/test_driver.cc +++ b/qpdf/test_driver.cc @@ -1195,6 +1195,13 @@ test_31(QPDF& pdf, char const* arg2) // mistakenly parsed as an indirect object. assert(QPDFObjectHandle::parse(&pdf, "[5 0 R 0 R /X]").unparse() == "[ 5 0 R 0 (R) /X ]"); assert(QPDFObjectHandle::parse(&pdf, "[1 0 R]", "indirect test").unparse() == "[ 1 0 R ]"); + // TC:QPDFParser bad brace + assert(QPDFObjectHandle::parse(&pdf, "}").unparse() == "null"); + assert(QPDFObjectHandle::parse(&pdf, "{").unparse() == "null"); + // TC:QPDFParser bad dictionary close + assert(QPDFObjectHandle::parse(&pdf, ">>").unparse() == "null"); + // TC:QPDFParser eof in parse + assert(QPDFObjectHandle::parse(&pdf, "[7 0 R]").getArrayItem(0).isNull()); } static void From c912af7384d5abeb281856f7122d427af5e29d73 Mon Sep 17 00:00:00 2001 From: m-holger Date: Mon, 30 Oct 2023 19:47:47 +0000 Subject: [PATCH 10/15] In QPDFParser remove state st_top --- libqpdf/QPDFParser.cc | 65 ++++++++++++-------------------------- libqpdf/qpdf/QPDFParser.hh | 2 +- 2 files changed, 21 insertions(+), 46 deletions(-) diff --git a/libqpdf/QPDFParser.cc b/libqpdf/QPDFParser.cc index 1758c7b8..e118933f 100644 --- a/libqpdf/QPDFParser.cc +++ b/libqpdf/QPDFParser.cc @@ -33,10 +33,7 @@ QPDFParser::parse(bool& empty, bool content_stream) empty = false; std::shared_ptr object; - stack.clear(); - stack.emplace_back(input, st_top); - frame = &stack.back(); - object = nullptr; + auto offset = input->tell(); if (!tokenizer.nextToken(*input, object_description)) { warn(tokenizer.getErrorMessage()); @@ -74,9 +71,11 @@ QPDFParser::parse(bool& empty, bool content_stream) case QPDFTokenizer::tt_array_open: case QPDFTokenizer::tt_dict_open: + stack.clear(); stack.emplace_back( input, (tokenizer.getType() == QPDFTokenizer::tt_array_open) ? st_array : st_dictionary); + frame = &stack.back(); return parseRemainder(content_stream); case QPDFTokenizer::tt_bool: @@ -132,7 +131,7 @@ QPDFParser::parse(bool& empty, bool content_stream) return {QPDF_Null::create()}; } - setDescription(object, frame->offset); + setDescription(object, offset); return object; } @@ -148,14 +147,11 @@ QPDFParser::parseRemainder(bool content_stream) std::shared_ptr object; bool set_offset = false; - - bool done = false; bool b_contents = false; bool is_null = false; - frame = &stack.back(); // CHANGED bad_count = 0; - while (!done) { + while (true) { bool indirect_ref = false; is_null = false; object = nullptr; @@ -197,10 +193,6 @@ QPDFParser::parseRemainder(bool content_stream) case QPDFTokenizer::tt_array_close: if (frame->state == st_array) { - if (stack.size() < 2) { - throw std::logic_error("QPDFParser::parseInternal: st_stop encountered with " - "insufficient elements in stack"); - } object = QPDF_Array::create(std::move(frame->olist), frame->null_count > 100); setDescription(object, frame->offset - 1); // The `offset` points to the next of "[". Set the rewind offset to point to the @@ -208,6 +200,9 @@ QPDFParser::parseRemainder(bool content_stream) // array start delimiter. getLastOffset points to the array end token and therefore // can't be used here. set_offset = true; + if (stack.size() <= 1) { + return object; + } stack.pop_back(); frame = &stack.back(); } else { @@ -222,11 +217,6 @@ QPDFParser::parseRemainder(bool content_stream) case QPDFTokenizer::tt_dict_close: if (frame->state == st_dictionary) { - if (stack.size() < 2) { - throw std::logic_error("QPDFParser::parseInternal: st_stop encountered with " - "insufficient elements in stack"); - } - // Convert list to map. Alternating elements are keys. Attempt to recover more or // less gracefully from invalid dictionaries. std::set names; @@ -278,7 +268,7 @@ QPDFParser::parseRemainder(bool content_stream) val = QPDF_Null::create(); } - dict[std::move(key)] = std::move(val); + dict[std::move(key)] = val; } if (!frame->contents_string.empty() && dict.count("/Type") && dict["/Type"].isNameAndEquals("/Sig") && dict.count("/ByteRange") && @@ -292,6 +282,9 @@ QPDFParser::parseRemainder(bool content_stream) // beginning of "<<". This has been explicitly tested with whitespace surrounding // the dictionary start delimiter. getLastOffset points to the dictionary end token // and therefore can't be used here. + if (stack.size() <= 1) { + return object; + } set_offset = true; stack.pop_back(); frame = &stack.back(); @@ -307,7 +300,7 @@ QPDFParser::parseRemainder(bool content_stream) case QPDFTokenizer::tt_array_open: case QPDFTokenizer::tt_dict_open: - if (stack.size() > 500) { + if (stack.size() > 499) { QTC::TC("qpdf", "QPDFParser too deep"); warn("ignoring excessively deeply nested data structure"); return {QPDF_Null::create()}; @@ -328,7 +321,6 @@ QPDFParser::parseRemainder(bool content_stream) case QPDFTokenizer::tt_null: is_null = true; ++frame->null_count; - break; case QPDFTokenizer::tt_integer: @@ -426,32 +418,15 @@ QPDFParser::parseRemainder(bool content_stream) throw std::logic_error("QPDFParser:parseInternal: unexpected uninitialized object"); } - switch (frame->state) { - case st_dictionary: - case st_array: - if (is_null) { - object = null_oh; - // No need to set description for direct nulls - they probably will become implicit. - } else if (!indirect_ref && !set_offset) { - setDescription(object, input->getLastOffset()); - } - set_offset = true; - frame->olist.push_back(object); - break; - - case st_top: - done = true; - break; + if (is_null) { + object = null_oh; + // No need to set description for direct nulls - they probably will become implicit. + } else if (!indirect_ref && !set_offset) { + setDescription(object, input->getLastOffset()); } + frame->olist.push_back(object); } - - if (is_null) { - object = QPDF_Null::create(); - } - if (!set_offset) { - setDescription(object, frame->offset); - } - return object; + return {}; // unreachable } void diff --git a/libqpdf/qpdf/QPDFParser.hh b/libqpdf/qpdf/QPDFParser.hh index fcf40eeb..19373f24 100644 --- a/libqpdf/qpdf/QPDFParser.hh +++ b/libqpdf/qpdf/QPDFParser.hh @@ -32,7 +32,7 @@ class QPDFParser private: struct StackFrame; - enum parser_state_e { st_top, st_dictionary, st_array }; + enum parser_state_e { st_dictionary, st_array }; struct StackFrame { From 4c8836d52053187e8006021d4e5d367bdaf80093 Mon Sep 17 00:00:00 2001 From: m-holger Date: Mon, 30 Oct 2023 20:03:00 +0000 Subject: [PATCH 11/15] In QPDFParser::parse eliminate most temporary variables --- libqpdf/QPDFParser.cc | 52 +++++++++++++++++++------------------- libqpdf/qpdf/QPDFParser.hh | 15 +++++++---- 2 files changed, 36 insertions(+), 31 deletions(-) diff --git a/libqpdf/QPDFParser.cc b/libqpdf/QPDFParser.cc index e118933f..a76db91f 100644 --- a/libqpdf/QPDFParser.cc +++ b/libqpdf/QPDFParser.cc @@ -21,6 +21,8 @@ #include +using ObjectPtr = std::shared_ptr; + QPDFObjectHandle QPDFParser::parse(bool& empty, bool content_stream) { @@ -31,9 +33,7 @@ QPDFParser::parse(bool& empty, bool content_stream) QPDF::ParseGuard pg(context); empty = false; - - std::shared_ptr object; - auto offset = input->tell(); + start = input->tell(); if (!tokenizer.nextToken(*input, object_description)) { warn(tokenizer.getErrorMessage()); @@ -79,29 +79,25 @@ QPDFParser::parse(bool& empty, bool content_stream) return parseRemainder(content_stream); case QPDFTokenizer::tt_bool: - object = QPDF_Bool::create((tokenizer.getValue() == "true")); - break; + return withDescription(tokenizer.getValue() == "true"); case QPDFTokenizer::tt_null: return {QPDF_Null::create()}; case QPDFTokenizer::tt_integer: - object = QPDF_Integer::create(QUtil::string_to_ll(tokenizer.getValue().c_str())); - break; + return withDescription(QUtil::string_to_ll(tokenizer.getValue().c_str())); case QPDFTokenizer::tt_real: - object = QPDF_Real::create(tokenizer.getValue()); - break; + return withDescription(tokenizer.getValue()); case QPDFTokenizer::tt_name: - object = QPDF_Name::create(tokenizer.getValue()); - break; + return withDescription(tokenizer.getValue()); case QPDFTokenizer::tt_word: { auto const& value = tokenizer.getValue(); if (content_stream) { - object = QPDF_Operator::create(value); + return withDescription(value); } else if (value == "endobj") { // We just saw endobj without having read anything. Treat this as a null and do // not move the input source's offset. @@ -111,28 +107,23 @@ QPDFParser::parse(bool& empty, bool content_stream) } else { QTC::TC("qpdf", "QPDFParser treat word as string"); warn("unknown token while reading object; treating as string"); - object = QPDF_String::create(value); + return withDescription(value); } } - break; case QPDFTokenizer::tt_string: if (decrypter) { - std::string s{tokenizer.getValue()}; - decrypter->decryptString(s); - object = QPDF_String::create(s); + std::string s{tokenizer.getValue()}; + decrypter->decryptString(s); + return withDescription(s); } else { - object = QPDF_String::create(tokenizer.getValue()); + return withDescription(tokenizer.getValue()); } - break; default: warn("treating unknown token type as null while reading object"); return {QPDF_Null::create()}; } - - setDescription(object, offset); - return object; } QPDFObjectHandle @@ -143,9 +134,9 @@ QPDFParser::parseRemainder(bool content_stream) // effect of reading the object and changing the file pointer. If you do this, it will cause a // logic error to be thrown from QPDF::inParse(). - const static std::shared_ptr null_oh = QPDF_Null::create(); + const static ObjectPtr null_oh = QPDF_Null::create(); - std::shared_ptr object; + ObjectPtr object; bool set_offset = false; bool b_contents = false; bool is_null = false; @@ -256,7 +247,7 @@ QPDFParser::parseRemainder(bool content_stream) } // Calculate value. - std::shared_ptr val; + ObjectPtr val; if (iter != frame->olist.end()) { val = *iter; ++iter; @@ -429,8 +420,17 @@ QPDFParser::parseRemainder(bool content_stream) return {}; // unreachable } +template +QPDFObjectHandle +QPDFParser::withDescription(Args&&... args) +{ + auto obj = T::create(args...); + obj->setDescription(context, description, start); + return {obj}; +} + void -QPDFParser::setDescription(std::shared_ptr& obj, qpdf_offset_t parsed_offset) +QPDFParser::setDescription(ObjectPtr& obj, qpdf_offset_t parsed_offset) { if (obj) { obj->setDescription(context, description, parsed_offset); diff --git a/libqpdf/qpdf/QPDFParser.hh b/libqpdf/qpdf/QPDFParser.hh index 19373f24..b571cdaa 100644 --- a/libqpdf/qpdf/QPDFParser.hh +++ b/libqpdf/qpdf/QPDFParser.hh @@ -45,18 +45,20 @@ class QPDFParser std::vector> olist; parser_state_e state; qpdf_offset_t offset; - std::string contents_string{""}; + std::string contents_string; qpdf_offset_t contents_offset{-1}; int null_count{0}; }; - - QPDFObjectHandle - parseRemainder(bool content_stream); + QPDFObjectHandle parseRemainder(bool content_stream); bool tooManyBadTokens(); void warn(qpdf_offset_t offset, std::string const& msg) const; void warn(std::string const& msg) const; void warn(QPDFExc const&) const; + template + // Create a new scalar object complete with parsed offset and description. + // NB the offset includes any leading whitespace. + QPDFObjectHandle withDescription(Args&&... args); void setDescription(std::shared_ptr& obj, qpdf_offset_t parsed_offset); std::shared_ptr input; std::string const& object_description; @@ -65,11 +67,14 @@ class QPDFParser QPDF* context; std::shared_ptr description; std::vector stack; - StackFrame* frame; + StackFrame* frame; // Number of recent bad tokens. int bad_count = 0; // Number of good tokens since last bad token. Irrelevant if bad_count == 0. int good_count = 0; + // Start offset including any leading whitespace. + qpdf_offset_t start; + }; #endif // QPDFPARSER_HH From 1548b8d8be4b57e4087caaced1794a25a01c3488 Mon Sep 17 00:00:00 2001 From: m-holger Date: Tue, 31 Oct 2023 11:57:34 +0000 Subject: [PATCH 12/15] In QPDFParser::parseRemainder eliminate most temporary variables --- libqpdf/QPDFParser.cc | 127 +++++++++++++++++++------------------ libqpdf/qpdf/QPDFParser.hh | 4 ++ 2 files changed, 68 insertions(+), 63 deletions(-) diff --git a/libqpdf/QPDFParser.cc b/libqpdf/QPDFParser.cc index a76db91f..a1c79468 100644 --- a/libqpdf/QPDFParser.cc +++ b/libqpdf/QPDFParser.cc @@ -113,11 +113,11 @@ QPDFParser::parse(bool& empty, bool content_stream) case QPDFTokenizer::tt_string: if (decrypter) { - std::string s{tokenizer.getValue()}; - decrypter->decryptString(s); - return withDescription(s); + std::string s{tokenizer.getValue()}; + decrypter->decryptString(s); + return withDescription(s); } else { - return withDescription(tokenizer.getValue()); + return withDescription(tokenizer.getValue()); } default: @@ -134,20 +134,10 @@ QPDFParser::parseRemainder(bool content_stream) // effect of reading the object and changing the file pointer. If you do this, it will cause a // logic error to be thrown from QPDF::inParse(). - const static ObjectPtr null_oh = QPDF_Null::create(); - - ObjectPtr object; - bool set_offset = false; - bool b_contents = false; - bool is_null = false; bad_count = 0; + bool b_contents = false; while (true) { - bool indirect_ref = false; - is_null = false; - object = nullptr; - set_offset = false; - if (!tokenizer.nextToken(*input, object_description)) { warn(tokenizer.getErrorMessage()); } @@ -169,8 +159,8 @@ QPDFParser::parseRemainder(bool content_stream) if (tooManyBadTokens()) { return {QPDF_Null::create()}; } - is_null = true; - break; + addNull(); + continue; case QPDFTokenizer::tt_brace_open: case QPDFTokenizer::tt_brace_close: @@ -179,32 +169,32 @@ QPDFParser::parseRemainder(bool content_stream) if (tooManyBadTokens()) { return {QPDF_Null::create()}; } - is_null = true; - break; + addNull(); + continue; case QPDFTokenizer::tt_array_close: if (frame->state == st_array) { - object = QPDF_Array::create(std::move(frame->olist), frame->null_count > 100); + auto object = QPDF_Array::create(std::move(frame->olist), frame->null_count > 100); setDescription(object, frame->offset - 1); // The `offset` points to the next of "[". Set the rewind offset to point to the // beginning of "[". This has been explicitly tested with whitespace surrounding the // array start delimiter. getLastOffset points to the array end token and therefore // can't be used here. - set_offset = true; if (stack.size() <= 1) { return object; } stack.pop_back(); frame = &stack.back(); + add(std::move(object)); } else { QTC::TC("qpdf", "QPDFParser bad array close in parseRemainder"); warn("treating unexpected array close token as null"); if (tooManyBadTokens()) { return {QPDF_Null::create()}; } - is_null = true; + addNull(); } - break; + continue; case QPDFTokenizer::tt_dict_close: if (frame->state == st_dictionary) { @@ -267,7 +257,7 @@ QPDFParser::parseRemainder(bool content_stream) dict["/Contents"] = QPDFObjectHandle::newString(frame->contents_string); dict["/Contents"].setParsedOffset(frame->contents_offset); } - object = QPDF_Dictionary::create(std::move(dict)); + auto object = QPDF_Dictionary::create(std::move(dict)); setDescription(object, frame->offset - 2); // The `offset` points to the next of "<<". Set the rewind offset to point to the // beginning of "<<". This has been explicitly tested with whitespace surrounding @@ -276,18 +266,18 @@ QPDFParser::parseRemainder(bool content_stream) if (stack.size() <= 1) { return object; } - set_offset = true; stack.pop_back(); frame = &stack.back(); + add(std::move(object)); } else { QTC::TC("qpdf", "QPDFParser bad dictionary close in parseRemainder"); warn("unexpected dictionary close token"); if (tooManyBadTokens()) { return {QPDF_Null::create()}; } - is_null = true; + addNull(); } - break; + continue; case QPDFTokenizer::tt_array_open: case QPDFTokenizer::tt_dict_open: @@ -306,26 +296,25 @@ QPDFParser::parseRemainder(bool content_stream) } case QPDFTokenizer::tt_bool: - object = QPDF_Bool::create((tokenizer.getValue() == "true")); - break; + addScalar(tokenizer.getValue() == "true"); + continue; case QPDFTokenizer::tt_null: - is_null = true; - ++frame->null_count; - break; + addNull(); + continue; case QPDFTokenizer::tt_integer: - object = QPDF_Integer::create(QUtil::string_to_ll(tokenizer.getValue().c_str())); - break; + addScalar(QUtil::string_to_ll(tokenizer.getValue().c_str())); + continue; case QPDFTokenizer::tt_real: - object = QPDF_Real::create(tokenizer.getValue()); - break; + addScalar(tokenizer.getValue()); + continue; case QPDFTokenizer::tt_name: { auto const& name = tokenizer.getValue(); - object = QPDF_Name::create(name); + addScalar(name); if (name == "/Contents") { b_contents = true; @@ -333,14 +322,14 @@ QPDFParser::parseRemainder(bool content_stream) b_contents = false; } } - break; + continue; case QPDFTokenizer::tt_word: { auto const& value = tokenizer.getValue(); auto size = frame->olist.size(); if (content_stream) { - object = QPDF_Operator::create(value); + addScalar(value); } else if ( value == "R" && size >= 2 && frame->olist.back() && frame->olist.back()->getTypeCode() == ::ot_integer && @@ -359,24 +348,25 @@ QPDFParser::parseRemainder(bool content_stream) // This action has the desirable side effect of causing dangling references // (references to indirect objects that don't appear in the PDF) in any // parsed object to appear in the object cache. - object = context->getObject(ref_og).obj; - indirect_ref = true; + frame->olist.pop_back(); + frame->olist.pop_back(); + add(std::move(context->getObject(ref_og).obj)); } else { QTC::TC("qpdf", "QPDFParser indirect with 0 objid"); - is_null = true; + frame->olist.pop_back(); + frame->olist.pop_back(); + addNull(); } - frame->olist.pop_back(); - frame->olist.pop_back(); } else { QTC::TC("qpdf", "QPDFParser treat word as string in parseRemainder"); warn("unknown token while reading object; treating as string"); if (tooManyBadTokens()) { return {QPDF_Null::create()}; } - object = QPDF_String::create(value); + addScalar(value); } } - break; + continue; case QPDFTokenizer::tt_string: { @@ -389,37 +379,48 @@ QPDFParser::parseRemainder(bool content_stream) } std::string s{val}; decrypter->decryptString(s); - object = QPDF_String::create(s); + addScalar(s); } else { - object = QPDF_String::create(val); + addScalar(val); } } - break; + continue; default: warn("treating unknown token type as null while reading object"); if (tooManyBadTokens()) { return {QPDF_Null::create()}; } - is_null = true; - break; + addNull(); } - - if (object == nullptr && !is_null) { - throw std::logic_error("QPDFParser:parseInternal: unexpected uninitialized object"); - } - - if (is_null) { - object = null_oh; - // No need to set description for direct nulls - they probably will become implicit. - } else if (!indirect_ref && !set_offset) { - setDescription(object, input->getLastOffset()); - } - frame->olist.push_back(object); } return {}; // unreachable } +void +QPDFParser::add(std::shared_ptr&& obj) +{ + frame->olist.emplace_back(std::move(obj)); +} + +void +QPDFParser::addNull() +{ + const static ObjectPtr null_obj = QPDF_Null::create(); + + frame->olist.emplace_back(null_obj); + ++frame->null_count; +} + +template +void +QPDFParser::addScalar(Args&&... args) +{ + auto obj = T::create(args...); + obj->setDescription(context, description, input->getLastOffset()); + add(std::move(obj)); +} + template QPDFObjectHandle QPDFParser::withDescription(Args&&... args) diff --git a/libqpdf/qpdf/QPDFParser.hh b/libqpdf/qpdf/QPDFParser.hh index b571cdaa..70892e41 100644 --- a/libqpdf/qpdf/QPDFParser.hh +++ b/libqpdf/qpdf/QPDFParser.hh @@ -51,6 +51,10 @@ class QPDFParser }; QPDFObjectHandle parseRemainder(bool content_stream); + void add(std::shared_ptr&& obj); + void addNull(); + template + void addScalar(Args&&... args); bool tooManyBadTokens(); void warn(qpdf_offset_t offset, std::string const& msg) const; void warn(std::string const& msg) const; From 0328d8723793fa8c7f3cb4d243bfc7ed051e85bb Mon Sep 17 00:00:00 2001 From: m-holger Date: Tue, 31 Oct 2023 17:44:01 +0000 Subject: [PATCH 13/15] In QPDFParser::parse refactor parsing of indirect references --- libqpdf/QPDFParser.cc | 109 +++++++++++++++++++------------ libqpdf/qpdf/QPDFParser.hh | 5 ++ qpdf/qtest/qpdf/parse-object.out | 2 +- 3 files changed, 75 insertions(+), 41 deletions(-) diff --git a/libqpdf/QPDFParser.cc b/libqpdf/QPDFParser.cc index a1c79468..fd57c6f3 100644 --- a/libqpdf/QPDFParser.cc +++ b/libqpdf/QPDFParser.cc @@ -143,6 +143,51 @@ QPDFParser::parseRemainder(bool content_stream) } ++good_count; // optimistically + if (int_count != 0) { + // Special handling of indirect references. Treat integer tokens as part of an indirect + // reference until proven otherwise. + if (tokenizer.getType() == QPDFTokenizer::tt_integer) { + if (++int_count > 2) { + // Process the oldest buffered integer. + addInt(int_count); + } + last_offset_buffer[int_count % 2] = input->getLastOffset(); + int_buffer[int_count % 2] = QUtil::string_to_ll(tokenizer.getValue().c_str()); + continue; + + } else if ( + int_count >= 2 && tokenizer.getType() == QPDFTokenizer::tt_word && + tokenizer.getValue() == "R") { + if (context == nullptr) { + QTC::TC("qpdf", "QPDFParser indirect without context"); + throw std::logic_error("QPDFParser::parse called without context on an object " + "with indirect references"); + } + auto ref_og = QPDFObjGen( + QIntC::to_int(int_buffer[(int_count - 1) % 2]), + QIntC::to_int(int_buffer[(int_count) % 2])); + if (ref_og.isIndirect()) { + // This action has the desirable side effect of causing dangling references + // (references to indirect objects that don't appear in the PDF) in any parsed + // object to appear in the object cache. + add(std::move(context->getObject(ref_og).obj)); + } else { + QTC::TC("qpdf", "QPDFParser indirect with 0 objid"); + addNull(); + } + int_count = 0; + continue; + + } else if (int_count > 0) { + // Process the buffered integers before processing the current token. + if (int_count > 1) { + addInt(int_count - 1); + } + addInt(int_count); + int_count = 0; + } + } + switch (tokenizer.getType()) { case QPDFTokenizer::tt_eof: warn("parse error while reading object"); @@ -304,7 +349,14 @@ QPDFParser::parseRemainder(bool content_stream) continue; case QPDFTokenizer::tt_integer: - addScalar(QUtil::string_to_ll(tokenizer.getValue().c_str())); + if (!content_stream) { + // Buffer token in case it is part of an indirect reference. + last_offset_buffer[1] = input->getLastOffset(); + int_buffer[1] = QUtil::string_to_ll(tokenizer.getValue().c_str()); + int_count = 1; + } else { + addScalar(QUtil::string_to_ll(tokenizer.getValue().c_str())); + } continue; case QPDFTokenizer::tt_real: @@ -325,46 +377,15 @@ QPDFParser::parseRemainder(bool content_stream) continue; case QPDFTokenizer::tt_word: - { - auto const& value = tokenizer.getValue(); - auto size = frame->olist.size(); - if (content_stream) { - addScalar(value); - } else if ( - value == "R" && size >= 2 && frame->olist.back() && - frame->olist.back()->getTypeCode() == ::ot_integer && - !frame->olist.back()->getObjGen().isIndirect() && frame->olist.at(size - 2) && - frame->olist.at(size - 2)->getTypeCode() == ::ot_integer && - !frame->olist.at(size - 2)->getObjGen().isIndirect()) { - if (context == nullptr) { - QTC::TC("qpdf", "QPDFParser indirect without context"); - throw std::logic_error("QPDFObjectHandle::parse called without context on " - "an object with indirect references"); - } - auto ref_og = QPDFObjGen( - QPDFObjectHandle(frame->olist.at(size - 2)).getIntValueAsInt(), - QPDFObjectHandle(frame->olist.back()).getIntValueAsInt()); - if (ref_og.isIndirect()) { - // This action has the desirable side effect of causing dangling references - // (references to indirect objects that don't appear in the PDF) in any - // parsed object to appear in the object cache. - frame->olist.pop_back(); - frame->olist.pop_back(); - add(std::move(context->getObject(ref_og).obj)); - } else { - QTC::TC("qpdf", "QPDFParser indirect with 0 objid"); - frame->olist.pop_back(); - frame->olist.pop_back(); - addNull(); - } - } else { - QTC::TC("qpdf", "QPDFParser treat word as string in parseRemainder"); - warn("unknown token while reading object; treating as string"); - if (tooManyBadTokens()) { - return {QPDF_Null::create()}; - } - addScalar(value); + if (content_stream) { + addScalar(tokenizer.getValue()); + } else { + QTC::TC("qpdf", "QPDFParser treat word as string in parseRemainder"); + warn("unknown token while reading object; treating as string"); + if (tooManyBadTokens()) { + return {QPDF_Null::create()}; } + addScalar(tokenizer.getValue()); } continue; @@ -412,6 +433,14 @@ QPDFParser::addNull() ++frame->null_count; } +void +QPDFParser::addInt(int count) +{ + auto obj = QPDF_Integer::create(int_buffer[count % 2]); + obj->setDescription(context, description, last_offset_buffer[count % 2]); + add(std::move(obj)); +} + template void QPDFParser::addScalar(Args&&... args) diff --git a/libqpdf/qpdf/QPDFParser.hh b/libqpdf/qpdf/QPDFParser.hh index 70892e41..ef5be98e 100644 --- a/libqpdf/qpdf/QPDFParser.hh +++ b/libqpdf/qpdf/QPDFParser.hh @@ -53,6 +53,7 @@ class QPDFParser QPDFObjectHandle parseRemainder(bool content_stream); void add(std::shared_ptr&& obj); void addNull(); + void addInt(int count); template void addScalar(Args&&... args); bool tooManyBadTokens(); @@ -78,6 +79,10 @@ class QPDFParser int good_count = 0; // Start offset including any leading whitespace. qpdf_offset_t start; + // Number of successive integer tokens. + int int_count = 0; + long long int_buffer[2]{0, 0}; + qpdf_offset_t last_offset_buffer[2]{0, 0}; }; diff --git a/qpdf/qtest/qpdf/parse-object.out b/qpdf/qtest/qpdf/parse-object.out index cb3cb742..de7b42e6 100644 --- a/qpdf/qtest/qpdf/parse-object.out +++ b/qpdf/qtest/qpdf/parse-object.out @@ -1,5 +1,5 @@ [ /name 16059 3.14159 false << /key true /other [ (string1) (string2) ] >> null ] -logic error parsing indirect: QPDFObjectHandle::parse called without context on an object with indirect references +logic error parsing indirect: QPDFParser::parse called without context on an object with indirect references trailing data: parsed object (trailing test): trailing data found parsing object from string WARNING: parsed object (offset 9): unknown token while reading object; treating as string WARNING: parsed object: treating unexpected brace token as null From 605b1429e8b58d7fada225acaf530cfe8e9954ac Mon Sep 17 00:00:00 2001 From: m-holger Date: Wed, 1 Nov 2023 09:10:56 +0000 Subject: [PATCH 14/15] In QPDFParser::parse create dictionaries on the fly Also, don't search for /Contents name unless the result is used. --- libqpdf/QPDFParser.cc | 128 ++++++++++++++++-------------- libqpdf/qpdf/QPDFParser.hh | 9 ++- qpdf/qtest/qpdf/bad36-recover.out | 2 +- qpdf/qtest/qpdf/bad36.out | 2 +- qpdf/qtest/qpdf/issue-335a.out | 4 + 5 files changed, 80 insertions(+), 65 deletions(-) diff --git a/libqpdf/QPDFParser.cc b/libqpdf/QPDFParser.cc index fd57c6f3..d2b2af6a 100644 --- a/libqpdf/QPDFParser.cc +++ b/libqpdf/QPDFParser.cc @@ -74,7 +74,7 @@ QPDFParser::parse(bool& empty, bool content_stream) stack.clear(); stack.emplace_back( input, - (tokenizer.getType() == QPDFTokenizer::tt_array_open) ? st_array : st_dictionary); + (tokenizer.getType() == QPDFTokenizer::tt_array_open) ? st_array : st_dictionary_key); frame = &stack.back(); return parseRemainder(content_stream); @@ -242,60 +242,44 @@ QPDFParser::parseRemainder(bool content_stream) continue; case QPDFTokenizer::tt_dict_close: - if (frame->state == st_dictionary) { - // Convert list to map. Alternating elements are keys. Attempt to recover more or - // less gracefully from invalid dictionaries. - std::set names; - for (auto& obj: frame->olist) { - if (obj) { + if (frame->state <= st_dictionary_value) { + // Attempt to recover more or less gracefully from invalid dictionaries. + + auto& dict = frame->dict; + if (frame->state == st_dictionary_value) { + QTC::TC("qpdf", "QPDFParser no val for last key"); + warn( + frame->offset, + "dictionary ended prematurely; using null as value for last key"); + dict[frame->key] = QPDF_Null::create(); + } + + if (!frame->olist.empty()) { + std::set names; + for (auto& obj: frame->olist) { if (obj->getTypeCode() == ::ot_name) { names.insert(obj->getStringValue()); } } - } - - std::map dict; - int next_fake_key = 1; - for (auto iter = frame->olist.begin(); iter != frame->olist.end();) { - // Calculate key. - std::string key; - if (*iter && (*iter)->getTypeCode() == ::ot_name) { - key = (*iter)->getStringValue(); - ++iter; - } else { - for (bool found_fake = false; !found_fake;) { - key = "/QPDFFake" + std::to_string(next_fake_key++); - found_fake = (names.count(key) == 0); + int next_fake_key = 1; + for (auto const& item: frame->olist) { + while (true) { + const std::string key = "/QPDFFake" + std::to_string(next_fake_key++); + const bool found_fake = (dict.count(key) == 0 && names.count(key) == 0); QTC::TC("qpdf", "QPDFParser found fake", (found_fake ? 0 : 1)); + if (found_fake) { + warn( + frame->offset, + "expected dictionary key but found non-name object; inserting " + "key " + + key); + dict[key] = item; + break; + } } - warn( - frame->offset, - "expected dictionary key but found non-name object; inserting key " + - key); } - if (dict.count(key) > 0) { - QTC::TC("qpdf", "QPDFParser duplicate dict key"); - warn( - frame->offset, - "dictionary has duplicated key " + key + - "; last occurrence overrides earlier ones"); - } - - // Calculate value. - ObjectPtr val; - if (iter != frame->olist.end()) { - val = *iter; - ++iter; - } else { - QTC::TC("qpdf", "QPDFParser no val for last key"); - warn( - frame->offset, - "dictionary ended prematurely; using null as value for last key"); - val = QPDF_Null::create(); - } - - dict[std::move(key)] = val; } + if (!frame->contents_string.empty() && dict.count("/Type") && dict["/Type"].isNameAndEquals("/Sig") && dict.count("/ByteRange") && dict.count("/Contents") && dict["/Contents"].isString()) { @@ -335,7 +319,7 @@ QPDFParser::parseRemainder(bool content_stream) stack.emplace_back( input, (tokenizer.getType() == QPDFTokenizer::tt_array_open) ? st_array - : st_dictionary); + : st_dictionary_key); frame = &stack.back(); continue; } @@ -364,15 +348,13 @@ QPDFParser::parseRemainder(bool content_stream) continue; case QPDFTokenizer::tt_name: - { - auto const& name = tokenizer.getValue(); - addScalar(name); - - if (name == "/Contents") { - b_contents = true; - } else { - b_contents = false; - } + if (frame->state == st_dictionary_key) { + frame->key = tokenizer.getValue(); + frame->state = st_dictionary_value; + b_contents = decrypter && frame->key == "/Contents"; + continue; + } else { + addScalar(tokenizer.getValue()); } continue; @@ -415,13 +397,21 @@ QPDFParser::parseRemainder(bool content_stream) addNull(); } } - return {}; // unreachable } void QPDFParser::add(std::shared_ptr&& obj) { - frame->olist.emplace_back(std::move(obj)); + if (frame->state != st_dictionary_value) { + // If state is st_dictionary_key then there is a missing key. Push onto olist for + // processing once the tt_dict_close token has been found. + frame->olist.emplace_back(std::move(obj)); + } else { + if (auto res = frame->dict.insert_or_assign(frame->key, std::move(obj)); !res.second) { + warnDuplicateKey(); + } + frame->state = st_dictionary_key; + } } void @@ -429,7 +419,16 @@ QPDFParser::addNull() { const static ObjectPtr null_obj = QPDF_Null::create(); - frame->olist.emplace_back(null_obj); + if (frame->state != st_dictionary_value) { + // If state is st_dictionary_key then there is a missing key. Push onto olist for + // processing once the tt_dict_close token has been found. + frame->olist.emplace_back(null_obj); + } else { + if (auto res = frame->dict.insert_or_assign(frame->key, null_obj); !res.second) { + warnDuplicateKey(); + } + frame->state = st_dictionary_key; + } ++frame->null_count; } @@ -495,6 +494,15 @@ QPDFParser::warn(QPDFExc const& e) const } } +void +QPDFParser::warnDuplicateKey() +{ + QTC::TC("qpdf", "QPDFParser duplicate dict key"); + warn( + frame->offset, + "dictionary has duplicated key " + frame->key + "; last occurrence overrides earlier ones"); +} + void QPDFParser::warn(qpdf_offset_t offset, std::string const& msg) const { diff --git a/libqpdf/qpdf/QPDFParser.hh b/libqpdf/qpdf/QPDFParser.hh index ef5be98e..3abe6c92 100644 --- a/libqpdf/qpdf/QPDFParser.hh +++ b/libqpdf/qpdf/QPDFParser.hh @@ -31,8 +31,9 @@ class QPDFParser QPDFObjectHandle parse(bool& empty, bool content_stream); private: - struct StackFrame; - enum parser_state_e { st_dictionary, st_array }; + // Parser state. Note: + // state < st_dictionary_value == (state = st_dictionary_key || state = st_dictionary_value) + enum parser_state_e { st_dictionary_key, st_dictionary_value, st_array }; struct StackFrame { @@ -43,7 +44,9 @@ class QPDFParser } std::vector> olist; + std::map dict; parser_state_e state; + std::string key; qpdf_offset_t offset; std::string contents_string; qpdf_offset_t contents_offset{-1}; @@ -57,6 +60,7 @@ class QPDFParser template void addScalar(Args&&... args); bool tooManyBadTokens(); + void warnDuplicateKey(); void warn(qpdf_offset_t offset, std::string const& msg) const; void warn(std::string const& msg) const; void warn(QPDFExc const&) const; @@ -83,7 +87,6 @@ class QPDFParser int int_count = 0; long long int_buffer[2]{0, 0}; qpdf_offset_t last_offset_buffer[2]{0, 0}; - }; #endif // QPDFPARSER_HH diff --git a/qpdf/qtest/qpdf/bad36-recover.out b/qpdf/qtest/qpdf/bad36-recover.out index ac05acd9..9aacd729 100644 --- a/qpdf/qtest/qpdf/bad36-recover.out +++ b/qpdf/qtest/qpdf/bad36-recover.out @@ -1,6 +1,6 @@ WARNING: bad36.pdf (trailer, offset 764): unknown token while reading object; treating as string -WARNING: bad36.pdf (trailer, offset 715): expected dictionary key but found non-name object; inserting key /QPDFFake2 WARNING: bad36.pdf (trailer, offset 715): dictionary ended prematurely; using null as value for last key +WARNING: bad36.pdf (trailer, offset 715): expected dictionary key but found non-name object; inserting key /QPDFFake2 /QTest is implicit /QTest is direct and has type null (2) /QTest is null diff --git a/qpdf/qtest/qpdf/bad36.out b/qpdf/qtest/qpdf/bad36.out index cee3c286..e60d8685 100644 --- a/qpdf/qtest/qpdf/bad36.out +++ b/qpdf/qtest/qpdf/bad36.out @@ -1,6 +1,6 @@ WARNING: bad36.pdf (trailer, offset 764): unknown token while reading object; treating as string -WARNING: bad36.pdf (trailer, offset 715): expected dictionary key but found non-name object; inserting key /QPDFFake2 WARNING: bad36.pdf (trailer, offset 715): dictionary ended prematurely; using null as value for last key +WARNING: bad36.pdf (trailer, offset 715): expected dictionary key but found non-name object; inserting key /QPDFFake2 /QTest is implicit /QTest is direct and has type null (2) /QTest is null diff --git a/qpdf/qtest/qpdf/issue-335a.out b/qpdf/qtest/qpdf/issue-335a.out index 456bc475..c5b64465 100644 --- a/qpdf/qtest/qpdf/issue-335a.out +++ b/qpdf/qtest/qpdf/issue-335a.out @@ -51,6 +51,7 @@ WARNING: issue-335a.pdf (trailer, offset 563): unexpected ) WARNING: issue-335a.pdf (trailer, offset 596): unexpected ) WARNING: issue-335a.pdf (trailer, offset 597): name with stray # will not work with PDF >= 1.2 WARNING: issue-335a.pdf (trailer, offset 600): unexpected ) +WARNING: issue-335a.pdf (trailer, offset 134): dictionary has duplicated key /L WARNING: issue-335a.pdf (trailer, offset 601): unexpected ) WARNING: issue-335a.pdf (trailer, offset 648): unexpected ) WARNING: issue-335a.pdf (trailer, offset 649): name with stray # will not work with PDF >= 1.2 @@ -74,6 +75,7 @@ WARNING: issue-335a.pdf (trailer, offset 563): unexpected ) WARNING: issue-335a.pdf (trailer, offset 596): unexpected ) WARNING: issue-335a.pdf (trailer, offset 597): name with stray # will not work with PDF >= 1.2 WARNING: issue-335a.pdf (trailer, offset 600): unexpected ) +WARNING: issue-335a.pdf (trailer, offset 164): dictionary has duplicated key /L WARNING: issue-335a.pdf (trailer, offset 601): unexpected ) WARNING: issue-335a.pdf (trailer, offset 648): unexpected ) WARNING: issue-335a.pdf (trailer, offset 649): name with stray # will not work with PDF >= 1.2 @@ -97,6 +99,7 @@ WARNING: issue-335a.pdf (trailer, offset 563): unexpected ) WARNING: issue-335a.pdf (trailer, offset 596): unexpected ) WARNING: issue-335a.pdf (trailer, offset 597): name with stray # will not work with PDF >= 1.2 WARNING: issue-335a.pdf (trailer, offset 600): unexpected ) +WARNING: issue-335a.pdf (trailer, offset 231): dictionary has duplicated key /L WARNING: issue-335a.pdf (trailer, offset 601): unexpected ) WARNING: issue-335a.pdf (trailer, offset 648): unexpected ) WARNING: issue-335a.pdf (trailer, offset 649): name with stray # will not work with PDF >= 1.2 @@ -448,6 +451,7 @@ WARNING: issue-335a.pdf (trailer, offset 1168): unexpected ) WARNING: issue-335a.pdf (trailer, offset 1328): unexpected ) WARNING: issue-335a.pdf (trailer, offset 1329): name with stray # will not work with PDF >= 1.2 WARNING: issue-335a.pdf (trailer, offset 1332): unexpected ) +WARNING: issue-335a.pdf (trailer, offset 1033): dictionary has duplicated key /L WARNING: issue-335a.pdf (trailer, offset 1333): unexpected ) WARNING: issue-335a.pdf (trailer, offset 1344): unexpected ) WARNING: issue-335a.pdf (trailer, offset 1428): unexpected ) From 1285f9767af983df74b52c4f2aadcbfaec36a6fc Mon Sep 17 00:00:00 2001 From: m-holger Date: Fri, 3 Nov 2023 11:22:21 +0000 Subject: [PATCH 15/15] Add new method QPDFParser::fixMissingKeys --- libqpdf/QPDFParser.cc | 55 ++++++++++++++++++++------------------ libqpdf/qpdf/QPDFParser.hh | 1 + 2 files changed, 30 insertions(+), 26 deletions(-) diff --git a/libqpdf/QPDFParser.cc b/libqpdf/QPDFParser.cc index d2b2af6a..32c4f8e9 100644 --- a/libqpdf/QPDFParser.cc +++ b/libqpdf/QPDFParser.cc @@ -244,8 +244,8 @@ QPDFParser::parseRemainder(bool content_stream) case QPDFTokenizer::tt_dict_close: if (frame->state <= st_dictionary_value) { // Attempt to recover more or less gracefully from invalid dictionaries. - auto& dict = frame->dict; + if (frame->state == st_dictionary_value) { QTC::TC("qpdf", "QPDFParser no val for last key"); warn( @@ -254,31 +254,8 @@ QPDFParser::parseRemainder(bool content_stream) dict[frame->key] = QPDF_Null::create(); } - if (!frame->olist.empty()) { - std::set names; - for (auto& obj: frame->olist) { - if (obj->getTypeCode() == ::ot_name) { - names.insert(obj->getStringValue()); - } - } - int next_fake_key = 1; - for (auto const& item: frame->olist) { - while (true) { - const std::string key = "/QPDFFake" + std::to_string(next_fake_key++); - const bool found_fake = (dict.count(key) == 0 && names.count(key) == 0); - QTC::TC("qpdf", "QPDFParser found fake", (found_fake ? 0 : 1)); - if (found_fake) { - warn( - frame->offset, - "expected dictionary key but found non-name object; inserting " - "key " + - key); - dict[key] = item; - break; - } - } - } - } + if (!frame->olist.empty()) + fixMissingKeys(); if (!frame->contents_string.empty() && dict.count("/Type") && dict["/Type"].isNameAndEquals("/Sig") && dict.count("/ByteRange") && @@ -466,6 +443,32 @@ QPDFParser::setDescription(ObjectPtr& obj, qpdf_offset_t parsed_offset) } } +void +QPDFParser::fixMissingKeys() +{ + std::set names; + for (auto& obj: frame->olist) { + if (obj->getTypeCode() == ::ot_name) { + names.insert(obj->getStringValue()); + } + } + int next_fake_key = 1; + for (auto const& item: frame->olist) { + while (true) { + const std::string key = "/QPDFFake" + std::to_string(next_fake_key++); + const bool found_fake = frame->dict.count(key) == 0 && names.count(key) == 0; + QTC::TC("qpdf", "QPDFParser found fake", (found_fake ? 0 : 1)); + if (found_fake) { + warn( + frame->offset, + "expected dictionary key but found non-name object; inserting key " + key); + frame->dict[key] = item; + break; + } + } + } +} + bool QPDFParser::tooManyBadTokens() { diff --git a/libqpdf/qpdf/QPDFParser.hh b/libqpdf/qpdf/QPDFParser.hh index 3abe6c92..7f5f7804 100644 --- a/libqpdf/qpdf/QPDFParser.hh +++ b/libqpdf/qpdf/QPDFParser.hh @@ -61,6 +61,7 @@ class QPDFParser void addScalar(Args&&... args); bool tooManyBadTokens(); void warnDuplicateKey(); + void fixMissingKeys(); void warn(qpdf_offset_t offset, std::string const& msg) const; void warn(std::string const& msg) const; void warn(QPDFExc const&) const;