diff --git a/libqpdf/QPDFParser.cc b/libqpdf/QPDFParser.cc index 8e3d0019..1758c7b8 100644 --- a/libqpdf/QPDFParser.cc +++ b/libqpdf/QPDFParser.cc @@ -21,7 +21,6 @@ #include - QPDFObjectHandle QPDFParser::parse(bool& empty, bool content_stream) { @@ -30,327 +29,110 @@ QPDFParser::parse(bool& empty, bool content_stream) // effect of reading the object and changing the file pointer. If you do this, it will cause a // logic error to be thrown from QPDF::inParse(). - const static std::shared_ptr null_oh = QPDF_Null::create(); QPDF::ParseGuard pg(context); - empty = false; std::shared_ptr object; - bool set_offset = false; + stack.clear(); + stack.emplace_back(input, st_top); + frame = &stack.back(); + object = nullptr; -// std::vector stack{{input, st_top}}; - stack.clear(); // NEW - stack.emplace_back(input, st_top); // NEW - bool done = false; - bool b_contents = false; - bool is_null = false; - frame = &stack.back(); // CHANGED + if (!tokenizer.nextToken(*input, object_description)) { + warn(tokenizer.getErrorMessage()); + } - while (!done) { - bool indirect_ref = false; - is_null = false; - object = nullptr; - set_offset = false; - - if (!tokenizer.nextToken(*input, object_description)) { - warn(tokenizer.getErrorMessage()); + switch (tokenizer.getType()) { + case QPDFTokenizer::tt_eof: + if (content_stream) { + // In content stream mode, leave object uninitialized to indicate EOF + return {}; } - ++good_count; // optimistically + QTC::TC("qpdf", "QPDFParser eof in parse"); + warn("unexpected EOF"); + return {QPDF_Null::create()}; - switch (tokenizer.getType()) { - case QPDFTokenizer::tt_eof: - if (stack.size() > 1) { - warn("parse error while reading object"); - } + case QPDFTokenizer::tt_bad: + QTC::TC("qpdf", "QPDFParser bad token in parse"); + return {QPDF_Null::create()}; + + case QPDFTokenizer::tt_brace_open: + case QPDFTokenizer::tt_brace_close: + QTC::TC("qpdf", "QPDFParser bad brace"); + warn("treating unexpected brace token as null"); + return {QPDF_Null::create()}; + + case QPDFTokenizer::tt_array_close: + QTC::TC("qpdf", "QPDFParser bad array close"); + warn("treating unexpected array close token as null"); + return {QPDF_Null::create()}; + + case QPDFTokenizer::tt_dict_close: + QTC::TC("qpdf", "QPDFParser bad dictionary close"); + warn("unexpected dictionary close token"); + return {QPDF_Null::create()}; + + case QPDFTokenizer::tt_array_open: + case QPDFTokenizer::tt_dict_open: + stack.emplace_back( + input, + (tokenizer.getType() == QPDFTokenizer::tt_array_open) ? st_array : st_dictionary); + return parseRemainder(content_stream); + + case QPDFTokenizer::tt_bool: + object = QPDF_Bool::create((tokenizer.getValue() == "true")); + break; + + case QPDFTokenizer::tt_null: + return {QPDF_Null::create()}; + + case QPDFTokenizer::tt_integer: + object = QPDF_Integer::create(QUtil::string_to_ll(tokenizer.getValue().c_str())); + break; + + case QPDFTokenizer::tt_real: + object = QPDF_Real::create(tokenizer.getValue()); + break; + + case QPDFTokenizer::tt_name: + object = QPDF_Name::create(tokenizer.getValue()); + break; + + case QPDFTokenizer::tt_word: + { + auto const& value = tokenizer.getValue(); if (content_stream) { - // In content stream mode, leave object uninitialized to indicate EOF - return {}; - } -// QTC::TC("qpdf", "QPDFParser eof in parse"); - warn("unexpected EOF"); - return {QPDF_Null::create()}; - - case QPDFTokenizer::tt_bad: -// QTC::TC("qpdf", "QPDFParser bad token in parse"); - if (tooManyBadTokens()) { - return {QPDF_Null::create()}; - } - is_null = true; - break; - - case QPDFTokenizer::tt_brace_open: - case QPDFTokenizer::tt_brace_close: -// QTC::TC("qpdf", "QPDFParser bad brace"); - warn("treating unexpected brace token as null"); - if (tooManyBadTokens()) { - return {QPDF_Null::create()}; - } - is_null = true; - break; - - case QPDFTokenizer::tt_array_close: - if (frame->state == st_array) { - if (stack.size() < 2) { - throw std::logic_error("QPDFParser::parseInternal: st_stop encountered with " - "insufficient elements in stack"); - } - object = QPDF_Array::create(std::move(frame->olist), frame->null_count > 100); - setDescription(object, frame->offset - 1); - // The `offset` points to the next of "[". Set the rewind offset to point to the - // beginning of "[". This has been explicitly tested with whitespace surrounding the - // array start delimiter. getLastOffset points to the array end token and therefore - // can't be used here. - set_offset = true; - stack.pop_back(); - frame = &stack.back(); - } else { -// QTC::TC("qpdf", "QPDFParser bad array close"); - warn("treating unexpected array close token as null"); - if (tooManyBadTokens()) { - return {QPDF_Null::create()}; - } - is_null = true; - } - break; - - case QPDFTokenizer::tt_dict_close: - if (frame->state == st_dictionary) { - if (stack.size() < 2) { - throw std::logic_error("QPDFParser::parseInternal: st_stop encountered with " - "insufficient elements in stack"); - } - - // Convert list to map. Alternating elements are keys. Attempt to recover more or - // less gracefully from invalid dictionaries. - std::set names; - for (auto& obj: frame->olist) { - if (obj) { - if (obj->getTypeCode() == ::ot_name) { - names.insert(obj->getStringValue()); - } - } - } - - std::map dict; - int next_fake_key = 1; - for (auto iter = frame->olist.begin(); iter != frame->olist.end();) { - // Calculate key. - std::string key; - if (*iter && (*iter)->getTypeCode() == ::ot_name) { - key = (*iter)->getStringValue(); - ++iter; - } else { - for (bool found_fake = false; !found_fake;) { - key = "/QPDFFake" + std::to_string(next_fake_key++); - found_fake = (names.count(key) == 0); -// QTC::TC("qpdf", "QPDFParser found fake", (found_fake ? 0 : 1)); - } - warn( - frame->offset, - "expected dictionary key but found non-name object; inserting key " + - key); - } - if (dict.count(key) > 0) { -// QTC::TC("qpdf", "QPDFParser duplicate dict key"); - warn( - frame->offset, - "dictionary has duplicated key " + key + - "; last occurrence overrides earlier ones"); - } - - // Calculate value. - std::shared_ptr val; - if (iter != frame->olist.end()) { - val = *iter; - ++iter; - } else { -// QTC::TC("qpdf", "QPDFParser no val for last key"); - warn( - frame->offset, - "dictionary ended prematurely; using null as value for last key"); - val = QPDF_Null::create(); - } - - dict[std::move(key)] = std::move(val); - } - if (!frame->contents_string.empty() && dict.count("/Type") && - dict["/Type"].isNameAndEquals("/Sig") && dict.count("/ByteRange") && - dict.count("/Contents") && dict["/Contents"].isString()) { - dict["/Contents"] = QPDFObjectHandle::newString(frame->contents_string); - dict["/Contents"].setParsedOffset(frame->contents_offset); - } - object = QPDF_Dictionary::create(std::move(dict)); - setDescription(object, frame->offset - 2); - // The `offset` points to the next of "<<". Set the rewind offset to point to the - // beginning of "<<". This has been explicitly tested with whitespace surrounding - // the dictionary start delimiter. getLastOffset points to the dictionary end token - // and therefore can't be used here. - set_offset = true; - stack.pop_back(); - frame = &stack.back(); - } else { -// QTC::TC("qpdf", "QPDFParser bad dictionary close"); - warn("unexpected dictionary close token"); - if (tooManyBadTokens()) { - return {QPDF_Null::create()}; - } - is_null = true; - } - break; - - case QPDFTokenizer::tt_array_open: - case QPDFTokenizer::tt_dict_open: - if (stack.size() > 500) { -// QTC::TC("qpdf", "QPDFParser too deep"); - warn("ignoring excessively deeply nested data structure"); + object = QPDF_Operator::create(value); + } else if (value == "endobj") { + // We just saw endobj without having read anything. Treat this as a null and do + // not move the input source's offset. + input->seek(input->getLastOffset(), SEEK_SET); + empty = true; return {QPDF_Null::create()}; } else { - b_contents = false; - stack.emplace_back( - input, - (tokenizer.getType() == QPDFTokenizer::tt_array_open) ? st_array - : st_dictionary); - frame = &stack.back(); - return parseRemainder(content_stream); // NEW - continue; + QTC::TC("qpdf", "QPDFParser treat word as string"); + warn("unknown token while reading object; treating as string"); + object = QPDF_String::create(value); } - - case QPDFTokenizer::tt_bool: - object = QPDF_Bool::create((tokenizer.getValue() == "true")); - break; - - case QPDFTokenizer::tt_null: - is_null = true; - ++frame->null_count; - - break; - - case QPDFTokenizer::tt_integer: - object = QPDF_Integer::create(QUtil::string_to_ll(tokenizer.getValue().c_str())); - break; - - case QPDFTokenizer::tt_real: - object = QPDF_Real::create(tokenizer.getValue()); - break; - - case QPDFTokenizer::tt_name: - { - auto const& name = tokenizer.getValue(); - object = QPDF_Name::create(name); - - if (name == "/Contents") { - b_contents = true; - } else { - b_contents = false; - } - } - break; - - case QPDFTokenizer::tt_word: - { - auto const& value = tokenizer.getValue(); - auto size = frame->olist.size(); - if (content_stream) { - object = QPDF_Operator::create(value); - } else if ( - value == "R" && frame->state != st_top && size >= 2 && frame->olist.back() && - frame->olist.back()->getTypeCode() == ::ot_integer && - !frame->olist.back()->getObjGen().isIndirect() && frame->olist.at(size - 2) && - frame->olist.at(size - 2)->getTypeCode() == ::ot_integer && - !frame->olist.at(size - 2)->getObjGen().isIndirect()) { - if (context == nullptr) { -// QTC::TC("qpdf", "QPDFParser indirect without context"); - throw std::logic_error("QPDFObjectHandle::parse called without context on " - "an object with indirect references"); - } - auto ref_og = QPDFObjGen( - QPDFObjectHandle(frame->olist.at(size - 2)).getIntValueAsInt(), - QPDFObjectHandle(frame->olist.back()).getIntValueAsInt()); - if (ref_og.isIndirect()) { - // This action has the desirable side effect of causing dangling references - // (references to indirect objects that don't appear in the PDF) in any - // parsed object to appear in the object cache. - object = context->getObject(ref_og).obj; - indirect_ref = true; - } else { -// QTC::TC("qpdf", "QPDFParser indirect with 0 objid"); - is_null = true; - } - frame->olist.pop_back(); - frame->olist.pop_back(); - } else if ((value == "endobj") && (frame->state == st_top)) { - // We just saw endobj without having read anything. Treat this as a null and do - // not move the input source's offset. - is_null = true; - input->seek(input->getLastOffset(), SEEK_SET); - empty = true; - } else { -// QTC::TC("qpdf", "QPDFParser treat word as string"); - warn("unknown token while reading object; treating as string"); - if (tooManyBadTokens()) { - return {QPDF_Null::create()}; - } - object = QPDF_String::create(value); - } - } - break; - - case QPDFTokenizer::tt_string: - { - auto const& val = tokenizer.getValue(); - if (decrypter) { - if (b_contents) { - frame->contents_string = val; - frame->contents_offset = input->getLastOffset(); - b_contents = false; - } - std::string s{val}; - decrypter->decryptString(s); - object = QPDF_String::create(s); - } else { - object = QPDF_String::create(val); - } - } - break; - - default: - warn("treating unknown token type as null while reading object"); - if (tooManyBadTokens()) { - return {QPDF_Null::create()}; - } - is_null = true; - break; } + break; - if (object == nullptr && !is_null) { - throw std::logic_error("QPDFParser:parseInternal: unexpected uninitialized object"); + case QPDFTokenizer::tt_string: + if (decrypter) { + std::string s{tokenizer.getValue()}; + decrypter->decryptString(s); + object = QPDF_String::create(s); + } else { + object = QPDF_String::create(tokenizer.getValue()); } + break; - switch (frame->state) { - case st_dictionary: - case st_array: - if (is_null) { - object = null_oh; - // No need to set description for direct nulls - they probably will become implicit. - } else if (!indirect_ref && !set_offset) { - setDescription(object, input->getLastOffset()); - } - set_offset = true; - frame->olist.push_back(object); - break; - - case st_top: - done = true; - break; - } + default: + warn("treating unknown token type as null while reading object"); + return {QPDF_Null::create()}; } - if (is_null) { - object = QPDF_Null::create(); - } - if (!set_offset) { - setDescription(object, frame->offset); - } + setDescription(object, frame->offset); return object; } @@ -363,18 +145,15 @@ QPDFParser::parseRemainder(bool content_stream) // logic error to be thrown from QPDF::inParse(). const static std::shared_ptr null_oh = QPDF_Null::create(); -// QPDF::ParseGuard pg(context); - -// empty = false; std::shared_ptr object; bool set_offset = false; -// std::vector stack{{input, st_top},}; bool done = false; bool b_contents = false; bool is_null = false; frame = &stack.back(); // CHANGED + bad_count = 0; while (!done) { bool indirect_ref = false; @@ -389,19 +168,17 @@ QPDFParser::parseRemainder(bool content_stream) switch (tokenizer.getType()) { case QPDFTokenizer::tt_eof: - if (stack.size() > 1) { - warn("parse error while reading object"); - } + warn("parse error while reading object"); if (content_stream) { // In content stream mode, leave object uninitialized to indicate EOF return {}; } - QTC::TC("qpdf", "QPDFParser eof in parse"); + QTC::TC("qpdf", "QPDFParser eof in parseRemainder"); warn("unexpected EOF"); return {QPDF_Null::create()}; case QPDFTokenizer::tt_bad: - QTC::TC("qpdf", "QPDFParser bad token in parse"); + QTC::TC("qpdf", "QPDFParser bad token in parseRemainder"); if (tooManyBadTokens()) { return {QPDF_Null::create()}; } @@ -410,7 +187,7 @@ QPDFParser::parseRemainder(bool content_stream) case QPDFTokenizer::tt_brace_open: case QPDFTokenizer::tt_brace_close: - QTC::TC("qpdf", "QPDFParser bad brace"); + QTC::TC("qpdf", "QPDFParser bad brace in parseRemainder"); warn("treating unexpected brace token as null"); if (tooManyBadTokens()) { return {QPDF_Null::create()}; @@ -434,7 +211,7 @@ QPDFParser::parseRemainder(bool content_stream) stack.pop_back(); frame = &stack.back(); } else { - QTC::TC("qpdf", "QPDFParser bad array close"); + QTC::TC("qpdf", "QPDFParser bad array close in parseRemainder"); warn("treating unexpected array close token as null"); if (tooManyBadTokens()) { return {QPDF_Null::create()}; @@ -519,7 +296,7 @@ QPDFParser::parseRemainder(bool content_stream) stack.pop_back(); frame = &stack.back(); } else { - QTC::TC("qpdf", "QPDFParser bad dictionary close"); + QTC::TC("qpdf", "QPDFParser bad dictionary close in parseRemainder"); warn("unexpected dictionary close token"); if (tooManyBadTokens()) { return {QPDF_Null::create()}; @@ -582,7 +359,7 @@ QPDFParser::parseRemainder(bool content_stream) if (content_stream) { object = QPDF_Operator::create(value); } else if ( - value == "R" && frame->state != st_top && size >= 2 && frame->olist.back() && + value == "R" && size >= 2 && frame->olist.back() && frame->olist.back()->getTypeCode() == ::ot_integer && !frame->olist.back()->getObjGen().isIndirect() && frame->olist.at(size - 2) && frame->olist.at(size - 2)->getTypeCode() == ::ot_integer && @@ -607,14 +384,8 @@ QPDFParser::parseRemainder(bool content_stream) } frame->olist.pop_back(); frame->olist.pop_back(); - } else if ((value == "endobj") && (frame->state == st_top)) { - // We just saw endobj without having read anything. Treat this as a null and do - // not move the input source's offset. - is_null = true; - input->seek(input->getLastOffset(), SEEK_SET); -// empty = true; } else { - QTC::TC("qpdf", "QPDFParser treat word as string"); + QTC::TC("qpdf", "QPDFParser treat word as string in parseRemainder"); warn("unknown token while reading object; treating as string"); if (tooManyBadTokens()) { return {QPDF_Null::create()}; diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov index ec11c57b..cbb4ac1d 100644 --- a/qpdf/qpdf.testcov +++ b/qpdf/qpdf.testcov @@ -57,11 +57,14 @@ QPDF trailer lacks size 0 QPDF trailer size not integer 0 QPDF trailer prev not integer 0 QPDFParser bad brace 0 +QPDFParser bad brace in parseRemainder 0 QPDFParser bad array close 0 +QPDFParser bad array close in parseRemainder 0 QPDF stream without length 0 QPDF stream length not integer 0 QPDF missing endstream 0 QPDFParser bad dictionary close 0 +QPDFParser bad dictionary close in parseRemainder 0 QPDF can't find xref 0 QPDFTokenizer bad ) 0 QPDFTokenizer bad > 0 @@ -258,6 +261,7 @@ QPDFParser indirect with 0 objid 0 QPDF object id 0 0 QPDF recursion loop in resolve 0 QPDFParser treat word as string 0 +QPDFParser treat word as string in parseRemainder 0 QPDFParser found fake 1 QPDFParser no val for last key 0 QPDF resolve failure to null 0 @@ -289,7 +293,9 @@ QPDFObjectHandle coalesce called on stream 0 QPDFObjectHandle coalesce provide stream data 0 QPDF_Stream bad token at end during normalize 0 QPDFParser bad token in parse 0 +QPDFParser bad token in parseRemainder 0 QPDFParser eof in parse 0 +QPDFParser eof in parseRemainder 0 QPDFObjectHandle array bounds 0 QPDFObjectHandle boolean returning false 0 QPDFObjectHandle integer returning 0 0 diff --git a/qpdf/qtest/parsing.test b/qpdf/qtest/parsing.test index 23edcac4..97cf9edf 100644 --- a/qpdf/qtest/parsing.test +++ b/qpdf/qtest/parsing.test @@ -17,7 +17,7 @@ my $td = new TestDriver('parsing'); my $n_tests = 17; $td->runtest("parse objects from string", - {$td->COMMAND => "test_driver 31 good1.qdf"}, + {$td->COMMAND => "test_driver 31 bad39.qdf"}, {$td->FILE => "parse-object.out", $td->EXIT_STATUS => 0}, $td->NORMALIZE_NEWLINES); $td->runtest("EOF terminating literal tokens", diff --git a/qpdf/qtest/qpdf/bad39.qdf b/qpdf/qtest/qpdf/bad39.qdf new file mode 100644 index 00000000..1da316e6 --- /dev/null +++ b/qpdf/qtest/qpdf/bad39.qdf @@ -0,0 +1,102 @@ +%PDF-1.3 +%¿÷¢þ +%QDF-1.0 + +%% Original object ID: 1 0 +1 0 obj +<< + /Pages 2 0 R + /Type /Catalog +>> +endobj + +%% Original object ID: 2 0 +2 0 obj +<< + /Count 1 + /Kids [ + 3 0 R + ] + /Type /Pages +>> +endobj + +%% Page 1 +%% Original object ID: 3 0 +3 0 obj +<< + /Contents 4 0 R + /MediaBox [ + 0 + 0 + 612 + 792 + ] + /Parent 2 0 R + /Resources << + /Font << + /F1 6 0 R + >> + /ProcSet 7 0 R + >> + /Type /Page +>> +endobj + +%% Contents for page 1 +%% Original object ID: 4 0 +4 0 obj +<< + /Length 5 0 R +>> +stream +BT + /F1 24 Tf + 72 720 Td + (Potato) Tj +ET +endstream +endobj + +5 0 obj +44 +endobj + +%% Original object ID: 6 0 +6 0 obj +<< + /BaseFont /Helvetica + /Encoding /WinAnsiEncoding + /Name /F1 + /Subtype /Type1 + /Type /Font +>> +endobj + +%% Original object ID: 5 0 +7 0 obj +[ + /PDF + /Text +] +endobj + +xref +0 8 +0000000000 65535 f +0000000052 00000 n +0000000133 00000 n +0000000242 00000 n +0000000484 00000 n +0000000583 00000 n +0000000629 00000 n +0000001113 00000 n +trailer << + /Root 1 0 R + /Size 8 + /ID [<31415926535897932384626433832795><31415926535897932384626433832795>] +>> +startxref +809 +%%EOF +7 0 obj diff --git a/qpdf/qtest/qpdf/parse-object.out b/qpdf/qtest/qpdf/parse-object.out index 2e09f6ad..cb3cb742 100644 --- a/qpdf/qtest/qpdf/parse-object.out +++ b/qpdf/qtest/qpdf/parse-object.out @@ -2,4 +2,10 @@ logic error parsing indirect: QPDFObjectHandle::parse called without context on an object with indirect references trailing data: parsed object (trailing test): trailing data found parsing object from string WARNING: parsed object (offset 9): unknown token while reading object; treating as string +WARNING: parsed object: treating unexpected brace token as null +WARNING: parsed object: treating unexpected brace token as null +WARNING: parsed object: unexpected dictionary close token +WARNING: bad39.qdf (object 7 0, offset 1121): unexpected EOF +WARNING: bad39.qdf (object 7 0, offset 1121): expected endobj +WARNING: bad39.qdf (object 7 0, offset 1121): EOF after endobj test 31 done diff --git a/qpdf/test_driver.cc b/qpdf/test_driver.cc index 03631eb2..319c80d2 100644 --- a/qpdf/test_driver.cc +++ b/qpdf/test_driver.cc @@ -1195,6 +1195,13 @@ test_31(QPDF& pdf, char const* arg2) // mistakenly parsed as an indirect object. assert(QPDFObjectHandle::parse(&pdf, "[5 0 R 0 R /X]").unparse() == "[ 5 0 R 0 (R) /X ]"); assert(QPDFObjectHandle::parse(&pdf, "[1 0 R]", "indirect test").unparse() == "[ 1 0 R ]"); + // TC:QPDFParser bad brace + assert(QPDFObjectHandle::parse(&pdf, "}").unparse() == "null"); + assert(QPDFObjectHandle::parse(&pdf, "{").unparse() == "null"); + // TC:QPDFParser bad dictionary close + assert(QPDFObjectHandle::parse(&pdf, ">>").unparse() == "null"); + // TC:QPDFParser eof in parse + assert(QPDFObjectHandle::parse(&pdf, "[7 0 R]").getArrayItem(0).isNull()); } static void