From 0328d8723793fa8c7f3cb4d243bfc7ed051e85bb Mon Sep 17 00:00:00 2001 From: m-holger Date: Tue, 31 Oct 2023 17:44:01 +0000 Subject: [PATCH] In QPDFParser::parse refactor parsing of indirect references --- libqpdf/QPDFParser.cc | 109 +++++++++++++++++++------------ libqpdf/qpdf/QPDFParser.hh | 5 ++ qpdf/qtest/qpdf/parse-object.out | 2 +- 3 files changed, 75 insertions(+), 41 deletions(-) diff --git a/libqpdf/QPDFParser.cc b/libqpdf/QPDFParser.cc index a1c79468..fd57c6f3 100644 --- a/libqpdf/QPDFParser.cc +++ b/libqpdf/QPDFParser.cc @@ -143,6 +143,51 @@ QPDFParser::parseRemainder(bool content_stream) } ++good_count; // optimistically + if (int_count != 0) { + // Special handling of indirect references. Treat integer tokens as part of an indirect + // reference until proven otherwise. + if (tokenizer.getType() == QPDFTokenizer::tt_integer) { + if (++int_count > 2) { + // Process the oldest buffered integer. + addInt(int_count); + } + last_offset_buffer[int_count % 2] = input->getLastOffset(); + int_buffer[int_count % 2] = QUtil::string_to_ll(tokenizer.getValue().c_str()); + continue; + + } else if ( + int_count >= 2 && tokenizer.getType() == QPDFTokenizer::tt_word && + tokenizer.getValue() == "R") { + if (context == nullptr) { + QTC::TC("qpdf", "QPDFParser indirect without context"); + throw std::logic_error("QPDFParser::parse called without context on an object " + "with indirect references"); + } + auto ref_og = QPDFObjGen( + QIntC::to_int(int_buffer[(int_count - 1) % 2]), + QIntC::to_int(int_buffer[(int_count) % 2])); + if (ref_og.isIndirect()) { + // This action has the desirable side effect of causing dangling references + // (references to indirect objects that don't appear in the PDF) in any parsed + // object to appear in the object cache. + add(std::move(context->getObject(ref_og).obj)); + } else { + QTC::TC("qpdf", "QPDFParser indirect with 0 objid"); + addNull(); + } + int_count = 0; + continue; + + } else if (int_count > 0) { + // Process the buffered integers before processing the current token. + if (int_count > 1) { + addInt(int_count - 1); + } + addInt(int_count); + int_count = 0; + } + } + switch (tokenizer.getType()) { case QPDFTokenizer::tt_eof: warn("parse error while reading object"); @@ -304,7 +349,14 @@ QPDFParser::parseRemainder(bool content_stream) continue; case QPDFTokenizer::tt_integer: - addScalar(QUtil::string_to_ll(tokenizer.getValue().c_str())); + if (!content_stream) { + // Buffer token in case it is part of an indirect reference. + last_offset_buffer[1] = input->getLastOffset(); + int_buffer[1] = QUtil::string_to_ll(tokenizer.getValue().c_str()); + int_count = 1; + } else { + addScalar(QUtil::string_to_ll(tokenizer.getValue().c_str())); + } continue; case QPDFTokenizer::tt_real: @@ -325,46 +377,15 @@ QPDFParser::parseRemainder(bool content_stream) continue; case QPDFTokenizer::tt_word: - { - auto const& value = tokenizer.getValue(); - auto size = frame->olist.size(); - if (content_stream) { - addScalar(value); - } else if ( - value == "R" && size >= 2 && frame->olist.back() && - frame->olist.back()->getTypeCode() == ::ot_integer && - !frame->olist.back()->getObjGen().isIndirect() && frame->olist.at(size - 2) && - frame->olist.at(size - 2)->getTypeCode() == ::ot_integer && - !frame->olist.at(size - 2)->getObjGen().isIndirect()) { - if (context == nullptr) { - QTC::TC("qpdf", "QPDFParser indirect without context"); - throw std::logic_error("QPDFObjectHandle::parse called without context on " - "an object with indirect references"); - } - auto ref_og = QPDFObjGen( - QPDFObjectHandle(frame->olist.at(size - 2)).getIntValueAsInt(), - QPDFObjectHandle(frame->olist.back()).getIntValueAsInt()); - if (ref_og.isIndirect()) { - // This action has the desirable side effect of causing dangling references - // (references to indirect objects that don't appear in the PDF) in any - // parsed object to appear in the object cache. - frame->olist.pop_back(); - frame->olist.pop_back(); - add(std::move(context->getObject(ref_og).obj)); - } else { - QTC::TC("qpdf", "QPDFParser indirect with 0 objid"); - frame->olist.pop_back(); - frame->olist.pop_back(); - addNull(); - } - } else { - QTC::TC("qpdf", "QPDFParser treat word as string in parseRemainder"); - warn("unknown token while reading object; treating as string"); - if (tooManyBadTokens()) { - return {QPDF_Null::create()}; - } - addScalar(value); + if (content_stream) { + addScalar(tokenizer.getValue()); + } else { + QTC::TC("qpdf", "QPDFParser treat word as string in parseRemainder"); + warn("unknown token while reading object; treating as string"); + if (tooManyBadTokens()) { + return {QPDF_Null::create()}; } + addScalar(tokenizer.getValue()); } continue; @@ -412,6 +433,14 @@ QPDFParser::addNull() ++frame->null_count; } +void +QPDFParser::addInt(int count) +{ + auto obj = QPDF_Integer::create(int_buffer[count % 2]); + obj->setDescription(context, description, last_offset_buffer[count % 2]); + add(std::move(obj)); +} + template void QPDFParser::addScalar(Args&&... args) diff --git a/libqpdf/qpdf/QPDFParser.hh b/libqpdf/qpdf/QPDFParser.hh index 70892e41..ef5be98e 100644 --- a/libqpdf/qpdf/QPDFParser.hh +++ b/libqpdf/qpdf/QPDFParser.hh @@ -53,6 +53,7 @@ class QPDFParser QPDFObjectHandle parseRemainder(bool content_stream); void add(std::shared_ptr&& obj); void addNull(); + void addInt(int count); template void addScalar(Args&&... args); bool tooManyBadTokens(); @@ -78,6 +79,10 @@ class QPDFParser int good_count = 0; // Start offset including any leading whitespace. qpdf_offset_t start; + // Number of successive integer tokens. + int int_count = 0; + long long int_buffer[2]{0, 0}; + qpdf_offset_t last_offset_buffer[2]{0, 0}; }; diff --git a/qpdf/qtest/qpdf/parse-object.out b/qpdf/qtest/qpdf/parse-object.out index cb3cb742..de7b42e6 100644 --- a/qpdf/qtest/qpdf/parse-object.out +++ b/qpdf/qtest/qpdf/parse-object.out @@ -1,5 +1,5 @@ [ /name 16059 3.14159 false << /key true /other [ (string1) (string2) ] >> null ] -logic error parsing indirect: QPDFObjectHandle::parse called without context on an object with indirect references +logic error parsing indirect: QPDFParser::parse called without context on an object with indirect references trailing data: parsed object (trailing test): trailing data found parsing object from string WARNING: parsed object (offset 9): unknown token while reading object; treating as string WARNING: parsed object: treating unexpected brace token as null