In QPDFParser::parse refactor parsing of indirect references

This commit is contained in:
m-holger 2023-10-31 17:44:01 +00:00
parent 1548b8d8be
commit 0328d87237
3 changed files with 75 additions and 41 deletions

View File

@ -143,6 +143,51 @@ QPDFParser::parseRemainder(bool content_stream)
}
++good_count; // optimistically
if (int_count != 0) {
// Special handling of indirect references. Treat integer tokens as part of an indirect
// reference until proven otherwise.
if (tokenizer.getType() == QPDFTokenizer::tt_integer) {
if (++int_count > 2) {
// Process the oldest buffered integer.
addInt(int_count);
}
last_offset_buffer[int_count % 2] = input->getLastOffset();
int_buffer[int_count % 2] = QUtil::string_to_ll(tokenizer.getValue().c_str());
continue;
} else if (
int_count >= 2 && tokenizer.getType() == QPDFTokenizer::tt_word &&
tokenizer.getValue() == "R") {
if (context == nullptr) {
QTC::TC("qpdf", "QPDFParser indirect without context");
throw std::logic_error("QPDFParser::parse called without context on an object "
"with indirect references");
}
auto ref_og = QPDFObjGen(
QIntC::to_int(int_buffer[(int_count - 1) % 2]),
QIntC::to_int(int_buffer[(int_count) % 2]));
if (ref_og.isIndirect()) {
// This action has the desirable side effect of causing dangling references
// (references to indirect objects that don't appear in the PDF) in any parsed
// object to appear in the object cache.
add(std::move(context->getObject(ref_og).obj));
} else {
QTC::TC("qpdf", "QPDFParser indirect with 0 objid");
addNull();
}
int_count = 0;
continue;
} else if (int_count > 0) {
// Process the buffered integers before processing the current token.
if (int_count > 1) {
addInt(int_count - 1);
}
addInt(int_count);
int_count = 0;
}
}
switch (tokenizer.getType()) {
case QPDFTokenizer::tt_eof:
warn("parse error while reading object");
@ -304,7 +349,14 @@ QPDFParser::parseRemainder(bool content_stream)
continue;
case QPDFTokenizer::tt_integer:
addScalar<QPDF_Integer>(QUtil::string_to_ll(tokenizer.getValue().c_str()));
if (!content_stream) {
// Buffer token in case it is part of an indirect reference.
last_offset_buffer[1] = input->getLastOffset();
int_buffer[1] = QUtil::string_to_ll(tokenizer.getValue().c_str());
int_count = 1;
} else {
addScalar<QPDF_Integer>(QUtil::string_to_ll(tokenizer.getValue().c_str()));
}
continue;
case QPDFTokenizer::tt_real:
@ -325,46 +377,15 @@ QPDFParser::parseRemainder(bool content_stream)
continue;
case QPDFTokenizer::tt_word:
{
auto const& value = tokenizer.getValue();
auto size = frame->olist.size();
if (content_stream) {
addScalar<QPDF_Operator>(value);
} else if (
value == "R" && size >= 2 && frame->olist.back() &&
frame->olist.back()->getTypeCode() == ::ot_integer &&
!frame->olist.back()->getObjGen().isIndirect() && frame->olist.at(size - 2) &&
frame->olist.at(size - 2)->getTypeCode() == ::ot_integer &&
!frame->olist.at(size - 2)->getObjGen().isIndirect()) {
if (context == nullptr) {
QTC::TC("qpdf", "QPDFParser indirect without context");
throw std::logic_error("QPDFObjectHandle::parse called without context on "
"an object with indirect references");
}
auto ref_og = QPDFObjGen(
QPDFObjectHandle(frame->olist.at(size - 2)).getIntValueAsInt(),
QPDFObjectHandle(frame->olist.back()).getIntValueAsInt());
if (ref_og.isIndirect()) {
// This action has the desirable side effect of causing dangling references
// (references to indirect objects that don't appear in the PDF) in any
// parsed object to appear in the object cache.
frame->olist.pop_back();
frame->olist.pop_back();
add(std::move(context->getObject(ref_og).obj));
} else {
QTC::TC("qpdf", "QPDFParser indirect with 0 objid");
frame->olist.pop_back();
frame->olist.pop_back();
addNull();
}
} else {
QTC::TC("qpdf", "QPDFParser treat word as string in parseRemainder");
warn("unknown token while reading object; treating as string");
if (tooManyBadTokens()) {
return {QPDF_Null::create()};
}
addScalar<QPDF_String>(value);
if (content_stream) {
addScalar<QPDF_Operator>(tokenizer.getValue());
} else {
QTC::TC("qpdf", "QPDFParser treat word as string in parseRemainder");
warn("unknown token while reading object; treating as string");
if (tooManyBadTokens()) {
return {QPDF_Null::create()};
}
addScalar<QPDF_String>(tokenizer.getValue());
}
continue;
@ -412,6 +433,14 @@ QPDFParser::addNull()
++frame->null_count;
}
void
QPDFParser::addInt(int count)
{
    // Emit one buffered integer token as a QPDF_Integer object.
    //
    // count is the sequence number of the integer token to emit; its parity (count % 2) selects
    // which of the two buffer slots holds that token's value and input offset.
    auto const slot = count % 2;
    auto integer = QPDF_Integer::create(int_buffer[slot]);
    // Describe the object using the offset recorded when the token was buffered.
    integer->setDescription(context, description, last_offset_buffer[slot]);
    add(std::move(integer));
}
template <typename T, typename... Args>
void
QPDFParser::addScalar(Args&&... args)

View File

@ -53,6 +53,7 @@ class QPDFParser
QPDFObjectHandle parseRemainder(bool content_stream);
void add(std::shared_ptr<QPDFObject>&& obj);
void addNull();
void addInt(int count);
template <typename T, typename... Args>
void addScalar(Args&&... args);
bool tooManyBadTokens();
@ -78,6 +79,10 @@ class QPDFParser
int good_count = 0;
// Start offset including any leading whitespace.
qpdf_offset_t start;
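// Number of successive integer tokens seen so far; reset to 0 once the buffered integers have been
// processed. The values and offsets of the two most recent integer tokens are kept in int_buffer
// and last_offset_buffer at index (int_count % 2).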
int int_count = 0;
long long int_buffer[2]{0, 0};
qpdf_offset_t last_offset_buffer[2]{0, 0};
};

View File

@ -1,5 +1,5 @@
[ /name 16059 3.14159 false << /key true /other [ (string1) (string2) ] >> null ]
logic error parsing indirect: QPDFObjectHandle::parse called without context on an object with indirect references
logic error parsing indirect: QPDFParser::parse called without context on an object with indirect references
trailing data: parsed object (trailing test): trailing data found parsing object from string
WARNING: parsed object (offset 9): unknown token while reading object; treating as string
WARNING: parsed object: treating unexpected brace token as null