2
1
mirror of https://github.com/qpdf/qpdf.git synced 2025-01-03 07:12:28 +00:00

Parse iteratively to avoid stack overflow (fixes #146)

This commit is contained in:
Jay Berkenbilt 2017-08-25 21:52:29 -04:00
parent 85f05cc57f
commit ad527a64f9
6 changed files with 173 additions and 111 deletions

View File

@@ -1,5 +1,8 @@
2017-08-25 Jay Berkenbilt <ejb@ql.org> 2017-08-25 Jay Berkenbilt <ejb@ql.org>
* Re-implement parser iteratively to avoid stack overflow on very
deeply nested arrays and dictionaries. Fixes #146.
* Detect infinite loop while finding additional xref tables. Fixes * Detect infinite loop while finding additional xref tables. Fixes
#149. #149.

View File

@@ -667,7 +667,6 @@ class QPDFObjectHandle
std::string const& object_description, std::string const& object_description,
QPDFTokenizer& tokenizer, bool& empty, QPDFTokenizer& tokenizer, bool& empty,
StringDecrypter* decrypter, QPDF* context, StringDecrypter* decrypter, QPDF* context,
bool in_array, bool in_dictionary,
bool content_stream); bool content_stream);
static void parseContentStream_internal( static void parseContentStream_internal(
PointerHolder<Buffer> stream_data, PointerHolder<Buffer> stream_data,

View File

@@ -883,8 +883,7 @@ QPDFObjectHandle::parseContentStream_internal(PointerHolder<Buffer> stream_data,
while (static_cast<size_t>(input->tell()) < length) while (static_cast<size_t>(input->tell()) < length)
{ {
QPDFObjectHandle obj = QPDFObjectHandle obj =
parseInternal(input, "content", tokenizer, empty, parseInternal(input, "content", tokenizer, empty, 0, 0, true);
0, 0, false, false, true);
if (! obj.isInitialized()) if (! obj.isInitialized())
{ {
// EOF // EOF
@@ -945,7 +944,7 @@ QPDFObjectHandle::parse(PointerHolder<InputSource> input,
StringDecrypter* decrypter, QPDF* context) StringDecrypter* decrypter, QPDF* context)
{ {
return parseInternal(input, object_description, tokenizer, empty, return parseInternal(input, object_description, tokenizer, empty,
decrypter, context, false, false, false); decrypter, context, false);
} }
QPDFObjectHandle QPDFObjectHandle
@@ -953,7 +952,6 @@ QPDFObjectHandle::parseInternal(PointerHolder<InputSource> input,
std::string const& object_description, std::string const& object_description,
QPDFTokenizer& tokenizer, bool& empty, QPDFTokenizer& tokenizer, bool& empty,
StringDecrypter* decrypter, QPDF* context, StringDecrypter* decrypter, QPDF* context,
bool in_array, bool in_dictionary,
bool content_stream) bool content_stream)
{ {
// This method must take care not to resolve any objects. Don't // This method must take care not to resolve any objects. Don't
@@ -962,22 +960,23 @@ QPDFObjectHandle::parseInternal(PointerHolder<InputSource> input,
// of reading the object and changing the file pointer. // of reading the object and changing the file pointer.
empty = false; empty = false;
if (in_dictionary && in_array)
{
// Although dictionaries and arrays arbitrarily nest, these
// variables indicate what is at the top of the stack right
// now, so they can, by definition, never both be true.
throw std::logic_error(
"INTERNAL ERROR: parseInternal: in_dict && in_array");
}
QPDFObjectHandle object; QPDFObjectHandle object;
qpdf_offset_t offset = input->tell(); std::vector<std::vector<QPDFObjectHandle> > olist_stack;
std::vector<QPDFObjectHandle> olist; olist_stack.push_back(std::vector<QPDFObjectHandle>());
enum state_e { st_top, st_start, st_stop, st_eof, st_dictionary, st_array };
std::vector<state_e> state_stack;
state_stack.push_back(st_top);
std::vector<qpdf_offset_t> offset_stack;
offset_stack.push_back(input->tell());
bool done = false; bool done = false;
while (! done) while (! done)
{ {
std::vector<QPDFObjectHandle>& olist = olist_stack.back();
state_e state = state_stack.back();
qpdf_offset_t offset = offset_stack.back();
object = QPDFObjectHandle(); object = QPDFObjectHandle();
QPDFTokenizer::Token token = QPDFTokenizer::Token token =
@@ -988,8 +987,7 @@ QPDFObjectHandle::parseInternal(PointerHolder<InputSource> input,
case QPDFTokenizer::tt_eof: case QPDFTokenizer::tt_eof:
if (content_stream) if (content_stream)
{ {
// Return uninitialized object to indicate EOF state = st_eof;
return object;
} }
else else
{ {
@@ -1012,9 +1010,9 @@ QPDFObjectHandle::parseInternal(PointerHolder<InputSource> input,
break; break;
case QPDFTokenizer::tt_array_close: case QPDFTokenizer::tt_array_close:
if (in_array) if (state == st_array)
{ {
done = true; state = st_stop;
} }
else else
{ {
@@ -1029,9 +1027,9 @@ QPDFObjectHandle::parseInternal(PointerHolder<InputSource> input,
break; break;
case QPDFTokenizer::tt_dict_close: case QPDFTokenizer::tt_dict_close:
if (in_dictionary) if (state == st_dictionary)
{ {
done = true; state = st_stop;
} }
else else
{ {
@@ -1046,15 +1044,13 @@ QPDFObjectHandle::parseInternal(PointerHolder<InputSource> input,
break; break;
case QPDFTokenizer::tt_array_open: case QPDFTokenizer::tt_array_open:
object = parseInternal(
input, object_description, tokenizer, empty,
decrypter, context, true, false, content_stream);
break;
case QPDFTokenizer::tt_dict_open: case QPDFTokenizer::tt_dict_open:
object = parseInternal( olist_stack.push_back(std::vector<QPDFObjectHandle>());
input, object_description, tokenizer, empty, state = st_start;
decrypter, context, false, true, content_stream); offset_stack.push_back(input->tell());
state_stack.push_back(
(token.getType() == QPDFTokenizer::tt_array_open) ?
st_array : st_dictionary);
break; break;
case QPDFTokenizer::tt_bool: case QPDFTokenizer::tt_bool:
@@ -1084,12 +1080,12 @@ QPDFObjectHandle::parseInternal(PointerHolder<InputSource> input,
{ {
object = QPDFObjectHandle::newOperator(value); object = QPDFObjectHandle::newOperator(value);
} }
else if ((value == "R") && (in_array || in_dictionary) && else if ((value == "R") && (state != st_top) &&
(olist.size() >= 2) && (olist.size() >= 2) &&
(! olist.at(olist.size() - 1).isIndirect()) && (! olist.at(olist.size() - 1).isIndirect()) &&
(olist.at(olist.size() - 1).isInteger()) && (olist.at(olist.size() - 1).isInteger()) &&
(! olist.at(olist.size() - 2).isIndirect()) && (! olist.at(olist.size() - 2).isIndirect()) &&
(olist.at(olist.size() - 2).isInteger())) (olist.at(olist.size() - 2).isInteger()))
{ {
if (context == 0) if (context == 0)
{ {
@@ -1106,8 +1102,7 @@ QPDFObjectHandle::parseInternal(PointerHolder<InputSource> input,
olist.pop_back(); olist.pop_back();
olist.pop_back(); olist.pop_back();
} }
else if ((value == "endobj") && else if ((value == "endobj") && (state == st_top))
(! (in_array || in_dictionary)))
{ {
// We just saw endobj without having read // We just saw endobj without having read
// anything. Treat this as a null and do not move // anything. Treat this as a null and do not move
@@ -1153,93 +1148,132 @@ QPDFObjectHandle::parseInternal(PointerHolder<InputSource> input,
break; break;
} }
if (in_dictionary || in_array) if ((! object.isInitialized()) &&
{ (! ((state == st_start) ||
if (! done) (state == st_stop) ||
{ (state == st_eof))))
olist.push_back(object);
}
}
else if (! object.isInitialized())
{
warn(context,
QPDFExc(qpdf_e_damaged_pdf, input->getName(),
object_description,
input->getLastOffset(),
"parse error while reading object"));
object = newNull();
}
else
{
done = true;
}
}
if (in_array)
{
object = newArray(olist);
}
else if (in_dictionary)
{
// Convert list to map. Alternating elements are keys. Attempt
// to recover more or less gracefully from invalid
// dictionaries.
std::set<std::string> names;
for (std::vector<QPDFObjectHandle>::iterator iter = olist.begin();
iter != olist.end(); ++iter)
{ {
if ((! (*iter).isIndirect()) && (*iter).isName()) throw std::logic_error(
{ "QPDFObjectHandle::parseInternal: "
names.insert((*iter).getName()); "unexpected uninitialized object");
} object = newNull();
} }
std::map<std::string, QPDFObjectHandle> dict; switch (state)
int next_fake_key = 1;
for (unsigned int i = 0; i < olist.size(); ++i)
{ {
QPDFObjectHandle key_obj = olist.at(i); case st_eof:
QPDFObjectHandle val; if (state_stack.size() > 1)
if (key_obj.isIndirect() || (! key_obj.isName()))
{ {
bool found_fake = false;
std::string candidate;
while (! found_fake)
{
candidate =
"/QPDFFake" + QUtil::int_to_string(next_fake_key++);
found_fake = (names.count(candidate) == 0);
QTC::TC("qpdf", "QPDFObjectHandle found fake",
(found_fake ? 0 : 1));
}
warn(context, warn(context,
QPDFExc( QPDFExc(qpdf_e_damaged_pdf, input->getName(),
qpdf_e_damaged_pdf, object_description,
input->getName(), object_description, offset, input->getLastOffset(),
"expected dictionary key but found" "parse error while reading object"));
" non-name object; inserting key " +
candidate));
val = key_obj;
key_obj = newName(candidate);
} }
else if (i + 1 >= olist.size()) done = true;
// Leave object uninitialized to indicate EOF
break;
case st_dictionary:
case st_array:
olist.push_back(object);
break;
case st_top:
done = true;
break;
case st_start:
break;
case st_stop:
if ((state_stack.size() < 2) || (olist_stack.size() < 2))
{ {
QTC::TC("qpdf", "QPDFObjectHandle no val for last key"); throw std::logic_error(
warn(context, "QPDFObjectHandle::parseInternal: st_stop encountered"
QPDFExc( " with insufficient elements in stack");
qpdf_e_damaged_pdf, }
input->getName(), object_description, offset, state_e old_state = state_stack.back();
"dictionary ended prematurely; using null as value" state_stack.pop_back();
" for last key")); if (old_state == st_array)
val = newNull(); {
object = newArray(olist);
}
else if (old_state == st_dictionary)
{
// Convert list to map. Alternating elements are keys.
// Attempt to recover more or less gracefully from
// invalid dictionaries.
std::set<std::string> names;
for (std::vector<QPDFObjectHandle>::iterator iter =
olist.begin();
iter != olist.end(); ++iter)
{
if ((! (*iter).isIndirect()) && (*iter).isName())
{
names.insert((*iter).getName());
}
}
std::map<std::string, QPDFObjectHandle> dict;
int next_fake_key = 1;
for (unsigned int i = 0; i < olist.size(); ++i)
{
QPDFObjectHandle key_obj = olist.at(i);
QPDFObjectHandle val;
if (key_obj.isIndirect() || (! key_obj.isName()))
{
bool found_fake = false;
std::string candidate;
while (! found_fake)
{
candidate =
"/QPDFFake" +
QUtil::int_to_string(next_fake_key++);
found_fake = (names.count(candidate) == 0);
QTC::TC("qpdf", "QPDFObjectHandle found fake",
(found_fake ? 0 : 1));
}
warn(context,
QPDFExc(
qpdf_e_damaged_pdf,
input->getName(), object_description, offset,
"expected dictionary key but found"
" non-name object; inserting key " +
candidate));
val = key_obj;
key_obj = newName(candidate);
}
else if (i + 1 >= olist.size())
{
QTC::TC("qpdf", "QPDFObjectHandle no val for last key");
warn(context,
QPDFExc(
qpdf_e_damaged_pdf,
input->getName(), object_description, offset,
"dictionary ended prematurely; "
"using null as value for last key"));
val = newNull();
}
else
{
val = olist.at(++i);
}
dict[key_obj.getName()] = val;
}
object = newDictionary(dict);
}
olist_stack.pop_back();
offset_stack.pop_back();
if (state_stack.back() == st_top)
{
done = true;
} }
else else
{ {
val = olist.at(++i); olist_stack.back().push_back(object);
} }
dict[key_obj.getName()] = val;
} }
object = newDictionary(dict);
} }
return object; return object;

View File

@@ -221,6 +221,7 @@ my @bug_tests = (
["141a", "/W entry size 0", 2], ["141a", "/W entry size 0", 2],
["141b", "/W entry size 0", 2], ["141b", "/W entry size 0", 2],
["143", "self-referential ostream", 3], ["143", "self-referential ostream", 3],
["146", "very deeply nested array", 2],
["149", "xref prev pointer loop", 3], ["149", "xref prev pointer loop", 3],
); );
$n_tests += scalar(@bug_tests); $n_tests += scalar(@bug_tests);

View File

@@ -0,0 +1,5 @@
WARNING: issue-146.pdf: file is damaged
WARNING: issue-146.pdf: can't find startxref
WARNING: issue-146.pdf: Attempting to reconstruct cross-reference table
WARNING: issue-146.pdf (trailer, file position 20728): unknown token while reading object; treating as string
issue-146.pdf (trailer, file position 20732): EOF while reading token

File diff suppressed because one or more lines are too long