2
1
mirror of https://github.com/qpdf/qpdf.git synced 2025-01-07 00:53:56 +00:00

Merge branch 'parse_ref' into work

This commit is contained in:
Jay Berkenbilt 2023-12-21 15:43:50 -05:00
commit b8fd18ae56
12 changed files with 593 additions and 343 deletions

View File

@ -21,22 +21,7 @@
#include <memory> #include <memory>
namespace using ObjectPtr = std::shared_ptr<QPDFObject>;
{
struct StackFrame
{
StackFrame(std::shared_ptr<InputSource> input) :
offset(input->tell())
{
}
std::vector<std::shared_ptr<QPDFObject>> olist;
qpdf_offset_t offset;
std::string contents_string{""};
qpdf_offset_t contents_offset{-1};
int null_count{0};
};
} // namespace
QPDFObjectHandle QPDFObjectHandle
QPDFParser::parse(bool& empty, bool content_stream) QPDFParser::parse(bool& empty, bool content_stream)
@ -46,373 +31,459 @@ QPDFParser::parse(bool& empty, bool content_stream)
// effect of reading the object and changing the file pointer. If you do this, it will cause a // effect of reading the object and changing the file pointer. If you do this, it will cause a
// logic error to be thrown from QPDF::inParse(). // logic error to be thrown from QPDF::inParse().
const static std::shared_ptr<QPDFObject> null_oh = QPDF_Null::create();
QPDF::ParseGuard pg(context); QPDF::ParseGuard pg(context);
empty = false; empty = false;
start = input->tell();
std::shared_ptr<QPDFObject> object; if (!tokenizer.nextToken(*input, object_description)) {
bool set_offset = false; warn(tokenizer.getErrorMessage());
}
std::vector<StackFrame> stack; switch (tokenizer.getType()) {
stack.emplace_back(input); case QPDFTokenizer::tt_eof:
std::vector<parser_state_e> state_stack; if (content_stream) {
state_stack.push_back(st_top); // In content stream mode, leave object uninitialized to indicate EOF
qpdf_offset_t offset; return {};
bool done = false; }
int bad_count = 0; QTC::TC("qpdf", "QPDFParser eof in parse");
int good_count = 0; warn("unexpected EOF");
return {QPDF_Null::create()};
case QPDFTokenizer::tt_bad:
QTC::TC("qpdf", "QPDFParser bad token in parse");
return {QPDF_Null::create()};
case QPDFTokenizer::tt_brace_open:
case QPDFTokenizer::tt_brace_close:
QTC::TC("qpdf", "QPDFParser bad brace");
warn("treating unexpected brace token as null");
return {QPDF_Null::create()};
case QPDFTokenizer::tt_array_close:
QTC::TC("qpdf", "QPDFParser bad array close");
warn("treating unexpected array close token as null");
return {QPDF_Null::create()};
case QPDFTokenizer::tt_dict_close:
QTC::TC("qpdf", "QPDFParser bad dictionary close");
warn("unexpected dictionary close token");
return {QPDF_Null::create()};
case QPDFTokenizer::tt_array_open:
case QPDFTokenizer::tt_dict_open:
stack.clear();
stack.emplace_back(
input,
(tokenizer.getType() == QPDFTokenizer::tt_array_open) ? st_array : st_dictionary_key);
frame = &stack.back();
return parseRemainder(content_stream);
case QPDFTokenizer::tt_bool:
return withDescription<QPDF_Bool>(tokenizer.getValue() == "true");
case QPDFTokenizer::tt_null:
return {QPDF_Null::create()};
case QPDFTokenizer::tt_integer:
return withDescription<QPDF_Integer>(QUtil::string_to_ll(tokenizer.getValue().c_str()));
case QPDFTokenizer::tt_real:
return withDescription<QPDF_Real>(tokenizer.getValue());
case QPDFTokenizer::tt_name:
return withDescription<QPDF_Name>(tokenizer.getValue());
case QPDFTokenizer::tt_word:
{
auto const& value = tokenizer.getValue();
if (content_stream) {
return withDescription<QPDF_Operator>(value);
} else if (value == "endobj") {
// We just saw endobj without having read anything. Treat this as a null and do
// not move the input source's offset.
input->seek(input->getLastOffset(), SEEK_SET);
empty = true;
return {QPDF_Null::create()};
} else {
QTC::TC("qpdf", "QPDFParser treat word as string");
warn("unknown token while reading object; treating as string");
return withDescription<QPDF_String>(value);
}
}
case QPDFTokenizer::tt_string:
if (decrypter) {
std::string s{tokenizer.getValue()};
decrypter->decryptString(s);
return withDescription<QPDF_String>(s);
} else {
return withDescription<QPDF_String>(tokenizer.getValue());
}
default:
warn("treating unknown token type as null while reading object");
return {QPDF_Null::create()};
}
}
QPDFObjectHandle
QPDFParser::parseRemainder(bool content_stream)
{
// This method must take care not to resolve any objects. Don't check the type of any object
// without first ensuring that it is a direct object. Otherwise, doing so may have the side
// effect of reading the object and changing the file pointer. If you do this, it will cause a
// logic error to be thrown from QPDF::inParse().
bad_count = 0;
bool b_contents = false; bool b_contents = false;
bool is_null = false;
while (!done) {
bool bad = false;
bool indirect_ref = false;
is_null = false;
auto& frame = stack.back();
auto& olist = frame.olist;
parser_state_e state = state_stack.back();
offset = frame.offset;
object = nullptr;
set_offset = false;
while (true) {
if (!tokenizer.nextToken(*input, object_description)) { if (!tokenizer.nextToken(*input, object_description)) {
warn(tokenizer.getErrorMessage()); warn(tokenizer.getErrorMessage());
} }
++good_count; // optimistically
if (int_count != 0) {
// Special handling of indirect references. Treat integer tokens as part of an indirect
// reference until proven otherwise.
if (tokenizer.getType() == QPDFTokenizer::tt_integer) {
if (++int_count > 2) {
// Process the oldest buffered integer.
addInt(int_count);
}
last_offset_buffer[int_count % 2] = input->getLastOffset();
int_buffer[int_count % 2] = QUtil::string_to_ll(tokenizer.getValue().c_str());
continue;
} else if (
int_count >= 2 && tokenizer.getType() == QPDFTokenizer::tt_word &&
tokenizer.getValue() == "R") {
if (context == nullptr) {
QTC::TC("qpdf", "QPDFParser indirect without context");
throw std::logic_error("QPDFParser::parse called without context on an object "
"with indirect references");
}
auto ref_og = QPDFObjGen(
QIntC::to_int(int_buffer[(int_count - 1) % 2]),
QIntC::to_int(int_buffer[(int_count) % 2]));
if (ref_og.isIndirect()) {
// This action has the desirable side effect of causing dangling references
// (references to indirect objects that don't appear in the PDF) in any parsed
// object to appear in the object cache.
add(std::move(context->getObject(ref_og).obj));
} else {
QTC::TC("qpdf", "QPDFParser indirect with 0 objid");
addNull();
}
int_count = 0;
continue;
} else if (int_count > 0) {
// Process the buffered integers before processing the current token.
if (int_count > 1) {
addInt(int_count - 1);
}
addInt(int_count);
int_count = 0;
}
}
switch (tokenizer.getType()) { switch (tokenizer.getType()) {
case QPDFTokenizer::tt_eof: case QPDFTokenizer::tt_eof:
if (!content_stream) { warn("parse error while reading object");
QTC::TC("qpdf", "QPDFParser eof in parse"); if (content_stream) {
warn("unexpected EOF"); // In content stream mode, leave object uninitialized to indicate EOF
return {};
} }
bad = true; QTC::TC("qpdf", "QPDFParser eof in parseRemainder");
state = st_eof; warn("unexpected EOF");
break; return {QPDF_Null::create()};
case QPDFTokenizer::tt_bad: case QPDFTokenizer::tt_bad:
QTC::TC("qpdf", "QPDFParser bad token in parse"); QTC::TC("qpdf", "QPDFParser bad token in parseRemainder");
bad = true; if (tooManyBadTokens()) {
is_null = true; return {QPDF_Null::create()};
break; }
addNull();
continue;
case QPDFTokenizer::tt_brace_open: case QPDFTokenizer::tt_brace_open:
case QPDFTokenizer::tt_brace_close: case QPDFTokenizer::tt_brace_close:
QTC::TC("qpdf", "QPDFParser bad brace"); QTC::TC("qpdf", "QPDFParser bad brace in parseRemainder");
warn("treating unexpected brace token as null"); warn("treating unexpected brace token as null");
bad = true; if (tooManyBadTokens()) {
is_null = true; return {QPDF_Null::create()};
break; }
addNull();
continue;
case QPDFTokenizer::tt_array_close: case QPDFTokenizer::tt_array_close:
if (state == st_array) { if (frame->state == st_array) {
state = st_stop; auto object = QPDF_Array::create(std::move(frame->olist), frame->null_count > 100);
} else { setDescription(object, frame->offset - 1);
QTC::TC("qpdf", "QPDFParser bad array close");
warn("treating unexpected array close token as null");
bad = true;
is_null = true;
}
break;
case QPDFTokenizer::tt_dict_close:
if (state == st_dictionary) {
state = st_stop;
} else {
QTC::TC("qpdf", "QPDFParser bad dictionary close");
warn("unexpected dictionary close token");
bad = true;
is_null = true;
}
break;
case QPDFTokenizer::tt_array_open:
case QPDFTokenizer::tt_dict_open:
if (stack.size() > 500) {
QTC::TC("qpdf", "QPDFParser too deep");
warn("ignoring excessively deeply nested data structure");
bad = true;
is_null = true;
state = st_top;
} else {
state = st_start;
state_stack.push_back(
(tokenizer.getType() == QPDFTokenizer::tt_array_open) ? st_array
: st_dictionary);
b_contents = false;
stack.emplace_back(input);
}
break;
case QPDFTokenizer::tt_bool:
object = QPDF_Bool::create((tokenizer.getValue() == "true"));
break;
case QPDFTokenizer::tt_null:
is_null = true;
++frame.null_count;
break;
case QPDFTokenizer::tt_integer:
object = QPDF_Integer::create(
QUtil::string_to_ll(std::string(tokenizer.getValue()).c_str()));
break;
case QPDFTokenizer::tt_real:
object = QPDF_Real::create(tokenizer.getValue());
break;
case QPDFTokenizer::tt_name:
{
auto name = tokenizer.getValue();
object = QPDF_Name::create(name);
if (name == "/Contents") {
b_contents = true;
} else {
b_contents = false;
}
}
break;
case QPDFTokenizer::tt_word:
{
auto value = tokenizer.getValue();
auto size = olist.size();
if (content_stream) {
object = QPDF_Operator::create(value);
} else if (
value == "R" && state != st_top && size >= 2 && olist.back() &&
olist.back()->getTypeCode() == ::ot_integer &&
!olist.back()->getObjGen().isIndirect() && olist.at(size - 2) &&
olist.at(size - 2)->getTypeCode() == ::ot_integer &&
!olist.at(size - 2)->getObjGen().isIndirect()) {
if (context == nullptr) {
QTC::TC("qpdf", "QPDFParser indirect without context");
throw std::logic_error("QPDFObjectHandle::parse called without context on "
"an object with indirect references");
}
auto ref_og = QPDFObjGen(
QPDFObjectHandle(olist.at(size - 2)).getIntValueAsInt(),
QPDFObjectHandle(olist.back()).getIntValueAsInt());
if (ref_og.isIndirect()) {
// This action has the desirable side effect of causing dangling references
// (references to indirect objects that don't appear in the PDF) in any
// parsed object to appear in the object cache.
object = context->getObject(ref_og).obj;
indirect_ref = true;
} else {
QTC::TC("qpdf", "QPDFParser indirect with 0 objid");
is_null = true;
}
olist.pop_back();
olist.pop_back();
} else if ((value == "endobj") && (state == st_top)) {
// We just saw endobj without having read anything. Treat this as a null and do
// not move the input source's offset.
is_null = true;
input->seek(input->getLastOffset(), SEEK_SET);
empty = true;
} else {
QTC::TC("qpdf", "QPDFParser treat word as string");
warn("unknown token while reading object; treating as string");
bad = true;
object = QPDF_String::create(value);
}
}
break;
case QPDFTokenizer::tt_string:
{
auto val = tokenizer.getValue();
if (decrypter) {
if (b_contents) {
frame.contents_string = val;
frame.contents_offset = input->getLastOffset();
b_contents = false;
}
std::string s{val};
decrypter->decryptString(s);
object = QPDF_String::create(s);
} else {
object = QPDF_String::create(val);
}
}
break;
default:
warn("treating unknown token type as null while reading object");
bad = true;
is_null = true;
break;
}
if (object == nullptr && !is_null &&
(!((state == st_start) || (state == st_stop) || (state == st_eof)))) {
throw std::logic_error("QPDFParser:parseInternal: unexpected uninitialized object");
is_null = true;
}
if (bad) {
++bad_count;
good_count = 0;
} else {
++good_count;
if (good_count > 3) {
bad_count = 0;
}
}
if (bad_count > 5) {
// We had too many consecutive errors without enough intervening successful objects.
// Give up.
warn("too many errors; giving up on reading object");
state = st_top;
is_null = true;
}
switch (state) {
case st_eof:
if (state_stack.size() > 1) {
warn("parse error while reading object");
}
done = true;
// In content stream mode, leave object uninitialized to indicate EOF
if (!content_stream) {
is_null = true;
}
break;
case st_dictionary:
case st_array:
if (is_null) {
object = null_oh;
// No need to set description for direct nulls - they probably will become implicit.
} else if (!indirect_ref) {
setDescription(object, input->getLastOffset());
}
set_offset = true;
olist.push_back(object);
break;
case st_top:
done = true;
break;
case st_start:
break;
case st_stop:
if ((state_stack.size() < 2) || (stack.size() < 2)) {
throw std::logic_error("QPDFParser::parseInternal: st_stop encountered with "
"insufficient elements in stack");
}
parser_state_e old_state = state_stack.back();
state_stack.pop_back();
if (old_state == st_array) {
object = QPDF_Array::create(std::move(olist), frame.null_count > 100);
setDescription(object, offset - 1);
// The `offset` points to the next of "[". Set the rewind offset to point to the // The `offset` points to the next of "[". Set the rewind offset to point to the
// beginning of "[". This has been explicitly tested with whitespace surrounding the // beginning of "[". This has been explicitly tested with whitespace surrounding the
// array start delimiter. getLastOffset points to the array end token and therefore // array start delimiter. getLastOffset points to the array end token and therefore
// can't be used here. // can't be used here.
set_offset = true; if (stack.size() <= 1) {
} else if (old_state == st_dictionary) { return object;
// Convert list to map. Alternating elements are keys. Attempt to recover more or }
// less gracefully from invalid dictionaries. stack.pop_back();
std::set<std::string> names; frame = &stack.back();
for (auto& obj: olist) { add(std::move(object));
if (obj) { } else {
if (obj->getTypeCode() == ::ot_name) { QTC::TC("qpdf", "QPDFParser bad array close in parseRemainder");
names.insert(obj->getStringValue()); warn("treating unexpected array close token as null");
} if (tooManyBadTokens()) {
} return {QPDF_Null::create()};
}
addNull();
}
continue;
case QPDFTokenizer::tt_dict_close:
if (frame->state <= st_dictionary_value) {
// Attempt to recover more or less gracefully from invalid dictionaries.
auto& dict = frame->dict;
if (frame->state == st_dictionary_value) {
QTC::TC("qpdf", "QPDFParser no val for last key");
warn(
frame->offset,
"dictionary ended prematurely; using null as value for last key");
dict[frame->key] = QPDF_Null::create();
} }
std::map<std::string, QPDFObjectHandle> dict; if (!frame->olist.empty())
int next_fake_key = 1; fixMissingKeys();
for (auto iter = olist.begin(); iter != olist.end();) {
// Calculate key.
std::string key;
if (*iter && (*iter)->getTypeCode() == ::ot_name) {
key = (*iter)->getStringValue();
++iter;
} else {
for (bool found_fake = false; !found_fake;) {
key = "/QPDFFake" + std::to_string(next_fake_key++);
found_fake = (names.count(key) == 0);
QTC::TC("qpdf", "QPDFParser found fake", (found_fake ? 0 : 1));
}
warn(
offset,
"expected dictionary key but found non-name object; inserting key " +
key);
}
if (dict.count(key) > 0) {
QTC::TC("qpdf", "QPDFParser duplicate dict key");
warn(
offset,
"dictionary has duplicated key " + key +
"; last occurrence overrides earlier ones");
}
// Calculate value. if (!frame->contents_string.empty() && dict.count("/Type") &&
std::shared_ptr<QPDFObject> val;
if (iter != olist.end()) {
val = *iter;
++iter;
} else {
QTC::TC("qpdf", "QPDFParser no val for last key");
warn(
offset,
"dictionary ended prematurely; using null as value for last key");
val = QPDF_Null::create();
}
dict[std::move(key)] = std::move(val);
}
if (!frame.contents_string.empty() && dict.count("/Type") &&
dict["/Type"].isNameAndEquals("/Sig") && dict.count("/ByteRange") && dict["/Type"].isNameAndEquals("/Sig") && dict.count("/ByteRange") &&
dict.count("/Contents") && dict["/Contents"].isString()) { dict.count("/Contents") && dict["/Contents"].isString()) {
dict["/Contents"] = QPDFObjectHandle::newString(frame.contents_string); dict["/Contents"] = QPDFObjectHandle::newString(frame->contents_string);
dict["/Contents"].setParsedOffset(frame.contents_offset); dict["/Contents"].setParsedOffset(frame->contents_offset);
} }
object = QPDF_Dictionary::create(std::move(dict)); auto object = QPDF_Dictionary::create(std::move(dict));
setDescription(object, offset - 2); setDescription(object, frame->offset - 2);
// The `offset` points to the next of "<<". Set the rewind offset to point to the // The `offset` points to the next of "<<". Set the rewind offset to point to the
// beginning of "<<". This has been explicitly tested with whitespace surrounding // beginning of "<<". This has been explicitly tested with whitespace surrounding
// the dictionary start delimiter. getLastOffset points to the dictionary end token // the dictionary start delimiter. getLastOffset points to the dictionary end token
// and therefore can't be used here. // and therefore can't be used here.
set_offset = true; if (stack.size() <= 1) {
} return object;
stack.pop_back(); }
if (state_stack.back() == st_top) { stack.pop_back();
done = true; frame = &stack.back();
add(std::move(object));
} else { } else {
stack.back().olist.push_back(object); QTC::TC("qpdf", "QPDFParser bad dictionary close in parseRemainder");
warn("unexpected dictionary close token");
if (tooManyBadTokens()) {
return {QPDF_Null::create()};
}
addNull();
} }
continue;
case QPDFTokenizer::tt_array_open:
case QPDFTokenizer::tt_dict_open:
if (stack.size() > 499) {
QTC::TC("qpdf", "QPDFParser too deep");
warn("ignoring excessively deeply nested data structure");
return {QPDF_Null::create()};
} else {
b_contents = false;
stack.emplace_back(
input,
(tokenizer.getType() == QPDFTokenizer::tt_array_open) ? st_array
: st_dictionary_key);
frame = &stack.back();
continue;
}
case QPDFTokenizer::tt_bool:
addScalar<QPDF_Bool>(tokenizer.getValue() == "true");
continue;
case QPDFTokenizer::tt_null:
addNull();
continue;
case QPDFTokenizer::tt_integer:
if (!content_stream) {
// Buffer token in case it is part of an indirect reference.
last_offset_buffer[1] = input->getLastOffset();
int_buffer[1] = QUtil::string_to_ll(tokenizer.getValue().c_str());
int_count = 1;
} else {
addScalar<QPDF_Integer>(QUtil::string_to_ll(tokenizer.getValue().c_str()));
}
continue;
case QPDFTokenizer::tt_real:
addScalar<QPDF_Real>(tokenizer.getValue());
continue;
case QPDFTokenizer::tt_name:
if (frame->state == st_dictionary_key) {
frame->key = tokenizer.getValue();
frame->state = st_dictionary_value;
b_contents = decrypter && frame->key == "/Contents";
continue;
} else {
addScalar<QPDF_Name>(tokenizer.getValue());
}
continue;
case QPDFTokenizer::tt_word:
if (content_stream) {
addScalar<QPDF_Operator>(tokenizer.getValue());
} else {
QTC::TC("qpdf", "QPDFParser treat word as string in parseRemainder");
warn("unknown token while reading object; treating as string");
if (tooManyBadTokens()) {
return {QPDF_Null::create()};
}
addScalar<QPDF_String>(tokenizer.getValue());
}
continue;
case QPDFTokenizer::tt_string:
{
auto const& val = tokenizer.getValue();
if (decrypter) {
if (b_contents) {
frame->contents_string = val;
frame->contents_offset = input->getLastOffset();
b_contents = false;
}
std::string s{val};
decrypter->decryptString(s);
addScalar<QPDF_String>(s);
} else {
addScalar<QPDF_String>(val);
}
}
continue;
default:
warn("treating unknown token type as null while reading object");
if (tooManyBadTokens()) {
return {QPDF_Null::create()};
}
addNull();
} }
} }
if (is_null) {
object = QPDF_Null::create();
}
if (!set_offset) {
setDescription(object, offset);
}
return object;
} }
void void
QPDFParser::setDescription(std::shared_ptr<QPDFObject>& obj, qpdf_offset_t parsed_offset) QPDFParser::add(std::shared_ptr<QPDFObject>&& obj)
{
if (frame->state != st_dictionary_value) {
// If state is st_dictionary_key then there is a missing key. Push onto olist for
// processing once the tt_dict_close token has been found.
frame->olist.emplace_back(std::move(obj));
} else {
if (auto res = frame->dict.insert_or_assign(frame->key, std::move(obj)); !res.second) {
warnDuplicateKey();
}
frame->state = st_dictionary_key;
}
}
// Add a direct null to the current stack frame and count it in the frame's null tally.
void
QPDFParser::addNull()
{
    // One shared null object is reused for every null added through this method.
    const static ObjectPtr null_obj = QPDF_Null::create();

    if (frame->state == st_dictionary_value) {
        // A key is pending: the null becomes its value, then the frame expects a key again.
        auto const result = frame->dict.insert_or_assign(frame->key, null_obj);
        if (!result.second) {
            warnDuplicateKey();
        }
        frame->state = st_dictionary_key;
    } else {
        // Array element, or a dictionary entry whose key is missing. In the latter case the
        // object is kept on olist until the tt_dict_close token has been found.
        frame->olist.emplace_back(null_obj);
    }
    ++frame->null_count;
}
// Turn one buffered integer token into a QPDF_Integer and add it to the current frame.
// `count` selects the slot (count % 2) of int_buffer / last_offset_buffer to use.
void
QPDFParser::addInt(int count)
{
    int const slot = count % 2;
    auto int_obj = QPDF_Integer::create(int_buffer[slot]);
    int_obj->setDescription(context, description, last_offset_buffer[slot]);
    add(std::move(int_obj));
}
// Create an object of type T from the given arguments, set its description using the input's last
// offset, and add it to the current stack frame.
template <typename T, typename... Args>
void
QPDFParser::addScalar(Args&&... args)
{
    // Forward the arguments so that rvalues stay rvalues instead of being passed on as lvalues
    // (which would force a copy inside T::create).
    auto obj = T::create(std::forward<Args>(args)...);
    obj->setDescription(context, description, input->getLastOffset());
    add(std::move(obj));
}
// Create an object of type T from the given arguments, set its description using the offset
// recorded in `start`, and return it wrapped in a QPDFObjectHandle.
template <typename T, typename... Args>
QPDFObjectHandle
QPDFParser::withDescription(Args&&... args)
{
    // Forward the arguments so that rvalues stay rvalues instead of being passed on as lvalues
    // (which would force a copy inside T::create).
    auto obj = T::create(std::forward<Args>(args)...);
    obj->setDescription(context, description, start);
    return {obj};
}
void
QPDFParser::setDescription(ObjectPtr& obj, qpdf_offset_t parsed_offset)
{ {
if (obj) { if (obj) {
obj->setDescription(context, description, parsed_offset); obj->setDescription(context, description, parsed_offset);
} }
} }
// Give every object left on the current frame's olist (objects that arrived where a dictionary
// key was expected) a generated "/QPDFFake<n>" key and store it in the frame's dictionary.
void
QPDFParser::fixMissingKeys()
{
    // Name objects on olist are collected so that generated keys never collide with them.
    std::set<std::string> names;
    for (auto const& candidate: frame->olist) {
        if (candidate->getTypeCode() == ::ot_name) {
            names.insert(candidate->getStringValue());
        }
    }

    // The counter is shared across all items, so each generated key is tried at most once.
    int next_fake_key = 1;
    for (auto const& item: frame->olist) {
        std::string key;
        bool found_fake = false;
        do {
            key = "/QPDFFake" + std::to_string(next_fake_key++);
            found_fake = frame->dict.count(key) == 0 && names.count(key) == 0;
            QTC::TC("qpdf", "QPDFParser found fake", (found_fake ? 0 : 1));
        } while (!found_fake);
        warn(
            frame->offset,
            "expected dictionary key but found non-name object; inserting key " + key);
        frame->dict[key] = item;
    }
}
// Record a bad token. Returns true (after issuing a warning) when more than five bad tokens have
// accumulated without a run of more than four good tokens in between; otherwise returns false.
bool
QPDFParser::tooManyBadTokens()
{
    if (good_count > 4) {
        // Enough good tokens since the last bad one: restart the count with this bad token.
        bad_count = 1;
    } else if (++bad_count > 5) {
        warn("too many errors; giving up on reading object");
        return true;
    }
    good_count = 0;
    return false;
}
void void
QPDFParser::warn(QPDFExc const& e) const QPDFParser::warn(QPDFExc const& e) const
{ {
@ -426,6 +497,15 @@ QPDFParser::warn(QPDFExc const& e) const
} }
} }
// Warn, at the current frame's offset, that the frame's current key already exists in the
// dictionary being parsed.
void
QPDFParser::warnDuplicateKey()
{
    QTC::TC("qpdf", "QPDFParser duplicate dict key");
    std::string message = "dictionary has duplicated key ";
    message += frame->key;
    message += "; last occurrence overrides earlier ones";
    warn(frame->offset, message);
}
void void
QPDFParser::warn(qpdf_offset_t offset, std::string const& msg) const QPDFParser::warn(qpdf_offset_t offset, std::string const& msg) const
{ {

View File

@ -31,11 +31,44 @@ class QPDFParser
QPDFObjectHandle parse(bool& empty, bool content_stream); QPDFObjectHandle parse(bool& empty, bool content_stream);
private: private:
enum parser_state_e { st_top, st_start, st_stop, st_eof, st_dictionary, st_array }; // Parser state. Note:
// state <= st_dictionary_value == (state == st_dictionary_key || state == st_dictionary_value)
enum parser_state_e { st_dictionary_key, st_dictionary_value, st_array };
// Parsing state for one array or dictionary that is currently open.
struct StackFrame
{
    // Records the input's current position as the frame offset and sets the initial state.
    StackFrame(std::shared_ptr<InputSource> const& source, parser_state_e initial_state) :
        state(initial_state),
        offset(source->tell())
    {
    }

    // Array elements, or objects found where a dictionary key was expected.
    std::vector<std::shared_ptr<QPDFObject>> olist;
    // Key/value pairs collected so far when the frame is a dictionary.
    std::map<std::string, QPDFObjectHandle> dict;
    // Whether the frame is an array or a dictionary expecting a key or a value.
    parser_state_e state;
    // Most recently read dictionary key, awaiting its value.
    std::string key;
    // Input position captured when the frame was created.
    qpdf_offset_t offset;
    // String value and offset captured for a /Contents entry while a decrypter is in use.
    std::string contents_string;
    qpdf_offset_t contents_offset{-1};
    // Number of nulls added to this frame.
    int null_count{0};
};
QPDFObjectHandle parseRemainder(bool content_stream);
void add(std::shared_ptr<QPDFObject>&& obj);
void addNull();
void addInt(int count);
template <typename T, typename... Args>
void addScalar(Args&&... args);
bool tooManyBadTokens();
void warnDuplicateKey();
void fixMissingKeys();
void warn(qpdf_offset_t offset, std::string const& msg) const; void warn(qpdf_offset_t offset, std::string const& msg) const;
void warn(std::string const& msg) const; void warn(std::string const& msg) const;
void warn(QPDFExc const&) const; void warn(QPDFExc const&) const;
template <typename T, typename... Args>
// Create a new scalar object complete with parsed offset and description.
// NB the offset includes any leading whitespace.
QPDFObjectHandle withDescription(Args&&... args);
void setDescription(std::shared_ptr<QPDFObject>& obj, qpdf_offset_t parsed_offset); void setDescription(std::shared_ptr<QPDFObject>& obj, qpdf_offset_t parsed_offset);
std::shared_ptr<InputSource> input; std::shared_ptr<InputSource> input;
std::string const& object_description; std::string const& object_description;
@ -43,6 +76,18 @@ class QPDFParser
QPDFObjectHandle::StringDecrypter* decrypter; QPDFObjectHandle::StringDecrypter* decrypter;
QPDF* context; QPDF* context;
std::shared_ptr<QPDFValue::Description> description; std::shared_ptr<QPDFValue::Description> description;
std::vector<StackFrame> stack;
StackFrame* frame;
// Number of recent bad tokens.
int bad_count = 0;
// Number of good tokens since last bad token. Irrelevant if bad_count == 0.
int good_count = 0;
// Start offset including any leading whitespace.
qpdf_offset_t start;
// Number of successive integer tokens.
int int_count = 0;
long long int_buffer[2]{0, 0};
qpdf_offset_t last_offset_buffer[2]{0, 0};
}; };
#endif // QPDFPARSER_HH #endif // QPDFPARSER_HH

View File

@ -57,11 +57,14 @@ QPDF trailer lacks size 0
QPDF trailer size not integer 0 QPDF trailer size not integer 0
QPDF trailer prev not integer 0 QPDF trailer prev not integer 0
QPDFParser bad brace 0 QPDFParser bad brace 0
QPDFParser bad brace in parseRemainder 0
QPDFParser bad array close 0 QPDFParser bad array close 0
QPDFParser bad array close in parseRemainder 0
QPDF stream without length 0 QPDF stream without length 0
QPDF stream length not integer 0 QPDF stream length not integer 0
QPDF missing endstream 0 QPDF missing endstream 0
QPDFParser bad dictionary close 0 QPDFParser bad dictionary close 0
QPDFParser bad dictionary close in parseRemainder 0
QPDF can't find xref 0 QPDF can't find xref 0
QPDFTokenizer bad ) 0 QPDFTokenizer bad ) 0
QPDFTokenizer bad > 0 QPDFTokenizer bad > 0
@ -258,6 +261,7 @@ QPDFParser indirect with 0 objid 0
QPDF object id 0 0 QPDF object id 0 0
QPDF recursion loop in resolve 0 QPDF recursion loop in resolve 0
QPDFParser treat word as string 0 QPDFParser treat word as string 0
QPDFParser treat word as string in parseRemainder 0
QPDFParser found fake 1 QPDFParser found fake 1
QPDFParser no val for last key 0 QPDFParser no val for last key 0
QPDF resolve failure to null 0 QPDF resolve failure to null 0
@ -289,7 +293,9 @@ QPDFObjectHandle coalesce called on stream 0
QPDFObjectHandle coalesce provide stream data 0 QPDFObjectHandle coalesce provide stream data 0
QPDF_Stream bad token at end during normalize 0 QPDF_Stream bad token at end during normalize 0
QPDFParser bad token in parse 0 QPDFParser bad token in parse 0
QPDFParser bad token in parseRemainder 0
QPDFParser eof in parse 0 QPDFParser eof in parse 0
QPDFParser eof in parseRemainder 0
QPDFObjectHandle array bounds 0 QPDFObjectHandle array bounds 0
QPDFObjectHandle boolean returning false 0 QPDFObjectHandle boolean returning false 0
QPDFObjectHandle integer returning 0 0 QPDFObjectHandle integer returning 0 0

View File

@ -17,7 +17,7 @@ my $td = new TestDriver('parsing');
my $n_tests = 17; my $n_tests = 17;
$td->runtest("parse objects from string", $td->runtest("parse objects from string",
{$td->COMMAND => "test_driver 31 good1.qdf"}, {$td->COMMAND => "test_driver 31 bad39.qdf"},
{$td->FILE => "parse-object.out", $td->EXIT_STATUS => 0}, {$td->FILE => "parse-object.out", $td->EXIT_STATUS => 0},
$td->NORMALIZE_NEWLINES); $td->NORMALIZE_NEWLINES);
$td->runtest("EOF terminating literal tokens", $td->runtest("EOF terminating literal tokens",

View File

@ -1,14 +1,14 @@
WARNING: bad16.pdf (trailer, offset 753): unexpected dictionary close token WARNING: bad16.pdf (trailer, offset 753): unexpected dictionary close token
WARNING: bad16.pdf (trailer, offset 756): unexpected dictionary close token WARNING: bad16.pdf (trailer, offset 756): unexpected dictionary close token
WARNING: bad16.pdf (trailer, offset 759): unknown token while reading object; treating as string WARNING: bad16.pdf (trailer, offset 759): unknown token while reading object; treating as string
WARNING: bad16.pdf (trailer, offset 779): unexpected EOF
WARNING: bad16.pdf (trailer, offset 779): parse error while reading object WARNING: bad16.pdf (trailer, offset 779): parse error while reading object
WARNING: bad16.pdf (trailer, offset 779): unexpected EOF
WARNING: bad16.pdf: file is damaged WARNING: bad16.pdf: file is damaged
WARNING: bad16.pdf (offset 712): expected trailer dictionary WARNING: bad16.pdf (offset 712): expected trailer dictionary
WARNING: bad16.pdf: Attempting to reconstruct cross-reference table WARNING: bad16.pdf: Attempting to reconstruct cross-reference table
WARNING: bad16.pdf (trailer, offset 753): unexpected dictionary close token WARNING: bad16.pdf (trailer, offset 753): unexpected dictionary close token
WARNING: bad16.pdf (trailer, offset 756): unexpected dictionary close token WARNING: bad16.pdf (trailer, offset 756): unexpected dictionary close token
WARNING: bad16.pdf (trailer, offset 759): unknown token while reading object; treating as string WARNING: bad16.pdf (trailer, offset 759): unknown token while reading object; treating as string
WARNING: bad16.pdf (trailer, offset 779): unexpected EOF
WARNING: bad16.pdf (trailer, offset 779): parse error while reading object WARNING: bad16.pdf (trailer, offset 779): parse error while reading object
WARNING: bad16.pdf (trailer, offset 779): unexpected EOF
bad16.pdf: unable to find trailer dictionary while recovering damaged file bad16.pdf: unable to find trailer dictionary while recovering damaged file

View File

@ -1,6 +1,6 @@
WARNING: bad16.pdf (trailer, offset 753): unexpected dictionary close token WARNING: bad16.pdf (trailer, offset 753): unexpected dictionary close token
WARNING: bad16.pdf (trailer, offset 756): unexpected dictionary close token WARNING: bad16.pdf (trailer, offset 756): unexpected dictionary close token
WARNING: bad16.pdf (trailer, offset 759): unknown token while reading object; treating as string WARNING: bad16.pdf (trailer, offset 759): unknown token while reading object; treating as string
WARNING: bad16.pdf (trailer, offset 779): unexpected EOF
WARNING: bad16.pdf (trailer, offset 779): parse error while reading object WARNING: bad16.pdf (trailer, offset 779): parse error while reading object
WARNING: bad16.pdf (trailer, offset 779): unexpected EOF
bad16.pdf (offset 712): expected trailer dictionary bad16.pdf (offset 712): expected trailer dictionary

View File

@ -1,6 +1,6 @@
WARNING: bad36.pdf (trailer, offset 764): unknown token while reading object; treating as string WARNING: bad36.pdf (trailer, offset 764): unknown token while reading object; treating as string
WARNING: bad36.pdf (trailer, offset 715): expected dictionary key but found non-name object; inserting key /QPDFFake2
WARNING: bad36.pdf (trailer, offset 715): dictionary ended prematurely; using null as value for last key WARNING: bad36.pdf (trailer, offset 715): dictionary ended prematurely; using null as value for last key
WARNING: bad36.pdf (trailer, offset 715): expected dictionary key but found non-name object; inserting key /QPDFFake2
/QTest is implicit /QTest is implicit
/QTest is direct and has type null (2) /QTest is direct and has type null (2)
/QTest is null /QTest is null

View File

@ -1,6 +1,6 @@
WARNING: bad36.pdf (trailer, offset 764): unknown token while reading object; treating as string WARNING: bad36.pdf (trailer, offset 764): unknown token while reading object; treating as string
WARNING: bad36.pdf (trailer, offset 715): expected dictionary key but found non-name object; inserting key /QPDFFake2
WARNING: bad36.pdf (trailer, offset 715): dictionary ended prematurely; using null as value for last key WARNING: bad36.pdf (trailer, offset 715): dictionary ended prematurely; using null as value for last key
WARNING: bad36.pdf (trailer, offset 715): expected dictionary key but found non-name object; inserting key /QPDFFake2
/QTest is implicit /QTest is implicit
/QTest is direct and has type null (2) /QTest is direct and has type null (2)
/QTest is null /QTest is null

102
qpdf/qtest/qpdf/bad39.qdf Normal file
View File

@ -0,0 +1,102 @@
%PDF-1.3
%¿÷¢þ
%QDF-1.0
%% Original object ID: 1 0
1 0 obj
<<
/Pages 2 0 R
/Type /Catalog
>>
endobj
%% Original object ID: 2 0
2 0 obj
<<
/Count 1
/Kids [
3 0 R
]
/Type /Pages
>>
endobj
%% Page 1
%% Original object ID: 3 0
3 0 obj
<<
/Contents 4 0 R
/MediaBox [
0
0
612
792
]
/Parent 2 0 R
/Resources <<
/Font <<
/F1 6 0 R
>>
/ProcSet 7 0 R
>>
/Type /Page
>>
endobj
%% Contents for page 1
%% Original object ID: 4 0
4 0 obj
<<
/Length 5 0 R
>>
stream
BT
/F1 24 Tf
72 720 Td
(Potato) Tj
ET
endstream
endobj
5 0 obj
44
endobj
%% Original object ID: 6 0
6 0 obj
<<
/BaseFont /Helvetica
/Encoding /WinAnsiEncoding
/Name /F1
/Subtype /Type1
/Type /Font
>>
endobj
%% Original object ID: 5 0
7 0 obj
[
/PDF
/Text
]
endobj
xref
0 8
0000000000 65535 f
0000000052 00000 n
0000000133 00000 n
0000000242 00000 n
0000000484 00000 n
0000000583 00000 n
0000000629 00000 n
0000001113 00000 n
trailer <<
/Root 1 0 R
/Size 8
/ID [<31415926535897932384626433832795><31415926535897932384626433832795>]
>>
startxref
809
%%EOF
7 0 obj

View File

@ -51,6 +51,7 @@ WARNING: issue-335a.pdf (trailer, offset 563): unexpected )
WARNING: issue-335a.pdf (trailer, offset 596): unexpected ) WARNING: issue-335a.pdf (trailer, offset 596): unexpected )
WARNING: issue-335a.pdf (trailer, offset 597): name with stray # will not work with PDF >= 1.2 WARNING: issue-335a.pdf (trailer, offset 597): name with stray # will not work with PDF >= 1.2
WARNING: issue-335a.pdf (trailer, offset 600): unexpected ) WARNING: issue-335a.pdf (trailer, offset 600): unexpected )
WARNING: issue-335a.pdf (trailer, offset 134): dictionary has duplicated key /L
WARNING: issue-335a.pdf (trailer, offset 601): unexpected ) WARNING: issue-335a.pdf (trailer, offset 601): unexpected )
WARNING: issue-335a.pdf (trailer, offset 648): unexpected ) WARNING: issue-335a.pdf (trailer, offset 648): unexpected )
WARNING: issue-335a.pdf (trailer, offset 649): name with stray # will not work with PDF >= 1.2 WARNING: issue-335a.pdf (trailer, offset 649): name with stray # will not work with PDF >= 1.2
@ -74,6 +75,7 @@ WARNING: issue-335a.pdf (trailer, offset 563): unexpected )
WARNING: issue-335a.pdf (trailer, offset 596): unexpected ) WARNING: issue-335a.pdf (trailer, offset 596): unexpected )
WARNING: issue-335a.pdf (trailer, offset 597): name with stray # will not work with PDF >= 1.2 WARNING: issue-335a.pdf (trailer, offset 597): name with stray # will not work with PDF >= 1.2
WARNING: issue-335a.pdf (trailer, offset 600): unexpected ) WARNING: issue-335a.pdf (trailer, offset 600): unexpected )
WARNING: issue-335a.pdf (trailer, offset 164): dictionary has duplicated key /L
WARNING: issue-335a.pdf (trailer, offset 601): unexpected ) WARNING: issue-335a.pdf (trailer, offset 601): unexpected )
WARNING: issue-335a.pdf (trailer, offset 648): unexpected ) WARNING: issue-335a.pdf (trailer, offset 648): unexpected )
WARNING: issue-335a.pdf (trailer, offset 649): name with stray # will not work with PDF >= 1.2 WARNING: issue-335a.pdf (trailer, offset 649): name with stray # will not work with PDF >= 1.2
@ -97,6 +99,7 @@ WARNING: issue-335a.pdf (trailer, offset 563): unexpected )
WARNING: issue-335a.pdf (trailer, offset 596): unexpected ) WARNING: issue-335a.pdf (trailer, offset 596): unexpected )
WARNING: issue-335a.pdf (trailer, offset 597): name with stray # will not work with PDF >= 1.2 WARNING: issue-335a.pdf (trailer, offset 597): name with stray # will not work with PDF >= 1.2
WARNING: issue-335a.pdf (trailer, offset 600): unexpected ) WARNING: issue-335a.pdf (trailer, offset 600): unexpected )
WARNING: issue-335a.pdf (trailer, offset 231): dictionary has duplicated key /L
WARNING: issue-335a.pdf (trailer, offset 601): unexpected ) WARNING: issue-335a.pdf (trailer, offset 601): unexpected )
WARNING: issue-335a.pdf (trailer, offset 648): unexpected ) WARNING: issue-335a.pdf (trailer, offset 648): unexpected )
WARNING: issue-335a.pdf (trailer, offset 649): name with stray # will not work with PDF >= 1.2 WARNING: issue-335a.pdf (trailer, offset 649): name with stray # will not work with PDF >= 1.2
@ -448,6 +451,7 @@ WARNING: issue-335a.pdf (trailer, offset 1168): unexpected )
WARNING: issue-335a.pdf (trailer, offset 1328): unexpected ) WARNING: issue-335a.pdf (trailer, offset 1328): unexpected )
WARNING: issue-335a.pdf (trailer, offset 1329): name with stray # will not work with PDF >= 1.2 WARNING: issue-335a.pdf (trailer, offset 1329): name with stray # will not work with PDF >= 1.2
WARNING: issue-335a.pdf (trailer, offset 1332): unexpected ) WARNING: issue-335a.pdf (trailer, offset 1332): unexpected )
WARNING: issue-335a.pdf (trailer, offset 1033): dictionary has duplicated key /L
WARNING: issue-335a.pdf (trailer, offset 1333): unexpected ) WARNING: issue-335a.pdf (trailer, offset 1333): unexpected )
WARNING: issue-335a.pdf (trailer, offset 1344): unexpected ) WARNING: issue-335a.pdf (trailer, offset 1344): unexpected )
WARNING: issue-335a.pdf (trailer, offset 1428): unexpected ) WARNING: issue-335a.pdf (trailer, offset 1428): unexpected )

View File

@ -1,5 +1,11 @@
[ /name 16059 3.14159 false << /key true /other [ (string1) (string2) ] >> null ] [ /name 16059 3.14159 false << /key true /other [ (string1) (string2) ] >> null ]
logic error parsing indirect: QPDFObjectHandle::parse called without context on an object with indirect references logic error parsing indirect: QPDFParser::parse called without context on an object with indirect references
trailing data: parsed object (trailing test): trailing data found parsing object from string trailing data: parsed object (trailing test): trailing data found parsing object from string
WARNING: parsed object (offset 9): unknown token while reading object; treating as string WARNING: parsed object (offset 9): unknown token while reading object; treating as string
WARNING: parsed object: treating unexpected brace token as null
WARNING: parsed object: treating unexpected brace token as null
WARNING: parsed object: unexpected dictionary close token
WARNING: bad39.qdf (object 7 0, offset 1121): unexpected EOF
WARNING: bad39.qdf (object 7 0, offset 1121): expected endobj
WARNING: bad39.qdf (object 7 0, offset 1121): EOF after endobj
test 31 done test 31 done

View File

@ -1195,6 +1195,13 @@ test_31(QPDF& pdf, char const* arg2)
// mistakenly parsed as an indirect object. // mistakenly parsed as an indirect object.
assert(QPDFObjectHandle::parse(&pdf, "[5 0 R 0 R /X]").unparse() == "[ 5 0 R 0 (R) /X ]"); assert(QPDFObjectHandle::parse(&pdf, "[5 0 R 0 R /X]").unparse() == "[ 5 0 R 0 (R) /X ]");
assert(QPDFObjectHandle::parse(&pdf, "[1 0 R]", "indirect test").unparse() == "[ 1 0 R ]"); assert(QPDFObjectHandle::parse(&pdf, "[1 0 R]", "indirect test").unparse() == "[ 1 0 R ]");
// TC:QPDFParser bad brace
assert(QPDFObjectHandle::parse(&pdf, "}").unparse() == "null");
assert(QPDFObjectHandle::parse(&pdf, "{").unparse() == "null");
// TC:QPDFParser bad dictionary close
assert(QPDFObjectHandle::parse(&pdf, ">>").unparse() == "null");
// TC:QPDFParser eof in parse
assert(QPDFObjectHandle::parse(&pdf, "[7 0 R]").getArrayItem(0).isNull());
} }
static void static void