Move QPDFObjectHandle::parseInternal to new class QPDFParser

Part of #729
This commit is contained in:
m-holger 2022-08-16 13:59:32 +01:00
parent 0adfd74f8b
commit 6670c685ab
7 changed files with 576 additions and 522 deletions

View File

@ -49,6 +49,7 @@ class QPDF_Stream;
class BitStream;
class BitWriter;
class QPDFLogger;
class QPDFParser;
class QPDF
{
@ -881,7 +882,7 @@ class QPDF
// resolution
class ParseGuard
{
friend class QPDFObjectHandle;
friend class QPDFParser;
private:
ParseGuard(QPDF* qpdf) :

View File

@ -49,9 +49,12 @@ class QPDFTokenizer;
class QPDFExc;
class Pl_QPDFTokenizer;
class QPDFMatrix;
class QPDFParser;
class QPDFObjectHandle
{
friend class QPDFParser;
public:
// This class is used by replaceStreamData. It provides an
// alternative way of associating stream data with a stream. See
@ -1563,15 +1566,6 @@ class QPDFObjectHandle
QPDFObjectHandle(QPDF*, QPDFObjGen const& og);
QPDFObjectHandle(std::shared_ptr<QPDFObject> const&);
enum parser_state_e {
st_top,
st_start,
st_stop,
st_eof,
st_dictionary,
st_array
};
// Private object factory methods
static QPDFObjectHandle newIndirect(QPDF*, QPDFObjGen const& og);
static QPDFObjectHandle newStream(
@ -1599,14 +1593,7 @@ class QPDFObjectHandle
std::string const&,
std::shared_ptr<InputSource>,
qpdf_offset_t);
static QPDFObjectHandle parseInternal(
std::shared_ptr<InputSource> input,
std::string const& object_description,
QPDFTokenizer& tokenizer,
bool& empty,
StringDecrypter* decrypter,
QPDF* context,
bool content_stream);
void setParsedOffset(qpdf_offset_t offset);
void parseContentStream_internal(
std::string const& description, ParserCallbacks* callbacks);

View File

@ -80,6 +80,7 @@ set(libqpdf_SOURCES
QPDFPageDocumentHelper.cc
QPDFPageLabelDocumentHelper.cc
QPDFPageObjectHelper.cc
QPDFParser.cc
QPDFStreamFilter.cc
QPDFSystemError.cc
QPDFTokenizer.cc

View File

@ -8,6 +8,7 @@
#include <qpdf/QPDFLogger.hh>
#include <qpdf/QPDFMatrix.hh>
#include <qpdf/QPDFPageObjectHelper.hh>
#include <qpdf/QPDFParser.hh>
#include <qpdf/QPDF_Array.hh>
#include <qpdf/QPDF_Bool.hh>
#include <qpdf/QPDF_Dictionary.hh>
@ -1879,8 +1880,8 @@ QPDFObjectHandle::parseContentStream_data(
tokenizer.readToken(input, "content", true);
qpdf_offset_t offset = input->getLastOffset();
input->seek(offset, SEEK_SET);
QPDFObjectHandle obj = parseInternal(
input, "content", tokenizer, empty, nullptr, context, true);
auto obj = QPDFParser(input, "content", tokenizer, nullptr, context)
.parse(empty, true);
if (!obj.isInitialized()) {
// EOF
break;
@ -1943,497 +1944,8 @@ QPDFObjectHandle::parse(
StringDecrypter* decrypter,
QPDF* context)
{
return parseInternal(
input, object_description, tokenizer, empty, decrypter, context, false);
}
QPDFObjectHandle
QPDFObjectHandle::parseInternal(
std::shared_ptr<InputSource> input,
std::string const& object_description,
QPDFTokenizer& tokenizer,
bool& empty,
StringDecrypter* decrypter,
QPDF* context,
bool content_stream)
{
// This method must take care not to resolve any objects. Don't
// check the type of any object without first ensuring that it is
// a direct object. Otherwise, doing so may have the side effect
// of reading the object and changing the file pointer. If you do
// this, it will cause a logic error to be thrown from
// QPDF::inParse().
QPDF::ParseGuard pg(context);
empty = false;
QPDFObjectHandle object;
bool set_offset = false;
std::vector<SparseOHArray> olist_stack;
olist_stack.push_back(SparseOHArray());
std::vector<parser_state_e> state_stack;
state_stack.push_back(st_top);
std::vector<qpdf_offset_t> offset_stack;
qpdf_offset_t offset = input->tell();
offset_stack.push_back(offset);
bool done = false;
int bad_count = 0;
int good_count = 0;
bool b_contents = false;
std::vector<std::string> contents_string_stack;
contents_string_stack.push_back("");
std::vector<qpdf_offset_t> contents_offset_stack;
contents_offset_stack.push_back(-1);
while (!done) {
bool bad = false;
SparseOHArray& olist = olist_stack.back();
parser_state_e state = state_stack.back();
offset = offset_stack.back();
std::string& contents_string = contents_string_stack.back();
qpdf_offset_t& contents_offset = contents_offset_stack.back();
object = QPDFObjectHandle();
set_offset = false;
QPDFTokenizer::Token token =
tokenizer.readToken(input, object_description, true);
std::string const& token_error_message = token.getErrorMessage();
if (!token_error_message.empty()) {
// Tokens other than tt_bad can still generate warnings.
warn(
context,
QPDFExc(
qpdf_e_damaged_pdf,
input->getName(),
object_description,
input->getLastOffset(),
token_error_message));
}
switch (token.getType()) {
case QPDFTokenizer::tt_eof:
if (!content_stream) {
QTC::TC("qpdf", "QPDFObjectHandle eof in parseInternal");
warn(
context,
QPDFExc(
qpdf_e_damaged_pdf,
input->getName(),
object_description,
input->getLastOffset(),
"unexpected EOF"));
}
bad = true;
state = st_eof;
break;
case QPDFTokenizer::tt_bad:
QTC::TC("qpdf", "QPDFObjectHandle bad token in parse");
bad = true;
object = newNull();
break;
case QPDFTokenizer::tt_brace_open:
case QPDFTokenizer::tt_brace_close:
QTC::TC("qpdf", "QPDFObjectHandle bad brace");
warn(
context,
QPDFExc(
qpdf_e_damaged_pdf,
input->getName(),
object_description,
input->getLastOffset(),
"treating unexpected brace token as null"));
bad = true;
object = newNull();
break;
case QPDFTokenizer::tt_array_close:
if (state == st_array) {
state = st_stop;
} else {
QTC::TC("qpdf", "QPDFObjectHandle bad array close");
warn(
context,
QPDFExc(
qpdf_e_damaged_pdf,
input->getName(),
object_description,
input->getLastOffset(),
"treating unexpected array close token as null"));
bad = true;
object = newNull();
}
break;
case QPDFTokenizer::tt_dict_close:
if (state == st_dictionary) {
state = st_stop;
} else {
QTC::TC("qpdf", "QPDFObjectHandle bad dictionary close");
warn(
context,
QPDFExc(
qpdf_e_damaged_pdf,
input->getName(),
object_description,
input->getLastOffset(),
"unexpected dictionary close token"));
bad = true;
object = newNull();
}
break;
case QPDFTokenizer::tt_array_open:
case QPDFTokenizer::tt_dict_open:
if (olist_stack.size() > 500) {
QTC::TC("qpdf", "QPDFObjectHandle too deep");
warn(
context,
QPDFExc(
qpdf_e_damaged_pdf,
input->getName(),
object_description,
input->getLastOffset(),
"ignoring excessively deeply nested data structure"));
bad = true;
object = newNull();
state = st_top;
} else {
olist_stack.push_back(SparseOHArray());
state = st_start;
offset_stack.push_back(input->tell());
state_stack.push_back(
(token.getType() == QPDFTokenizer::tt_array_open)
? st_array
: st_dictionary);
b_contents = false;
contents_string_stack.push_back("");
contents_offset_stack.push_back(-1);
}
break;
case QPDFTokenizer::tt_bool:
object = newBool((token.getValue() == "true"));
break;
case QPDFTokenizer::tt_null:
object = newNull();
break;
case QPDFTokenizer::tt_integer:
object = newInteger(QUtil::string_to_ll(token.getValue().c_str()));
break;
case QPDFTokenizer::tt_real:
object = newReal(token.getValue());
break;
case QPDFTokenizer::tt_name:
{
std::string name = token.getValue();
object = newName(name);
if (name == "/Contents") {
b_contents = true;
} else {
b_contents = false;
}
}
break;
case QPDFTokenizer::tt_word:
{
std::string const& value = token.getValue();
if (content_stream) {
object = QPDFObjectHandle::newOperator(value);
} else if (
(value == "R") && (state != st_top) &&
(olist.size() >= 2) &&
(!olist.at(olist.size() - 1).isIndirect()) &&
(olist.at(olist.size() - 1).isInteger()) &&
(!olist.at(olist.size() - 2).isIndirect()) &&
(olist.at(olist.size() - 2).isInteger())) {
if (context == nullptr) {
QTC::TC(
"qpdf",
"QPDFObjectHandle indirect without context");
throw std::logic_error(
"QPDFObjectHandle::parse called without context"
" on an object with indirect references");
}
// Try to resolve indirect objects
object = newIndirect(
context,
QPDFObjGen(
olist.at(olist.size() - 2).getIntValueAsInt(),
olist.at(olist.size() - 1).getIntValueAsInt()));
olist.remove_last();
olist.remove_last();
} else if ((value == "endobj") && (state == st_top)) {
// We just saw endobj without having read
// anything. Treat this as a null and do not move
// the input source's offset.
object = newNull();
input->seek(input->getLastOffset(), SEEK_SET);
empty = true;
} else {
QTC::TC("qpdf", "QPDFObjectHandle treat word as string");
warn(
context,
QPDFExc(
qpdf_e_damaged_pdf,
input->getName(),
object_description,
input->getLastOffset(),
"unknown token while reading object;"
" treating as string"));
bad = true;
object = newString(value);
}
}
break;
case QPDFTokenizer::tt_string:
{
std::string val = token.getValue();
if (decrypter) {
if (b_contents) {
contents_string = val;
contents_offset = input->getLastOffset();
b_contents = false;
}
decrypter->decryptString(val);
}
object = QPDFObjectHandle::newString(val);
}
break;
default:
warn(
context,
QPDFExc(
qpdf_e_damaged_pdf,
input->getName(),
object_description,
input->getLastOffset(),
"treating unknown token type as null while "
"reading object"));
bad = true;
object = newNull();
break;
}
if ((!object.isInitialized()) &&
(!((state == st_start) || (state == st_stop) ||
(state == st_eof)))) {
throw std::logic_error("QPDFObjectHandle::parseInternal: "
"unexpected uninitialized object");
object = newNull();
}
if (bad) {
++bad_count;
good_count = 0;
} else {
++good_count;
if (good_count > 3) {
bad_count = 0;
}
}
if (bad_count > 5) {
// We had too many consecutive errors without enough
// intervening successful objects. Give up.
warn(
context,
QPDFExc(
qpdf_e_damaged_pdf,
input->getName(),
object_description,
input->getLastOffset(),
"too many errors; giving up on reading object"));
state = st_top;
object = newNull();
}
switch (state) {
case st_eof:
if (state_stack.size() > 1) {
warn(
context,
QPDFExc(
qpdf_e_damaged_pdf,
input->getName(),
object_description,
input->getLastOffset(),
"parse error while reading object"));
}
done = true;
// In content stream mode, leave object uninitialized to
// indicate EOF
if (!content_stream) {
object = newNull();
}
break;
case st_dictionary:
case st_array:
setObjectDescriptionFromInput(
object,
context,
object_description,
input,
input->getLastOffset());
object.setParsedOffset(input->getLastOffset());
set_offset = true;
olist.append(object);
break;
case st_top:
done = true;
break;
case st_start:
break;
case st_stop:
if ((state_stack.size() < 2) || (olist_stack.size() < 2)) {
throw std::logic_error(
"QPDFObjectHandle::parseInternal: st_stop encountered"
" with insufficient elements in stack");
}
parser_state_e old_state = state_stack.back();
state_stack.pop_back();
if (old_state == st_array) {
// There's no newArray(SparseOHArray) since
// SparseOHArray is not part of the public API.
object = QPDFObjectHandle(QPDF_Array::create(olist));
setObjectDescriptionFromInput(
object, context, object_description, input, offset);
// The `offset` points to the next of "[". Set the
// rewind offset to point to the beginning of "[".
// This has been explicitly tested with whitespace
// surrounding the array start delimiter.
// getLastOffset points to the array end token and
// therefore can't be used here.
object.setParsedOffset(offset - 1);
set_offset = true;
} else if (old_state == st_dictionary) {
// Convert list to map. Alternating elements are keys.
// Attempt to recover more or less gracefully from
// invalid dictionaries.
std::set<std::string> names;
size_t n_elements = olist.size();
for (size_t i = 0; i < n_elements; ++i) {
QPDFObjectHandle oh = olist.at(i);
if ((!oh.isIndirect()) && oh.isName()) {
names.insert(oh.getName());
}
}
std::map<std::string, QPDFObjectHandle> dict;
int next_fake_key = 1;
for (unsigned int i = 0; i < olist.size(); ++i) {
QPDFObjectHandle key_obj = olist.at(i);
QPDFObjectHandle val;
if (key_obj.isIndirect() || (!key_obj.isName())) {
bool found_fake = false;
std::string candidate;
while (!found_fake) {
candidate = "/QPDFFake" +
QUtil::int_to_string(next_fake_key++);
found_fake = (names.count(candidate) == 0);
QTC::TC(
"qpdf",
"QPDFObjectHandle found fake",
(found_fake ? 0 : 1));
}
warn(
context,
QPDFExc(
qpdf_e_damaged_pdf,
input->getName(),
object_description,
offset,
"expected dictionary key but found"
" non-name object; inserting key " +
candidate));
val = key_obj;
key_obj = newName(candidate);
} else if (i + 1 >= olist.size()) {
QTC::TC("qpdf", "QPDFObjectHandle no val for last key");
warn(
context,
QPDFExc(
qpdf_e_damaged_pdf,
input->getName(),
object_description,
offset,
"dictionary ended prematurely; "
"using null as value for last key"));
val = newNull();
setObjectDescriptionFromInput(
val, context, object_description, input, offset);
} else {
val = olist.at(++i);
}
std::string key = key_obj.getName();
if (dict.count(key) > 0) {
QTC::TC("qpdf", "QPDFObjectHandle duplicate dict key");
warn(
context,
QPDFExc(
qpdf_e_damaged_pdf,
input->getName(),
object_description,
offset,
"dictionary has duplicated key " + key +
"; last occurrence overrides earlier "
"ones"));
}
dict[key] = val;
}
if (!contents_string.empty() && dict.count("/Type") &&
dict["/Type"].isNameAndEquals("/Sig") &&
dict.count("/ByteRange") && dict.count("/Contents") &&
dict["/Contents"].isString()) {
dict["/Contents"] =
QPDFObjectHandle::newString(contents_string);
dict["/Contents"].setParsedOffset(contents_offset);
}
object = newDictionary(dict);
setObjectDescriptionFromInput(
object, context, object_description, input, offset);
// The `offset` points to the next of "<<". Set the
// rewind offset to point to the beginning of "<<".
// This has been explicitly tested with whitespace
// surrounding the dictionary start delimiter.
// getLastOffset points to the dictionary end token
// and therefore can't be used here.
object.setParsedOffset(offset - 2);
set_offset = true;
}
olist_stack.pop_back();
offset_stack.pop_back();
if (state_stack.back() == st_top) {
done = true;
} else {
olist_stack.back().append(object);
}
contents_string_stack.pop_back();
contents_offset_stack.pop_back();
}
}
if (!set_offset) {
setObjectDescriptionFromInput(
object, context, object_description, input, offset);
object.setParsedOffset(offset);
}
return object;
return QPDFParser(input, object_description, tokenizer, decrypter, context)
.parse(empty, false);
}
qpdf_offset_t

503
libqpdf/QPDFParser.cc Normal file
View File

@ -0,0 +1,503 @@
#include <qpdf/QPDFParser.hh>
#include <qpdf/QPDF.hh>
#include <qpdf/QPDFObjectHandle.hh>
#include <qpdf/QPDF_Array.hh>
#include <qpdf/QTC.hh>
#include <qpdf/QUtil.hh>
#include <qpdf/SparseOHArray.hh>
// Read tokens from `input` through `tokenizer` until one complete
// object has been read, and return a handle to that object.
//
// empty: output flag. It is cleared on entry and set to true only
//     when, outside content-stream mode, the word "endobj" is seen at
//     the top level. In that case a null object is returned and the
//     input is repositioned to the offset of that token.
// content_stream: when true, bare words are returned as operators, an
//     EOF does not produce an "unexpected EOF" warning, and an EOF is
//     reported to the caller by returning an uninitialized handle.
//
// Arrays and dictionaries are handled without recursion: for every
// container that is currently open there is one entry on each of the
// parallel stacks declared below.
//
// Damaged input is reported through warn(). A std::logic_error is
// thrown if an indirect reference is found while `context` is null,
// or if an internal invariant of the parser is violated.
QPDFObjectHandle
QPDFParser::parse(bool& empty, bool content_stream)
{
    // This method must take care not to resolve any objects. Don't
    // check the type of any object without first ensuring that it is
    // a direct object. Otherwise, doing so may have the side effect
    // of reading the object and changing the file pointer. If you do
    // this, it will cause a logic error to be thrown from
    // QPDF::inParse().

    QPDF::ParseGuard pg(context);

    empty = false;

    QPDFObjectHandle object;
    bool set_offset = false;

    // Parallel stacks; the bottom entry of each belongs to the top
    // level and one more entry is pushed per open container.
    std::vector<SparseOHArray> olist_stack;
    olist_stack.push_back(SparseOHArray());
    std::vector<parser_state_e> state_stack;
    state_stack.push_back(st_top);
    std::vector<qpdf_offset_t> offset_stack;
    qpdf_offset_t offset = input->tell();
    offset_stack.push_back(offset);
    bool done = false;
    int bad_count = 0;
    int good_count = 0;
    // True while the most recently read name was /Contents, so that
    // the string that follows it can be captured before decryption.
    bool b_contents = false;
    std::vector<std::string> contents_string_stack;
    contents_string_stack.push_back("");
    std::vector<qpdf_offset_t> contents_offset_stack;
    contents_offset_stack.push_back(-1);
    while (!done) {
        bool bad = false;
        // These refer to the innermost open container. `state` is a
        // per-iteration copy; the cases below overwrite it to steer
        // the second switch without touching state_stack.
        SparseOHArray& olist = olist_stack.back();
        parser_state_e state = state_stack.back();
        offset = offset_stack.back();
        std::string& contents_string = contents_string_stack.back();
        qpdf_offset_t& contents_offset = contents_offset_stack.back();

        object = QPDFObjectHandle();
        set_offset = false;

        QPDFTokenizer::Token token =
            tokenizer.readToken(input, object_description, true);
        std::string const& token_error_message = token.getErrorMessage();
        if (!token_error_message.empty()) {
            // Tokens other than tt_bad can still generate warnings.
            warn(
                context,
                QPDFExc(
                    qpdf_e_damaged_pdf,
                    input->getName(),
                    object_description,
                    input->getLastOffset(),
                    token_error_message));
        }

        // First switch: turn the token into an object and/or a state
        // change.
        switch (token.getType()) {
        case QPDFTokenizer::tt_eof:
            if (!content_stream) {
                QTC::TC("qpdf", "QPDFParser eof in parse");
                warn(
                    context,
                    QPDFExc(
                        qpdf_e_damaged_pdf,
                        input->getName(),
                        object_description,
                        input->getLastOffset(),
                        "unexpected EOF"));
            }
            bad = true;
            state = st_eof;
            break;

        case QPDFTokenizer::tt_bad:
            QTC::TC("qpdf", "QPDFParser bad token in parse");
            bad = true;
            object = QPDFObjectHandle::newNull();
            break;

        case QPDFTokenizer::tt_brace_open:
        case QPDFTokenizer::tt_brace_close:
            QTC::TC("qpdf", "QPDFParser bad brace");
            warn(
                context,
                QPDFExc(
                    qpdf_e_damaged_pdf,
                    input->getName(),
                    object_description,
                    input->getLastOffset(),
                    "treating unexpected brace token as null"));
            bad = true;
            object = QPDFObjectHandle::newNull();
            break;

        case QPDFTokenizer::tt_array_close:
            if (state == st_array) {
                state = st_stop;
            } else {
                QTC::TC("qpdf", "QPDFParser bad array close");
                warn(
                    context,
                    QPDFExc(
                        qpdf_e_damaged_pdf,
                        input->getName(),
                        object_description,
                        input->getLastOffset(),
                        "treating unexpected array close token as null"));
                bad = true;
                object = QPDFObjectHandle::newNull();
            }
            break;

        case QPDFTokenizer::tt_dict_close:
            if (state == st_dictionary) {
                state = st_stop;
            } else {
                QTC::TC("qpdf", "QPDFParser bad dictionary close");
                warn(
                    context,
                    QPDFExc(
                        qpdf_e_damaged_pdf,
                        input->getName(),
                        object_description,
                        input->getLastOffset(),
                        "unexpected dictionary close token"));
                bad = true;
                object = QPDFObjectHandle::newNull();
            }
            break;

        case QPDFTokenizer::tt_array_open:
        case QPDFTokenizer::tt_dict_open:
            if (olist_stack.size() > 500) {
                QTC::TC("qpdf", "QPDFParser too deep");
                warn(
                    context,
                    QPDFExc(
                        qpdf_e_damaged_pdf,
                        input->getName(),
                        object_description,
                        input->getLastOffset(),
                        "ignoring excessively deeply nested data structure"));
                bad = true;
                object = QPDFObjectHandle::newNull();
                state = st_top;
            } else {
                // Pushing may reallocate the stacks, which invalidates
                // olist, contents_string and contents_offset. None of
                // them is used again in this iteration because state
                // becomes st_start.
                olist_stack.push_back(SparseOHArray());
                state = st_start;
                offset_stack.push_back(input->tell());
                state_stack.push_back(
                    (token.getType() == QPDFTokenizer::tt_array_open)
                        ? st_array
                        : st_dictionary);
                b_contents = false;
                contents_string_stack.push_back("");
                contents_offset_stack.push_back(-1);
            }
            break;

        case QPDFTokenizer::tt_bool:
            object = QPDFObjectHandle::newBool((token.getValue() == "true"));
            break;

        case QPDFTokenizer::tt_null:
            object = QPDFObjectHandle::newNull();
            break;

        case QPDFTokenizer::tt_integer:
            object = QPDFObjectHandle::newInteger(
                QUtil::string_to_ll(token.getValue().c_str()));
            break;

        case QPDFTokenizer::tt_real:
            object = QPDFObjectHandle::newReal(token.getValue());
            break;

        case QPDFTokenizer::tt_name:
            {
                std::string const& name = token.getValue();
                object = QPDFObjectHandle::newName(name);
                b_contents = (name == "/Contents");
            }
            break;

        case QPDFTokenizer::tt_word:
            {
                std::string const& value = token.getValue();
                if (content_stream) {
                    object = QPDFObjectHandle::newOperator(value);
                } else if (
                    (value == "R") && (state != st_top) &&
                    (olist.size() >= 2) &&
                    (!olist.at(olist.size() - 1).isIndirect()) &&
                    (olist.at(olist.size() - 1).isInteger()) &&
                    (!olist.at(olist.size() - 2).isIndirect()) &&
                    (olist.at(olist.size() - 2).isInteger())) {
                    // "<int> <int> R" inside a container: replace the
                    // two direct integers already collected with a
                    // single indirect reference.
                    if (context == nullptr) {
                        QTC::TC("qpdf", "QPDFParser indirect without context");
                        throw std::logic_error(
                            "QPDFObjectHandle::parse called without context"
                            " on an object with indirect references");
                    }
                    // Try to resolve indirect objects
                    object = QPDFObjectHandle::newIndirect(
                        context,
                        QPDFObjGen(
                            olist.at(olist.size() - 2).getIntValueAsInt(),
                            olist.at(olist.size() - 1).getIntValueAsInt()));
                    olist.remove_last();
                    olist.remove_last();
                } else if ((value == "endobj") && (state == st_top)) {
                    // We just saw endobj without having read
                    // anything. Treat this as a null and do not move
                    // the input source's offset.
                    object = QPDFObjectHandle::newNull();
                    input->seek(input->getLastOffset(), SEEK_SET);
                    empty = true;
                } else {
                    QTC::TC("qpdf", "QPDFParser treat word as string");
                    warn(
                        context,
                        QPDFExc(
                            qpdf_e_damaged_pdf,
                            input->getName(),
                            object_description,
                            input->getLastOffset(),
                            "unknown token while reading object;"
                            " treating as string"));
                    bad = true;
                    object = QPDFObjectHandle::newString(value);
                }
            }
            break;

        case QPDFTokenizer::tt_string:
            {
                // Copy on purpose: decryptString() modifies its
                // argument in place.
                std::string val = token.getValue();
                if (decrypter) {
                    if (b_contents) {
                        // Keep the still-encrypted value and where it
                        // was found; it is put back below if this
                        // turns out to be a signature dictionary.
                        contents_string = val;
                        contents_offset = input->getLastOffset();
                        b_contents = false;
                    }
                    decrypter->decryptString(val);
                }
                object = QPDFObjectHandle::newString(val);
            }
            break;

        default:
            warn(
                context,
                QPDFExc(
                    qpdf_e_damaged_pdf,
                    input->getName(),
                    object_description,
                    input->getLastOffset(),
                    "treating unknown token type as null while "
                    "reading object"));
            bad = true;
            object = QPDFObjectHandle::newNull();
            break;
        }

        // Only a container open, a container close or EOF may leave
        // `object` uninitialized at this point.
        if ((!object.isInitialized()) &&
            (!((state == st_start) || (state == st_stop) ||
               (state == st_eof)))) {
            throw std::logic_error("QPDFParser::parse: "
                                   "unexpected uninitialized object");
        }

        // Error budget: every bad token counts, and the count is only
        // forgiven after more than three good tokens in a row.
        if (bad) {
            ++bad_count;
            good_count = 0;
        } else {
            ++good_count;
            if (good_count > 3) {
                bad_count = 0;
            }
        }
        if (bad_count > 5) {
            // We had too many consecutive errors without enough
            // intervening successful objects. Give up.
            warn(
                context,
                QPDFExc(
                    qpdf_e_damaged_pdf,
                    input->getName(),
                    object_description,
                    input->getLastOffset(),
                    "too many errors; giving up on reading object"));
            state = st_top;
            object = QPDFObjectHandle::newNull();
        }

        // Second switch: store or finish the object according to the
        // (possibly overridden) state.
        switch (state) {
        case st_eof:
            if (state_stack.size() > 1) {
                warn(
                    context,
                    QPDFExc(
                        qpdf_e_damaged_pdf,
                        input->getName(),
                        object_description,
                        input->getLastOffset(),
                        "parse error while reading object"));
            }
            done = true;
            // In content stream mode, leave object uninitialized to
            // indicate EOF
            if (!content_stream) {
                object = QPDFObjectHandle::newNull();
            }
            break;

        case st_dictionary:
        case st_array:
            QPDFObjectHandle::setObjectDescriptionFromInput(
                object,
                context,
                object_description,
                input,
                input->getLastOffset());
            object.setParsedOffset(input->getLastOffset());
            set_offset = true;
            olist.append(object);
            break;

        case st_top:
            done = true;
            break;

        case st_start:
            break;

        case st_stop:
            if ((state_stack.size() < 2) || (olist_stack.size() < 2)) {
                throw std::logic_error(
                    "QPDFParser::parse: st_stop encountered"
                    " with insufficient elements in stack");
            }
            parser_state_e old_state = state_stack.back();
            state_stack.pop_back();
            if (old_state == st_array) {
                // There's no newArray(SparseOHArray) since
                // SparseOHArray is not part of the public API.
                object = QPDFObjectHandle(QPDF_Array::create(olist));
                QPDFObjectHandle::setObjectDescriptionFromInput(
                    object, context, object_description, input, offset);
                // `offset` was recorded just after the "[" token was
                // read, so step back one byte to make the rewind
                // offset point at the "[" itself. This has been
                // explicitly tested with whitespace surrounding the
                // array start delimiter. getLastOffset points to the
                // array end token and therefore can't be used here.
                object.setParsedOffset(offset - 1);
                set_offset = true;
            } else if (old_state == st_dictionary) {
                // Convert list to map. Alternating elements are keys.
                // Attempt to recover more or less gracefully from
                // invalid dictionaries.

                // Collect every direct name in the list so that
                // generated keys can avoid colliding with them.
                std::set<std::string> names;
                size_t n_elements = olist.size();
                for (size_t i = 0; i < n_elements; ++i) {
                    QPDFObjectHandle oh = olist.at(i);
                    if ((!oh.isIndirect()) && oh.isName()) {
                        names.insert(oh.getName());
                    }
                }

                std::map<std::string, QPDFObjectHandle> dict;
                int next_fake_key = 1;
                for (size_t i = 0; i < olist.size(); ++i) {
                    QPDFObjectHandle key_obj = olist.at(i);
                    QPDFObjectHandle val;
                    if (key_obj.isIndirect() || (!key_obj.isName())) {
                        // Not a usable key: keep the object as a value
                        // under a generated /QPDFFake<n> key.
                        bool found_fake = false;
                        std::string candidate;
                        while (!found_fake) {
                            candidate = "/QPDFFake" +
                                QUtil::int_to_string(next_fake_key++);
                            found_fake = (names.count(candidate) == 0);
                            QTC::TC(
                                "qpdf",
                                "QPDFParser found fake",
                                (found_fake ? 0 : 1));
                        }
                        warn(
                            context,
                            QPDFExc(
                                qpdf_e_damaged_pdf,
                                input->getName(),
                                object_description,
                                offset,
                                "expected dictionary key but found"
                                " non-name object; inserting key " +
                                    candidate));
                        val = key_obj;
                        key_obj = QPDFObjectHandle::newName(candidate);
                    } else if (i + 1 >= olist.size()) {
                        QTC::TC("qpdf", "QPDFParser no val for last key");
                        warn(
                            context,
                            QPDFExc(
                                qpdf_e_damaged_pdf,
                                input->getName(),
                                object_description,
                                offset,
                                "dictionary ended prematurely; "
                                "using null as value for last key"));
                        val = QPDFObjectHandle::newNull();
                        QPDFObjectHandle::setObjectDescriptionFromInput(
                            val, context, object_description, input, offset);
                    } else {
                        val = olist.at(++i);
                    }
                    std::string key = key_obj.getName();
                    if (dict.count(key) > 0) {
                        QTC::TC("qpdf", "QPDFParser duplicate dict key");
                        warn(
                            context,
                            QPDFExc(
                                qpdf_e_damaged_pdf,
                                input->getName(),
                                object_description,
                                offset,
                                "dictionary has duplicated key " + key +
                                    "; last occurrence overrides earlier "
                                    "ones"));
                    }
                    dict[key] = val;
                }
                // For a /Type /Sig dictionary that has /ByteRange and
                // a string /Contents, replace the decrypted /Contents
                // with the value captured before decryption.
                if (!contents_string.empty() && dict.count("/Type") &&
                    dict["/Type"].isNameAndEquals("/Sig") &&
                    dict.count("/ByteRange") && dict.count("/Contents") &&
                    dict["/Contents"].isString()) {
                    dict["/Contents"] =
                        QPDFObjectHandle::newString(contents_string);
                    dict["/Contents"].setParsedOffset(contents_offset);
                }
                object = QPDFObjectHandle::newDictionary(dict);
                QPDFObjectHandle::setObjectDescriptionFromInput(
                    object, context, object_description, input, offset);
                // `offset` was recorded just after the "<<" token was
                // read, so step back two bytes to make the rewind
                // offset point at the "<<" itself. This has been
                // explicitly tested with whitespace surrounding the
                // dictionary start delimiter. getLastOffset points to
                // the dictionary end token and therefore can't be
                // used here.
                object.setParsedOffset(offset - 2);
                set_offset = true;
            }
            olist_stack.pop_back();
            offset_stack.pop_back();
            if (state_stack.back() == st_top) {
                done = true;
            } else {
                // The finished container is itself an element of the
                // container that encloses it.
                olist_stack.back().append(object);
            }
            contents_string_stack.pop_back();
            contents_offset_stack.pop_back();
        }
    }

    // Objects that did not get a description and offset inside the
    // loop get them here, using the last offset taken from the stack.
    if (!set_offset) {
        QPDFObjectHandle::setObjectDescriptionFromInput(
            object, context, object_description, input, offset);
        object.setParsedOffset(offset);
    }
    return object;
}
// Report damage found while parsing. When the parse is being done for
// a QPDF object the problem is recorded as a warning on that object.
// Without one (for example when an object is being created from a
// string) there is nowhere to record it, so the exception is thrown
// to the caller instead.
void
QPDFParser::warn(QPDF* qpdf, QPDFExc const& e)
{
    if (qpdf == nullptr) {
        throw e;
    }
    qpdf->warn(e);
}

View File

@ -0,0 +1,50 @@
#ifndef QPDFPARSER_HH
#define QPDFPARSER_HH
#include <qpdf/QPDFObjectHandle.hh>
#include <memory>
#include <string>
// Reads a single object from an input source using a QPDFTokenizer.
//
// `object_description` and `tokenizer` are stored as references, so
// the arguments passed to the constructor must remain valid for as
// long as the parser is used. Constructing a temporary parser and
// calling parse() in the same expression satisfies this.
class QPDFParser
{
  public:
    QPDFParser() = delete;
    // input: source that tokens are read from.
    // object_description: text passed to the tokenizer and used in
    //     the warnings and exceptions that parse() generates.
    // tokenizer: tokenizer used to read from `input`.
    // decrypter: may be null; when set, parse() passes every string
    //     it reads through decryptString().
    // context: may be null; when set, damage found by parse() is
    //     reported to it as a warning instead of being thrown.
    QPDFParser(
        std::shared_ptr<InputSource> input,
        std::string const& object_description,
        QPDFTokenizer& tokenizer,
        QPDFObjectHandle::StringDecrypter* decrypter,
        QPDF* context) :
        input(input),
        object_description(object_description),
        tokenizer(tokenizer),
        decrypter(decrypter),
        context(context)
    {
    }
    virtual ~QPDFParser() = default;

    // Read and return one object. `empty` is an output flag and
    // `content_stream` selects content-stream handling; both are
    // described with the definition of parse().
    QPDFObjectHandle parse(bool& empty, bool content_stream);

  private:
    // State of the innermost open container while parsing.
    enum parser_state_e {
        st_top,        // not inside any array or dictionary
        st_start,      // an array or dictionary has just been opened
        st_stop,       // the current array or dictionary has just closed
        st_eof,        // end of input was reached
        st_dictionary, // reading the contents of a dictionary
        st_array       // reading the contents of an array
    };

    // Warn through `qpdf` when it is non-null; otherwise throw `e`.
    static void warn(QPDF*, QPDFExc const&);

    std::shared_ptr<InputSource> input;
    std::string const& object_description; // not owned; see class comment
    QPDFTokenizer& tokenizer;              // not owned; see class comment
    QPDFObjectHandle::StringDecrypter* decrypter; // may be null
    QPDF* context;                                // may be null
};
#endif // QPDFPARSER_HH

View File

@ -56,12 +56,12 @@ QPDF missing trailer 0
QPDF trailer lacks size 0
QPDF trailer size not integer 0
QPDF trailer prev not integer 0
QPDFObjectHandle bad brace 0
QPDFObjectHandle bad array close 0
QPDFParser bad brace 0
QPDFParser bad array close 0
QPDF stream without length 0
QPDF stream length not integer 0
QPDF missing endstream 0
QPDFObjectHandle bad dictionary close 0
QPDFParser bad dictionary close 0
QPDF can't find xref 0
QPDFTokenizer bad ) 0
QPDFTokenizer bad > 0
@ -215,7 +215,7 @@ QPDF not copying pages object 0
QPDF insert foreign page 0
QPDFWriter foreign object 0
QPDFWriter copy use_aes 1
QPDFObjectHandle indirect without context 0
QPDFParser indirect without context 0
QPDFObjectHandle trailing data in parse 0
QPDFJob pages encryption password 0
QPDFTokenizer EOF reading token 0
@ -257,9 +257,9 @@ qpdf-c called qpdf_set_deterministic_ID 0
QPDFObjectHandle indirect with 0 objid 0
QPDF object id 0 0
QPDF recursion loop in resolve 0
QPDFObjectHandle treat word as string 0
QPDFObjectHandle found fake 1
QPDFObjectHandle no val for last key 0
QPDFParser treat word as string 0
QPDFParser found fake 1
QPDFParser no val for last key 0
QPDF resolve failure to null 0
QPDFWriter preserve unreferenced standard 0
QPDFObjectHandle errors in parsecontent 0
@ -288,8 +288,8 @@ QPDFObjectHandle non-stream in stream array 0
QPDFObjectHandle coalesce called on stream 0
QPDFObjectHandle coalesce provide stream data 0
QPDF_Stream bad token at end during normalize 0
QPDFObjectHandle bad token in parse 0
QPDFObjectHandle eof in parseInternal 0
QPDFParser bad token in parse 0
QPDFParser eof in parse 0
QPDFObjectHandle array bounds 0
QPDFObjectHandle boolean returning false 0
QPDFObjectHandle integer returning 0 0
@ -317,7 +317,7 @@ QPDFObjectHandle numeric non-numeric 0
QPDFObjectHandle erase array bounds 0
qpdf-c called qpdf_check_pdf 0
QPDF xref loop 0
QPDFObjectHandle too deep 0
QPDFParser too deep 0
QPDFFormFieldObjectHelper non-trivial inheritance 0
QPDFFormFieldObjectHelper non-trivial qualified name 0
QPDFFormFieldObjectHelper TU present 0
@ -428,7 +428,7 @@ QPDF eof skipping spaces before xref 1
QPDF_encryption user matches owner V < 5 0
QPDF_encryption same password 1
QPDFWriter stream in ostream 0
QPDFObjectHandle duplicate dict key 0
QPDFParser duplicate dict key 0
QPDFWriter no encryption sig contents 0
QPDFPageObjectHelper colorspace lookup 0
QPDFWriter ignore XRef in qdf mode 0