mirror of
https://github.com/qpdf/qpdf.git
synced 2025-01-31 02:48:31 +00:00
Move QPDFObjectHandle::parseInternal to new class QPDFParser
Part of #729
This commit is contained in:
parent
0adfd74f8b
commit
6670c685ab
@ -49,6 +49,7 @@ class QPDF_Stream;
|
||||
class BitStream;
|
||||
class BitWriter;
|
||||
class QPDFLogger;
|
||||
class QPDFParser;
|
||||
|
||||
class QPDF
|
||||
{
|
||||
@ -881,7 +882,7 @@ class QPDF
|
||||
// resolution
|
||||
class ParseGuard
|
||||
{
|
||||
friend class QPDFObjectHandle;
|
||||
friend class QPDFParser;
|
||||
|
||||
private:
|
||||
ParseGuard(QPDF* qpdf) :
|
||||
|
@ -49,9 +49,12 @@ class QPDFTokenizer;
|
||||
class QPDFExc;
|
||||
class Pl_QPDFTokenizer;
|
||||
class QPDFMatrix;
|
||||
class QPDFParser;
|
||||
|
||||
class QPDFObjectHandle
|
||||
{
|
||||
friend class QPDFParser;
|
||||
|
||||
public:
|
||||
// This class is used by replaceStreamData. It provides an
|
||||
// alternative way of associating stream data with a stream. See
|
||||
@ -1563,15 +1566,6 @@ class QPDFObjectHandle
|
||||
QPDFObjectHandle(QPDF*, QPDFObjGen const& og);
|
||||
QPDFObjectHandle(std::shared_ptr<QPDFObject> const&);
|
||||
|
||||
enum parser_state_e {
|
||||
st_top,
|
||||
st_start,
|
||||
st_stop,
|
||||
st_eof,
|
||||
st_dictionary,
|
||||
st_array
|
||||
};
|
||||
|
||||
// Private object factory methods
|
||||
static QPDFObjectHandle newIndirect(QPDF*, QPDFObjGen const& og);
|
||||
static QPDFObjectHandle newStream(
|
||||
@ -1599,14 +1593,7 @@ class QPDFObjectHandle
|
||||
std::string const&,
|
||||
std::shared_ptr<InputSource>,
|
||||
qpdf_offset_t);
|
||||
static QPDFObjectHandle parseInternal(
|
||||
std::shared_ptr<InputSource> input,
|
||||
std::string const& object_description,
|
||||
QPDFTokenizer& tokenizer,
|
||||
bool& empty,
|
||||
StringDecrypter* decrypter,
|
||||
QPDF* context,
|
||||
bool content_stream);
|
||||
|
||||
void setParsedOffset(qpdf_offset_t offset);
|
||||
void parseContentStream_internal(
|
||||
std::string const& description, ParserCallbacks* callbacks);
|
||||
|
@ -80,6 +80,7 @@ set(libqpdf_SOURCES
|
||||
QPDFPageDocumentHelper.cc
|
||||
QPDFPageLabelDocumentHelper.cc
|
||||
QPDFPageObjectHelper.cc
|
||||
QPDFParser.cc
|
||||
QPDFStreamFilter.cc
|
||||
QPDFSystemError.cc
|
||||
QPDFTokenizer.cc
|
||||
|
@ -8,6 +8,7 @@
|
||||
#include <qpdf/QPDFLogger.hh>
|
||||
#include <qpdf/QPDFMatrix.hh>
|
||||
#include <qpdf/QPDFPageObjectHelper.hh>
|
||||
#include <qpdf/QPDFParser.hh>
|
||||
#include <qpdf/QPDF_Array.hh>
|
||||
#include <qpdf/QPDF_Bool.hh>
|
||||
#include <qpdf/QPDF_Dictionary.hh>
|
||||
@ -1879,8 +1880,8 @@ QPDFObjectHandle::parseContentStream_data(
|
||||
tokenizer.readToken(input, "content", true);
|
||||
qpdf_offset_t offset = input->getLastOffset();
|
||||
input->seek(offset, SEEK_SET);
|
||||
QPDFObjectHandle obj = parseInternal(
|
||||
input, "content", tokenizer, empty, nullptr, context, true);
|
||||
auto obj = QPDFParser(input, "content", tokenizer, nullptr, context)
|
||||
.parse(empty, true);
|
||||
if (!obj.isInitialized()) {
|
||||
// EOF
|
||||
break;
|
||||
@ -1943,497 +1944,8 @@ QPDFObjectHandle::parse(
|
||||
StringDecrypter* decrypter,
|
||||
QPDF* context)
|
||||
{
|
||||
return parseInternal(
|
||||
input, object_description, tokenizer, empty, decrypter, context, false);
|
||||
}
|
||||
|
||||
QPDFObjectHandle
|
||||
QPDFObjectHandle::parseInternal(
|
||||
std::shared_ptr<InputSource> input,
|
||||
std::string const& object_description,
|
||||
QPDFTokenizer& tokenizer,
|
||||
bool& empty,
|
||||
StringDecrypter* decrypter,
|
||||
QPDF* context,
|
||||
bool content_stream)
|
||||
{
|
||||
// This method must take care not to resolve any objects. Don't
|
||||
// check the type of any object without first ensuring that it is
|
||||
// a direct object. Otherwise, doing so may have the side effect
|
||||
// of reading the object and changing the file pointer. If you do
|
||||
// this, it will cause a logic error to be thrown from
|
||||
// QPDF::inParse().
|
||||
|
||||
QPDF::ParseGuard pg(context);
|
||||
|
||||
empty = false;
|
||||
|
||||
QPDFObjectHandle object;
|
||||
bool set_offset = false;
|
||||
|
||||
std::vector<SparseOHArray> olist_stack;
|
||||
olist_stack.push_back(SparseOHArray());
|
||||
std::vector<parser_state_e> state_stack;
|
||||
state_stack.push_back(st_top);
|
||||
std::vector<qpdf_offset_t> offset_stack;
|
||||
qpdf_offset_t offset = input->tell();
|
||||
offset_stack.push_back(offset);
|
||||
bool done = false;
|
||||
int bad_count = 0;
|
||||
int good_count = 0;
|
||||
bool b_contents = false;
|
||||
std::vector<std::string> contents_string_stack;
|
||||
contents_string_stack.push_back("");
|
||||
std::vector<qpdf_offset_t> contents_offset_stack;
|
||||
contents_offset_stack.push_back(-1);
|
||||
while (!done) {
|
||||
bool bad = false;
|
||||
SparseOHArray& olist = olist_stack.back();
|
||||
parser_state_e state = state_stack.back();
|
||||
offset = offset_stack.back();
|
||||
std::string& contents_string = contents_string_stack.back();
|
||||
qpdf_offset_t& contents_offset = contents_offset_stack.back();
|
||||
|
||||
object = QPDFObjectHandle();
|
||||
set_offset = false;
|
||||
|
||||
QPDFTokenizer::Token token =
|
||||
tokenizer.readToken(input, object_description, true);
|
||||
std::string const& token_error_message = token.getErrorMessage();
|
||||
if (!token_error_message.empty()) {
|
||||
// Tokens other than tt_bad can still generate warnings.
|
||||
warn(
|
||||
context,
|
||||
QPDFExc(
|
||||
qpdf_e_damaged_pdf,
|
||||
input->getName(),
|
||||
object_description,
|
||||
input->getLastOffset(),
|
||||
token_error_message));
|
||||
}
|
||||
|
||||
switch (token.getType()) {
|
||||
case QPDFTokenizer::tt_eof:
|
||||
if (!content_stream) {
|
||||
QTC::TC("qpdf", "QPDFObjectHandle eof in parseInternal");
|
||||
warn(
|
||||
context,
|
||||
QPDFExc(
|
||||
qpdf_e_damaged_pdf,
|
||||
input->getName(),
|
||||
object_description,
|
||||
input->getLastOffset(),
|
||||
"unexpected EOF"));
|
||||
}
|
||||
bad = true;
|
||||
state = st_eof;
|
||||
break;
|
||||
|
||||
case QPDFTokenizer::tt_bad:
|
||||
QTC::TC("qpdf", "QPDFObjectHandle bad token in parse");
|
||||
bad = true;
|
||||
object = newNull();
|
||||
break;
|
||||
|
||||
case QPDFTokenizer::tt_brace_open:
|
||||
case QPDFTokenizer::tt_brace_close:
|
||||
QTC::TC("qpdf", "QPDFObjectHandle bad brace");
|
||||
warn(
|
||||
context,
|
||||
QPDFExc(
|
||||
qpdf_e_damaged_pdf,
|
||||
input->getName(),
|
||||
object_description,
|
||||
input->getLastOffset(),
|
||||
"treating unexpected brace token as null"));
|
||||
bad = true;
|
||||
object = newNull();
|
||||
break;
|
||||
|
||||
case QPDFTokenizer::tt_array_close:
|
||||
if (state == st_array) {
|
||||
state = st_stop;
|
||||
} else {
|
||||
QTC::TC("qpdf", "QPDFObjectHandle bad array close");
|
||||
warn(
|
||||
context,
|
||||
QPDFExc(
|
||||
qpdf_e_damaged_pdf,
|
||||
input->getName(),
|
||||
object_description,
|
||||
input->getLastOffset(),
|
||||
"treating unexpected array close token as null"));
|
||||
bad = true;
|
||||
object = newNull();
|
||||
}
|
||||
break;
|
||||
|
||||
case QPDFTokenizer::tt_dict_close:
|
||||
if (state == st_dictionary) {
|
||||
state = st_stop;
|
||||
} else {
|
||||
QTC::TC("qpdf", "QPDFObjectHandle bad dictionary close");
|
||||
warn(
|
||||
context,
|
||||
QPDFExc(
|
||||
qpdf_e_damaged_pdf,
|
||||
input->getName(),
|
||||
object_description,
|
||||
input->getLastOffset(),
|
||||
"unexpected dictionary close token"));
|
||||
bad = true;
|
||||
object = newNull();
|
||||
}
|
||||
break;
|
||||
|
||||
case QPDFTokenizer::tt_array_open:
|
||||
case QPDFTokenizer::tt_dict_open:
|
||||
if (olist_stack.size() > 500) {
|
||||
QTC::TC("qpdf", "QPDFObjectHandle too deep");
|
||||
warn(
|
||||
context,
|
||||
QPDFExc(
|
||||
qpdf_e_damaged_pdf,
|
||||
input->getName(),
|
||||
object_description,
|
||||
input->getLastOffset(),
|
||||
"ignoring excessively deeply nested data structure"));
|
||||
bad = true;
|
||||
object = newNull();
|
||||
state = st_top;
|
||||
} else {
|
||||
olist_stack.push_back(SparseOHArray());
|
||||
state = st_start;
|
||||
offset_stack.push_back(input->tell());
|
||||
state_stack.push_back(
|
||||
(token.getType() == QPDFTokenizer::tt_array_open)
|
||||
? st_array
|
||||
: st_dictionary);
|
||||
b_contents = false;
|
||||
contents_string_stack.push_back("");
|
||||
contents_offset_stack.push_back(-1);
|
||||
}
|
||||
break;
|
||||
|
||||
case QPDFTokenizer::tt_bool:
|
||||
object = newBool((token.getValue() == "true"));
|
||||
break;
|
||||
|
||||
case QPDFTokenizer::tt_null:
|
||||
object = newNull();
|
||||
break;
|
||||
|
||||
case QPDFTokenizer::tt_integer:
|
||||
object = newInteger(QUtil::string_to_ll(token.getValue().c_str()));
|
||||
break;
|
||||
|
||||
case QPDFTokenizer::tt_real:
|
||||
object = newReal(token.getValue());
|
||||
break;
|
||||
|
||||
case QPDFTokenizer::tt_name:
|
||||
{
|
||||
std::string name = token.getValue();
|
||||
object = newName(name);
|
||||
|
||||
if (name == "/Contents") {
|
||||
b_contents = true;
|
||||
} else {
|
||||
b_contents = false;
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case QPDFTokenizer::tt_word:
|
||||
{
|
||||
std::string const& value = token.getValue();
|
||||
if (content_stream) {
|
||||
object = QPDFObjectHandle::newOperator(value);
|
||||
} else if (
|
||||
(value == "R") && (state != st_top) &&
|
||||
(olist.size() >= 2) &&
|
||||
(!olist.at(olist.size() - 1).isIndirect()) &&
|
||||
(olist.at(olist.size() - 1).isInteger()) &&
|
||||
(!olist.at(olist.size() - 2).isIndirect()) &&
|
||||
(olist.at(olist.size() - 2).isInteger())) {
|
||||
if (context == nullptr) {
|
||||
QTC::TC(
|
||||
"qpdf",
|
||||
"QPDFObjectHandle indirect without context");
|
||||
throw std::logic_error(
|
||||
"QPDFObjectHandle::parse called without context"
|
||||
" on an object with indirect references");
|
||||
}
|
||||
// Try to resolve indirect objects
|
||||
object = newIndirect(
|
||||
context,
|
||||
QPDFObjGen(
|
||||
olist.at(olist.size() - 2).getIntValueAsInt(),
|
||||
olist.at(olist.size() - 1).getIntValueAsInt()));
|
||||
olist.remove_last();
|
||||
olist.remove_last();
|
||||
} else if ((value == "endobj") && (state == st_top)) {
|
||||
// We just saw endobj without having read
|
||||
// anything. Treat this as a null and do not move
|
||||
// the input source's offset.
|
||||
object = newNull();
|
||||
input->seek(input->getLastOffset(), SEEK_SET);
|
||||
empty = true;
|
||||
} else {
|
||||
QTC::TC("qpdf", "QPDFObjectHandle treat word as string");
|
||||
warn(
|
||||
context,
|
||||
QPDFExc(
|
||||
qpdf_e_damaged_pdf,
|
||||
input->getName(),
|
||||
object_description,
|
||||
input->getLastOffset(),
|
||||
"unknown token while reading object;"
|
||||
" treating as string"));
|
||||
bad = true;
|
||||
object = newString(value);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case QPDFTokenizer::tt_string:
|
||||
{
|
||||
std::string val = token.getValue();
|
||||
if (decrypter) {
|
||||
if (b_contents) {
|
||||
contents_string = val;
|
||||
contents_offset = input->getLastOffset();
|
||||
b_contents = false;
|
||||
}
|
||||
decrypter->decryptString(val);
|
||||
}
|
||||
object = QPDFObjectHandle::newString(val);
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
default:
|
||||
warn(
|
||||
context,
|
||||
QPDFExc(
|
||||
qpdf_e_damaged_pdf,
|
||||
input->getName(),
|
||||
object_description,
|
||||
input->getLastOffset(),
|
||||
"treating unknown token type as null while "
|
||||
"reading object"));
|
||||
bad = true;
|
||||
object = newNull();
|
||||
break;
|
||||
}
|
||||
|
||||
if ((!object.isInitialized()) &&
|
||||
(!((state == st_start) || (state == st_stop) ||
|
||||
(state == st_eof)))) {
|
||||
throw std::logic_error("QPDFObjectHandle::parseInternal: "
|
||||
"unexpected uninitialized object");
|
||||
object = newNull();
|
||||
}
|
||||
|
||||
if (bad) {
|
||||
++bad_count;
|
||||
good_count = 0;
|
||||
} else {
|
||||
++good_count;
|
||||
if (good_count > 3) {
|
||||
bad_count = 0;
|
||||
}
|
||||
}
|
||||
if (bad_count > 5) {
|
||||
// We had too many consecutive errors without enough
|
||||
// intervening successful objects. Give up.
|
||||
warn(
|
||||
context,
|
||||
QPDFExc(
|
||||
qpdf_e_damaged_pdf,
|
||||
input->getName(),
|
||||
object_description,
|
||||
input->getLastOffset(),
|
||||
"too many errors; giving up on reading object"));
|
||||
state = st_top;
|
||||
object = newNull();
|
||||
}
|
||||
|
||||
switch (state) {
|
||||
case st_eof:
|
||||
if (state_stack.size() > 1) {
|
||||
warn(
|
||||
context,
|
||||
QPDFExc(
|
||||
qpdf_e_damaged_pdf,
|
||||
input->getName(),
|
||||
object_description,
|
||||
input->getLastOffset(),
|
||||
"parse error while reading object"));
|
||||
}
|
||||
done = true;
|
||||
// In content stream mode, leave object uninitialized to
|
||||
// indicate EOF
|
||||
if (!content_stream) {
|
||||
object = newNull();
|
||||
}
|
||||
break;
|
||||
|
||||
case st_dictionary:
|
||||
case st_array:
|
||||
setObjectDescriptionFromInput(
|
||||
object,
|
||||
context,
|
||||
object_description,
|
||||
input,
|
||||
input->getLastOffset());
|
||||
object.setParsedOffset(input->getLastOffset());
|
||||
set_offset = true;
|
||||
olist.append(object);
|
||||
break;
|
||||
|
||||
case st_top:
|
||||
done = true;
|
||||
break;
|
||||
|
||||
case st_start:
|
||||
break;
|
||||
|
||||
case st_stop:
|
||||
if ((state_stack.size() < 2) || (olist_stack.size() < 2)) {
|
||||
throw std::logic_error(
|
||||
"QPDFObjectHandle::parseInternal: st_stop encountered"
|
||||
" with insufficient elements in stack");
|
||||
}
|
||||
parser_state_e old_state = state_stack.back();
|
||||
state_stack.pop_back();
|
||||
if (old_state == st_array) {
|
||||
// There's no newArray(SparseOHArray) since
|
||||
// SparseOHArray is not part of the public API.
|
||||
object = QPDFObjectHandle(QPDF_Array::create(olist));
|
||||
setObjectDescriptionFromInput(
|
||||
object, context, object_description, input, offset);
|
||||
// The `offset` points to the next of "[". Set the
|
||||
// rewind offset to point to the beginning of "[".
|
||||
// This has been explicitly tested with whitespace
|
||||
// surrounding the array start delimiter.
|
||||
// getLastOffset points to the array end token and
|
||||
// therefore can't be used here.
|
||||
object.setParsedOffset(offset - 1);
|
||||
set_offset = true;
|
||||
} else if (old_state == st_dictionary) {
|
||||
// Convert list to map. Alternating elements are keys.
|
||||
// Attempt to recover more or less gracefully from
|
||||
// invalid dictionaries.
|
||||
std::set<std::string> names;
|
||||
size_t n_elements = olist.size();
|
||||
for (size_t i = 0; i < n_elements; ++i) {
|
||||
QPDFObjectHandle oh = olist.at(i);
|
||||
if ((!oh.isIndirect()) && oh.isName()) {
|
||||
names.insert(oh.getName());
|
||||
}
|
||||
}
|
||||
|
||||
std::map<std::string, QPDFObjectHandle> dict;
|
||||
int next_fake_key = 1;
|
||||
for (unsigned int i = 0; i < olist.size(); ++i) {
|
||||
QPDFObjectHandle key_obj = olist.at(i);
|
||||
QPDFObjectHandle val;
|
||||
if (key_obj.isIndirect() || (!key_obj.isName())) {
|
||||
bool found_fake = false;
|
||||
std::string candidate;
|
||||
while (!found_fake) {
|
||||
candidate = "/QPDFFake" +
|
||||
QUtil::int_to_string(next_fake_key++);
|
||||
found_fake = (names.count(candidate) == 0);
|
||||
QTC::TC(
|
||||
"qpdf",
|
||||
"QPDFObjectHandle found fake",
|
||||
(found_fake ? 0 : 1));
|
||||
}
|
||||
warn(
|
||||
context,
|
||||
QPDFExc(
|
||||
qpdf_e_damaged_pdf,
|
||||
input->getName(),
|
||||
object_description,
|
||||
offset,
|
||||
"expected dictionary key but found"
|
||||
" non-name object; inserting key " +
|
||||
candidate));
|
||||
val = key_obj;
|
||||
key_obj = newName(candidate);
|
||||
} else if (i + 1 >= olist.size()) {
|
||||
QTC::TC("qpdf", "QPDFObjectHandle no val for last key");
|
||||
warn(
|
||||
context,
|
||||
QPDFExc(
|
||||
qpdf_e_damaged_pdf,
|
||||
input->getName(),
|
||||
object_description,
|
||||
offset,
|
||||
"dictionary ended prematurely; "
|
||||
"using null as value for last key"));
|
||||
val = newNull();
|
||||
setObjectDescriptionFromInput(
|
||||
val, context, object_description, input, offset);
|
||||
} else {
|
||||
val = olist.at(++i);
|
||||
}
|
||||
std::string key = key_obj.getName();
|
||||
if (dict.count(key) > 0) {
|
||||
QTC::TC("qpdf", "QPDFObjectHandle duplicate dict key");
|
||||
warn(
|
||||
context,
|
||||
QPDFExc(
|
||||
qpdf_e_damaged_pdf,
|
||||
input->getName(),
|
||||
object_description,
|
||||
offset,
|
||||
"dictionary has duplicated key " + key +
|
||||
"; last occurrence overrides earlier "
|
||||
"ones"));
|
||||
}
|
||||
dict[key] = val;
|
||||
}
|
||||
if (!contents_string.empty() && dict.count("/Type") &&
|
||||
dict["/Type"].isNameAndEquals("/Sig") &&
|
||||
dict.count("/ByteRange") && dict.count("/Contents") &&
|
||||
dict["/Contents"].isString()) {
|
||||
dict["/Contents"] =
|
||||
QPDFObjectHandle::newString(contents_string);
|
||||
dict["/Contents"].setParsedOffset(contents_offset);
|
||||
}
|
||||
object = newDictionary(dict);
|
||||
setObjectDescriptionFromInput(
|
||||
object, context, object_description, input, offset);
|
||||
// The `offset` points to the next of "<<". Set the
|
||||
// rewind offset to point to the beginning of "<<".
|
||||
// This has been explicitly tested with whitespace
|
||||
// surrounding the dictionary start delimiter.
|
||||
// getLastOffset points to the dictionary end token
|
||||
// and therefore can't be used here.
|
||||
object.setParsedOffset(offset - 2);
|
||||
set_offset = true;
|
||||
}
|
||||
olist_stack.pop_back();
|
||||
offset_stack.pop_back();
|
||||
if (state_stack.back() == st_top) {
|
||||
done = true;
|
||||
} else {
|
||||
olist_stack.back().append(object);
|
||||
}
|
||||
contents_string_stack.pop_back();
|
||||
contents_offset_stack.pop_back();
|
||||
}
|
||||
}
|
||||
|
||||
if (!set_offset) {
|
||||
setObjectDescriptionFromInput(
|
||||
object, context, object_description, input, offset);
|
||||
object.setParsedOffset(offset);
|
||||
}
|
||||
return object;
|
||||
return QPDFParser(input, object_description, tokenizer, decrypter, context)
|
||||
.parse(empty, false);
|
||||
}
|
||||
|
||||
qpdf_offset_t
|
||||
|
503
libqpdf/QPDFParser.cc
Normal file
503
libqpdf/QPDFParser.cc
Normal file
@ -0,0 +1,503 @@
|
||||
#include <qpdf/QPDFParser.hh>
|
||||
|
||||
#include <qpdf/QPDF.hh>
|
||||
#include <qpdf/QPDFObjectHandle.hh>
|
||||
#include <qpdf/QPDF_Array.hh>
|
||||
#include <qpdf/QTC.hh>
|
||||
#include <qpdf/QUtil.hh>
|
||||
#include <qpdf/SparseOHArray.hh>
|
||||
|
||||
// Read one object (scalar, array, dictionary, indirect reference or,
// in content-stream mode, operator) from `input` using `tokenizer`.
//
// empty          - output flag. Reset to false on entry and set to true
//                  only when the first token at top level is "endobj",
//                  in which case a null is returned and the input is
//                  repositioned to the start of that token.
// content_stream - when true, bare words become operator objects, EOF
//                  is not reported as a warning, and an uninitialized
//                  handle is returned at EOF so the caller can detect it.
//
// Returns the parsed object with its description and parsed offset set.
// Recoverable damage is reported through warn(), which throws the
// QPDFExc when there is no context. After more than five bad tokens
// without enough good ones in between, parsing stops and null is
// returned. Throws std::logic_error if an indirect reference is seen
// while `context` is null, or on internal state inconsistencies.
QPDFObjectHandle
QPDFParser::parse(bool& empty, bool content_stream)
{
    // This method must take care not to resolve any objects. Don't
    // check the type of any object without first ensuring that it is
    // a direct object. Otherwise, doing so may have the side effect
    // of reading the object and changing the file pointer. If you do
    // this, it will cause a logic error to be thrown from
    // QPDF::inParse().

    QPDF::ParseGuard pg(context);

    // Every recoverable problem is reported as a damaged-PDF QPDFExc
    // that differs only in offset and message, so build it in one
    // place.
    auto warn_at = [this](qpdf_offset_t where, std::string const& msg) {
        warn(
            context,
            QPDFExc(
                qpdf_e_damaged_pdf,
                input->getName(),
                object_description,
                where,
                msg));
    };
    // Same, positioned at the token that was just read.
    auto warn_here = [this, &warn_at](std::string const& msg) {
        warn_at(input->getLastOffset(), msg);
    };

    empty = false;

    QPDFObjectHandle object;
    bool set_offset = false;

    // Parallel stacks, one entry per open container plus a bottom
    // entry for the top level.
    std::vector<SparseOHArray> olist_stack;
    olist_stack.push_back(SparseOHArray());
    std::vector<parser_state_e> state_stack;
    state_stack.push_back(st_top);
    std::vector<qpdf_offset_t> offset_stack;
    qpdf_offset_t offset = input->tell();
    offset_stack.push_back(offset);
    bool done = false;
    int bad_count = 0;
    int good_count = 0;
    // True while the most recent name token was /Contents.
    bool b_contents = false;
    std::vector<std::string> contents_string_stack;
    contents_string_stack.push_back("");
    std::vector<qpdf_offset_t> contents_offset_stack;
    contents_offset_stack.push_back(-1);
    while (!done) {
        bool bad = false;
        // These references are re-bound on every iteration. They must
        // not be used after a push_back on the corresponding stack
        // within the same iteration (the container-open case), since
        // that may reallocate.
        SparseOHArray& olist = olist_stack.back();
        parser_state_e state = state_stack.back();
        offset = offset_stack.back();
        std::string& contents_string = contents_string_stack.back();
        qpdf_offset_t& contents_offset = contents_offset_stack.back();

        object = QPDFObjectHandle();
        set_offset = false;

        QPDFTokenizer::Token token =
            tokenizer.readToken(input, object_description, true);
        std::string const& token_error_message = token.getErrorMessage();
        if (!token_error_message.empty()) {
            // Tokens other than tt_bad can still generate warnings.
            warn_here(token_error_message);
        }

        switch (token.getType()) {
        case QPDFTokenizer::tt_eof:
            if (!content_stream) {
                QTC::TC("qpdf", "QPDFParser eof in parse");
                warn_here("unexpected EOF");
            }
            bad = true;
            state = st_eof;
            break;

        case QPDFTokenizer::tt_bad:
            QTC::TC("qpdf", "QPDFParser bad token in parse");
            bad = true;
            object = QPDFObjectHandle::newNull();
            break;

        case QPDFTokenizer::tt_brace_open:
        case QPDFTokenizer::tt_brace_close:
            QTC::TC("qpdf", "QPDFParser bad brace");
            warn_here("treating unexpected brace token as null");
            bad = true;
            object = QPDFObjectHandle::newNull();
            break;

        case QPDFTokenizer::tt_array_close:
            if (state == st_array) {
                state = st_stop;
            } else {
                QTC::TC("qpdf", "QPDFParser bad array close");
                warn_here("treating unexpected array close token as null");
                bad = true;
                object = QPDFObjectHandle::newNull();
            }
            break;

        case QPDFTokenizer::tt_dict_close:
            if (state == st_dictionary) {
                state = st_stop;
            } else {
                QTC::TC("qpdf", "QPDFParser bad dictionary close");
                warn_here("unexpected dictionary close token");
                bad = true;
                object = QPDFObjectHandle::newNull();
            }
            break;

        case QPDFTokenizer::tt_array_open:
        case QPDFTokenizer::tt_dict_open:
            if (olist_stack.size() > 500) {
                QTC::TC("qpdf", "QPDFParser too deep");
                warn_here("ignoring excessively deeply nested data structure");
                bad = true;
                object = QPDFObjectHandle::newNull();
                state = st_top;
            } else {
                olist_stack.push_back(SparseOHArray());
                state = st_start;
                offset_stack.push_back(input->tell());
                state_stack.push_back(
                    (token.getType() == QPDFTokenizer::tt_array_open)
                        ? st_array
                        : st_dictionary);
                b_contents = false;
                contents_string_stack.push_back("");
                contents_offset_stack.push_back(-1);
            }
            break;

        case QPDFTokenizer::tt_bool:
            object = QPDFObjectHandle::newBool((token.getValue() == "true"));
            break;

        case QPDFTokenizer::tt_null:
            object = QPDFObjectHandle::newNull();
            break;

        case QPDFTokenizer::tt_integer:
            object = QPDFObjectHandle::newInteger(
                QUtil::string_to_ll(token.getValue().c_str()));
            break;

        case QPDFTokenizer::tt_real:
            object = QPDFObjectHandle::newReal(token.getValue());
            break;

        case QPDFTokenizer::tt_name:
            {
                std::string const& name = token.getValue();
                object = QPDFObjectHandle::newName(name);
                // Remember whether the next string is the value of a
                // /Contents key.
                b_contents = (name == "/Contents");
            }
            break;

        case QPDFTokenizer::tt_word:
            {
                std::string const& value = token.getValue();
                if (content_stream) {
                    object = QPDFObjectHandle::newOperator(value);
                } else if (
                    (value == "R") && (state != st_top) &&
                    (olist.size() >= 2) &&
                    (!olist.at(olist.size() - 1).isIndirect()) &&
                    (olist.at(olist.size() - 1).isInteger()) &&
                    (!olist.at(olist.size() - 2).isIndirect()) &&
                    (olist.at(olist.size() - 2).isInteger())) {
                    if (context == nullptr) {
                        QTC::TC("qpdf", "QPDFParser indirect without context");
                        throw std::logic_error(
                            "QPDFObjectHandle::parse called without context"
                            " on an object with indirect references");
                    }
                    // Replace the two preceding integers with an
                    // indirect reference to that object/generation.
                    object = QPDFObjectHandle::newIndirect(
                        context,
                        QPDFObjGen(
                            olist.at(olist.size() - 2).getIntValueAsInt(),
                            olist.at(olist.size() - 1).getIntValueAsInt()));
                    olist.remove_last();
                    olist.remove_last();
                } else if ((value == "endobj") && (state == st_top)) {
                    // We just saw endobj without having read
                    // anything. Treat this as a null and do not move
                    // the input source's offset.
                    object = QPDFObjectHandle::newNull();
                    input->seek(input->getLastOffset(), SEEK_SET);
                    empty = true;
                } else {
                    QTC::TC("qpdf", "QPDFParser treat word as string");
                    warn_here(
                        "unknown token while reading object;"
                        " treating as string");
                    bad = true;
                    object = QPDFObjectHandle::newString(value);
                }
            }
            break;

        case QPDFTokenizer::tt_string:
            {
                // Copy: decryptString modifies its argument in place.
                std::string val = token.getValue();
                if (decrypter) {
                    if (b_contents) {
                        // Keep the undecrypted /Contents value; it is
                        // restored below if the enclosing dictionary
                        // turns out to be a /Sig dictionary.
                        contents_string = val;
                        contents_offset = input->getLastOffset();
                        b_contents = false;
                    }
                    decrypter->decryptString(val);
                }
                object = QPDFObjectHandle::newString(val);
            }

            break;

        default:
            warn_here(
                "treating unknown token type as null while "
                "reading object");
            bad = true;
            object = QPDFObjectHandle::newNull();
            break;
        }

        if ((!object.isInitialized()) &&
            (!((state == st_start) || (state == st_stop) ||
               (state == st_eof)))) {
            throw std::logic_error("QPDFParser::parse: "
                                   "unexpected uninitialized object");
        }

        if (bad) {
            ++bad_count;
            good_count = 0;
        } else {
            ++good_count;
            if (good_count > 3) {
                bad_count = 0;
            }
        }
        if (bad_count > 5) {
            // We had too many consecutive errors without enough
            // intervening successful objects. Give up.
            warn_here("too many errors; giving up on reading object");
            state = st_top;
            object = QPDFObjectHandle::newNull();
        }

        switch (state) {
        case st_eof:
            if (state_stack.size() > 1) {
                warn_here("parse error while reading object");
            }
            done = true;
            // In content stream mode, leave object uninitialized to
            // indicate EOF
            if (!content_stream) {
                object = QPDFObjectHandle::newNull();
            }
            break;

        case st_dictionary:
        case st_array:
            QPDFObjectHandle::setObjectDescriptionFromInput(
                object,
                context,
                object_description,
                input,
                input->getLastOffset());
            object.setParsedOffset(input->getLastOffset());
            set_offset = true;
            olist.append(object);
            break;

        case st_top:
            done = true;
            break;

        case st_start:
            break;

        case st_stop:
            {
                if ((state_stack.size() < 2) || (olist_stack.size() < 2)) {
                    throw std::logic_error(
                        "QPDFParser::parse: st_stop encountered"
                        " with insufficient elements in stack");
                }
                parser_state_e old_state = state_stack.back();
                state_stack.pop_back();
                if (old_state == st_array) {
                    // There's no newArray(SparseOHArray) since
                    // SparseOHArray is not part of the public API.
                    object = QPDFObjectHandle(QPDF_Array::create(olist));
                    QPDFObjectHandle::setObjectDescriptionFromInput(
                        object, context, object_description, input, offset);
                    // The `offset` points to the next of "[". Set the
                    // rewind offset to point to the beginning of "[".
                    // This has been explicitly tested with whitespace
                    // surrounding the array start delimiter.
                    // getLastOffset points to the array end token and
                    // therefore can't be used here.
                    object.setParsedOffset(offset - 1);
                    set_offset = true;
                } else if (old_state == st_dictionary) {
                    // Convert list to map. Alternating elements are keys.
                    // Attempt to recover more or less gracefully from
                    // invalid dictionaries.

                    // Collect every direct name first so that generated
                    // fake keys never collide with a real one.
                    std::set<std::string> names;
                    size_t n_elements = olist.size();
                    for (size_t i = 0; i < n_elements; ++i) {
                        QPDFObjectHandle oh = olist.at(i);
                        if ((!oh.isIndirect()) && oh.isName()) {
                            names.insert(oh.getName());
                        }
                    }

                    std::map<std::string, QPDFObjectHandle> dict;
                    int next_fake_key = 1;
                    for (size_t i = 0; i < olist.size(); ++i) {
                        QPDFObjectHandle key_obj = olist.at(i);
                        QPDFObjectHandle val;
                        if (key_obj.isIndirect() || (!key_obj.isName())) {
                            // Not a usable key: keep the object as a
                            // value under a generated key.
                            bool found_fake = false;
                            std::string candidate;
                            while (!found_fake) {
                                candidate = "/QPDFFake" +
                                    QUtil::int_to_string(next_fake_key++);
                                found_fake = (names.count(candidate) == 0);
                                QTC::TC(
                                    "qpdf",
                                    "QPDFParser found fake",
                                    (found_fake ? 0 : 1));
                            }
                            warn_at(
                                offset,
                                "expected dictionary key but found"
                                " non-name object; inserting key " +
                                    candidate);
                            val = key_obj;
                            key_obj = QPDFObjectHandle::newName(candidate);
                        } else if (i + 1 >= olist.size()) {
                            QTC::TC("qpdf", "QPDFParser no val for last key");
                            warn_at(
                                offset,
                                "dictionary ended prematurely; "
                                "using null as value for last key");
                            val = QPDFObjectHandle::newNull();
                            QPDFObjectHandle::setObjectDescriptionFromInput(
                                val,
                                context,
                                object_description,
                                input,
                                offset);
                        } else {
                            val = olist.at(++i);
                        }
                        std::string key = key_obj.getName();
                        if (dict.count(key) > 0) {
                            QTC::TC("qpdf", "QPDFParser duplicate dict key");
                            warn_at(
                                offset,
                                "dictionary has duplicated key " + key +
                                    "; last occurrence overrides earlier "
                                    "ones");
                        }
                        dict[key] = val;
                    }
                    // For a /Sig dictionary with /ByteRange, put back
                    // the /Contents string that was saved before
                    // decryption.
                    if (!contents_string.empty() && dict.count("/Type") &&
                        dict["/Type"].isNameAndEquals("/Sig") &&
                        dict.count("/ByteRange") && dict.count("/Contents") &&
                        dict["/Contents"].isString()) {
                        dict["/Contents"] =
                            QPDFObjectHandle::newString(contents_string);
                        dict["/Contents"].setParsedOffset(contents_offset);
                    }
                    object = QPDFObjectHandle::newDictionary(dict);
                    QPDFObjectHandle::setObjectDescriptionFromInput(
                        object, context, object_description, input, offset);
                    // The `offset` points to the next of "<<". Set the
                    // rewind offset to point to the beginning of "<<".
                    // This has been explicitly tested with whitespace
                    // surrounding the dictionary start delimiter.
                    // getLastOffset points to the dictionary end token
                    // and therefore can't be used here.
                    object.setParsedOffset(offset - 2);
                    set_offset = true;
                }
                olist_stack.pop_back();
                offset_stack.pop_back();
                if (state_stack.back() == st_top) {
                    done = true;
                } else {
                    olist_stack.back().append(object);
                }
                contents_string_stack.pop_back();
                contents_offset_stack.pop_back();
            }
            break;
        }
    }

    if (!set_offset) {
        QPDFObjectHandle::setObjectDescriptionFromInput(
            object, context, object_description, input, offset);
        object.setParsedOffset(offset);
    }
    return object;
}
|
||||
|
||||
void
|
||||
QPDFParser::warn(QPDF* qpdf, QPDFExc const& e)
|
||||
{
|
||||
// If parsing on behalf of a QPDF object and want to give a
|
||||
// warning, we can warn through the object. If parsing for some
|
||||
// other reason, such as an explicit creation of an object from a
|
||||
// string, then just throw the exception.
|
||||
if (qpdf) {
|
||||
qpdf->warn(e);
|
||||
} else {
|
||||
throw e;
|
||||
}
|
||||
}
|
50
libqpdf/qpdf/QPDFParser.hh
Normal file
50
libqpdf/qpdf/QPDFParser.hh
Normal file
@ -0,0 +1,50 @@
|
||||
#ifndef QPDFPARSER_HH
|
||||
#define QPDFPARSER_HH
|
||||
|
||||
#include <qpdf/QPDFObjectHandle.hh>

#include <memory>
#include <string>
#include <utility>
|
||||
|
||||
class QPDFParser
|
||||
{
|
||||
public:
|
||||
QPDFParser() = delete;
|
||||
QPDFParser(
|
||||
std::shared_ptr<InputSource> input,
|
||||
std::string const& object_description,
|
||||
QPDFTokenizer& tokenizer,
|
||||
QPDFObjectHandle::StringDecrypter* decrypter,
|
||||
QPDF* context) :
|
||||
input(input),
|
||||
object_description(object_description),
|
||||
tokenizer(tokenizer),
|
||||
decrypter(decrypter),
|
||||
context(context)
|
||||
{
|
||||
}
|
||||
virtual ~QPDFParser() = default;
|
||||
|
||||
QPDFObjectHandle parse(bool& empty, bool content_stream);
|
||||
|
||||
private:
|
||||
enum parser_state_e {
|
||||
st_top,
|
||||
st_start,
|
||||
st_stop,
|
||||
st_eof,
|
||||
st_dictionary,
|
||||
st_array
|
||||
};
|
||||
|
||||
static void warn(QPDF*, QPDFExc const&);
|
||||
void setParsedOffset(qpdf_offset_t offset);
|
||||
|
||||
std::shared_ptr<InputSource> input;
|
||||
std::string const& object_description;
|
||||
QPDFTokenizer& tokenizer;
|
||||
QPDFObjectHandle::StringDecrypter* decrypter;
|
||||
QPDF* context;
|
||||
};
|
||||
|
||||
#endif // QPDFPARSER_HH
|
@ -56,12 +56,12 @@ QPDF missing trailer 0
|
||||
QPDF trailer lacks size 0
|
||||
QPDF trailer size not integer 0
|
||||
QPDF trailer prev not integer 0
|
||||
QPDFObjectHandle bad brace 0
|
||||
QPDFObjectHandle bad array close 0
|
||||
QPDFParser bad brace 0
|
||||
QPDFParser bad array close 0
|
||||
QPDF stream without length 0
|
||||
QPDF stream length not integer 0
|
||||
QPDF missing endstream 0
|
||||
QPDFObjectHandle bad dictionary close 0
|
||||
QPDFParser bad dictionary close 0
|
||||
QPDF can't find xref 0
|
||||
QPDFTokenizer bad ) 0
|
||||
QPDFTokenizer bad > 0
|
||||
@ -215,7 +215,7 @@ QPDF not copying pages object 0
|
||||
QPDF insert foreign page 0
|
||||
QPDFWriter foreign object 0
|
||||
QPDFWriter copy use_aes 1
|
||||
QPDFObjectHandle indirect without context 0
|
||||
QPDFParser indirect without context 0
|
||||
QPDFObjectHandle trailing data in parse 0
|
||||
QPDFJob pages encryption password 0
|
||||
QPDFTokenizer EOF reading token 0
|
||||
@ -257,9 +257,9 @@ qpdf-c called qpdf_set_deterministic_ID 0
|
||||
QPDFObjectHandle indirect with 0 objid 0
|
||||
QPDF object id 0 0
|
||||
QPDF recursion loop in resolve 0
|
||||
QPDFObjectHandle treat word as string 0
|
||||
QPDFObjectHandle found fake 1
|
||||
QPDFObjectHandle no val for last key 0
|
||||
QPDFParser treat word as string 0
|
||||
QPDFParser found fake 1
|
||||
QPDFParser no val for last key 0
|
||||
QPDF resolve failure to null 0
|
||||
QPDFWriter preserve unreferenced standard 0
|
||||
QPDFObjectHandle errors in parsecontent 0
|
||||
@ -288,8 +288,8 @@ QPDFObjectHandle non-stream in stream array 0
|
||||
QPDFObjectHandle coalesce called on stream 0
|
||||
QPDFObjectHandle coalesce provide stream data 0
|
||||
QPDF_Stream bad token at end during normalize 0
|
||||
QPDFObjectHandle bad token in parse 0
|
||||
QPDFObjectHandle eof in parseInternal 0
|
||||
QPDFParser bad token in parse 0
|
||||
QPDFParser eof in parse 0
|
||||
QPDFObjectHandle array bounds 0
|
||||
QPDFObjectHandle boolean returning false 0
|
||||
QPDFObjectHandle integer returning 0 0
|
||||
@ -317,7 +317,7 @@ QPDFObjectHandle numeric non-numeric 0
|
||||
QPDFObjectHandle erase array bounds 0
|
||||
qpdf-c called qpdf_check_pdf 0
|
||||
QPDF xref loop 0
|
||||
QPDFObjectHandle too deep 0
|
||||
QPDFParser too deep 0
|
||||
QPDFFormFieldObjectHelper non-trivial inheritance 0
|
||||
QPDFFormFieldObjectHelper non-trivial qualified name 0
|
||||
QPDFFormFieldObjectHelper TU present 0
|
||||
@ -428,7 +428,7 @@ QPDF eof skipping spaces before xref 1
|
||||
QPDF_encryption user matches owner V < 5 0
|
||||
QPDF_encryption same password 1
|
||||
QPDFWriter stream in ostream 0
|
||||
QPDFObjectHandle duplicate dict key 0
|
||||
QPDFParser duplicate dict key 0
|
||||
QPDFWriter no encryption sig contents 0
|
||||
QPDFPageObjectHelper colorspace lookup 0
|
||||
QPDFWriter ignore XRef in qdf mode 0
|
||||
|
Loading…
x
Reference in New Issue
Block a user