diff --git a/ChangeLog b/ChangeLog index 1b76b011..2bb4564a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,13 @@ +2012-07-21 Jay Berkenbilt + + * Add new method QPDFObjectHandle::replaceDict to replace a + stream's dictionary. Use with caution; see comments in + QPDFObjectHandle.hh. + + * Add new method QPDFObjectHandle::parse for creation of + QPDFObjectHandle objects from string representations of the + objects. Thanks to Tobias Hoffmann for the idea. + 2012-07-15 Jay Berkenbilt * add new QPDF::isEncrypted method that returns some additional diff --git a/TODO b/TODO index cf752daf..42d5e7a6 100644 --- a/TODO +++ b/TODO @@ -20,16 +20,14 @@ Next * Make sure that the release notes call attention to the one API breaking change: removal of length from replaceStreamData. - * Add a way to create new QPDFObjectHandles with a string - representation of them, such as - QPDFObjectHandle::parse("<< /a 1 /b 2 >>"); - * Document thread safety: One individual QPDF or QPDFWriter object can only be used by one thread at a time, but multiple threads can simultaneously use separate objects. * Write some documentation about the design of copyForeignObject. + * Mention QPDFObjectHandle::parse in the documentation. + * copyForeignObject still to do: - qpdf command diff --git a/examples/pdf-create.cc b/examples/pdf-create.cc index a9ad2389..902c6805 100644 --- a/examples/pdf-create.cc +++ b/examples/pdf-create.cc @@ -81,24 +81,28 @@ static void create_pdf(char const* filename) // Add an indirect object to contain a font descriptor for the // built-in Helvetica font. 
QPDFObjectHandle font = pdf.makeIndirectObject( - QPDFObjectHandle::newDictionary()); - font.replaceKey("/Type", newName("/Font")); - font.replaceKey("/Subtype", newName("/Type1")); - font.replaceKey("/Name", newName("/F1")); - font.replaceKey("/BaseFont", newName("/Helvetica")); - font.replaceKey("/Encoding", newName("/WinAnsiEncoding")); + QPDFObjectHandle::parse( + "<<" + " /Type /Font" + " /Subtype /Type1" + " /Name /F1" + " /BaseFont /Helvetica" + " /Encoding /WinAnsiEncoding" + ">>")); // Create a stream to encode our image. We don't have to set the // length or filters. QPDFWriter will fill in the length and // compress the stream data using FlateDecode by default. QPDFObjectHandle image = QPDFObjectHandle::newStream(&pdf); - QPDFObjectHandle image_dict = image.getDict(); - image_dict.replaceKey("/Type", newName("/XObject")); - image_dict.replaceKey("/Subtype", newName("/Image")); - image_dict.replaceKey("/ColorSpace", newName("/DeviceRGB")); - image_dict.replaceKey("/BitsPerComponent", newInteger(8)); - image_dict.replaceKey("/Width", newInteger(100)); - image_dict.replaceKey("/Height", newInteger(100)); + image.replaceDict(QPDFObjectHandle::parse( + "<<" + " /Type /XObject" + " /Subtype /Image" + " /ColorSpace /DeviceRGB" + " /BitsPerComponent 8" + " /Width 100" + " /Height 100" + ">>")); // Provide the stream data. ImageProvider* p = new ImageProvider(100, 100); PointerHolder provider(p); @@ -107,10 +111,8 @@ static void create_pdf(char const* filename) QPDFObjectHandle::newNull()); // Create direct objects as needed by the page dictionary. 
- QPDFObjectHandle procset = QPDFObjectHandle::newArray(); - procset.appendItem(newName("/PDF")); - procset.appendItem(newName("/Text")); - procset.appendItem(newName("/ImageC")); + QPDFObjectHandle procset = QPDFObjectHandle::parse( + "[/PDF /Text /ImageC]"); QPDFObjectHandle rfont = QPDFObjectHandle::newDictionary(); rfont.replaceKey("/F1", font); diff --git a/include/qpdf/BufferInputSource.hh b/include/qpdf/BufferInputSource.hh index 63c14def..48f6e3ac 100644 --- a/include/qpdf/BufferInputSource.hh +++ b/include/qpdf/BufferInputSource.hh @@ -9,6 +9,8 @@ class BufferInputSource: public InputSource public: BufferInputSource(std::string const& description, Buffer* buf, bool own_memory = false); + BufferInputSource(std::string const& description, + std::string const& contents); virtual ~BufferInputSource(); virtual qpdf_offset_t findAndSkipNextEOL(); virtual std::string const& getName() const; diff --git a/include/qpdf/QPDF.hh b/include/qpdf/QPDF.hh index e6ff75b4..dc6e9090 100644 --- a/include/qpdf/QPDF.hh +++ b/include/qpdf/QPDF.hh @@ -531,6 +531,23 @@ class QPDF std::map foreign_streams; }; + class StringDecrypter: public QPDFObjectHandle::StringDecrypter + { + friend class QPDF; + + public: + StringDecrypter(QPDF* qpdf, int objid, int gen); + virtual ~StringDecrypter() + { + } + virtual void decryptString(std::string& val); + + private: + QPDF* qpdf; + int objid; + int gen; + }; + void parse(char const* password); void warn(QPDFExc const& e); void setTrailer(QPDFObjectHandle obj); @@ -547,10 +564,6 @@ class QPDF QPDFObjectHandle readObject( PointerHolder, std::string const& description, int objid, int generation, bool in_object_stream); - QPDFObjectHandle readObjectInternal( - PointerHolder input, int objid, int generation, - bool in_object_stream, - bool in_array, bool in_dictionary); size_t recoverStreamLength( PointerHolder input, int objid, int generation, qpdf_offset_t stream_offset); diff --git a/include/qpdf/QPDFObjectHandle.hh 
b/include/qpdf/QPDFObjectHandle.hh index 22ded37e..a1819d99 100644 --- a/include/qpdf/QPDFObjectHandle.hh +++ b/include/qpdf/QPDFObjectHandle.hh @@ -18,6 +18,7 @@ #include #include +#include #include @@ -25,6 +26,7 @@ class Pipeline; class QPDF; class QPDF_Dictionary; class QPDF_Array; +class QPDFTokenizer; class QPDFObjectHandle { @@ -57,6 +59,18 @@ class QPDFObjectHandle Pipeline* pipeline) = 0; }; + // This class is used by parse to decrypt strings when reading an + // object that contains encrypted strings. + class StringDecrypter + { + public: + QPDF_DLL + virtual ~StringDecrypter() + { + } + virtual void decryptString(std::string& val) = 0; + }; + QPDF_DLL QPDFObjectHandle(); QPDF_DLL @@ -95,6 +109,30 @@ class QPDFObjectHandle // Public factory methods + // Construct an object of any type from a string representation of + // the object. Throws QPDFExc with an empty filename and an + // offset into the string if there is an error. Any indirect + // object syntax (obj gen R) will cause a logic_error exception to + // be thrown. If object_description is provided, it will appear + // in the message of any QPDFExc exception thrown for invalid + // syntax. + QPDF_DLL + static QPDFObjectHandle parse(std::string const& object_str, + std::string const& object_description = ""); + + // Construct an object as above by reading from the given + // InputSource at its current position and using the tokenizer you + // supply. Indirect objects and encrypted strings are permitted. + // This method is intended to be called by QPDF for parsing + // objects that are read from the object's input stream. + QPDF_DLL + static QPDFObjectHandle parse(PointerHolder input, + std::string const& object_description, + QPDFTokenizer&, bool& empty, + StringDecrypter* decrypter, + QPDF* context); + + // Type-specific factories QPDF_DLL static QPDFObjectHandle newNull(); QPDF_DLL @@ -124,7 +162,8 @@ class QPDFObjectHandle // object. 
A subsequent call must be made to replaceStreamData() // to provide data for the stream. The stream's dictionary may be // retrieved by calling getDict(), and the resulting dictionary - // may be modified. + // may be modified. Alternatively, you can create a new + // dictionary and call replaceDict to install it. QPDF_DLL static QPDFObjectHandle newStream(QPDF* qpdf); @@ -303,6 +342,15 @@ class QPDFObjectHandle bool pipeStreamData(Pipeline*, bool filter, bool normalize, bool compress); + // Replace a stream's dictionary. The new dictionary must be + // consistent with the stream's data. This is most appropriately + // used when creating streams from scratch that will use a stream + // data provider and therefore start with an empty dictionary. It + // may be more convenient in this case than calling getDict and + // modifying it for each key. The pdf-create example does this. + QPDF_DLL + void replaceDict(QPDFObjectHandle); + // Replace this stream's stream data with the given data buffer, // and replace the /Filter and /DecodeParms keys in the stream // dictionary with the given values. 
(If either value is empty, @@ -489,6 +537,12 @@ class QPDFObjectHandle void dereference(); void makeDirectInternal(std::set& visited); void releaseResolved(); + static QPDFObjectHandle parseInternal( + PointerHolder input, + std::string const& object_description, + QPDFTokenizer& tokenizer, bool& empty, + StringDecrypter* decrypter, QPDF* context, + bool in_array, bool in_dictionary); bool initialized; diff --git a/libqpdf/BufferInputSource.cc b/libqpdf/BufferInputSource.cc index 6909dce2..03439955 100644 --- a/libqpdf/BufferInputSource.cc +++ b/libqpdf/BufferInputSource.cc @@ -11,6 +11,18 @@ BufferInputSource::BufferInputSource(std::string const& description, { } +BufferInputSource::BufferInputSource(std::string const& description, + std::string const& contents) : + own_memory(true), + description(description), + buf(0), + cur_offset(0) +{ + this->buf = new Buffer(contents.length()); + unsigned char* bp = buf->getBuffer(); + memcpy(bp, (char*)contents.c_str(), contents.length()); +} + BufferInputSource::~BufferInputSource() { if (own_memory) diff --git a/libqpdf/QPDF.cc b/libqpdf/QPDF.cc index e1ce5fc4..bee2f3ee 100644 --- a/libqpdf/QPDF.cc +++ b/libqpdf/QPDF.cc @@ -68,6 +68,18 @@ QPDF::CopiedStreamDataProvider::registerForeignStream( this->foreign_streams[local_og] = foreign_stream; } +QPDF::StringDecrypter::StringDecrypter(QPDF* qpdf, int objid, int gen) : + qpdf(qpdf), + objid(objid), + gen(gen) +{ +} + +void +QPDF::StringDecrypter::decryptString(std::string& val) +{ + qpdf->decryptString(val, objid, gen); +} std::string const& QPDF::QPDFVersion() @@ -940,361 +952,167 @@ QPDF::readObject(PointerHolder input, { setLastObjectDescription(description, objid, generation); qpdf_offset_t offset = input->tell(); - QPDFObjectHandle object = readObjectInternal( - input, objid, generation, in_object_stream, false, false); + + bool empty = false; + PointerHolder decrypter_ph; + StringDecrypter* decrypter = 0; + if (this->encrypted && (! 
in_object_stream)) + { + decrypter_ph = new StringDecrypter(this, objid, generation); + decrypter = decrypter_ph.getPointer(); + } + QPDFObjectHandle object = QPDFObjectHandle::parse( + input, description, this->tokenizer, empty, decrypter, this); + if (empty) + { + // Nothing in the PDF spec appears to allow empty objects, but + // they have been encountered in actual PDF files and Adobe + // Reader appears to ignore them. + warn(QPDFExc(qpdf_e_damaged_pdf, input->getName(), + this->last_object_description, + input->getLastOffset(), + "empty object treated as null")); + } + else if (object.isDictionary() && (! in_object_stream)) + { + // check for stream + qpdf_offset_t cur_offset = input->tell(); + if (readToken(input) == + QPDFTokenizer::Token(QPDFTokenizer::tt_word, "stream")) + { + // The PDF specification states that the word "stream" + // should be followed by either a carriage return and + // a newline or by a newline alone. It specifically + // disallowed following it by a carriage return alone + // since, in that case, there would be no way to tell + // whether the NL in a CR NL sequence was part of the + // stream data. However, some readers, including + // Adobe reader, accept a carriage return by itself + // when followed by a non-newline character, so that's + // what we do here. + { + char ch; + if (input->read(&ch, 1) == 0) + { + // A premature EOF here will result in some + // other problem that will get reported at + // another time. + } + else if (ch == '\n') + { + // ready to read stream data + QTC::TC("qpdf", "QPDF stream with NL only"); + } + else if (ch == '\r') + { + // Read another character + if (input->read(&ch, 1) != 0) + { + if (ch == '\n') + { + // Ready to read stream data + QTC::TC("qpdf", "QPDF stream with CRNL"); + } + else + { + // Treat the \r by itself as the + // whitespace after "stream" and + // start reading stream data in spite + // of not having seen a newline. 
+ QTC::TC("qpdf", "QPDF stream with CR only"); + input->unreadCh(ch); + warn(QPDFExc( + qpdf_e_damaged_pdf, + input->getName(), + this->last_object_description, + input->tell(), + "stream keyword followed" + " by carriage return only")); + } + } + } + else + { + QTC::TC("qpdf", "QPDF stream without newline"); + warn(QPDFExc(qpdf_e_damaged_pdf, input->getName(), + this->last_object_description, + input->tell(), + "stream keyword not followed" + " by proper line terminator")); + } + } + + // Must get offset before accessing any additional + // objects since resolving a previously unresolved + // indirect object will change file position. + qpdf_offset_t stream_offset = input->tell(); + size_t length = 0; + + try + { + std::map dict = + object.getDictAsMap(); + + if (dict.count("/Length") == 0) + { + QTC::TC("qpdf", "QPDF stream without length"); + throw QPDFExc(qpdf_e_damaged_pdf, input->getName(), + this->last_object_description, offset, + "stream dictionary lacks /Length key"); + } + + QPDFObjectHandle length_obj = dict["/Length"]; + if (! length_obj.isInteger()) + { + QTC::TC("qpdf", "QPDF stream length not integer"); + throw QPDFExc(qpdf_e_damaged_pdf, input->getName(), + this->last_object_description, offset, + "/Length key in stream dictionary is not " + "an integer"); + } + + length = length_obj.getIntValue(); + input->seek( + stream_offset + (qpdf_offset_t)length, SEEK_SET); + if (! 
(readToken(input) == + QPDFTokenizer::Token( + QPDFTokenizer::tt_word, "endstream"))) + { + QTC::TC("qpdf", "QPDF missing endstream"); + throw QPDFExc(qpdf_e_damaged_pdf, input->getName(), + this->last_object_description, + input->getLastOffset(), + "expected endstream"); + } + } + catch (QPDFExc& e) + { + if (this->attempt_recovery) + { + // may throw an exception + length = recoverStreamLength( + input, objid, generation, stream_offset); + } + else + { + throw e; + } + } + object = QPDFObjectHandle::Factory::newStream( + this, objid, generation, object, stream_offset, length); + } + else + { + input->seek(cur_offset, SEEK_SET); + } + } + // Override last_offset so that it points to the beginning of the // object we just read input->setLastOffset(offset); return object; } -QPDFObjectHandle -QPDF::readObjectInternal(PointerHolder input, - int objid, int generation, - bool in_object_stream, - bool in_array, bool in_dictionary) -{ - if (in_dictionary && in_array) - { - // Although dictionaries and arrays arbitrarily nest, these - // variables indicate what is at the top of the stack right - // now, so they can, by definition, never both be true. - throw std::logic_error( - "INTERNAL ERROR: readObjectInternal: in_dict && in_array"); - } - - QPDFObjectHandle object; - - qpdf_offset_t offset = input->tell(); - std::vector olist; - bool done = false; - while (! 
done) - { - object = QPDFObjectHandle(); - - QPDFTokenizer::Token token = readToken(input); - - switch (token.getType()) - { - case QPDFTokenizer::tt_brace_open: - case QPDFTokenizer::tt_brace_close: - // Don't know what to do with these for now - QTC::TC("qpdf", "QPDF bad brace"); - throw QPDFExc(qpdf_e_damaged_pdf, input->getName(), - this->last_object_description, - input->getLastOffset(), - "unexpected brace token"); - break; - - case QPDFTokenizer::tt_array_close: - if (in_array) - { - done = true; - } - else - { - QTC::TC("qpdf", "QPDF bad array close"); - throw QPDFExc(qpdf_e_damaged_pdf, input->getName(), - this->last_object_description, - input->getLastOffset(), - "unexpected array close token"); - } - break; - - case QPDFTokenizer::tt_dict_close: - if (in_dictionary) - { - done = true; - } - else - { - QTC::TC("qpdf", "QPDF bad dictionary close"); - throw QPDFExc(qpdf_e_damaged_pdf, input->getName(), - this->last_object_description, - input->getLastOffset(), - "unexpected dictionary close token"); - } - break; - - case QPDFTokenizer::tt_array_open: - object = readObjectInternal( - input, objid, generation, in_object_stream, true, false); - break; - - case QPDFTokenizer::tt_dict_open: - object = readObjectInternal( - input, objid, generation, in_object_stream, false, true); - break; - - case QPDFTokenizer::tt_bool: - object = QPDFObjectHandle::newBool( - (token.getValue() == "true")); - break; - - case QPDFTokenizer::tt_null: - object = QPDFObjectHandle::newNull(); - break; - - case QPDFTokenizer::tt_integer: - object = QPDFObjectHandle::newInteger( - QUtil::string_to_ll(token.getValue().c_str())); - break; - - case QPDFTokenizer::tt_real: - object = QPDFObjectHandle::newReal(token.getValue()); - break; - - case QPDFTokenizer::tt_name: - object = QPDFObjectHandle::newName(token.getValue()); - break; - - case QPDFTokenizer::tt_word: - { - std::string const& value = token.getValue(); - if ((value == "R") && (in_array || in_dictionary) && - (olist.size() >= 
2) && - (olist[olist.size() - 1].isInteger()) && - (olist[olist.size() - 2].isInteger())) - { - // Try to resolve indirect objects - object = QPDFObjectHandle::Factory::newIndirect( - this, - olist[olist.size() - 2].getIntValue(), - olist[olist.size() - 1].getIntValue()); - olist.pop_back(); - olist.pop_back(); - } - else if ((value == "endobj") && - (! (in_array || in_dictionary))) - { - // Nothing in the PDF spec appears to allow empty - // objects, but they have been encountered in - // actual PDF files and Adobe Reader appears to - // ignore them. - warn(QPDFExc(qpdf_e_damaged_pdf, input->getName(), - this->last_object_description, - input->getLastOffset(), - "empty object treated as null")); - object = QPDFObjectHandle::newNull(); - input->seek(input->getLastOffset(), SEEK_SET); - } - else - { - throw QPDFExc(qpdf_e_damaged_pdf, input->getName(), - this->last_object_description, - input->getLastOffset(), - "unknown token while reading object (" + - value + ")"); - } - } - break; - - case QPDFTokenizer::tt_string: - { - std::string val = token.getValue(); - if (this->encrypted && (! in_object_stream)) - { - decryptString(val, objid, generation); - } - object = QPDFObjectHandle::newString(val); - } - break; - - default: - throw QPDFExc(qpdf_e_damaged_pdf, input->getName(), - this->last_object_description, - input->getLastOffset(), - "unknown token type while reading object"); - break; - } - - if (in_dictionary || in_array) - { - if (! done) - { - olist.push_back(object); - } - } - else if (! object.isInitialized()) - { - throw std::logic_error( - "INTERNAL ERROR: uninitialized object (token = " + - QUtil::int_to_string(token.getType()) + - ", " + token.getValue() + ")"); - } - else - { - done = true; - } - } - - if (in_array) - { - object = QPDFObjectHandle::newArray(olist); - } - else if (in_dictionary) - { - // Convert list to map. Alternating elements are keys. 
- std::map dict; - if (olist.size() % 2) - { - QTC::TC("qpdf", "QPDF dictionary odd number of elements"); - throw QPDFExc( - qpdf_e_damaged_pdf, input->getName(), - this->last_object_description, input->getLastOffset(), - "dictionary ending here has an odd number of elements"); - } - for (unsigned int i = 0; i < olist.size(); i += 2) - { - QPDFObjectHandle key_obj = olist[i]; - QPDFObjectHandle val = olist[i + 1]; - if (! key_obj.isName()) - { - throw QPDFExc( - qpdf_e_damaged_pdf, - input->getName(), this->last_object_description, offset, - std::string("dictionary key not name (") + - key_obj.unparse() + ")"); - } - dict[key_obj.getName()] = val; - } - object = QPDFObjectHandle::newDictionary(dict); - - if (! in_object_stream) - { - // check for stream - qpdf_offset_t cur_offset = input->tell(); - if (readToken(input) == - QPDFTokenizer::Token(QPDFTokenizer::tt_word, "stream")) - { - // The PDF specification states that the word "stream" - // should be followed by either a carriage return and - // a newline or by a newline alone. It specifically - // disallowed following it by a carriage return alone - // since, in that case, there would be no way to tell - // whether the NL in a CR NL sequence was part of the - // stream data. However, some readers, including - // Adobe reader, accept a carriage return by itself - // when followed by a non-newline character, so that's - // what we do here. - { - char ch; - if (input->read(&ch, 1) == 0) - { - // A premature EOF here will result in some - // other problem that will get reported at - // another time. 
- } - else if (ch == '\n') - { - // ready to read stream data - QTC::TC("qpdf", "QPDF stream with NL only"); - } - else if (ch == '\r') - { - // Read another character - if (input->read(&ch, 1) != 0) - { - if (ch == '\n') - { - // Ready to read stream data - QTC::TC("qpdf", "QPDF stream with CRNL"); - } - else - { - // Treat the \r by itself as the - // whitespace after endstream and - // start reading stream data in spite - // of not having seen a newline. - QTC::TC("qpdf", "QPDF stream with CR only"); - input->unreadCh(ch); - warn(QPDFExc( - qpdf_e_damaged_pdf, - input->getName(), - this->last_object_description, - input->tell(), - "stream keyword followed" - " by carriage return only")); - } - } - } - else - { - QTC::TC("qpdf", "QPDF stream without newline"); - warn(QPDFExc(qpdf_e_damaged_pdf, input->getName(), - this->last_object_description, - input->tell(), - "stream keyword not followed" - " by proper line terminator")); - } - } - - // Must get offset before accessing any additional - // objects since resolving a previously unresolved - // indirect object will change file position. - qpdf_offset_t stream_offset = input->tell(); - size_t length = 0; - - try - { - if (dict.count("/Length") == 0) - { - QTC::TC("qpdf", "QPDF stream without length"); - throw QPDFExc(qpdf_e_damaged_pdf, input->getName(), - this->last_object_description, offset, - "stream dictionary lacks /Length key"); - } - - QPDFObjectHandle length_obj = dict["/Length"]; - if (! length_obj.isInteger()) - { - QTC::TC("qpdf", "QPDF stream length not integer"); - throw QPDFExc(qpdf_e_damaged_pdf, input->getName(), - this->last_object_description, offset, - "/Length key in stream dictionary is not " - "an integer"); - } - - length = length_obj.getIntValue(); - input->seek( - stream_offset + (qpdf_offset_t)length, SEEK_SET); - if (! 
(readToken(input) == - QPDFTokenizer::Token( - QPDFTokenizer::tt_word, "endstream"))) - { - QTC::TC("qpdf", "QPDF missing endstream"); - throw QPDFExc(qpdf_e_damaged_pdf, input->getName(), - this->last_object_description, - input->getLastOffset(), - "expected endstream"); - } - } - catch (QPDFExc& e) - { - if (this->attempt_recovery) - { - // may throw an exception - length = recoverStreamLength( - input, objid, generation, stream_offset); - } - else - { - throw e; - } - } - object = QPDFObjectHandle::Factory::newStream( - this, objid, generation, object, stream_offset, length); - } - else - { - input->seek(cur_offset, SEEK_SET); - } - } - } - - return object; -} - size_t QPDF::recoverStreamLength(PointerHolder input, int objid, int generation, diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc index 642dee69..6bb182e8 100644 --- a/libqpdf/QPDFObjectHandle.cc +++ b/libqpdf/QPDFObjectHandle.cc @@ -11,12 +11,15 @@ #include #include #include +#include +#include #include #include #include #include +#include QPDFObjectHandle::QPDFObjectHandle() : initialized(false), @@ -398,6 +401,13 @@ QPDFObjectHandle::getDict() return dynamic_cast(obj.getPointer())->getDict(); } +void +QPDFObjectHandle::replaceDict(QPDFObjectHandle new_dict) +{ + assertStream(); + dynamic_cast(obj.getPointer())->replaceDict(new_dict); +} + PointerHolder QPDFObjectHandle::getStreamData() { @@ -598,6 +608,265 @@ QPDFObjectHandle::unparseResolved() return this->obj->unparse(); } +QPDFObjectHandle +QPDFObjectHandle::parse(std::string const& object_str, + std::string const& object_description) +{ + PointerHolder input = + new BufferInputSource("parsed object", object_str); + QPDFTokenizer tokenizer; + bool empty = false; + QPDFObjectHandle result = + parse(input, object_description, tokenizer, empty, 0, 0); + size_t offset = (size_t) input->tell(); + while (offset < object_str.length()) + { + if (! 
isspace(object_str[offset])) + { + QTC::TC("qpdf", "QPDFObjectHandle trailing data in parse"); + throw QPDFExc(qpdf_e_damaged_pdf, input->getName(), + object_description, + input->getLastOffset(), + "trailing data found parsing object from string"); + } + ++offset; + } + return result; +} + +QPDFObjectHandle +QPDFObjectHandle::parse(PointerHolder input, + std::string const& object_description, + QPDFTokenizer& tokenizer, bool& empty, + StringDecrypter* decrypter, QPDF* context) +{ + return parseInternal(input, object_description, tokenizer, empty, + decrypter, context, false, false); +} + +QPDFObjectHandle +QPDFObjectHandle::parseInternal(PointerHolder input, + std::string const& object_description, + QPDFTokenizer& tokenizer, bool& empty, + StringDecrypter* decrypter, QPDF* context, + bool in_array, bool in_dictionary) +{ + empty = false; + if (in_dictionary && in_array) + { + // Although dictionaries and arrays arbitrarily nest, these + // variables indicate what is at the top of the stack right + // now, so they can, by definition, never both be true. + throw std::logic_error( + "INTERNAL ERROR: parseInternal: in_dict && in_array"); + } + + QPDFObjectHandle object; + + qpdf_offset_t offset = input->tell(); + std::vector olist; + bool done = false; + while (! 
done) + { + object = QPDFObjectHandle(); + + QPDFTokenizer::Token token = + tokenizer.readToken(input, object_description); + + switch (token.getType()) + { + case QPDFTokenizer::tt_brace_open: + case QPDFTokenizer::tt_brace_close: + // Don't know what to do with these for now + QTC::TC("qpdf", "QPDFObjectHandle bad brace"); + throw QPDFExc(qpdf_e_damaged_pdf, input->getName(), + object_description, + input->getLastOffset(), + "unexpected brace token"); + break; + + case QPDFTokenizer::tt_array_close: + if (in_array) + { + done = true; + } + else + { + QTC::TC("qpdf", "QPDFObjectHandle bad array close"); + throw QPDFExc(qpdf_e_damaged_pdf, input->getName(), + object_description, + input->getLastOffset(), + "unexpected array close token"); + } + break; + + case QPDFTokenizer::tt_dict_close: + if (in_dictionary) + { + done = true; + } + else + { + QTC::TC("qpdf", "QPDFObjectHandle bad dictionary close"); + throw QPDFExc(qpdf_e_damaged_pdf, input->getName(), + object_description, + input->getLastOffset(), + "unexpected dictionary close token"); + } + break; + + case QPDFTokenizer::tt_array_open: + object = parseInternal( + input, object_description, tokenizer, empty, + decrypter, context, true, false); + break; + + case QPDFTokenizer::tt_dict_open: + object = parseInternal( + input, object_description, tokenizer, empty, + decrypter, context, false, true); + break; + + case QPDFTokenizer::tt_bool: + object = newBool((token.getValue() == "true")); + break; + + case QPDFTokenizer::tt_null: + object = newNull(); + break; + + case QPDFTokenizer::tt_integer: + object = newInteger(QUtil::string_to_ll(token.getValue().c_str())); + break; + + case QPDFTokenizer::tt_real: + object = newReal(token.getValue()); + break; + + case QPDFTokenizer::tt_name: + object = newName(token.getValue()); + break; + + case QPDFTokenizer::tt_word: + { + std::string const& value = token.getValue(); + if ((value == "R") && (in_array || in_dictionary) && + (olist.size() >= 2) && + 
(olist[olist.size() - 1].isInteger()) && + (olist[olist.size() - 2].isInteger())) + { + if (context == 0) + { + QTC::TC("qpdf", "QPDFObjectHandle indirect without context"); + throw std::logic_error( + "QPDFObjectHandle::parse called without context" + " on an object with indirect references"); + } + // Try to resolve indirect objects + object = newIndirect( + context, + olist[olist.size() - 2].getIntValue(), + olist[olist.size() - 1].getIntValue()); + olist.pop_back(); + olist.pop_back(); + } + else if ((value == "endobj") && + (! (in_array || in_dictionary))) + { + // We just saw endobj without having read + // anything. Treat this as a null and do not move + // the input source's offset. + object = newNull(); + input->seek(input->getLastOffset(), SEEK_SET); + empty = true; + } + else + { + throw QPDFExc(qpdf_e_damaged_pdf, input->getName(), + object_description, + input->getLastOffset(), + "unknown token while reading object (" + + value + ")"); + } + } + break; + + case QPDFTokenizer::tt_string: + { + std::string val = token.getValue(); + if (decrypter) + { + decrypter->decryptString(val); + } + object = QPDFObjectHandle::newString(val); + } + + break; + + default: + throw QPDFExc(qpdf_e_damaged_pdf, input->getName(), + object_description, + input->getLastOffset(), + "unknown token type while reading object"); + break; + } + + if (in_dictionary || in_array) + { + if (! done) + { + olist.push_back(object); + } + } + else if (! object.isInitialized()) + { + throw std::logic_error( + "INTERNAL ERROR: uninitialized object (token = " + + QUtil::int_to_string(token.getType()) + + ", " + token.getValue() + ")"); + } + else + { + done = true; + } + } + + if (in_array) + { + object = newArray(olist); + } + else if (in_dictionary) + { + // Convert list to map. Alternating elements are keys. 
+ std::map dict; + if (olist.size() % 2) + { + QTC::TC("qpdf", "QPDFObjectHandle dictionary odd number of elements"); + throw QPDFExc( + qpdf_e_damaged_pdf, input->getName(), + object_description, input->getLastOffset(), + "dictionary ending here has an odd number of elements"); + } + for (unsigned int i = 0; i < olist.size(); i += 2) + { + QPDFObjectHandle key_obj = olist[i]; + QPDFObjectHandle val = olist[i + 1]; + if (! key_obj.isName()) + { + throw QPDFExc( + qpdf_e_damaged_pdf, + input->getName(), object_description, offset, + std::string("dictionary key not name (") + + key_obj.unparse() + ")"); + } + dict[key_obj.getName()] = val; + } + object = newDictionary(dict); + } + + return object; +} + QPDFObjectHandle QPDFObjectHandle::newIndirect(QPDF* qpdf, int objid, int generation) { diff --git a/libqpdf/QPDF_Stream.cc b/libqpdf/QPDF_Stream.cc index c089bcc1..970ee58b 100644 --- a/libqpdf/QPDF_Stream.cc +++ b/libqpdf/QPDF_Stream.cc @@ -464,3 +464,18 @@ QPDF_Stream::replaceFilterData(QPDFObjectHandle const& filter, "/Length", QPDFObjectHandle::newInteger((int)length)); } } + +void +QPDF_Stream::replaceDict(QPDFObjectHandle new_dict) +{ + this->stream_dict = new_dict; + QPDFObjectHandle length_obj = new_dict.getKey("/Length"); + if (length_obj.isInteger()) + { + this->length = length_obj.getIntValue(); + } + else + { + this->length = 0; + } +} diff --git a/libqpdf/qpdf/QPDF_Stream.hh b/libqpdf/qpdf/QPDF_Stream.hh index ce46d994..34eaceeb 100644 --- a/libqpdf/qpdf/QPDF_Stream.hh +++ b/libqpdf/qpdf/QPDF_Stream.hh @@ -32,6 +32,8 @@ class QPDF_Stream: public QPDFObject QPDFObjectHandle const& filter, QPDFObjectHandle const& decode_parms); + void replaceDict(QPDFObjectHandle new_dict); + // Replace object ID and generation. This may only be called if // object ID and generation are 0. It is used by QPDFObjectHandle // when adding streams to files. 
diff --git a/qpdf/pdf_from_scratch.cc b/qpdf/pdf_from_scratch.cc index 2f853c24..10ef5512 100644 --- a/qpdf/pdf_from_scratch.cc +++ b/qpdf/pdf_from_scratch.cc @@ -38,25 +38,20 @@ void runtest(int n) // Create a minimal PDF from scratch. QPDFObjectHandle font = pdf.makeIndirectObject( - QPDFObjectHandle::newDictionary()); - font.replaceKey("/Type", newName("/Font")); - font.replaceKey("/Subtype", newName("/Type1")); - font.replaceKey("/Name", newName("/F1")); - font.replaceKey("/BaseFont", newName("/Helvetica")); - font.replaceKey("/Encoding", newName("/WinAnsiEncoding")); + QPDFObjectHandle::parse("<<" + " /Type /Font" + " /Subtype /Type1" + " /Name /F1" + " /BaseFont /Helvetica" + " /Encoding /WinAnsiEncoding" + ">>")); QPDFObjectHandle procset = pdf.makeIndirectObject( - QPDFObjectHandle::newArray()); - procset.appendItem(newName("/PDF")); - procset.appendItem(newName("/Text")); + QPDFObjectHandle::parse("[/PDF /Text]")); QPDFObjectHandle contents = createPageContents(pdf, "First Page"); - QPDFObjectHandle mediabox = QPDFObjectHandle::newArray(); - mediabox.appendItem(QPDFObjectHandle::newInteger(0)); - mediabox.appendItem(QPDFObjectHandle::newInteger(0)); - mediabox.appendItem(QPDFObjectHandle::newInteger(612)); - mediabox.appendItem(QPDFObjectHandle::newInteger(792)); + QPDFObjectHandle mediabox = QPDFObjectHandle::parse("[0 0 612 792]"); QPDFObjectHandle rfont = QPDFObjectHandle::newDictionary(); rfont.replaceKey("/F1", font); diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov index c51e527b..ae771c6d 100644 --- a/qpdf/qpdf.testcov +++ b/qpdf/qpdf.testcov @@ -60,13 +60,13 @@ QPDF missing trailer 0 QPDF trailer lacks size 0 QPDF trailer size not integer 0 QPDF trailer prev not integer 0 -QPDF bad brace 0 -QPDF bad array close 0 -QPDF dictionary odd number of elements 0 +QPDFObjectHandle bad brace 0 +QPDFObjectHandle bad array close 0 +QPDFObjectHandle dictionary odd number of elements 0 QPDF stream without length 0 QPDF stream length not integer 0 QPDF missing 
endstream 0 -QPDF bad dictionary close 0 +QPDFObjectHandle bad dictionary close 0 QPDF can't find xref 0 QPDF_Tokenizer bad ) 0 QPDF_Tokenizer bad > 0 @@ -235,3 +235,5 @@ QPDF not copying pages object 0 QPDF insert foreign page 0 QPDFWriter foreign object 0 QPDFWriter copy use_aes 1 +QPDFObjectHandle indirect without context 0 +QPDFObjectHandle trailing data in parse 0 diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test index 1b979724..02a90736 100644 --- a/qpdf/qtest/qpdf.test +++ b/qpdf/qtest/qpdf.test @@ -149,7 +149,7 @@ $td->runtest("remove page we don't have", $td->NORMALIZE_NEWLINES); # ---------- $td->notify("--- Miscellaneous Tests ---"); -$n_tests += 44; +$n_tests += 45; $td->runtest("qpdf version", {$td->COMMAND => "qpdf --version"}, @@ -370,6 +370,10 @@ $td->runtest("detect foreign object in write", " copy-foreign-objects-in.pdf minimal.pdf"}, {$td->FILE => "foreign-in-write.out", $td->EXIT_STATUS => 0}, $td->NORMALIZE_NEWLINES); +$td->runtest("parse objects from string", + {$td->COMMAND => "test_driver 31 minimal.pdf"}, # file not used + {$td->FILE => "parse-object.out", $td->EXIT_STATUS => 0}, + $td->NORMALIZE_NEWLINES); show_ntests(); # ---------- diff --git a/qpdf/qtest/qpdf/bad22.out b/qpdf/qtest/qpdf/bad22.out index 2ff4de23..ec6d5f8e 100644 --- a/qpdf/qtest/qpdf/bad22.out +++ b/qpdf/qtest/qpdf/bad22.out @@ -1 +1 @@ -bad22.pdf (object 4 0, file position 317): stream dictionary lacks /Length key +bad22.pdf (object 4 0, file position 314): stream dictionary lacks /Length key diff --git a/qpdf/qtest/qpdf/bad23.out b/qpdf/qtest/qpdf/bad23.out index 9ff20de3..b4cf25e8 100644 --- a/qpdf/qtest/qpdf/bad23.out +++ b/qpdf/qtest/qpdf/bad23.out @@ -1 +1 @@ -bad23.pdf (object 4 0, file position 317): /Length key in stream dictionary is not an integer +bad23.pdf (object 4 0, file position 314): /Length key in stream dictionary is not an integer diff --git a/qpdf/qtest/qpdf/parse-object.out b/qpdf/qtest/qpdf/parse-object.out new file mode 100644 index 
00000000..456e2f80 --- /dev/null +++ b/qpdf/qtest/qpdf/parse-object.out @@ -0,0 +1,4 @@ +[ /name 16059 3.14159 false << /key true /other [ (string1) (string2) ] >> null ] +logic error parsing indirect: QPDFObjectHandle::parse called without context on an object with indirect references +trailing data: parsed object (trailing test): trailing data found parsing object from string +test 31 done diff --git a/qpdf/test_driver.cc b/qpdf/test_driver.cc index 3d2f0dca..6cbb7882 100644 --- a/qpdf/test_driver.cc +++ b/qpdf/test_driver.cc @@ -1054,6 +1054,38 @@ void runtest(int n, char const* filename1, char const* filename2) << std::endl; } } + else if (n == 31) + { + // Test object parsing from a string. The input file is not used. + + QPDFObjectHandle o1 = + QPDFObjectHandle::parse( + "[/name 16059 3.14159 false\n" + " << /key true /other [ (string1) (string2) ] >> null]"); + std::cout << o1.unparse() << std::endl; + QPDFObjectHandle o2 = QPDFObjectHandle::parse(" 12345 \f "); + assert(o2.isInteger() && (o2.getIntValue() == 12345)); + try + { + QPDFObjectHandle::parse("[1 0 R]", "indirect test"); + std::cout << "oops -- didn't throw" << std::endl; + } + catch (std::logic_error e) + { + std::cout << "logic error parsing indirect: " << e.what() + << std::endl; + } + try + { + QPDFObjectHandle::parse("0 trailing", "trailing test"); + std::cout << "oops -- didn't throw" << std::endl; + } + catch (std::runtime_error e) + { + std::cout << "trailing data: " << e.what() + << std::endl; + } + } else { throw std::runtime_error(std::string("invalid test ") +