From 959ae4b4da48801b09219a9258d0e2c6733b884d Mon Sep 17 00:00:00 2001 From: m-holger Date: Sat, 27 Jul 2024 16:26:19 +0100 Subject: [PATCH 1/5] Avoid unnecessary string copies in ContentNormalizer::handleToken --- libqpdf/ContentNormalizer.cc | 12 ++++++------ libqpdf/QPDFObjectHandle.cc | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/libqpdf/ContentNormalizer.cc b/libqpdf/ContentNormalizer.cc index 47830f42..9da6fa1c 100644 --- a/libqpdf/ContentNormalizer.cc +++ b/libqpdf/ContentNormalizer.cc @@ -11,7 +11,6 @@ ContentNormalizer::ContentNormalizer() : void ContentNormalizer::handleToken(QPDFTokenizer::Token const& token) { - std::string value = token.getRawValue(); QPDFTokenizer::token_type_e token_type = token.getType(); if (token_type == QPDFTokenizer::tt_bad) { @@ -24,6 +23,7 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const& token) switch (token_type) { case QPDFTokenizer::tt_space: { + std::string const& value = token.getRawValue(); size_t len = value.length(); for (size_t i = 0; i < len; ++i) { char ch = value.at(i); @@ -38,7 +38,7 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const& token) } } } - break; + return; case QPDFTokenizer::tt_string: // Replacing string and name tokens in this way normalizes their representation as this will @@ -52,12 +52,12 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const& token) default: writeToken(token); - break; + return; } - value = token.getRawValue(); - if (((token_type == QPDFTokenizer::tt_string) || (token_type == QPDFTokenizer::tt_name)) && - ((value.find('\r') != std::string::npos) || (value.find('\n') != std::string::npos))) { + // tt_string or tt_name + std::string const& value = token.getRawValue(); + if (value.find('\r') != std::string::npos || value.find('\n') != std::string::npos) { write("\n"); } } diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc index 4eaa2236..f2ab1d3e 100644 --- a/libqpdf/QPDFObjectHandle.cc +++ 
b/libqpdf/QPDFObjectHandle.cc @@ -148,7 +148,7 @@ QPDFObjectHandle::TokenFilter::write(std::string const& str) void QPDFObjectHandle::TokenFilter::writeToken(QPDFTokenizer::Token const& token) { - std::string value = token.getRawValue(); + std::string const& value = token.getRawValue(); write(value.c_str(), value.length()); } From ffe462e67eca12a728bf11a2a6f2034e82c2d496 Mon Sep 17 00:00:00 2001 From: m-holger Date: Sat, 27 Jul 2024 16:45:58 +0100 Subject: [PATCH 2/5] In ContentNormalizer::handleToken refactor handling of string and name tokens --- libqpdf/ContentNormalizer.cc | 5 +++-- libqpdf/QPDFOutlineDocumentHelper.cc | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/libqpdf/ContentNormalizer.cc b/libqpdf/ContentNormalizer.cc index 9da6fa1c..2582e729 100644 --- a/libqpdf/ContentNormalizer.cc +++ b/libqpdf/ContentNormalizer.cc @@ -1,5 +1,6 @@ #include +#include #include ContentNormalizer::ContentNormalizer() : @@ -43,11 +44,11 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const& token) case QPDFTokenizer::tt_string: // Replacing string and name tokens in this way normalizes their representation as this will // automatically handle quoting of unprintable characters, etc. 
- writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_string, token.getValue())); + write(QPDFObjectHandle::newString(token.getValue()).unparse()); break; case QPDFTokenizer::tt_name: - writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_name, token.getValue())); + write(QPDF_Name::normalizeName(token.getValue())); break; default: diff --git a/libqpdf/QPDFOutlineDocumentHelper.cc b/libqpdf/QPDFOutlineDocumentHelper.cc index dd9db5c1..ba4e4291 100644 --- a/libqpdf/QPDFOutlineDocumentHelper.cc +++ b/libqpdf/QPDFOutlineDocumentHelper.cc @@ -71,7 +71,7 @@ QPDFOutlineDocumentHelper::resolveNamedDest(QPDFObjectHandle name) m->dest_dict = qpdf.getRoot().getKey("/Dests"); } QTC::TC("qpdf", "QPDFOutlineDocumentHelper name named dest"); - result= m->dest_dict.getKeyIfDict(name.getName()); + result = m->dest_dict.getKeyIfDict(name.getName()); } else if (name.isString()) { if (!m->names_dest) { auto dests = qpdf.getRoot().getKey("/Names").getKeyIfDict("/Dests"); From 4783b223121d4f1133f0bb14dfd62833d7348a07 Mon Sep 17 00:00:00 2001 From: m-holger Date: Sat, 27 Jul 2024 18:06:12 +0100 Subject: [PATCH 3/5] In ContentNormalizer::handleToken refactor handling of space tokens Avoid writing each space char individually. 
--- libqpdf/ContentNormalizer.cc | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/libqpdf/ContentNormalizer.cc b/libqpdf/ContentNormalizer.cc index 2582e729..bca8ad66 100644 --- a/libqpdf/ContentNormalizer.cc +++ b/libqpdf/ContentNormalizer.cc @@ -25,18 +25,25 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const& token) case QPDFTokenizer::tt_space: { std::string const& value = token.getRawValue(); - size_t len = value.length(); - for (size_t i = 0; i < len; ++i) { - char ch = value.at(i); - if (ch == '\r') { - if ((i + 1 < len) && (value.at(i + 1) == '\n')) { - // ignore - } else { - write("\n"); - } - } else { - write(&ch, 1); + auto size = value.size(); + size_t pos = 0; + auto r_pos = value.find('\r'); + while (r_pos != std::string::npos) { + if (pos != r_pos) { + write(&value[pos], r_pos - pos); } + if (++r_pos >= size) { + write("\n"); + return; + } + if (value[r_pos] != '\n') { + write("\n"); + } + pos = r_pos; + r_pos = value.find('\r', pos); + } + if (pos < size) { + write(&value[pos], size - pos); } } return; From 986a253cdd504c6e2c9d05c5af77e1b4578637bd Mon Sep 17 00:00:00 2001 From: m-holger Date: Thu, 25 Jul 2024 16:42:29 +0100 Subject: [PATCH 4/5] Overload QPDFTokenizer::findEI to take an InputSource& --- include/qpdf/QPDFTokenizer.hh | 4 +++- libqpdf/QPDFTokenizer.cc | 40 ++++++++++++++++++----------------- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/include/qpdf/QPDFTokenizer.hh b/include/qpdf/QPDFTokenizer.hh index 15f7a773..ec9bbc1b 100644 --- a/include/qpdf/QPDFTokenizer.hh +++ b/include/qpdf/QPDFTokenizer.hh @@ -191,6 +191,8 @@ class QPDFTokenizer // returns a tt_inline_image token. 
QPDF_DLL void expectInlineImage(std::shared_ptr input); + QPDF_DLL + void expectInlineImage(InputSource& input); private: friend class QPDFParser; @@ -217,7 +219,7 @@ class QPDFTokenizer bool isSpace(char); bool isDelimiter(char); - void findEI(std::shared_ptr input); + void findEI(InputSource& input); enum state_e { st_top, diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc index ca09708a..9b789883 100644 --- a/libqpdf/QPDFTokenizer.cc +++ b/libqpdf/QPDFTokenizer.cc @@ -27,7 +27,7 @@ namespace class QPDFWordTokenFinder: public InputSource::Finder { public: - QPDFWordTokenFinder(std::shared_ptr is, std::string const& str) : + QPDFWordTokenFinder(InputSource& is, std::string const& str) : is(is), str(str) { @@ -36,7 +36,7 @@ namespace bool check() override; private: - std::shared_ptr is; + InputSource& is; std::string str; }; } // namespace @@ -48,21 +48,21 @@ QPDFWordTokenFinder::check() // delimiter or EOF. QPDFTokenizer tokenizer; QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true); - qpdf_offset_t pos = is->tell(); + qpdf_offset_t pos = is.tell(); if (!(t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str))) { QTC::TC("qpdf", "QPDFTokenizer finder found wrong word"); return false; } - qpdf_offset_t token_start = is->getLastOffset(); + qpdf_offset_t token_start = is.getLastOffset(); char next; bool next_okay = false; - if (is->read(&next, 1) == 0) { + if (is.read(&next, 1) == 0) { QTC::TC("qpdf", "QPDFTokenizer inline image at EOF"); next_okay = true; } else { next_okay = is_delimiter(next); } - is->seek(pos, SEEK_SET); + is.seek(pos, SEEK_SET); if (!next_okay) { return false; } @@ -763,12 +763,18 @@ QPDFTokenizer::presentEOF() void QPDFTokenizer::expectInlineImage(std::shared_ptr input) +{ + expectInlineImage(*input); +} + +void +QPDFTokenizer::expectInlineImage(InputSource& input) { if (this->state == st_token_ready) { reset(); } else if (this->state != st_before_token) { - throw std::logic_error("QPDFTokenizer::expectInlineImage 
called" - " when tokenizer is in improper state"); + throw std::logic_error( + "QPDFTokenizer::expectInlineImage called when tokenizer is in improper state"); } findEI(input); this->before_token = false; @@ -777,14 +783,10 @@ QPDFTokenizer::expectInlineImage(std::shared_ptr input) } void -QPDFTokenizer::findEI(std::shared_ptr input) +QPDFTokenizer::findEI(InputSource& input) { - if (!input.get()) { - return; - } - - qpdf_offset_t last_offset = input->getLastOffset(); - qpdf_offset_t pos = input->tell(); + qpdf_offset_t last_offset = input.getLastOffset(); + qpdf_offset_t pos = input.tell(); // Use QPDFWordTokenFinder to find EI surrounded by delimiters. Then read the next several // tokens or up to EOF. If we find any suspicious-looking or tokens, this is probably still part @@ -797,10 +799,10 @@ QPDFTokenizer::findEI(std::shared_ptr input) bool first_try = true; while (!okay) { QPDFWordTokenFinder f(input, "EI"); - if (!input->findFirst("EI", input->tell(), 0, f)) { + if (!input.findFirst("EI", input.tell(), 0, f)) { break; } - this->inline_image_bytes = QIntC::to_size(input->tell() - pos - 2); + inline_image_bytes = QIntC::to_size(input.tell() - pos - 2); QPDFTokenizer check; bool found_bad = false; @@ -858,8 +860,8 @@ QPDFTokenizer::findEI(std::shared_ptr input) QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try"); } - input->seek(pos, SEEK_SET); - input->setLastOffset(last_offset); + input.seek(pos, SEEK_SET); + input.setLastOffset(last_offset); } bool From 1536a76071494306ec8661388356cf56ee53c86d Mon Sep 17 00:00:00 2001 From: m-holger Date: Sat, 27 Jul 2024 18:55:43 +0100 Subject: [PATCH 5/5] Refactor Pl_QPDFTokenizer::finish Remove unnecessary use of shared pointers and avoid unnecessary string creation. 
--- libqpdf/Pl_QPDFTokenizer.cc | 11 ++++------- libqpdf/QPDFTokenizer.cc | 2 +- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/libqpdf/Pl_QPDFTokenizer.cc b/libqpdf/Pl_QPDFTokenizer.cc index 7bb86d5f..91973fc4 100644 --- a/libqpdf/Pl_QPDFTokenizer.cc +++ b/libqpdf/Pl_QPDFTokenizer.cc @@ -36,20 +36,17 @@ void Pl_QPDFTokenizer::finish() { m->buf.finish(); - auto input = std::shared_ptr( - // line-break - new BufferInputSource("tokenizer data", m->buf.getBuffer(), true)); - + auto input = BufferInputSource("tokenizer data", m->buf.getBuffer(), true); + std::string empty; while (true) { - QPDFTokenizer::Token token = - m->tokenizer.readToken(input, "offset " + std::to_string(input->tell()), true); + auto token = m->tokenizer.readToken(input, empty, true); m->filter->handleToken(token); if (token.getType() == QPDFTokenizer::tt_eof) { break; } else if (token.isWord("ID")) { // Read the space after the ID. char ch = ' '; - input->read(&ch, 1); + input.read(&ch, 1); m->filter->handleToken( // line-break QPDFTokenizer::Token(QPDFTokenizer::tt_space, std::string(1, ch))); diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc index 9b789883..d48abd3e 100644 --- a/libqpdf/QPDFTokenizer.cc +++ b/libqpdf/QPDFTokenizer.cc @@ -904,7 +904,7 @@ QPDFTokenizer::readToken( throw QPDFExc( qpdf_e_damaged_pdf, input.getName(), - context, + context.empty() ? "offset " + std::to_string(input.getLastOffset()) : context, input.getLastOffset(), token.getErrorMessage()); }