From 959ae4b4da48801b09219a9258d0e2c6733b884d Mon Sep 17 00:00:00 2001
From: m-holger <m.holger@qpdf.org>
Date: Sat, 27 Jul 2024 16:26:19 +0100
Subject: [PATCH 1/5] Avoid unnecessary string copies in
 ContentNormalizer::handleToken

---
 libqpdf/ContentNormalizer.cc | 12 ++++++------
 libqpdf/QPDFObjectHandle.cc  |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/libqpdf/ContentNormalizer.cc b/libqpdf/ContentNormalizer.cc
index 47830f42..9da6fa1c 100644
--- a/libqpdf/ContentNormalizer.cc
+++ b/libqpdf/ContentNormalizer.cc
@@ -11,7 +11,6 @@ ContentNormalizer::ContentNormalizer() :
 void
 ContentNormalizer::handleToken(QPDFTokenizer::Token const& token)
 {
-    std::string value = token.getRawValue();
     QPDFTokenizer::token_type_e token_type = token.getType();
 
     if (token_type == QPDFTokenizer::tt_bad) {
@@ -24,6 +23,7 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const& token)
     switch (token_type) {
     case QPDFTokenizer::tt_space:
         {
+            std::string const& value = token.getRawValue();
             size_t len = value.length();
             for (size_t i = 0; i < len; ++i) {
                 char ch = value.at(i);
@@ -38,7 +38,7 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const& token)
                 }
             }
         }
-        break;
+        return;
 
     case QPDFTokenizer::tt_string:
         // Replacing string and name tokens in this way normalizes their representation as this will
@@ -52,12 +52,12 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const& token)
 
     default:
         writeToken(token);
-        break;
+        return;
     }
 
-    value = token.getRawValue();
-    if (((token_type == QPDFTokenizer::tt_string) || (token_type == QPDFTokenizer::tt_name)) &&
-        ((value.find('\r') != std::string::npos) || (value.find('\n') != std::string::npos))) {
+    // tt_string or tt_name
+    std::string const& value = token.getRawValue();
+    if (value.find('\r') != std::string::npos || value.find('\n') != std::string::npos) {
         write("\n");
     }
 }
diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc
index 4eaa2236..f2ab1d3e 100644
--- a/libqpdf/QPDFObjectHandle.cc
+++ b/libqpdf/QPDFObjectHandle.cc
@@ -148,7 +148,7 @@ QPDFObjectHandle::TokenFilter::write(std::string const& str)
 void
 QPDFObjectHandle::TokenFilter::writeToken(QPDFTokenizer::Token const& token)
 {
-    std::string value = token.getRawValue();
+    std::string const& value = token.getRawValue();
     write(value.c_str(), value.length());
 }
 

From ffe462e67eca12a728bf11a2a6f2034e82c2d496 Mon Sep 17 00:00:00 2001
From: m-holger <m.holger@qpdf.org>
Date: Sat, 27 Jul 2024 16:45:58 +0100
Subject: [PATCH 2/5] In ContentNormalizer::handleToken refactor handling of
 string and name tokens

---
 libqpdf/ContentNormalizer.cc         | 5 +++--
 libqpdf/QPDFOutlineDocumentHelper.cc | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/libqpdf/ContentNormalizer.cc b/libqpdf/ContentNormalizer.cc
index 9da6fa1c..2582e729 100644
--- a/libqpdf/ContentNormalizer.cc
+++ b/libqpdf/ContentNormalizer.cc
@@ -1,5 +1,6 @@
 #include <qpdf/ContentNormalizer.hh>
 
+#include <qpdf/QPDF_Name.hh>
 #include <qpdf/QUtil.hh>
 
 ContentNormalizer::ContentNormalizer() :
@@ -43,11 +44,11 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const& token)
     case QPDFTokenizer::tt_string:
         // Replacing string and name tokens in this way normalizes their representation as this will
         // automatically handle quoting of unprintable characters, etc.
-        writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_string, token.getValue()));
+        write(QPDFObjectHandle::newString(token.getValue()).unparse());
         break;
 
     case QPDFTokenizer::tt_name:
-        writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_name, token.getValue()));
+        write(QPDF_Name::normalizeName(token.getValue()));
         break;
 
     default:
diff --git a/libqpdf/QPDFOutlineDocumentHelper.cc b/libqpdf/QPDFOutlineDocumentHelper.cc
index dd9db5c1..ba4e4291 100644
--- a/libqpdf/QPDFOutlineDocumentHelper.cc
+++ b/libqpdf/QPDFOutlineDocumentHelper.cc
@@ -71,7 +71,7 @@ QPDFOutlineDocumentHelper::resolveNamedDest(QPDFObjectHandle name)
             m->dest_dict = qpdf.getRoot().getKey("/Dests");
         }
         QTC::TC("qpdf", "QPDFOutlineDocumentHelper name named dest");
-        result=  m->dest_dict.getKeyIfDict(name.getName());
+        result = m->dest_dict.getKeyIfDict(name.getName());
     } else if (name.isString()) {
         if (!m->names_dest) {
             auto dests = qpdf.getRoot().getKey("/Names").getKeyIfDict("/Dests");

From 4783b223121d4f1133f0bb14dfd62833d7348a07 Mon Sep 17 00:00:00 2001
From: m-holger <m.holger@qpdf.org>
Date: Sat, 27 Jul 2024 18:06:12 +0100
Subject: [PATCH 3/5] In ContentNormalizer::handleToken refactor handling of
 space tokens

Avoid writing each space char individually.
---
 libqpdf/ContentNormalizer.cc | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/libqpdf/ContentNormalizer.cc b/libqpdf/ContentNormalizer.cc
index 2582e729..bca8ad66 100644
--- a/libqpdf/ContentNormalizer.cc
+++ b/libqpdf/ContentNormalizer.cc
@@ -25,18 +25,25 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const& token)
     case QPDFTokenizer::tt_space:
         {
             std::string const& value = token.getRawValue();
-            size_t len = value.length();
-            for (size_t i = 0; i < len; ++i) {
-                char ch = value.at(i);
-                if (ch == '\r') {
-                    if ((i + 1 < len) && (value.at(i + 1) == '\n')) {
-                        // ignore
-                    } else {
-                        write("\n");
-                    }
-                } else {
-                    write(&ch, 1);
+            auto size = value.size();
+            size_t pos = 0;
+            auto r_pos = value.find('\r');
+            while (r_pos != std::string::npos) {
+                if (pos != r_pos) {
+                    write(&value[pos], r_pos - pos);
                 }
+                if (++r_pos >= size) {
+                    write("\n");
+                    return;
+                }
+                if (value[r_pos] != '\n') {
+                    write("\n");
+                }
+                pos = r_pos;
+                r_pos = value.find('\r', pos);
+            }
+            if (pos < size) {
+                write(&value[pos], size - pos);
             }
         }
         return;

From 986a253cdd504c6e2c9d05c5af77e1b4578637bd Mon Sep 17 00:00:00 2001
From: m-holger <m.holger@qpdf.org>
Date: Thu, 25 Jul 2024 16:42:29 +0100
Subject: [PATCH 4/5] Overload QPDFTokenizer::findEI to take an InputSource&

---
 include/qpdf/QPDFTokenizer.hh |  4 +++-
 libqpdf/QPDFTokenizer.cc      | 40 ++++++++++++++++++-----------------
 2 files changed, 24 insertions(+), 20 deletions(-)

diff --git a/include/qpdf/QPDFTokenizer.hh b/include/qpdf/QPDFTokenizer.hh
index 15f7a773..ec9bbc1b 100644
--- a/include/qpdf/QPDFTokenizer.hh
+++ b/include/qpdf/QPDFTokenizer.hh
@@ -191,6 +191,8 @@ class QPDFTokenizer
     // returns a tt_inline_image token.
     QPDF_DLL
     void expectInlineImage(std::shared_ptr<InputSource> input);
+    QPDF_DLL
+    void expectInlineImage(InputSource& input);
 
   private:
     friend class QPDFParser;
@@ -217,7 +219,7 @@ class QPDFTokenizer
 
     bool isSpace(char);
     bool isDelimiter(char);
-    void findEI(std::shared_ptr<InputSource> input);
+    void findEI(InputSource& input);
 
     enum state_e {
         st_top,
diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc
index ca09708a..9b789883 100644
--- a/libqpdf/QPDFTokenizer.cc
+++ b/libqpdf/QPDFTokenizer.cc
@@ -27,7 +27,7 @@ namespace
     class QPDFWordTokenFinder: public InputSource::Finder
     {
       public:
-        QPDFWordTokenFinder(std::shared_ptr<InputSource> is, std::string const& str) :
+        QPDFWordTokenFinder(InputSource& is, std::string const& str) :
             is(is),
             str(str)
         {
@@ -36,7 +36,7 @@ namespace
         bool check() override;
 
       private:
-        std::shared_ptr<InputSource> is;
+        InputSource& is;
         std::string str;
     };
 } // namespace
@@ -48,21 +48,21 @@ QPDFWordTokenFinder::check()
     // delimiter or EOF.
     QPDFTokenizer tokenizer;
     QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true);
-    qpdf_offset_t pos = is->tell();
+    qpdf_offset_t pos = is.tell();
     if (!(t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str))) {
         QTC::TC("qpdf", "QPDFTokenizer finder found wrong word");
         return false;
     }
-    qpdf_offset_t token_start = is->getLastOffset();
+    qpdf_offset_t token_start = is.getLastOffset();
     char next;
     bool next_okay = false;
-    if (is->read(&next, 1) == 0) {
+    if (is.read(&next, 1) == 0) {
         QTC::TC("qpdf", "QPDFTokenizer inline image at EOF");
         next_okay = true;
     } else {
         next_okay = is_delimiter(next);
     }
-    is->seek(pos, SEEK_SET);
+    is.seek(pos, SEEK_SET);
     if (!next_okay) {
         return false;
     }
@@ -763,12 +763,18 @@ QPDFTokenizer::presentEOF()
 
 void
 QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input)
+{
+    expectInlineImage(*input);
+}
+
+void
+QPDFTokenizer::expectInlineImage(InputSource& input)
 {
     if (this->state == st_token_ready) {
         reset();
     } else if (this->state != st_before_token) {
-        throw std::logic_error("QPDFTokenizer::expectInlineImage called"
-                               " when tokenizer is in improper state");
+        throw std::logic_error(
+            "QPDFTokenizer::expectInlineImage called when tokenizer is in improper state");
     }
     findEI(input);
     this->before_token = false;
@@ -777,14 +783,10 @@ QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input)
 }
 
 void
-QPDFTokenizer::findEI(std::shared_ptr<InputSource> input)
+QPDFTokenizer::findEI(InputSource& input)
 {
-    if (!input.get()) {
-        return;
-    }
-
-    qpdf_offset_t last_offset = input->getLastOffset();
-    qpdf_offset_t pos = input->tell();
+    qpdf_offset_t last_offset = input.getLastOffset();
+    qpdf_offset_t pos = input.tell();
 
     // Use QPDFWordTokenFinder to find EI surrounded by delimiters. Then read the next several
     // tokens or up to EOF. If we find any suspicious-looking or tokens, this is probably still part
@@ -797,10 +799,10 @@ QPDFTokenizer::findEI(std::shared_ptr<InputSource> input)
     bool first_try = true;
     while (!okay) {
         QPDFWordTokenFinder f(input, "EI");
-        if (!input->findFirst("EI", input->tell(), 0, f)) {
+        if (!input.findFirst("EI", input.tell(), 0, f)) {
             break;
         }
-        this->inline_image_bytes = QIntC::to_size(input->tell() - pos - 2);
+        inline_image_bytes = QIntC::to_size(input.tell() - pos - 2);
 
         QPDFTokenizer check;
         bool found_bad = false;
@@ -858,8 +860,8 @@ QPDFTokenizer::findEI(std::shared_ptr<InputSource> input)
         QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try");
     }
 
-    input->seek(pos, SEEK_SET);
-    input->setLastOffset(last_offset);
+    input.seek(pos, SEEK_SET);
+    input.setLastOffset(last_offset);
 }
 
 bool

From 1536a76071494306ec8661388356cf56ee53c86d Mon Sep 17 00:00:00 2001
From: m-holger <m.holger@qpdf.org>
Date: Sat, 27 Jul 2024 18:55:43 +0100
Subject: [PATCH 5/5] Refactor Pl_QPDFTokenizer::finish

Remove unnecessary use of shared pointers and avoid unnecessary string
creation.
---
 libqpdf/Pl_QPDFTokenizer.cc | 11 ++++-------
 libqpdf/QPDFTokenizer.cc    |  2 +-
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/libqpdf/Pl_QPDFTokenizer.cc b/libqpdf/Pl_QPDFTokenizer.cc
index 7bb86d5f..91973fc4 100644
--- a/libqpdf/Pl_QPDFTokenizer.cc
+++ b/libqpdf/Pl_QPDFTokenizer.cc
@@ -36,20 +36,17 @@ void
 Pl_QPDFTokenizer::finish()
 {
     m->buf.finish();
-    auto input = std::shared_ptr<InputSource>(
-        // line-break
-        new BufferInputSource("tokenizer data", m->buf.getBuffer(), true));
-
+    auto input = BufferInputSource("tokenizer data", m->buf.getBuffer(), true);
+    std::string empty;
     while (true) {
-        QPDFTokenizer::Token token =
-            m->tokenizer.readToken(input, "offset " + std::to_string(input->tell()), true);
+        auto token = m->tokenizer.readToken(input, empty, true);
         m->filter->handleToken(token);
         if (token.getType() == QPDFTokenizer::tt_eof) {
             break;
         } else if (token.isWord("ID")) {
             // Read the space after the ID.
             char ch = ' ';
-            input->read(&ch, 1);
+            input.read(&ch, 1);
             m->filter->handleToken(
                 // line-break
                 QPDFTokenizer::Token(QPDFTokenizer::tt_space, std::string(1, ch)));
diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc
index 9b789883..d48abd3e 100644
--- a/libqpdf/QPDFTokenizer.cc
+++ b/libqpdf/QPDFTokenizer.cc
@@ -904,7 +904,7 @@ QPDFTokenizer::readToken(
             throw QPDFExc(
                 qpdf_e_damaged_pdf,
                 input.getName(),
-                context,
+                context.empty() ? "offset " + std::to_string(input.getLastOffset()) : context,
                 input.getLastOffset(),
                 token.getErrorMessage());
         }