Add state st_string_escape in QPDFTokenizer

2024-12-22 10:58:58 +00:00 · 2022-08-20 12:01:49 +01:00 · 2022-08-20 12:01:49 +01:00 · 7c32f6cc2e
commit 7c32f6cc2e
parent 7c5778f999
2 changed files with 55 additions and 51 deletions
--- a/include/qpdf/QPDFTokenizer.hh
+++ b/include/qpdf/QPDFTokenizer.hh
@@ -203,6 +203,7 @@ class QPDFTokenizer
        st_in_space,
        st_in_comment,
        st_in_string,
+        st_string_escape,
        st_char_code,
        st_string_after_cr,
        st_lt,
@@ -238,7 +239,6 @@ class QPDFTokenizer
    // State for strings
    int string_depth;
    char bs_num_register[4];
-    bool last_char_was_bs;
 };

 #endif // QPDFTOKENIZER_HH
--- a/libqpdf/QPDFTokenizer.cc
+++ b/libqpdf/QPDFTokenizer.cc
@@ -85,7 +85,6 @@ QPDFTokenizer::reset()
    char_to_unread = '\0';
    inline_image_bytes = 0;
    string_depth = 0;
-    last_char_was_bs = false;
 }

 QPDFTokenizer::Token::Token(token_type_e type, std::string const& value) :
@@ -244,7 +243,6 @@ QPDFTokenizer::handleCharacter(char ch)
        case '(':
            this->string_depth = 1;
            memset(this->bs_num_register, '\0', sizeof(this->bs_num_register));
-            this->last_char_was_bs = false;
            this->state = st_in_string;
            return;

@@ -348,22 +346,66 @@ QPDFTokenizer::handleCharacter(char ch)
        return;

    case st_in_string:
-        {
-            inString(ch);
-            this->last_char_was_bs =
-                ((!this->last_char_was_bs) && (ch == '\\'));
-        }
+        inString(ch);
        return;

-    case (st_string_after_cr):
+    case st_string_after_cr:
        // CR LF in strings are either ignored or normalized to CR
        this->state = st_in_string;
        if (ch != '\n') {
-            handleCharacter(ch);
+            inString(ch);
        }
        return;

-    case (st_char_code):
+    case st_string_escape:
+        this->state = st_in_string;
+        switch (ch) {
+        case '0':
+        case '1':
+        case '2':
+        case '3':
+        case '4':
+        case '5':
+        case '6':
+        case '7':
+            this->state = st_char_code;
+            inCharCode(ch);
+            return;
+
+        case 'n':
+            this->val += '\n';
+            return;
+
+        case 'r':
+            this->val += '\r';
+            return;
+
+        case 't':
+            this->val += '\t';
+            return;
+
+        case 'b':
+            this->val += '\b';
+            return;
+
+        case 'f':
+            this->val += '\f';
+            return;
+
+        case '\n':
+            return;
+
+        case '\r':
+            this->state = st_string_after_cr;
+            return;
+
+        default:
+            // PDF spec says backslash is ignored before anything else
+            this->val += ch;
+            return;
+        }
+
+    case st_char_code:
        inCharCode(ch);
        return;

@@ -444,47 +486,9 @@ QPDFTokenizer::inHexstring(char ch)
 void
 QPDFTokenizer::inString(char ch)
 {
-    bool ch_is_octal = ((ch >= '0') && (ch <= '7'));
-    if (ch_is_octal && this->last_char_was_bs) {
-        this->state = st_char_code;
-        inCharCode(ch);
+    if (ch == '\\') {
+        this->state = st_string_escape;
        return;
-    } else if (this->last_char_was_bs) {
-        switch (ch) {
-        case 'n':
-            this->val += '\n';
-            return;
-
-        case 'r':
-            this->val += '\r';
-            return;
-
-        case 't':
-            this->val += '\t';
-            return;
-
-        case 'b':
-            this->val += '\b';
-            return;
-
-        case 'f':
-            this->val += '\f';
-            return;
-
-        case '\n':
-            return;
-
-        case '\r':
-            this->state = st_string_after_cr;
-            return;
-
-        default:
-            // PDF spec says backslash is ignored before anything else
-            this->val += ch;
-            return;
-        }
-    } else if (ch == '\\') {
-        // last_char_was_bs is set/cleared below as appropriate
    } else if (ch == '(') {
        this->val += ch;
        ++this->string_depth;