Split QPDFTokenizer::handleCharacter into individual methods

2024-12-22 10:58:58 +00:00 · 2022-08-21 20:08:58 +01:00 · 2022-08-21 20:08:58 +01:00 · a3f3238f37
commit a3f3238f37
parent 6111a6a424
2 changed files with 311 additions and 237 deletions
--- a/include/qpdf/QPDFTokenizer.hh
+++ b/include/qpdf/QPDFTokenizer.hh
@ -200,26 +200,36 @@ class QPDFTokenizer

    enum state_e {
        st_top,
+        st_in_hexstring,
+        st_in_string,
+        st_in_hexstring_2nd,
+        st_literal,
        st_in_space,
        st_in_comment,
-        st_in_string,
        st_string_escape,
        st_char_code,
        st_string_after_cr,
        st_lt,
        st_gt,
-        st_literal,
-        st_in_hexstring,
-        st_in_hexstring_2nd,
        st_inline_image,
        st_token_ready
    };

    void handleCharacter(char);
+    void inTop(char);
+    void inSpace(char);
+    void inComment(char);
+    void inString(char);
+    void inLt(char);
+    void inGt(char);
+    void inStringAfterCR(char);
+    void inStringEscape(char);
+    void inLiteral(char);
    void inCharCode(char);
    void inHexstring(char);
    void inHexstring2nd(char);
-    void inString(char);
+    void inInlineImage(char);
+    void inTokenReady(char);

    void reset();

--- a/libqpdf/QPDFTokenizer.cc
+++ b/libqpdf/QPDFTokenizer.cc
@ -217,134 +217,24 @@ QPDFTokenizer::handleCharacter(char ch)
    // the character that caused a state change in the new state.

    switch (this->state) {
-    case (st_token_ready):
-        throw std::logic_error(
-            "INTERNAL ERROR: QPDF tokenizer presented character "
-            "while token is waiting");
-
    case st_top:
-        // Note: we specifically do not use ctype here.  It is
-        // locale-dependent.
-        if (isSpace(ch)) {
-            if (this->include_ignorable) {
-                this->state = st_in_space;
-                this->val += ch;
-            }
-            return;
-        }
-        switch (ch) {
-        case '%':
-            this->state = st_in_comment;
-            if (this->include_ignorable) {
-                this->val += ch;
-            }
-            return;
-
-        case '(':
-            this->string_depth = 1;
-            this->state = st_in_string;
-            return;
-
-        case '<':
-            this->state = st_lt;
-            return;
-
-        case '>':
-            this->state = st_gt;
-            return;
-
-        case (')'):
-            this->type = tt_bad;
-            QTC::TC("qpdf", "QPDFTokenizer bad )");
-            this->error_message = "unexpected )";
-            this->val += ch;
-            this->state = st_token_ready;
-            return;
-
-        case '[':
-            this->type = tt_array_open;
-            this->state = st_token_ready;
-            this->val += ch;
-            return;
-
-        case ']':
-            this->type = tt_array_close;
-            this->val += ch;
-            this->state = st_token_ready;
-            return;
-
-        case '{':
-            this->type = tt_brace_open;
-            this->state = st_token_ready;
-            this->val += ch;
-            return;
-
-        case '}':
-            this->type = tt_brace_close;
-            this->state = st_token_ready;
-            this->val += ch;
-            return;
-
-        default:
-            this->state = st_literal;
-            this->val += ch;
-            return;
-        }
+        inTop(ch);
+        return;

    case st_in_space:
-        // We only enter this state if include_ignorable is true.
-        if (!isSpace(ch)) {
-            this->type = tt_space;
-            this->unread_char = true;
-            this->char_to_unread = ch;
-            this->state = st_token_ready;
-            return;
-        } else {
-            this->val += ch;
-            return;
-        }
+        inSpace(ch);
+        return;

    case st_in_comment:
-        if ((ch == '\r') || (ch == '\n')) {
-            if (this->include_ignorable) {
-                this->type = tt_comment;
-                this->unread_char = true;
-                this->char_to_unread = ch;
-                this->state = st_token_ready;
-            } else {
-                this->state = st_top;
-            }
-        } else if (this->include_ignorable) {
-            this->val += ch;
-        }
+        inComment(ch);
        return;

    case st_lt:
-        if (ch == '<') {
-            this->val += "<<";
-            this->type = tt_dict_open;
-            this->state = st_token_ready;
-            return;
-        }
-
-        this->state = st_in_hexstring;
-        inHexstring(ch);
+        inLt(ch);
        return;

    case st_gt:
-        if (ch == '>') {
-            this->val += ">>";
-            this->type = tt_dict_close;
-            this->state = st_token_ready;
-        } else {
-            this->val += ">";
-            this->type = tt_bad;
-            QTC::TC("qpdf", "QPDFTokenizer bad >");
-            this->error_message = "unexpected >";
-            this->unread_char = true;
-            this->char_to_unread = ch;
-            this->state = st_token_ready;
-        }
+        inGt(ch);
        return;

    case st_in_string:
@ -352,95 +242,25 @@ QPDFTokenizer::handleCharacter(char ch)
        return;

    case st_string_after_cr:
-        // CR LF in strings are either ignored or normalized to CR
-        this->state = st_in_string;
-        if (ch != '\n') {
-            inString(ch);
-        }
+        inStringAfterCR(ch);
        return;

    case st_string_escape:
-        this->state = st_in_string;
-        switch (ch) {
-        case '0':
-        case '1':
-        case '2':
-        case '3':
-        case '4':
-        case '5':
-        case '6':
-        case '7':
-            this->state = st_char_code;
-            this->char_code = 0;
-            this->digit_count = 0;
-            inCharCode(ch);
-            return;
-
-        case 'n':
-            this->val += '\n';
-            return;
-
-        case 'r':
-            this->val += '\r';
-            return;
-
-        case 't':
-            this->val += '\t';
-            return;
-
-        case 'b':
-            this->val += '\b';
-            return;
-
-        case 'f':
-            this->val += '\f';
-            return;
-
-        case '\n':
-            return;
-
-        case '\r':
-            this->state = st_string_after_cr;
-            return;
-
-        default:
-            // PDF spec says backslash is ignored before anything else
-            this->val += ch;
-            return;
-        }
+        inStringEscape(ch);
+        return;

    case st_char_code:
        inCharCode(ch);
        return;

    case st_literal:
-        if (isDelimiter(ch)) {
-            // A C-locale whitespace character or delimiter terminates
-            // token.  It is important to unread the whitespace
-            // character even though it is ignored since it may be the
-            // newline after a stream keyword.  Removing it here could
-            // make the stream-reading code break on some files,
-            // though not on any files in the test suite as of this
-            // writing.
-
-            this->type = tt_word;
-            this->unread_char = true;
-            this->char_to_unread = ch;
-            this->state = st_token_ready;
-        } else {
-            this->val += ch;
-        }
+        inLiteral(ch);
        return;

    case st_inline_image:
-        this->val += ch;
-        if (this->val.length() == this->inline_image_bytes) {
-            QTC::TC("qpdf", "QPDFTokenizer found EI by byte count");
-            this->type = tt_inline_image;
-            this->inline_image_bytes = 0;
-            this->state = st_token_ready;
-        }
+        // inInlineImage() appends ch to val itself; nothing else to do.
+        inInlineImage(ch);
         return;

    case st_in_hexstring:
        inHexstring(ch);
@ -450,12 +270,283 @@ QPDFTokenizer::handleCharacter(char ch)
        inHexstring2nd(ch);
        return;

+    case (st_token_ready):
+        inTokenReady(ch);
+        return;
+
    default:
        throw std::logic_error(
            "INTERNAL ERROR: invalid state while reading token");
    }
 }

+// Handler for st_token_ready.  Being handed a character in this state
+// is a caller error, so this always throws std::logic_error.  The
+// parameter is left unnamed because the character is never examined.
+void
+QPDFTokenizer::inTokenReady(char)
+{
+    throw std::logic_error("INTERNAL ERROR: QPDF tokenizer presented character "
+                           "while token is waiting");
+}
+
+void
+QPDFTokenizer::inTop(char ch)
+{
+    // Handles the first character of a prospective token.  ctype is
+    // deliberately not used here: it is locale-dependent.
+    if (isSpace(ch)) {
+        // Whitespace is only collected when ignorable tokens were
+        // requested; otherwise it is dropped and the state is left
+        // unchanged.
+        if (this->include_ignorable) {
+            this->val += ch;
+            this->state = st_in_space;
+        }
+        return;
+    }
+
+    switch (ch) {
+    case '%':
+        // Start of a comment; the text is kept only on request.
+        if (this->include_ignorable) {
+            this->val += ch;
+        }
+        this->state = st_in_comment;
+        break;
+
+    case '(':
+        // Start of a string; the nesting depth starts at one.
+        this->state = st_in_string;
+        this->string_depth = 1;
+        break;
+
+    case '<':
+        // The following character is examined in st_lt.
+        this->state = st_lt;
+        break;
+
+    case '>':
+        // The following character is examined in st_gt.
+        this->state = st_gt;
+        break;
+
+    case ')':
+        // A closing parenthesis at top level is reported as a bad token.
+        this->val += ch;
+        this->type = tt_bad;
+        QTC::TC("qpdf", "QPDFTokenizer bad )");
+        this->error_message = "unexpected )";
+        this->state = st_token_ready;
+        break;
+
+    case '[':
+        this->val += ch;
+        this->type = tt_array_open;
+        this->state = st_token_ready;
+        break;
+
+    case ']':
+        this->val += ch;
+        this->type = tt_array_close;
+        this->state = st_token_ready;
+        break;
+
+    case '{':
+        this->val += ch;
+        this->type = tt_brace_open;
+        this->state = st_token_ready;
+        break;
+
+    case '}':
+        this->val += ch;
+        this->type = tt_brace_close;
+        this->state = st_token_ready;
+        break;
+
+    default:
+        // Anything else begins a literal.
+        this->val += ch;
+        this->state = st_literal;
+        break;
+    }
+}
+
+void
+QPDFTokenizer::inSpace(char ch)
+{
+    // This state is only entered when include_ignorable is true.
+    if (isSpace(ch)) {
+        // Still inside the run of whitespace: keep collecting.
+        this->val += ch;
+        return;
+    }
+
+    // The first non-space character ends the space token.  It is not
+    // part of the token, so record it as the character to unread.
+    this->char_to_unread = ch;
+    this->unread_char = true;
+    this->type = tt_space;
+    this->state = st_token_ready;
+}
+
+void
+QPDFTokenizer::inComment(char ch)
+{
+    const bool at_line_end = (ch == '\r') || (ch == '\n');
+
+    if (!at_line_end) {
+        // Comment text is only accumulated when ignorable tokens are
+        // being reported.
+        if (this->include_ignorable) {
+            this->val += ch;
+        }
+        return;
+    }
+
+    if (!this->include_ignorable) {
+        // The comment is discarded: go straight back to the top state.
+        this->state = st_top;
+        return;
+    }
+
+    // CR or LF ends the comment token; the line terminator itself is
+    // recorded as the character to unread.
+    this->char_to_unread = ch;
+    this->unread_char = true;
+    this->type = tt_comment;
+    this->state = st_token_ready;
+}
+
+void
+QPDFTokenizer::inString(char ch)
+{
+    // Processes one character inside the body of a parenthesized string.
+    if (ch == '\\') {
+        // Backslash: the next character is handled in st_string_escape.
+        this->state = st_string_escape;
+        return;
+    }
+
+    if (ch == '\r') {
+        // CR by itself is converted to LF
+        this->val += '\n';
+        this->state = st_string_after_cr;
+        return;
+    }
+
+    if (ch == ')') {
+        if (--this->string_depth == 0) {
+            // The outermost parenthesis closes the string; it is not
+            // added to the value.
+            this->type = tt_string;
+            this->state = st_token_ready;
+            return;
+        }
+    } else if (ch == '(') {
+        ++this->string_depth;
+    }
+
+    // Nested parentheses, LF and every other character are appended
+    // to the value unchanged.
+    this->val += ch;
+}
+
+void
+QPDFTokenizer::inStringEscape(char ch)
+{
+    // Handles the character that follows a backslash inside a string.
+    if ((ch >= '0') && (ch <= '7')) {
+        // An octal digit starts a character code; inCharCode() takes
+        // this first digit.
+        this->char_code = 0;
+        this->digit_count = 0;
+        this->state = st_char_code;
+        inCharCode(ch);
+        return;
+    }
+
+    // Every remaining case returns to the string body unless it
+    // overrides the state below.
+    this->state = st_in_string;
+
+    switch (ch) {
+    case 'n':
+        this->val += '\n';
+        break;
+
+    case 'r':
+        this->val += '\r';
+        break;
+
+    case 't':
+        this->val += '\t';
+        break;
+
+    case 'b':
+        this->val += '\b';
+        break;
+
+    case 'f':
+        this->val += '\f';
+        break;
+
+    case '\n':
+        // Backslash followed by LF adds nothing to the value.
+        break;
+
+    case '\r':
+        // Backslash followed by CR adds nothing either; the next
+        // character is handled in st_string_after_cr.
+        this->state = st_string_after_cr;
+        break;
+
+    default:
+        // PDF spec says backslash is ignored before anything else
+        this->val += ch;
+        break;
+    }
+}
+
+void
+QPDFTokenizer::inStringAfterCR(char ch)
+{
+    // Whatever follows the CR, we are back in the body of the string.
+    this->state = st_in_string;
+
+    if (ch == '\n') {
+        // An LF immediately after a CR is consumed without being
+        // added to the value.
+        return;
+    }
+
+    // Any other character is processed as ordinary string content.
+    inString(ch);
+}
+
+void
+QPDFTokenizer::inLt(char ch)
+{
+    // Decides what a leading '<' meant, based on the character after it.
+    if (ch != '<') {
+        // A single '<' starts a hex string; ch is passed on to
+        // inHexstring() as its first character.
+        this->state = st_in_hexstring;
+        inHexstring(ch);
+        return;
+    }
+
+    // "<<" is a complete dictionary-open token.
+    this->type = tt_dict_open;
+    this->val += "<<";
+    this->state = st_token_ready;
+}
+
+void
+QPDFTokenizer::inGt(char ch)
+{
+    // Decides what a leading '>' meant, based on the character after
+    // it.  Either way a token is ready afterwards.
+    const bool closes_dict = (ch == '>');
+
+    if (closes_dict) {
+        // ">>" is a complete dictionary-close token.
+        this->type = tt_dict_close;
+        this->val += ">>";
+    } else {
+        // A lone '>' is reported as a bad token; the character that
+        // followed it is recorded as the character to unread.
+        this->type = tt_bad;
+        QTC::TC("qpdf", "QPDFTokenizer bad >");
+        this->error_message = "unexpected >";
+        this->val += ">";
+        this->char_to_unread = ch;
+        this->unread_char = true;
+    }
+
+    this->state = st_token_ready;
+}
+
+void
+QPDFTokenizer::inLiteral(char ch)
+{
+    if (!isDelimiter(ch)) {
+        // Still inside the word: keep accumulating.
+        this->val += ch;
+        return;
+    }
+
+    // A C-locale whitespace character or delimiter ends the token.
+    // The terminating character must be unread even when it is
+    // whitespace that would otherwise be ignored, because it may be
+    // the newline after a stream keyword.  Dropping it here could
+    // break the stream-reading code on some files, though not on any
+    // file in the test suite as of this writing.
+    this->char_to_unread = ch;
+    this->unread_char = true;
+    this->type = tt_word;
+    this->state = st_token_ready;
+}
+
 void
 QPDFTokenizer::inHexstring(char ch)
 {
@ -520,45 +611,6 @@ QPDFTokenizer::inHexstring2nd(char ch)
    }
 }

-void
-QPDFTokenizer::inString(char ch)
-{
-    switch (ch) {
-    case '\\':
-        this->state = st_string_escape;
-        return;
-
-    case '(':
-        this->val += ch;
-        ++this->string_depth;
-        return;
-
-    case ')':
-        if (--this->string_depth == 0) {
-            this->type = tt_string;
-            this->state = st_token_ready;
-            return;
-        }
-
-        this->val += ch;
-        return;
-
-    case '\r':
-        // CR by itself is converted to LF
-        this->val += '\n';
-        this->state = st_string_after_cr;
-        return;
-
-    case '\n':
-        this->val += ch;
-        return;
-
-    default:
-        this->val += ch;
-        return;
-    }
-}
-
 void
 QPDFTokenizer::inCharCode(char ch)
 {
@ -575,6 +627,18 @@ QPDFTokenizer::inCharCode(char ch)
    return;
 }

+void
+QPDFTokenizer::inInlineImage(char ch)
+{
+    // Every character is collected until the accumulated length
+    // equals inline_image_bytes.
+    this->val += ch;
+    if (this->val.length() != this->inline_image_bytes) {
+        return;
+    }
+
+    // Byte count reached: emit the inline image token and reset the
+    // counter.
+    QTC::TC("qpdf", "QPDFTokenizer found EI by byte count");
+    this->inline_image_bytes = 0;
+    this->type = tt_inline_image;
+    this->state = st_token_ready;
+}
+
 void
 QPDFTokenizer::presentEOF()
 {