Lexer enhancements: EOF, comment, space

Significant enhancements to the lexer to improve EOF handling and to support comments and spaces as tokens. Various other minor issues were fixed as well.
2018-01-28 18:28:45 -05:00 · 2018-01-28 18:28:45 -05:00 · d97474868d
parent bb9e91adbd
commit d97474868d
22 changed files with 5084 additions and 89 deletions
--- a/32
+++ b/32
@ -1,3 +1,35 @@
+2018-02-04  Jay Berkenbilt  <ejb@ql.org>
+
+	* Significant lexer (tokenizer) enhancements. These are changes to
+	  the QPDFTokenizer class. These changes are of concern only to
+	  people who are operating with PDF files at the lexical layer
+	  using qpdf. They have little or no impact on most high-level
+	  interfaces or the command-line tool.
+	  * New token types tt_space and tt_comment to recognize
+	    whitespace and comments. This makes it possible to tokenize a
+	    PDF file or stream and preserve everything about it.
+	  * For backward compatibility, space and comment tokens are not
+	    returned by the tokenizer unless
+	    QPDFTokenizer.includeIgnorable() is called.
+	  * Better handling of null bytes. These are now included in space
+	    tokens rather than being their own "tt_word" tokens. This
+	    should have no impact on any correct PDF file and has no
+	    impact on output, but it may change offsets in some error
+	    messages when trying to parse contents of bad files. Under
+	    default operation, qpdf does not attempt to parse content
+	    streams, so this change is mostly invisible.
+	  * Bug fix to handling of bad tokens at ends of streams. Now,
+	    when allowEOF() has been called, these are treated as bad tokens
+	    (tt_bad or an exception, depending on invocation), and a
+	    separate tt_eof token is returned. Previously, the bad token
+	    contents were returned as the value of a tt_eof token. tt_eof
+	    tokens are always empty now.
+	  * Fix a bug that would, on rare occasions, report the offset in an
+	    error message in the wrong place because of spaces or comments
+	    adjacent to a bad token.
+	  * Clarify in comments exactly where the input source is
+	    positioned surrounding calls to readToken and getToken.
+
 2018-02-04  Jay Berkenbilt  <ejb@ql.org>

 	* Add QPDFWriter::setLinearizationPass1Filename method and
--- a/include/qpdf/QPDFTokenizer.hh
+++ b/include/qpdf/QPDFTokenizer.hh
@ -33,7 +33,8 @@ class QPDFTokenizer
 {
  public:
    // Token type tt_eof is only returned if allowEOF() is called on
-    // the tokenizer.  tt_eof was introduced in QPDF version 4.1.
+    // the tokenizer. tt_eof was introduced in QPDF version 4.1.
+    // tt_space and tt_comment were added in QPDF version 8.
    enum token_type_e
    {
 	tt_bad,
@ -51,6 +52,8 @@ class QPDFTokenizer
 	tt_bool,
 	tt_word,
        tt_eof,
+        tt_space,
+        tt_comment,
    };

    class Token
@ -120,6 +123,11 @@ class QPDFTokenizer
    QPDF_DLL
    void allowEOF();

+    // If called, readToken will return "ignorable" tokens for space
+    // and comments. This was added in QPDF 8.
+    QPDF_DLL
+    void includeIgnorable();
+
    // Mode of operation:

    // Keep presenting characters and calling getToken() until
@ -159,13 +167,15 @@ class QPDFTokenizer
  private:
    void reset();
    void resolveLiteral();
+    bool isSpace(char);

    // Lexer state
-    enum { st_top, st_in_comment, st_in_string, st_lt, st_gt,
+    enum { st_top, st_in_space, st_in_comment, st_in_string, st_lt, st_gt,
 	   st_literal, st_in_hexstring, st_token_ready } state;

    bool pound_special_in_name;
    bool allow_eof;
+    bool include_ignorable;

    // Current token accumulation
    token_type_e type;
--- a/libqpdf/FileInputSource.cc
+++ b/libqpdf/FileInputSource.cc
@ -120,15 +120,23 @@ FileInputSource::rewind()
 size_t
 FileInputSource::read(char* buffer, size_t length)
 {
-    this->last_offset = QUtil::tell(this->file);
+    this->last_offset = this->tell();
    size_t len = fread(buffer, 1, length, this->file);
-    if ((len == 0) && ferror(this->file))
+    if (len == 0)
    {
-	throw QPDFExc(qpdf_e_system,
-		      this->filename, "",
-		      this->last_offset,
-		      std::string("read ") +
-		      QUtil::int_to_string(length) + " bytes");
+        if (ferror(this->file))
+        {
+            throw QPDFExc(qpdf_e_system,
+                          this->filename, "",
+                          this->last_offset,
+                          std::string("read ") +
+                          QUtil::int_to_string(length) + " bytes");
+        }
+        else if (length > 0)
+        {
+            this->seek(0, SEEK_END);
+            this->last_offset = this->tell();
+        }
    }
    return len;
 }
--- a/libqpdf/QPDFTokenizer.cc
+++ b/libqpdf/QPDFTokenizer.cc
@ -14,7 +14,8 @@

 QPDFTokenizer::QPDFTokenizer() :
    pound_special_in_name(true),
-    allow_eof(false)
+    allow_eof(false),
+    include_ignorable(false)
 {
    reset();
 }
@ -32,6 +33,18 @@ QPDFTokenizer::allowEOF()
    this->allow_eof = true;
 }

+void
+QPDFTokenizer::includeIgnorable()
+{
+    this->include_ignorable = true;
+}
+
+bool
+QPDFTokenizer::isSpace(char ch)
+{
+    return ((ch == '\0') || QUtil::is_space(ch));
+}
+
 void
 QPDFTokenizer::reset()
 {
@ -148,14 +161,21 @@ QPDFTokenizer::presentCharacter(char ch)
    {
 	// Note: we specifically do not use ctype here.  It is
 	// locale-dependent.
-	if (strchr(" \t\n\v\f\r", ch))
+	if (isSpace(ch))
 	{
-	    // ignore
+            if (this->include_ignorable)
+            {
+                state = st_in_space;
+                val += ch;
+            }
 	}
 	else if (ch == '%')
 	{
-	    // Discard comments
 	    state = st_in_comment;
+            if (this->include_ignorable)
+            {
+                val += ch;
+            }
 	}
 	else if (ch == '(')
 	{
@ -209,12 +229,41 @@ QPDFTokenizer::presentCharacter(char ch)
 	    }
 	}
    }
+    else if (state == st_in_space)
+    {
+        // We only enter this state if include_ignorable is true.
+        if (! isSpace(ch))
+        {
+	    type = tt_space;
+	    unread_char = true;
+	    char_to_unread = ch;
+	    state = st_token_ready;
+        }
+        else
+        {
+            val += ch;
+        }
+    }
    else if (state == st_in_comment)
    {
 	if ((ch == '\r') || (ch == '\n'))
-	{
-	    state = st_top;
-	}
+        {
+            if (this->include_ignorable)
+            {
+                type = tt_comment;
+                unread_char = true;
+                char_to_unread = ch;
+                state = st_token_ready;
+            }
+            else
+            {
+                state = st_top;
+            }
+        }
+        else if (this->include_ignorable)
+        {
+            val += ch;
+        }
    }
    else if (state == st_lt)
    {
@ -397,7 +446,7 @@ QPDFTokenizer::presentCharacter(char ch)
 	{
 	    val += ch;
 	}
-	else if (strchr(" \t\n\v\f\r", ch))
+	else if (isSpace(ch))
 	{
 	    // ignore
 	}
@ -435,19 +484,23 @@ QPDFTokenizer::presentEOF()
        QTC::TC("qpdf", "QPDF_Tokenizer EOF reading appendable token");
        resolveLiteral();
    }
+    else if ((this->include_ignorable) && (state == st_in_space))
+    {
+        type = tt_space;
+    }
+    else if ((this->include_ignorable) && (state == st_in_comment))
+    {
+        type = tt_comment;
+    }
+    else if (betweenTokens())
+    {
+        type = tt_eof;
+    }
    else if (state != st_token_ready)
    {
-        QTC::TC("qpdf", "QPDF_Tokenizer EOF reading token",
-                this->allow_eof ? 1 : 0);
-        if ((this->allow_eof) && (state == st_top))
-        {
-            type = tt_eof;
-        }
-        else
-        {
-            type = tt_bad;
-            error_message = "EOF while reading token";
-        }
+        QTC::TC("qpdf", "QPDF_Tokenizer EOF reading token");
+        type = tt_bad;
+        error_message = "EOF while reading token";
    }

    state = st_token_ready;
@ -461,6 +514,10 @@ QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch)
    ch = this->char_to_unread;
    if (ready)
    {
+        if (type == tt_bad)
+        {
+            val = raw_val;
+        }
 	token = Token(type, val, raw_val, error_message);
 	reset();
    }
@ -470,7 +527,10 @@ QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch)
 bool
 QPDFTokenizer::betweenTokens()
 {
-    return ((state == st_top) || (state == st_in_comment));
+    return ((state == st_top) ||
+            ((! this->include_ignorable) &&
+             ((state == st_in_comment) ||
+              (state == st_in_space))));
 }

 QPDFTokenizer::Token
@ -493,6 +553,13 @@ QPDFTokenizer::readToken(PointerHolder<InputSource> input,
            {
                presentEOF();
                presented_eof = true;
+                if ((type == tt_eof) && (! this->allow_eof))
+                {
+                    QTC::TC("qpdf", "QPDF_Tokenizer EOF when not allowed");
+                    type = tt_bad;
+                    error_message = "unexpected EOF";
+                    offset = input->getLastOffset();
+                }
            }
            else
            {
@ -502,12 +569,11 @@ QPDFTokenizer::readToken(PointerHolder<InputSource> input,
 	}
 	else
 	{
-	    if (QUtil::is_space(static_cast<unsigned char>(ch)) &&
-		(input->getLastOffset() == offset))
+	    presentCharacter(ch);
+	    if (betweenTokens() && (input->getLastOffset() == offset))
 	    {
 		++offset;
 	    }
-	    presentCharacter(ch);
            if (max_len && (raw_val.length() >= max_len) &&
                (this->state != st_token_ready))
            {
@ -515,6 +581,8 @@ QPDFTokenizer::readToken(PointerHolder<InputSource> input,
                QTC::TC("qpdf", "QPDFTokenizer block long token");
                this->type = tt_bad;
                this->state = st_token_ready;
+                error_message =
+                    "exceeded allowable length while reading token";
            }
 	}
    }
--- a/qpdf/qpdf.testcov
+++ b/qpdf/qpdf.testcov
@ -233,7 +233,7 @@ QPDFWriter copy use_aes 1
 QPDFObjectHandle indirect without context 0
 QPDFObjectHandle trailing data in parse 0
 qpdf pages encryption password 0
-QPDF_Tokenizer EOF reading token 1
+QPDF_Tokenizer EOF reading token 0
 QPDF_Tokenizer EOF reading appendable token 0
 QPDFWriter extra header text no newline 0
 QPDFWriter extra header text add newline 0
@ -302,3 +302,4 @@ qpdf-c called qpdf_set_compress_streams 0
 qpdf-c called qpdf_set_preserve_unreferenced_objects 0
 qpdf-c called qpdf_set_newline_before_endstream 0
 QPDF_Stream TIFF predictor 0
+QPDF_Tokenizer EOF when not allowed 0
--- a/qpdf/qtest/qpdf.test
+++ b/qpdf/qtest/qpdf.test
@ -240,7 +240,7 @@ foreach my $d (@bug_tests)
 show_ntests();
 # ----------
 $td->notify("--- Miscellaneous Tests ---");
-$n_tests += 97;
+$n_tests += 99;

 $td->runtest("qpdf version",
 	     {$td->COMMAND => "qpdf --version"},
@ -263,11 +263,21 @@ $td->runtest("check pass1 file",
 	     {$td->FILE => "b.pdf"},
 	     {$td->FILE => "minimal-linearize-pass1.pdf"});

+$td->runtest("tokenizer with no ignorable",
+             {$td->COMMAND => "test_tokenizer -no-ignorable tokens.pdf"},
+             {$td->FILE => "tokens-no-ignorable.out", $td->EXIT_STATUS => 0},
+             $td->NORMALIZE_NEWLINES);
+
 $td->runtest("tokenizer",
             {$td->COMMAND => "test_tokenizer tokens.pdf"},
             {$td->FILE => "tokens.out", $td->EXIT_STATUS => 0},
             $td->NORMALIZE_NEWLINES);

+$td->runtest("tokenizer with max_len",
+             {$td->COMMAND => "test_tokenizer -maxlen 50 tokens.pdf"},
+             {$td->FILE => "tokens-maxlen.out", $td->EXIT_STATUS => 0},
+             $td->NORMALIZE_NEWLINES);
+
 foreach (my $i = 1; $i <= 3; ++$i)
 {
    $td->runtest("misc tests",
--- a/qpdf/qtest/qpdf/bad16-recover.out
+++ b/qpdf/qtest/qpdf/bad16-recover.out
@ -2,9 +2,9 @@ WARNING: bad16.pdf (trailer, file position 753): unexpected dictionary close tok
 WARNING: bad16.pdf (trailer, file position 756): unexpected dictionary close token
 WARNING: bad16.pdf (trailer, file position 759): unknown token while reading object; treating as string
 WARNING: bad16.pdf: file is damaged
-WARNING: bad16.pdf (trailer, file position 773): EOF while reading token
+WARNING: bad16.pdf (trailer, file position 779): unexpected EOF
 WARNING: bad16.pdf: Attempting to reconstruct cross-reference table
 WARNING: bad16.pdf (trailer, file position 753): unexpected dictionary close token
 WARNING: bad16.pdf (trailer, file position 756): unexpected dictionary close token
 WARNING: bad16.pdf (trailer, file position 759): unknown token while reading object; treating as string
-bad16.pdf (trailer, file position 773): EOF while reading token
+bad16.pdf (trailer, file position 779): unexpected EOF
--- a/qpdf/qtest/qpdf/bad16.out
+++ b/qpdf/qtest/qpdf/bad16.out
@ -1,4 +1,4 @@
 WARNING: bad16.pdf (trailer, file position 753): unexpected dictionary close token
 WARNING: bad16.pdf (trailer, file position 756): unexpected dictionary close token
 WARNING: bad16.pdf (trailer, file position 759): unknown token while reading object; treating as string
-bad16.pdf (trailer, file position 773): EOF while reading token
+bad16.pdf (trailer, file position 779): unexpected EOF
--- a/qpdf/qtest/qpdf/good14.out
+++ b/qpdf/qtest/qpdf/good14.out
@ -39,4 +39,12 @@ This stream does end with a newline.
 -- stream 4 --
 (ends with a name)
 /ThisMustBeLast
+-- stream 5 --
+% This stream has an inline image marker that is not terminated
+(Potato)
+
+BI
+ID
+<506f7
+461746f>
 test 3 done
--- a/qpdf/qtest/qpdf/good14.pdf
+++ b/qpdf/qtest/qpdf/good14.pdf
@ -125,8 +125,21 @@ stream
 /ThisMustBeLastendstream
 endobj

+13 0 obj
+<< /Length 103 >>
+stream
+% This stream has an inline image marker that is not terminated
+<506f7
+461746f>
+BI
+ID
+<506f7
+461746f>
+endstream
+endobj
+
 xref
-0 13
+0 14
 0000000000 65535 f 
 0000000045 00000 n 
 0000000099 00000 n 
@ -140,11 +153,12 @@ xref
 0000001283 00000 n 
 0000001374 00000 n 
 0000001430 00000 n 
+0000001515 00000 n 
 trailer <<
-  /Size 13
+  /Size 14
  /Root 1 0 R
-  /QStreams [ 7 0 R 8 0 R 10 0 R 11 0 R 12 0 R ]
+  /QStreams [ 7 0 R 8 0 R 10 0 R 11 0 R 12 0 R 13 0 R ]
 >>
 startxref
-1515
+1670
 %%EOF
--- a/qpdf/qtest/qpdf/good14.qdf
+++ b/qpdf/qtest/qpdf/good14.qdf
@ -5,7 +5,7 @@
 %% Original object ID: 1 0
 1 0 obj
 <<
-  /Pages 12 0 R
+  /Pages 14 0 R
  /Type /Catalog
 >>
 endobj
@ -110,12 +110,32 @@ endobj
 34
 endobj

-%% Original object ID: 2 0
+%% Original object ID: 13 0
 12 0 obj
+<<
+  /Length 13 0 R
+>>
+stream
+% This stream has an inline image marker that is not terminated
+<506f7
+461746f>
+BI
+ID
+<506f7
+461746f>
+endstream
+endobj
+
+13 0 obj
+103
+endobj
+
+%% Original object ID: 2 0
+14 0 obj
 <<
  /Count 1
  /Kids [
-    13 0 R
+    15 0 R
  ]
  /Type /Pages
 >>
@ -123,21 +143,21 @@ endobj

 %% Page 1
 %% Original object ID: 3 0
-13 0 obj
+15 0 obj
 <<
-  /Contents 14 0 R
+  /Contents 16 0 R
  /MediaBox [
    0
    0
    612
    792
  ]
-  /Parent 12 0 R
+  /Parent 14 0 R
  /Resources <<
    /Font <<
-      /F1 16 0 R
+      /F1 18 0 R
    >>
-    /ProcSet 17 0 R
+    /ProcSet 19 0 R
  >>
  /Type /Page
 >>
@ -145,9 +165,9 @@ endobj

 %% Contents for page 1
 %% Original object ID: 4 0
-14 0 obj
+16 0 obj
 <<
-  /Length 15 0 R
+  /Length 17 0 R
 >>
 stream
 BT
@ -158,12 +178,12 @@ ET
 endstream
 endobj

-15 0 obj
+17 0 obj
 44
 endobj

 %% Original object ID: 6 0
-16 0 obj
+18 0 obj
 <<
  /BaseFont /Helvetica
  /Encoding /WinAnsiEncoding
@ -174,7 +194,7 @@ endobj
 endobj

 %% Original object ID: 5 0
-17 0 obj
+19 0 obj
 [
  /PDF
  /Text
@ -182,7 +202,7 @@ endobj
 endobj

 xref
-0 18
+0 20
 0000000000 65535 f 
 0000000052 00000 n 
 0000000134 00000 n 
@ -195,12 +215,14 @@ xref
 0000001151 00000 n 
 0000001197 00000 n 
 0000001310 00000 n 
-0000001357 00000 n 
-0000001468 00000 n 
-0000001715 00000 n 
-0000001816 00000 n 
-0000001863 00000 n 
-0000002009 00000 n 
+0000001358 00000 n 
+0000001518 00000 n 
+0000001566 00000 n 
+0000001677 00000 n 
+0000001924 00000 n 
+0000002025 00000 n 
+0000002072 00000 n 
+0000002218 00000 n 
 trailer <<
  /QStreams [
    2 0 R
@ -208,11 +230,12 @@ trailer <<
    6 0 R
    8 0 R
    10 0 R
+    12 0 R
  ]
  /Root 1 0 R
-  /Size 18
+  /Size 20
  /ID [<31415926535897932384626433832795><31415926535897932384626433832795>]
 >>
 startxref
-2045
+2254
 %%EOF
--- a/qpdf/qtest/qpdf/issue-100.out
+++ b/qpdf/qtest/qpdf/issue-100.out
@ -13,4 +13,4 @@ WARNING: issue-100.pdf (object 5 0, file position 489): attempting to recover st
 WARNING: issue-100.pdf (object 5 0, file position 489): recovered stream length: 12
 WARNING: issue-100.pdf (trailer, file position 953): expected dictionary key but found non-name object; inserting key /QPDFFake1
 WARNING: issue-100.pdf (trailer, file position 953): dictionary ended prematurely; using null as value for last key
-issue-100.pdf (file position 1138): unable to find /Root dictionary
+issue-100.pdf (file position 1144): unable to find /Root dictionary
--- a/qpdf/qtest/qpdf/issue-101.out
+++ b/qpdf/qtest/qpdf/issue-101.out
@ -16,7 +16,7 @@ WARNING: issue-101.pdf (trailer, file position 2613): /Length key in stream dict
 WARNING: issue-101.pdf (trailer, file position 2684): attempting to recover stream length
 WARNING: issue-101.pdf (trailer, file position 2684): recovered stream length: 74
 WARNING: issue-101.pdf (trailer, file position 2928): unknown token while reading object; treating as string
-WARNING: issue-101.pdf (trailer, file position 2929): unknown token while reading object; treating as string
+WARNING: issue-101.pdf (trailer, file position 2930): unknown token while reading object; treating as string
 WARNING: issue-101.pdf (trailer, file position 2928): expected dictionary key but found non-name object; inserting key /QPDFFake1
 WARNING: issue-101.pdf (trailer, file position 2928): expected dictionary key but found non-name object; inserting key /QPDFFake2
 WARNING: issue-101.pdf (trailer, file position 2928): expected dictionary key but found non-name object; inserting key /QPDFFake3
@ -45,7 +45,7 @@ WARNING: issue-101.pdf (file position 696): unknown token while reading object;
 WARNING: issue-101.pdf (file position 698): unknown token while reading object; treating as string
 WARNING: issue-101.pdf (file position 701): unknown token while reading object; treating as string
 WARNING: issue-101.pdf (file position 711): unknown token while reading object; treating as string
-WARNING: issue-101.pdf (file position 742): unknown token while reading object; treating as string
+WARNING: issue-101.pdf (file position 743): unknown token while reading object; treating as string
 WARNING: issue-101.pdf (file position 745): unknown token while reading object; treating as string
 WARNING: issue-101.pdf (file position 747): unknown token while reading object; treating as string
 WARNING: issue-101.pdf (file position 777): unknown token while reading object; treating as string
--- a/qpdf/qtest/qpdf/issue-146.out
+++ b/qpdf/qtest/qpdf/issue-146.out
@ -2,4 +2,4 @@ WARNING: issue-146.pdf: file is damaged
 WARNING: issue-146.pdf: can't find startxref
 WARNING: issue-146.pdf: Attempting to reconstruct cross-reference table
 WARNING: issue-146.pdf (trailer, file position 20728): unknown token while reading object; treating as string
-issue-146.pdf (trailer, file position 20732): EOF while reading token
+issue-146.pdf (trailer, file position 20732): unexpected EOF
--- a/qpdf/qtest/qpdf/issue-51.out
+++ b/qpdf/qtest/qpdf/issue-51.out
@ -6,5 +6,5 @@ WARNING: issue-51.pdf (file position 70): loop detected resolving object 2 0
 WARNING: issue-51.pdf (object 2 0, file position 26): /Length key in stream dictionary is not an integer
 WARNING: issue-51.pdf (object 2 0, file position 71): attempting to recover stream length
 WARNING: issue-51.pdf (object 2 0, file position 71): unable to recover stream data; treating stream as empty
-WARNING: issue-51.pdf (object 2 0, file position 977): EOF while reading token
+WARNING: issue-51.pdf (object 2 0, file position 977): unexpected EOF
 qpdf: operation succeeded with warnings; resulting file may have some problems
--- a/qpdf/qtest/qpdf/issue-99.out
+++ b/qpdf/qtest/qpdf/issue-99.out
@ -1,4 +1,4 @@
 WARNING: issue-99.pdf: file is damaged
 WARNING: issue-99.pdf (file position 3526): xref not found
 WARNING: issue-99.pdf: Attempting to reconstruct cross-reference table
-issue-99.pdf (file position 4793): unable to find /Root dictionary
+issue-99.pdf (file position 4798): unable to find /Root dictionary
--- a/qpdf/qtest/qpdf/issue-99b.out
+++ b/qpdf/qtest/qpdf/issue-99b.out
@ -2,4 +2,4 @@ WARNING: issue-99b.pdf: file is damaged
 WARNING: issue-99b.pdf (object 1 0, file position 9): object with ID 0
 WARNING: issue-99b.pdf: Attempting to reconstruct cross-reference table
 WARNING: issue-99b.pdf: object 1 0 not found in file after regenerating cross reference table
-issue-99b.pdf (file position 757): unable to find /Root dictionary
+issue-99b.pdf (file position 763): unable to find /Root dictionary
--- a/qpdf/qtest/qpdf/linearization-bounds-1.out
+++ b/qpdf/qtest/qpdf/linearization-bounds-1.out
@ -2,7 +2,7 @@ checking linearization-bounds-1.pdf
 PDF Version: 1.3
 File is not encrypted
 File is linearized
-WARNING: linearization-bounds-1.pdf (linearization hint stream: object 62 0, file position 1001182): EOF while reading token
+WARNING: linearization-bounds-1.pdf (linearization hint stream: object 62 0, file position 12302): unexpected EOF
 WARNING: linearization-bounds-1.pdf (linearization hint stream: object 62 0, file position 1183): attempting to recover stream length
 WARNING: linearization-bounds-1.pdf (linearization hint stream: object 62 0, file position 1183): recovered stream length: 106
 linearization-bounds-1.pdf (linearization hint table, file position 1183): /S (shared object) offset is out of bounds
--- a/qpdf/qtest/qpdf/tokens-maxlen.out
+++ b/qpdf/qtest/qpdf/tokens-maxlen.out
--- a/qpdf/qtest/qpdf/tokens-no-ignorable.out
+++ b/qpdf/qtest/qpdf/tokens-no-ignorable.out
--- a/qpdf/qtest/qpdf/tokens.out
+++ b/qpdf/qtest/qpdf/tokens.out
--- a/qpdf/test_tokenizer.cc
+++ b/qpdf/test_tokenizer.cc
@ -13,7 +13,8 @@ static char const* whoami = 0;

 void usage()
 {
-    std::cerr << "Usage: " << whoami << " filename"
+    std::cerr << "Usage: " << whoami
+              << " [-maxlen len | -no-ignorable] filename"
              << std::endl;
    exit(2);
 }
@ -83,6 +84,10 @@ static char const* tokenTypeName(QPDFTokenizer::token_type_e ttype)
        return "word";
      case QPDFTokenizer::tt_eof:
        return "eof";
+      case QPDFTokenizer::tt_space:
+        return "space";
+      case QPDFTokenizer::tt_comment:
+        return "comment";
    }
    return 0;
 }
@ -108,7 +113,8 @@ sanitize(std::string const& value)
 }

 static void
-try_skipping(PointerHolder<InputSource> is, char const* what, Finder& f)
+try_skipping(QPDFTokenizer& tokenizer, PointerHolder<InputSource> is,
+             size_t max_len, char const* what, Finder& f)
 {
    std::cout << "skipping to " << what << std::endl;
    qpdf_offset_t offset = is->tell();
@ -121,6 +127,7 @@ try_skipping(PointerHolder<InputSource> is, char const* what, Finder& f)

 static void
 dump_tokens(PointerHolder<InputSource> is, std::string const& label,
+            size_t max_len, bool include_ignorable,
            bool skip_streams, bool skip_inline_images)
 {
    Finder f1(is, "endstream");
@ -129,11 +136,16 @@ dump_tokens(PointerHolder<InputSource> is, std::string const& label,
    bool done = false;
    QPDFTokenizer tokenizer;
    tokenizer.allowEOF();
+    if (include_ignorable)
+    {
+        tokenizer.includeIgnorable();
+    }
    while (! done)
    {
-        QPDFTokenizer::Token token = tokenizer.readToken(is, "test", true);
+        QPDFTokenizer::Token token =
+            tokenizer.readToken(is, "test", true, max_len);

-        qpdf_offset_t offset = is->tell() - token.getRawValue().length();
+        qpdf_offset_t offset = is->getLastOffset();
        std::cout << offset << ": "
                  << tokenTypeName(token.getType());
        if (token.getType() != QPDFTokenizer::tt_eof)
@ -153,12 +165,12 @@ dump_tokens(PointerHolder<InputSource> is, std::string const& label,
        if (skip_streams &&
            (token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "stream")))
        {
-            try_skipping(is, "endstream", f1);
+            try_skipping(tokenizer, is, max_len, "endstream", f1);
        }
        else if (skip_inline_images &&
                 (token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "ID")))
        {
-            try_skipping(is, "EI", f2);
+            try_skipping(tokenizer, is, max_len, "EI", f2);
        }
        else if (token.getType() == QPDFTokenizer::tt_eof)
        {
@ -168,17 +180,16 @@ dump_tokens(PointerHolder<InputSource> is, std::string const& label,
    std::cout << "--- END " << label << " ---" << std::endl;
 }

-static void process(char const* filename)
+static void process(char const* filename, bool include_ignorable,
+                    size_t max_len)
 {
    PointerHolder<InputSource> is;
-    QPDFTokenizer tokenizer;
-    tokenizer.allowEOF();

    // Tokenize file, skipping streams
    FileInputSource* fis = new FileInputSource();
    fis->setFilename(filename);
    is = fis;
-    dump_tokens(is, "FILE", true, false);
+    dump_tokens(is, "FILE", max_len, include_ignorable, true, false);

    // Tokenize content streams, skipping inline images
    QPDF qpdf;
@ -201,7 +212,8 @@ static void process(char const* filename)
        BufferInputSource* bis = new BufferInputSource(
            "content data", content_data.getPointer());
        is = bis;
-        dump_tokens(is, "PAGE " + QUtil::int_to_string(pageno), false, true);
+        dump_tokens(is, "PAGE " + QUtil::int_to_string(pageno),
+                    max_len, include_ignorable, false, true);
    }

    // Tokenize object streams
@ -220,7 +232,7 @@ static void process(char const* filename)
            is = bis;
            dump_tokens(is, "OBJECT STREAM " +
                        QUtil::int_to_string((*iter).getObjectID()),
-                        false, false);
+                        max_len, include_ignorable, false, false);
        }
    }
 }
@ -242,15 +254,47 @@ int main(int argc, char* argv[])
 	whoami += 3;
    }

-    if (argc != 2)
+    char const* filename = 0;
+    size_t max_len = 0;
+    bool include_ignorable = true;
+    for (int i = 1; i < argc; ++i)
+    {
+        if (argv[i][0] == '-')
+        {
+            if (strcmp(argv[i], "-maxlen") == 0)
+            {
+                if (++i >= argc)
+                {
+                    usage();
+                }
+                max_len = QUtil::string_to_int(argv[i]);
+            }
+            else if (strcmp(argv[i], "-no-ignorable") == 0)
+            {
+                include_ignorable = false;
+            }
+            else
+            {
+                usage();
+            }
+        }
+        else if (filename)
+        {
+            usage();
+        }
+        else
+        {
+            filename = argv[i];
+        }
+    }
+    if (filename == 0)
    {
        usage();
    }

-    char const* filename = argv[1];
    try
    {
-        process(filename);
+        process(filename, include_ignorable, max_len);
    }
    catch (std::exception& e)
    {