Avoid merging adjacent tokens when concatenating contents (fixes #444)

2024-12-31 05:51:51 +00:00 · 2020-10-23 06:40:27 -04:00 · 2020-10-23 06:40:27 -04:00 · b30deaeeab
commit b30deaeeab
parent 0dea276997
16 changed files with 541 additions and 43 deletions
--- a/6
+++ b/6
@ -1,5 +1,11 @@
 2020-10-23  Jay Berkenbilt  <ejb@ql.org>
 	* Bug fix: when concatenating content streams, insert a newline if
 	needed to prevent the last token from the old stream from being
 	merged with the first token of the new stream. Qpdf was mistakenly
 	concatenating the streams without regard to the specification that
 	content streams are to be broken on token boundaries. Fixes #444.
 	* Bug fix: fix-qdf: properly handle empty streams with ignore
 	newline.
--- a/1
+++ b/1
@ -4,7 +4,6 @@ Candidates for upcoming release
 * Open "next" issues
  * bugs
    * #473: zsh completion with directories
    * #444: concatenated stream/whitespace bug
  * Non-bugs
    * #446: recognize edited QDF files
    * #436: parsing of document with form xobject
--- a/libqpdf/QPDFObjectHandle.cc
+++ b/libqpdf/QPDFObjectHandle.cc
@ -165,6 +165,47 @@ QPDFObjectHandle::ParserCallbacks::terminateParsing()
    throw TerminateParsing();
 }
 class LastChar: public Pipeline
 {
  public:
    LastChar(Pipeline* next);
    virtual ~LastChar() = default;
    virtual void write(unsigned char* data, size_t len);
    virtual void finish();
    unsigned char getLastChar();
  private:
    unsigned char last_char;
 };
 // Construct a pipeline identified as "lastchar" that forwards to next.
 // The remembered byte starts out as 0, i.e. "nothing written yet".
 LastChar::LastChar(Pipeline* next) :
    Pipeline("lastchar", next), last_char(0)
 {
 }
 // Record the final byte of this chunk, then hand the chunk on untouched.
 // An empty chunk leaves the previously remembered byte as it was.
 void
 LastChar::write(unsigned char* data, size_t len)
 {
    if (len != 0)
    {
        this->last_char = *(data + (len - 1));
    }
    this->getNext()->write(data, len);
 }
 void
 LastChar::finish()
 {
    getNext()->finish();
 }
 unsigned char
 LastChar::getLastChar()
 {
    return this->last_char;
 }
 QPDFObjectHandle::QPDFObjectHandle() :
    initialized(false),
    qpdf(0),
@ -1600,21 +1641,31 @@ QPDFObjectHandle::pipeContentStreams(
    std::vector<QPDFObjectHandle> streams =
        arrayOrStreamToStreamArray(
            description, all_description);
    bool need_newline = false;
    for (std::vector<QPDFObjectHandle>::iterator iter = streams.begin();
         iter != streams.end(); ++iter)
    {
        if (need_newline)
        {
            p->write(QUtil::unsigned_char_pointer("\n"), 1);
        }
        LastChar lc(p);
        QPDFObjectHandle stream = *iter;
        std::string og =
            QUtil::int_to_string(stream.getObjectID()) + " " +
            QUtil::int_to_string(stream.getGeneration());
        std::string w_description = "content stream object " + og;
-        if (! stream.pipeStreamData(p, 0, qpdf_dl_specialized))
+        if (! stream.pipeStreamData(&lc, 0, qpdf_dl_specialized))
        {
            QTC::TC("qpdf", "QPDFObjectHandle errors in parsecontent");
            throw QPDFExc(qpdf_e_damaged_pdf, "content stream",
                          w_description, 0,
                          "errors while decoding content stream");
        }
        lc.finish();
        need_newline = (lc.getLastChar() != static_cast<unsigned char>('\n'));
        QTC::TC("qpdf", "QPDFObjectHandle need_newline",
                need_newline ? 0 : 1);
    }
 }
--- a/manual/qpdf-manual.xml
+++ b/manual/qpdf-manual.xml
@ -2090,14 +2090,9 @@ outfile.pdf</option>
        option causes qpdf to combine them into a single stream. Use
        of this option is never necessary for ordinary usage, but it
        can help when working with some files in some cases. For
-        example, some PDF writers split page contents into small
+        example, this can also be combined with QDF mode or content
-        streams at arbitrary points that may fall in the middle of
+        normalization to make it easier to look at all of a page's
-        lexical tokens within the content, and some PDF readers may
+        contents at once.
        get confused on such files. If you use qpdf to coalesce the
        content streams, such readers may be able to work with the
        file more easily. This can also be combined with QDF mode or
        content normalization to make it easier to look at all of a
        page's contents at once.
       </para>
      </listitem>
     </varlistentry>
@ -2398,25 +2393,15 @@ outfile.pdf</option>
    You should not use this for &ldquo;production&rdquo; PDF files.
   </para>
   <para>
-    This paragraph discusses edge cases of content normalization that
+    When normalizing content, if qpdf runs into any lexical errors, it
-    are not of concern to most users and are not relevant when content
+    will print a warning indicating that content may be damaged. The
-    normalization is not enabled. When normalizing content, if qpdf
+    only situation in which qpdf is known to cause damage during
-    runs into any lexical errors, it will print a warning indicating
+    content normalization is when a page's contents are split across
-    that content may be damaged. The only situation in which qpdf is
+    multiple streams and streams are split in the middle of a lexical
-    known to cause damage during content normalization is when a
+    token such as a string, name, or inline image. Note that files
-    page's contents are split across multiple streams and streams are
+    that do this are invalid since the PDF specification states that
-    split in the middle of a lexical token such as a string, name, or
+    content streams are not to be split in the middle of a token. If
-    inline image. There may be some pathological cases in which qpdf
+    you want to inspect the original content streams in an
    could damage content without noticing this, such as if the partial
    tokens at the end of one stream and the beginning of the next
    stream are both valid, but usually qpdf will be able to detect
    this case. For slightly increased safety, you can specify
    <option>--coalesce-contents</option> in addition to
    <option>--normalize-content</option> or <option>--qdf</option>.
    This will cause qpdf to combine all the content streams into one,
    thus recombining any split tokens. However doing this will prevent
    you from being able to see the original layout of the content
    streams. If you must inspect the original content streams in an
    uncompressed format, you can always run with <option>--qdf
    --normalize-content=n</option> for a QDF file without content
    normalization, or alternatively
--- a/qpdf/qpdf.testcov
+++ b/qpdf/qpdf.testcov
@ -455,3 +455,4 @@ qpdf found shared resources in leaf 0
 qpdf found shared xobject in leaf 0
 QPDF copy foreign with data 1
 QPDF copy foreign with foreign_stream 1
 QPDFObjectHandle need_newline 1
--- a/qpdf/qtest/qpdf.test
+++ b/qpdf/qtest/qpdf.test
@ -1591,13 +1591,21 @@ $td->runtest("type checks with object streams",
 # ----------
 $td->notify("--- Coalesce contents ---");
-$n_tests += 6;
+$n_tests += 8;
 $td->runtest("qdf with normalize warnings",
             {$td->COMMAND =>
-                  "qpdf --qdf --static-id coalesce.pdf a.pdf"},
+                  "qpdf --qdf --static-id split-tokens.pdf a.pdf"},
             {$td->FILE => "normalize-warnings.out", $td->EXIT_STATUS => 3},
             $td->NORMALIZE_NEWLINES);
 $td->runtest("check output",
             {$td->FILE => "a.pdf"},
             {$td->FILE => "split-tokens.qdf"});
 $td->runtest("coalesce to qdf",
             {$td->COMMAND =>
                  "qpdf --qdf --static-id coalesce.pdf a.pdf"},
             {$td->STRING => "", $td->EXIT_STATUS => 0},
             $td->NORMALIZE_NEWLINES);
 $td->runtest("check output",
             {$td->FILE => "a.pdf"},
             {$td->FILE => "coalesce.qdf"});
@ -1831,12 +1839,12 @@ $td->runtest("unreferenced resources with bad token",
             {$td->COMMAND =>
                  "qpdf --qdf --static-id --split-pages=2" .
                  " --remove-unreferenced-resources=yes" .
-                  " coalesce.pdf split-out-bad-token.pdf"},
+                  " split-tokens.pdf split-out-bad-token.pdf"},
-             {$td->FILE => "coalesce-split.out", $td->EXIT_STATUS => 3},
+             {$td->FILE => "split-tokens-split.out", $td->EXIT_STATUS => 3},
             $td->NORMALIZE_NEWLINES);
 $td->runtest("check output",
             {$td->FILE => "split-out-bad-token-1-2.pdf"},
-             {$td->FILE => "coalesce-split-1-2.pdf"});
+             {$td->FILE => "split-tokens-split-1-2.pdf"});
 $td->runtest("shared images in form xobject",
             {$td->COMMAND => "qpdf --qdf --static-id --split-pages".
--- a/qpdf/qtest/qpdf/coalesce-out.pdf
+++ b/qpdf/qtest/qpdf/coalesce-out.pdf
--- a/qpdf/qtest/qpdf/coalesce-out.qdf
+++ b/qpdf/qtest/qpdf/coalesce-out.qdf
--- a/qpdf/qtest/qpdf/coalesce.pdf
+++ b/qpdf/qtest/qpdf/coalesce.pdf
--- a/qpdf/qtest/qpdf/coalesce.qdf
+++ b/qpdf/qtest/qpdf/coalesce.qdf
--- a/qpdf/qtest/qpdf/normalize-warnings.out
+++ b/qpdf/qtest/qpdf/normalize-warnings.out
@ -1,9 +1,9 @@
-WARNING: coalesce.pdf (offset 671): content normalization encountered bad tokens
+WARNING: split-tokens.pdf (offset 671): content normalization encountered bad tokens
-WARNING: coalesce.pdf (offset 671): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
+WARNING: split-tokens.pdf (offset 671): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
-WARNING: coalesce.pdf (offset 671): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+WARNING: split-tokens.pdf (offset 671): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
-WARNING: coalesce.pdf (offset 823): content normalization encountered bad tokens
+WARNING: split-tokens.pdf (offset 823): content normalization encountered bad tokens
-WARNING: coalesce.pdf (offset 823): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+WARNING: split-tokens.pdf (offset 823): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
-WARNING: coalesce.pdf (offset 962): content normalization encountered bad tokens
+WARNING: split-tokens.pdf (offset 962): content normalization encountered bad tokens
-WARNING: coalesce.pdf (offset 962): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
+WARNING: split-tokens.pdf (offset 962): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
-WARNING: coalesce.pdf (offset 962): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+WARNING: split-tokens.pdf (offset 962): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
 qpdf: operation succeeded with warnings; resulting file may have some problems
--- a/qpdf/qtest/qpdf/split-tokens-split-1-2.pdf
+++ b/qpdf/qtest/qpdf/split-tokens-split-1-2.pdf
--- a/qpdf/qtest/qpdf/split-tokens-split.out
+++ b/qpdf/qtest/qpdf/split-tokens-split.out
@ -1,4 +1,4 @@
-WARNING: coalesce.pdf, object 3 0 at offset 181: Bad token found while scanning content stream; not attempting to remove unreferenced objects from this page
+WARNING: split-tokens.pdf, object 3 0 at offset 181: Bad token found while scanning content stream; not attempting to remove unreferenced objects from this page
 WARNING: empty PDF: content normalization encountered bad tokens
 WARNING: empty PDF: normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
 WARNING: empty PDF: Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
--- a/qpdf/qtest/qpdf/split-tokens.pdf
+++ b/qpdf/qtest/qpdf/split-tokens.pdf
@ -0,0 +1,217 @@
 %PDF-1.3
 %¿÷¢þ
 %QDF-1.0
 1 0 obj
 <<
  /Pages 2 0 R
  /Type /Catalog
 >>
 endobj
 2 0 obj
 <<
  /Count 2
  /Kids [
    3 0 R
    4 0 R
  ]
  /Type /Pages
 >>
 endobj
 %% Page 1
 3 0 obj
 <<
  /Contents [
    5 0 R
    7 0 R
    9 0 R
    11 0 R
  ]
  /MediaBox [
    0
    0
    612
    792
  ]
  /Parent 2 0 R
  /Resources <<
    /Font <<
      /F1 13 0 R
    >>
    /ProcSet 14 0 R
  >>
  /Type /Page
 >>
 endobj
 %% Page 2
 4 0 obj
 <<
  /Contents 15 0 R
  /MediaBox [
    0
    0
    612
    792
  ]
  /Parent 2 0 R
  /Resources <<
    /Font <<
      /F1 17 0 R
    >>
    /ProcSet 18 0 R
  >>
  /Type /Page
 >>
 endobj
 %% Contents for page 1
 5 0 obj
 <<
  /Length 6 0 R
 >>
 stream
 BT
  /F1 24 Tf
  72 720 Td
  (Pot
 endstream
 endobj
 %QDF: ignore_newline
 6 0 obj
 33
 endobj
 %% Contents for page 1
 7 0 obj
 <<
  /Length 8 0 R
 >>
 stream
 ato) Tj
 ET [ /array
 endstream
 endobj
 %QDF: ignore_newline
 8 0 obj
 19
 endobj
 %% Contents for page 1
 9 0 obj
 <<
  /Length 10 0 R
 >>
 stream
 /split ] BI
 /CS /G/W 66/H 47/BPC 8/F/Fl/DP<</Predictor 15/Columns 66>>
 ID xœÅÖIÃ P|ÿC;UÈ`ÀÓ7‘Z©¦Ä˜Úæ<C39A>}Dðï_´øÉW©„œÄ-”ˆ>ÿ‡À<E280A1>>”^&®¡uâ]€"!‡•–*¬&<26>E|Sy® ðd-€<<3C>B0Bú@Nê+<hlèKÐî/56L ‰<C2A0>ã £–¹¦>0>Y<>ù!cì\YØ%Yð¥Ö8?& Öëˆ}j’ûè;«<>3<EFBFBD>ÂÖlpÛsHöûtú
 endstream
 endobj
 %QDF: ignore_newline
 10 0 obj
 253
 endobj
 %% Contents for page 1
 11 0 obj
 <<
  /Length 12 0 R
 >>
 stream
 QØTt*hÌUúãwÍÕÐ%¨)p–³"•DiRj¹–DYNUÓÙAv’Fà&
<0A>ÍÔu#c•ÆW	ôß‰W“O
 EI
 endstream
 endobj
 %QDF: ignore_newline
 12 0 obj
 66
 endobj
 13 0 obj
 <<
  /BaseFont /Helvetica
  /Encoding /WinAnsiEncoding
  /Name /F1
  /Subtype /Type1
  /Type /Font
 >>
 endobj
 14 0 obj
 [
  /PDF
  /Text
 ]
 endobj
 %% Contents for page 2
 15 0 obj
 <<
  /Length 16 0 R
 >>
 stream
 BT
  /F1 24 Tf
  72 720 Td
  (Potato) Tj
 ET
 endstream
 endobj
 16 0 obj
 44
 endobj
 17 0 obj
 <<
  /BaseFont /Helvetica
  /Encoding /WinAnsiEncoding
  /Name /F1
  /Subtype /Type1
  /Type /Font
 >>
 endobj
 18 0 obj
 [
  /PDF
  /Text
 ]
 endobj
 xref
 0 19
 0000000000 65535 f 
 0000000025 00000 n 
 0000000079 00000 n 
 0000000171 00000 n 
 0000000416 00000 n 
 0000000634 00000 n 
 0000000744 00000 n 
 0000000786 00000 n 
 0000000882 00000 n 
 0000000924 00000 n 
 0000001255 00000 n 
 0000001299 00000 n 
 0000001444 00000 n 
 0000001464 00000 n 
 0000001583 00000 n 
 0000001642 00000 n 
 0000001743 00000 n 
 0000001763 00000 n 
 0000001882 00000 n 
 trailer <<
  /Root 1 0 R
  /Size 19
  /ID [<fa46a90bcf56476b9904a2e7adb75024><6af379f20e8dcd4e724869daec3ba023>]
 >>
 startxref
 1918
 %%EOF
--- a/qpdf/qtest/qpdf/split-tokens.qdf
+++ b/qpdf/qtest/qpdf/split-tokens.qdf
@ -0,0 +1,231 @@
 %PDF-1.3
 %¿÷¢þ
 %QDF-1.0
 %% Original object ID: 1 0
 1 0 obj
 <<
  /Pages 2 0 R
  /Type /Catalog
 >>
 endobj
 %% Original object ID: 2 0
 2 0 obj
 <<
  /Count 2
  /Kids [
    3 0 R
    4 0 R
  ]
  /Type /Pages
 >>
 endobj
 %% Page 1
 %% Original object ID: 3 0
 3 0 obj
 <<
  /Contents [
    5 0 R
    7 0 R
    9 0 R
    11 0 R
  ]
  /MediaBox [
    0
    0
    612
    792
  ]
  /Parent 2 0 R
  /Resources <<
    /Font <<
      /F1 13 0 R
    >>
    /ProcSet 14 0 R
  >>
  /Type /Page
 >>
 endobj
 %% Page 2
 %% Original object ID: 4 0
 4 0 obj
 <<
  /Contents 15 0 R
  /MediaBox [
    0
    0
    612
    792
  ]
  /Parent 2 0 R
  /Resources <<
    /Font <<
      /F1 17 0 R
    >>
    /ProcSet 18 0 R
  >>
  /Type /Page
 >>
 endobj
 %% Contents for page 1
 %% Original object ID: 5 0
 5 0 obj
 <<
  /Length 6 0 R
 >>
 stream
 BT
  /F1 24 Tf
  72 720 Td
  (Pot
 endstream
 endobj
 %QDF: ignore_newline
 6 0 obj
 33
 endobj
 %% Contents for page 1
 %% Original object ID: 7 0
 7 0 obj
 <<
  /Length 8 0 R
 >>
 stream
 ato) Tj
 ET [ /array
 endstream
 endobj
 %QDF: ignore_newline
 8 0 obj
 19
 endobj
 %% Contents for page 1
 %% Original object ID: 9 0
 9 0 obj
 <<
  /Length 10 0 R
 >>
 stream
 /split ] BI
 /CS /G/W 66/H 47/BPC 8/F/Fl/DP<</Predictor 15/Columns 66>>
 ID xœÅÖIÃ P|ÿC;UÈ`ÀÓ7‘Z©¦Ä˜Úæ<C39A>}Dðï_´øÉW©„œÄ-”ˆ>ÿ‡À<E280A1>>”^&®¡uâ]€"!‡•–*¬&<26>E|Sy® ðd-€<<3C>B0Bú@Nê+<hlèKÐî/56L ‰<C2A0>ã £–¹¦>0>Y<>ù!cì\YØ%Yð¥Ö8?& Öëˆ}j’ûè;«<>3<EFBFBD>ÂÖlpÛsHöûtú
 endstream
 endobj
 %QDF: ignore_newline
 10 0 obj
 253
 endobj
 %% Contents for page 1
 %% Original object ID: 11 0
 11 0 obj
 <<
  /Length 12 0 R
 >>
 stream
 QØTt*hÌUúãwÍÕÐ%¨)p–³"•DiRj¹–DYNUÓÙAv’Fà&
 <EFBFBD>ÍÔu#c•ÆW	ôß‰W“O
 EI
 endstream
 endobj
 %QDF: ignore_newline
 12 0 obj
 65
 endobj
 %% Original object ID: 13 0
 13 0 obj
 <<
  /BaseFont /Helvetica
  /Encoding /WinAnsiEncoding
  /Name /F1
  /Subtype /Type1
  /Type /Font
 >>
 endobj
 %% Original object ID: 14 0
 14 0 obj
 [
  /PDF
  /Text
 ]
 endobj
 %% Contents for page 2
 %% Original object ID: 15 0
 15 0 obj
 <<
  /Length 16 0 R
 >>
 stream
 BT
  /F1 24 Tf
  72 720 Td
  (Potato) Tj
 ET
 endstream
 endobj
 16 0 obj
 44
 endobj
 %% Original object ID: 17 0
 17 0 obj
 <<
  /BaseFont /Helvetica
  /Encoding /WinAnsiEncoding
  /Name /F1
  /Subtype /Type1
  /Type /Font
 >>
 endobj
 %% Original object ID: 18 0
 18 0 obj
 [
  /PDF
  /Text
 ]
 endobj
 xref
 0 19
 0000000000 65535 f 
 0000000052 00000 n 
 0000000133 00000 n 
 0000000252 00000 n 
 0000000524 00000 n 
 0000000769 00000 n 
 0000000879 00000 n 
 0000000948 00000 n 
 0000001044 00000 n 
 0000001113 00000 n 
 0000001444 00000 n 
 0000001516 00000 n 
 0000001660 00000 n 
 0000001708 00000 n 
 0000001855 00000 n 
 0000001942 00000 n 
 0000002043 00000 n 
 0000002091 00000 n 
 0000002238 00000 n 
 trailer <<
  /Root 1 0 R
  /Size 19
  /ID [<fa46a90bcf56476b9904a2e7adb75024><31415926535897932384626433832795>]
 >>
 startxref
 2274
 %%EOF
--- a/qpdf/qtest/qpdf/token-filters-out.pdf
+++ b/qpdf/qtest/qpdf/token-filters-out.pdf