Avoid merging adjacent tokens when concatenating contents (fixes #444)

2024-12-22 10:58:58 +00:00 · 2020-10-23 06:40:27 -04:00 · 2020-10-23 06:40:27 -04:00 · b30deaeeab
commit b30deaeeab
parent 0dea276997
16 changed files with 541 additions and 43 deletions
--- a/6
+++ b/6
@ -1,5 +1,11 @@
 2020-10-23  Jay Berkenbilt  <ejb@ql.org>

+	* Bug fix: when concatenating content streams, insert a newline if
+	needed to prevent the last token from the old stream from being
+	merged with the first token of the new stream. Qpdf was mistakenly
+	concatenating the streams without regard to the specification that
+	content streams are to be broken on token boundaries. Fixes #444.
+
 	* Bug fix: fix-qdf: properly handle empty streams with ignore
 	newline.

--- a/1
+++ b/1
@ -4,7 +4,6 @@ Candidates for upcoming release
 * Open "next" issues
  * bugs
    * #473: zsh completion with directories
-    * #444: concatenated stream/whitespace bug
  * Non-bugs
    * #446: recognize edited QDF files
    * #436: parsing of document with form xobject
--- a/libqpdf/QPDFObjectHandle.cc
+++ b/libqpdf/QPDFObjectHandle.cc
@ -165,6 +165,47 @@ QPDFObjectHandle::ParserCallbacks::terminateParsing()
    throw TerminateParsing();
 }

+class LastChar: public Pipeline
+{
+  public:
+    LastChar(Pipeline* next);
+    virtual ~LastChar() = default;
+    virtual void write(unsigned char* data, size_t len);
+    virtual void finish();
+    unsigned char getLastChar();
+
+  private:
+    unsigned char last_char;
+};
+
+LastChar::LastChar(Pipeline* next) :
+    Pipeline("lastchar", next),
+    last_char(0)
+{
+}
+
+void
+LastChar::write(unsigned char* data, size_t len)
+{
+    if (len > 0)
+    {
+        this->last_char = data[len - 1];
+    }
+    getNext()->write(data, len);
+}
+
+void
+LastChar::finish()
+{
+    getNext()->finish();
+}
+
+unsigned char
+LastChar::getLastChar()
+{
+    return this->last_char;
+}
+
 QPDFObjectHandle::QPDFObjectHandle() :
    initialized(false),
    qpdf(0),
@ -1600,21 +1641,31 @@ QPDFObjectHandle::pipeContentStreams(
    std::vector<QPDFObjectHandle> streams =
        arrayOrStreamToStreamArray(
            description, all_description);
+    bool need_newline = false;
    for (std::vector<QPDFObjectHandle>::iterator iter = streams.begin();
         iter != streams.end(); ++iter)
    {
+        if (need_newline)
+        {
+            p->write(QUtil::unsigned_char_pointer("\n"), 1);
+        }
+        LastChar lc(p);
        QPDFObjectHandle stream = *iter;
        std::string og =
            QUtil::int_to_string(stream.getObjectID()) + " " +
            QUtil::int_to_string(stream.getGeneration());
        std::string w_description = "content stream object " + og;
-        if (! stream.pipeStreamData(p, 0, qpdf_dl_specialized))
+        if (! stream.pipeStreamData(&lc, 0, qpdf_dl_specialized))
        {
            QTC::TC("qpdf", "QPDFObjectHandle errors in parsecontent");
            throw QPDFExc(qpdf_e_damaged_pdf, "content stream",
                          w_description, 0,
                          "errors while decoding content stream");
        }
+        lc.finish();
+        need_newline = (lc.getLastChar() != static_cast<unsigned char>('\n'));
+        QTC::TC("qpdf", "QPDFObjectHandle need_newline",
+                need_newline ? 0 : 1);
    }
 }

--- a/manual/qpdf-manual.xml
+++ b/manual/qpdf-manual.xml
@ -2090,14 +2090,9 @@ outfile.pdf</option>
        option causes qpdf to combine them into a single stream. Use
        of this option is never necessary for ordinary usage, but it
        can help when working with some files in some cases. For
-        example, some PDF writers split page contents into small
-        streams at arbitrary points that may fall in the middle of
-        lexical tokens within the content, and some PDF readers may
-        get confused on such files. If you use qpdf to coalesce the
-        content streams, such readers may be able to work with the
-        file more easily. This can also be combined with QDF mode or
-        content normalization to make it easier to look at all of a
-        page's contents at once.
+        example, this can be combined with QDF mode or content
+        normalization to make it easier to look at all of a page's
+        contents at once.
       </para>
      </listitem>
     </varlistentry>
@ -2398,25 +2393,15 @@ outfile.pdf</option>
    You should not use this for &ldquo;production&rdquo; PDF files.
   </para>
   <para>
-    This paragraph discusses edge cases of content normalization that
-    are not of concern to most users and are not relevant when content
-    normalization is not enabled. When normalizing content, if qpdf
-    runs into any lexical errors, it will print a warning indicating
-    that content may be damaged. The only situation in which qpdf is
-    known to cause damage during content normalization is when a
-    page's contents are split across multiple streams and streams are
-    split in the middle of a lexical token such as a string, name, or
-    inline image. There may be some pathological cases in which qpdf
-    could damage content without noticing this, such as if the partial
-    tokens at the end of one stream and the beginning of the next
-    stream are both valid, but usually qpdf will be able to detect
-    this case. For slightly increased safety, you can specify
-    <option>--coalesce-contents</option> in addition to
-    <option>--normalize-content</option> or <option>--qdf</option>.
-    This will cause qpdf to combine all the content streams into one,
-    thus recombining any split tokens. However doing this will prevent
-    you from being able to see the original layout of the content
-    streams. If you must inspect the original content streams in an
+    When normalizing content, if qpdf runs into any lexical errors, it
+    will print a warning indicating that content may be damaged. The
+    only situation in which qpdf is known to cause damage during
+    content normalization is when a page's contents are split across
+    multiple streams and streams are split in the middle of a lexical
+    token such as a string, name, or inline image. Note that files
+    that do this are invalid since the PDF specification states that
+    content streams are not to be split in the middle of a token. If
+    you want to inspect the original content streams in an
    uncompressed format, you can always run with <option>--qdf
    --normalize-content=n</option> for a QDF file without content
    normalization, or alternatively
--- a/qpdf/qpdf.testcov
+++ b/qpdf/qpdf.testcov
@ -455,3 +455,4 @@ qpdf found shared resources in leaf 0
 qpdf found shared xobject in leaf 0
 QPDF copy foreign with data 1
 QPDF copy foreign with foreign_stream 1
+QPDFObjectHandle need_newline 1
--- a/qpdf/qtest/qpdf.test
+++ b/qpdf/qtest/qpdf.test
@ -1591,13 +1591,21 @@ $td->runtest("type checks with object streams",

 # ----------
 $td->notify("--- Coalesce contents ---");
-$n_tests += 6;
+$n_tests += 8;

 $td->runtest("qdf with normalize warnings",
             {$td->COMMAND =>
-                  "qpdf --qdf --static-id coalesce.pdf a.pdf"},
+                  "qpdf --qdf --static-id split-tokens.pdf a.pdf"},
             {$td->FILE => "normalize-warnings.out", $td->EXIT_STATUS => 3},
             $td->NORMALIZE_NEWLINES);
+$td->runtest("check output",
+             {$td->FILE => "a.pdf"},
+             {$td->FILE => "split-tokens.qdf"});
+$td->runtest("coalesce to qdf",
+             {$td->COMMAND =>
+                  "qpdf --qdf --static-id coalesce.pdf a.pdf"},
+             {$td->STRING => "", $td->EXIT_STATUS => 0},
+             $td->NORMALIZE_NEWLINES);
 $td->runtest("check output",
             {$td->FILE => "a.pdf"},
             {$td->FILE => "coalesce.qdf"});
@ -1831,12 +1839,12 @@ $td->runtest("unreferenced resources with bad token",
             {$td->COMMAND =>
                  "qpdf --qdf --static-id --split-pages=2" .
                  " --remove-unreferenced-resources=yes" .
-                  " coalesce.pdf split-out-bad-token.pdf"},
-             {$td->FILE => "coalesce-split.out", $td->EXIT_STATUS => 3},
+                  " split-tokens.pdf split-out-bad-token.pdf"},
+             {$td->FILE => "split-tokens-split.out", $td->EXIT_STATUS => 3},
             $td->NORMALIZE_NEWLINES);
 $td->runtest("check output",
             {$td->FILE => "split-out-bad-token-1-2.pdf"},
-             {$td->FILE => "coalesce-split-1-2.pdf"});
+             {$td->FILE => "split-tokens-split-1-2.pdf"});

 $td->runtest("shared images in form xobject",
             {$td->COMMAND => "qpdf --qdf --static-id --split-pages".
--- a/qpdf/qtest/qpdf/coalesce-out.pdf
+++ b/qpdf/qtest/qpdf/coalesce-out.pdf
--- a/qpdf/qtest/qpdf/coalesce-out.qdf
+++ b/qpdf/qtest/qpdf/coalesce-out.qdf
--- a/qpdf/qtest/qpdf/coalesce.pdf
+++ b/qpdf/qtest/qpdf/coalesce.pdf
--- a/qpdf/qtest/qpdf/coalesce.qdf
+++ b/qpdf/qtest/qpdf/coalesce.qdf
--- a/qpdf/qtest/qpdf/normalize-warnings.out
+++ b/qpdf/qtest/qpdf/normalize-warnings.out
@ -1,9 +1,9 @@
-WARNING: coalesce.pdf (offset 671): content normalization encountered bad tokens
-WARNING: coalesce.pdf (offset 671): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
-WARNING: coalesce.pdf (offset 671): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
-WARNING: coalesce.pdf (offset 823): content normalization encountered bad tokens
-WARNING: coalesce.pdf (offset 823): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
-WARNING: coalesce.pdf (offset 962): content normalization encountered bad tokens
-WARNING: coalesce.pdf (offset 962): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
-WARNING: coalesce.pdf (offset 962): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+WARNING: split-tokens.pdf (offset 671): content normalization encountered bad tokens
+WARNING: split-tokens.pdf (offset 671): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
+WARNING: split-tokens.pdf (offset 671): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+WARNING: split-tokens.pdf (offset 823): content normalization encountered bad tokens
+WARNING: split-tokens.pdf (offset 823): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+WARNING: split-tokens.pdf (offset 962): content normalization encountered bad tokens
+WARNING: split-tokens.pdf (offset 962): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
+WARNING: split-tokens.pdf (offset 962): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
 qpdf: operation succeeded with warnings; resulting file may have some problems
--- a/qpdf/qtest/qpdf/split-tokens-split-1-2.pdf
+++ b/qpdf/qtest/qpdf/split-tokens-split-1-2.pdf
--- a/qpdf/qtest/qpdf/split-tokens-split.out
+++ b/qpdf/qtest/qpdf/split-tokens-split.out
@ -1,4 +1,4 @@
-WARNING: coalesce.pdf, object 3 0 at offset 181: Bad token found while scanning content stream; not attempting to remove unreferenced objects from this page
+WARNING: split-tokens.pdf, object 3 0 at offset 181: Bad token found while scanning content stream; not attempting to remove unreferenced objects from this page
 WARNING: empty PDF: content normalization encountered bad tokens
 WARNING: empty PDF: normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
 WARNING: empty PDF: Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
--- a/qpdf/qtest/qpdf/split-tokens.pdf
+++ b/qpdf/qtest/qpdf/split-tokens.pdf
@ -0,0 +1,217 @@
+%PDF-1.3
+%¿÷¢þ
+%QDF-1.0
+
+1 0 obj
+<<
+  /Pages 2 0 R
+  /Type /Catalog
+>>
+endobj
+
+2 0 obj
+<<
+  /Count 2
+  /Kids [
+    3 0 R
+    4 0 R
+  ]
+  /Type /Pages
+>>
+endobj
+
+%% Page 1
+3 0 obj
+<<
+  /Contents [
+    5 0 R
+    7 0 R
+    9 0 R
+    11 0 R
+  ]
+  /MediaBox [
+    0
+    0
+    612
+    792
+  ]
+  /Parent 2 0 R
+  /Resources <<
+    /Font <<
+      /F1 13 0 R
+    >>
+    /ProcSet 14 0 R
+  >>
+  /Type /Page
+>>
+endobj
+
+%% Page 2
+4 0 obj
+<<
+  /Contents 15 0 R
+  /MediaBox [
+    0
+    0
+    612
+    792
+  ]
+  /Parent 2 0 R
+  /Resources <<
+    /Font <<
+      /F1 17 0 R
+    >>
+    /ProcSet 18 0 R
+  >>
+  /Type /Page
+>>
+endobj
+
+%% Contents for page 1
+5 0 obj
+<<
+  /Length 6 0 R
+>>
+stream
+BT
+  /F1 24 Tf
+  72 720 Td
+  (Pot
+endstream
+endobj
+
+%QDF: ignore_newline
+6 0 obj
+33
+endobj
+
+%% Contents for page 1
+7 0 obj
+<<
+  /Length 8 0 R
+>>
+stream
+ato) Tj
+ET [ /array
+endstream
+endobj
+
+%QDF: ignore_newline
+8 0 obj
+19
+endobj
+
+%% Contents for page 1
+9 0 obj
+<<
+  /Length 10 0 R
+>>
+stream
+/split ] BI
+/CS /G/W 66/H 47/BPC 8/F/Fl/DP<</Predictor 15/Columns 66>>
+ID xœÅÖIÃ P|ÿC;UÈ`ÀÓ7‘Z©¦Ä˜Úæ<C39A>}Dðï_´øÉW©„œÄ-”ˆ>ÿ‡À<E280A1>>”^&®¡uâ]€"!‡•–*¬&<26>E|Sy® ðd-€<<3C>B0Bú@Nê+<hlèKÐî/56L ‰<C2A0>ã £–¹¦>0>Y<>ù!cì\YØ%Yð¥Ö8?& Öëˆ}j’ûè;«<>3<EFBFBD>ÂÖlpÛsHöûtú
+endstream
+endobj
+
+%QDF: ignore_newline
+10 0 obj
+253
+endobj
+
+%% Contents for page 1
+11 0 obj
+<<
+  /Length 12 0 R
+>>
+stream
+QØTt*hÌUúãwÍÕÐ%¨)p–³"•DiRj¹–DYNUÓÙAv’Fà&
<0A>ÍÔu#c•ÆW	ôß‰W“O
+EI
+endstream
+endobj
+
+%QDF: ignore_newline
+12 0 obj
+66
+endobj
+
+13 0 obj
+<<
+  /BaseFont /Helvetica
+  /Encoding /WinAnsiEncoding
+  /Name /F1
+  /Subtype /Type1
+  /Type /Font
+>>
+endobj
+
+14 0 obj
+[
+  /PDF
+  /Text
+]
+endobj
+
+%% Contents for page 2
+15 0 obj
+<<
+  /Length 16 0 R
+>>
+stream
+BT
+  /F1 24 Tf
+  72 720 Td
+  (Potato) Tj
+ET
+endstream
+endobj
+
+16 0 obj
+44
+endobj
+
+17 0 obj
+<<
+  /BaseFont /Helvetica
+  /Encoding /WinAnsiEncoding
+  /Name /F1
+  /Subtype /Type1
+  /Type /Font
+>>
+endobj
+
+18 0 obj
+[
+  /PDF
+  /Text
+]
+endobj
+
+xref
+0 19
+0000000000 65535 f 
+0000000025 00000 n 
+0000000079 00000 n 
+0000000171 00000 n 
+0000000416 00000 n 
+0000000634 00000 n 
+0000000744 00000 n 
+0000000786 00000 n 
+0000000882 00000 n 
+0000000924 00000 n 
+0000001255 00000 n 
+0000001299 00000 n 
+0000001444 00000 n 
+0000001464 00000 n 
+0000001583 00000 n 
+0000001642 00000 n 
+0000001743 00000 n 
+0000001763 00000 n 
+0000001882 00000 n 
+trailer <<
+  /Root 1 0 R
+  /Size 19
+  /ID [<fa46a90bcf56476b9904a2e7adb75024><6af379f20e8dcd4e724869daec3ba023>]
+>>
+startxref
+1918
+%%EOF
--- a/qpdf/qtest/qpdf/split-tokens.qdf
+++ b/qpdf/qtest/qpdf/split-tokens.qdf
@ -0,0 +1,231 @@
+%PDF-1.3
+%¿÷¢þ
+%QDF-1.0
+
+%% Original object ID: 1 0
+1 0 obj
+<<
+  /Pages 2 0 R
+  /Type /Catalog
+>>
+endobj
+
+%% Original object ID: 2 0
+2 0 obj
+<<
+  /Count 2
+  /Kids [
+    3 0 R
+    4 0 R
+  ]
+  /Type /Pages
+>>
+endobj
+
+%% Page 1
+%% Original object ID: 3 0
+3 0 obj
+<<
+  /Contents [
+    5 0 R
+    7 0 R
+    9 0 R
+    11 0 R
+  ]
+  /MediaBox [
+    0
+    0
+    612
+    792
+  ]
+  /Parent 2 0 R
+  /Resources <<
+    /Font <<
+      /F1 13 0 R
+    >>
+    /ProcSet 14 0 R
+  >>
+  /Type /Page
+>>
+endobj
+
+%% Page 2
+%% Original object ID: 4 0
+4 0 obj
+<<
+  /Contents 15 0 R
+  /MediaBox [
+    0
+    0
+    612
+    792
+  ]
+  /Parent 2 0 R
+  /Resources <<
+    /Font <<
+      /F1 17 0 R
+    >>
+    /ProcSet 18 0 R
+  >>
+  /Type /Page
+>>
+endobj
+
+%% Contents for page 1
+%% Original object ID: 5 0
+5 0 obj
+<<
+  /Length 6 0 R
+>>
+stream
+BT
+  /F1 24 Tf
+  72 720 Td
+  (Pot
+endstream
+endobj
+
+%QDF: ignore_newline
+6 0 obj
+33
+endobj
+
+%% Contents for page 1
+%% Original object ID: 7 0
+7 0 obj
+<<
+  /Length 8 0 R
+>>
+stream
+ato) Tj
+ET [ /array
+endstream
+endobj
+
+%QDF: ignore_newline
+8 0 obj
+19
+endobj
+
+%% Contents for page 1
+%% Original object ID: 9 0
+9 0 obj
+<<
+  /Length 10 0 R
+>>
+stream
+/split ] BI
+/CS /G/W 66/H 47/BPC 8/F/Fl/DP<</Predictor 15/Columns 66>>
+ID xœÅÖIÃ P|ÿC;UÈ`ÀÓ7‘Z©¦Ä˜Úæ<C39A>}Dðï_´øÉW©„œÄ-”ˆ>ÿ‡À<E280A1>>”^&®¡uâ]€"!‡•–*¬&<26>E|Sy® ðd-€<<3C>B0Bú@Nê+<hlèKÐî/56L ‰<C2A0>ã £–¹¦>0>Y<>ù!cì\YØ%Yð¥Ö8?& Öëˆ}j’ûè;«<>3<EFBFBD>ÂÖlpÛsHöûtú
+endstream
+endobj
+
+%QDF: ignore_newline
+10 0 obj
+253
+endobj
+
+%% Contents for page 1
+%% Original object ID: 11 0
+11 0 obj
+<<
+  /Length 12 0 R
+>>
+stream
+QØTt*hÌUúãwÍÕÐ%¨)p–³"•DiRj¹–DYNUÓÙAv’Fà&
+<EFBFBD>ÍÔu#c•ÆW	ôß‰W“O
+EI
+endstream
+endobj
+
+%QDF: ignore_newline
+12 0 obj
+65
+endobj
+
+%% Original object ID: 13 0
+13 0 obj
+<<
+  /BaseFont /Helvetica
+  /Encoding /WinAnsiEncoding
+  /Name /F1
+  /Subtype /Type1
+  /Type /Font
+>>
+endobj
+
+%% Original object ID: 14 0
+14 0 obj
+[
+  /PDF
+  /Text
+]
+endobj
+
+%% Contents for page 2
+%% Original object ID: 15 0
+15 0 obj
+<<
+  /Length 16 0 R
+>>
+stream
+BT
+  /F1 24 Tf
+  72 720 Td
+  (Potato) Tj
+ET
+endstream
+endobj
+
+16 0 obj
+44
+endobj
+
+%% Original object ID: 17 0
+17 0 obj
+<<
+  /BaseFont /Helvetica
+  /Encoding /WinAnsiEncoding
+  /Name /F1
+  /Subtype /Type1
+  /Type /Font
+>>
+endobj
+
+%% Original object ID: 18 0
+18 0 obj
+[
+  /PDF
+  /Text
+]
+endobj
+
+xref
+0 19
+0000000000 65535 f 
+0000000052 00000 n 
+0000000133 00000 n 
+0000000252 00000 n 
+0000000524 00000 n 
+0000000769 00000 n 
+0000000879 00000 n 
+0000000948 00000 n 
+0000001044 00000 n 
+0000001113 00000 n 
+0000001444 00000 n 
+0000001516 00000 n 
+0000001660 00000 n 
+0000001708 00000 n 
+0000001855 00000 n 
+0000001942 00000 n 
+0000002043 00000 n 
+0000002091 00000 n 
+0000002238 00000 n 
+trailer <<
+  /Root 1 0 R
+  /Size 19
+  /ID [<fa46a90bcf56476b9904a2e7adb75024><31415926535897932384626433832795>]
+>>
+startxref
+2274
+%%EOF
--- a/qpdf/qtest/qpdf/token-filters-out.pdf
+++ b/qpdf/qtest/qpdf/token-filters-out.pdf