Detect and report bad tokens in content normalization

2025-02-07 06:08:26 +00:00 · 2018-02-02 21:16:40 -05:00 · 2018-02-02 21:16:40 -05:00 · 5136238f2a
commit 5136238f2a
parent 30709935af
9 changed files with 343 additions and 4 deletions
--- a/19
+++ b/19
@ -153,6 +153,25 @@
 	* Provide heavily annoated examples/pdf-filter-tokens.cc example
 	that illustrates use of some simple token filters.

+	* When normalizing content streams, as in qdf mode, issue warning
+	about bad tokens. Content streams are only normalized when this is
+	explicitly requested, so this has no impact on normal operation.
+	However, in qdf mode, if qpdf detects a bad token, it means that
+	either there's a bug in qpdf's lexer, that the file is damaged, or
+	that the page's contents are split in a weird way. In any of those
+	cases, qpdf could potentially damage the stream's contents by
+	replacing carriage returns with newlines or otherwise messing with
+	spaces. The most likely case of this would be an inline image's
+	compressed data being divided across two streams and having the
+	compressed data in the second stream contain a carriage return as
+	part of its binary data. If you are using qdf mode just to look at
+	PDF files in text editors, this usually doesn't matter. In cases
+	of contents split across multiple streams, coalescing streams
+	would eliminate the problem, so the warning mentions this. Prior
+	to this enhancement, the chances of qdf mode writing incorrect
+	data were already very low. This change should make it nearly
+	impossible for qdf mode to unknowingly write invalid data.
+
 2018-02-04  Jay Berkenbilt  <ejb@ql.org>

 	* Add QPDFWriter::setLinearizationPass1Filename method and
--- a/libqpdf/ContentNormalizer.cc
+++ b/libqpdf/ContentNormalizer.cc
@ -1,7 +1,9 @@
 #include <qpdf/ContentNormalizer.hh>
 #include <qpdf/QUtil.hh>

-ContentNormalizer::ContentNormalizer()
+ContentNormalizer::ContentNormalizer() :
+    any_bad_tokens(false),
+    last_token_was_bad(false)
 {
 }

@ -15,6 +17,16 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const& token)
    std::string value = token.getRawValue();
    QPDFTokenizer::token_type_e token_type = token.getType();

+    if (token_type == QPDFTokenizer::tt_bad)
+    {
+        this->any_bad_tokens = true;
+        this->last_token_was_bad = true;
+    }
+    else if (token_type != QPDFTokenizer::tt_eof)
+    {
+        this->last_token_was_bad = false;
+    }
+
    switch (token_type)
    {
      case QPDFTokenizer::tt_space:
@ -75,3 +87,15 @@ ContentNormalizer::handleEOF()
 {
    finish();
 }
+
+bool
+ContentNormalizer::anyBadTokens() const
+{
+    return this->any_bad_tokens;
+}
+
+bool
+ContentNormalizer::lastTokenWasBad()const
+{
+    return this->last_token_was_bad;
+}
--- a/libqpdf/QPDF_Stream.cc
+++ b/libqpdf/QPDF_Stream.cc
@ -609,6 +609,33 @@ QPDF_Stream::pipeStreamData(Pipeline* pipeline,
        }
    }

+    if (filter &&
+        (! suppress_warnings) &&
+        normalizer.getPointer() &&
+        normalizer->anyBadTokens())
+    {
+        warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(),
+                     "", this->offset,
+                     "content normalization encountered bad tokens"));
+        if (normalizer->lastTokenWasBad())
+        {
+            QTC::TC("qpdf", "QPDF_Stream bad token at end during normalize");
+            warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(),
+                         "", this->offset,
+                         "normalized content ended with a bad token;"
+                         " you may be able to resolve this by"
+                         " coalescing content streams in combination"
+                         " with normalizing content. From the command"
+                         " line, specify --coalesce-contents"));
+        }
+        warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(),
+                     "", this->offset,
+                     "Resulting stream data may be corrupted but is"
+                     " may still useful for manual inspection."
+                     " For more information on this warning, search"
+                     " for content normalization in the manual."));
+    }
+
    return filter;
 }

--- a/libqpdf/qpdf/ContentNormalizer.hh
+++ b/libqpdf/qpdf/ContentNormalizer.hh
@ -10,6 +10,13 @@ class ContentNormalizer: public QPDFObjectHandle::TokenFilter
    virtual ~ContentNormalizer();
    virtual void handleToken(QPDFTokenizer::Token const&);
    virtual void handleEOF();
+
+    bool anyBadTokens() const;
+    bool lastTokenWasBad() const;
+
+  private:
+    bool any_bad_tokens;
+    bool last_token_was_bad;
 };

 #endif // __CONTENTNORMALIZER_HH__
--- a/qpdf/qpdf.testcov
+++ b/qpdf/qpdf.testcov
@ -306,3 +306,4 @@ Pl_QPDFTokenizer found ID 0
 QPDFObjectHandle non-stream in stream array 0
 QPDFObjectHandle coalesce called on stream 0
 QPDFObjectHandle coalesce provide stream data 0
+QPDF_Stream bad token at end during normalize 0
--- a/qpdf/qtest/qpdf.test
+++ b/qpdf/qtest/qpdf.test
@ -737,8 +737,16 @@ $td->runtest("stream with tiff predictor",
 show_ntests();
 # ----------
 $td->notify("--- Coalesce contents ---");
-$n_tests += 4;
+$n_tests += 6;

+$td->runtest("qdf with normalize warnings",
+             {$td->COMMAND =>
+                  "qpdf --qdf --static-id coalesce.pdf a.pdf"},
+             {$td->FILE => "normalize-warnings.out", $td->EXIT_STATUS => 3},
+             $td->NORMALIZE_NEWLINES);
+$td->runtest("check output",
+             {$td->FILE => "a.pdf"},
+             {$td->FILE => "coalesce.qdf"});
 $td->runtest("coalesce contents with qdf",
             {$td->COMMAND =>
                  "qpdf --qdf --static-id" .
--- a/qpdf/qtest/qpdf/coalesce.qdf
+++ b/qpdf/qtest/qpdf/coalesce.qdf
@ -0,0 +1,231 @@
+%PDF-1.3
+%¿÷¢þ
+%QDF-1.0
+
+%% Original object ID: 1 0
+1 0 obj
+<<
+  /Pages 2 0 R
+  /Type /Catalog
+>>
+endobj
+
+%% Original object ID: 2 0
+2 0 obj
+<<
+  /Count 2
+  /Kids [
+    3 0 R
+    4 0 R
+  ]
+  /Type /Pages
+>>
+endobj
+
+%% Page 1
+%% Original object ID: 3 0
+3 0 obj
+<<
+  /Contents [
+    5 0 R
+    7 0 R
+    9 0 R
+    11 0 R
+  ]
+  /MediaBox [
+    0
+    0
+    612
+    792
+  ]
+  /Parent 2 0 R
+  /Resources <<
+    /Font <<
+      /F1 13 0 R
+    >>
+    /ProcSet 14 0 R
+  >>
+  /Type /Page
+>>
+endobj
+
+%% Page 2
+%% Original object ID: 4 0
+4 0 obj
+<<
+  /Contents 15 0 R
+  /MediaBox [
+    0
+    0
+    612
+    792
+  ]
+  /Parent 2 0 R
+  /Resources <<
+    /Font <<
+      /F1 17 0 R
+    >>
+    /ProcSet 18 0 R
+  >>
+  /Type /Page
+>>
+endobj
+
+%% Contents for page 1
+%% Original object ID: 5 0
+5 0 obj
+<<
+  /Length 6 0 R
+>>
+stream
+BT
+  /F1 24 Tf
+  72 720 Td
+  (Pot
+endstream
+endobj
+
+%QDF: ignore_newline
+6 0 obj
+33
+endobj
+
+%% Contents for page 1
+%% Original object ID: 7 0
+7 0 obj
+<<
+  /Length 8 0 R
+>>
+stream
+ato) Tj
+ET [ /array
+endstream
+endobj
+
+%QDF: ignore_newline
+8 0 obj
+19
+endobj
+
+%% Contents for page 1
+%% Original object ID: 9 0
+9 0 obj
+<<
+  /Length 10 0 R
+>>
+stream
+/split ] BI
+/CS /G/W 66/H 47/BPC 8/F/Fl/DP<</Predictor 15/Columns 66>>
+ID xœÅÖIÃ P|ÿC;UÈ`ÀÓ7‘Z©¦Ä˜Úæ<C39A>}Dðï_´øÉW©„œÄ-”ˆ>ÿ‡À<E280A1>>”^&®¡uâ]€"!‡•–*¬&<26>E|Sy® ðd-€<<3C>B0Bú@Nê+<hlèKÐî/56L ‰<C2A0>ã £–¹¦>0>Y<>ù!cì\YØ%Yð¥Ö8?& Öëˆ}j’ûè;«<>3<EFBFBD>ÂÖlpÛsHöûtú
+endstream
+endobj
+
+%QDF: ignore_newline
+10 0 obj
+253
+endobj
+
+%% Contents for page 1
+%% Original object ID: 11 0
+11 0 obj
+<<
+  /Length 12 0 R
+>>
+stream
+QØTt*hÌUúãwÍÕÐ%¨)p–³"•DiRj¹–DYNUÓÙAv’Fà&
+<EFBFBD>ÍÔu#c•ÆW	ôß‰W“O
+EI
+endstream
+endobj
+
+%QDF: ignore_newline
+12 0 obj
+65
+endobj
+
+%% Original object ID: 13 0
+13 0 obj
+<<
+  /BaseFont /Helvetica
+  /Encoding /WinAnsiEncoding
+  /Name /F1
+  /Subtype /Type1
+  /Type /Font
+>>
+endobj
+
+%% Original object ID: 14 0
+14 0 obj
+[
+  /PDF
+  /Text
+]
+endobj
+
+%% Contents for page 2
+%% Original object ID: 15 0
+15 0 obj
+<<
+  /Length 16 0 R
+>>
+stream
+BT
+  /F1 24 Tf
+  72 720 Td
+  (Potato) Tj
+ET
+endstream
+endobj
+
+16 0 obj
+44
+endobj
+
+%% Original object ID: 17 0
+17 0 obj
+<<
+  /BaseFont /Helvetica
+  /Encoding /WinAnsiEncoding
+  /Name /F1
+  /Subtype /Type1
+  /Type /Font
+>>
+endobj
+
+%% Original object ID: 18 0
+18 0 obj
+[
+  /PDF
+  /Text
+]
+endobj
+
+xref
+0 19
+0000000000 65535 f 
+0000000052 00000 n 
+0000000133 00000 n 
+0000000252 00000 n 
+0000000524 00000 n 
+0000000769 00000 n 
+0000000879 00000 n 
+0000000948 00000 n 
+0000001044 00000 n 
+0000001113 00000 n 
+0000001444 00000 n 
+0000001516 00000 n 
+0000001660 00000 n 
+0000001708 00000 n 
+0000001855 00000 n 
+0000001942 00000 n 
+0000002043 00000 n 
+0000002091 00000 n 
+0000002238 00000 n 
+trailer <<
+  /Root 1 0 R
+  /Size 19
+  /ID [<fa46a90bcf56476b9904a2e7adb75024><31415926535897932384626433832795>]
+>>
+startxref
+2274
+%%EOF
--- a/qpdf/qtest/qpdf/good14.out
+++ b/qpdf/qtest/qpdf/good14.out
@ -13,7 +13,9 @@ three lines
 <8a8b>
 (ab)
 <8c><dd> ) >
-<610062> (MOO)-- stream 1 --
+<610062> (MOO)WARNING: good14.pdf (file position 628): content normalization encountered bad tokens
+WARNING: good14.pdf (file position 628): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+-- stream 1 --
 This stream does end with a newline.
 // tests:
 //   bad tokens preserved
@ -31,10 +33,18 @@ This stream does end with a newline.
  
 /good name
 /bad#00name
+WARNING: good14.pdf (file position 860): content normalization encountered bad tokens
+WARNING: good14.pdf (file position 860): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
 -- stream 2 --
 (This stream ends with a \001 bad token
+WARNING: good14.pdf (file position 1316): content normalization encountered bad tokens
+WARNING: good14.pdf (file position 1316): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
+WARNING: good14.pdf (file position 1316): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
 -- stream 3 --
-<AB X-- stream 4 --
+<AB XWARNING: good14.pdf (file position 1406): content normalization encountered bad tokens
+WARNING: good14.pdf (file position 1406): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
+WARNING: good14.pdf (file position 1406): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+-- stream 4 --
 (ends with a name)
 /ThisMustBeLast-- stream 5 --
 % This stream has an inline image marker that is not terminated
@ -44,4 +54,7 @@ BI
 ID
 <506f7
 461746f>
+WARNING: good14.pdf (file position 1549): content normalization encountered bad tokens
+WARNING: good14.pdf (file position 1549): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
+WARNING: good14.pdf (file position 1549): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
 test 3 done
--- a/qpdf/qtest/qpdf/normalize-warnings.out
+++ b/qpdf/qtest/qpdf/normalize-warnings.out
@ -0,0 +1,9 @@
+WARNING: coalesce.pdf (file position 671): content normalization encountered bad tokens
+WARNING: coalesce.pdf (file position 671): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
+WARNING: coalesce.pdf (file position 671): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+WARNING: coalesce.pdf (file position 823): content normalization encountered bad tokens
+WARNING: coalesce.pdf (file position 823): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+WARNING: coalesce.pdf (file position 962): content normalization encountered bad tokens
+WARNING: coalesce.pdf (file position 962): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
+WARNING: coalesce.pdf (file position 962): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+qpdf: operation succeeded with warnings; resulting file may have some problems