Additional checks for unreferenced resources

Explicitly abandon removal of unreferenced resources if there are any lexical errors in the page's contents. This case always generated a warning, but it now also prevents removal of unreferenced resources, thus strongly decreasing the likelihood of data loss.
2024-06-01 01:40:51 +00:00 · 2019-01-17 08:56:58 -05:00 · 2019-01-17 08:56:58 -05:00 · 5cfcd4f361
commit 5cfcd4f361
parent e09ae710dc
5 changed files with 272 additions and 3 deletions
--- a/libqpdf/QPDFPageObjectHelper.cc
+++ b/libqpdf/QPDFPageObjectHelper.cc
@ -99,11 +99,16 @@ QPDFPageObjectHelper::addContentTokenFilter(
 class NameWatcher: public QPDFObjectHandle::TokenFilter
 {
  public:
+    NameWatcher() :
+        saw_bad(false)
+    {
+    }
    virtual ~NameWatcher()
    {
    }
    virtual void handleToken(QPDFTokenizer::Token const&);
    std::set<std::string> names;
+    bool saw_bad;
 };

 void
@ -116,6 +121,10 @@ NameWatcher::handleToken(QPDFTokenizer::Token const& token)
        this->names.insert(
            QPDFObjectHandle::newName(token.getValue()).getName());
    }
+    else if (token.getType() == QPDFTokenizer::tt_bad)
+    {
+        saw_bad = true;
+    }
    writeToken(token);
 }

@ -134,6 +143,14 @@ QPDFPageObjectHelper::removeUnreferencedResources()
            "; not attempting to remove unreferenced objects from this page");
        return;
    }
+    if (nw.saw_bad)
+    {
+        QTC::TC("qpdf", "QPDFPageObjectHelper bad token finding names");
+        this->oh.warnIfPossible(
+            "Bad token found while scanning content stream; "
+            "not attempting to remove unreferenced objects from this page");
+        return;
+    }
    // Walk through /Font and /XObject dictionaries, removing any
    // resources that are not referenced. We must make copies of
    // resource dictionaries down into the dictionaries are mutating
--- a/qpdf/qpdf.testcov
+++ b/qpdf/qpdf.testcov
@ -412,3 +412,4 @@ QPDF copy foreign stream with provider 0
 QPDF copy foreign stream with buffer 0
 QPDF immediate copy stream data 0
 qpdf copy same page more than once 1
+QPDFPageObjectHelper bad token finding names 0
--- a/qpdf/qtest/qpdf.test
+++ b/qpdf/qtest/qpdf.test
@ -1384,7 +1384,7 @@ my @sp_cases = (
    [11, 'pdf extension', '', 'split-out.Pdf'],
    [4, 'fallback', '--pages 11-pages.pdf 1-3 minimal.pdf --', 'split-out'],
    );
-$n_tests += 21;
+$n_tests += 23;
 for (@sp_cases)
 {
    $n_tests += 1 + $_->[0];
@ -1482,10 +1482,20 @@ $td->runtest("split shared font, xobject",
 foreach my $i (qw(1 2 3 4))
 {
    $td->runtest("check output ($i)",
-                 {$td->FILE => "shared-font-xobject-split-$i.pdf"},
-                 {$td->FILE => "split-out-shared-font-xobject-$i.pdf"});
+                 {$td->FILE => "split-out-shared-font-xobject-$i.pdf"},
+                 {$td->FILE => "shared-font-xobject-split-$i.pdf"});
 }

+$td->runtest("unreferenced resources with bad token",
+             {$td->COMMAND =>
+                  "qpdf --qdf --static-id --split-pages=2" .
+                  " coalesce.pdf split-out-bad-token.pdf"},
+             {$td->FILE => "coalesce-split.out", $td->EXIT_STATUS => 3},
+             $td->NORMALIZE_NEWLINES);
+$td->runtest("check output",
+             {$td->FILE => "split-out-bad-token-1-2.pdf"},
+             {$td->FILE => "coalesce-split-1-2.pdf"});
+
 show_ntests();
 # ----------
 $td->notify("--- Keep Files Open ---");
--- a/qpdf/qtest/qpdf/coalesce-split-1-2.pdf
+++ b/qpdf/qtest/qpdf/coalesce-split-1-2.pdf
@ -0,0 +1,231 @@
+%PDF-1.3
+%¿÷¢þ
+%QDF-1.0
+
+%% Original object ID: 1 0
+1 0 obj
+<<
+  /Pages 2 0 R
+  /Type /Catalog
+>>
+endobj
+
+%% Original object ID: 2 0
+2 0 obj
+<<
+  /Count 2
+  /Kids [
+    3 0 R
+    4 0 R
+  ]
+  /Type /Pages
+>>
+endobj
+
+%% Page 1
+%% Original object ID: 3 0
+3 0 obj
+<<
+  /Contents [
+    5 0 R
+    7 0 R
+    9 0 R
+    11 0 R
+  ]
+  /MediaBox [
+    0
+    0
+    612
+    792
+  ]
+  /Parent 2 0 R
+  /Resources <<
+    /Font <<
+      /F1 13 0 R
+    >>
+    /ProcSet 14 0 R
+  >>
+  /Type /Page
+>>
+endobj
+
+%% Page 2
+%% Original object ID: 14 0
+4 0 obj
+<<
+  /Contents 15 0 R
+  /MediaBox [
+    0
+    0
+    612
+    792
+  ]
+  /Parent 2 0 R
+  /Resources <<
+    /Font <<
+      /F1 17 0 R
+    >>
+    /ProcSet 18 0 R
+  >>
+  /Type /Page
+>>
+endobj
+
+%% Contents for page 1
+%% Original object ID: 4 0
+5 0 obj
+<<
+  /Length 6 0 R
+>>
+stream
+BT
+  /F1 24 Tf
+  72 720 Td
+  (Pot
+endstream
+endobj
+
+%QDF: ignore_newline
+6 0 obj
+33
+endobj
+
+%% Contents for page 1
+%% Original object ID: 6 0
+7 0 obj
+<<
+  /Length 8 0 R
+>>
+stream
+ato) Tj
+ET [ /array
+endstream
+endobj
+
+%QDF: ignore_newline
+8 0 obj
+19
+endobj
+
+%% Contents for page 1
+%% Original object ID: 8 0
+9 0 obj
+<<
+  /Length 10 0 R
+>>
+stream
+/split ] BI
+/CS /G/W 66/H 47/BPC 8/F/Fl/DP<</Predictor 15/Columns 66>>
+ID xœÅÖIÃ P|ÿC;UÈ`ÀÓ7‘Z©¦Ä˜Úæ<C39A>}Dðï_´øÉW©„œÄ-”ˆ>ÿ‡À<E280A1>>”^&®¡uâ]€"!‡•–*¬&<26>E|Sy® ðd-€<<3C>B0Bú@Nê+<hlèKÐî/56L ‰<C2A0>ã £–¹¦>0>Y<>ù!cì\YØ%Yð¥Ö8?& Öëˆ}j’ûè;«<>3<EFBFBD>ÂÖlpÛsHöûtú
+endstream
+endobj
+
+%QDF: ignore_newline
+10 0 obj
+253
+endobj
+
+%% Contents for page 1
+%% Original object ID: 10 0
+11 0 obj
+<<
+  /Length 12 0 R
+>>
+stream
+QØTt*hÌUúãwÍÕÐ%¨)p–³"•DiRj¹–DYNUÓÙAv’Fà&
+<EFBFBD>ÍÔu#c•ÆW	ôß‰W“O
+EI
+endstream
+endobj
+
+%QDF: ignore_newline
+12 0 obj
+65
+endobj
+
+%% Original object ID: 12 0
+13 0 obj
+<<
+  /BaseFont /Helvetica
+  /Encoding /WinAnsiEncoding
+  /Name /F1
+  /Subtype /Type1
+  /Type /Font
+>>
+endobj
+
+%% Original object ID: 13 0
+14 0 obj
+[
+  /PDF
+  /Text
+]
+endobj
+
+%% Contents for page 2
+%% Original object ID: 15 0
+15 0 obj
+<<
+  /Length 16 0 R
+>>
+stream
+BT
+  /F1 24 Tf
+  72 720 Td
+  (Potato) Tj
+ET
+endstream
+endobj
+
+16 0 obj
+44
+endobj
+
+%% Original object ID: 17 0
+17 0 obj
+<<
+  /BaseFont /Helvetica
+  /Encoding /WinAnsiEncoding
+  /Name /F1
+  /Subtype /Type1
+  /Type /Font
+>>
+endobj
+
+%% Original object ID: 18 0
+18 0 obj
+[
+  /PDF
+  /Text
+]
+endobj
+
+xref
+0 19
+0000000000 65535 f 
+0000000052 00000 n 
+0000000133 00000 n 
+0000000252 00000 n 
+0000000525 00000 n 
+0000000770 00000 n 
+0000000880 00000 n 
+0000000949 00000 n 
+0000001045 00000 n 
+0000001114 00000 n 
+0000001445 00000 n 
+0000001517 00000 n 
+0000001661 00000 n 
+0000001709 00000 n 
+0000001856 00000 n 
+0000001943 00000 n 
+0000002044 00000 n 
+0000002092 00000 n 
+0000002239 00000 n 
+trailer <<
+  /Root 1 0 R
+  /Size 19
+  /ID [<31415926535897932384626433832795><31415926535897932384626433832795>]
+>>
+startxref
+2275
+%%EOF
--- a/qpdf/qtest/qpdf/coalesce-split.out
+++ b/qpdf/qtest/qpdf/coalesce-split.out
@ -0,0 +1,10 @@
+WARNING: coalesce.pdf, object 3 0 at offset 181: Bad token found while scanning content stream; not attempting to remove unreferenced objects from this page
+WARNING: empty PDF: content normalization encountered bad tokens
+WARNING: empty PDF: normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
+WARNING: empty PDF: Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+WARNING: empty PDF: content normalization encountered bad tokens
+WARNING: empty PDF: Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+WARNING: empty PDF: content normalization encountered bad tokens
+WARNING: empty PDF: normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
+WARNING: empty PDF: Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+qpdf: operation succeeded with warnings; resulting file may have some problems