2
1
mirror of https://github.com/qpdf/qpdf.git synced 2024-06-01 01:40:51 +00:00

Additional checks for unreferenced resources

Explicitly abandon removal of unreferenced resources if there are any
lexical errors in the page's contents. This case always generated a
warning, but it now also prevents removal of unreferenced resources,
thus strongly decreasing the likelihood of data loss.
This commit is contained in:
Jay Berkenbilt 2019-01-17 08:56:58 -05:00
parent e09ae710dc
commit 5cfcd4f361
5 changed files with 272 additions and 3 deletions

View File

@ -99,11 +99,16 @@ QPDFPageObjectHelper::addContentTokenFilter(
class NameWatcher: public QPDFObjectHandle::TokenFilter
{
public:
NameWatcher() :
saw_bad(false)
{
}
virtual ~NameWatcher()
{
}
virtual void handleToken(QPDFTokenizer::Token const&);
std::set<std::string> names;
bool saw_bad;
};
void
@ -116,6 +121,10 @@ NameWatcher::handleToken(QPDFTokenizer::Token const& token)
this->names.insert(
QPDFObjectHandle::newName(token.getValue()).getName());
}
else if (token.getType() == QPDFTokenizer::tt_bad)
{
saw_bad = true;
}
writeToken(token);
}
@ -134,6 +143,14 @@ QPDFPageObjectHelper::removeUnreferencedResources()
"; not attempting to remove unreferenced objects from this page");
return;
}
if (nw.saw_bad)
{
QTC::TC("qpdf", "QPDFPageObjectHelper bad token finding names");
this->oh.warnIfPossible(
"Bad token found while scanning content stream; "
"not attempting to remove unreferenced objects from this page");
return;
}
// Walk through /Font and /XObject dictionaries, removing any
// resources that are not referenced. We must make copies of
// resource dictionaries down into the dictionaries are mutating

View File

@ -412,3 +412,4 @@ QPDF copy foreign stream with provider 0
QPDF copy foreign stream with buffer 0
QPDF immediate copy stream data 0
qpdf copy same page more than once 1
QPDFPageObjectHelper bad token finding names 0

View File

@ -1384,7 +1384,7 @@ my @sp_cases = (
[11, 'pdf extension', '', 'split-out.Pdf'],
[4, 'fallback', '--pages 11-pages.pdf 1-3 minimal.pdf --', 'split-out'],
);
$n_tests += 21;
$n_tests += 23;
for (@sp_cases)
{
$n_tests += 1 + $_->[0];
@ -1482,10 +1482,20 @@ $td->runtest("split shared font, xobject",
foreach my $i (qw(1 2 3 4))
{
$td->runtest("check output ($i)",
{$td->FILE => "shared-font-xobject-split-$i.pdf"},
{$td->FILE => "split-out-shared-font-xobject-$i.pdf"});
{$td->FILE => "split-out-shared-font-xobject-$i.pdf"},
{$td->FILE => "shared-font-xobject-split-$i.pdf"});
}
$td->runtest("unreferenced resources with bad token",
{$td->COMMAND =>
"qpdf --qdf --static-id --split-pages=2" .
" coalesce.pdf split-out-bad-token.pdf"},
{$td->FILE => "coalesce-split.out", $td->EXIT_STATUS => 3},
$td->NORMALIZE_NEWLINES);
$td->runtest("check output",
{$td->FILE => "split-out-bad-token-1-2.pdf"},
{$td->FILE => "coalesce-split-1-2.pdf"});
show_ntests();
# ----------
$td->notify("--- Keep Files Open ---");

View File

@ -0,0 +1,231 @@
%PDF-1.3
%¿÷¢þ
%QDF-1.0
%% Original object ID: 1 0
1 0 obj
<<
/Pages 2 0 R
/Type /Catalog
>>
endobj
%% Original object ID: 2 0
2 0 obj
<<
/Count 2
/Kids [
3 0 R
4 0 R
]
/Type /Pages
>>
endobj
%% Page 1
%% Original object ID: 3 0
3 0 obj
<<
/Contents [
5 0 R
7 0 R
9 0 R
11 0 R
]
/MediaBox [
0
0
612
792
]
/Parent 2 0 R
/Resources <<
/Font <<
/F1 13 0 R
>>
/ProcSet 14 0 R
>>
/Type /Page
>>
endobj
%% Page 2
%% Original object ID: 14 0
4 0 obj
<<
/Contents 15 0 R
/MediaBox [
0
0
612
792
]
/Parent 2 0 R
/Resources <<
/Font <<
/F1 17 0 R
>>
/ProcSet 18 0 R
>>
/Type /Page
>>
endobj
%% Contents for page 1
%% Original object ID: 4 0
5 0 obj
<<
/Length 6 0 R
>>
stream
BT
/F1 24 Tf
72 720 Td
(Pot
endstream
endobj
%QDF: ignore_newline
6 0 obj
33
endobj
%% Contents for page 1
%% Original object ID: 6 0
7 0 obj
<<
/Length 8 0 R
>>
stream
ato) Tj
ET [ /array
endstream
endobj
%QDF: ignore_newline
8 0 obj
19
endobj
%% Contents for page 1
%% Original object ID: 8 0
9 0 obj
<<
/Length 10 0 R
>>
stream
/split ] BI
/CS /G/W 66/H 47/BPC 8/F/Fl/DP<</Predictor 15/Columns 66>>
ID xœÅÖIà P|ÿC;UÈ`ÀÓ7 ¦ĘÚæ<C39A>}Dðï_´øÉW©„œÄ-”ˆ>ÿ‡À<E280A1>>”^&®¡uâ]€"!‡•*¬&<26>E|Sy® ðd-€<<3C>B0Bú@Nê+<hlèKÐî/56L <C2A0>ã £–¹¦>0>Y<>ù!cì\Y Ø%Yð¥Ö8?& Öëˆ}jûè<>3<EFBFBD>ÂÖlpÛsHöûtú
endstream
endobj
%QDF: ignore_newline
10 0 obj
253
endobj
%% Contents for page 1
%% Original object ID: 10 0
11 0 obj
<<
/Length 12 0 R
>>
stream
QØTt*hÌUúãwÍÕÐ%¨)p³"•DiRj¹DYNUÓÙAvFà&
<EFBFBD>ÍÔu#c•ÆW ô߉W“O
EI
endstream
endobj
%QDF: ignore_newline
12 0 obj
65
endobj
%% Original object ID: 12 0
13 0 obj
<<
/BaseFont /Helvetica
/Encoding /WinAnsiEncoding
/Name /F1
/Subtype /Type1
/Type /Font
>>
endobj
%% Original object ID: 13 0
14 0 obj
[
/PDF
/Text
]
endobj
%% Contents for page 2
%% Original object ID: 15 0
15 0 obj
<<
/Length 16 0 R
>>
stream
BT
/F1 24 Tf
72 720 Td
(Potato) Tj
ET
endstream
endobj
16 0 obj
44
endobj
%% Original object ID: 17 0
17 0 obj
<<
/BaseFont /Helvetica
/Encoding /WinAnsiEncoding
/Name /F1
/Subtype /Type1
/Type /Font
>>
endobj
%% Original object ID: 18 0
18 0 obj
[
/PDF
/Text
]
endobj
xref
0 19
0000000000 65535 f
0000000052 00000 n
0000000133 00000 n
0000000252 00000 n
0000000525 00000 n
0000000770 00000 n
0000000880 00000 n
0000000949 00000 n
0000001045 00000 n
0000001114 00000 n
0000001445 00000 n
0000001517 00000 n
0000001661 00000 n
0000001709 00000 n
0000001856 00000 n
0000001943 00000 n
0000002044 00000 n
0000002092 00000 n
0000002239 00000 n
trailer <<
/Root 1 0 R
/Size 19
/ID [<31415926535897932384626433832795><31415926535897932384626433832795>]
>>
startxref
2275
%%EOF

View File

@ -0,0 +1,10 @@
WARNING: coalesce.pdf, object 3 0 at offset 181: Bad token found while scanning content stream; not attempting to remove unreferenced objects from this page
WARNING: empty PDF: content normalization encountered bad tokens
WARNING: empty PDF: normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
WARNING: empty PDF: Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
WARNING: empty PDF: content normalization encountered bad tokens
WARNING: empty PDF: Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
WARNING: empty PDF: content normalization encountered bad tokens
WARNING: empty PDF: normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
WARNING: empty PDF: Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
qpdf: operation succeeded with warnings; resulting file may have some problems