Detect and report bad tokens in content normalization

This commit is contained in:
Jay Berkenbilt 2018-02-02 21:16:40 -05:00
parent 30709935af
commit 5136238f2a
9 changed files with 343 additions and 4 deletions

View File

@ -153,6 +153,25 @@
* Provide heavily annotated examples/pdf-filter-tokens.cc example
that illustrates use of some simple token filters.
* When normalizing content streams, as in qdf mode, issue warning
about bad tokens. Content streams are only normalized when this is
explicitly requested, so this has no impact on normal operation.
However, in qdf mode, if qpdf detects a bad token, it means that
either there's a bug in qpdf's lexer, that the file is damaged, or
that the page's contents are split in a weird way. In any of those
cases, qpdf could potentially damage the stream's contents by
replacing carriage returns with newlines or otherwise messing with
spaces. The most likely case of this would be an inline image's
compressed data being divided across two streams and having the
compressed data in the second stream contain a carriage return as
part of its binary data. If you are using qdf mode just to look at
PDF files in text editors, this usually doesn't matter. In cases
of contents split across multiple streams, coalescing streams
would eliminate the problem, so the warning mentions this. Prior
to this enhancement, the chances of qdf mode writing incorrect
data were already very low. This change should make it nearly
impossible for qdf mode to unknowingly write invalid data.
2018-02-04 Jay Berkenbilt <ejb@ql.org>
* Add QPDFWriter::setLinearizationPass1Filename method and

View File

@ -1,7 +1,9 @@
#include <qpdf/ContentNormalizer.hh>
#include <qpdf/QUtil.hh>
ContentNormalizer::ContentNormalizer()
ContentNormalizer::ContentNormalizer() :
any_bad_tokens(false),
last_token_was_bad(false)
{
}
@ -15,6 +17,16 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const& token)
std::string value = token.getRawValue();
QPDFTokenizer::token_type_e token_type = token.getType();
if (token_type == QPDFTokenizer::tt_bad)
{
this->any_bad_tokens = true;
this->last_token_was_bad = true;
}
else if (token_type != QPDFTokenizer::tt_eof)
{
this->last_token_was_bad = false;
}
switch (token_type)
{
case QPDFTokenizer::tt_space:
@ -75,3 +87,15 @@ ContentNormalizer::handleEOF()
{
finish();
}
bool
ContentNormalizer::anyBadTokens() const
{
return this->any_bad_tokens;
}
bool
ContentNormalizer::lastTokenWasBad()const
{
return this->last_token_was_bad;
}

View File

@ -609,6 +609,33 @@ QPDF_Stream::pipeStreamData(Pipeline* pipeline,
}
}
if (filter &&
(! suppress_warnings) &&
normalizer.getPointer() &&
normalizer->anyBadTokens())
{
warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(),
"", this->offset,
"content normalization encountered bad tokens"));
if (normalizer->lastTokenWasBad())
{
QTC::TC("qpdf", "QPDF_Stream bad token at end during normalize");
warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(),
"", this->offset,
"normalized content ended with a bad token;"
" you may be able to resolve this by"
" coalescing content streams in combination"
" with normalizing content. From the command"
" line, specify --coalesce-contents"));
}
warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(),
"", this->offset,
"Resulting stream data may be corrupted but is"
" may still useful for manual inspection."
" For more information on this warning, search"
" for content normalization in the manual."));
}
return filter;
}

View File

@ -10,6 +10,13 @@ class ContentNormalizer: public QPDFObjectHandle::TokenFilter
virtual ~ContentNormalizer();
virtual void handleToken(QPDFTokenizer::Token const&);
virtual void handleEOF();
bool anyBadTokens() const;
bool lastTokenWasBad() const;
private:
bool any_bad_tokens;
bool last_token_was_bad;
};
#endif // __CONTENTNORMALIZER_HH__

View File

@ -306,3 +306,4 @@ Pl_QPDFTokenizer found ID 0
QPDFObjectHandle non-stream in stream array 0
QPDFObjectHandle coalesce called on stream 0
QPDFObjectHandle coalesce provide stream data 0
QPDF_Stream bad token at end during normalize 0

View File

@ -737,8 +737,16 @@ $td->runtest("stream with tiff predictor",
show_ntests();
# ----------
$td->notify("--- Coalesce contents ---");
$n_tests += 4;
$n_tests += 6;
$td->runtest("qdf with normalize warnings",
{$td->COMMAND =>
"qpdf --qdf --static-id coalesce.pdf a.pdf"},
{$td->FILE => "normalize-warnings.out", $td->EXIT_STATUS => 3},
$td->NORMALIZE_NEWLINES);
$td->runtest("check output",
{$td->FILE => "a.pdf"},
{$td->FILE => "coalesce.qdf"});
$td->runtest("coalesce contents with qdf",
{$td->COMMAND =>
"qpdf --qdf --static-id" .

View File

@ -0,0 +1,231 @@
%PDF-1.3
%¿÷¢þ
%QDF-1.0
%% Original object ID: 1 0
1 0 obj
<<
/Pages 2 0 R
/Type /Catalog
>>
endobj
%% Original object ID: 2 0
2 0 obj
<<
/Count 2
/Kids [
3 0 R
4 0 R
]
/Type /Pages
>>
endobj
%% Page 1
%% Original object ID: 3 0
3 0 obj
<<
/Contents [
5 0 R
7 0 R
9 0 R
11 0 R
]
/MediaBox [
0
0
612
792
]
/Parent 2 0 R
/Resources <<
/Font <<
/F1 13 0 R
>>
/ProcSet 14 0 R
>>
/Type /Page
>>
endobj
%% Page 2
%% Original object ID: 4 0
4 0 obj
<<
/Contents 15 0 R
/MediaBox [
0
0
612
792
]
/Parent 2 0 R
/Resources <<
/Font <<
/F1 17 0 R
>>
/ProcSet 18 0 R
>>
/Type /Page
>>
endobj
%% Contents for page 1
%% Original object ID: 5 0
5 0 obj
<<
/Length 6 0 R
>>
stream
BT
/F1 24 Tf
72 720 Td
(Pot
endstream
endobj
%QDF: ignore_newline
6 0 obj
33
endobj
%% Contents for page 1
%% Original object ID: 7 0
7 0 obj
<<
/Length 8 0 R
>>
stream
ato) Tj
ET [ /array
endstream
endobj
%QDF: ignore_newline
8 0 obj
19
endobj
%% Contents for page 1
%% Original object ID: 9 0
9 0 obj
<<
/Length 10 0 R
>>
stream
/split ] BI
/CS /G/W 66/H 47/BPC 8/F/Fl/DP<</Predictor 15/Columns 66>>
ID xœÅÖIà P|ÿC;UÈ`ÀÓ7 ¦ĘÚæ<C39A>}Dðï_´øÉW©„œÄ-”ˆ>ÿ‡À<E280A1>>”^&®¡uâ]€"!‡•*¬&<26>E|Sy® ðd-€<<3C>B0Bú@Nê+<hlèKÐî/56L <C2A0>ã £–¹¦>0>Y<>ù!cì\Y Ø%Yð¥Ö8?& Öëˆ}jûè<>3<EFBFBD>ÂÖlpÛsHöûtú
endstream
endobj
%QDF: ignore_newline
10 0 obj
253
endobj
%% Contents for page 1
%% Original object ID: 11 0
11 0 obj
<<
/Length 12 0 R
>>
stream
QØTt*hÌUúãwÍÕÐ%¨)p³"•DiRj¹DYNUÓÙAvFà&
<EFBFBD>ÍÔu#c•ÆW ô߉W“O
EI
endstream
endobj
%QDF: ignore_newline
12 0 obj
65
endobj
%% Original object ID: 13 0
13 0 obj
<<
/BaseFont /Helvetica
/Encoding /WinAnsiEncoding
/Name /F1
/Subtype /Type1
/Type /Font
>>
endobj
%% Original object ID: 14 0
14 0 obj
[
/PDF
/Text
]
endobj
%% Contents for page 2
%% Original object ID: 15 0
15 0 obj
<<
/Length 16 0 R
>>
stream
BT
/F1 24 Tf
72 720 Td
(Potato) Tj
ET
endstream
endobj
16 0 obj
44
endobj
%% Original object ID: 17 0
17 0 obj
<<
/BaseFont /Helvetica
/Encoding /WinAnsiEncoding
/Name /F1
/Subtype /Type1
/Type /Font
>>
endobj
%% Original object ID: 18 0
18 0 obj
[
/PDF
/Text
]
endobj
xref
0 19
0000000000 65535 f
0000000052 00000 n
0000000133 00000 n
0000000252 00000 n
0000000524 00000 n
0000000769 00000 n
0000000879 00000 n
0000000948 00000 n
0000001044 00000 n
0000001113 00000 n
0000001444 00000 n
0000001516 00000 n
0000001660 00000 n
0000001708 00000 n
0000001855 00000 n
0000001942 00000 n
0000002043 00000 n
0000002091 00000 n
0000002238 00000 n
trailer <<
/Root 1 0 R
/Size 19
/ID [<fa46a90bcf56476b9904a2e7adb75024><31415926535897932384626433832795>]
>>
startxref
2274
%%EOF

View File

@ -13,7 +13,9 @@ three lines
<8a8b>
(ab)
<8c><dd> ) >
<610062> (MOO)-- stream 1 --
<610062> (MOO)WARNING: good14.pdf (file position 628): content normalization encountered bad tokens
WARNING: good14.pdf (file position 628): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
-- stream 1 --
This stream does end with a newline.
// tests:
// bad tokens preserved
@ -31,10 +33,18 @@ This stream does end with a newline.
/good name
/bad#00name
WARNING: good14.pdf (file position 860): content normalization encountered bad tokens
WARNING: good14.pdf (file position 860): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
-- stream 2 --
(This stream ends with a \001 bad token
WARNING: good14.pdf (file position 1316): content normalization encountered bad tokens
WARNING: good14.pdf (file position 1316): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
WARNING: good14.pdf (file position 1316): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
-- stream 3 --
<AB X-- stream 4 --
<AB XWARNING: good14.pdf (file position 1406): content normalization encountered bad tokens
WARNING: good14.pdf (file position 1406): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
WARNING: good14.pdf (file position 1406): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
-- stream 4 --
(ends with a name)
/ThisMustBeLast-- stream 5 --
% This stream has an inline image marker that is not terminated
@ -44,4 +54,7 @@ BI
ID
<506f7
461746f>
WARNING: good14.pdf (file position 1549): content normalization encountered bad tokens
WARNING: good14.pdf (file position 1549): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
WARNING: good14.pdf (file position 1549): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
test 3 done

View File

@ -0,0 +1,9 @@
WARNING: coalesce.pdf (file position 671): content normalization encountered bad tokens
WARNING: coalesce.pdf (file position 671): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
WARNING: coalesce.pdf (file position 671): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
WARNING: coalesce.pdf (file position 823): content normalization encountered bad tokens
WARNING: coalesce.pdf (file position 823): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
WARNING: coalesce.pdf (file position 962): content normalization encountered bad tokens
WARNING: coalesce.pdf (file position 962): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
WARNING: coalesce.pdf (file position 962): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
qpdf: operation succeeded with warnings; resulting file may have some problems