mirror of
https://github.com/qpdf/qpdf.git
synced 2025-01-02 22:50:20 +00:00
Detect and report bad tokens in content normalization
This commit is contained in:
parent
30709935af
commit
5136238f2a
19
ChangeLog
19
ChangeLog
@ -153,6 +153,25 @@
|
||||
* Provide heavily annotated examples/pdf-filter-tokens.cc example
|
||||
that illustrates use of some simple token filters.
|
||||
|
||||
* When normalizing content streams, as in qdf mode, issue a warning
|
||||
about bad tokens. Content streams are only normalized when this is
|
||||
explicitly requested, so this has no impact on normal operation.
|
||||
However, in qdf mode, if qpdf detects a bad token, it means that
|
||||
either there's a bug in qpdf's lexer, that the file is damaged, or
|
||||
that the page's contents are split in a weird way. In any of those
|
||||
cases, qpdf could potentially damage the stream's contents by
|
||||
replacing carriage returns with newlines or otherwise messing with
|
||||
spaces. The most likely case of this would be an inline image's
|
||||
compressed data being divided across two streams and having the
|
||||
compressed data in the second stream contain a carriage return as
|
||||
part of its binary data. If you are using qdf mode just to look at
|
||||
PDF files in text editors, this usually doesn't matter. In cases
|
||||
of contents split across multiple streams, coalescing streams
|
||||
would eliminate the problem, so the warning mentions this. Prior
|
||||
to this enhancement, the chances of qdf mode writing incorrect
|
||||
data were already very low. This change should make it nearly
|
||||
impossible for qdf mode to unknowingly write invalid data.
|
||||
|
||||
2018-02-04 Jay Berkenbilt <ejb@ql.org>
|
||||
|
||||
* Add QPDFWriter::setLinearizationPass1Filename method and
|
||||
|
@ -1,7 +1,9 @@
|
||||
#include <qpdf/ContentNormalizer.hh>
|
||||
#include <qpdf/QUtil.hh>
|
||||
|
||||
ContentNormalizer::ContentNormalizer()
|
||||
// Construct a normalizer that has not yet seen any bad tokens: both
// tracking flags start out false and are updated by handleToken().
ContentNormalizer::ContentNormalizer() :
|
||||
any_bad_tokens(false),
|
||||
last_token_was_bad(false)
|
||||
{
|
||||
}
|
||||
|
||||
@ -15,6 +17,16 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const& token)
|
||||
std::string value = token.getRawValue();
|
||||
QPDFTokenizer::token_type_e token_type = token.getType();
|
||||
|
||||
if (token_type == QPDFTokenizer::tt_bad)
|
||||
{
|
||||
this->any_bad_tokens = true;
|
||||
this->last_token_was_bad = true;
|
||||
}
|
||||
else if (token_type != QPDFTokenizer::tt_eof)
|
||||
{
|
||||
this->last_token_was_bad = false;
|
||||
}
|
||||
|
||||
switch (token_type)
|
||||
{
|
||||
case QPDFTokenizer::tt_space:
|
||||
@ -75,3 +87,15 @@ ContentNormalizer::handleEOF()
|
||||
{
|
||||
finish();
|
||||
}
|
||||
|
||||
// Return true if handleToken() has been given at least one token of
// type QPDFTokenizer::tt_bad. The flag is set on the first bad token
// and is never cleared by the code shown here.
bool
|
||||
ContentNormalizer::anyBadTokens() const
|
||||
{
|
||||
return this->any_bad_tokens;
|
||||
}
|
||||
|
||||
// Return true if the most recent token given to handleToken() was of
// type QPDFTokenizer::tt_bad. A tt_eof token leaves the flag
// unchanged; any other token type resets it to false.
bool
|
||||
ContentNormalizer::lastTokenWasBad()const
|
||||
{
|
||||
return this->last_token_was_bad;
|
||||
}
|
||||
|
@ -609,6 +609,33 @@ QPDF_Stream::pipeStreamData(Pipeline* pipeline,
|
||||
}
|
||||
}
|
||||
|
||||
if (filter &&
|
||||
(! suppress_warnings) &&
|
||||
normalizer.getPointer() &&
|
||||
normalizer->anyBadTokens())
|
||||
{
|
||||
warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(),
|
||||
"", this->offset,
|
||||
"content normalization encountered bad tokens"));
|
||||
if (normalizer->lastTokenWasBad())
|
||||
{
|
||||
QTC::TC("qpdf", "QPDF_Stream bad token at end during normalize");
|
||||
warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(),
|
||||
"", this->offset,
|
||||
"normalized content ended with a bad token;"
|
||||
" you may be able to resolve this by"
|
||||
" coalescing content streams in combination"
|
||||
" with normalizing content. From the command"
|
||||
" line, specify --coalesce-contents"));
|
||||
}
|
||||
warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(),
|
||||
"", this->offset,
|
||||
"Resulting stream data may be corrupted but is"
|
||||
" may still useful for manual inspection."
|
||||
" For more information on this warning, search"
|
||||
" for content normalization in the manual."));
|
||||
}
|
||||
|
||||
return filter;
|
||||
}
|
||||
|
||||
|
@ -10,6 +10,13 @@ class ContentNormalizer: public QPDFObjectHandle::TokenFilter
|
||||
virtual ~ContentNormalizer();
|
||||
virtual void handleToken(QPDFTokenizer::Token const&);
|
||||
virtual void handleEOF();
|
||||
|
||||
bool anyBadTokens() const;
|
||||
bool lastTokenWasBad() const;
|
||||
|
||||
private:
|
||||
bool any_bad_tokens;
|
||||
bool last_token_was_bad;
|
||||
};
|
||||
|
||||
#endif // __CONTENTNORMALIZER_HH__
|
||||
|
@ -306,3 +306,4 @@ Pl_QPDFTokenizer found ID 0
|
||||
QPDFObjectHandle non-stream in stream array 0
|
||||
QPDFObjectHandle coalesce called on stream 0
|
||||
QPDFObjectHandle coalesce provide stream data 0
|
||||
QPDF_Stream bad token at end during normalize 0
|
||||
|
@ -737,8 +737,16 @@ $td->runtest("stream with tiff predictor",
|
||||
show_ntests();
|
||||
# ----------
|
||||
$td->notify("--- Coalesce contents ---");
|
||||
$n_tests += 4;
|
||||
$n_tests += 6;
|
||||
|
||||
$td->runtest("qdf with normalize warnings",
|
||||
{$td->COMMAND =>
|
||||
"qpdf --qdf --static-id coalesce.pdf a.pdf"},
|
||||
{$td->FILE => "normalize-warnings.out", $td->EXIT_STATUS => 3},
|
||||
$td->NORMALIZE_NEWLINES);
|
||||
$td->runtest("check output",
|
||||
{$td->FILE => "a.pdf"},
|
||||
{$td->FILE => "coalesce.qdf"});
|
||||
$td->runtest("coalesce contents with qdf",
|
||||
{$td->COMMAND =>
|
||||
"qpdf --qdf --static-id" .
|
||||
|
231
qpdf/qtest/qpdf/coalesce.qdf
Normal file
231
qpdf/qtest/qpdf/coalesce.qdf
Normal file
@ -0,0 +1,231 @@
|
||||
%PDF-1.3
|
||||
%¿÷¢þ
|
||||
%QDF-1.0
|
||||
|
||||
%% Original object ID: 1 0
|
||||
1 0 obj
|
||||
<<
|
||||
/Pages 2 0 R
|
||||
/Type /Catalog
|
||||
>>
|
||||
endobj
|
||||
|
||||
%% Original object ID: 2 0
|
||||
2 0 obj
|
||||
<<
|
||||
/Count 2
|
||||
/Kids [
|
||||
3 0 R
|
||||
4 0 R
|
||||
]
|
||||
/Type /Pages
|
||||
>>
|
||||
endobj
|
||||
|
||||
%% Page 1
|
||||
%% Original object ID: 3 0
|
||||
3 0 obj
|
||||
<<
|
||||
/Contents [
|
||||
5 0 R
|
||||
7 0 R
|
||||
9 0 R
|
||||
11 0 R
|
||||
]
|
||||
/MediaBox [
|
||||
0
|
||||
0
|
||||
612
|
||||
792
|
||||
]
|
||||
/Parent 2 0 R
|
||||
/Resources <<
|
||||
/Font <<
|
||||
/F1 13 0 R
|
||||
>>
|
||||
/ProcSet 14 0 R
|
||||
>>
|
||||
/Type /Page
|
||||
>>
|
||||
endobj
|
||||
|
||||
%% Page 2
|
||||
%% Original object ID: 4 0
|
||||
4 0 obj
|
||||
<<
|
||||
/Contents 15 0 R
|
||||
/MediaBox [
|
||||
0
|
||||
0
|
||||
612
|
||||
792
|
||||
]
|
||||
/Parent 2 0 R
|
||||
/Resources <<
|
||||
/Font <<
|
||||
/F1 17 0 R
|
||||
>>
|
||||
/ProcSet 18 0 R
|
||||
>>
|
||||
/Type /Page
|
||||
>>
|
||||
endobj
|
||||
|
||||
%% Contents for page 1
|
||||
%% Original object ID: 5 0
|
||||
5 0 obj
|
||||
<<
|
||||
/Length 6 0 R
|
||||
>>
|
||||
stream
|
||||
BT
|
||||
/F1 24 Tf
|
||||
72 720 Td
|
||||
(Pot
|
||||
endstream
|
||||
endobj
|
||||
|
||||
%QDF: ignore_newline
|
||||
6 0 obj
|
||||
33
|
||||
endobj
|
||||
|
||||
%% Contents for page 1
|
||||
%% Original object ID: 7 0
|
||||
7 0 obj
|
||||
<<
|
||||
/Length 8 0 R
|
||||
>>
|
||||
stream
|
||||
ato) Tj
|
||||
ET [ /array
|
||||
endstream
|
||||
endobj
|
||||
|
||||
%QDF: ignore_newline
|
||||
8 0 obj
|
||||
19
|
||||
endobj
|
||||
|
||||
%% Contents for page 1
|
||||
%% Original object ID: 9 0
|
||||
9 0 obj
|
||||
<<
|
||||
/Length 10 0 R
|
||||
>>
|
||||
stream
|
||||
/split ] BI
|
||||
/CS /G/W 66/H 47/BPC 8/F/Fl/DP<</Predictor 15/Columns 66>>
|
||||
ID xœÅÖIà P|ÿC;UÈ`ÀÓ7‘Z©¦Ä˜Úæ<C39A>}Dðï_´øÉW©„œÄ-”ˆ>ÿ‡À<E280A1>>”^&®¡uâ]€"!‡•–*¬&<26>E|Sy® ðd-€<<3C>B0Bú@Nê+<hlèKÐî/56L ‰<C2A0>ã £–¹¦>0>Y<>ù!cì\YØ%Yð¥Ö8?& Öëˆ}j’ûè;«<>3<EFBFBD>ÂÖlpÛsHöûtú
|
||||
endstream
|
||||
endobj
|
||||
|
||||
%QDF: ignore_newline
|
||||
10 0 obj
|
||||
253
|
||||
endobj
|
||||
|
||||
%% Contents for page 1
|
||||
%% Original object ID: 11 0
|
||||
11 0 obj
|
||||
<<
|
||||
/Length 12 0 R
|
||||
>>
|
||||
stream
|
||||
QØTt*hÌUúãwÍÕÐ%¨)p–³"•DiRj¹–DYNUÓÙAv’Fà&
|
||||
<EFBFBD>ÍÔu#c•ÆW ô߉W“O
|
||||
EI
|
||||
endstream
|
||||
endobj
|
||||
|
||||
%QDF: ignore_newline
|
||||
12 0 obj
|
||||
65
|
||||
endobj
|
||||
|
||||
%% Original object ID: 13 0
|
||||
13 0 obj
|
||||
<<
|
||||
/BaseFont /Helvetica
|
||||
/Encoding /WinAnsiEncoding
|
||||
/Name /F1
|
||||
/Subtype /Type1
|
||||
/Type /Font
|
||||
>>
|
||||
endobj
|
||||
|
||||
%% Original object ID: 14 0
|
||||
14 0 obj
|
||||
[
|
||||
/PDF
|
||||
/Text
|
||||
]
|
||||
endobj
|
||||
|
||||
%% Contents for page 2
|
||||
%% Original object ID: 15 0
|
||||
15 0 obj
|
||||
<<
|
||||
/Length 16 0 R
|
||||
>>
|
||||
stream
|
||||
BT
|
||||
/F1 24 Tf
|
||||
72 720 Td
|
||||
(Potato) Tj
|
||||
ET
|
||||
endstream
|
||||
endobj
|
||||
|
||||
16 0 obj
|
||||
44
|
||||
endobj
|
||||
|
||||
%% Original object ID: 17 0
|
||||
17 0 obj
|
||||
<<
|
||||
/BaseFont /Helvetica
|
||||
/Encoding /WinAnsiEncoding
|
||||
/Name /F1
|
||||
/Subtype /Type1
|
||||
/Type /Font
|
||||
>>
|
||||
endobj
|
||||
|
||||
%% Original object ID: 18 0
|
||||
18 0 obj
|
||||
[
|
||||
/PDF
|
||||
/Text
|
||||
]
|
||||
endobj
|
||||
|
||||
xref
|
||||
0 19
|
||||
0000000000 65535 f
|
||||
0000000052 00000 n
|
||||
0000000133 00000 n
|
||||
0000000252 00000 n
|
||||
0000000524 00000 n
|
||||
0000000769 00000 n
|
||||
0000000879 00000 n
|
||||
0000000948 00000 n
|
||||
0000001044 00000 n
|
||||
0000001113 00000 n
|
||||
0000001444 00000 n
|
||||
0000001516 00000 n
|
||||
0000001660 00000 n
|
||||
0000001708 00000 n
|
||||
0000001855 00000 n
|
||||
0000001942 00000 n
|
||||
0000002043 00000 n
|
||||
0000002091 00000 n
|
||||
0000002238 00000 n
|
||||
trailer <<
|
||||
/Root 1 0 R
|
||||
/Size 19
|
||||
/ID [<fa46a90bcf56476b9904a2e7adb75024><31415926535897932384626433832795>]
|
||||
>>
|
||||
startxref
|
||||
2274
|
||||
%%EOF
|
@ -13,7 +13,9 @@ three lines
|
||||
<8a8b>
|
||||
(ab)
|
||||
<8c><dd> ) >
|
||||
<610062> (MOO)-- stream 1 --
|
||||
<610062> (MOO)WARNING: good14.pdf (file position 628): content normalization encountered bad tokens
|
||||
WARNING: good14.pdf (file position 628): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
|
||||
-- stream 1 --
|
||||
This stream does end with a newline.
|
||||
// tests:
|
||||
// bad tokens preserved
|
||||
@ -31,10 +33,18 @@ This stream does end with a newline.
|
||||
|
||||
/good name
|
||||
/bad#00name
|
||||
WARNING: good14.pdf (file position 860): content normalization encountered bad tokens
|
||||
WARNING: good14.pdf (file position 860): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
|
||||
-- stream 2 --
|
||||
(This stream ends with a \001 bad token
|
||||
WARNING: good14.pdf (file position 1316): content normalization encountered bad tokens
|
||||
WARNING: good14.pdf (file position 1316): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
|
||||
WARNING: good14.pdf (file position 1316): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
|
||||
-- stream 3 --
|
||||
<AB X-- stream 4 --
|
||||
<AB XWARNING: good14.pdf (file position 1406): content normalization encountered bad tokens
|
||||
WARNING: good14.pdf (file position 1406): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
|
||||
WARNING: good14.pdf (file position 1406): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
|
||||
-- stream 4 --
|
||||
(ends with a name)
|
||||
/ThisMustBeLast-- stream 5 --
|
||||
% This stream has an inline image marker that is not terminated
|
||||
@ -44,4 +54,7 @@ BI
|
||||
ID
|
||||
<506f7
|
||||
461746f>
|
||||
WARNING: good14.pdf (file position 1549): content normalization encountered bad tokens
|
||||
WARNING: good14.pdf (file position 1549): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
|
||||
WARNING: good14.pdf (file position 1549): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
|
||||
test 3 done
|
||||
|
9
qpdf/qtest/qpdf/normalize-warnings.out
Normal file
9
qpdf/qtest/qpdf/normalize-warnings.out
Normal file
@ -0,0 +1,9 @@
|
||||
WARNING: coalesce.pdf (file position 671): content normalization encountered bad tokens
|
||||
WARNING: coalesce.pdf (file position 671): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
|
||||
WARNING: coalesce.pdf (file position 671): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
|
||||
WARNING: coalesce.pdf (file position 823): content normalization encountered bad tokens
|
||||
WARNING: coalesce.pdf (file position 823): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
|
||||
WARNING: coalesce.pdf (file position 962): content normalization encountered bad tokens
|
||||
WARNING: coalesce.pdf (file position 962): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
|
||||
WARNING: coalesce.pdf (file position 962): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
|
||||
qpdf: operation succeeded with warnings; resulting file may have some problems
|
Loading…
Reference in New Issue
Block a user