mirror of
https://github.com/qpdf/qpdf.git
synced 2025-01-05 08:02:11 +00:00
Detect and report bad tokens in content normalization
This commit is contained in:
parent
30709935af
commit
5136238f2a
19
ChangeLog
19
ChangeLog
@ -153,6 +153,25 @@
|
|||||||
* Provide heavily annotated examples/pdf-filter-tokens.cc example
|
* Provide heavily annotated examples/pdf-filter-tokens.cc example
|
||||||
that illustrates use of some simple token filters.
|
that illustrates use of some simple token filters.
|
||||||
|
|
||||||
|
* When normalizing content streams, as in qdf mode, issue a warning
|
||||||
|
about bad tokens. Content streams are only normalized when this is
|
||||||
|
explicitly requested, so this has no impact on normal operation.
|
||||||
|
However, in qdf mode, if qpdf detects a bad token, it means that
|
||||||
|
either there's a bug in qpdf's lexer, that the file is damaged, or
|
||||||
|
that the page's contents are split in a weird way. In any of those
|
||||||
|
cases, qpdf could potentially damage the stream's contents by
|
||||||
|
replacing carriage returns with newlines or otherwise messing with
|
||||||
|
spaces. The most likely case of this would be an inline image's
|
||||||
|
compressed data being divided across two streams and having the
|
||||||
|
compressed data in the second stream contain a carriage return as
|
||||||
|
part of its binary data. If you are using qdf mode just to look at
|
||||||
|
PDF files in text editors, this usually doesn't matter. In cases
|
||||||
|
of contents split across multiple streams, coalescing streams
|
||||||
|
would eliminate the problem, so the warning mentions this. Prior
|
||||||
|
to this enhancement, the chances of qdf mode writing incorrect
|
||||||
|
data were already very low. This change should make it nearly
|
||||||
|
impossible for qdf mode to unknowingly write invalid data.
|
||||||
|
|
||||||
2018-02-04 Jay Berkenbilt <ejb@ql.org>
|
2018-02-04 Jay Berkenbilt <ejb@ql.org>
|
||||||
|
|
||||||
* Add QPDFWriter::setLinearizationPass1Filename method and
|
* Add QPDFWriter::setLinearizationPass1Filename method and
|
||||||
|
@ -1,7 +1,9 @@
|
|||||||
#include <qpdf/ContentNormalizer.hh>
|
#include <qpdf/ContentNormalizer.hh>
|
||||||
#include <qpdf/QUtil.hh>
|
#include <qpdf/QUtil.hh>
|
||||||
|
|
||||||
ContentNormalizer::ContentNormalizer()
|
ContentNormalizer::ContentNormalizer() :
|
||||||
|
any_bad_tokens(false),
|
||||||
|
last_token_was_bad(false)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -15,6 +17,16 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const& token)
|
|||||||
std::string value = token.getRawValue();
|
std::string value = token.getRawValue();
|
||||||
QPDFTokenizer::token_type_e token_type = token.getType();
|
QPDFTokenizer::token_type_e token_type = token.getType();
|
||||||
|
|
||||||
|
if (token_type == QPDFTokenizer::tt_bad)
|
||||||
|
{
|
||||||
|
this->any_bad_tokens = true;
|
||||||
|
this->last_token_was_bad = true;
|
||||||
|
}
|
||||||
|
else if (token_type != QPDFTokenizer::tt_eof)
|
||||||
|
{
|
||||||
|
this->last_token_was_bad = false;
|
||||||
|
}
|
||||||
|
|
||||||
switch (token_type)
|
switch (token_type)
|
||||||
{
|
{
|
||||||
case QPDFTokenizer::tt_space:
|
case QPDFTokenizer::tt_space:
|
||||||
@ -75,3 +87,15 @@ ContentNormalizer::handleEOF()
|
|||||||
{
|
{
|
||||||
finish();
|
finish();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
ContentNormalizer::anyBadTokens() const
|
||||||
|
{
|
||||||
|
return this->any_bad_tokens;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
ContentNormalizer::lastTokenWasBad()const
|
||||||
|
{
|
||||||
|
return this->last_token_was_bad;
|
||||||
|
}
|
||||||
|
@ -609,6 +609,33 @@ QPDF_Stream::pipeStreamData(Pipeline* pipeline,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (filter &&
|
||||||
|
(! suppress_warnings) &&
|
||||||
|
normalizer.getPointer() &&
|
||||||
|
normalizer->anyBadTokens())
|
||||||
|
{
|
||||||
|
warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(),
|
||||||
|
"", this->offset,
|
||||||
|
"content normalization encountered bad tokens"));
|
||||||
|
if (normalizer->lastTokenWasBad())
|
||||||
|
{
|
||||||
|
QTC::TC("qpdf", "QPDF_Stream bad token at end during normalize");
|
||||||
|
warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(),
|
||||||
|
"", this->offset,
|
||||||
|
"normalized content ended with a bad token;"
|
||||||
|
" you may be able to resolve this by"
|
||||||
|
" coalescing content streams in combination"
|
||||||
|
" with normalizing content. From the command"
|
||||||
|
" line, specify --coalesce-contents"));
|
||||||
|
}
|
||||||
|
warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(),
|
||||||
|
"", this->offset,
|
||||||
|
"Resulting stream data may be corrupted but is"
|
||||||
|
" may still useful for manual inspection."
|
||||||
|
" For more information on this warning, search"
|
||||||
|
" for content normalization in the manual."));
|
||||||
|
}
|
||||||
|
|
||||||
return filter;
|
return filter;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -10,6 +10,13 @@ class ContentNormalizer: public QPDFObjectHandle::TokenFilter
|
|||||||
virtual ~ContentNormalizer();
|
virtual ~ContentNormalizer();
|
||||||
virtual void handleToken(QPDFTokenizer::Token const&);
|
virtual void handleToken(QPDFTokenizer::Token const&);
|
||||||
virtual void handleEOF();
|
virtual void handleEOF();
|
||||||
|
|
||||||
|
bool anyBadTokens() const;
|
||||||
|
bool lastTokenWasBad() const;
|
||||||
|
|
||||||
|
private:
|
||||||
|
bool any_bad_tokens;
|
||||||
|
bool last_token_was_bad;
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif // __CONTENTNORMALIZER_HH__
|
#endif // __CONTENTNORMALIZER_HH__
|
||||||
|
@ -306,3 +306,4 @@ Pl_QPDFTokenizer found ID 0
|
|||||||
QPDFObjectHandle non-stream in stream array 0
|
QPDFObjectHandle non-stream in stream array 0
|
||||||
QPDFObjectHandle coalesce called on stream 0
|
QPDFObjectHandle coalesce called on stream 0
|
||||||
QPDFObjectHandle coalesce provide stream data 0
|
QPDFObjectHandle coalesce provide stream data 0
|
||||||
|
QPDF_Stream bad token at end during normalize 0
|
||||||
|
@ -737,8 +737,16 @@ $td->runtest("stream with tiff predictor",
|
|||||||
show_ntests();
|
show_ntests();
|
||||||
# ----------
|
# ----------
|
||||||
$td->notify("--- Coalesce contents ---");
|
$td->notify("--- Coalesce contents ---");
|
||||||
$n_tests += 4;
|
$n_tests += 6;
|
||||||
|
|
||||||
|
$td->runtest("qdf with normalize warnings",
|
||||||
|
{$td->COMMAND =>
|
||||||
|
"qpdf --qdf --static-id coalesce.pdf a.pdf"},
|
||||||
|
{$td->FILE => "normalize-warnings.out", $td->EXIT_STATUS => 3},
|
||||||
|
$td->NORMALIZE_NEWLINES);
|
||||||
|
$td->runtest("check output",
|
||||||
|
{$td->FILE => "a.pdf"},
|
||||||
|
{$td->FILE => "coalesce.qdf"});
|
||||||
$td->runtest("coalesce contents with qdf",
|
$td->runtest("coalesce contents with qdf",
|
||||||
{$td->COMMAND =>
|
{$td->COMMAND =>
|
||||||
"qpdf --qdf --static-id" .
|
"qpdf --qdf --static-id" .
|
||||||
|
231
qpdf/qtest/qpdf/coalesce.qdf
Normal file
231
qpdf/qtest/qpdf/coalesce.qdf
Normal file
@ -0,0 +1,231 @@
|
|||||||
|
%PDF-1.3
|
||||||
|
%¿÷¢þ
|
||||||
|
%QDF-1.0
|
||||||
|
|
||||||
|
%% Original object ID: 1 0
|
||||||
|
1 0 obj
|
||||||
|
<<
|
||||||
|
/Pages 2 0 R
|
||||||
|
/Type /Catalog
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
|
||||||
|
%% Original object ID: 2 0
|
||||||
|
2 0 obj
|
||||||
|
<<
|
||||||
|
/Count 2
|
||||||
|
/Kids [
|
||||||
|
3 0 R
|
||||||
|
4 0 R
|
||||||
|
]
|
||||||
|
/Type /Pages
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
|
||||||
|
%% Page 1
|
||||||
|
%% Original object ID: 3 0
|
||||||
|
3 0 obj
|
||||||
|
<<
|
||||||
|
/Contents [
|
||||||
|
5 0 R
|
||||||
|
7 0 R
|
||||||
|
9 0 R
|
||||||
|
11 0 R
|
||||||
|
]
|
||||||
|
/MediaBox [
|
||||||
|
0
|
||||||
|
0
|
||||||
|
612
|
||||||
|
792
|
||||||
|
]
|
||||||
|
/Parent 2 0 R
|
||||||
|
/Resources <<
|
||||||
|
/Font <<
|
||||||
|
/F1 13 0 R
|
||||||
|
>>
|
||||||
|
/ProcSet 14 0 R
|
||||||
|
>>
|
||||||
|
/Type /Page
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
|
||||||
|
%% Page 2
|
||||||
|
%% Original object ID: 4 0
|
||||||
|
4 0 obj
|
||||||
|
<<
|
||||||
|
/Contents 15 0 R
|
||||||
|
/MediaBox [
|
||||||
|
0
|
||||||
|
0
|
||||||
|
612
|
||||||
|
792
|
||||||
|
]
|
||||||
|
/Parent 2 0 R
|
||||||
|
/Resources <<
|
||||||
|
/Font <<
|
||||||
|
/F1 17 0 R
|
||||||
|
>>
|
||||||
|
/ProcSet 18 0 R
|
||||||
|
>>
|
||||||
|
/Type /Page
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
|
||||||
|
%% Contents for page 1
|
||||||
|
%% Original object ID: 5 0
|
||||||
|
5 0 obj
|
||||||
|
<<
|
||||||
|
/Length 6 0 R
|
||||||
|
>>
|
||||||
|
stream
|
||||||
|
BT
|
||||||
|
/F1 24 Tf
|
||||||
|
72 720 Td
|
||||||
|
(Pot
|
||||||
|
endstream
|
||||||
|
endobj
|
||||||
|
|
||||||
|
%QDF: ignore_newline
|
||||||
|
6 0 obj
|
||||||
|
33
|
||||||
|
endobj
|
||||||
|
|
||||||
|
%% Contents for page 1
|
||||||
|
%% Original object ID: 7 0
|
||||||
|
7 0 obj
|
||||||
|
<<
|
||||||
|
/Length 8 0 R
|
||||||
|
>>
|
||||||
|
stream
|
||||||
|
ato) Tj
|
||||||
|
ET [ /array
|
||||||
|
endstream
|
||||||
|
endobj
|
||||||
|
|
||||||
|
%QDF: ignore_newline
|
||||||
|
8 0 obj
|
||||||
|
19
|
||||||
|
endobj
|
||||||
|
|
||||||
|
%% Contents for page 1
|
||||||
|
%% Original object ID: 9 0
|
||||||
|
9 0 obj
|
||||||
|
<<
|
||||||
|
/Length 10 0 R
|
||||||
|
>>
|
||||||
|
stream
|
||||||
|
/split ] BI
|
||||||
|
/CS /G/W 66/H 47/BPC 8/F/Fl/DP<</Predictor 15/Columns 66>>
|
||||||
|
ID xœÅÖIà P|ÿC;UÈ`ÀÓ7‘Z©¦Ä˜Úæ<C39A>}Dðï_´øÉW©„œÄ-”ˆ>ÿ‡À<E280A1>>”^&®¡uâ]€"!‡•–*¬&<26>E|Sy® ðd-€<<3C>B0Bú@Nê+<hlèKÐî/56L ‰<C2A0>ã £–¹¦>0>Y<>ù!cì\YØ%Yð¥Ö8?& Öëˆ}j’ûè;«<>3<EFBFBD>ÂÖlpÛsHöûtú
|
||||||
|
endstream
|
||||||
|
endobj
|
||||||
|
|
||||||
|
%QDF: ignore_newline
|
||||||
|
10 0 obj
|
||||||
|
253
|
||||||
|
endobj
|
||||||
|
|
||||||
|
%% Contents for page 1
|
||||||
|
%% Original object ID: 11 0
|
||||||
|
11 0 obj
|
||||||
|
<<
|
||||||
|
/Length 12 0 R
|
||||||
|
>>
|
||||||
|
stream
|
||||||
|
QØTt*hÌUúãwÍÕÐ%¨)p–³"•DiRj¹–DYNUÓÙAv’Fà&
|
||||||
|
<EFBFBD>ÍÔu#c•ÆW ô߉W“O
|
||||||
|
EI
|
||||||
|
endstream
|
||||||
|
endobj
|
||||||
|
|
||||||
|
%QDF: ignore_newline
|
||||||
|
12 0 obj
|
||||||
|
65
|
||||||
|
endobj
|
||||||
|
|
||||||
|
%% Original object ID: 13 0
|
||||||
|
13 0 obj
|
||||||
|
<<
|
||||||
|
/BaseFont /Helvetica
|
||||||
|
/Encoding /WinAnsiEncoding
|
||||||
|
/Name /F1
|
||||||
|
/Subtype /Type1
|
||||||
|
/Type /Font
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
|
||||||
|
%% Original object ID: 14 0
|
||||||
|
14 0 obj
|
||||||
|
[
|
||||||
|
/PDF
|
||||||
|
/Text
|
||||||
|
]
|
||||||
|
endobj
|
||||||
|
|
||||||
|
%% Contents for page 2
|
||||||
|
%% Original object ID: 15 0
|
||||||
|
15 0 obj
|
||||||
|
<<
|
||||||
|
/Length 16 0 R
|
||||||
|
>>
|
||||||
|
stream
|
||||||
|
BT
|
||||||
|
/F1 24 Tf
|
||||||
|
72 720 Td
|
||||||
|
(Potato) Tj
|
||||||
|
ET
|
||||||
|
endstream
|
||||||
|
endobj
|
||||||
|
|
||||||
|
16 0 obj
|
||||||
|
44
|
||||||
|
endobj
|
||||||
|
|
||||||
|
%% Original object ID: 17 0
|
||||||
|
17 0 obj
|
||||||
|
<<
|
||||||
|
/BaseFont /Helvetica
|
||||||
|
/Encoding /WinAnsiEncoding
|
||||||
|
/Name /F1
|
||||||
|
/Subtype /Type1
|
||||||
|
/Type /Font
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
|
||||||
|
%% Original object ID: 18 0
|
||||||
|
18 0 obj
|
||||||
|
[
|
||||||
|
/PDF
|
||||||
|
/Text
|
||||||
|
]
|
||||||
|
endobj
|
||||||
|
|
||||||
|
xref
|
||||||
|
0 19
|
||||||
|
0000000000 65535 f
|
||||||
|
0000000052 00000 n
|
||||||
|
0000000133 00000 n
|
||||||
|
0000000252 00000 n
|
||||||
|
0000000524 00000 n
|
||||||
|
0000000769 00000 n
|
||||||
|
0000000879 00000 n
|
||||||
|
0000000948 00000 n
|
||||||
|
0000001044 00000 n
|
||||||
|
0000001113 00000 n
|
||||||
|
0000001444 00000 n
|
||||||
|
0000001516 00000 n
|
||||||
|
0000001660 00000 n
|
||||||
|
0000001708 00000 n
|
||||||
|
0000001855 00000 n
|
||||||
|
0000001942 00000 n
|
||||||
|
0000002043 00000 n
|
||||||
|
0000002091 00000 n
|
||||||
|
0000002238 00000 n
|
||||||
|
trailer <<
|
||||||
|
/Root 1 0 R
|
||||||
|
/Size 19
|
||||||
|
/ID [<fa46a90bcf56476b9904a2e7adb75024><31415926535897932384626433832795>]
|
||||||
|
>>
|
||||||
|
startxref
|
||||||
|
2274
|
||||||
|
%%EOF
|
@ -13,7 +13,9 @@ three lines
|
|||||||
<8a8b>
|
<8a8b>
|
||||||
(ab)
|
(ab)
|
||||||
<8c><dd> ) >
|
<8c><dd> ) >
|
||||||
<610062> (MOO)-- stream 1 --
|
<610062> (MOO)WARNING: good14.pdf (file position 628): content normalization encountered bad tokens
|
||||||
|
WARNING: good14.pdf (file position 628): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
|
||||||
|
-- stream 1 --
|
||||||
This stream does end with a newline.
|
This stream does end with a newline.
|
||||||
// tests:
|
// tests:
|
||||||
// bad tokens preserved
|
// bad tokens preserved
|
||||||
@ -31,10 +33,18 @@ This stream does end with a newline.
|
|||||||
|
|
||||||
/good name
|
/good name
|
||||||
/bad#00name
|
/bad#00name
|
||||||
|
WARNING: good14.pdf (file position 860): content normalization encountered bad tokens
|
||||||
|
WARNING: good14.pdf (file position 860): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
|
||||||
-- stream 2 --
|
-- stream 2 --
|
||||||
(This stream ends with a \001 bad token
|
(This stream ends with a \001 bad token
|
||||||
|
WARNING: good14.pdf (file position 1316): content normalization encountered bad tokens
|
||||||
|
WARNING: good14.pdf (file position 1316): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
|
||||||
|
WARNING: good14.pdf (file position 1316): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
|
||||||
-- stream 3 --
|
-- stream 3 --
|
||||||
<AB X-- stream 4 --
|
<AB XWARNING: good14.pdf (file position 1406): content normalization encountered bad tokens
|
||||||
|
WARNING: good14.pdf (file position 1406): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
|
||||||
|
WARNING: good14.pdf (file position 1406): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
|
||||||
|
-- stream 4 --
|
||||||
(ends with a name)
|
(ends with a name)
|
||||||
/ThisMustBeLast-- stream 5 --
|
/ThisMustBeLast-- stream 5 --
|
||||||
% This stream has an inline image marker that is not terminated
|
% This stream has an inline image marker that is not terminated
|
||||||
@ -44,4 +54,7 @@ BI
|
|||||||
ID
|
ID
|
||||||
<506f7
|
<506f7
|
||||||
461746f>
|
461746f>
|
||||||
|
WARNING: good14.pdf (file position 1549): content normalization encountered bad tokens
|
||||||
|
WARNING: good14.pdf (file position 1549): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
|
||||||
|
WARNING: good14.pdf (file position 1549): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
|
||||||
test 3 done
|
test 3 done
|
||||||
|
9
qpdf/qtest/qpdf/normalize-warnings.out
Normal file
9
qpdf/qtest/qpdf/normalize-warnings.out
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
WARNING: coalesce.pdf (file position 671): content normalization encountered bad tokens
|
||||||
|
WARNING: coalesce.pdf (file position 671): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
|
||||||
|
WARNING: coalesce.pdf (file position 671): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
|
||||||
|
WARNING: coalesce.pdf (file position 823): content normalization encountered bad tokens
|
||||||
|
WARNING: coalesce.pdf (file position 823): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
|
||||||
|
WARNING: coalesce.pdf (file position 962): content normalization encountered bad tokens
|
||||||
|
WARNING: coalesce.pdf (file position 962): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
|
||||||
|
WARNING: coalesce.pdf (file position 962): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
|
||||||
|
qpdf: operation succeeded with warnings; resulting file may have some problems
|
Loading…
Reference in New Issue
Block a user