2
1
mirror of https://github.com/qpdf/qpdf.git synced 2024-05-29 00:10:54 +00:00

Avoid merging adjacent tokens when concatenating contents (fixes #444)

This commit is contained in:
Jay Berkenbilt 2020-10-23 06:40:27 -04:00
parent 0dea276997
commit b30deaeeab
16 changed files with 541 additions and 43 deletions

View File

@ -1,5 +1,11 @@
2020-10-23 Jay Berkenbilt <ejb@ql.org> 2020-10-23 Jay Berkenbilt <ejb@ql.org>
* Bug fix: when concatenating content streams, insert a newline if
needed to prevent the last token from the old stream from being
merged with the first token of the new stream. Qpdf was mistakenly
concatenating the streams without regard to the specification that
content streams are to be broken on token boundaries. Fixes #444.
* Bug fix: fix-qdf: properly handle empty streams with ignore * Bug fix: fix-qdf: properly handle empty streams with ignore
newline. newline.

1
TODO
View File

@ -4,7 +4,6 @@ Candidates for upcoming release
* Open "next" issues * Open "next" issues
* bugs * bugs
* #473: zsh completion with directories * #473: zsh completion with directories
* #444: concatenated stream/whitespace bug
* Non-bugs * Non-bugs
* #446: recognize edited QDF files * #446: recognize edited QDF files
* #436: parsing of document with form xobject * #436: parsing of document with form xobject

View File

@ -165,6 +165,47 @@ QPDFObjectHandle::ParserCallbacks::terminateParsing()
throw TerminateParsing(); throw TerminateParsing();
} }
class LastChar: public Pipeline
{
public:
LastChar(Pipeline* next);
virtual ~LastChar() = default;
virtual void write(unsigned char* data, size_t len);
virtual void finish();
unsigned char getLastChar();
private:
unsigned char last_char;
};
// Register this pipeline under the identifier "lastchar" in front of
// "next". No byte has been observed yet, so the remembered character
// starts out as 0.
LastChar::LastChar(Pipeline* next)
    : Pipeline("lastchar", next)
    , last_char(0)
{
}
void
LastChar::write(unsigned char* data, size_t len)
{
if (len > 0)
{
this->last_char = data[len - 1];
}
getNext()->write(data, len);
}
void
LastChar::finish()
{
getNext()->finish();
}
unsigned char
LastChar::getLastChar()
{
return this->last_char;
}
QPDFObjectHandle::QPDFObjectHandle() : QPDFObjectHandle::QPDFObjectHandle() :
initialized(false), initialized(false),
qpdf(0), qpdf(0),
@ -1600,21 +1641,31 @@ QPDFObjectHandle::pipeContentStreams(
std::vector<QPDFObjectHandle> streams = std::vector<QPDFObjectHandle> streams =
arrayOrStreamToStreamArray( arrayOrStreamToStreamArray(
description, all_description); description, all_description);
bool need_newline = false;
for (std::vector<QPDFObjectHandle>::iterator iter = streams.begin(); for (std::vector<QPDFObjectHandle>::iterator iter = streams.begin();
iter != streams.end(); ++iter) iter != streams.end(); ++iter)
{ {
if (need_newline)
{
p->write(QUtil::unsigned_char_pointer("\n"), 1);
}
LastChar lc(p);
QPDFObjectHandle stream = *iter; QPDFObjectHandle stream = *iter;
std::string og = std::string og =
QUtil::int_to_string(stream.getObjectID()) + " " + QUtil::int_to_string(stream.getObjectID()) + " " +
QUtil::int_to_string(stream.getGeneration()); QUtil::int_to_string(stream.getGeneration());
std::string w_description = "content stream object " + og; std::string w_description = "content stream object " + og;
if (! stream.pipeStreamData(p, 0, qpdf_dl_specialized)) if (! stream.pipeStreamData(&lc, 0, qpdf_dl_specialized))
{ {
QTC::TC("qpdf", "QPDFObjectHandle errors in parsecontent"); QTC::TC("qpdf", "QPDFObjectHandle errors in parsecontent");
throw QPDFExc(qpdf_e_damaged_pdf, "content stream", throw QPDFExc(qpdf_e_damaged_pdf, "content stream",
w_description, 0, w_description, 0,
"errors while decoding content stream"); "errors while decoding content stream");
} }
lc.finish();
need_newline = (lc.getLastChar() != static_cast<unsigned char>('\n'));
QTC::TC("qpdf", "QPDFObjectHandle need_newline",
need_newline ? 0 : 1);
} }
} }

View File

@ -2090,14 +2090,9 @@ outfile.pdf</option>
option causes qpdf to combine them into a single stream. Use option causes qpdf to combine them into a single stream. Use
of this option is never necessary for ordinary usage, but it of this option is never necessary for ordinary usage, but it
can help when working with some files in some cases. For can help when working with some files in some cases. For
example, some PDF writers split page contents into small example, this can also be combined with QDF mode or content
streams at arbitrary points that may fall in the middle of normalization to make it easier to look at all of a page's
lexical tokens within the content, and some PDF readers may contents at once.
get confused on such files. If you use qpdf to coalesce the
content streams, such readers may be able to work with the
file more easily. This can also be combined with QDF mode or
content normalization to make it easier to look at all of a
page's contents at once.
</para> </para>
</listitem> </listitem>
</varlistentry> </varlistentry>
@ -2398,25 +2393,15 @@ outfile.pdf</option>
You should not use this for &ldquo;production&rdquo; PDF files. You should not use this for &ldquo;production&rdquo; PDF files.
</para> </para>
<para> <para>
This paragraph discusses edge cases of content normalization that When normalizing content, if qpdf runs into any lexical errors, it
are not of concern to most users and are not relevant when content will print a warning indicating that content may be damaged. The
normalization is not enabled. When normalizing content, if qpdf only situation in which qpdf is known to cause damage during
runs into any lexical errors, it will print a warning indicating content normalization is when a page's contents are split across
that content may be damaged. The only situation in which qpdf is multiple streams and streams are split in the middle of a lexical
known to cause damage during content normalization is when a token such as a string, name, or inline image. Note that files
page's contents are split across multiple streams and streams are that do this are invalid since the PDF specification states that
split in the middle of a lexical token such as a string, name, or content streams are not to be split in the middle of a token. If
inline image. There may be some pathological cases in which qpdf you want to inspect the original content streams in an
could damage content without noticing this, such as if the partial
tokens at the end of one stream and the beginning of the next
stream are both valid, but usually qpdf will be able to detect
this case. For slightly increased safety, you can specify
<option>--coalesce-contents</option> in addition to
<option>--normalize-content</option> or <option>--qdf</option>.
This will cause qpdf to combine all the content streams into one,
thus recombining any split tokens. However doing this will prevent
you from being able to see the original layout of the content
streams. If you must inspect the original content streams in an
uncompressed format, you can always run with <option>--qdf uncompressed format, you can always run with <option>--qdf
--normalize-content=n</option> for a QDF file without content --normalize-content=n</option> for a QDF file without content
normalization, or alternatively normalization, or alternatively

View File

@ -455,3 +455,4 @@ qpdf found shared resources in leaf 0
qpdf found shared xobject in leaf 0 qpdf found shared xobject in leaf 0
QPDF copy foreign with data 1 QPDF copy foreign with data 1
QPDF copy foreign with foreign_stream 1 QPDF copy foreign with foreign_stream 1
QPDFObjectHandle need_newline 1

View File

@ -1591,13 +1591,21 @@ $td->runtest("type checks with object streams",
# ---------- # ----------
$td->notify("--- Coalesce contents ---"); $td->notify("--- Coalesce contents ---");
$n_tests += 6; $n_tests += 8;
$td->runtest("qdf with normalize warnings", $td->runtest("qdf with normalize warnings",
{$td->COMMAND => {$td->COMMAND =>
"qpdf --qdf --static-id coalesce.pdf a.pdf"}, "qpdf --qdf --static-id split-tokens.pdf a.pdf"},
{$td->FILE => "normalize-warnings.out", $td->EXIT_STATUS => 3}, {$td->FILE => "normalize-warnings.out", $td->EXIT_STATUS => 3},
$td->NORMALIZE_NEWLINES); $td->NORMALIZE_NEWLINES);
$td->runtest("check output",
{$td->FILE => "a.pdf"},
{$td->FILE => "split-tokens.qdf"});
$td->runtest("coalesce to qdf",
{$td->COMMAND =>
"qpdf --qdf --static-id coalesce.pdf a.pdf"},
{$td->STRING => "", $td->EXIT_STATUS => 0},
$td->NORMALIZE_NEWLINES);
$td->runtest("check output", $td->runtest("check output",
{$td->FILE => "a.pdf"}, {$td->FILE => "a.pdf"},
{$td->FILE => "coalesce.qdf"}); {$td->FILE => "coalesce.qdf"});
@ -1831,12 +1839,12 @@ $td->runtest("unreferenced resources with bad token",
{$td->COMMAND => {$td->COMMAND =>
"qpdf --qdf --static-id --split-pages=2" . "qpdf --qdf --static-id --split-pages=2" .
" --remove-unreferenced-resources=yes" . " --remove-unreferenced-resources=yes" .
" coalesce.pdf split-out-bad-token.pdf"}, " split-tokens.pdf split-out-bad-token.pdf"},
{$td->FILE => "coalesce-split.out", $td->EXIT_STATUS => 3}, {$td->FILE => "split-tokens-split.out", $td->EXIT_STATUS => 3},
$td->NORMALIZE_NEWLINES); $td->NORMALIZE_NEWLINES);
$td->runtest("check output", $td->runtest("check output",
{$td->FILE => "split-out-bad-token-1-2.pdf"}, {$td->FILE => "split-out-bad-token-1-2.pdf"},
{$td->FILE => "coalesce-split-1-2.pdf"}); {$td->FILE => "split-tokens-split-1-2.pdf"});
$td->runtest("shared images in form xobject", $td->runtest("shared images in form xobject",
{$td->COMMAND => "qpdf --qdf --static-id --split-pages". {$td->COMMAND => "qpdf --qdf --static-id --split-pages".

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -1,9 +1,9 @@
WARNING: coalesce.pdf (offset 671): content normalization encountered bad tokens WARNING: split-tokens.pdf (offset 671): content normalization encountered bad tokens
WARNING: coalesce.pdf (offset 671): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents WARNING: split-tokens.pdf (offset 671): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
WARNING: coalesce.pdf (offset 671): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. WARNING: split-tokens.pdf (offset 671): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
WARNING: coalesce.pdf (offset 823): content normalization encountered bad tokens WARNING: split-tokens.pdf (offset 823): content normalization encountered bad tokens
WARNING: coalesce.pdf (offset 823): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. WARNING: split-tokens.pdf (offset 823): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
WARNING: coalesce.pdf (offset 962): content normalization encountered bad tokens WARNING: split-tokens.pdf (offset 962): content normalization encountered bad tokens
WARNING: coalesce.pdf (offset 962): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents WARNING: split-tokens.pdf (offset 962): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
WARNING: coalesce.pdf (offset 962): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. WARNING: split-tokens.pdf (offset 962): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
qpdf: operation succeeded with warnings; resulting file may have some problems qpdf: operation succeeded with warnings; resulting file may have some problems

View File

@ -1,4 +1,4 @@
WARNING: coalesce.pdf, object 3 0 at offset 181: Bad token found while scanning content stream; not attempting to remove unreferenced objects from this page WARNING: split-tokens.pdf, object 3 0 at offset 181: Bad token found while scanning content stream; not attempting to remove unreferenced objects from this page
WARNING: empty PDF: content normalization encountered bad tokens WARNING: empty PDF: content normalization encountered bad tokens
WARNING: empty PDF: normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents WARNING: empty PDF: normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
WARNING: empty PDF: Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. WARNING: empty PDF: Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.

View File

@ -0,0 +1,217 @@
%PDF-1.3
%¿÷¢þ
%QDF-1.0
1 0 obj
<<
/Pages 2 0 R
/Type /Catalog
>>
endobj
2 0 obj
<<
/Count 2
/Kids [
3 0 R
4 0 R
]
/Type /Pages
>>
endobj
%% Page 1
3 0 obj
<<
/Contents [
5 0 R
7 0 R
9 0 R
11 0 R
]
/MediaBox [
0
0
612
792
]
/Parent 2 0 R
/Resources <<
/Font <<
/F1 13 0 R
>>
/ProcSet 14 0 R
>>
/Type /Page
>>
endobj
%% Page 2
4 0 obj
<<
/Contents 15 0 R
/MediaBox [
0
0
612
792
]
/Parent 2 0 R
/Resources <<
/Font <<
/F1 17 0 R
>>
/ProcSet 18 0 R
>>
/Type /Page
>>
endobj
%% Contents for page 1
5 0 obj
<<
/Length 6 0 R
>>
stream
BT
/F1 24 Tf
72 720 Td
(Pot
endstream
endobj
%QDF: ignore_newline
6 0 obj
33
endobj
%% Contents for page 1
7 0 obj
<<
/Length 8 0 R
>>
stream
ato) Tj
ET [ /array
endstream
endobj
%QDF: ignore_newline
8 0 obj
19
endobj
%% Contents for page 1
9 0 obj
<<
/Length 10 0 R
>>
stream
/split ] BI
/CS /G/W 66/H 47/BPC 8/F/Fl/DP<</Predictor 15/Columns 66>>
ID xœÅÖIà P|ÿC;UÈ`ÀÓ7 ¦ĘÚæ<C39A>}Dðï_´øÉW©„œÄ-”ˆ>ÿ‡À<E280A1>>”^&®¡uâ]€"!‡•*¬&<26>E|Sy® ðd-€<<3C>B0Bú@Nê+<hlèKÐî/56L <C2A0>ã £–¹¦>0>Y<>ù!cì\Y Ø%Yð¥Ö8?& Öëˆ}jûè<>3<EFBFBD>ÂÖlpÛsHöûtú
endstream
endobj
%QDF: ignore_newline
10 0 obj
253
endobj
%% Contents for page 1
11 0 obj
<<
/Length 12 0 R
>>
stream
QØTt*hÌUúãwÍÕÐ%¨)p³"•DiRj¹DYNUÓÙAvFà& <0A>ÍÔu#c•ÆW ô߉W“O
EI
endstream
endobj
%QDF: ignore_newline
12 0 obj
66
endobj
13 0 obj
<<
/BaseFont /Helvetica
/Encoding /WinAnsiEncoding
/Name /F1
/Subtype /Type1
/Type /Font
>>
endobj
14 0 obj
[
/PDF
/Text
]
endobj
%% Contents for page 2
15 0 obj
<<
/Length 16 0 R
>>
stream
BT
/F1 24 Tf
72 720 Td
(Potato) Tj
ET
endstream
endobj
16 0 obj
44
endobj
17 0 obj
<<
/BaseFont /Helvetica
/Encoding /WinAnsiEncoding
/Name /F1
/Subtype /Type1
/Type /Font
>>
endobj
18 0 obj
[
/PDF
/Text
]
endobj
xref
0 19
0000000000 65535 f
0000000025 00000 n
0000000079 00000 n
0000000171 00000 n
0000000416 00000 n
0000000634 00000 n
0000000744 00000 n
0000000786 00000 n
0000000882 00000 n
0000000924 00000 n
0000001255 00000 n
0000001299 00000 n
0000001444 00000 n
0000001464 00000 n
0000001583 00000 n
0000001642 00000 n
0000001743 00000 n
0000001763 00000 n
0000001882 00000 n
trailer <<
/Root 1 0 R
/Size 19
/ID [<fa46a90bcf56476b9904a2e7adb75024><6af379f20e8dcd4e724869daec3ba023>]
>>
startxref
1918
%%EOF

View File

@ -0,0 +1,231 @@
%PDF-1.3
%¿÷¢þ
%QDF-1.0
%% Original object ID: 1 0
1 0 obj
<<
/Pages 2 0 R
/Type /Catalog
>>
endobj
%% Original object ID: 2 0
2 0 obj
<<
/Count 2
/Kids [
3 0 R
4 0 R
]
/Type /Pages
>>
endobj
%% Page 1
%% Original object ID: 3 0
3 0 obj
<<
/Contents [
5 0 R
7 0 R
9 0 R
11 0 R
]
/MediaBox [
0
0
612
792
]
/Parent 2 0 R
/Resources <<
/Font <<
/F1 13 0 R
>>
/ProcSet 14 0 R
>>
/Type /Page
>>
endobj
%% Page 2
%% Original object ID: 4 0
4 0 obj
<<
/Contents 15 0 R
/MediaBox [
0
0
612
792
]
/Parent 2 0 R
/Resources <<
/Font <<
/F1 17 0 R
>>
/ProcSet 18 0 R
>>
/Type /Page
>>
endobj
%% Contents for page 1
%% Original object ID: 5 0
5 0 obj
<<
/Length 6 0 R
>>
stream
BT
/F1 24 Tf
72 720 Td
(Pot
endstream
endobj
%QDF: ignore_newline
6 0 obj
33
endobj
%% Contents for page 1
%% Original object ID: 7 0
7 0 obj
<<
/Length 8 0 R
>>
stream
ato) Tj
ET [ /array
endstream
endobj
%QDF: ignore_newline
8 0 obj
19
endobj
%% Contents for page 1
%% Original object ID: 9 0
9 0 obj
<<
/Length 10 0 R
>>
stream
/split ] BI
/CS /G/W 66/H 47/BPC 8/F/Fl/DP<</Predictor 15/Columns 66>>
ID xœÅÖIà P|ÿC;UÈ`ÀÓ7 ¦ĘÚæ<C39A>}Dðï_´øÉW©„œÄ-”ˆ>ÿ‡À<E280A1>>”^&®¡uâ]€"!‡•*¬&<26>E|Sy® ðd-€<<3C>B0Bú@Nê+<hlèKÐî/56L <C2A0>ã £–¹¦>0>Y<>ù!cì\Y Ø%Yð¥Ö8?& Öëˆ}jûè<>3<EFBFBD>ÂÖlpÛsHöûtú
endstream
endobj
%QDF: ignore_newline
10 0 obj
253
endobj
%% Contents for page 1
%% Original object ID: 11 0
11 0 obj
<<
/Length 12 0 R
>>
stream
QØTt*hÌUúãwÍÕÐ%¨)p³"•DiRj¹DYNUÓÙAvFà&
<EFBFBD>ÍÔu#c•ÆW ô߉W“O
EI
endstream
endobj
%QDF: ignore_newline
12 0 obj
65
endobj
%% Original object ID: 13 0
13 0 obj
<<
/BaseFont /Helvetica
/Encoding /WinAnsiEncoding
/Name /F1
/Subtype /Type1
/Type /Font
>>
endobj
%% Original object ID: 14 0
14 0 obj
[
/PDF
/Text
]
endobj
%% Contents for page 2
%% Original object ID: 15 0
15 0 obj
<<
/Length 16 0 R
>>
stream
BT
/F1 24 Tf
72 720 Td
(Potato) Tj
ET
endstream
endobj
16 0 obj
44
endobj
%% Original object ID: 17 0
17 0 obj
<<
/BaseFont /Helvetica
/Encoding /WinAnsiEncoding
/Name /F1
/Subtype /Type1
/Type /Font
>>
endobj
%% Original object ID: 18 0
18 0 obj
[
/PDF
/Text
]
endobj
xref
0 19
0000000000 65535 f
0000000052 00000 n
0000000133 00000 n
0000000252 00000 n
0000000524 00000 n
0000000769 00000 n
0000000879 00000 n
0000000948 00000 n
0000001044 00000 n
0000001113 00000 n
0000001444 00000 n
0000001516 00000 n
0000001660 00000 n
0000001708 00000 n
0000001855 00000 n
0000001942 00000 n
0000002043 00000 n
0000002091 00000 n
0000002238 00000 n
trailer <<
/Root 1 0 R
/Size 19
/ID [<fa46a90bcf56476b9904a2e7adb75024><31415926535897932384626433832795>]
>>
startxref
2274
%%EOF