mirror of
https://github.com/qpdf/qpdf.git
synced 2024-12-22 10:58:58 +00:00
Improve locating inline image's EI
We've actually seen a PDF file in the wild that contained EI surrounded by delimiters inside the image data, which confused qpdf's naive code. This significantly improves EI detection.
This commit is contained in:
parent
ec9e310c9e
commit
2b6c79bcae
12
ChangeLog
12
ChangeLog
@ -1,3 +1,15 @@
|
|||||||
|
2019-01-30 Jay Berkenbilt <ejb@ql.org>
|
||||||
|
|
||||||
|
* Improve locating of an inline image's EI operator to correctly
|
||||||
|
handle the case of EI appearing inside the image data.
|
||||||
|
|
||||||
|
* Very low-level QPDFTokenizer API now includes an
|
||||||
|
expectInlineImage method that takes an input stream, enabling it
|
||||||
|
to locate an inline image's EI operator better. This is called
|
||||||
|
automatically everywhere within the qpdf library. Most user code
|
||||||
|
will never have to use the low-level tokenizer API. If you use
|
||||||
|
Pl_QPDFTokenizer, this will be done automatically for you.
|
||||||
|
|
||||||
2019-01-29 Jay Berkenbilt <ejb@ql.org>
|
2019-01-29 Jay Berkenbilt <ejb@ql.org>
|
||||||
|
|
||||||
* Bug fix: when returning an inline image token, the tokenizer no
|
* Bug fix: when returning an inline image token, the tokenizer no
|
||||||
|
@ -198,6 +198,7 @@ class QPDFTokenizer
|
|||||||
void resolveLiteral();
|
void resolveLiteral();
|
||||||
bool isSpace(char);
|
bool isSpace(char);
|
||||||
bool isDelimiter(char);
|
bool isDelimiter(char);
|
||||||
|
void findEI(PointerHolder<InputSource> input);
|
||||||
|
|
||||||
enum state_e {
|
enum state_e {
|
||||||
st_top, st_in_space, st_in_comment, st_in_string, st_lt, st_gt,
|
st_top, st_in_space, st_in_comment, st_in_string, st_lt, st_gt,
|
||||||
|
@ -47,7 +47,7 @@ QPDFWordTokenFinder::check()
|
|||||||
qpdf_offset_t pos = is->tell();
|
qpdf_offset_t pos = is->tell();
|
||||||
if (! (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str)))
|
if (! (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str)))
|
||||||
{
|
{
|
||||||
/// QTC::TC("qpdf", "QPDFTokenizer finder found wrong word");
|
QTC::TC("qpdf", "QPDFTokenizer finder found wrong word");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
qpdf_offset_t token_start = is->getLastOffset();
|
qpdf_offset_t token_start = is->getLastOffset();
|
||||||
@ -65,7 +65,6 @@ QPDFWordTokenFinder::check()
|
|||||||
is->seek(pos, SEEK_SET);
|
is->seek(pos, SEEK_SET);
|
||||||
if (! next_okay)
|
if (! next_okay)
|
||||||
{
|
{
|
||||||
/// QTC::TC("qpdf", "QPDFTokenizer finder word not followed by delimiter");
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (token_start == 0)
|
if (token_start == 0)
|
||||||
@ -80,7 +79,7 @@ QPDFWordTokenFinder::check()
|
|||||||
is->seek(pos, SEEK_SET);
|
is->seek(pos, SEEK_SET);
|
||||||
if (! prev_okay)
|
if (! prev_okay)
|
||||||
{
|
{
|
||||||
/// QTC::TC("qpdf", "QPDFTokenizer finder word not preceded by delimiter");
|
QTC::TC("qpdf", "QPDFTokenizer finder word not preceded by delimiter");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
@ -687,28 +686,133 @@ QPDFTokenizer::expectInlineImage()
|
|||||||
void
|
void
|
||||||
QPDFTokenizer::expectInlineImage(PointerHolder<InputSource> input)
|
QPDFTokenizer::expectInlineImage(PointerHolder<InputSource> input)
|
||||||
{
|
{
|
||||||
if (input.getPointer())
|
|
||||||
{
|
|
||||||
qpdf_offset_t last_offset = input->getLastOffset();
|
|
||||||
qpdf_offset_t pos = input->tell();
|
|
||||||
|
|
||||||
QPDFWordTokenFinder f(input, "EI");
|
|
||||||
if (input->findFirst("EI", pos, 0, f))
|
|
||||||
{
|
|
||||||
this->m->inline_image_bytes = input->tell() - pos;
|
|
||||||
}
|
|
||||||
|
|
||||||
input->seek(pos, SEEK_SET);
|
|
||||||
input->setLastOffset(last_offset);
|
|
||||||
}
|
|
||||||
if (this->m->state != st_top)
|
if (this->m->state != st_top)
|
||||||
{
|
{
|
||||||
throw std::logic_error("QPDFTokenizer::expectInlineImage called"
|
throw std::logic_error("QPDFTokenizer::expectInlineImage called"
|
||||||
" when tokenizer is in improper state");
|
" when tokenizer is in improper state");
|
||||||
}
|
}
|
||||||
|
findEI(input);
|
||||||
this->m->state = st_inline_image;
|
this->m->state = st_inline_image;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Locate the EI operator that ends an inline image and record, in
// this->m->inline_image_bytes, the distance from the input's position
// on entry to its position just after the chosen EI candidate.
//
// input: holder for the input source to scan. If it holds no pointer,
// the function returns immediately without touching any state.
//
// Each EI candidate is vetted by tokenizing what follows it with a
// separate QPDFTokenizer: up to 10 tokens are examined, and the
// candidate is accepted if EOF is reached or no suspicious token is
// seen. If no candidate is accepted, inline_image_bytes keeps the
// value from the last candidate found; if no candidate is found at
// all, this function does not modify it. The input's position and last
// offset are restored to their entry values before returning.
void
|
||||||
|
QPDFTokenizer::findEI(PointerHolder<InputSource> input)
|
||||||
|
{
|
||||||
|
if (! input.getPointer())
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Save the last offset and the current position so that both can be
// restored once the search is finished.
qpdf_offset_t last_offset = input->getLastOffset();
|
||||||
|
qpdf_offset_t pos = input->tell();
|
||||||
|
|
||||||
|
// Use QPDFWordTokenFinder to find EI surrounded by delimiters.
|
||||||
|
// Then read the next several tokens or up to EOF. If we find any
|
||||||
|
// suspicious-looking tokens, this is probably still part of
|
||||||
|
// the image data, so keep looking for EI. Stop at the first EI
|
||||||
|
// that passes. If we get to the end without finding one, return
|
||||||
|
// the last EI we found. Store the number of bytes expected in the
|
||||||
|
// inline image including the EI and use that to break out of
|
||||||
|
// inline image, falling back to the old method if needed.
|
||||||
|
|
||||||
|
bool okay = false;
|
||||||
|
bool first_try = true;
|
||||||
|
while (! okay)
|
||||||
|
{
|
||||||
|
QPDFWordTokenFinder f(input, "EI");
|
||||||
|
// The search starts at the input's current position, which on
// later iterations is wherever the token check below left it. Give
// up when findFirst reports no match.
if (! input->findFirst("EI", input->tell(), 0, f))
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
// Record the byte count for this candidate; a later candidate
// overwrites it.
this->m->inline_image_bytes = input->tell() - pos;
|
||||||
|
|
||||||
|
// A separate tokenizer instance is used to vet what follows the
// candidate, so this tokenizer's own state is not involved.
QPDFTokenizer check;
|
||||||
|
bool found_bad = false;
|
||||||
|
// Look at the next 10 tokens or up to EOF. The next inline
|
||||||
|
// image's image data would look like bad tokens, but there
|
||||||
|
// will always be at least 10 tokens between one inline
|
||||||
|
// image's EI and the next valid one's ID since width, height,
|
||||||
|
// bits per pixel, and color space are all required as well as
|
||||||
|
// a BI and ID. If we get 10 good tokens in a row or hit EOF,
|
||||||
|
// we can be pretty sure we've found the actual EI.
|
||||||
|
for (int i = 0; i < 10; ++i)
|
||||||
|
{
|
||||||
|
QPDFTokenizer::Token t =
|
||||||
|
check.readToken(input, "checker", true);
|
||||||
|
token_type_e type = t.getType();
|
||||||
|
// Reaching EOF accepts the candidate outright.
if (type == tt_eof)
|
||||||
|
{
|
||||||
|
okay = true;
|
||||||
|
}
|
||||||
|
else if (type == tt_bad)
|
||||||
|
{
|
||||||
|
found_bad = true;
|
||||||
|
}
|
||||||
|
else if (type == tt_word)
|
||||||
|
{
|
||||||
|
// The qpdf tokenizer lumps alphabetic and otherwise
|
||||||
|
// uncategorized characters into "words". We recognize
|
||||||
|
// strings of alphabetic characters as potential valid
|
||||||
|
// operators for purposes of telling whether we're in
|
||||||
|
// valid content or not. It's not perfect, but it
|
||||||
|
// should work more reliably than what we used to do,
|
||||||
|
// which was already good enough for the vast majority
|
||||||
|
// of files.
|
||||||
|
bool found_alpha = false;
|
||||||
|
bool found_non_printable = false;
|
||||||
|
bool found_other = false;
|
||||||
|
std::string value = t.getValue();
|
||||||
|
for (std::string::iterator iter = value.begin();
|
||||||
|
iter != value.end(); ++iter)
|
||||||
|
{
|
||||||
|
char ch = *iter;
|
||||||
|
if (((ch >= 'a') && (ch <= 'z')) ||
|
||||||
|
((ch >= 'A') && (ch <= 'Z')) ||
|
||||||
|
(ch == '*'))
|
||||||
|
{
|
||||||
|
// Treat '*' as alpha since there are valid
|
||||||
|
// PDF operators that contain * along with
|
||||||
|
// alphabetic characters.
|
||||||
|
found_alpha = true;
|
||||||
|
}
|
||||||
|
// NOTE(review): where plain char is signed, bytes 0x80-0xFF are
// negative, so they would reach this branch through the (ch < 32)
// test rather than (ch > 127) -- assuming isSpace rejects them;
// confirm.
else if (((ch < 32) && (! isSpace(ch))) || (ch > 127))
|
||||||
|
{
|
||||||
|
found_non_printable = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
found_other = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// A word is suspicious if it contains a non-printable character
// or mixes alphabetic characters with other characters.
if (found_non_printable || (found_alpha && found_other))
|
||||||
|
{
|
||||||
|
found_bad = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (okay || found_bad)
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// No suspicious token among those examined: accept this candidate.
if (! found_bad)
|
||||||
|
{
|
||||||
|
okay = true;
|
||||||
|
}
|
||||||
|
if (! okay)
|
||||||
|
{
|
||||||
|
first_try = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (okay && (! first_try))
|
||||||
|
{
|
||||||
|
QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Put the input back where it was on entry.
input->seek(pos, SEEK_SET);
|
||||||
|
input->setLastOffset(last_offset);
|
||||||
|
}
|
||||||
|
|
||||||
bool
|
bool
|
||||||
QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch)
|
QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch)
|
||||||
{
|
{
|
||||||
|
@ -430,6 +430,9 @@ QPDFPageObjectHelper copy shared attribute 0
|
|||||||
qpdf from_nr from repeat_nr 0
|
qpdf from_nr from repeat_nr 0
|
||||||
QPDF resolve duplicated page object 0
|
QPDF resolve duplicated page object 0
|
||||||
QPDF handle direct page object 0
|
QPDF handle direct page object 0
|
||||||
|
QPDFTokenizer finder found wrong word 0
|
||||||
|
QPDFTokenizer finder word not preceded by delimiter 0
|
||||||
QPDFTokenizer found EI the old way 0
|
QPDFTokenizer found EI the old way 0
|
||||||
QPDFTokenizer found EI by byte count 0
|
QPDFTokenizer found EI by byte count 0
|
||||||
QPDFTokenizer inline image at EOF the old way 0
|
QPDFTokenizer inline image at EOF the old way 0
|
||||||
|
QPDFTokenizer found EI after more than one try 0
|
||||||
|
@ -691,6 +691,26 @@ $td->runtest("check pass1 file",
|
|||||||
{$td->FILE => "b.pdf"},
|
{$td->FILE => "b.pdf"},
|
||||||
{$td->FILE => "minimal-linearize-pass1.pdf"});
|
{$td->FILE => "minimal-linearize-pass1.pdf"});
|
||||||
|
|
||||||
|
show_ntests();
|
||||||
|
# ----------
|
||||||
|
$td->notify("--- Inline Images ---");
|
||||||
|
$n_tests += 2;
|
||||||
|
|
||||||
|
# The file large-inline-image.pdf is a hand-crafted file with several
|
||||||
|
# inline images of various sizes including one that is two megabytes,
|
||||||
|
# encoded in base85, and has a base85-encoding that contains EI
|
||||||
|
# surrounded by delimiters several times. This exercises the EI
|
||||||
|
# detection code added in qpdf 8.4.
|
||||||
|
|
||||||
|
$td->runtest("complex inline image parsing",
|
||||||
|
{$td->COMMAND =>
|
||||||
|
"qpdf --qdf --static-id large-inline-image.pdf a.pdf"},
|
||||||
|
{$td->STRING => "", $td->EXIT_STATUS => 0},
|
||||||
|
$td->NORMALIZE_NEWLINES);
|
||||||
|
$td->runtest("check output",
|
||||||
|
{$td->FILE => "a.pdf"},
|
||||||
|
{$td->FILE => "large-inline-image.qdf"});
|
||||||
|
|
||||||
show_ntests();
|
show_ntests();
|
||||||
# ----------
|
# ----------
|
||||||
$td->notify("--- Tokenizer ---");
|
$td->notify("--- Tokenizer ---");
|
||||||
|
BIN
qpdf/qtest/qpdf/large-inline-image.pdf
Normal file
BIN
qpdf/qtest/qpdf/large-inline-image.pdf
Normal file
Binary file not shown.
BIN
qpdf/qtest/qpdf/large-inline-image.qdf
Normal file
BIN
qpdf/qtest/qpdf/large-inline-image.qdf
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user