mirror of
https://github.com/qpdf/qpdf.git
synced 2024-12-31 14:01:59 +00:00
commit
0b3debaf86
@ -191,6 +191,8 @@ class QPDFTokenizer
|
|||||||
// returns a tt_inline_image token.
|
// returns a tt_inline_image token.
|
||||||
QPDF_DLL
|
QPDF_DLL
|
||||||
void expectInlineImage(std::shared_ptr<InputSource> input);
|
void expectInlineImage(std::shared_ptr<InputSource> input);
|
||||||
|
QPDF_DLL
|
||||||
|
void expectInlineImage(InputSource& input);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
friend class QPDFParser;
|
friend class QPDFParser;
|
||||||
@ -217,7 +219,7 @@ class QPDFTokenizer
|
|||||||
|
|
||||||
bool isSpace(char);
|
bool isSpace(char);
|
||||||
bool isDelimiter(char);
|
bool isDelimiter(char);
|
||||||
void findEI(std::shared_ptr<InputSource> input);
|
void findEI(InputSource& input);
|
||||||
|
|
||||||
enum state_e {
|
enum state_e {
|
||||||
st_top,
|
st_top,
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
#include <qpdf/ContentNormalizer.hh>
|
#include <qpdf/ContentNormalizer.hh>
|
||||||
|
|
||||||
|
#include <qpdf/QPDF_Name.hh>
|
||||||
#include <qpdf/QUtil.hh>
|
#include <qpdf/QUtil.hh>
|
||||||
|
|
||||||
ContentNormalizer::ContentNormalizer() :
|
ContentNormalizer::ContentNormalizer() :
|
||||||
@ -11,7 +12,6 @@ ContentNormalizer::ContentNormalizer() :
|
|||||||
void
|
void
|
||||||
ContentNormalizer::handleToken(QPDFTokenizer::Token const& token)
|
ContentNormalizer::handleToken(QPDFTokenizer::Token const& token)
|
||||||
{
|
{
|
||||||
std::string value = token.getRawValue();
|
|
||||||
QPDFTokenizer::token_type_e token_type = token.getType();
|
QPDFTokenizer::token_type_e token_type = token.getType();
|
||||||
|
|
||||||
if (token_type == QPDFTokenizer::tt_bad) {
|
if (token_type == QPDFTokenizer::tt_bad) {
|
||||||
@ -24,40 +24,48 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const& token)
|
|||||||
switch (token_type) {
|
switch (token_type) {
|
||||||
case QPDFTokenizer::tt_space:
|
case QPDFTokenizer::tt_space:
|
||||||
{
|
{
|
||||||
size_t len = value.length();
|
std::string const& value = token.getRawValue();
|
||||||
for (size_t i = 0; i < len; ++i) {
|
auto size = value.size();
|
||||||
char ch = value.at(i);
|
size_t pos = 0;
|
||||||
if (ch == '\r') {
|
auto r_pos = value.find('\r');
|
||||||
if ((i + 1 < len) && (value.at(i + 1) == '\n')) {
|
while (r_pos != std::string::npos) {
|
||||||
// ignore
|
if (pos != r_pos) {
|
||||||
} else {
|
write(&value[pos], r_pos - pos);
|
||||||
write("\n");
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
write(&ch, 1);
|
|
||||||
}
|
}
|
||||||
|
if (++r_pos >= size) {
|
||||||
|
write("\n");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (value[r_pos] != '\n') {
|
||||||
|
write("\n");
|
||||||
|
}
|
||||||
|
pos = r_pos;
|
||||||
|
r_pos = value.find('\r', pos);
|
||||||
|
}
|
||||||
|
if (pos < size) {
|
||||||
|
write(&value[pos], size - pos);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
return;
|
||||||
|
|
||||||
case QPDFTokenizer::tt_string:
|
case QPDFTokenizer::tt_string:
|
||||||
// Replacing string and name tokens in this way normalizes their representation as this will
|
// Replacing string and name tokens in this way normalizes their representation as this will
|
||||||
// automatically handle quoting of unprintable characters, etc.
|
// automatically handle quoting of unprintable characters, etc.
|
||||||
writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_string, token.getValue()));
|
write(QPDFObjectHandle::newString(token.getValue()).unparse());
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case QPDFTokenizer::tt_name:
|
case QPDFTokenizer::tt_name:
|
||||||
writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_name, token.getValue()));
|
write(QPDF_Name::normalizeName(token.getValue()));
|
||||||
break;
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
writeToken(token);
|
writeToken(token);
|
||||||
break;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
value = token.getRawValue();
|
// tt_string or tt_name
|
||||||
if (((token_type == QPDFTokenizer::tt_string) || (token_type == QPDFTokenizer::tt_name)) &&
|
std::string const& value = token.getRawValue();
|
||||||
((value.find('\r') != std::string::npos) || (value.find('\n') != std::string::npos))) {
|
if (value.find('\r') != std::string::npos || value.find('\n') != std::string::npos) {
|
||||||
write("\n");
|
write("\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -36,20 +36,17 @@ void
|
|||||||
Pl_QPDFTokenizer::finish()
|
Pl_QPDFTokenizer::finish()
|
||||||
{
|
{
|
||||||
m->buf.finish();
|
m->buf.finish();
|
||||||
auto input = std::shared_ptr<InputSource>(
|
auto input = BufferInputSource("tokenizer data", m->buf.getBuffer(), true);
|
||||||
// line-break
|
std::string empty;
|
||||||
new BufferInputSource("tokenizer data", m->buf.getBuffer(), true));
|
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
QPDFTokenizer::Token token =
|
auto token = m->tokenizer.readToken(input, empty, true);
|
||||||
m->tokenizer.readToken(input, "offset " + std::to_string(input->tell()), true);
|
|
||||||
m->filter->handleToken(token);
|
m->filter->handleToken(token);
|
||||||
if (token.getType() == QPDFTokenizer::tt_eof) {
|
if (token.getType() == QPDFTokenizer::tt_eof) {
|
||||||
break;
|
break;
|
||||||
} else if (token.isWord("ID")) {
|
} else if (token.isWord("ID")) {
|
||||||
// Read the space after the ID.
|
// Read the space after the ID.
|
||||||
char ch = ' ';
|
char ch = ' ';
|
||||||
input->read(&ch, 1);
|
input.read(&ch, 1);
|
||||||
m->filter->handleToken(
|
m->filter->handleToken(
|
||||||
// line-break
|
// line-break
|
||||||
QPDFTokenizer::Token(QPDFTokenizer::tt_space, std::string(1, ch)));
|
QPDFTokenizer::Token(QPDFTokenizer::tt_space, std::string(1, ch)));
|
||||||
|
@ -148,7 +148,7 @@ QPDFObjectHandle::TokenFilter::write(std::string const& str)
|
|||||||
void
|
void
|
||||||
QPDFObjectHandle::TokenFilter::writeToken(QPDFTokenizer::Token const& token)
|
QPDFObjectHandle::TokenFilter::writeToken(QPDFTokenizer::Token const& token)
|
||||||
{
|
{
|
||||||
std::string value = token.getRawValue();
|
std::string const& value = token.getRawValue();
|
||||||
write(value.c_str(), value.length());
|
write(value.c_str(), value.length());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -27,7 +27,7 @@ namespace
|
|||||||
class QPDFWordTokenFinder: public InputSource::Finder
|
class QPDFWordTokenFinder: public InputSource::Finder
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
QPDFWordTokenFinder(std::shared_ptr<InputSource> is, std::string const& str) :
|
QPDFWordTokenFinder(InputSource& is, std::string const& str) :
|
||||||
is(is),
|
is(is),
|
||||||
str(str)
|
str(str)
|
||||||
{
|
{
|
||||||
@ -36,7 +36,7 @@ namespace
|
|||||||
bool check() override;
|
bool check() override;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
std::shared_ptr<InputSource> is;
|
InputSource& is;
|
||||||
std::string str;
|
std::string str;
|
||||||
};
|
};
|
||||||
} // namespace
|
} // namespace
|
||||||
@ -48,21 +48,21 @@ QPDFWordTokenFinder::check()
|
|||||||
// delimiter or EOF.
|
// delimiter or EOF.
|
||||||
QPDFTokenizer tokenizer;
|
QPDFTokenizer tokenizer;
|
||||||
QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true);
|
QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true);
|
||||||
qpdf_offset_t pos = is->tell();
|
qpdf_offset_t pos = is.tell();
|
||||||
if (!(t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str))) {
|
if (!(t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str))) {
|
||||||
QTC::TC("qpdf", "QPDFTokenizer finder found wrong word");
|
QTC::TC("qpdf", "QPDFTokenizer finder found wrong word");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
qpdf_offset_t token_start = is->getLastOffset();
|
qpdf_offset_t token_start = is.getLastOffset();
|
||||||
char next;
|
char next;
|
||||||
bool next_okay = false;
|
bool next_okay = false;
|
||||||
if (is->read(&next, 1) == 0) {
|
if (is.read(&next, 1) == 0) {
|
||||||
QTC::TC("qpdf", "QPDFTokenizer inline image at EOF");
|
QTC::TC("qpdf", "QPDFTokenizer inline image at EOF");
|
||||||
next_okay = true;
|
next_okay = true;
|
||||||
} else {
|
} else {
|
||||||
next_okay = is_delimiter(next);
|
next_okay = is_delimiter(next);
|
||||||
}
|
}
|
||||||
is->seek(pos, SEEK_SET);
|
is.seek(pos, SEEK_SET);
|
||||||
if (!next_okay) {
|
if (!next_okay) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -763,12 +763,18 @@ QPDFTokenizer::presentEOF()
|
|||||||
|
|
||||||
void
|
void
|
||||||
QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input)
|
QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input)
|
||||||
|
{
|
||||||
|
expectInlineImage(*input);
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
QPDFTokenizer::expectInlineImage(InputSource& input)
|
||||||
{
|
{
|
||||||
if (this->state == st_token_ready) {
|
if (this->state == st_token_ready) {
|
||||||
reset();
|
reset();
|
||||||
} else if (this->state != st_before_token) {
|
} else if (this->state != st_before_token) {
|
||||||
throw std::logic_error("QPDFTokenizer::expectInlineImage called"
|
throw std::logic_error(
|
||||||
" when tokenizer is in improper state");
|
"QPDFTokenizer::expectInlineImage called when tokenizer is in improper state");
|
||||||
}
|
}
|
||||||
findEI(input);
|
findEI(input);
|
||||||
this->before_token = false;
|
this->before_token = false;
|
||||||
@ -777,14 +783,10 @@ QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input)
|
|||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
QPDFTokenizer::findEI(std::shared_ptr<InputSource> input)
|
QPDFTokenizer::findEI(InputSource& input)
|
||||||
{
|
{
|
||||||
if (!input.get()) {
|
qpdf_offset_t last_offset = input.getLastOffset();
|
||||||
return;
|
qpdf_offset_t pos = input.tell();
|
||||||
}
|
|
||||||
|
|
||||||
qpdf_offset_t last_offset = input->getLastOffset();
|
|
||||||
qpdf_offset_t pos = input->tell();
|
|
||||||
|
|
||||||
// Use QPDFWordTokenFinder to find EI surrounded by delimiters. Then read the next several
|
// Use QPDFWordTokenFinder to find EI surrounded by delimiters. Then read the next several
|
||||||
// tokens or up to EOF. If we find any suspicious-looking or tokens, this is probably still part
|
// tokens or up to EOF. If we find any suspicious-looking or tokens, this is probably still part
|
||||||
@ -797,10 +799,10 @@ QPDFTokenizer::findEI(std::shared_ptr<InputSource> input)
|
|||||||
bool first_try = true;
|
bool first_try = true;
|
||||||
while (!okay) {
|
while (!okay) {
|
||||||
QPDFWordTokenFinder f(input, "EI");
|
QPDFWordTokenFinder f(input, "EI");
|
||||||
if (!input->findFirst("EI", input->tell(), 0, f)) {
|
if (!input.findFirst("EI", input.tell(), 0, f)) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
this->inline_image_bytes = QIntC::to_size(input->tell() - pos - 2);
|
inline_image_bytes = QIntC::to_size(input.tell() - pos - 2);
|
||||||
|
|
||||||
QPDFTokenizer check;
|
QPDFTokenizer check;
|
||||||
bool found_bad = false;
|
bool found_bad = false;
|
||||||
@ -858,8 +860,8 @@ QPDFTokenizer::findEI(std::shared_ptr<InputSource> input)
|
|||||||
QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try");
|
QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try");
|
||||||
}
|
}
|
||||||
|
|
||||||
input->seek(pos, SEEK_SET);
|
input.seek(pos, SEEK_SET);
|
||||||
input->setLastOffset(last_offset);
|
input.setLastOffset(last_offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool
|
bool
|
||||||
@ -902,7 +904,7 @@ QPDFTokenizer::readToken(
|
|||||||
throw QPDFExc(
|
throw QPDFExc(
|
||||||
qpdf_e_damaged_pdf,
|
qpdf_e_damaged_pdf,
|
||||||
input.getName(),
|
input.getName(),
|
||||||
context,
|
context.empty() ? "offset " + std::to_string(input.getLastOffset()) : context,
|
||||||
input.getLastOffset(),
|
input.getLastOffset(),
|
||||||
token.getErrorMessage());
|
token.getErrorMessage());
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user