2
1
mirror of https://github.com/qpdf/qpdf.git synced 2024-11-09 23:00:57 +00:00

Merge pull request #1253 from m-holger/pl_t

Refactor Pl_QPDFTokenizer
This commit is contained in:
m-holger 2024-08-21 18:29:55 +01:00 committed by GitHub
commit 0b3debaf86
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 57 additions and 48 deletions

View File

@ -191,6 +191,8 @@ class QPDFTokenizer
// returns a tt_inline_image token. // returns a tt_inline_image token.
QPDF_DLL QPDF_DLL
void expectInlineImage(std::shared_ptr<InputSource> input); void expectInlineImage(std::shared_ptr<InputSource> input);
QPDF_DLL
void expectInlineImage(InputSource& input);
private: private:
friend class QPDFParser; friend class QPDFParser;
@ -217,7 +219,7 @@ class QPDFTokenizer
bool isSpace(char); bool isSpace(char);
bool isDelimiter(char); bool isDelimiter(char);
void findEI(std::shared_ptr<InputSource> input); void findEI(InputSource& input);
enum state_e { enum state_e {
st_top, st_top,

View File

@ -1,5 +1,6 @@
#include <qpdf/ContentNormalizer.hh> #include <qpdf/ContentNormalizer.hh>
#include <qpdf/QPDF_Name.hh>
#include <qpdf/QUtil.hh> #include <qpdf/QUtil.hh>
ContentNormalizer::ContentNormalizer() : ContentNormalizer::ContentNormalizer() :
@ -11,7 +12,6 @@ ContentNormalizer::ContentNormalizer() :
void void
ContentNormalizer::handleToken(QPDFTokenizer::Token const& token) ContentNormalizer::handleToken(QPDFTokenizer::Token const& token)
{ {
std::string value = token.getRawValue();
QPDFTokenizer::token_type_e token_type = token.getType(); QPDFTokenizer::token_type_e token_type = token.getType();
if (token_type == QPDFTokenizer::tt_bad) { if (token_type == QPDFTokenizer::tt_bad) {
@ -24,40 +24,48 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const& token)
switch (token_type) { switch (token_type) {
case QPDFTokenizer::tt_space: case QPDFTokenizer::tt_space:
{ {
size_t len = value.length(); std::string const& value = token.getRawValue();
for (size_t i = 0; i < len; ++i) { auto size = value.size();
char ch = value.at(i); size_t pos = 0;
if (ch == '\r') { auto r_pos = value.find('\r');
if ((i + 1 < len) && (value.at(i + 1) == '\n')) { while (r_pos != std::string::npos) {
// ignore if (pos != r_pos) {
} else { write(&value[pos], r_pos - pos);
}
if (++r_pos >= size) {
write("\n");
return;
}
if (value[r_pos] != '\n') {
write("\n"); write("\n");
} }
} else { pos = r_pos;
write(&ch, 1); r_pos = value.find('\r', pos);
}
if (pos < size) {
write(&value[pos], size - pos);
} }
} }
} return;
break;
case QPDFTokenizer::tt_string: case QPDFTokenizer::tt_string:
// Replacing string and name tokens in this way normalizes their representation as this will // Replacing string and name tokens in this way normalizes their representation as this will
// automatically handle quoting of unprintable characters, etc. // automatically handle quoting of unprintable characters, etc.
writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_string, token.getValue())); write(QPDFObjectHandle::newString(token.getValue()).unparse());
break; break;
case QPDFTokenizer::tt_name: case QPDFTokenizer::tt_name:
writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_name, token.getValue())); write(QPDF_Name::normalizeName(token.getValue()));
break; break;
default: default:
writeToken(token); writeToken(token);
break; return;
} }
value = token.getRawValue(); // tt_string or tt_name
if (((token_type == QPDFTokenizer::tt_string) || (token_type == QPDFTokenizer::tt_name)) && std::string const& value = token.getRawValue();
((value.find('\r') != std::string::npos) || (value.find('\n') != std::string::npos))) { if (value.find('\r') != std::string::npos || value.find('\n') != std::string::npos) {
write("\n"); write("\n");
} }
} }

View File

@ -36,20 +36,17 @@ void
Pl_QPDFTokenizer::finish() Pl_QPDFTokenizer::finish()
{ {
m->buf.finish(); m->buf.finish();
auto input = std::shared_ptr<InputSource>( auto input = BufferInputSource("tokenizer data", m->buf.getBuffer(), true);
// line-break std::string empty;
new BufferInputSource("tokenizer data", m->buf.getBuffer(), true));
while (true) { while (true) {
QPDFTokenizer::Token token = auto token = m->tokenizer.readToken(input, empty, true);
m->tokenizer.readToken(input, "offset " + std::to_string(input->tell()), true);
m->filter->handleToken(token); m->filter->handleToken(token);
if (token.getType() == QPDFTokenizer::tt_eof) { if (token.getType() == QPDFTokenizer::tt_eof) {
break; break;
} else if (token.isWord("ID")) { } else if (token.isWord("ID")) {
// Read the space after the ID. // Read the space after the ID.
char ch = ' '; char ch = ' ';
input->read(&ch, 1); input.read(&ch, 1);
m->filter->handleToken( m->filter->handleToken(
// line-break // line-break
QPDFTokenizer::Token(QPDFTokenizer::tt_space, std::string(1, ch))); QPDFTokenizer::Token(QPDFTokenizer::tt_space, std::string(1, ch)));

View File

@ -148,7 +148,7 @@ QPDFObjectHandle::TokenFilter::write(std::string const& str)
void void
QPDFObjectHandle::TokenFilter::writeToken(QPDFTokenizer::Token const& token) QPDFObjectHandle::TokenFilter::writeToken(QPDFTokenizer::Token const& token)
{ {
std::string value = token.getRawValue(); std::string const& value = token.getRawValue();
write(value.c_str(), value.length()); write(value.c_str(), value.length());
} }

View File

@ -27,7 +27,7 @@ namespace
class QPDFWordTokenFinder: public InputSource::Finder class QPDFWordTokenFinder: public InputSource::Finder
{ {
public: public:
QPDFWordTokenFinder(std::shared_ptr<InputSource> is, std::string const& str) : QPDFWordTokenFinder(InputSource& is, std::string const& str) :
is(is), is(is),
str(str) str(str)
{ {
@ -36,7 +36,7 @@ namespace
bool check() override; bool check() override;
private: private:
std::shared_ptr<InputSource> is; InputSource& is;
std::string str; std::string str;
}; };
} // namespace } // namespace
@ -48,21 +48,21 @@ QPDFWordTokenFinder::check()
// delimiter or EOF. // delimiter or EOF.
QPDFTokenizer tokenizer; QPDFTokenizer tokenizer;
QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true); QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true);
qpdf_offset_t pos = is->tell(); qpdf_offset_t pos = is.tell();
if (!(t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str))) { if (!(t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str))) {
QTC::TC("qpdf", "QPDFTokenizer finder found wrong word"); QTC::TC("qpdf", "QPDFTokenizer finder found wrong word");
return false; return false;
} }
qpdf_offset_t token_start = is->getLastOffset(); qpdf_offset_t token_start = is.getLastOffset();
char next; char next;
bool next_okay = false; bool next_okay = false;
if (is->read(&next, 1) == 0) { if (is.read(&next, 1) == 0) {
QTC::TC("qpdf", "QPDFTokenizer inline image at EOF"); QTC::TC("qpdf", "QPDFTokenizer inline image at EOF");
next_okay = true; next_okay = true;
} else { } else {
next_okay = is_delimiter(next); next_okay = is_delimiter(next);
} }
is->seek(pos, SEEK_SET); is.seek(pos, SEEK_SET);
if (!next_okay) { if (!next_okay) {
return false; return false;
} }
@ -763,12 +763,18 @@ QPDFTokenizer::presentEOF()
void void
QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input) QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input)
{
expectInlineImage(*input);
}
void
QPDFTokenizer::expectInlineImage(InputSource& input)
{ {
if (this->state == st_token_ready) { if (this->state == st_token_ready) {
reset(); reset();
} else if (this->state != st_before_token) { } else if (this->state != st_before_token) {
throw std::logic_error("QPDFTokenizer::expectInlineImage called" throw std::logic_error(
" when tokenizer is in improper state"); "QPDFTokenizer::expectInlineImage called when tokenizer is in improper state");
} }
findEI(input); findEI(input);
this->before_token = false; this->before_token = false;
@ -777,14 +783,10 @@ QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input)
} }
void void
QPDFTokenizer::findEI(std::shared_ptr<InputSource> input) QPDFTokenizer::findEI(InputSource& input)
{ {
if (!input.get()) { qpdf_offset_t last_offset = input.getLastOffset();
return; qpdf_offset_t pos = input.tell();
}
qpdf_offset_t last_offset = input->getLastOffset();
qpdf_offset_t pos = input->tell();
// Use QPDFWordTokenFinder to find EI surrounded by delimiters. Then read the next several // Use QPDFWordTokenFinder to find EI surrounded by delimiters. Then read the next several
// tokens or up to EOF. If we find any suspicious-looking or tokens, this is probably still part // tokens or up to EOF. If we find any suspicious-looking or tokens, this is probably still part
@ -797,10 +799,10 @@ QPDFTokenizer::findEI(std::shared_ptr<InputSource> input)
bool first_try = true; bool first_try = true;
while (!okay) { while (!okay) {
QPDFWordTokenFinder f(input, "EI"); QPDFWordTokenFinder f(input, "EI");
if (!input->findFirst("EI", input->tell(), 0, f)) { if (!input.findFirst("EI", input.tell(), 0, f)) {
break; break;
} }
this->inline_image_bytes = QIntC::to_size(input->tell() - pos - 2); inline_image_bytes = QIntC::to_size(input.tell() - pos - 2);
QPDFTokenizer check; QPDFTokenizer check;
bool found_bad = false; bool found_bad = false;
@ -858,8 +860,8 @@ QPDFTokenizer::findEI(std::shared_ptr<InputSource> input)
QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try"); QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try");
} }
input->seek(pos, SEEK_SET); input.seek(pos, SEEK_SET);
input->setLastOffset(last_offset); input.setLastOffset(last_offset);
} }
bool bool
@ -902,7 +904,7 @@ QPDFTokenizer::readToken(
throw QPDFExc( throw QPDFExc(
qpdf_e_damaged_pdf, qpdf_e_damaged_pdf,
input.getName(), input.getName(),
context, context.empty() ? "offset " + std::to_string(input.getLastOffset()) : context,
input.getLastOffset(), input.getLastOffset(),
token.getErrorMessage()); token.getErrorMessage());
} }