2
1
mirror of https://github.com/qpdf/qpdf.git synced 2024-12-22 10:58:58 +00:00

Merge pull request #1253 from m-holger/pl_t

Refactor Pl_QPDFTokenizer
This commit is contained in:
m-holger 2024-08-21 18:29:55 +01:00 committed by GitHub
commit 0b3debaf86
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 57 additions and 48 deletions

View File

@ -191,6 +191,8 @@ class QPDFTokenizer
// returns a tt_inline_image token.
QPDF_DLL
void expectInlineImage(std::shared_ptr<InputSource> input);
QPDF_DLL
void expectInlineImage(InputSource& input);
private:
friend class QPDFParser;
@ -217,7 +219,7 @@ class QPDFTokenizer
bool isSpace(char);
bool isDelimiter(char);
void findEI(std::shared_ptr<InputSource> input);
void findEI(InputSource& input);
enum state_e {
st_top,

View File

@ -1,5 +1,6 @@
#include <qpdf/ContentNormalizer.hh>
#include <qpdf/QPDF_Name.hh>
#include <qpdf/QUtil.hh>
ContentNormalizer::ContentNormalizer() :
@ -11,7 +12,6 @@ ContentNormalizer::ContentNormalizer() :
void
ContentNormalizer::handleToken(QPDFTokenizer::Token const& token)
{
std::string value = token.getRawValue();
QPDFTokenizer::token_type_e token_type = token.getType();
if (token_type == QPDFTokenizer::tt_bad) {
@ -24,40 +24,48 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const& token)
switch (token_type) {
case QPDFTokenizer::tt_space:
{
size_t len = value.length();
for (size_t i = 0; i < len; ++i) {
char ch = value.at(i);
if (ch == '\r') {
if ((i + 1 < len) && (value.at(i + 1) == '\n')) {
// ignore
} else {
std::string const& value = token.getRawValue();
auto size = value.size();
size_t pos = 0;
auto r_pos = value.find('\r');
while (r_pos != std::string::npos) {
if (pos != r_pos) {
write(&value[pos], r_pos - pos);
}
if (++r_pos >= size) {
write("\n");
return;
}
if (value[r_pos] != '\n') {
write("\n");
}
} else {
write(&ch, 1);
pos = r_pos;
r_pos = value.find('\r', pos);
}
if (pos < size) {
write(&value[pos], size - pos);
}
}
}
break;
return;
case QPDFTokenizer::tt_string:
// Replacing string and name tokens in this way normalizes their representation as this will
// automatically handle quoting of unprintable characters, etc.
writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_string, token.getValue()));
write(QPDFObjectHandle::newString(token.getValue()).unparse());
break;
case QPDFTokenizer::tt_name:
writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_name, token.getValue()));
write(QPDF_Name::normalizeName(token.getValue()));
break;
default:
writeToken(token);
break;
return;
}
value = token.getRawValue();
if (((token_type == QPDFTokenizer::tt_string) || (token_type == QPDFTokenizer::tt_name)) &&
((value.find('\r') != std::string::npos) || (value.find('\n') != std::string::npos))) {
// tt_string or tt_name
std::string const& value = token.getRawValue();
if (value.find('\r') != std::string::npos || value.find('\n') != std::string::npos) {
write("\n");
}
}

View File

@ -36,20 +36,17 @@ void
Pl_QPDFTokenizer::finish()
{
m->buf.finish();
auto input = std::shared_ptr<InputSource>(
// line-break
new BufferInputSource("tokenizer data", m->buf.getBuffer(), true));
auto input = BufferInputSource("tokenizer data", m->buf.getBuffer(), true);
std::string empty;
while (true) {
QPDFTokenizer::Token token =
m->tokenizer.readToken(input, "offset " + std::to_string(input->tell()), true);
auto token = m->tokenizer.readToken(input, empty, true);
m->filter->handleToken(token);
if (token.getType() == QPDFTokenizer::tt_eof) {
break;
} else if (token.isWord("ID")) {
// Read the space after the ID.
char ch = ' ';
input->read(&ch, 1);
input.read(&ch, 1);
m->filter->handleToken(
// line-break
QPDFTokenizer::Token(QPDFTokenizer::tt_space, std::string(1, ch)));

View File

@ -148,7 +148,7 @@ QPDFObjectHandle::TokenFilter::write(std::string const& str)
void
QPDFObjectHandle::TokenFilter::writeToken(QPDFTokenizer::Token const& token)
{
std::string value = token.getRawValue();
std::string const& value = token.getRawValue();
write(value.c_str(), value.length());
}

View File

@ -27,7 +27,7 @@ namespace
class QPDFWordTokenFinder: public InputSource::Finder
{
public:
QPDFWordTokenFinder(std::shared_ptr<InputSource> is, std::string const& str) :
QPDFWordTokenFinder(InputSource& is, std::string const& str) :
is(is),
str(str)
{
@ -36,7 +36,7 @@ namespace
bool check() override;
private:
std::shared_ptr<InputSource> is;
InputSource& is;
std::string str;
};
} // namespace
@ -48,21 +48,21 @@ QPDFWordTokenFinder::check()
// delimiter or EOF.
QPDFTokenizer tokenizer;
QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true);
qpdf_offset_t pos = is->tell();
qpdf_offset_t pos = is.tell();
if (!(t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str))) {
QTC::TC("qpdf", "QPDFTokenizer finder found wrong word");
return false;
}
qpdf_offset_t token_start = is->getLastOffset();
qpdf_offset_t token_start = is.getLastOffset();
char next;
bool next_okay = false;
if (is->read(&next, 1) == 0) {
if (is.read(&next, 1) == 0) {
QTC::TC("qpdf", "QPDFTokenizer inline image at EOF");
next_okay = true;
} else {
next_okay = is_delimiter(next);
}
is->seek(pos, SEEK_SET);
is.seek(pos, SEEK_SET);
if (!next_okay) {
return false;
}
@ -763,12 +763,18 @@ QPDFTokenizer::presentEOF()
// Compatibility overload taking shared ownership of the input source. It forwards to the
// InputSource& overload, which performs the tokenizer state checks and the search for EI.
//
// Throws std::logic_error if input is null: dereferencing an empty shared_ptr is undefined
// behavior, and the reference-based findEI no longer has a null guard of its own.
void
QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input)
{
    if (!input) {
        throw std::logic_error("QPDFTokenizer::expectInlineImage called with null input source");
    }
    expectInlineImage(*input);
}
void
QPDFTokenizer::expectInlineImage(InputSource& input)
{
if (this->state == st_token_ready) {
reset();
} else if (this->state != st_before_token) {
throw std::logic_error("QPDFTokenizer::expectInlineImage called"
" when tokenizer is in improper state");
throw std::logic_error(
"QPDFTokenizer::expectInlineImage called when tokenizer is in improper state");
}
findEI(input);
this->before_token = false;
@ -777,14 +783,10 @@ QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input)
}
void
QPDFTokenizer::findEI(std::shared_ptr<InputSource> input)
QPDFTokenizer::findEI(InputSource& input)
{
if (!input.get()) {
return;
}
qpdf_offset_t last_offset = input->getLastOffset();
qpdf_offset_t pos = input->tell();
qpdf_offset_t last_offset = input.getLastOffset();
qpdf_offset_t pos = input.tell();
// Use QPDFWordTokenFinder to find EI surrounded by delimiters. Then read the next several
// tokens or up to EOF. If we find any suspicious-looking tokens, this is probably still part
@ -797,10 +799,10 @@ QPDFTokenizer::findEI(std::shared_ptr<InputSource> input)
bool first_try = true;
while (!okay) {
QPDFWordTokenFinder f(input, "EI");
if (!input->findFirst("EI", input->tell(), 0, f)) {
if (!input.findFirst("EI", input.tell(), 0, f)) {
break;
}
this->inline_image_bytes = QIntC::to_size(input->tell() - pos - 2);
inline_image_bytes = QIntC::to_size(input.tell() - pos - 2);
QPDFTokenizer check;
bool found_bad = false;
@ -858,8 +860,8 @@ QPDFTokenizer::findEI(std::shared_ptr<InputSource> input)
QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try");
}
input->seek(pos, SEEK_SET);
input->setLastOffset(last_offset);
input.seek(pos, SEEK_SET);
input.setLastOffset(last_offset);
}
bool
@ -902,7 +904,7 @@ QPDFTokenizer::readToken(
throw QPDFExc(
qpdf_e_damaged_pdf,
input.getName(),
context,
context.empty() ? "offset " + std::to_string(input.getLastOffset()) : context,
input.getLastOffset(),
token.getErrorMessage());
}