qpdf/libqpdf/Pl_QPDFTokenizer.cc

#include <qpdf/Pl_QPDFTokenizer.hh>
#include <qpdf/QPDF_String.hh>
#include <qpdf/QPDF_Name.hh>
#include <qpdf/QTC.hh>
#include <stdexcept>
#include <string.h>

Pl_QPDFTokenizer::Pl_QPDFTokenizer(char const* identifier, Pipeline* next) :
    Pipeline(identifier, next),
    newline_after_next_token(false),
    just_wrote_nl(false),
    last_char_was_cr(false),
    unread_char(false),
    char_to_unread('\0'),
    in_inline_image(false)
{
    memset(this->image_buf, 0, IMAGE_BUF_SIZE);
}

Pl_QPDFTokenizer::~Pl_QPDFTokenizer()
{
}

void
Pl_QPDFTokenizer::writeNext(char const* buf, size_t len)
{
    if (len)
    {
	unsigned char* t = new unsigned char[len];
	memcpy(t, buf, len);
	getNext()->write(t, len);
	delete [] t;
	this->just_wrote_nl = (buf[len-1] == '\n');
    }
}

void
Pl_QPDFTokenizer::writeToken(QPDFTokenizer::Token& token)
{
    std::string value = token.getRawValue();

    switch (token.getType())
    {
      case QPDFTokenizer::tt_string:
	value = QPDF_String(token.getValue()).unparse();
	break;

      case QPDFTokenizer::tt_name:
	value = QPDF_Name(token.getValue()).unparse();
	break;

      default:
	break;
    }
    writeNext(value.c_str(), value.length());
}

void
Pl_QPDFTokenizer::processChar(char ch)
{
    if (this->in_inline_image)
    {
	// Scan through the input looking for EI surrounded by
	// whitespace.  If that pattern appears in the inline image's
	// representation, we're hosed, but this situation seems
	// excessively unlikely, and this code path is only followed
	// during content stream normalization, which is pretty much
	// used for debugging and human inspection of PDF files.
	memmove(this->image_buf,
		this->image_buf + 1,
		IMAGE_BUF_SIZE - 1);
	this->image_buf[IMAGE_BUF_SIZE - 1] = ch;
	if (strchr(" \t\n\v\f\r", this->image_buf[0]) &&
	    (this->image_buf[1] == 'E') &&
	    (this->image_buf[2] == 'I') &&
	    strchr(" \t\n\v\f\r", this->image_buf[3]))
	{
	    // We've found an EI operator.  We've already written the
	    // EI operator to output; terminate with a newline
	    // character and resume normal processing.
	    writeNext("\n", 1);
	    this->in_inline_image = false;
	    QTC::TC("qpdf", "Pl_QPDFTokenizer found EI");
	}
	else
	{
	    writeNext(&ch, 1);
	}
	return;
    }

    tokenizer.presentCharacter(ch);
    QPDFTokenizer::Token token;
    if (tokenizer.getToken(token, this->unread_char, this->char_to_unread))
    {
	writeToken(token);
	if (this->newline_after_next_token)
	{
	    writeNext("\n", 1);
	    this->newline_after_next_token = false;
	}
	if ((token.getType() == QPDFTokenizer::tt_word) &&
	    (token.getValue() == "ID"))
	{
	    // Suspend normal scanning until we find an EI token.
	    this->in_inline_image = true;
	    if (this->unread_char)
	    {
		writeNext(&this->char_to_unread, 1);
		this->unread_char = false;
	    }
	}
    }
    else
    {
	bool suppress = false;
	if ((ch == '\n') && (this->last_char_was_cr))
	{
	    // Always ignore \n following \r
	    suppress = true;
	}

	if ((this->last_char_was_cr = (ch == '\r')))
	{
	    ch = '\n';
	}

	if (this->tokenizer.betweenTokens())
	{
	    if (! suppress)
	    {
		writeNext(&ch, 1);
	    }
	}
	else
	{
	    if (ch == '\n')
	    {
		this->newline_after_next_token = true;
	    }
	}
    }
}


void
Pl_QPDFTokenizer::checkUnread()
{
    if (this->unread_char)
    {
	processChar(this->char_to_unread);
	if (this->unread_char)
	{
	    throw std::logic_error(
		"INTERNAL ERROR: unread_char still true after processing "
		"unread character");
	}
    }
}

void
Pl_QPDFTokenizer::write(unsigned char* buf, size_t len)
{
    checkUnread();
    for (size_t i = 0; i < len; ++i)
    {
	processChar(buf[i]);
	checkUnread();
    }
}

void
Pl_QPDFTokenizer::finish()
{
    this->tokenizer.presentEOF();
    if (! this->in_inline_image)
    {
	QPDFTokenizer::Token token;
	if (tokenizer.getToken(token, this->unread_char, this->char_to_unread))
	{
	    writeToken(token);
	    if (unread_char)
	    {
		if (this->char_to_unread == '\r')
		{
		    this->char_to_unread = '\n';
		}
		writeNext(&this->char_to_unread, 1);
	    }
	}
    }
    if (! this->just_wrote_nl)
    {
	writeNext("\n", 1);
    }

    getNext()->finish();
}
update release date to actual date git-svn-id: svn+q:///qpdf/trunk@599 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-04-29 12:55:25 +00:00			`#include <qpdf/Pl_QPDFTokenizer.hh>`
			`#include <qpdf/QPDF_String.hh>`
			`#include <qpdf/QPDF_Name.hh>`
be less conservative when skipping over inline images in content normalization git-svn-id: svn+q:///qpdf/trunk@1050 71b93d88-0707-0410-a8cf-f5a4172ac649 2011-04-30 18:20:35 +00:00			`#include <qpdf/QTC.hh>`
removed qexc; non-compatible ABI change git-svn-id: svn+q:///qpdf/trunk@709 71b93d88-0707-0410-a8cf-f5a4172ac649 2009-09-26 18:36:04 +00:00			`#include <stdexcept>`
missing header files for gcc 4.3 git-svn-id: svn+q:///qpdf/trunk@607 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-05-04 16:02:53 +00:00			`#include <string.h>`
update release date to actual date git-svn-id: svn+q:///qpdf/trunk@599 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-04-29 12:55:25 +00:00
			`Pl_QPDFTokenizer::Pl_QPDFTokenizer(char const* identifier, Pipeline* next) :`
			`Pipeline(identifier, next),`
			`newline_after_next_token(false),`
			`just_wrote_nl(false),`
			`last_char_was_cr(false),`
			`unread_char(false),`
			`char_to_unread('\0'),`
be less conservative when skipping over inline images in content normalization git-svn-id: svn+q:///qpdf/trunk@1050 71b93d88-0707-0410-a8cf-f5a4172ac649 2011-04-30 18:20:35 +00:00			`in_inline_image(false)`
update release date to actual date git-svn-id: svn+q:///qpdf/trunk@599 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-04-29 12:55:25 +00:00			`{`
be less conservative when skipping over inline images in content normalization git-svn-id: svn+q:///qpdf/trunk@1050 71b93d88-0707-0410-a8cf-f5a4172ac649 2011-04-30 18:20:35 +00:00			`memset(this->image_buf, 0, IMAGE_BUF_SIZE);`
update release date to actual date git-svn-id: svn+q:///qpdf/trunk@599 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-04-29 12:55:25 +00:00			`}`

			`Pl_QPDFTokenizer::~Pl_QPDFTokenizer()`
			`{`
			`}`

			`void`
ABI change: fix use of off_t, size_t, and integer types Significantly improve the code's use of off_t for file offsets, size_t for memory sizes, and integer types in cases where there has to be compatibility with external interfaces. Rework sections of the code that would have prevented qpdf from working on files larger than 2 (or maybe 4) GB in size. 2012-06-20 15:20:57 +00:00			`Pl_QPDFTokenizer::writeNext(char const* buf, size_t len)`
update release date to actual date git-svn-id: svn+q:///qpdf/trunk@599 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-04-29 12:55:25 +00:00			`{`
			`if (len)`
			`{`
			`unsigned char* t = new unsigned char[len];`
			`memcpy(t, buf, len);`
			`getNext()->write(t, len);`
			`delete [] t;`
			`this->just_wrote_nl = (buf[len-1] == '\n');`
			`}`
			`}`

			`void`
			`Pl_QPDFTokenizer::writeToken(QPDFTokenizer::Token& token)`
			`{`
			`std::string value = token.getRawValue();`

			`switch (token.getType())`
			`{`
			`case QPDFTokenizer::tt_string:`
			`value = QPDF_String(token.getValue()).unparse();`
			`break;`

			`case QPDFTokenizer::tt_name:`
			`value = QPDF_Name(token.getValue()).unparse();`
			`break;`

			`default:`
			`break;`
			`}`
			`writeNext(value.c_str(), value.length());`
			`}`

			`void`
			`Pl_QPDFTokenizer::processChar(char ch)`
			`{`
be less conservative when skipping over inline images in content normalization git-svn-id: svn+q:///qpdf/trunk@1050 71b93d88-0707-0410-a8cf-f5a4172ac649 2011-04-30 18:20:35 +00:00			`if (this->in_inline_image)`
update release date to actual date git-svn-id: svn+q:///qpdf/trunk@599 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-04-29 12:55:25 +00:00			`{`
be less conservative when skipping over inline images in content normalization git-svn-id: svn+q:///qpdf/trunk@1050 71b93d88-0707-0410-a8cf-f5a4172ac649 2011-04-30 18:20:35 +00:00			`// Scan through the input looking for EI surrounded by`
			`// whitespace. If that pattern appears in the inline image's`
			`// representation, we're hosed, but this situation seems`
			`// excessively unlikely, and this code path is only followed`
			`// during content stream normalization, which is pretty much`
			`// used for debugging and human inspection of PDF files.`
			`memmove(this->image_buf,`
			`this->image_buf + 1,`
			`IMAGE_BUF_SIZE - 1);`
			`this->image_buf[IMAGE_BUF_SIZE - 1] = ch;`
			`if (strchr(" \t\n\v\f\r", this->image_buf[0]) &&`
			`(this->image_buf[1] == 'E') &&`
			`(this->image_buf[2] == 'I') &&`
			`strchr(" \t\n\v\f\r", this->image_buf[3]))`
			`{`
			`// We've found an EI operator. We've already written the`
			`// EI operator to output; terminate with a newline`
			`// character and resume normal processing.`
			`writeNext("\n", 1);`
			`this->in_inline_image = false;`
			`QTC::TC("qpdf", "Pl_QPDFTokenizer found EI");`
			`}`
			`else`
			`{`
			`writeNext(&ch, 1);`
			`}`
update release date to actual date git-svn-id: svn+q:///qpdf/trunk@599 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-04-29 12:55:25 +00:00			`return;`
			`}`

			`tokenizer.presentCharacter(ch);`
			`QPDFTokenizer::Token token;`
			`if (tokenizer.getToken(token, this->unread_char, this->char_to_unread))`
			`{`
			`writeToken(token);`
			`if (this->newline_after_next_token)`
			`{`
			`writeNext("\n", 1);`
			`this->newline_after_next_token = false;`
			`}`
			`if ((token.getType() == QPDFTokenizer::tt_word) &&`
be less conservative when skipping over inline images in content normalization git-svn-id: svn+q:///qpdf/trunk@1050 71b93d88-0707-0410-a8cf-f5a4172ac649 2011-04-30 18:20:35 +00:00			`(token.getValue() == "ID"))`
update release date to actual date git-svn-id: svn+q:///qpdf/trunk@599 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-04-29 12:55:25 +00:00			`{`
be less conservative when skipping over inline images in content normalization git-svn-id: svn+q:///qpdf/trunk@1050 71b93d88-0707-0410-a8cf-f5a4172ac649 2011-04-30 18:20:35 +00:00			`// Suspend normal scanning until we find an EI token.`
			`this->in_inline_image = true;`
update release date to actual date git-svn-id: svn+q:///qpdf/trunk@599 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-04-29 12:55:25 +00:00			`if (this->unread_char)`
			`{`
			`writeNext(&this->char_to_unread, 1);`
			`this->unread_char = false;`
			`}`
			`}`
			`}`
			`else`
			`{`
			`bool suppress = false;`
			`if ((ch == '\n') && (this->last_char_was_cr))`
			`{`
			`// Always ignore \n following \r`
			`suppress = true;`
			`}`

			`if ((this->last_char_was_cr = (ch == '\r')))`
			`{`
			`ch = '\n';`
			`}`

			`if (this->tokenizer.betweenTokens())`
			`{`
			`if (! suppress)`
			`{`
			`writeNext(&ch, 1);`
			`}`
			`}`
			`else`
			`{`
			`if (ch == '\n')`
			`{`
			`this->newline_after_next_token = true;`
			`}`
			`}`
			`}`
			`}`


			`void`
			`Pl_QPDFTokenizer::checkUnread()`
			`{`
			`if (this->unread_char)`
			`{`
			`processChar(this->char_to_unread);`
			`if (this->unread_char)`
			`{`
removed qexc; non-compatible ABI change git-svn-id: svn+q:///qpdf/trunk@709 71b93d88-0707-0410-a8cf-f5a4172ac649 2009-09-26 18:36:04 +00:00			`throw std::logic_error(`
			`"INTERNAL ERROR: unread_char still true after processing "`
			`"unread character");`
update release date to actual date git-svn-id: svn+q:///qpdf/trunk@599 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-04-29 12:55:25 +00:00			`}`
			`}`
			`}`

			`void`
ABI change: fix use of off_t, size_t, and integer types Significantly improve the code's use of off_t for file offsets, size_t for memory sizes, and integer types in cases where there has to be compatibility with external interfaces. Rework sections of the code that would have prevented qpdf from working on files larger than 2 (or maybe 4) GB in size. 2012-06-20 15:20:57 +00:00			`Pl_QPDFTokenizer::write(unsigned char* buf, size_t len)`
update release date to actual date git-svn-id: svn+q:///qpdf/trunk@599 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-04-29 12:55:25 +00:00			`{`
			`checkUnread();`
ABI change: fix use of off_t, size_t, and integer types Significantly improve the code's use of off_t for file offsets, size_t for memory sizes, and integer types in cases where there has to be compatibility with external interfaces. Rework sections of the code that would have prevented qpdf from working on files larger than 2 (or maybe 4) GB in size. 2012-06-20 15:20:57 +00:00			`for (size_t i = 0; i < len; ++i)`
update release date to actual date git-svn-id: svn+q:///qpdf/trunk@599 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-04-29 12:55:25 +00:00			`{`
			`processChar(buf[i]);`
			`checkUnread();`
			`}`
			`}`

			`void`
			`Pl_QPDFTokenizer::finish()`
			`{`
			`this->tokenizer.presentEOF();`
be less conservative when skipping over inline images in content normalization git-svn-id: svn+q:///qpdf/trunk@1050 71b93d88-0707-0410-a8cf-f5a4172ac649 2011-04-30 18:20:35 +00:00			`if (! this->in_inline_image)`
update release date to actual date git-svn-id: svn+q:///qpdf/trunk@599 71b93d88-0707-0410-a8cf-f5a4172ac649 2008-04-29 12:55:25 +00:00			`{`
			`QPDFTokenizer::Token token;`
			`if (tokenizer.getToken(token, this->unread_char, this->char_to_unread))`
			`{`
			`writeToken(token);`
			`if (unread_char)`
			`{`
			`if (this->char_to_unread == '\r')`
			`{`
			`this->char_to_unread = '\n';`
			`}`
			`writeNext(&this->char_to_unread, 1);`
			`}`
			`}`
			`}`
			`if (! this->just_wrote_nl)`
			`{`
			`writeNext("\n", 1);`
			`}`

			`getNext()->finish();`
			`}`