mirror of https://github.com/qpdf/qpdf.git
Lexer enhancements: EOF, comment, space
Significant enhancements to the lexer to improve EOF handling and to support comments and spaces as tokens. Various other minor issues were fixed as well.
This commit is contained in:
parent
bb9e91adbd
commit
d97474868d
32
ChangeLog
32
ChangeLog
|
@ -1,3 +1,35 @@
|
|||
2018-02-04 Jay Berkenbilt <ejb@ql.org>
|
||||
|
||||
* Significant lexer (tokenizer) enhancements. These are changes to
|
||||
the QPDFTokenizer class. These changes are of concern only to
|
||||
people who are operating with PDF files at the lexical layer
|
||||
using qpdf. They have little or no impact on most high-level
|
||||
interfaces or the command-line tool.
|
||||
* New token types tt_space and tt_comment to recognize
|
||||
whitespace and comments. This makes it possible to tokenize a
|
||||
PDF file or stream and preserve everything about it.
|
||||
* For backward compatibility, space and comment tokens are not
|
||||
returned by the tokenizer unless
|
||||
QPDFTokenizer.includeIgnorable() is called.
|
||||
* Better handling of null bytes. These are now included in space
|
||||
tokens rather than being their own "tt_word" tokens. This
|
||||
should have no impact on any correct PDF file and has no
|
||||
impact on output, but it may change offsets in some error
|
||||
messages when trying to parse contents of bad files. Under
|
||||
default operation, qpdf does not attempt to parse content
|
||||
streams, so this change is mostly invisible.
|
||||
* Bug fix to handling of bad tokens at ends of streams. Now,
|
||||
when allowEOF() has been called, these are treated as bad tokens
|
||||
(tt_bad or an exception, depending on invocation), and a
|
||||
separate tt_eof token is returned. Previously, the bad token
|
||||
contents were returned as the value of a tt_eof token. tt_eof
|
||||
tokens are always empty now.
|
||||
* Fix a bug that would, on rare occasions, report the offset in an
|
||||
error message in the wrong place because of spaces or comments
|
||||
adjacent to a bad token.
|
||||
* Clarify in comments exactly where the input source is
|
||||
positioned surrounding calls to readToken and getToken.
|
||||
|
||||
2018-02-04 Jay Berkenbilt <ejb@ql.org>
|
||||
|
||||
* Add QPDFWriter::setLinearizationPass1Filename method and
|
||||
|
|
|
@ -33,7 +33,8 @@ class QPDFTokenizer
|
|||
{
|
||||
public:
|
||||
// Token type tt_eof is only returned if allowEOF() is called on
|
||||
// the tokenizer. tt_eof was introduced in QPDF version 4.1.
|
||||
// the tokenizer. tt_eof was introduced in QPDF version 4.1.
|
||||
// tt_space and tt_comment were added in QPDF version 8.
|
||||
enum token_type_e
|
||||
{
|
||||
tt_bad,
|
||||
|
@ -51,6 +52,8 @@ class QPDFTokenizer
|
|||
tt_bool,
|
||||
tt_word,
|
||||
tt_eof,
|
||||
tt_space,
|
||||
tt_comment,
|
||||
};
|
||||
|
||||
class Token
|
||||
|
@ -120,6 +123,11 @@ class QPDFTokenizer
|
|||
QPDF_DLL
|
||||
void allowEOF();
|
||||
|
||||
// If called, readToken will return "ignorable" tokens for space
|
||||
// and comments. This was added in QPDF 8.
|
||||
QPDF_DLL
|
||||
void includeIgnorable();
|
||||
|
||||
// Mode of operation:
|
||||
|
||||
// Keep presenting characters and calling getToken() until
|
||||
|
@ -159,13 +167,15 @@ class QPDFTokenizer
|
|||
private:
|
||||
void reset();
|
||||
void resolveLiteral();
|
||||
bool isSpace(char);
|
||||
|
||||
// Lexer state
|
||||
enum { st_top, st_in_comment, st_in_string, st_lt, st_gt,
|
||||
enum { st_top, st_in_space, st_in_comment, st_in_string, st_lt, st_gt,
|
||||
st_literal, st_in_hexstring, st_token_ready } state;
|
||||
|
||||
bool pound_special_in_name;
|
||||
bool allow_eof;
|
||||
bool include_ignorable;
|
||||
|
||||
// Current token accumulation
|
||||
token_type_e type;
|
||||
|
|
|
@ -120,15 +120,23 @@ FileInputSource::rewind()
|
|||
size_t
|
||||
FileInputSource::read(char* buffer, size_t length)
|
||||
{
|
||||
this->last_offset = QUtil::tell(this->file);
|
||||
this->last_offset = this->tell();
|
||||
size_t len = fread(buffer, 1, length, this->file);
|
||||
if ((len == 0) && ferror(this->file))
|
||||
if (len == 0)
|
||||
{
|
||||
throw QPDFExc(qpdf_e_system,
|
||||
this->filename, "",
|
||||
this->last_offset,
|
||||
std::string("read ") +
|
||||
QUtil::int_to_string(length) + " bytes");
|
||||
if (ferror(this->file))
|
||||
{
|
||||
throw QPDFExc(qpdf_e_system,
|
||||
this->filename, "",
|
||||
this->last_offset,
|
||||
std::string("read ") +
|
||||
QUtil::int_to_string(length) + " bytes");
|
||||
}
|
||||
else if (length > 0)
|
||||
{
|
||||
this->seek(0, SEEK_END);
|
||||
this->last_offset = this->tell();
|
||||
}
|
||||
}
|
||||
return len;
|
||||
}
|
||||
|
|
|
@ -14,7 +14,8 @@
|
|||
|
||||
QPDFTokenizer::QPDFTokenizer() :
|
||||
pound_special_in_name(true),
|
||||
allow_eof(false)
|
||||
allow_eof(false),
|
||||
include_ignorable(false)
|
||||
{
|
||||
reset();
|
||||
}
|
||||
|
@ -32,6 +33,18 @@ QPDFTokenizer::allowEOF()
|
|||
this->allow_eof = true;
|
||||
}
|
||||
|
||||
void
|
||||
QPDFTokenizer::includeIgnorable()
|
||||
{
|
||||
this->include_ignorable = true;
|
||||
}
|
||||
|
||||
bool
|
||||
QPDFTokenizer::isSpace(char ch)
|
||||
{
|
||||
return ((ch == '\0') || QUtil::is_space(ch));
|
||||
}
|
||||
|
||||
void
|
||||
QPDFTokenizer::reset()
|
||||
{
|
||||
|
@ -148,14 +161,21 @@ QPDFTokenizer::presentCharacter(char ch)
|
|||
{
|
||||
// Note: we specifically do not use ctype here. It is
|
||||
// locale-dependent.
|
||||
if (strchr(" \t\n\v\f\r", ch))
|
||||
if (isSpace(ch))
|
||||
{
|
||||
// ignore
|
||||
if (this->include_ignorable)
|
||||
{
|
||||
state = st_in_space;
|
||||
val += ch;
|
||||
}
|
||||
}
|
||||
else if (ch == '%')
|
||||
{
|
||||
// Discard comments
|
||||
state = st_in_comment;
|
||||
if (this->include_ignorable)
|
||||
{
|
||||
val += ch;
|
||||
}
|
||||
}
|
||||
else if (ch == '(')
|
||||
{
|
||||
|
@ -209,12 +229,41 @@ QPDFTokenizer::presentCharacter(char ch)
|
|||
}
|
||||
}
|
||||
}
|
||||
else if (state == st_in_space)
|
||||
{
|
||||
// We only enter this state if include_ignorable is true.
|
||||
if (! isSpace(ch))
|
||||
{
|
||||
type = tt_space;
|
||||
unread_char = true;
|
||||
char_to_unread = ch;
|
||||
state = st_token_ready;
|
||||
}
|
||||
else
|
||||
{
|
||||
val += ch;
|
||||
}
|
||||
}
|
||||
else if (state == st_in_comment)
|
||||
{
|
||||
if ((ch == '\r') || (ch == '\n'))
|
||||
{
|
||||
state = st_top;
|
||||
}
|
||||
{
|
||||
if (this->include_ignorable)
|
||||
{
|
||||
type = tt_comment;
|
||||
unread_char = true;
|
||||
char_to_unread = ch;
|
||||
state = st_token_ready;
|
||||
}
|
||||
else
|
||||
{
|
||||
state = st_top;
|
||||
}
|
||||
}
|
||||
else if (this->include_ignorable)
|
||||
{
|
||||
val += ch;
|
||||
}
|
||||
}
|
||||
else if (state == st_lt)
|
||||
{
|
||||
|
@ -397,7 +446,7 @@ QPDFTokenizer::presentCharacter(char ch)
|
|||
{
|
||||
val += ch;
|
||||
}
|
||||
else if (strchr(" \t\n\v\f\r", ch))
|
||||
else if (isSpace(ch))
|
||||
{
|
||||
// ignore
|
||||
}
|
||||
|
@ -435,19 +484,23 @@ QPDFTokenizer::presentEOF()
|
|||
QTC::TC("qpdf", "QPDF_Tokenizer EOF reading appendable token");
|
||||
resolveLiteral();
|
||||
}
|
||||
else if ((this->include_ignorable) && (state == st_in_space))
|
||||
{
|
||||
type = tt_space;
|
||||
}
|
||||
else if ((this->include_ignorable) && (state == st_in_comment))
|
||||
{
|
||||
type = tt_comment;
|
||||
}
|
||||
else if (betweenTokens())
|
||||
{
|
||||
type = tt_eof;
|
||||
}
|
||||
else if (state != st_token_ready)
|
||||
{
|
||||
QTC::TC("qpdf", "QPDF_Tokenizer EOF reading token",
|
||||
this->allow_eof ? 1 : 0);
|
||||
if ((this->allow_eof) && (state == st_top))
|
||||
{
|
||||
type = tt_eof;
|
||||
}
|
||||
else
|
||||
{
|
||||
type = tt_bad;
|
||||
error_message = "EOF while reading token";
|
||||
}
|
||||
QTC::TC("qpdf", "QPDF_Tokenizer EOF reading token");
|
||||
type = tt_bad;
|
||||
error_message = "EOF while reading token";
|
||||
}
|
||||
|
||||
state = st_token_ready;
|
||||
|
@ -461,6 +514,10 @@ QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch)
|
|||
ch = this->char_to_unread;
|
||||
if (ready)
|
||||
{
|
||||
if (type == tt_bad)
|
||||
{
|
||||
val = raw_val;
|
||||
}
|
||||
token = Token(type, val, raw_val, error_message);
|
||||
reset();
|
||||
}
|
||||
|
@ -470,7 +527,10 @@ QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch)
|
|||
bool
|
||||
QPDFTokenizer::betweenTokens()
|
||||
{
|
||||
return ((state == st_top) || (state == st_in_comment));
|
||||
return ((state == st_top) ||
|
||||
((! this->include_ignorable) &&
|
||||
((state == st_in_comment) ||
|
||||
(state == st_in_space))));
|
||||
}
|
||||
|
||||
QPDFTokenizer::Token
|
||||
|
@ -493,6 +553,13 @@ QPDFTokenizer::readToken(PointerHolder<InputSource> input,
|
|||
{
|
||||
presentEOF();
|
||||
presented_eof = true;
|
||||
if ((type == tt_eof) && (! this->allow_eof))
|
||||
{
|
||||
QTC::TC("qpdf", "QPDF_Tokenizer EOF when not allowed");
|
||||
type = tt_bad;
|
||||
error_message = "unexpected EOF";
|
||||
offset = input->getLastOffset();
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -502,12 +569,11 @@ QPDFTokenizer::readToken(PointerHolder<InputSource> input,
|
|||
}
|
||||
else
|
||||
{
|
||||
if (QUtil::is_space(static_cast<unsigned char>(ch)) &&
|
||||
(input->getLastOffset() == offset))
|
||||
presentCharacter(ch);
|
||||
if (betweenTokens() && (input->getLastOffset() == offset))
|
||||
{
|
||||
++offset;
|
||||
}
|
||||
presentCharacter(ch);
|
||||
if (max_len && (raw_val.length() >= max_len) &&
|
||||
(this->state != st_token_ready))
|
||||
{
|
||||
|
@ -515,6 +581,8 @@ QPDFTokenizer::readToken(PointerHolder<InputSource> input,
|
|||
QTC::TC("qpdf", "QPDFTokenizer block long token");
|
||||
this->type = tt_bad;
|
||||
this->state = st_token_ready;
|
||||
error_message =
|
||||
"exceeded allowable length while reading token";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -233,7 +233,7 @@ QPDFWriter copy use_aes 1
|
|||
QPDFObjectHandle indirect without context 0
|
||||
QPDFObjectHandle trailing data in parse 0
|
||||
qpdf pages encryption password 0
|
||||
QPDF_Tokenizer EOF reading token 1
|
||||
QPDF_Tokenizer EOF reading token 0
|
||||
QPDF_Tokenizer EOF reading appendable token 0
|
||||
QPDFWriter extra header text no newline 0
|
||||
QPDFWriter extra header text add newline 0
|
||||
|
@ -302,3 +302,4 @@ qpdf-c called qpdf_set_compress_streams 0
|
|||
qpdf-c called qpdf_set_preserve_unreferenced_objects 0
|
||||
qpdf-c called qpdf_set_newline_before_endstream 0
|
||||
QPDF_Stream TIFF predictor 0
|
||||
QPDF_Tokenizer EOF when not allowed 0
|
||||
|
|
|
@ -240,7 +240,7 @@ foreach my $d (@bug_tests)
|
|||
show_ntests();
|
||||
# ----------
|
||||
$td->notify("--- Miscellaneous Tests ---");
|
||||
$n_tests += 97;
|
||||
$n_tests += 99;
|
||||
|
||||
$td->runtest("qpdf version",
|
||||
{$td->COMMAND => "qpdf --version"},
|
||||
|
@ -263,11 +263,21 @@ $td->runtest("check pass1 file",
|
|||
{$td->FILE => "b.pdf"},
|
||||
{$td->FILE => "minimal-linearize-pass1.pdf"});
|
||||
|
||||
$td->runtest("tokenizer with no ignorable",
|
||||
{$td->COMMAND => "test_tokenizer -no-ignorable tokens.pdf"},
|
||||
{$td->FILE => "tokens-no-ignorable.out", $td->EXIT_STATUS => 0},
|
||||
$td->NORMALIZE_NEWLINES);
|
||||
|
||||
$td->runtest("tokenizer",
|
||||
{$td->COMMAND => "test_tokenizer tokens.pdf"},
|
||||
{$td->FILE => "tokens.out", $td->EXIT_STATUS => 0},
|
||||
$td->NORMALIZE_NEWLINES);
|
||||
|
||||
$td->runtest("tokenizer with max_len",
|
||||
{$td->COMMAND => "test_tokenizer -maxlen 50 tokens.pdf"},
|
||||
{$td->FILE => "tokens-maxlen.out", $td->EXIT_STATUS => 0},
|
||||
$td->NORMALIZE_NEWLINES);
|
||||
|
||||
foreach (my $i = 1; $i <= 3; ++$i)
|
||||
{
|
||||
$td->runtest("misc tests",
|
||||
|
|
|
@ -2,9 +2,9 @@ WARNING: bad16.pdf (trailer, file position 753): unexpected dictionary close tok
|
|||
WARNING: bad16.pdf (trailer, file position 756): unexpected dictionary close token
|
||||
WARNING: bad16.pdf (trailer, file position 759): unknown token while reading object; treating as string
|
||||
WARNING: bad16.pdf: file is damaged
|
||||
WARNING: bad16.pdf (trailer, file position 773): EOF while reading token
|
||||
WARNING: bad16.pdf (trailer, file position 779): unexpected EOF
|
||||
WARNING: bad16.pdf: Attempting to reconstruct cross-reference table
|
||||
WARNING: bad16.pdf (trailer, file position 753): unexpected dictionary close token
|
||||
WARNING: bad16.pdf (trailer, file position 756): unexpected dictionary close token
|
||||
WARNING: bad16.pdf (trailer, file position 759): unknown token while reading object; treating as string
|
||||
bad16.pdf (trailer, file position 773): EOF while reading token
|
||||
bad16.pdf (trailer, file position 779): unexpected EOF
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
WARNING: bad16.pdf (trailer, file position 753): unexpected dictionary close token
|
||||
WARNING: bad16.pdf (trailer, file position 756): unexpected dictionary close token
|
||||
WARNING: bad16.pdf (trailer, file position 759): unknown token while reading object; treating as string
|
||||
bad16.pdf (trailer, file position 773): EOF while reading token
|
||||
bad16.pdf (trailer, file position 779): unexpected EOF
|
||||
|
|
|
@ -39,4 +39,12 @@ This stream does end with a newline.
|
|||
-- stream 4 --
|
||||
(ends with a name)
|
||||
/ThisMustBeLast
|
||||
-- stream 5 --
|
||||
% This stream has an inline image marker that is not terminated
|
||||
(Potato)
|
||||
|
||||
BI
|
||||
ID
|
||||
<506f7
|
||||
461746f>
|
||||
test 3 done
|
||||
|
|
|
@ -125,8 +125,21 @@ stream
|
|||
/ThisMustBeLastendstream
|
||||
endobj
|
||||
|
||||
13 0 obj
|
||||
<< /Length 103 >>
|
||||
stream
|
||||
% This stream has an inline image marker that is not terminated
|
||||
<506f7
|
||||
461746f>
|
||||
BI
|
||||
ID
|
||||
<506f7
|
||||
461746f>
|
||||
endstream
|
||||
endobj
|
||||
|
||||
xref
|
||||
0 13
|
||||
0 14
|
||||
0000000000 65535 f
|
||||
0000000045 00000 n
|
||||
0000000099 00000 n
|
||||
|
@ -140,11 +153,12 @@ xref
|
|||
0000001283 00000 n
|
||||
0000001374 00000 n
|
||||
0000001430 00000 n
|
||||
0000001515 00000 n
|
||||
trailer <<
|
||||
/Size 13
|
||||
/Size 14
|
||||
/Root 1 0 R
|
||||
/QStreams [ 7 0 R 8 0 R 10 0 R 11 0 R 12 0 R ]
|
||||
/QStreams [ 7 0 R 8 0 R 10 0 R 11 0 R 12 0 R 13 0 R ]
|
||||
>>
|
||||
startxref
|
||||
1515
|
||||
1670
|
||||
%%EOF
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
%% Original object ID: 1 0
|
||||
1 0 obj
|
||||
<<
|
||||
/Pages 12 0 R
|
||||
/Pages 14 0 R
|
||||
/Type /Catalog
|
||||
>>
|
||||
endobj
|
||||
|
@ -110,12 +110,32 @@ endobj
|
|||
34
|
||||
endobj
|
||||
|
||||
%% Original object ID: 2 0
|
||||
%% Original object ID: 13 0
|
||||
12 0 obj
|
||||
<<
|
||||
/Length 13 0 R
|
||||
>>
|
||||
stream
|
||||
% This stream has an inline image marker that is not terminated
|
||||
<506f7
|
||||
461746f>
|
||||
BI
|
||||
ID
|
||||
<506f7
|
||||
461746f>
|
||||
endstream
|
||||
endobj
|
||||
|
||||
13 0 obj
|
||||
103
|
||||
endobj
|
||||
|
||||
%% Original object ID: 2 0
|
||||
14 0 obj
|
||||
<<
|
||||
/Count 1
|
||||
/Kids [
|
||||
13 0 R
|
||||
15 0 R
|
||||
]
|
||||
/Type /Pages
|
||||
>>
|
||||
|
@ -123,21 +143,21 @@ endobj
|
|||
|
||||
%% Page 1
|
||||
%% Original object ID: 3 0
|
||||
13 0 obj
|
||||
15 0 obj
|
||||
<<
|
||||
/Contents 14 0 R
|
||||
/Contents 16 0 R
|
||||
/MediaBox [
|
||||
0
|
||||
0
|
||||
612
|
||||
792
|
||||
]
|
||||
/Parent 12 0 R
|
||||
/Parent 14 0 R
|
||||
/Resources <<
|
||||
/Font <<
|
||||
/F1 16 0 R
|
||||
/F1 18 0 R
|
||||
>>
|
||||
/ProcSet 17 0 R
|
||||
/ProcSet 19 0 R
|
||||
>>
|
||||
/Type /Page
|
||||
>>
|
||||
|
@ -145,9 +165,9 @@ endobj
|
|||
|
||||
%% Contents for page 1
|
||||
%% Original object ID: 4 0
|
||||
14 0 obj
|
||||
16 0 obj
|
||||
<<
|
||||
/Length 15 0 R
|
||||
/Length 17 0 R
|
||||
>>
|
||||
stream
|
||||
BT
|
||||
|
@ -158,12 +178,12 @@ ET
|
|||
endstream
|
||||
endobj
|
||||
|
||||
15 0 obj
|
||||
17 0 obj
|
||||
44
|
||||
endobj
|
||||
|
||||
%% Original object ID: 6 0
|
||||
16 0 obj
|
||||
18 0 obj
|
||||
<<
|
||||
/BaseFont /Helvetica
|
||||
/Encoding /WinAnsiEncoding
|
||||
|
@ -174,7 +194,7 @@ endobj
|
|||
endobj
|
||||
|
||||
%% Original object ID: 5 0
|
||||
17 0 obj
|
||||
19 0 obj
|
||||
[
|
||||
/PDF
|
||||
/Text
|
||||
|
@ -182,7 +202,7 @@ endobj
|
|||
endobj
|
||||
|
||||
xref
|
||||
0 18
|
||||
0 20
|
||||
0000000000 65535 f
|
||||
0000000052 00000 n
|
||||
0000000134 00000 n
|
||||
|
@ -195,12 +215,14 @@ xref
|
|||
0000001151 00000 n
|
||||
0000001197 00000 n
|
||||
0000001310 00000 n
|
||||
0000001357 00000 n
|
||||
0000001468 00000 n
|
||||
0000001715 00000 n
|
||||
0000001816 00000 n
|
||||
0000001863 00000 n
|
||||
0000002009 00000 n
|
||||
0000001358 00000 n
|
||||
0000001518 00000 n
|
||||
0000001566 00000 n
|
||||
0000001677 00000 n
|
||||
0000001924 00000 n
|
||||
0000002025 00000 n
|
||||
0000002072 00000 n
|
||||
0000002218 00000 n
|
||||
trailer <<
|
||||
/QStreams [
|
||||
2 0 R
|
||||
|
@ -208,11 +230,12 @@ trailer <<
|
|||
6 0 R
|
||||
8 0 R
|
||||
10 0 R
|
||||
12 0 R
|
||||
]
|
||||
/Root 1 0 R
|
||||
/Size 18
|
||||
/Size 20
|
||||
/ID [<31415926535897932384626433832795><31415926535897932384626433832795>]
|
||||
>>
|
||||
startxref
|
||||
2045
|
||||
2254
|
||||
%%EOF
|
||||
|
|
|
@ -13,4 +13,4 @@ WARNING: issue-100.pdf (object 5 0, file position 489): attempting to recover st
|
|||
WARNING: issue-100.pdf (object 5 0, file position 489): recovered stream length: 12
|
||||
WARNING: issue-100.pdf (trailer, file position 953): expected dictionary key but found non-name object; inserting key /QPDFFake1
|
||||
WARNING: issue-100.pdf (trailer, file position 953): dictionary ended prematurely; using null as value for last key
|
||||
issue-100.pdf (file position 1138): unable to find /Root dictionary
|
||||
issue-100.pdf (file position 1144): unable to find /Root dictionary
|
||||
|
|
|
@ -16,7 +16,7 @@ WARNING: issue-101.pdf (trailer, file position 2613): /Length key in stream dict
|
|||
WARNING: issue-101.pdf (trailer, file position 2684): attempting to recover stream length
|
||||
WARNING: issue-101.pdf (trailer, file position 2684): recovered stream length: 74
|
||||
WARNING: issue-101.pdf (trailer, file position 2928): unknown token while reading object; treating as string
|
||||
WARNING: issue-101.pdf (trailer, file position 2929): unknown token while reading object; treating as string
|
||||
WARNING: issue-101.pdf (trailer, file position 2930): unknown token while reading object; treating as string
|
||||
WARNING: issue-101.pdf (trailer, file position 2928): expected dictionary key but found non-name object; inserting key /QPDFFake1
|
||||
WARNING: issue-101.pdf (trailer, file position 2928): expected dictionary key but found non-name object; inserting key /QPDFFake2
|
||||
WARNING: issue-101.pdf (trailer, file position 2928): expected dictionary key but found non-name object; inserting key /QPDFFake3
|
||||
|
@ -45,7 +45,7 @@ WARNING: issue-101.pdf (file position 696): unknown token while reading object;
|
|||
WARNING: issue-101.pdf (file position 698): unknown token while reading object; treating as string
|
||||
WARNING: issue-101.pdf (file position 701): unknown token while reading object; treating as string
|
||||
WARNING: issue-101.pdf (file position 711): unknown token while reading object; treating as string
|
||||
WARNING: issue-101.pdf (file position 742): unknown token while reading object; treating as string
|
||||
WARNING: issue-101.pdf (file position 743): unknown token while reading object; treating as string
|
||||
WARNING: issue-101.pdf (file position 745): unknown token while reading object; treating as string
|
||||
WARNING: issue-101.pdf (file position 747): unknown token while reading object; treating as string
|
||||
WARNING: issue-101.pdf (file position 777): unknown token while reading object; treating as string
|
||||
|
|
|
@ -2,4 +2,4 @@ WARNING: issue-146.pdf: file is damaged
|
|||
WARNING: issue-146.pdf: can't find startxref
|
||||
WARNING: issue-146.pdf: Attempting to reconstruct cross-reference table
|
||||
WARNING: issue-146.pdf (trailer, file position 20728): unknown token while reading object; treating as string
|
||||
issue-146.pdf (trailer, file position 20732): EOF while reading token
|
||||
issue-146.pdf (trailer, file position 20732): unexpected EOF
|
||||
|
|
|
@ -6,5 +6,5 @@ WARNING: issue-51.pdf (file position 70): loop detected resolving object 2 0
|
|||
WARNING: issue-51.pdf (object 2 0, file position 26): /Length key in stream dictionary is not an integer
|
||||
WARNING: issue-51.pdf (object 2 0, file position 71): attempting to recover stream length
|
||||
WARNING: issue-51.pdf (object 2 0, file position 71): unable to recover stream data; treating stream as empty
|
||||
WARNING: issue-51.pdf (object 2 0, file position 977): EOF while reading token
|
||||
WARNING: issue-51.pdf (object 2 0, file position 977): unexpected EOF
|
||||
qpdf: operation succeeded with warnings; resulting file may have some problems
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
WARNING: issue-99.pdf: file is damaged
|
||||
WARNING: issue-99.pdf (file position 3526): xref not found
|
||||
WARNING: issue-99.pdf: Attempting to reconstruct cross-reference table
|
||||
issue-99.pdf (file position 4793): unable to find /Root dictionary
|
||||
issue-99.pdf (file position 4798): unable to find /Root dictionary
|
||||
|
|
|
@ -2,4 +2,4 @@ WARNING: issue-99b.pdf: file is damaged
|
|||
WARNING: issue-99b.pdf (object 1 0, file position 9): object with ID 0
|
||||
WARNING: issue-99b.pdf: Attempting to reconstruct cross-reference table
|
||||
WARNING: issue-99b.pdf: object 1 0 not found in file after regenerating cross reference table
|
||||
issue-99b.pdf (file position 757): unable to find /Root dictionary
|
||||
issue-99b.pdf (file position 763): unable to find /Root dictionary
|
||||
|
|
|
@ -2,7 +2,7 @@ checking linearization-bounds-1.pdf
|
|||
PDF Version: 1.3
|
||||
File is not encrypted
|
||||
File is linearized
|
||||
WARNING: linearization-bounds-1.pdf (linearization hint stream: object 62 0, file position 1001182): EOF while reading token
|
||||
WARNING: linearization-bounds-1.pdf (linearization hint stream: object 62 0, file position 12302): unexpected EOF
|
||||
WARNING: linearization-bounds-1.pdf (linearization hint stream: object 62 0, file position 1183): attempting to recover stream length
|
||||
WARNING: linearization-bounds-1.pdf (linearization hint stream: object 62 0, file position 1183): recovered stream length: 106
|
||||
linearization-bounds-1.pdf (linearization hint table, file position 1183): /S (shared object) offset is out of bounds
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -13,7 +13,8 @@ static char const* whoami = 0;
|
|||
|
||||
void usage()
|
||||
{
|
||||
std::cerr << "Usage: " << whoami << " filename"
|
||||
std::cerr << "Usage: " << whoami
|
||||
<< " [-maxlen len | -no-ignorable] filename"
|
||||
<< std::endl;
|
||||
exit(2);
|
||||
}
|
||||
|
@ -83,6 +84,10 @@ static char const* tokenTypeName(QPDFTokenizer::token_type_e ttype)
|
|||
return "word";
|
||||
case QPDFTokenizer::tt_eof:
|
||||
return "eof";
|
||||
case QPDFTokenizer::tt_space:
|
||||
return "space";
|
||||
case QPDFTokenizer::tt_comment:
|
||||
return "comment";
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
@ -108,7 +113,8 @@ sanitize(std::string const& value)
|
|||
}
|
||||
|
||||
static void
|
||||
try_skipping(PointerHolder<InputSource> is, char const* what, Finder& f)
|
||||
try_skipping(QPDFTokenizer& tokenizer, PointerHolder<InputSource> is,
|
||||
size_t max_len, char const* what, Finder& f)
|
||||
{
|
||||
std::cout << "skipping to " << what << std::endl;
|
||||
qpdf_offset_t offset = is->tell();
|
||||
|
@ -121,6 +127,7 @@ try_skipping(PointerHolder<InputSource> is, char const* what, Finder& f)
|
|||
|
||||
static void
|
||||
dump_tokens(PointerHolder<InputSource> is, std::string const& label,
|
||||
size_t max_len, bool include_ignorable,
|
||||
bool skip_streams, bool skip_inline_images)
|
||||
{
|
||||
Finder f1(is, "endstream");
|
||||
|
@ -129,11 +136,16 @@ dump_tokens(PointerHolder<InputSource> is, std::string const& label,
|
|||
bool done = false;
|
||||
QPDFTokenizer tokenizer;
|
||||
tokenizer.allowEOF();
|
||||
if (include_ignorable)
|
||||
{
|
||||
tokenizer.includeIgnorable();
|
||||
}
|
||||
while (! done)
|
||||
{
|
||||
QPDFTokenizer::Token token = tokenizer.readToken(is, "test", true);
|
||||
QPDFTokenizer::Token token =
|
||||
tokenizer.readToken(is, "test", true, max_len);
|
||||
|
||||
qpdf_offset_t offset = is->tell() - token.getRawValue().length();
|
||||
qpdf_offset_t offset = is->getLastOffset();
|
||||
std::cout << offset << ": "
|
||||
<< tokenTypeName(token.getType());
|
||||
if (token.getType() != QPDFTokenizer::tt_eof)
|
||||
|
@ -153,12 +165,12 @@ dump_tokens(PointerHolder<InputSource> is, std::string const& label,
|
|||
if (skip_streams &&
|
||||
(token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "stream")))
|
||||
{
|
||||
try_skipping(is, "endstream", f1);
|
||||
try_skipping(tokenizer, is, max_len, "endstream", f1);
|
||||
}
|
||||
else if (skip_inline_images &&
|
||||
(token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "ID")))
|
||||
{
|
||||
try_skipping(is, "EI", f2);
|
||||
try_skipping(tokenizer, is, max_len, "EI", f2);
|
||||
}
|
||||
else if (token.getType() == QPDFTokenizer::tt_eof)
|
||||
{
|
||||
|
@ -168,17 +180,16 @@ dump_tokens(PointerHolder<InputSource> is, std::string const& label,
|
|||
std::cout << "--- END " << label << " ---" << std::endl;
|
||||
}
|
||||
|
||||
static void process(char const* filename)
|
||||
static void process(char const* filename, bool include_ignorable,
|
||||
size_t max_len)
|
||||
{
|
||||
PointerHolder<InputSource> is;
|
||||
QPDFTokenizer tokenizer;
|
||||
tokenizer.allowEOF();
|
||||
|
||||
// Tokenize file, skipping streams
|
||||
FileInputSource* fis = new FileInputSource();
|
||||
fis->setFilename(filename);
|
||||
is = fis;
|
||||
dump_tokens(is, "FILE", true, false);
|
||||
dump_tokens(is, "FILE", max_len, include_ignorable, true, false);
|
||||
|
||||
// Tokenize content streams, skipping inline images
|
||||
QPDF qpdf;
|
||||
|
@ -201,7 +212,8 @@ static void process(char const* filename)
|
|||
BufferInputSource* bis = new BufferInputSource(
|
||||
"content data", content_data.getPointer());
|
||||
is = bis;
|
||||
dump_tokens(is, "PAGE " + QUtil::int_to_string(pageno), false, true);
|
||||
dump_tokens(is, "PAGE " + QUtil::int_to_string(pageno),
|
||||
max_len, include_ignorable, false, true);
|
||||
}
|
||||
|
||||
// Tokenize object streams
|
||||
|
@ -220,7 +232,7 @@ static void process(char const* filename)
|
|||
is = bis;
|
||||
dump_tokens(is, "OBJECT STREAM " +
|
||||
QUtil::int_to_string((*iter).getObjectID()),
|
||||
false, false);
|
||||
max_len, include_ignorable, false, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -242,15 +254,47 @@ int main(int argc, char* argv[])
|
|||
whoami += 3;
|
||||
}
|
||||
|
||||
if (argc != 2)
|
||||
char const* filename = 0;
|
||||
size_t max_len = 0;
|
||||
bool include_ignorable = true;
|
||||
for (int i = 1; i < argc; ++i)
|
||||
{
|
||||
if (argv[i][0] == '-')
|
||||
{
|
||||
if (strcmp(argv[i], "-maxlen") == 0)
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
usage();
|
||||
}
|
||||
max_len = QUtil::string_to_int(argv[i]);
|
||||
}
|
||||
else if (strcmp(argv[i], "-no-ignorable") == 0)
|
||||
{
|
||||
include_ignorable = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
usage();
|
||||
}
|
||||
}
|
||||
else if (filename)
|
||||
{
|
||||
usage();
|
||||
}
|
||||
else
|
||||
{
|
||||
filename = argv[i];
|
||||
}
|
||||
}
|
||||
if (filename == 0)
|
||||
{
|
||||
usage();
|
||||
}
|
||||
|
||||
char const* filename = argv[1];
|
||||
try
|
||||
{
|
||||
process(filename);
|
||||
process(filename, include_ignorable, max_len);
|
||||
}
|
||||
catch (std::exception& e)
|
||||
{
|
||||
|
|
Loading…
Reference in New Issue