Handle more broken files

Tolerate a space rather than a newline after the xref keyword, and a
missing /ID in the trailer of an encrypted file.  This enables qpdf to
handle some files that xpdf can handle but that Adobe Reader may not.
This commit is contained in:
Jay Berkenbilt 2013-06-15 12:38:25 -04:00
parent a1d5a3e916
commit a85007cb0d
7 changed files with 59 additions and 11 deletions

View File

@ -1,3 +1,8 @@
2013-06-15 Jay Berkenbilt <ejb@ql.org>
* Handle some additional broken files with missing /ID in trailer
for encrypted files and with space rather than newline after xref.
2013-06-14 Jay Berkenbilt <ejb@ql.org>
* Detect and correct /Outlines dictionary being a direct object

View File

@ -430,11 +430,22 @@ QPDF::read_xref(qpdf_offset_t xref_offset)
std::map<int, int> free_table;
while (xref_offset)
{
char buf[7];
memset(buf, 0, sizeof(buf));
this->file->seek(xref_offset, SEEK_SET);
std::string line = this->file->readLine(50);
if (line == "xref")
this->file->read(buf, sizeof(buf) - 1);
// The PDF spec says xref must be followed by a line
// terminator, but files exist in the wild where it is
// terminated by arbitrary whitespace.
PCRE xref_re("^xref\\s+");
PCRE::Match m = xref_re.match(buf);
if (m)
{
xref_offset = read_xrefTable(this->file->tell());
QTC::TC("qpdf", "QPDF xref space",
((buf[4] == '\n') ? 0 :
(buf[4] == '\r') ? 1 :
(buf[4] == ' ') ? 2 : 9999));
xref_offset = read_xrefTable(xref_offset + m.getMatch(0).length());
}
else
{

View File

@ -791,17 +791,24 @@ QPDF::initializeEncryption()
// encryption dictionary.
this->encrypted = true;
std::string id1;
QPDFObjectHandle id_obj = this->trailer.getKey("/ID");
if (! (id_obj.isArray() &&
(id_obj.getArrayNItems() == 2) &&
id_obj.getArrayItem(0).isString()))
if ((id_obj.isArray() &&
(id_obj.getArrayNItems() == 2) &&
id_obj.getArrayItem(0).isString()))
{
throw QPDFExc(qpdf_e_damaged_pdf, this->file->getName(),
"trailer", this->file->getLastOffset(),
"invalid /ID in trailer dictionary");
id1 = id_obj.getArrayItem(0).getStringValue();
}
else
{
// Treating a missing ID as the empty string enables qpdf to
// decrypt some invalid encrypted files with no /ID that
// poppler can read but Adobe Reader can't.
warn(QPDFExc(qpdf_e_damaged_pdf, this->file->getName(),
"trailer", this->file->getLastOffset(),
"invalid /ID in trailer dictionary"));
}
std::string id1 = id_obj.getArrayItem(0).getStringValue();
QPDFObjectHandle encryption_dict = this->trailer.getKey("/Encrypt");
if (! encryption_dict.isDictionary())
{

View File

@ -264,3 +264,4 @@ QPDFObjectHandle inline image token 0
QPDF not caching overridden objstm object 0
QPDFWriter original obj non-zero gen 0
QPDF_optimization indirect outlines 0
QPDF xref space 2

View File

@ -199,7 +199,7 @@ $td->runtest("remove page we don't have",
show_ntests();
# ----------
$td->notify("--- Miscellaneous Tests ---");
$n_tests += 64;
$n_tests += 65;
$td->runtest("qpdf version",
{$td->COMMAND => "qpdf --version"},
@ -509,6 +509,14 @@ $td->runtest("check file",
{$td->FILE => "a.pdf"},
{$td->FILE => "gen1.qdf"});
# This file, from a user, is encrypted but is missing /ID in its
# trailer, and it also has a space instead of a newline after its
# xref keyword. xpdf can open it, but Adobe Reader can't.
$td->runtest("check broken file",
{$td->COMMAND => "qpdf --check invalid-id-xref.pdf"},
{$td->FILE => "invalid-id-xref.out", $td->EXIT_STATUS => 3},
$td->NORMALIZE_NEWLINES);
show_ntests();
# ----------
$td->notify("--- Numeric range parsing tests ---");

View File

@ -0,0 +1,16 @@
WARNING: invalid-id-xref.pdf (trailer, file position 2493795): invalid /ID in trailer dictionary
checking invalid-id-xref.pdf
PDF Version: 1.1
R = 3
P = -1804
User password =
extract for accessibility: not allowed
extract for any purpose: allowed
print low resolution: allowed
print high resolution: allowed
modify document assembly: not allowed
modify forms: not allowed
modify annotations: allowed
modify other: not allowed
modify anything: not allowed
File is not linearized

Binary file not shown.