From 30f109e244f365111d5219903f13d64cf1a95054 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Thu, 10 Aug 2017 19:37:05 -0400 Subject: [PATCH] Read xref table without PCRE Also accept more errors than before. --- ChangeLog | 3 + include/qpdf/QPDF.hh | 4 + libqpdf/QPDF.cc | 180 +++++++++++++++++++++++++++++--- qpdf/qpdf.testcov | 4 + qpdf/qtest/qpdf.test | 9 +- qpdf/qtest/qpdf/xref-errors.out | 15 +++ qpdf/qtest/qpdf/xref-errors.pdf | 79 ++++++++++++++ 7 files changed, 276 insertions(+), 18 deletions(-) create mode 100644 qpdf/qtest/qpdf/xref-errors.out create mode 100644 qpdf/qtest/qpdf/xref-errors.pdf diff --git a/ChangeLog b/ChangeLog index 6b7454eb..267ab05f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,8 @@ 2017-08-10 Jay Berkenbilt + * Be more forgiving of certain types of errors in the xref table + that don't interfere with interpreting the table. + * Remove unused "tracing" parameter from PointerHolder's (T*, bool) constructor. This change breaks source code compatibility, but since this argument to PointerHolder has not diff --git a/include/qpdf/QPDF.hh b/include/qpdf/QPDF.hh index 92a66a34..004e4b9c 100644 --- a/include/qpdf/QPDF.hh +++ b/include/qpdf/QPDF.hh @@ -652,6 +652,10 @@ class QPDF void setTrailer(QPDFObjectHandle obj); void read_xref(qpdf_offset_t offset); void reconstruct_xref(QPDFExc& e); + bool parse_xrefFirst(std::string const& line, + int& obj, int& num, int& bytes); + bool parse_xrefEntry(std::string const& line, + qpdf_offset_t& f1, int& f2, char& type); qpdf_offset_t read_xrefTable(qpdf_offset_t offset); qpdf_offset_t read_xrefStream(qpdf_offset_t offset); qpdf_offset_t processXRefStream( diff --git a/libqpdf/QPDF.cc b/libqpdf/QPDF.cc index 25ef0dfd..fc0c103f 100644 --- a/libqpdf/QPDF.cc +++ b/libqpdf/QPDF.cc @@ -9,7 +9,6 @@ #include #include -#include #include #include #include @@ -537,12 +536,162 @@ QPDF::read_xref(qpdf_offset_t xref_offset) this->deleted_objects.clear(); } +bool +QPDF::parse_xrefFirst(std::string const& line, + int& obj, int& num, int& bytes) +{ + // is_space and is_digit both return false on '\0', so this will + // not overrun the null-terminated buffer. + char const* p = line.c_str(); + char const* start = line.c_str(); + + // Skip zero or more spaces + while (QUtil::is_space(*p)) + { + ++p; + } + // Require digit + if (! QUtil::is_digit(*p)) + { + return false; + } + // Gather digits + std::string obj_str; + while (QUtil::is_digit(*p)) + { + obj_str.append(1, *p++); + } + // Require space + if (! QUtil::is_space(*p)) + { + return false; + } + // Skip spaces + while (QUtil::is_space(*p)) + { + ++p; + } + // Require digit + if (! QUtil::is_digit(*p)) + { + return false; + } + // Gather digits + std::string num_str; + while (QUtil::is_digit(*p)) + { + num_str.append(1, *p++); + } + // Skip any space including line terminators + while (QUtil::is_space(*p)) + { + ++p; + } + bytes = p - start; + obj = atoi(obj_str.c_str()); + num = atoi(num_str.c_str()); + return true; +} + +bool +QPDF::parse_xrefEntry(std::string const& line, + qpdf_offset_t& f1, int& f2, char& type) +{ + // is_space and is_digit both return false on '\0', so this will + // not overrun the null-terminated buffer. + char const* p = line.c_str(); + + // Skip zero or more spaces. There aren't supposed to be any. + bool invalid = false; + while (QUtil::is_space(*p)) + { + ++p; + QTC::TC("qpdf", "QPDF ignore first space in xref entry"); + invalid = true; + } + // Require digit + if (! QUtil::is_digit(*p)) + { + return false; + } + // Gather digits + std::string f1_str; + while (QUtil::is_digit(*p)) + { + f1_str.append(1, *p++); + } + // Require space + if (! QUtil::is_space(*p)) + { + return false; + } + if (QUtil::is_space(*(p+1))) + { + QTC::TC("qpdf", "QPDF ignore first extra space in xref entry"); + invalid = true; + } + // Skip spaces + while (QUtil::is_space(*p)) + { + ++p; + } + // Require digit + if (! QUtil::is_digit(*p)) + { + return false; + } + // Gather digits + std::string f2_str; + while (QUtil::is_digit(*p)) + { + f2_str.append(1, *p++); + } + // Require space + if (! QUtil::is_space(*p)) + { + return false; + } + if (QUtil::is_space(*(p+1))) + { + QTC::TC("qpdf", "QPDF ignore second extra space in xref entry"); + invalid = true; + } + // Skip spaces + while (QUtil::is_space(*p)) + { + ++p; + } + if ((*p == 'f') || (*p == 'n')) + { + type = *p; + } + else + { + return false; + } + if ((f1_str.length() != 10) || (f2_str.length() != 5)) + { + QTC::TC("qpdf", "QPDF ignore length error xref entry"); + invalid = true; + } + + if (invalid) + { + warn(QPDFExc(qpdf_e_damaged_pdf, this->file->getName(), + "xref table", + this->file->getLastOffset(), + "accepting invalid xref table entry")); + } + + f1 = QUtil::string_to_ll(f1_str.c_str()); + f2 = atoi(f2_str.c_str()); + + return true; +} + qpdf_offset_t QPDF::read_xrefTable(qpdf_offset_t xref_offset) { - PCRE xref_first_re("^\\s*(\\d+)\\s+(\\d+)\\s*"); - PCRE xref_entry_re("(?s:(^\\d{10}) (\\d{5}) ([fn])\\s*$)"); - std::vector deleted_items; this->file->seek(xref_offset, SEEK_SET); @@ -553,18 +702,17 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset) memset(linebuf, 0, sizeof(linebuf)); this->file->read(linebuf, sizeof(linebuf) - 1); std::string line = linebuf; - PCRE::Match m1 = xref_first_re.match(line.c_str()); - if (! m1) + int obj = 0; + int num = 0; + int bytes = 0; + if (! parse_xrefFirst(line, obj, num, bytes)) { QTC::TC("qpdf", "QPDF invalid xref"); throw QPDFExc(qpdf_e_damaged_pdf, this->file->getName(), "xref table", this->file->getLastOffset(), "xref syntax invalid"); } - file->seek(this->file->getLastOffset() + m1.getMatch(0).length(), - SEEK_SET); - int obj = atoi(m1.getMatch(1).c_str()); - int num = atoi(m1.getMatch(2).c_str()); + this->file->seek(this->file->getLastOffset() + bytes, SEEK_SET); for (int i = obj; i < obj + num; ++i) { if (i == 0) @@ -573,8 +721,11 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset) this->first_xref_item_offset = this->file->tell(); } std::string xref_entry = this->file->readLine(30); - PCRE::Match m2 = xref_entry_re.match(xref_entry.c_str()); - if (! m2) + // For xref_table, these will always be small enough to be ints + qpdf_offset_t f1 = 0; + int f2 = 0; + char type = '\0'; + if (! parse_xrefEntry(xref_entry, f1, f2, type)) { QTC::TC("qpdf", "QPDF invalid xref entry"); throw QPDFExc( @@ -583,11 +734,6 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset) "invalid xref entry (obj=" + QUtil::int_to_string(i) + ")"); } - - // For xref_table, these will always be small enough to be ints - qpdf_offset_t f1 = QUtil::string_to_ll(m2.getMatch(1).c_str()); - int f2 = atoi(m2.getMatch(2).c_str()); - char type = m2.getMatch(3).at(0); if (type == 'f') { // Save deleted items until after we've checked the diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov index 2860f55e..c08ed721 100644 --- a/qpdf/qpdf.testcov +++ b/qpdf/qpdf.testcov @@ -289,3 +289,7 @@ qpdf single-pages %d 0 qpdf single-pages .pdf 0 qpdf single-pages other 0 QPDFTokenizer allowing bad token 0 +QPDF ignore first space in xref entry 0 +QPDF ignore first extra space in xref entry 0 +QPDF ignore second extra space in xref entry 0 +QPDF ignore length error xref entry 0 diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test index b3d13bde..d0cba589 100644 --- a/qpdf/qtest/qpdf.test +++ b/qpdf/qtest/qpdf.test @@ -232,7 +232,7 @@ foreach my $d (@bug_tests) show_ntests(); # ---------- $td->notify("--- Miscellaneous Tests ---"); -$n_tests += 86; +$n_tests += 87; $td->runtest("qpdf version", {$td->COMMAND => "qpdf --version"}, @@ -669,6 +669,13 @@ $td->runtest("ignore bad token", $td->EXIT_STATUS => 0}, $td->NORMALIZE_NEWLINES); +$td->runtest("recoverable xref errors", + {$td->COMMAND => + "qpdf --check --show-xref xref-errors.pdf"}, + {$td->FILE => "xref-errors.out", + $td->EXIT_STATUS => 3}, + $td->NORMALIZE_NEWLINES); + show_ntests(); # ---------- $td->notify("--- Single Page ---"); diff --git a/qpdf/qtest/qpdf/xref-errors.out b/qpdf/qtest/qpdf/xref-errors.out new file mode 100644 index 00000000..7a2cf384 --- /dev/null +++ b/qpdf/qtest/qpdf/xref-errors.out @@ -0,0 +1,15 @@ +WARNING: xref-errors.pdf (xref table, file position 585): accepting invalid xref table entry +WARNING: xref-errors.pdf (xref table, file position 606): accepting invalid xref table entry +WARNING: xref-errors.pdf (xref table, file position 627): accepting invalid xref table entry +WARNING: xref-errors.pdf (xref table, file position 648): accepting invalid xref table entry +WARNING: xref-errors.pdf (xref table, file position 667): accepting invalid xref table entry +checking xref-errors.pdf +PDF Version: 1.3 +File is not encrypted +File is not linearized +1/0: uncompressed; offset = 9 +2/0: uncompressed; offset = 63 +3/0: uncompressed; offset = 135 +4/0: uncompressed; offset = 307 +5/0: uncompressed; offset = 403 +6/0: uncompressed; offset = 438 diff --git a/qpdf/qtest/qpdf/xref-errors.pdf b/qpdf/qtest/qpdf/xref-errors.pdf new file mode 100644 index 00000000..1778ab92 --- /dev/null +++ b/qpdf/qtest/qpdf/xref-errors.pdf @@ -0,0 +1,79 @@ +%PDF-1.3 +1 0 obj +<< + /Type /Catalog + /Pages 2 0 R +>> +endobj + +2 0 obj +<< + /Type /Pages + /Kids [ + 3 0 R + ] + /Count 1 +>> +endobj + +3 0 obj +<< + /Type /Page + /Parent 2 0 R + /MediaBox [0 0 612 792] + /Contents 4 0 R + /Resources << + /ProcSet 5 0 R + /Font << + /F1 6 0 R + >> + >> +>> +endobj + +4 0 obj +<< + /Length 44 +>> +stream +BT + /F1 24 Tf + 72 720 Td + (Potato) Tj +ET +endstream +endobj + +5 0 obj +[ + /PDF + /Text +] +endobj + +6 0 obj +<< + /Type /Font + /Subtype /Type1 + /Name /F1 + /BaseFont /Helvetica + /Encoding /WinAnsiEncoding +>> +endobj + +xref +0 7 +0000000000 65535 f + 0000000009 00000 n +0000000063 00000 n +0000000135 00000 n +000000307 00000 n +0000000403 0000 n +0000000438 00000 n +trailer << + /Size 7 + /Root 1 0 R +>> +startxref +556 +%%EOF