Find PDF header anywhere in the first 1024 bytes

This commit is contained in:
Jay Berkenbilt 2012-12-25 14:38:18 -05:00
parent bcfc9847be
commit 7f84239cad
10 changed files with 137 additions and 13 deletions

View File

@ -1,3 +1,9 @@
2012-12-25 Jay Berkenbilt <ejb@ql.org>
* Allow PDF header to appear anywhere in the first 1024 bytes of
the file as recommended in the implementation notes of the Adobe
version of the PDF spec.
2012-11-20 Jay Berkenbilt <ejb@ql.org>
* Add zlib and libpcre to Requires.private in the pkg-config file

9
TODO
View File

@ -1,12 +1,3 @@
Next
====
* Find PDF header in the first 1024 bytes of the file. Treat the
location of the PDF header as offset 0 for purposes of resolving
explicit file locations as this is what other implementations
appear to do.
General
=======

View File

@ -0,0 +1,61 @@
#include <qpdf/OffsetInputSource.hh>

#include <stdexcept>
// Construct a proxy around the given input source. The supplied
// global_offset is stored as-is; the position-related methods of this
// class (tell, seek with SEEK_SET, findAndSkipNextEOL) use it to
// translate between this source's offsets and the proxied source's.
OffsetInputSource::OffsetInputSource(
    PointerHolder<InputSource> proxied,
    qpdf_offset_t global_offset) :
    proxied(proxied),
    global_offset(global_offset)
{
    // Nothing else to set up: all state lives in the two members.
}
// Nothing is released explicitly here; the members are destroyed by
// their own destructors.
OffsetInputSource::~OffsetInputSource()
{
}
// Delegate to the proxied source, then express the position it
// returns relative to this source's origin by subtracting
// global_offset.
qpdf_offset_t
OffsetInputSource::findAndSkipNextEOL()
{
    qpdf_offset_t const underlying_result =
        proxied->findAndSkipNextEOL();
    return underlying_result - global_offset;
}
std::string const&
OffsetInputSource::getName() const
{
return this->proxied->getName();
}
// Report the current position relative to this source's origin: the
// proxied source's position minus global_offset.
qpdf_offset_t
OffsetInputSource::tell()
{
    qpdf_offset_t const underlying_pos = proxied->tell();
    return underlying_pos - global_offset;
}
// Reposition the proxied source.
//
// For SEEK_SET the requested offset is relative to this source's
// origin, so global_offset is added before delegating. For any other
// whence value the offset is passed through unchanged.
//
// Throws std::runtime_error if the resulting position, as reported by
// tell(), is negative -- i.e. the seek landed before this source's
// origin, inside the bytes that this source is meant to hide. Without
// this check such a seek would silently succeed and subsequent reads
// would return data from before offset 0.
void
OffsetInputSource::seek(qpdf_offset_t offset, int whence)
{
    if (whence == SEEK_SET)
    {
        this->proxied->seek(offset + this->global_offset, whence);
    }
    else
    {
        this->proxied->seek(offset, whence);
    }

    // tell() already subtracts global_offset, so a negative value
    // means we are positioned before the logical start.
    if (tell() < 0)
    {
        throw std::runtime_error(
            "offset input source: seek before beginning of file");
    }
}
void
OffsetInputSource::rewind()
{
seek(0, SEEK_SET);
}
size_t
OffsetInputSource::read(char* buffer, size_t length)
{
return this->proxied->read(buffer, length);
}
void
OffsetInputSource::unreadCh(char ch)
{
this->proxied->unreadCh(ch);
}

View File

@ -13,6 +13,7 @@
#include <qpdf/Pl_Discard.hh>
#include <qpdf/FileInputSource.hh>
#include <qpdf/BufferInputSource.hh>
#include <qpdf/OffsetInputSource.hh>
#include <qpdf/QPDFExc.hh>
#include <qpdf/QPDF_Null.hh>
@ -213,7 +214,7 @@ QPDF::getWarnings()
void
QPDF::parse(char const* password)
{
PCRE header_re("^%PDF-(1.\\d+)\\b");
PCRE header_re("\\A((?s).*?)%PDF-(1.\\d+)\\b");
PCRE eof_re("(?s:startxref\\s+(\\d+)\\s+%%EOF\\b)");
if (password)
@ -221,11 +222,24 @@ QPDF::parse(char const* password)
this->provided_password = password;
}
std::string line = this->file->readLine(20);
// Find the header anywhere in the first 1024 bytes of the file.
char buffer[1044];
this->file->read(buffer, sizeof(buffer));
std::string line(buffer);
PCRE::Match m1 = header_re.match(line.c_str());
if (m1)
{
this->pdf_version = m1.getMatch(1);
size_t global_offset = m1.getMatch(1).length();
if (global_offset != 0)
{
// Empirical evidence strongly suggests that when there is
// leading material prior to the PDF header, all explicit
// offsets in the file are such that 0 points to the
// beginning of the header.
QTC::TC("qpdf", "QPDF global offset");
this->file = new OffsetInputSource(this->file, global_offset);
}
this->pdf_version = m1.getMatch(2);
if (atof(this->pdf_version.c_str()) < 1.2)
{
this->tokenizer.allowPoundAnywhereInName();

View File

@ -12,6 +12,7 @@ SRCS_libqpdf = \
libqpdf/FileInputSource.cc \
libqpdf/InputSource.cc \
libqpdf/MD5.cc \
libqpdf/OffsetInputSource.cc \
libqpdf/PCRE.cc \
libqpdf/Pipeline.cc \
libqpdf/Pl_AES_PDF.cc \

View File

@ -0,0 +1,29 @@
#ifndef __QPDF_OFFSETINPUTSOURCE_HH__
#define __QPDF_OFFSETINPUTSOURCE_HH__
// This class implements an InputSource that proxies for an underlying
// input source, with positions shifted by a specific number of bytes.
#include <qpdf/InputSource.hh>
#include <qpdf/PointerHolder.hh>
// InputSource implementation that forwards every operation to another
// InputSource while translating positions by a fixed global_offset:
// positions reported to the caller have global_offset subtracted, and
// absolute (SEEK_SET) seek targets have it added.
class OffsetInputSource: public InputSource
{
public:
// First argument: the input source to proxy for.
// global_offset: number of bytes by which positions are shifted.
OffsetInputSource(PointerHolder<InputSource>, qpdf_offset_t global_offset);
virtual ~OffsetInputSource();
// Calls the proxied source's findAndSkipNextEOL() and returns its
// result minus global_offset.
virtual qpdf_offset_t findAndSkipNextEOL();
// Returns the proxied source's name unchanged.
virtual std::string const& getName() const;
// Returns the proxied source's position minus global_offset.
virtual qpdf_offset_t tell();
// With whence == SEEK_SET, seeks the proxied source to
// offset + global_offset; with any other whence, passes offset
// through unchanged.
virtual void seek(qpdf_offset_t offset, int whence);
// Equivalent to seek(0, SEEK_SET).
virtual void rewind();
// Forwards to the proxied source's read() and returns its result.
virtual size_t read(char* buffer, size_t length);
// Forwards to the proxied source's unreadCh().
virtual void unreadCh(char ch);
private:
// The underlying input source that all calls are forwarded to.
PointerHolder<InputSource> proxied;
// Amount subtracted from reported positions and added to SEEK_SET
// seek targets.
qpdf_offset_t global_offset;
};
#endif // __QPDF_OFFSETINPUTSOURCE_HH__

View File

@ -243,3 +243,4 @@ QPDF_Tokenizer EOF reading appendable token 0
QPDFWriter extra header text no newline 0
QPDFWriter extra header text add newline 0
QPDF bogus 0 offset 0
QPDF global offset 0

View File

@ -149,7 +149,7 @@ $td->runtest("remove page we don't have",
$td->NORMALIZE_NEWLINES);
# ----------
$td->notify("--- Miscellaneous Tests ---");
$n_tests += 56;
$n_tests += 57;
$td->runtest("qpdf version",
{$td->COMMAND => "qpdf --version"},
@ -414,6 +414,10 @@ $td->runtest("object with zero offset",
{$td->COMMAND => "qpdf --check zero-offset.pdf"},
{$td->FILE => "zero-offset.out", $td->EXIT_STATUS => 3},
$td->NORMALIZE_NEWLINES);
$td->runtest("check file with leading junk",
{$td->COMMAND => "qpdf --check leading-junk.pdf"},
{$td->FILE => "leading-junk.out", $td->EXIT_STATUS => 0},
$td->NORMALIZE_NEWLINES);
show_ntests();
# ----------

View File

@ -0,0 +1,17 @@
checking leading-junk.pdf
PDF Version: 1.4
R = 3
P = -4
User password =
extract for accessibility: allowed
extract for any purpose: allowed
print low resolution: allowed
print high resolution: allowed
modify document assembly: allowed
modify forms: allowed
modify annotations: allowed
modify other: allowed
modify anything: allowed
File is linearized
No syntax or stream encoding errors found; the file may still contain
errors that qpdf cannot detect

Binary file not shown.