From 7f84239cad2ec58166245394e56a4647085e025e Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Tue, 25 Dec 2012 14:38:18 -0500 Subject: [PATCH] Find PDF header anywhere in the first 1024 bytes --- ChangeLog | 6 +++ TODO | 9 ----- libqpdf/OffsetInputSource.cc | 61 ++++++++++++++++++++++++++++++ libqpdf/QPDF.cc | 20 ++++++++-- libqpdf/build.mk | 1 + libqpdf/qpdf/OffsetInputSource.hh | 29 ++++++++++++++ qpdf/qpdf.testcov | 1 + qpdf/qtest/qpdf.test | 6 ++- qpdf/qtest/qpdf/leading-junk.out | 17 +++++++++ qpdf/qtest/qpdf/leading-junk.pdf | Bin 0 -> 13670 bytes 10 files changed, 137 insertions(+), 13 deletions(-) create mode 100644 libqpdf/OffsetInputSource.cc create mode 100644 libqpdf/qpdf/OffsetInputSource.hh create mode 100644 qpdf/qtest/qpdf/leading-junk.out create mode 100644 qpdf/qtest/qpdf/leading-junk.pdf diff --git a/ChangeLog b/ChangeLog index 88f57f30..a06ffdcf 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +2012-12-25 Jay Berkenbilt + + * Allow PDF header to appear anywhere in the first 1024 bytes of + the file as recommended in the implementation notes of the Adobe + version of the PDF spec. + 2012-11-20 Jay Berkenbilt * Add zlib and libpcre to Requires.private in the pkg-config file diff --git a/TODO b/TODO index 73ffc087..c8e0ceca 100644 --- a/TODO +++ b/TODO @@ -1,12 +1,3 @@ -Next -==== - - * Find PDF header in the first 1024 bytes of the file. Treat the - location of the PDF header as offset 0 for purposes of resolving - explicit file locations as this is what other implementations - appear to do. 
- - General ======= diff --git a/libqpdf/OffsetInputSource.cc b/libqpdf/OffsetInputSource.cc new file mode 100644 index 00000000..c1ec4102 --- /dev/null +++ b/libqpdf/OffsetInputSource.cc @@ -0,0 +1,61 @@ +#include + +OffsetInputSource::OffsetInputSource(PointerHolder proxied, + qpdf_offset_t global_offset) : + proxied(proxied), + global_offset(global_offset) +{ +} + +OffsetInputSource::~OffsetInputSource() +{ +} + +qpdf_offset_t +OffsetInputSource::findAndSkipNextEOL() +{ + return this->proxied->findAndSkipNextEOL() - this->global_offset; +} + +std::string const& +OffsetInputSource::getName() const +{ + return this->proxied->getName(); +} + +qpdf_offset_t +OffsetInputSource::tell() +{ + return this->proxied->tell() - this->global_offset; +} + +void +OffsetInputSource::seek(qpdf_offset_t offset, int whence) +{ + if (whence == SEEK_SET) + { + this->proxied->seek(offset + global_offset, whence); + } + else + { + this->proxied->seek(offset, whence); + } +} + +void +OffsetInputSource::rewind() +{ + seek(0, SEEK_SET); +} + +size_t +OffsetInputSource::read(char* buffer, size_t length) +{ + return this->proxied->read(buffer, length); +} + +void +OffsetInputSource::unreadCh(char ch) +{ + this->proxied->unreadCh(ch); +} diff --git a/libqpdf/QPDF.cc b/libqpdf/QPDF.cc index ccbfaf7c..ba96cb64 100644 --- a/libqpdf/QPDF.cc +++ b/libqpdf/QPDF.cc @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -213,7 +214,7 @@ QPDF::getWarnings() void QPDF::parse(char const* password) { - PCRE header_re("^%PDF-(1.\\d+)\\b"); + PCRE header_re("\\A((?s).*?)%PDF-(1.\\d+)\\b"); PCRE eof_re("(?s:startxref\\s+(\\d+)\\s+%%EOF\\b)"); if (password) @@ -221,11 +222,24 @@ QPDF::parse(char const* password) this->provided_password = password; } - std::string line = this->file->readLine(20); + // Find the header anywhere in the first 1024 bytes of the file. 
+ char buffer[1044]; + size_t header_bytes_read = this->file->read(buffer, sizeof(buffer)); + std::string line(buffer, header_bytes_read); PCRE::Match m1 = header_re.match(line.c_str()); if (m1) { - this->pdf_version = m1.getMatch(1); + size_t global_offset = m1.getMatch(1).length(); + if (global_offset != 0) + { + // Empirical evidence strongly suggests that when there is + // leading material prior to the PDF header, all explicit + // offsets in the file are such that 0 points to the + // beginning of the header. + QTC::TC("qpdf", "QPDF global offset"); + this->file = new OffsetInputSource(this->file, global_offset); + } + this->pdf_version = m1.getMatch(2); if (atof(this->pdf_version.c_str()) < 1.2) { this->tokenizer.allowPoundAnywhereInName(); diff --git a/libqpdf/build.mk b/libqpdf/build.mk index 6debf107..0ad96a2d 100644 --- a/libqpdf/build.mk +++ b/libqpdf/build.mk @@ -12,6 +12,7 @@ SRCS_libqpdf = \ libqpdf/FileInputSource.cc \ libqpdf/InputSource.cc \ libqpdf/MD5.cc \ + libqpdf/OffsetInputSource.cc \ libqpdf/PCRE.cc \ libqpdf/Pipeline.cc \ libqpdf/Pl_AES_PDF.cc \ diff --git a/libqpdf/qpdf/OffsetInputSource.hh b/libqpdf/qpdf/OffsetInputSource.hh new file mode 100644 index 00000000..aedc574a --- /dev/null +++ b/libqpdf/qpdf/OffsetInputSource.hh @@ -0,0 +1,29 @@ +#ifndef __QPDF_OFFSETINPUTSOURCE_HH__ +#define __QPDF_OFFSETINPUTSOURCE_HH__ + +// This class implements an InputSource that proxies for an underlying +// input source but offset by a specific number of bytes. 
+ +#include +#include + +class OffsetInputSource: public InputSource +{ + public: + OffsetInputSource(PointerHolder, qpdf_offset_t global_offset); + virtual ~OffsetInputSource(); + + virtual qpdf_offset_t findAndSkipNextEOL(); + virtual std::string const& getName() const; + virtual qpdf_offset_t tell(); + virtual void seek(qpdf_offset_t offset, int whence); + virtual void rewind(); + virtual size_t read(char* buffer, size_t length); + virtual void unreadCh(char ch); + + private: + PointerHolder proxied; + qpdf_offset_t global_offset; +}; + +#endif // __QPDF_OFFSETINPUTSOURCE_HH__ diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov index 3458297a..937d2b0c 100644 --- a/qpdf/qpdf.testcov +++ b/qpdf/qpdf.testcov @@ -243,3 +243,4 @@ QPDF_Tokenizer EOF reading appendable token 0 QPDFWriter extra header text no newline 0 QPDFWriter extra header text add newline 0 QPDF bogus 0 offset 0 +QPDF global offset 0 diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test index 16af5832..35645466 100644 --- a/qpdf/qtest/qpdf.test +++ b/qpdf/qtest/qpdf.test @@ -149,7 +149,7 @@ $td->runtest("remove page we don't have", $td->NORMALIZE_NEWLINES); # ---------- $td->notify("--- Miscellaneous Tests ---"); -$n_tests += 56; +$n_tests += 57; $td->runtest("qpdf version", {$td->COMMAND => "qpdf --version"}, @@ -414,6 +414,10 @@ $td->runtest("object with zero offset", {$td->COMMAND => "qpdf --check zero-offset.pdf"}, {$td->FILE => "zero-offset.out", $td->EXIT_STATUS => 3}, $td->NORMALIZE_NEWLINES); +$td->runtest("check file with leading junk", + {$td->COMMAND => "qpdf --check leading-junk.pdf"}, + {$td->FILE => "leading-junk.out", $td->EXIT_STATUS => 0}, + $td->NORMALIZE_NEWLINES); show_ntests(); # ---------- diff --git a/qpdf/qtest/qpdf/leading-junk.out b/qpdf/qtest/qpdf/leading-junk.out new file mode 100644 index 00000000..58847c9d --- /dev/null +++ b/qpdf/qtest/qpdf/leading-junk.out @@ -0,0 +1,17 @@ +checking leading-junk.pdf +PDF Version: 1.4 +R = 3 +P = -4 +User password = +extract for 
accessibility: allowed +extract for any purpose: allowed +print low resolution: allowed +print high resolution: allowed +modify document assembly: allowed +modify forms: allowed +modify annotations: allowed +modify other: allowed +modify anything: allowed +File is linearized +No syntax or stream encoding errors found; the file may still contain +errors that qpdf cannot detect diff --git a/qpdf/qtest/qpdf/leading-junk.pdf b/qpdf/qtest/qpdf/leading-junk.pdf new file mode 100644 index 0000000000000000000000000000000000000000..2b2a0a2cacac9a8d7c1455a9f1e8005323115d95 GIT binary patch literal 13670 zcmc&*2Y3|Kw?_yd0V&c1A&lW4AYjPc=>wq!5_%F6ih)q3hCs55n++v^QZy7PY6KJz zVpRSVsS1dIp-W5r(^R_hcodNq5D*Bwb7yD6W@ld{e(!s`-DK?w#2) zshXsw*pi768Mcvufgy2SqQg*YCNL!Le(sY%jwDFJE{_O|h#yY1cD<1 z(HzSW;oXQq@R1b=Byz-HB0QGh7$Q81U?>j6VhEZf!ut}4qG_IJ-#$>Ysdz%-|Me9( z)}iUx8;T4h%@<+`;}c;c++zX>{^HlOF@#v_81~j%j%69k7*A4`F@d5jVkHZAc=X2v2}iBLv8vIEOYGClDoAk|w-+7h+HZ#S08ivPh&UNfdZS7X(!V z5IC7pC`1eG|6T$TH_E0s#*G5~-~{N0TupaM4yTbiWMD{0RBUvhVVk=kVl$knP!s6{ zh74OACnamWBw0&MH@u0-NK1#}AdQ1KqD3goNl?a70uk;251mj+FvgdV5KBsMSONly z@o`Urif8F!%t9G5touaQpzB1}Z0-{&mocE;B?A*-o4lq&9Lb8t7qojym$gs}Slr7( z$b1+bjNc3hdfQcv2=AA!6$4}prxP%#fm7dcqd=guq!NCef! 
za*~K-Nn;VsP&_ShstlhLfmaxqus_7t!yhR1V)}IZ6P+qKwJw@sfRsQWs4eh@!cMsz zBr*7qeGW}Z3oN7?_8QaAP-bF$Ker0Qps zt%cuLypY&-T9XUG|885aZ&t-D$M}imtYBbU-HIcnbj@%iyqlIfT63l-l0hPNV?7NY zkYej(OHTn-F$ZF#G|2jBSQHa7WT#OA*gH6OIcd!0kvtd#tZNVe(DK1G5MOeDzOi&bEG&pw*Zhcnmhyt=h~5H%<0Uai(fSgsch@DbX^?Q?#mz3aKlSjx<%} zNt&fdmSK6CBV~$20s~?yR3*zxswVNU@6dG$%B_S?9tmP7cDrPH0<2}=fL(!{<6-YY zdnBscIAgJK187>x0UrQuAXefLFOtA294ia3(nuOdiAYp*2{?cQ5k=7jS|T}8rGbEC zwlo_tY;GG}0Sp5ka$9SH8l3_$FGd>k353#xG%idya3O8N?nc`5)wEv)T#+V-8p9%z z*ECW=B!^^{lUX1-g;8Zu<`q~!Riu_?F;-I2V(OW}CL@5s-P{~t=7+gDx~w)W^D@*x z-JM9L)e_jna9|c9k*Xqr)mci?Rm6y_%rGPa+$#w*heTfF5OxU?2^N0Xa44e2!}gPB zR6!IKUR5-ir$`ablqgl<6$O{%-Q5lJXH4Us2b(KVlppzOW z3y6|r8VX33kqVn1RuN@dgoM;JnF6OcLE=@sM*XivQOHdQ?j(|;j>f;9nPlTKlc4JFMisS#ulVRBiss;tj7UlfhW~UD zg_yAK4*vFF?`jc_(U)g(zS`a+v=q-g*9+4j+JxV?L$0DSgW`6fgw+u=q#mcKJg0K1 zETABVZFCQ+MSusRYYl9DtH3|Cl9 zB56v6L=XT{2?`j9O#luObx{Q1Wm3`^N)#ytHd>Hdz~LNXI1$3eFtDI0D$6LUNOGmJ zI5x=5;-(+QCJ)w<($mPj#^B2_-XwL?501r`V6egFXs|`oIEA8U0Z9^Aq=V7m4h_tN zfT2aiLta5FNM2H87D4K1v@SCODXA)kLT5Nwr%H)L$%WJ0k($Q&XG7|_Fh2Kw^w9T( zE63bUf4K6ss#}*#`{C1~?7bb=vsA=(kkp+J*K92ij%T z?Gb)@UxT5|CS8BhV?(P2=+z4i9*!GhLBf;ai;*z-+doKnvh{N#adopZ|Bv9X#Bt>Y zcU-#U(5O3=>n`ilajBBGu=2%6HLk@@kG)VKskVIPE;ZwEn^qs_qrR~q;mNPXNSM{? 
zA0#}9^ZAjOerb8+hC8>{t(%#@f3&@c5;>b{cKBxW$Xeq+Yq?_h%0vCL0)oWcHwqqq z^jgK>7NZ`#VnM=_=!%g*=H-xopfI$%EB3ju=-qIUZQrI!+padve*J}k2Wnn?l2u^) zD1@DLYJSZ=IbTiK7kqM1{LY+PbBbyd6_(2!9WvX31->I_r3$!QH?AT41jaw3&5)nN z2lslsDvxb>_;RP7V}j}r{P6;pf3eoXOK-6&rsSlhE|{G^arl~^*OIT%JB~CC|IG@9 zPu_&Z+n-?kBj$`dzQvaQ_MegK*NN-9PRMM~tjdrhFXq)*od4mKcWNbexIO#)?mDTp zUcSK2e}8}9J^7#ILe6>J;KBF7ePo7TxfA^(^4xZ7{?6|w$bW^Rxo_BC-N(O8rsXu8F%95{+Sgs z()P-?(#_=8;*0cZ8?w}G+hc@ENDTB)Ox_) zcLvPTs;1f}WS_cJvEt+gPY;E}{qlO~2b1p~+_WL@(v5zDt!Vh9P|B|qYG^NWZmbC} zy!fd*M2b|`C+x)dqSq9R*c-?n2-|C{6Vu0ooyMtyYH_2ByzxZ z>rR96$)S{AIn>ZzW;AT&>$mLFctS$rQ|Hm(O26)Eb7$4|rP(K98^nD}FV9Li|0TKl zX3nHdxk-tEtEVJCI{w$hbZc(-BvJT*!=IJnYZLmnm2!OTzPXJT&S>5}vHy23L{Zl= z8`OTXxLZuqsqfX?c;(zDSF*ZZh&VlB^{~O!yU+M0uXFn42RT+We6lF~D(*)#4DDsk zjS1~KtU5Sn)8?#JEnClV?iw1?+s360i_7gL*I)UcS_&HWLHW&jv*yj(@K#}!-Ea54 z+`Y?63mW*BZtHG?_A84T+RKc_3t4eeQHya!Ul%NFl_U3?ykTY4^t8e4>jYK1Q}5Gy zOP1cOp0w-e*8K061x_Ec&2iz^;nEf>8a`Q+_A84T+RKba?3wi2eX1{-|5oos?N=5xw3iu;Wh3g^CqEt2^M1qAwMO4dkSi6`o0@c+DXN$s^&uNj zX-bp7s+1pnuwT$O6`stie9w63_@>Q&`9P`CVrM~1 zvd`w_H3m){{?eREZOauL=#;7Sx|AF3=r?cH=u_KwR~h9aH+-@v<5w0nw0ooRx9ux$ zmOFAwNb9yV`{&OdG_L#h&X2dZ35pohX-LMU+*_R%?-_r4@A^(BX9vyST-a{+(Sr1k z%fD((4WA^68yx+&z#7`U!T8&a`GY3ba+eCfFa!i&ehy< z8~u^=)w`kZp0D4(&9SCEGRn`WK6cI0+7>iepCroql|&8gWk%!R?zzs7#{BT}vSC|) zwCxBO7q~V)V)^|^Dm~}aOH;4EdSzPofO8!eywoFq#g1(;Z_k#Z>R2~ltWOffO|5>+ z4XnM)Xk48B^P?GknuN2V`z!owK0T{T$eX8TebKMs+`|>@E0XVI2Sog!Zb3H|eg5rR z_8#@L$lwmvjXCR+M4@HKpJ-t1Wk%!F@lFGCld>m{Yqt0EjL(}>%NJeAySm@D_|?`Q z&i!zL`pJpE(gwfTgu4-OX#B-RlUB|u2)t;m6rUuDo1p!e8~^Bn{`zN)$E~cyCxlL| zIb~+23%#cQyUM*EC;q-YaZsNb(qD^q1g!{)tr2~C%G>Rhzk0p-hU#RV^+#8ui`5+m zSHlnPaqpR;;%;B_r*zCMxUUqy<6Me|CSUxH88{xd^BVtL-y3(}VbCRkTq4Hp#&`mE zi^B=V4JP>K5;0rZjB9C7;u0|%TA<4wpFq22F*b_vgxO%hxGcdP;~>JgL<|}PBTZXf zBCJaUniyQ&Lbw4l-J`+I)f7!|&)VsswTWX{FhX~?(;E)}c(utx-xkb_U(Z_X85X}W|j7B??@k3m17Wel2HEMp9A(D0g%wd!SgjI5WOU~Solo?%#B zOUiM`+V0O$yv1gY0hGMYW%1_0Yd#LQD|(FyIGeo2d|<_sf<+Fu%6Xs5L+hnw4BBlh 
zW8jEojK`T)(iUD2Eph^hQ@^B~0PP``G3X(&j4?Rxz0QTkCd(La%|AiF_2hLfwA*-( ziKJ+e6MZ~>A=1`(h^#dpB4^EAk+;S}1jSyqKqH|g9?&Iei3fsqB5yws$ynlnpk%Ca zJpSdW1TN5xgxj^f#zes)hoBG4TMof9D3&pJ7Q`~f;#+{Ado1fB}|KYWVBXg7C1H9{fId~W1_08zw-*8l(j literal 0 HcmV?d00001