From 4bb3046f0b139337a00e9182c9b47d1a3f8f8bb3 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Sat, 17 Feb 2018 18:47:57 -0500 Subject: [PATCH] Properly handle strings with PDF Doc Encoding (fixes #179) The QPDF_String::getUTF8Val() method was not treating strings that weren't explicitly Unicode as PDF Doc Encoded. This only affects characters in the range 0x80 through 0xa0. --- ChangeLog | 3 ++ examples/qtest/bookmarks.test | 8 ++++- examples/qtest/bookmarks/issue-179.out | 12 +++++++ examples/qtest/bookmarks/issue-179.pdf | Bin 0 -> 1824 bytes include/qpdf/QPDFObjectHandle.hh | 7 ++++ libqpdf/QPDF_String.cc | 45 ++++++++++++++++++++++++- qpdf/build.mk | 8 ++++- qpdf/qtest/qpdf.test | 10 ++++++ qpdf/qtest/qpdf/pdf-doc-to-utf8.in | 33 ++++++++++++++++++ qpdf/qtest/qpdf/pdf-doc-to-utf8.out | 33 ++++++++++++++++++ qpdf/test_pdf_doc_encoding.cc | 45 +++++++++++++++++++++++++ 11 files changed, 201 insertions(+), 3 deletions(-) create mode 100644 examples/qtest/bookmarks/issue-179.out create mode 100644 examples/qtest/bookmarks/issue-179.pdf create mode 100644 qpdf/qtest/qpdf/pdf-doc-to-utf8.in create mode 100644 qpdf/qtest/qpdf/pdf-doc-to-utf8.out create mode 100644 qpdf/test_pdf_doc_encoding.cc diff --git a/ChangeLog b/ChangeLog index 6c9e581e..8c78fb9d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,8 @@ 2018-02-17 Jay Berkenbilt + * Fix QPDFObjectHandle::getUTF8Value() to properly handle strings + that are encoded with PDF Doc Encoding. Fixes #179. + * Add qpdf_check_pdf to the "C" API. This method just attempts to read the entire file and produce no output, making possible to assess whether the file has any errors that qpdf can detect. 
diff --git a/examples/qtest/bookmarks.test b/examples/qtest/bookmarks.test index 655be2e6..395357e3 100644 --- a/examples/qtest/bookmarks.test +++ b/examples/qtest/bookmarks.test @@ -48,4 +48,10 @@ $td->runtest("bookmarks deleted", $td->EXIT_STATUS => 0}, $td->NORMALIZE_NEWLINES); -$td->report(10); +$td->runtest("non-trivial pdf doc to unicode", + {$td->COMMAND => "pdf-bookmarks issue-179.pdf"}, + {$td->FILE => "issue-179.out", + $td->EXIT_STATUS => 0}, + $td->NORMALIZE_NEWLINES); + +$td->report(11); diff --git a/examples/qtest/bookmarks/issue-179.out b/examples/qtest/bookmarks/issue-179.out new file mode 100644 index 00000000..1ff8dec9 --- /dev/null +++ b/examples/qtest/bookmarks/issue-179.out @@ -0,0 +1,12 @@ +ž +žč +žđ +žć +žš +ž ajklyghvbnmxcseqwuioprtzdf +š +šč +šđ +šć +šž +š ajklyghvbnmxcseqwuioprtzdf diff --git a/examples/qtest/bookmarks/issue-179.pdf b/examples/qtest/bookmarks/issue-179.pdf new file mode 100644 index 0000000000000000000000000000000000000000..e03d8c42b937f71b6906ed07690b5b8e1ad8b276 GIT binary patch literal 1824 zcmai#X;hPE7RMba#s)&L=%5AT8zHR9o0TLAVM&64h7tl+ED0EsV520_Bq*{75enl# zCCGAEiZ}`r(2DHJu$58~1jGP}QIS>xISOJ2vBOeeUhHvZj;H6$^XcB_`Ty^|=lSrv z7!Jd4H_nZWfMWtVOrjeB0mlJYKz{fr0`B1fhMqny2I&HoAXi=k<^o_) zj4E0pgL3kko(s@u2)I}#dS@hlSYgcx*%e}$>b*ZuERqO(<+0(A77LK5BsV+-pb&9x z6e=I&%2fiD7{G#Du~HtR5Q>!^9yI9uw*}?H9?(RGkA|qE0QfmTCV+ktg;E8ONEDDQ zmK{-j1>kUmwcBWwSOI{3(NKAYSSS~XX*8uuAr?px@Yrwr!+K>;{VY?1s7B=GAYp9#oo^OETno*proe`bH(XUqi%DI zy|7|`QD(V&y=ZGS>Mvo}5PmfQFbCez&iM`X6Zzly+8^s)|5Uxvt>KtY$IhtlmER!& zsppFd_VXJ;l}J6b$oli&HW%O&_ty!r*svcrN;eU8^&MPz&rjc!A>uRBa|sXCMP?Vh zZ^s&@lC7$@Y8=kCe8voBTjb{cKpEeA+5FO_6bGZvHsp4v$DzEkA6fV8w#W)w{NQNI z0D&iJvECGzP+JA#IW@au<2W2AqRW8G;Y5Z@_gG4;Pj&L}E|#5r_%Mv#_X1T(o#s7? 
zww^?jcI+(AU4Pb&xNkEo%NsWC%$yDmWd)l~+qQ)>H#yrRGc!W7=Q`8-^~KhS%Ez9s zNBVWMFBQOOLK7v@3!QeM?TR)FB^cCQeVulDcJ`LdV#@eHWPthUux=qMP8mlpMGUh zC9hms&dMMUB*icEPL$p#y}r~Q!QY(^e$hE*Y|p>{<^GzW1IxW4Bmd1pY}8m~ZE)zr zilyt(xp`7Cz|VW)xP`SM@3#2|S@ZXwB7Cknwli}+C4AlG{*l*p@5N0lJ9HBe%U7G( z9ltQV>iF?EW-6@u*}hM1K6;XH)_i!z>6|V1u0CIJF_&@Pfpl!Ds3f1UK~f$it<+2W z4ad3ljh0f={!Byo&xC{6-U}U7ce;1q%}>kG2=3GxHQb@b`*ZBd2@SWkk@GsUr0?f4 z#ljhz6DZbSi_C%}$DB3W^j*8oWf@g1r8l)m-uK){-pEM3Sy)kadE2dL+qHZRJ>Z)r zUm;;FAT{|05_p;psvgDHpa1oQ4~pgfIazz6cAHNQtku;bZgb-`nb(s|Y*!qAOI__j zkDbnr@eSABzN~U^M(LBF*fs8L#Hc+o^nS_7@|?JT#k2-B5B8kGI=-Dbm5AvET8CRe zqSsI!wJh`|m;?qkV)TYO4jL<^UtRr_2!+vG@7)k)sICWtTTGgjDEZ8iz)Q7S6$(6`_qeK>&(bW zR|@u0y;vrxi&;$)BWz*BTjQ{?0p)dPfm)UuU(qm6y*Gd^pQP&v+LF>v|E6@`nRu!f z(mv1FmG@ZJD!Lh#A4^d4KIQ8SA@*U%|C|Ax@*gv<%Ebr{()m6|O* zH@L4RA@H#_+n|QkfVU8D^$r901i*1{RfSBi24w7vz`Q=H`=L@$#p9)Qf+Gtc|lO-PY>ev^@@MRM<$H!lKH v1|#8}7M7)NT}({Y-9@6k|M24xaHUG1P{k_5QPBP);&ISA#4v;W5b%Ekr5OuG literal 0 HcmV?d00001 diff --git a/include/qpdf/QPDFObjectHandle.hh b/include/qpdf/QPDFObjectHandle.hh index 53b219ce..a2f54a73 100644 --- a/include/qpdf/QPDFObjectHandle.hh +++ b/include/qpdf/QPDFObjectHandle.hh @@ -442,6 +442,13 @@ class QPDFObjectHandle // Methods for string objects QPDF_DLL std::string getStringValue(); + // If a string starts with the UTF-16 marker, it is converted from + // UTF-16 to UTF-8. Otherwise, it is treated as a string encoded + // with PDF Doc Encoding. PDF Doc Encoding is identical to + // ISO-8859-1 except in the range from 0200 through 0240, where + // there is a mapping of characters to Unicode. QPDF versions + // prior to version erroneously left characters in that range + // unmapped. QPDF_DLL std::string getUTF8Value(); diff --git a/libqpdf/QPDF_String.cc b/libqpdf/QPDF_String.cc index ca8d3adc..60a3e0df 100644 --- a/libqpdf/QPDF_String.cc +++ b/libqpdf/QPDF_String.cc @@ -8,6 +8,43 @@ // be used. 
#include +// First element is 128 +static unsigned short pdf_doc_to_unicode[] = { + 0x2022, // 0x80 BULLET + 0x2020, // 0x81 DAGGER + 0x2021, // 0x82 DOUBLE DAGGER + 0x2026, // 0x83 HORIZONTAL ELLIPSIS + 0x2014, // 0x84 EM DASH + 0x2013, // 0x85 EN DASH + 0x0192, // 0x86 SMALL LETTER F WITH HOOK + 0x2044, // 0x87 FRACTION SLASH (solidus) + 0x2039, // 0x88 SINGLE LEFT-POINTING ANGLE QUOTATION MARK + 0x203a, // 0x89 SINGLE RIGHT-POINTING ANGLE QUOTATION MARK + 0x2212, // 0x8a MINUS SIGN + 0x2030, // 0x8b PER MILLE SIGN + 0x201e, // 0x8c DOUBLE LOW-9 QUOTATION MARK (quotedblbase) + 0x201c, // 0x8d LEFT DOUBLE QUOTATION MARK (double quote left) + 0x201d, // 0x8e RIGHT DOUBLE QUOTATION MARK (quotedblright) + 0x2018, // 0x8f LEFT SINGLE QUOTATION MARK (quoteleft) + 0x2019, // 0x90 RIGHT SINGLE QUOTATION MARK (quoteright) + 0x201a, // 0x91 SINGLE LOW-9 QUOTATION MARK (quotesinglbase) + 0x2122, // 0x92 TRADE MARK SIGN + 0xfb01, // 0x93 LATIN SMALL LIGATURE FI + 0xfb02, // 0x94 LATIN SMALL LIGATURE FL + 0x0141, // 0x95 LATIN CAPITAL LETTER L WITH STROKE + 0x0152, // 0x96 LATIN CAPITAL LIGATURE OE + 0x0160, // 0x97 LATIN CAPITAL LETTER S WITH CARON + 0x0178, // 0x98 LATIN CAPITAL LETTER Y WITH DIAERESIS + 0x017d, // 0x99 LATIN CAPITAL LETTER Z WITH CARON + 0x0131, // 0x9a LATIN SMALL LETTER DOTLESS I + 0x0142, // 0x9b LATIN SMALL LETTER L WITH STROKE + 0x0153, // 0x9c LATIN SMALL LIGATURE OE + 0x0161, // 0x9d LATIN SMALL LETTER S WITH CARON + 0x017e, // 0x9e LATIN SMALL LETTER Z WITH CARON + 0xfffd, // 0x9f UNDEFINED + 0x20ac, // 0xa0 EURO SIGN +}; + // See above about ctype. 
static bool is_ascii_printable(unsigned char ch) { @@ -209,7 +246,13 @@ QPDF_String::getUTF8Val() const { for (unsigned int i = 0; i < len; ++i) { - result += QUtil::toUTF8(static_cast(this->val.at(i))); + unsigned char ch = static_cast(this->val.at(i)); + unsigned short val = ch; + if ((ch >= 128) && (ch <= 160)) + { + val = pdf_doc_to_unicode[ch - 128]; + } + result += QUtil::toUTF8(val); } } return result; diff --git a/qpdf/build.mk b/qpdf/build.mk index 1bc21836..1692fc92 100644 --- a/qpdf/build.mk +++ b/qpdf/build.mk @@ -1,4 +1,10 @@ -BINS_qpdf = qpdf test_driver pdf_from_scratch test_large_file test_tokenizer +BINS_qpdf = \ + qpdf \ + pdf_from_scratch \ + test_driver \ + test_large_file \ + test_pdf_doc_encoding \ + test_tokenizer CBINS_qpdf = qpdf-ctest TARGETS_qpdf = $(foreach B,$(BINS_qpdf) $(CBINS_qpdf),qpdf/$(OUTPUT_DIR)/$(call binname,$(B))) diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test index 6854e651..9e6c1a87 100644 --- a/qpdf/qtest/qpdf.test +++ b/qpdf/qtest/qpdf.test @@ -82,6 +82,16 @@ compare_pdfs("p1-a.pdf", "p1-a-p2-b.pdf", 1); compare_pdfs("p1-a-p2-a.pdf", "p1-a-p2-b.pdf", 1); flush_tiff_cache(); +show_ntests(); +# ---------- +$td->notify("--- PDF Doc Encoding ---"); +$n_tests += 1; + +$td->runtest("PDF doc encoding to Unicode", + {$td->COMMAND => "test_pdf_doc_encoding pdf-doc-to-utf8.in"}, + {$td->FILE => "pdf-doc-to-utf8.out", $td->EXIT_STATUS => 0}, + $td->NORMALIZE_NEWLINES); + show_ntests(); # ---------- $td->notify("--- Stream Replacement Tests ---"); diff --git a/qpdf/qtest/qpdf/pdf-doc-to-utf8.in b/qpdf/qtest/qpdf/pdf-doc-to-utf8.in new file mode 100644 index 00000000..951f084f --- /dev/null +++ b/qpdf/qtest/qpdf/pdf-doc-to-utf8.in @@ -0,0 +1,33 @@ + 128 0x80 0200 U+2022 BULLET + 129 0x81 0201 U+2020 DAGGER + 130 0x82 0202 U+2021 DOUBLE DAGGER + 131 0x83 0203 U+2026 HORIZONTAL ELLIPSIS + 132 0x84 0204 U+2014 EM DASH + 133 0x85 0205 U+2013 EN DASH + 134 0x86 0206 U+0192 SMALL LETTER F WITH HOOK + 135 0x87 0207 U+2044 FRACTION 
SLASH (solidus) + 136 0x88 0210 U+2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK + 137 0x89 0211 U+203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK + 138 0x8a 0212 U+2212 MINUS SIGN + 139 0x8b 0213 U+2030 PER MILLE SIGN + 140 0x8c 0214 U+201E DOUBLE LOW-9 QUOTATION MARK (quotedblbase) + 141 0x8d 0215 U+201C LEFT DOUBLE QUOTATION MARK (double quote left) + 142 0x8e 0216 U+201D RIGHT DOUBLE QUOTATION MARK (quotedblright) + 143 0x8f 0217 U+2018 LEFT SINGLE QUOTATION MARK (quoteleft) + 144 0x90 0220 U+2019 RIGHT SINGLE QUOTATION MARK (quoteright) + 145 0x91 0221 U+201A SINGLE LOW-9 QUOTATION MARK (quotesinglbase) + 146 0x92 0222 U+2122 TRADE MARK SIGN + 147 0x93 0223 U+FB01 LATIN SMALL LIGATURE FI + 148 0x94 0224 U+FB02 LATIN SMALL LIGATURE FL + 149 0x95 0225 U+0141 LATIN CAPITAL LETTER L WITH STROKE + 150 0x96 0226 U+0152 LATIN CAPITAL LIGATURE OE + 151 0x97 0227 U+0160 LATIN CAPITAL LETTER S WITH CARON + 152 0x98 0230 U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS + 153 0x99 0231 U+017D LATIN CAPITAL LETTER Z WITH CARON + 154 0x9a 0232 U+0131 LATIN SMALL LETTER DOTLESS I + 155 0x9b 0233 U+0142 LATIN SMALL LETTER L WITH STROKE + 156 0x9c 0234 U+0153 LATIN SMALL LIGATURE OE + 157 0x9d 0235 U+0161 LATIN SMALL LETTER S WITH CARON + 158 0x9e 0236 U+017E LATIN SMALL LETTER Z WITH CARON + 159 0x9f 0237 U+FFFD UNDEFINED + 160 0xa0 0240 U+20AC EURO SIGN diff --git a/qpdf/qtest/qpdf/pdf-doc-to-utf8.out b/qpdf/qtest/qpdf/pdf-doc-to-utf8.out new file mode 100644 index 00000000..ee757ebe --- /dev/null +++ b/qpdf/qtest/qpdf/pdf-doc-to-utf8.out @@ -0,0 +1,33 @@ +• 128 0x80 0200 U+2022 BULLET +† 129 0x81 0201 U+2020 DAGGER +‡ 130 0x82 0202 U+2021 DOUBLE DAGGER +… 131 0x83 0203 U+2026 HORIZONTAL ELLIPSIS +— 132 0x84 0204 U+2014 EM DASH +– 133 0x85 0205 U+2013 EN DASH +ƒ 134 0x86 0206 U+0192 SMALL LETTER F WITH HOOK +⁄ 135 0x87 0207 U+2044 FRACTION SLASH (solidus) +‹ 136 0x88 0210 U+2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK +› 137 0x89 0211 U+203A SINGLE RIGHT-POINTING ANGLE 
QUOTATION MARK +− 138 0x8a 0212 U+2212 MINUS SIGN +‰ 139 0x8b 0213 U+2030 PER MILLE SIGN +„ 140 0x8c 0214 U+201E DOUBLE LOW-9 QUOTATION MARK (quotedblbase) +“ 141 0x8d 0215 U+201C LEFT DOUBLE QUOTATION MARK (double quote left) +” 142 0x8e 0216 U+201D RIGHT DOUBLE QUOTATION MARK (quotedblright) +‘ 143 0x8f 0217 U+2018 LEFT SINGLE QUOTATION MARK (quoteleft) +’ 144 0x90 0220 U+2019 RIGHT SINGLE QUOTATION MARK (quoteright) +‚ 145 0x91 0221 U+201A SINGLE LOW-9 QUOTATION MARK (quotesinglbase) +™ 146 0x92 0222 U+2122 TRADE MARK SIGN +fi 147 0x93 0223 U+FB01 LATIN SMALL LIGATURE FI +fl 148 0x94 0224 U+FB02 LATIN SMALL LIGATURE FL +Ł 149 0x95 0225 U+0141 LATIN CAPITAL LETTER L WITH STROKE +Œ 150 0x96 0226 U+0152 LATIN CAPITAL LIGATURE OE +Š 151 0x97 0227 U+0160 LATIN CAPITAL LETTER S WITH CARON +Ÿ 152 0x98 0230 U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS +Ž 153 0x99 0231 U+017D LATIN CAPITAL LETTER Z WITH CARON +ı 154 0x9a 0232 U+0131 LATIN SMALL LETTER DOTLESS I +ł 155 0x9b 0233 U+0142 LATIN SMALL LETTER L WITH STROKE +œ 156 0x9c 0234 U+0153 LATIN SMALL LIGATURE OE +š 157 0x9d 0235 U+0161 LATIN SMALL LETTER S WITH CARON +ž 158 0x9e 0236 U+017E LATIN SMALL LETTER Z WITH CARON +� 159 0x9f 0237 U+FFFD UNDEFINED +€ 160 0xa0 0240 U+20AC EURO SIGN diff --git a/qpdf/test_pdf_doc_encoding.cc b/qpdf/test_pdf_doc_encoding.cc new file mode 100644 index 00000000..3be66509 --- /dev/null +++ b/qpdf/test_pdf_doc_encoding.cc @@ -0,0 +1,45 @@ +#include +#include +#include +#include +#include + +static char const* whoami = 0; + +void usage() +{ + std::cerr << "Usage: " << whoami << " infile" << std::endl; + exit(2); +} + +int main(int argc, char* argv[]) +{ + if ((whoami = strrchr(argv[0], '/')) == NULL) + { + whoami = argv[0]; + } + else + { + ++whoami; + } + // For libtool's sake.... 
+ if (strncmp(whoami, "lt-", 3) == 0) + { + whoami += 3; + } + + if (argc != 2) + { + usage(); + } + char const* infilename = argv[1]; + std::list lines = + QUtil::read_lines_from_file(infilename); + for (std::list::iterator iter = lines.begin(); + iter != lines.end(); ++iter) + { + QPDFObjectHandle str = QPDFObjectHandle::newString(*iter); + std::cout << str.getUTF8Value() << std::endl; + } + return 0; +}