From a478cbb6dc0e630b919813ad0e7ae1a72510c69d Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Tue, 15 Feb 2022 15:56:06 -0500 Subject: [PATCH] Silently/transparently recognize UTF-16LE as UTF-16 (fixes #649) The PDF spec only allows UTF-16BE, but most readers seem to accept UTF-16LE as well, so now qpdf does too. --- ChangeLog | 4 ++++ include/qpdf/QUtil.hh | 15 ++++++++++----- libqpdf/QUtil.cc | 14 +++++++++++--- libtests/qtest/qutil/qutil.out | 1 + libtests/qutil.cc | 4 +++- qpdf/qtest/qpdf.test | 9 ++++++++- qpdf/qtest/qpdf/utf16le-attachments.out | 8 ++++++++ qpdf/qtest/qpdf/utf16le.pdf | Bin 0 -> 3805 bytes 8 files changed, 45 insertions(+), 10 deletions(-) create mode 100644 qpdf/qtest/qpdf/utf16le-attachments.out create mode 100644 qpdf/qtest/qpdf/utf16le.pdf diff --git a/ChangeLog b/ChangeLog index 894504c0..08cb1b16 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,9 @@ 2022-02-15 Jay Berkenbilt + * When analyzing PDF strings, recognize UTF-16LE as UTF-16. The + PDF spec only allows UTF-16BE, but most readers seem to allow + both. Fixes #649. + * Bug fix: 10.6.0 inadvertently removed an unknown/undocumented CLI parsing feature, which has been restored in 10.6.2. Fixes #652. diff --git a/include/qpdf/QUtil.hh b/include/qpdf/QUtil.hh index b4cb1f6a..c1c22110 100644 --- a/include/qpdf/QUtil.hh +++ b/include/qpdf/QUtil.hh @@ -267,8 +267,11 @@ namespace QUtil QPDF_DLL std::string toUTF16(unsigned long uval); - // Test whether this is a UTF-16 big-endian string. This is - // indicated by first two bytes being 0xFE 0xFF. + // Test whether this is a UTF-16 string. This is indicated by + // first two bytes being 0xFE 0xFF (big-endian) or 0xFF 0xFE + // (little-endian). Starting in qpdf 10.6.2, this detects + // little-endian as well as big-endian. Even though the PDF spec + // doesn't allow little-endian, most readers seem to accept it. QPDF_DLL bool is_utf16(std::string const&); @@ -309,8 +312,8 @@ namespace QUtil bool utf8_to_pdf_doc( std::string const& utf8, std::string& pdfdoc, char unknown_char = '?'); - // Convert a UTF-16 big-endian encoded string to UTF-8. - // Unrepresentable code points are converted to U+FFFD. + // Convert a UTF-16 encoded string to UTF-8. Unrepresentable code + // points are converted to U+FFFD. QPDF_DLL std::string utf16_to_utf8(std::string const& utf16); @@ -331,7 +334,9 @@ namespace QUtil // help us guess. If there are no characters with the high bit // set, has_8bit_chars is false, and the other values are also // false, even though ASCII strings are valid UTF-8. is_valid_utf8 - // means that the string is non-trivially valid UTF-8. + // means that the string is non-trivially valid UTF-8. Although + // the PDF spec requires UTF-16 to be UTF-16BE, qpdf (and just + // about everything else) accepts UTF-16LE (as of 10.6.2). QPDF_DLL void analyze_encoding(std::string const& str, bool& has_8bit_chars, diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc index f01746b6..d0802334 100644 --- a/libqpdf/QUtil.cc +++ b/libqpdf/QUtil.cc @@ -2400,7 +2400,8 @@ bool QUtil::is_utf16(std::string const& val) { return ((val.length() >= 2) && - (val.at(0) == '\xfe') && (val.at(1) == '\xff')); + (((val.at(0) == '\xfe') && (val.at(1) == '\xff')) || + ((val.at(0) == '\xff') && (val.at(1) == '\xfe')))); } std::string @@ -2414,8 +2415,13 @@ QUtil::utf16_to_utf8(std::string const& val) unsigned long codepoint = 0L; size_t len = val.length(); size_t start = 0; + bool is_le = false; if (is_utf16(val)) { + if (static_cast(val.at(0)) == 0xff) + { + is_le = true; + } start += 2; } // If the string has an odd number of bytes, the last byte is @@ -2428,10 +2434,12 @@ QUtil::utf16_to_utf8(std::string const& val) // codepoint not followed by a low codepoint will be // discarded, and a low codepoint not preceded by a high // codepoint will just get its low 10 bits output. + auto msb = is_le ? i+1 : i; + auto lsb = is_le ? i : i+1; unsigned short bits = QIntC::to_ushort( - (static_cast(val.at(i)) << 8) + - static_cast(val.at(i+1))); + (static_cast(val.at(msb)) << 8) + + static_cast(val.at(lsb))); if ((bits & 0xFC00) == 0xD800) { codepoint = 0x10000U + ((bits & 0x3FFU) << 10U); diff --git a/libtests/qtest/qutil/qutil.out b/libtests/qtest/qutil/qutil.out index fa284237..fc6a0df1 100644 --- a/libtests/qtest/qutil/qutil.out +++ b/libtests/qtest/qutil/qutil.out @@ -63,6 +63,7 @@ HAGOOGAMAGOOGLE: 0 0x80000000 -> ff fd π π +LE: π ---- utf8_to_ascii ¿Does π have fingers? ?Does ? have fingers? diff --git a/libtests/qutil.cc b/libtests/qutil.cc index 2e4d9cdd..a1340c0e 100644 --- a/libtests/qutil.cc +++ b/libtests/qutil.cc @@ -303,6 +303,7 @@ void to_utf16_test() std::string s(QUtil::utf8_to_utf16("\xcf\x80")); std::cout << QUtil::utf16_to_utf8(s) << std::endl; std::cout << QUtil::utf16_to_utf8(s + ".") << std::endl; + std::cout << "LE: " << QUtil::utf16_to_utf8("\xff\xfe\xc0\x03") << std::endl; } void utf8_to_ascii_test() @@ -388,7 +389,8 @@ void transcoding_test() check_analyze("pi = \317\200", true, true, false); check_analyze("pi != \317", true, false, false); check_analyze("pi != 22/7", false, false, false); - check_analyze(std::string("\xfe\xff\00\x51", 4), true, false, true); + check_analyze(std::string("\xfe\xff\x00\x51", 4), true, false, true); + check_analyze(std::string("\xff\xfe\x51\x00", 4), true, false, true); std::cout << "analysis done" << std::endl; std::string input1("a\302\277b"); std::string input2("a\317\200b"); diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test index 14205d88..16921a27 100644 --- a/qpdf/qtest/qpdf.test +++ b/qpdf/qtest/qpdf.test @@ -73,7 +73,7 @@ flush_tiff_cache(); show_ntests(); # ---------- $td->notify("--- Character Encoding ---"); -$n_tests += 3; +$n_tests += 4; $td->runtest("PDF doc encoding to Unicode", {$td->COMMAND => "test_pdf_doc_encoding pdf-doc-to-utf8.in"}, @@ -88,6 +88,13 @@ $td->runtest("UTF-16 encoding errors", {$td->FILE => "unicode-errors.out", $td->EXIT_STATUS => 0}, $td->NORMALIZE_NEWLINES); +# UTF-16LE is not allowed by the PDF spec, but it seems that most +# readers accept it. +$td->runtest("UTF-16LE strings", + {$td->COMMAND => "qpdf --list-attachments --verbose utf16le.pdf"}, + {$td->FILE => "utf16le-attachments.out", $td->EXIT_STATUS => 0}, + $td->NORMALIZE_NEWLINES); + # Tests to exercise QPDFArgParser belong in arg_parser.test in # libtests. These tests are supposed to be specific to the qpdf cli. # Since they were written prior to moving QPDFArgParser into the diff --git a/qpdf/qtest/qpdf/utf16le-attachments.out b/qpdf/qtest/qpdf/utf16le-attachments.out new file mode 100644 index 00000000..74abc20b --- /dev/null +++ b/qpdf/qtest/qpdf/utf16le-attachments.out @@ -0,0 +1,8 @@ +potato.png -> 6,0 + preferred name: π.png + all names: + /F -> π.png + /UF -> π.png + all data streams: + /F -> 6,0 + /UF -> 6,0 diff --git a/qpdf/qtest/qpdf/utf16le.pdf b/qpdf/qtest/qpdf/utf16le.pdf new file mode 100644 index 0000000000000000000000000000000000000000..17c7f2bc0e92e1b1503e3f8b837c7ab56e2114ee GIT binary patch literal 3805 zcma)9c{r47|F>kDI8h=co@9x{td=a1B_#X4#4OL)XEZbRJvsJl$J!gRWl1Q9kR*F2 z6(vO!l{GX8WqqGf&gp%-uHXB6W}fT0ukZc&uJ`@N=ObaFt$P9?hk{A8tQ2g*5C9w? z zl4#WLJ5&I*jg+?t@P;5-SQ^%o>;gH$KoWtGgc0aJli35uyvZ~yjV$L)a*+nm+y0I} zxiBK`xM-1mNHhQe$e(p1Pyu@Y^8@`OAEP+xPp*t$JFYsq+kNQ*3fn&NmOCp(SI{1j zNCfe4I0^~E;fioLTmkwjKnrO1do$aU@CNZv=l>`CpJo}=?WETtlW5y>Qv9C&97u4( zo+kT4O~9d_0s;vrDkA|$#vCaSaA%NaAeHPx!T&fFT`0=;HV}Z)cA)Jup^)+BAPrFd z+w#3)#yEbO2BYzva-bwwukRyt^E8F+M~OYa%Vr@ zSK5z|U>7|c@E(1+x6byQv@+O%}#svwK~HC z33~)(hUl(`pZ9B=JGH#5#$Fgfr=J{aT=mkoELSW~?@Leb`w$gT92KF{piQ?d$eJ}Z z?3uVndu&8AG^QE$6s2vUyTx(YyHaAY*#}aOJCm;nKg`;5ac_+0FDIJ@Ai#p)i-`!A-9-=s z4o-3EY2Be+iqElo;xZ^$GYe?!x}aUE$)CYVh|~Bz<}dyWrq6U7a$>XuFPNC?2pf3u z9b&hIXL3=6;2crEQC;EDFzPY5MkKQgcYee!)?HaT5w_x4I=tQoSi112FjxH%n&3B=SZBjYaTqf1h(_u`Sh&D10(P ztBX4ILby>V`uEAFHnO(^?yG*)&4; zI`M?0W;~sTn>FcLSd;Bm)ogF#46(NCjCu2h72=(x60FgDH=X8J=G5RY7)`3J^52E* zo2S2_c!0v23JJ}px!cwkOtB}0HqIVMnArO&%Kt&(HzTXnr4!iU9L2LI+oEtRH&(qj zyUNG0#br*UoDQDP*HG>!o0nB;N1k~g(OFUt3R1a>yVs9P^LzKE_=#z~tlAP=iOWc{ zo>c)eKYQzfO2(9>+VR0`Cuqqqev1GfHT-j$Jq} zU-!5ep~J(ovNXpQ)j&J9s#f zZOjITZg|{Y3YF&J5lgx8v1vrW?!n_l!|1MsfuP9}kxh3Ip_c0VesT+s3^Be*Tb{Ra zmG=41ee8d@+Bjh3E902q8&XWK{N#af^I7!7+AHZVUDpLuo_jnQ zS_#0#T5gE?lmada(Y(Hr6L@ubA=j*{CjOC~AKElC%X7NwErcmzjkeh?BQ7$NEx{wJ zk+_W9yQ{-QG29f4_&EXIb0%uSD4y-V0Zc-)wg->|k3HCST3Be_FCq zQqKCBr#EO*38QQkEnz(w!i6Ru7o`wBB&=kmaSG@#8{1tptWlEMy$J6m_Dx*sZVEs+ z=0pVz{k4x9{+=mPqd&RT)S6l{S#mzDJ7d1OLENk~us(L6w813AzdK6dBVI92fS}X-;55!i5!-#%UP2NdvipsC|oWK2Q zYKR;rPx1+_vwolLwJF zie59I?S3QNAXKv`qNBp=AGnxSof07S&c?#MGrW1kPqOe(35t6@LrwJZHz4t4g9?o& z|F&L8*fRy>tvMUx+6encQJ2|O=`Z|=5&L^gl0H>SSB%^c?{G0A9eU-zOso%{dU8bp zQ>k7bS^BcXAy1yI#7;S=RxdmKo>h8<@wa^4IwRhpfDUp|~oKZdNw&G&Y@+kHCVN&tDtMe75%(n$> z7XB{Aa_n=pKw^q-LB`c2689gwyN87qF(c)!p*>0OK+XMqL%Cs=RfCN*i>(g7Vc_CG z;mpgPu`*b^7bfLaqmN6&y>AcMhlSRjoGkg{USd}Auy2Nl+8z#eaUH}>cY@-P+JR4U zTFUsf`fD}g#X*ZJU5|s&{KJIb-ul_p&n3uNv<>TQ-rIeoLpn6&t<=QARmV*C99S(5 z1rvK{wFH+h-S};9LVQcE${xYefIoj*F&5As^-#?S&^-ELXg?RH^Vc^0)g9>sMtrHp;o8D9PBDuPd!yP;`t(V(AV|UtEC!)+*|(jYIe%Wjm{(sUY1-{Q~L0H zVwU41%f@kE+(urE%t&NGWZEBgnr!$Y5u+E4uhB#CBX=poUiB-Xi85S5qjcv4p3xYX zs0BB3l6r!Y>cc(NV~JAp*=Bx`AsL5WHj@R*?Gh>>a!?zWXKAzb+dvKuj;(==`;F{L?U`#>S%}Cadq}e}%dtCFIQ5j+JRn~?H zYc*SC7M22Q?`sT)C7P?d=Kt!@Z`+8n@Vz_#Aiw`l55rMP4Q5DW_@rG92Fx3N<+ik0a6Z97)mEh@)3zIM7$hRPKe85v<9XZ}eUEZn0iZS#E*E zzz^Q`Ll!Vd@eVcp!TQku56(x0jhUG|^+Q>}k*1vbthag-!@D((h_iLS;Ov&ym+cv} z;OrTv@QkrA{l(1H{YzE`gWkz{8cPMY*|oeL=;;g6-0)aP1d$+`=|*w^Rco_IEI1~~A75|?&6bc2M z(|_5}Kj)xPu#jA&Y%3HqGMS;&7`jn@o9ZLrfV{poV6Tc(QbeM$1T-FrM56I>P;s`jD5*C3~|KEsheMqHYDYWe&qR