diff --git a/ChangeLog b/ChangeLog index 546c5658..2ef10600 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2022-01-11 Jay Berkenbilt + + * Bug fix: add missing characters from PDF doc encoding. + Fixes #606. + 2021-12-29 Jay Berkenbilt * Add method QUtil::file_can_be_opened diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc index daa663a3..c71e7923 100644 --- a/libqpdf/QUtil.cc +++ b/libqpdf/QUtil.cc @@ -37,8 +37,20 @@ # include #endif -// First element is 128 +// First element is 24 +static unsigned short pdf_doc_low_to_unicode[] = { + 0x02d8, // 0x18 BREVE + 0x02c7, // 0x19 CARON + 0x02c6, // 0x1a MODIFIER LETTER CIRCUMFLEX ACCENT + 0x02d9, // 0x1b DOT ABOVE + 0x02dd, // 0x1c DOUBLE ACUTE ACCENT + 0x02db, // 0x1d OGONEK + 0x02da, // 0x1e RING ABOVE + 0x02dc, // 0x1f SMALL TILDE +}; +// First element is 127 static unsigned short pdf_doc_to_unicode[] = { + 0xfffd, // 0x7f UNDEFINED 0x2022, // 0x80 BULLET 0x2020, // 0x81 DAGGER 0x2021, // 0x82 DOUBLE DAGGER @@ -2032,6 +2044,30 @@ encode_pdfdoc(unsigned long codepoint) unsigned char ch = '\0'; switch (codepoint) { + case 0x02d8: + ch = 0x18; + break; + case 0x02c7: + ch = 0x19; + break; + case 0x02c6: + ch = 0x1a; + break; + case 0x02d9: + ch = 0x1b; + break; + case 0x02dd: + ch = 0x1c; + break; + case 0x02db: + ch = 0x1d; + break; + case 0x02da: + ch = 0x1e; + break; + case 0x02dc: + ch = 0x1f; + break; case 0x2022: ch = 0x80; break; @@ -2427,9 +2463,13 @@ QUtil::pdf_doc_to_utf8(std::string const& val) { unsigned char ch = static_cast(val.at(i)); unsigned short ch_short = ch; - if ((ch >= 128) && (ch <= 160)) + if ((ch >= 127) && (ch <= 160)) { - ch_short = pdf_doc_to_unicode[ch - 128]; + ch_short = pdf_doc_to_unicode[ch - 127]; + } + else if ((ch >= 24) && (ch <= 31)) + { + ch_short = pdf_doc_low_to_unicode[ch - 24]; } result += QUtil::toUTF8(ch_short); } diff --git a/libtests/qtest/qutil/qutil.out b/libtests/qtest/qutil/qutil.out index 90f1fd16..bcb89def 100644 --- a/libtests/qtest/qutil/qutil.out +++ b/libtests/qtest/qutil/qutil.out @@ -69,6 +69,7 @@ HAGOOGAMAGOOGLE: 0 Does * have fingers? ---- transcoding bidirectional pdf doc done +bidirectional pdf doc low done bidirectional win ansi done bidirectional mac roman done analysis done @@ -85,6 +86,8 @@ alternatives 2: 83a9e99e 0: 717561636b done alternatives +w˘wˇwˆw˙w˝w˛w˚w˜w�w +done low characters ---- whoami quack1 quack2 diff --git a/libtests/qutil.cc b/libtests/qutil.cc index 46eb840c..cd2b7796 100644 --- a/libtests/qutil.cc +++ b/libtests/qutil.cc @@ -308,12 +308,12 @@ void utf8_to_ascii_test() void transcoding_test(std::string (*to_utf8)(std::string const&), std::string (*from_utf8)(std::string const&, char), - int last, std::string unknown) + int first, int last, std::string unknown) { std::string in(" "); std::string out; std::string back; - for (int i = 128; i <= last; ++i) + for (int i = first; i <= last; ++i) { in.at(0) = static_cast(static_cast(i)); out = (*to_utf8)(in); @@ -355,13 +355,16 @@ void print_alternatives(std::string const& str) void transcoding_test() { transcoding_test(&QUtil::pdf_doc_to_utf8, - &QUtil::utf8_to_pdf_doc, 160, "\x9f"); + &QUtil::utf8_to_pdf_doc, 127, 160, "\x9f"); std::cout << "bidirectional pdf doc done" << std::endl; + transcoding_test(&QUtil::pdf_doc_to_utf8, + &QUtil::utf8_to_pdf_doc, 24, 31, "?"); + std::cout << "bidirectional pdf doc low done" << std::endl; transcoding_test(&QUtil::win_ansi_to_utf8, - &QUtil::utf8_to_win_ansi, 160, "?"); + &QUtil::utf8_to_win_ansi, 128, 160, "?"); std::cout << "bidirectional win ansi done" << std::endl; transcoding_test(&QUtil::mac_roman_to_utf8, - &QUtil::utf8_to_mac_roman, 255, "?"); + &QUtil::utf8_to_mac_roman, 128, 255, "?"); std::cout << "bidirectional mac roman done" << std::endl; check_analyze("pi = \317\200", true, true, false); check_analyze("pi != \317", true, false, false); @@ -396,6 +399,10 @@ void transcoding_test() print_alternatives(utf8); print_alternatives("quack"); std::cout << "done alternatives" << std::endl; + std::string low = QUtil::pdf_doc_to_utf8( + "w\030w\031w\032w\033w\034w\035w\036w\037w\177w"); + std::cout << low << std::endl; + std::cout << "done low characters" << std::endl; } void print_whoami(char const* str) diff --git a/qpdf/qtest/qpdf/json-image-streams-all.out b/qpdf/qtest/qpdf/json-image-streams-all.out index 3dea8852..fa5a211c 100644 --- a/qpdf/qtest/qpdf/json-image-streams-all.out +++ b/qpdf/qtest/qpdf/json-image-streams-all.out @@ -604,7 +604,7 @@ "trailer": { "/ID": [ "S¶Ł”łîð\u000e¢¬\u0007}_)\u0012¶", - "'+“‰¤V2«PP ç`m\"\u001d" + "'+“‰¤V2«PP ç`m\"˛" ], "/Root": "1 0 R", "/Size": 31 diff --git a/qpdf/qtest/qpdf/json-image-streams-small.out b/qpdf/qtest/qpdf/json-image-streams-small.out index 92d0c4f3..be7aaabb 100644 --- a/qpdf/qtest/qpdf/json-image-streams-small.out +++ b/qpdf/qtest/qpdf/json-image-streams-small.out @@ -615,8 +615,8 @@ }, "trailer": { "/ID": [ - "Z§¯•Py»’~’46\u001dı\u0011¢", - "Z§¯•Py»’~’46\u001dı\u0011¢" + "Z§¯•Py»’~’46˛ı\u0011¢", + "Z§¯•Py»’~’46˛ı\u0011¢" ], "/Root": "1 0 R", "/Size": 31 diff --git a/qpdf/qtest/qpdf/json-image-streams-specialized.out b/qpdf/qtest/qpdf/json-image-streams-specialized.out index c342f9e6..50a1fc0d 100644 --- a/qpdf/qtest/qpdf/json-image-streams-specialized.out +++ b/qpdf/qtest/qpdf/json-image-streams-specialized.out @@ -604,7 +604,7 @@ "trailer": { "/ID": [ "S¶Ł”łîð\u000e¢¬\u0007}_)\u0012¶", - "'+“‰¤V2«PP ç`m\"\u001d" + "'+“‰¤V2«PP ç`m\"˛" ], "/Root": "1 0 R", "/Size": 31 diff --git a/qpdf/qtest/qpdf/json-image-streams.out b/qpdf/qtest/qpdf/json-image-streams.out index 2cfbd531..ac8ca2b9 100644 --- a/qpdf/qtest/qpdf/json-image-streams.out +++ b/qpdf/qtest/qpdf/json-image-streams.out @@ -604,7 +604,7 @@ "trailer": { "/ID": [ "S¶Ł”łîð\u000e¢¬\u0007}_)\u0012¶", - "'+“‰¤V2«PP ç`m\"\u001d" + "'+“‰¤V2«PP ç`m\"˛" ], "/Root": "1 0 R", "/Size": 31 diff --git a/qpdf/qtest/qpdf/json-page-labels-num-tree.out b/qpdf/qtest/qpdf/json-page-labels-num-tree.out index d0f73a61..cc474335 100644 --- a/qpdf/qtest/qpdf/json-page-labels-num-tree.out +++ b/qpdf/qtest/qpdf/json-page-labels-num-tree.out @@ -1518,8 +1518,8 @@ "99 0 R": 47, "trailer": { "/ID": [ - "’ù\u0019Þxtó¼\\·¯½\u001eŁ7»", - "\rþ\u0018©LÞ\u000fKýÈl\u0003¯\u0019\u0001\u000e" + "’ùˇÞxtó¼\\·¯½˚Ł7»", + "\rþ˘©LÞ\u000fKýÈl\u0003¯ˇ\u0001\u000e" ], "/Root": "1 0 R", "/Size": 100 diff --git a/qpdf/qtest/qpdf/page_api_2-json.out b/qpdf/qtest/qpdf/page_api_2-json.out index 172ce1c1..bef00d02 100644 --- a/qpdf/qtest/qpdf/page_api_2-json.out +++ b/qpdf/qtest/qpdf/page_api_2-json.out @@ -178,8 +178,8 @@ }, "trailer": { "/ID": [ - "û\u0018·ƒÿ{5⁄\u0005Ú−S*º‘o", - "÷\u0017ž³QY¿ÔÀ\u000f\u0012−¼ý\u001f\u0002" + "û˘·ƒÿ{5⁄\u0005Ú−S*º‘o", + "÷\u0017ž³QY¿ÔÀ\u000f\u0012−¼ý˜\u0002" ], "/Info": "2 0 R", "/Root": "1 0 R",