2
1
mirror of https://github.com/qpdf/qpdf.git synced 2024-05-31 17:30:54 +00:00

Add missing characters from PDF doc encoding (fixes #606)

This commit is contained in:
Jay Berkenbilt 2022-01-11 15:06:17 -05:00 committed by Jay Berkenbilt
parent 77c31305fe
commit 370710657a
10 changed files with 72 additions and 17 deletions

View File

@ -1,3 +1,8 @@
2022-01-11 Jay Berkenbilt <ejb@ql.org>
* Bug fix: add missing characters from PDF doc encoding.
Fixes #606.
2021-12-29 Jay Berkenbilt <ejb@ql.org> 2021-12-29 Jay Berkenbilt <ejb@ql.org>
* Add method QUtil::file_can_be_opened * Add method QUtil::file_can_be_opened

View File

@ -37,8 +37,20 @@
# include <sys/stat.h> # include <sys/stat.h>
#endif #endif
// First element is 128 // First element is 24
// Unicode code points for the PDF doc encoding bytes 0x18 through 0x1f
// (the spacing accent characters). The first element corresponds to byte
// 24 (0x18), so index this table with (byte - 24). The table is a
// read-only lookup and is declared const so that it cannot be modified
// by accident.
static unsigned short const pdf_doc_low_to_unicode[] = {
    0x02d8,    // 0x18    BREVE
    0x02c7,    // 0x19    CARON
    0x02c6,    // 0x1a    MODIFIER LETTER CIRCUMFLEX ACCENT
    0x02d9,    // 0x1b    DOT ABOVE
    0x02dd,    // 0x1c    DOUBLE ACUTE ACCENT
    0x02db,    // 0x1d    OGONEK
    0x02da,    // 0x1e    RING ABOVE
    0x02dc,    // 0x1f    SMALL TILDE
};
// First element is 127
static unsigned short pdf_doc_to_unicode[] = { static unsigned short pdf_doc_to_unicode[] = {
0xfffd, // 0x7f UNDEFINED
0x2022, // 0x80 BULLET 0x2022, // 0x80 BULLET
0x2020, // 0x81 DAGGER 0x2020, // 0x81 DAGGER
0x2021, // 0x82 DOUBLE DAGGER 0x2021, // 0x82 DOUBLE DAGGER
@ -2032,6 +2044,30 @@ encode_pdfdoc(unsigned long codepoint)
unsigned char ch = '\0'; unsigned char ch = '\0';
switch (codepoint) switch (codepoint)
{ {
case 0x02d8:
ch = 0x18;
break;
case 0x02c7:
ch = 0x19;
break;
case 0x02c6:
ch = 0x1a;
break;
case 0x02d9:
ch = 0x1b;
break;
case 0x02dd:
ch = 0x1c;
break;
case 0x02db:
ch = 0x1d;
break;
case 0x02da:
ch = 0x1e;
break;
case 0x02dc:
ch = 0x1f;
break;
case 0x2022: case 0x2022:
ch = 0x80; ch = 0x80;
break; break;
@ -2427,9 +2463,13 @@ QUtil::pdf_doc_to_utf8(std::string const& val)
{ {
unsigned char ch = static_cast<unsigned char>(val.at(i)); unsigned char ch = static_cast<unsigned char>(val.at(i));
unsigned short ch_short = ch; unsigned short ch_short = ch;
if ((ch >= 128) && (ch <= 160)) if ((ch >= 127) && (ch <= 160))
{ {
ch_short = pdf_doc_to_unicode[ch - 128]; ch_short = pdf_doc_to_unicode[ch - 127];
}
else if ((ch >= 24) && (ch <= 31))
{
ch_short = pdf_doc_low_to_unicode[ch - 24];
} }
result += QUtil::toUTF8(ch_short); result += QUtil::toUTF8(ch_short);
} }

View File

@ -69,6 +69,7 @@ HAGOOGAMAGOOGLE: 0
<c0>Does * have fingers? <c0>Does * have fingers?
---- transcoding ---- transcoding
bidirectional pdf doc done bidirectional pdf doc done
bidirectional pdf doc low done
bidirectional win ansi done bidirectional win ansi done
bidirectional mac roman done bidirectional mac roman done
analysis done analysis done
@ -85,6 +86,8 @@ alternatives
2: 83a9e99e 2: 83a9e99e
0: 717561636b 0: 717561636b
done alternatives done alternatives
w˘wˇwˆw˙w˝w˛w˚w˜w<EFBFBD>w
done low characters
---- whoami ---- whoami
quack1 quack1
quack2 quack2

View File

@ -308,12 +308,12 @@ void utf8_to_ascii_test()
void transcoding_test(std::string (*to_utf8)(std::string const&), void transcoding_test(std::string (*to_utf8)(std::string const&),
std::string (*from_utf8)(std::string const&, char), std::string (*from_utf8)(std::string const&, char),
int last, std::string unknown) int first, int last, std::string unknown)
{ {
std::string in(" "); std::string in(" ");
std::string out; std::string out;
std::string back; std::string back;
for (int i = 128; i <= last; ++i) for (int i = first; i <= last; ++i)
{ {
in.at(0) = static_cast<char>(static_cast<unsigned char>(i)); in.at(0) = static_cast<char>(static_cast<unsigned char>(i));
out = (*to_utf8)(in); out = (*to_utf8)(in);
@ -355,13 +355,16 @@ void print_alternatives(std::string const& str)
void transcoding_test() void transcoding_test()
{ {
transcoding_test(&QUtil::pdf_doc_to_utf8, transcoding_test(&QUtil::pdf_doc_to_utf8,
&QUtil::utf8_to_pdf_doc, 160, "\x9f"); &QUtil::utf8_to_pdf_doc, 127, 160, "\x9f");
std::cout << "bidirectional pdf doc done" << std::endl; std::cout << "bidirectional pdf doc done" << std::endl;
transcoding_test(&QUtil::pdf_doc_to_utf8,
&QUtil::utf8_to_pdf_doc, 24, 31, "?");
std::cout << "bidirectional pdf doc low done" << std::endl;
transcoding_test(&QUtil::win_ansi_to_utf8, transcoding_test(&QUtil::win_ansi_to_utf8,
&QUtil::utf8_to_win_ansi, 160, "?"); &QUtil::utf8_to_win_ansi, 128, 160, "?");
std::cout << "bidirectional win ansi done" << std::endl; std::cout << "bidirectional win ansi done" << std::endl;
transcoding_test(&QUtil::mac_roman_to_utf8, transcoding_test(&QUtil::mac_roman_to_utf8,
&QUtil::utf8_to_mac_roman, 255, "?"); &QUtil::utf8_to_mac_roman, 128, 255, "?");
std::cout << "bidirectional mac roman done" << std::endl; std::cout << "bidirectional mac roman done" << std::endl;
check_analyze("pi = \317\200", true, true, false); check_analyze("pi = \317\200", true, true, false);
check_analyze("pi != \317", true, false, false); check_analyze("pi != \317", true, false, false);
@ -396,6 +399,10 @@ void transcoding_test()
print_alternatives(utf8); print_alternatives(utf8);
print_alternatives("quack"); print_alternatives("quack");
std::cout << "done alternatives" << std::endl; std::cout << "done alternatives" << std::endl;
std::string low = QUtil::pdf_doc_to_utf8(
"w\030w\031w\032w\033w\034w\035w\036w\037w\177w");
std::cout << low << std::endl;
std::cout << "done low characters" << std::endl;
} }
void print_whoami(char const* str) void print_whoami(char const* str)

View File

@ -604,7 +604,7 @@
"trailer": { "trailer": {
"/ID": [ "/ID": [
"S¶Ł”łîð\u000e¢¬\u0007}_)\u0012¶", "S¶Ł”łîð\u000e¢¬\u0007}_)\u0012¶",
"'+“‰¤V2«PP ç`m\"\u001d" "'+“‰¤V2«PP ç`m\"˛"
], ],
"/Root": "1 0 R", "/Root": "1 0 R",
"/Size": 31 "/Size": 31

View File

@ -615,8 +615,8 @@
}, },
"trailer": { "trailer": {
"/ID": [ "/ID": [
"Z§¯•Py»~46\u001dı\u0011¢", "Z§¯•Py»~46˛ı\u0011¢",
"Z§¯•Py»~46\u001dı\u0011¢" "Z§¯•Py»~46˛ı\u0011¢"
], ],
"/Root": "1 0 R", "/Root": "1 0 R",
"/Size": 31 "/Size": 31

View File

@ -604,7 +604,7 @@
"trailer": { "trailer": {
"/ID": [ "/ID": [
"S¶Ł”łîð\u000e¢¬\u0007}_)\u0012¶", "S¶Ł”łîð\u000e¢¬\u0007}_)\u0012¶",
"'+“‰¤V2«PP ç`m\"\u001d" "'+“‰¤V2«PP ç`m\"˛"
], ],
"/Root": "1 0 R", "/Root": "1 0 R",
"/Size": 31 "/Size": 31

View File

@ -604,7 +604,7 @@
"trailer": { "trailer": {
"/ID": [ "/ID": [
"S¶Ł”łîð\u000e¢¬\u0007}_)\u0012¶", "S¶Ł”łîð\u000e¢¬\u0007}_)\u0012¶",
"'+“‰¤V2«PP ç`m\"\u001d" "'+“‰¤V2«PP ç`m\"˛"
], ],
"/Root": "1 0 R", "/Root": "1 0 R",
"/Size": 31 "/Size": 31

View File

@ -1518,8 +1518,8 @@
"99 0 R": 47, "99 0 R": 47,
"trailer": { "trailer": {
"/ID": [ "/ID": [
"’ù\u0019Þxtó¼\\·¯½\u001eŁ7»", "’ùˇÞxtó¼\\·¯½˚Ł7»",
"\rþ\u0018©LÞ\u000fKýÈl\u0003¯\u0019\u0001\u000e" "\rþ˘©LÞ\u000fKýÈl\u0003¯ˇ\u0001\u000e"
], ],
"/Root": "1 0 R", "/Root": "1 0 R",
"/Size": 100 "/Size": 100

View File

@ -178,8 +178,8 @@
}, },
"trailer": { "trailer": {
"/ID": [ "/ID": [
\u0018·ƒÿ{5\u0005ÚS*ºo", ˘·ƒÿ{5\u0005ÚS*ºo",
"÷\u0017ž³QY¿ÔÀ\u000f\u0012¼ý\u001f\u0002" "÷\u0017ž³QY¿ÔÀ\u000f\u0012¼ý˜\u0002"
], ],
"/Info": "2 0 R", "/Info": "2 0 R",
"/Root": "1 0 R", "/Root": "1 0 R",