Add missing characters from PDF doc encoding (fixes #606)

This commit is contained in:
Jay Berkenbilt 2022-01-11 15:06:17 -05:00 committed by Jay Berkenbilt
parent 77c31305fe
commit 370710657a
10 changed files with 72 additions and 17 deletions

View File

@ -1,3 +1,8 @@
2022-01-11 Jay Berkenbilt <ejb@ql.org>
* Bug fix: add missing characters from PDF doc encoding.
Fixes #606.
2021-12-29 Jay Berkenbilt <ejb@ql.org>
* Add method QUtil::file_can_be_opened

View File

@ -37,8 +37,20 @@
# include <sys/stat.h>
#endif
// First element is 128
// Unicode code points for the PDF doc encoding bytes 0x18 through
// 0x1f (the spacing accent characters). The first element
// corresponds to byte 24 (0x18), so index the table with
// (byte - 24). This is read-only lookup data, so it is declared
// const to keep it from being modified by accident.
static unsigned short const pdf_doc_low_to_unicode[] = {
    0x02d8,    // 0x18    BREVE
    0x02c7,    // 0x19    CARON
    0x02c6,    // 0x1a    MODIFIER LETTER CIRCUMFLEX ACCENT
    0x02d9,    // 0x1b    DOT ABOVE
    0x02dd,    // 0x1c    DOUBLE ACUTE ACCENT
    0x02db,    // 0x1d    OGONEK
    0x02da,    // 0x1e    RING ABOVE
    0x02dc,    // 0x1f    SMALL TILDE
};
// First element is 127
static unsigned short pdf_doc_to_unicode[] = {
0xfffd, // 0x7f UNDEFINED
0x2022, // 0x80 BULLET
0x2020, // 0x81 DAGGER
0x2021, // 0x82 DOUBLE DAGGER
@ -2032,6 +2044,30 @@ encode_pdfdoc(unsigned long codepoint)
unsigned char ch = '\0';
switch (codepoint)
{
case 0x02d8:
ch = 0x18;
break;
case 0x02c7:
ch = 0x19;
break;
case 0x02c6:
ch = 0x1a;
break;
case 0x02d9:
ch = 0x1b;
break;
case 0x02dd:
ch = 0x1c;
break;
case 0x02db:
ch = 0x1d;
break;
case 0x02da:
ch = 0x1e;
break;
case 0x02dc:
ch = 0x1f;
break;
case 0x2022:
ch = 0x80;
break;
@ -2427,9 +2463,13 @@ QUtil::pdf_doc_to_utf8(std::string const& val)
{
unsigned char ch = static_cast<unsigned char>(val.at(i));
unsigned short ch_short = ch;
if ((ch >= 128) && (ch <= 160))
if ((ch >= 127) && (ch <= 160))
{
ch_short = pdf_doc_to_unicode[ch - 128];
ch_short = pdf_doc_to_unicode[ch - 127];
}
else if ((ch >= 24) && (ch <= 31))
{
ch_short = pdf_doc_low_to_unicode[ch - 24];
}
result += QUtil::toUTF8(ch_short);
}

View File

@ -69,6 +69,7 @@ HAGOOGAMAGOOGLE: 0
<c0>Does * have fingers?
---- transcoding
bidirectional pdf doc done
bidirectional pdf doc low done
bidirectional win ansi done
bidirectional mac roman done
analysis done
@ -85,6 +86,8 @@ alternatives
2: 83a9e99e
0: 717561636b
done alternatives
w˘wˇwˆw˙w˝w˛w˚w˜w<EFBFBD>w
done low characters
---- whoami
quack1
quack2

View File

@ -308,12 +308,12 @@ void utf8_to_ascii_test()
void transcoding_test(std::string (*to_utf8)(std::string const&),
std::string (*from_utf8)(std::string const&, char),
int last, std::string unknown)
int first, int last, std::string unknown)
{
std::string in(" ");
std::string out;
std::string back;
for (int i = 128; i <= last; ++i)
for (int i = first; i <= last; ++i)
{
in.at(0) = static_cast<char>(static_cast<unsigned char>(i));
out = (*to_utf8)(in);
@ -355,13 +355,16 @@ void print_alternatives(std::string const& str)
void transcoding_test()
{
transcoding_test(&QUtil::pdf_doc_to_utf8,
&QUtil::utf8_to_pdf_doc, 160, "\x9f");
&QUtil::utf8_to_pdf_doc, 127, 160, "\x9f");
std::cout << "bidirectional pdf doc done" << std::endl;
transcoding_test(&QUtil::pdf_doc_to_utf8,
&QUtil::utf8_to_pdf_doc, 24, 31, "?");
std::cout << "bidirectional pdf doc low done" << std::endl;
transcoding_test(&QUtil::win_ansi_to_utf8,
&QUtil::utf8_to_win_ansi, 160, "?");
&QUtil::utf8_to_win_ansi, 128, 160, "?");
std::cout << "bidirectional win ansi done" << std::endl;
transcoding_test(&QUtil::mac_roman_to_utf8,
&QUtil::utf8_to_mac_roman, 255, "?");
&QUtil::utf8_to_mac_roman, 128, 255, "?");
std::cout << "bidirectional mac roman done" << std::endl;
check_analyze("pi = \317\200", true, true, false);
check_analyze("pi != \317", true, false, false);
@ -396,6 +399,10 @@ void transcoding_test()
print_alternatives(utf8);
print_alternatives("quack");
std::cout << "done alternatives" << std::endl;
std::string low = QUtil::pdf_doc_to_utf8(
"w\030w\031w\032w\033w\034w\035w\036w\037w\177w");
std::cout << low << std::endl;
std::cout << "done low characters" << std::endl;
}
void print_whoami(char const* str)

View File

@ -604,7 +604,7 @@
"trailer": {
"/ID": [
"S¶Ł”łîð\u000e¢¬\u0007}_)\u0012¶",
"'+“‰¤V2«PP ç`m\"\u001d"
"'+“‰¤V2«PP ç`m\"˛"
],
"/Root": "1 0 R",
"/Size": 31

View File

@ -615,8 +615,8 @@
},
"trailer": {
"/ID": [
"Z§¯•Py»~46\u001dı\u0011¢",
"Z§¯•Py»~46\u001dı\u0011¢"
"Z§¯•Py»~46˛ı\u0011¢",
"Z§¯•Py»~46˛ı\u0011¢"
],
"/Root": "1 0 R",
"/Size": 31

View File

@ -604,7 +604,7 @@
"trailer": {
"/ID": [
"S¶Ł”łîð\u000e¢¬\u0007}_)\u0012¶",
"'+“‰¤V2«PP ç`m\"\u001d"
"'+“‰¤V2«PP ç`m\"˛"
],
"/Root": "1 0 R",
"/Size": 31

View File

@ -604,7 +604,7 @@
"trailer": {
"/ID": [
"S¶Ł”łîð\u000e¢¬\u0007}_)\u0012¶",
"'+“‰¤V2«PP ç`m\"\u001d"
"'+“‰¤V2«PP ç`m\"˛"
],
"/Root": "1 0 R",
"/Size": 31

View File

@ -1518,8 +1518,8 @@
"99 0 R": 47,
"trailer": {
"/ID": [
"’ù\u0019Þxtó¼\\·¯½\u001eŁ7»",
"\rþ\u0018©LÞ\u000fKýÈl\u0003¯\u0019\u0001\u000e"
"’ùˇÞxtó¼\\·¯½˚Ł7»",
"\rþ˘©LÞ\u000fKýÈl\u0003¯ˇ\u0001\u000e"
],
"/Root": "1 0 R",
"/Size": 100

View File

@ -178,8 +178,8 @@
},
"trailer": {
"/ID": [
\u0018·ƒÿ{5\u0005ÚS*ºo",
"÷\u0017ž³QY¿ÔÀ\u000f\u0012¼ý\u001f\u0002"
˘·ƒÿ{5\u0005ÚS*ºo",
"÷\u0017ž³QY¿ÔÀ\u000f\u0012¼ý˜\u0002"
],
"/Info": "2 0 R",
"/Root": "1 0 R",