mirror of
https://github.com/qpdf/qpdf.git
synced 2024-05-31 17:30:54 +00:00
Add missing characters from PDF doc encoding (fixes #606)
This commit is contained in:
parent
77c31305fe
commit
370710657a
|
@ -1,3 +1,8 @@
|
||||||
|
2022-01-11 Jay Berkenbilt <ejb@ql.org>
|
||||||
|
|
||||||
|
* Bug fix: add missing characters from PDF doc encoding.
|
||||||
|
Fixes #606.
|
||||||
|
|
||||||
2021-12-29 Jay Berkenbilt <ejb@ql.org>
|
2021-12-29 Jay Berkenbilt <ejb@ql.org>
|
||||||
|
|
||||||
* Add method QUtil::file_can_be_opened
|
* Add method QUtil::file_can_be_opened
|
||||||
|
|
|
@ -37,8 +37,20 @@
|
||||||
# include <sys/stat.h>
|
# include <sys/stat.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// First element is 128
|
// First element is 24
|
||||||
|
// Unicode code points for the PDF doc encoding bytes 0x18 through 0x1f
// (decimal 24-31). Entry i holds the code point for byte value 24 + i,
// so lookups index this table with (byte - 24).
static unsigned short pdf_doc_low_to_unicode[] = {
|
||||||
|
0x02d8, // 0x18 BREVE
|
||||||
|
0x02c7, // 0x19 CARON
|
||||||
|
0x02c6, // 0x1a MODIFIER LETTER CIRCUMFLEX ACCENT
|
||||||
|
0x02d9, // 0x1b DOT ABOVE
|
||||||
|
0x02dd, // 0x1c DOUBLE ACUTE ACCENT
|
||||||
|
0x02db, // 0x1d OGONEK
|
||||||
|
0x02da, // 0x1e RING ABOVE
|
||||||
|
0x02dc, // 0x1f SMALL TILDE
|
||||||
|
};
|
||||||
|
// First element is 127
|
||||||
static unsigned short pdf_doc_to_unicode[] = {
|
static unsigned short pdf_doc_to_unicode[] = {
|
||||||
|
0xfffd, // 0x7f UNDEFINED
|
||||||
0x2022, // 0x80 BULLET
|
0x2022, // 0x80 BULLET
|
||||||
0x2020, // 0x81 DAGGER
|
0x2020, // 0x81 DAGGER
|
||||||
0x2021, // 0x82 DOUBLE DAGGER
|
0x2021, // 0x82 DOUBLE DAGGER
|
||||||
|
@ -2032,6 +2044,30 @@ encode_pdfdoc(unsigned long codepoint)
|
||||||
unsigned char ch = '\0';
|
unsigned char ch = '\0';
|
||||||
switch (codepoint)
|
switch (codepoint)
|
||||||
{
|
{
|
||||||
|
case 0x02d8:
|
||||||
|
ch = 0x18;
|
||||||
|
break;
|
||||||
|
case 0x02c7:
|
||||||
|
ch = 0x19;
|
||||||
|
break;
|
||||||
|
case 0x02c6:
|
||||||
|
ch = 0x1a;
|
||||||
|
break;
|
||||||
|
case 0x02d9:
|
||||||
|
ch = 0x1b;
|
||||||
|
break;
|
||||||
|
case 0x02dd:
|
||||||
|
ch = 0x1c;
|
||||||
|
break;
|
||||||
|
case 0x02db:
|
||||||
|
ch = 0x1d;
|
||||||
|
break;
|
||||||
|
case 0x02da:
|
||||||
|
ch = 0x1e;
|
||||||
|
break;
|
||||||
|
case 0x02dc:
|
||||||
|
ch = 0x1f;
|
||||||
|
break;
|
||||||
case 0x2022:
|
case 0x2022:
|
||||||
ch = 0x80;
|
ch = 0x80;
|
||||||
break;
|
break;
|
||||||
|
@ -2427,9 +2463,13 @@ QUtil::pdf_doc_to_utf8(std::string const& val)
|
||||||
{
|
{
|
||||||
unsigned char ch = static_cast<unsigned char>(val.at(i));
|
unsigned char ch = static_cast<unsigned char>(val.at(i));
|
||||||
unsigned short ch_short = ch;
|
unsigned short ch_short = ch;
|
||||||
if ((ch >= 128) && (ch <= 160))
|
if ((ch >= 127) && (ch <= 160))
|
||||||
{
|
{
|
||||||
ch_short = pdf_doc_to_unicode[ch - 128];
|
ch_short = pdf_doc_to_unicode[ch - 127];
|
||||||
|
}
|
||||||
|
else if ((ch >= 24) && (ch <= 31))
|
||||||
|
{
|
||||||
|
ch_short = pdf_doc_low_to_unicode[ch - 24];
|
||||||
}
|
}
|
||||||
result += QUtil::toUTF8(ch_short);
|
result += QUtil::toUTF8(ch_short);
|
||||||
}
|
}
|
||||||
|
|
|
@ -69,6 +69,7 @@ HAGOOGAMAGOOGLE: 0
|
||||||
<c0>Does * have fingers?
|
<c0>Does * have fingers?
|
||||||
---- transcoding
|
---- transcoding
|
||||||
bidirectional pdf doc done
|
bidirectional pdf doc done
|
||||||
|
bidirectional pdf doc low done
|
||||||
bidirectional win ansi done
|
bidirectional win ansi done
|
||||||
bidirectional mac roman done
|
bidirectional mac roman done
|
||||||
analysis done
|
analysis done
|
||||||
|
@ -85,6 +86,8 @@ alternatives
|
||||||
2: 83a9e99e
|
2: 83a9e99e
|
||||||
0: 717561636b
|
0: 717561636b
|
||||||
done alternatives
|
done alternatives
|
||||||
|
w˘wˇwˆw˙w˝w˛w˚w˜w<EFBFBD>w
|
||||||
|
done low characters
|
||||||
---- whoami
|
---- whoami
|
||||||
quack1
|
quack1
|
||||||
quack2
|
quack2
|
||||||
|
|
|
@ -308,12 +308,12 @@ void utf8_to_ascii_test()
|
||||||
|
|
||||||
void transcoding_test(std::string (*to_utf8)(std::string const&),
|
void transcoding_test(std::string (*to_utf8)(std::string const&),
|
||||||
std::string (*from_utf8)(std::string const&, char),
|
std::string (*from_utf8)(std::string const&, char),
|
||||||
int last, std::string unknown)
|
int first, int last, std::string unknown)
|
||||||
{
|
{
|
||||||
std::string in(" ");
|
std::string in(" ");
|
||||||
std::string out;
|
std::string out;
|
||||||
std::string back;
|
std::string back;
|
||||||
for (int i = 128; i <= last; ++i)
|
for (int i = first; i <= last; ++i)
|
||||||
{
|
{
|
||||||
in.at(0) = static_cast<char>(static_cast<unsigned char>(i));
|
in.at(0) = static_cast<char>(static_cast<unsigned char>(i));
|
||||||
out = (*to_utf8)(in);
|
out = (*to_utf8)(in);
|
||||||
|
@ -355,13 +355,16 @@ void print_alternatives(std::string const& str)
|
||||||
void transcoding_test()
|
void transcoding_test()
|
||||||
{
|
{
|
||||||
transcoding_test(&QUtil::pdf_doc_to_utf8,
|
transcoding_test(&QUtil::pdf_doc_to_utf8,
|
||||||
&QUtil::utf8_to_pdf_doc, 160, "\x9f");
|
&QUtil::utf8_to_pdf_doc, 127, 160, "\x9f");
|
||||||
std::cout << "bidirectional pdf doc done" << std::endl;
|
std::cout << "bidirectional pdf doc done" << std::endl;
|
||||||
|
transcoding_test(&QUtil::pdf_doc_to_utf8,
|
||||||
|
&QUtil::utf8_to_pdf_doc, 24, 31, "?");
|
||||||
|
std::cout << "bidirectional pdf doc low done" << std::endl;
|
||||||
transcoding_test(&QUtil::win_ansi_to_utf8,
|
transcoding_test(&QUtil::win_ansi_to_utf8,
|
||||||
&QUtil::utf8_to_win_ansi, 160, "?");
|
&QUtil::utf8_to_win_ansi, 128, 160, "?");
|
||||||
std::cout << "bidirectional win ansi done" << std::endl;
|
std::cout << "bidirectional win ansi done" << std::endl;
|
||||||
transcoding_test(&QUtil::mac_roman_to_utf8,
|
transcoding_test(&QUtil::mac_roman_to_utf8,
|
||||||
&QUtil::utf8_to_mac_roman, 255, "?");
|
&QUtil::utf8_to_mac_roman, 128, 255, "?");
|
||||||
std::cout << "bidirectional mac roman done" << std::endl;
|
std::cout << "bidirectional mac roman done" << std::endl;
|
||||||
check_analyze("pi = \317\200", true, true, false);
|
check_analyze("pi = \317\200", true, true, false);
|
||||||
check_analyze("pi != \317", true, false, false);
|
check_analyze("pi != \317", true, false, false);
|
||||||
|
@ -396,6 +399,10 @@ void transcoding_test()
|
||||||
print_alternatives(utf8);
|
print_alternatives(utf8);
|
||||||
print_alternatives("quack");
|
print_alternatives("quack");
|
||||||
std::cout << "done alternatives" << std::endl;
|
std::cout << "done alternatives" << std::endl;
|
||||||
|
std::string low = QUtil::pdf_doc_to_utf8(
|
||||||
|
"w\030w\031w\032w\033w\034w\035w\036w\037w\177w");
|
||||||
|
std::cout << low << std::endl;
|
||||||
|
std::cout << "done low characters" << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
void print_whoami(char const* str)
|
void print_whoami(char const* str)
|
||||||
|
|
|
@ -604,7 +604,7 @@
|
||||||
"trailer": {
|
"trailer": {
|
||||||
"/ID": [
|
"/ID": [
|
||||||
"S¶Ł”łîð\u000e¢¬\u0007}_)\u0012¶",
|
"S¶Ł”łîð\u000e¢¬\u0007}_)\u0012¶",
|
||||||
"'+“‰¤V2«PP ç`m\"\u001d"
|
"'+“‰¤V2«PP ç`m\"˛"
|
||||||
],
|
],
|
||||||
"/Root": "1 0 R",
|
"/Root": "1 0 R",
|
||||||
"/Size": 31
|
"/Size": 31
|
||||||
|
|
|
@ -615,8 +615,8 @@
|
||||||
},
|
},
|
||||||
"trailer": {
|
"trailer": {
|
||||||
"/ID": [
|
"/ID": [
|
||||||
"Z§¯•Py»’~’46\u001dı\u0011¢",
|
"Z§¯•Py»’~’46˛ı\u0011¢",
|
||||||
"Z§¯•Py»’~’46\u001dı\u0011¢"
|
"Z§¯•Py»’~’46˛ı\u0011¢"
|
||||||
],
|
],
|
||||||
"/Root": "1 0 R",
|
"/Root": "1 0 R",
|
||||||
"/Size": 31
|
"/Size": 31
|
||||||
|
|
|
@ -604,7 +604,7 @@
|
||||||
"trailer": {
|
"trailer": {
|
||||||
"/ID": [
|
"/ID": [
|
||||||
"S¶Ł”łîð\u000e¢¬\u0007}_)\u0012¶",
|
"S¶Ł”łîð\u000e¢¬\u0007}_)\u0012¶",
|
||||||
"'+“‰¤V2«PP ç`m\"\u001d"
|
"'+“‰¤V2«PP ç`m\"˛"
|
||||||
],
|
],
|
||||||
"/Root": "1 0 R",
|
"/Root": "1 0 R",
|
||||||
"/Size": 31
|
"/Size": 31
|
||||||
|
|
|
@ -604,7 +604,7 @@
|
||||||
"trailer": {
|
"trailer": {
|
||||||
"/ID": [
|
"/ID": [
|
||||||
"S¶Ł”łîð\u000e¢¬\u0007}_)\u0012¶",
|
"S¶Ł”łîð\u000e¢¬\u0007}_)\u0012¶",
|
||||||
"'+“‰¤V2«PP ç`m\"\u001d"
|
"'+“‰¤V2«PP ç`m\"˛"
|
||||||
],
|
],
|
||||||
"/Root": "1 0 R",
|
"/Root": "1 0 R",
|
||||||
"/Size": 31
|
"/Size": 31
|
||||||
|
|
|
@ -1518,8 +1518,8 @@
|
||||||
"99 0 R": 47,
|
"99 0 R": 47,
|
||||||
"trailer": {
|
"trailer": {
|
||||||
"/ID": [
|
"/ID": [
|
||||||
"’ù\u0019Þxtó¼\\·¯½\u001eŁ7»",
|
"’ùˇÞxtó¼\\·¯½˚Ł7»",
|
||||||
"\rþ\u0018©LÞ\u000fKýÈl\u0003¯\u0019\u0001\u000e"
|
"\rþ˘©LÞ\u000fKýÈl\u0003¯ˇ\u0001\u000e"
|
||||||
],
|
],
|
||||||
"/Root": "1 0 R",
|
"/Root": "1 0 R",
|
||||||
"/Size": 100
|
"/Size": 100
|
||||||
|
|
|
@ -178,8 +178,8 @@
|
||||||
},
|
},
|
||||||
"trailer": {
|
"trailer": {
|
||||||
"/ID": [
|
"/ID": [
|
||||||
"û\u0018·ƒÿ{5⁄\u0005Ú−S*º‘o",
|
"û˘·ƒÿ{5⁄\u0005Ú−S*º‘o",
|
||||||
"÷\u0017ž³QY¿ÔÀ\u000f\u0012−¼ý\u001f\u0002"
|
"÷\u0017ž³QY¿ÔÀ\u000f\u0012−¼ý˜\u0002"
|
||||||
],
|
],
|
||||||
"/Info": "2 0 R",
|
"/Info": "2 0 R",
|
||||||
"/Root": "1 0 R",
|
"/Root": "1 0 R",
|
||||||
|
|
Loading…
Reference in New Issue
Block a user