Replace switch statements with static map initializers

Character transcoding from Unicode to single-byte characters used
hard-coded switch statements because the code predated our adoption of
C++11. Now we have thread-safe, static initialization of map literals,
so use that instead.
This commit is contained in:
Jay Berkenbilt 2022-04-23 16:03:02 -04:00
parent 37f05e67d8
commit 5bbb0d4c30
2 changed files with 69 additions and 571 deletions

10
TODO
View File

@ -11,9 +11,6 @@ In order:
Other (do in any order):
Misc
* Get rid of "ugly switch statements" in QUtil.cc -- replace with
static map initializers. (Search for "ugly switch statements" below
as well.)
* Consider exposing get_next_utf8_codepoint in QUtil
* Add QUtil::is_explicit_utf8 that does what QPDF_String::getUTF8Val
does to detect UTF-8 encoded strings per PDF 2.0 spec.
@ -396,10 +393,9 @@ we might do about it.
* When mapping characters to widths, we will need to care about
character encoding. For built-in fonts, we can create a map from
Unicode code point to width and then go from the font's encoding to
unicode to the width. Get rid of "ugly switch statements" in
QUtil.cc and replace with static map initializers. See
misc/character-encoding/ (not on github) and font metric information
for the 14 standard fonts in my local pdf-spec directory.
unicode to the width. See misc/character-encoding/ (not on github)
and font metric information for the 14 standard fonts in my local
pdf-spec directory.
* Once we know about character widths, we can correctly support
auto-sized variable text fields (0 Tf). If this is fixed, search for

View File

@ -16,6 +16,7 @@
#include <fstream>
#include <iomanip>
#include <locale>
#include <map>
#include <memory>
#include <regex>
#include <set>
@ -251,6 +252,59 @@ static unsigned short mac_roman_to_unicode[] = {
0x02c7, // 0xff
};
// Lookup table from a Unicode code point to the single byte that
// represents it in WinAnsi encoding. Only the code points listed here
// have an entry; a failed lookup means this table has no mapping for
// that code point. The table is const so it cannot be altered after
// initialization (for example by an accidental operator[] insertion),
// which keeps it strictly read-only for concurrent lookups.
static const std::map<unsigned long, unsigned char> unicode_to_win_ansi = {
    {0x20ac, 0x80}, {0x201a, 0x82}, {0x192, 0x83},  {0x201e, 0x84},
    {0x2026, 0x85}, {0x2020, 0x86}, {0x2021, 0x87}, {0x2c6, 0x88},
    {0x2030, 0x89}, {0x160, 0x8a},  {0x2039, 0x8b}, {0x152, 0x8c},
    {0x17d, 0x8e},  {0x2018, 0x91}, {0x2019, 0x92}, {0x201c, 0x93},
    {0x201d, 0x94}, {0x2022, 0x95}, {0x2013, 0x96}, {0x2014, 0x97},
    {0x303, 0x98},  {0x2122, 0x99}, {0x161, 0x9a},  {0x203a, 0x9b},
    {0x153, 0x9c},  {0x17e, 0x9e},  {0x178, 0x9f},  {0xa0, 0xa0},
};
// Lookup table from a Unicode code point to the single byte that
// represents it in MacRoman encoding. Only the code points listed here
// have an entry; a failed lookup means this table has no mapping for
// that code point. The table is const so it cannot be altered after
// initialization (for example by an accidental operator[] insertion),
// which keeps it strictly read-only for concurrent lookups.
static const std::map<unsigned long, unsigned char> unicode_to_mac_roman = {
    {0xc4, 0x80},   {0xc5, 0x81},   {0xc7, 0x82},   {0xc9, 0x83},
    {0xd1, 0x84},   {0xd6, 0x85},   {0xdc, 0x86},   {0xe1, 0x87},
    {0xe0, 0x88},   {0xe2, 0x89},   {0xe4, 0x8a},   {0xe3, 0x8b},
    {0xe5, 0x8c},   {0xe7, 0x8d},   {0xe9, 0x8e},   {0xe8, 0x8f},
    {0xea, 0x90},   {0xeb, 0x91},   {0xed, 0x92},   {0xec, 0x93},
    {0xee, 0x94},   {0xef, 0x95},   {0xf1, 0x96},   {0xf3, 0x97},
    {0xf2, 0x98},   {0xf4, 0x99},   {0xf6, 0x9a},   {0xf5, 0x9b},
    {0xfa, 0x9c},   {0xf9, 0x9d},   {0xfb, 0x9e},   {0xfc, 0x9f},
    {0x2020, 0xa0}, {0xb0, 0xa1},   {0xa2, 0xa2},   {0xa3, 0xa3},
    {0xa7, 0xa4},   {0x2022, 0xa5}, {0xb6, 0xa6},   {0xdf, 0xa7},
    {0xae, 0xa8},   {0xa9, 0xa9},   {0x2122, 0xaa}, {0x301, 0xab},
    {0x308, 0xac},  {0xc6, 0xae},   {0xd8, 0xaf},   {0xb1, 0xb1},
    {0xa5, 0xb4},   {0x3bc, 0xb5},  {0x1d43, 0xbb}, {0x1d52, 0xbc},
    {0xe6, 0xbe},   {0xf8, 0xbf},   {0xbf, 0xc0},   {0xa1, 0xc1},
    {0xac, 0xc2},   {0x192, 0xc4},  {0xab, 0xc7},   {0xbb, 0xc8},
    {0x2026, 0xc9}, {0xc0, 0xcb},   {0xc3, 0xcc},   {0xd5, 0xcd},
    {0x152, 0xce},  {0x153, 0xcf},  {0x2013, 0xd0}, {0x2014, 0xd1},
    {0x201c, 0xd2}, {0x201d, 0xd3}, {0x2018, 0xd4}, {0x2019, 0xd5},
    {0xf7, 0xd6},   {0xff, 0xd8},   {0x178, 0xd9},  {0x2044, 0xda},
    {0xa4, 0xdb},   {0x2039, 0xdc}, {0x203a, 0xdd}, {0xfb01, 0xde},
    {0xfb02, 0xdf}, {0x2021, 0xe0}, {0xb7, 0xe1},   {0x201a, 0xe2},
    {0x201e, 0xe3}, {0x2030, 0xe4}, {0xc2, 0xe5},   {0xca, 0xe6},
    {0xc1, 0xe7},   {0xcb, 0xe8},   {0xc8, 0xe9},   {0xcd, 0xea},
    {0xce, 0xeb},   {0xcf, 0xec},   {0xcc, 0xed},   {0xd3, 0xee},
    {0xd4, 0xef},   {0xd2, 0xf1},   {0xda, 0xf2},   {0xdb, 0xf3},
    {0xd9, 0xf4},   {0x131, 0xf5},  {0x2c6, 0xf6},  {0x303, 0xf7},
    {0x304, 0xf8},  {0x306, 0xf9},  {0x307, 0xfa},  {0x30a, 0xfb},
    {0x327, 0xfc},  {0x30b, 0xfd},  {0x328, 0xfe},  {0x2c7, 0xff},
};
// Lookup table from a Unicode code point to the single byte that
// represents it in PDF doc encoding. Only the code points listed here
// have an entry; a failed lookup means this table has no mapping for
// that code point. The table is const so it cannot be altered after
// initialization (for example by an accidental operator[] insertion),
// which keeps it strictly read-only for concurrent lookups.
static const std::map<unsigned long, unsigned char> unicode_to_pdf_doc = {
    {0x02d8, 0x18}, {0x02c7, 0x19}, {0x02c6, 0x1a}, {0x02d9, 0x1b},
    {0x02dd, 0x1c}, {0x02db, 0x1d}, {0x02da, 0x1e}, {0x02dc, 0x1f},
    {0x2022, 0x80}, {0x2020, 0x81}, {0x2021, 0x82}, {0x2026, 0x83},
    {0x2014, 0x84}, {0x2013, 0x85}, {0x0192, 0x86}, {0x2044, 0x87},
    {0x2039, 0x88}, {0x203a, 0x89}, {0x2212, 0x8a}, {0x2030, 0x8b},
    {0x201e, 0x8c}, {0x201c, 0x8d}, {0x201d, 0x8e}, {0x2018, 0x8f},
    {0x2019, 0x90}, {0x201a, 0x91}, {0x2122, 0x92}, {0xfb01, 0x93},
    {0xfb02, 0x94}, {0x0141, 0x95}, {0x0152, 0x96}, {0x0160, 0x97},
    {0x0178, 0x98}, {0x017d, 0x99}, {0x0131, 0x9a}, {0x0142, 0x9b},
    {0x0153, 0x9c}, {0x0161, 0x9d}, {0x017e, 0x9e}, {0xfffd, 0x9f},
    {0x20ac, 0xa0},
};
namespace
{
class FileCloser
@ -1447,583 +1501,31 @@ enum encoding_e { e_utf16, e_ascii, e_winansi, e_macroman, e_pdfdoc };
// Translate a Unicode code point to its WinAnsi byte by looking it up
// in the unicode_to_win_ansi table.
//
// codepoint: the Unicode code point to translate.
// Returns the mapped byte, or '\0' when the table has no entry for
// the code point.
static unsigned char
encode_winansi(unsigned long codepoint)
{
    auto const entry = unicode_to_win_ansi.find(codepoint);
    if (entry == unicode_to_win_ansi.end()) {
        // No mapping in the table: report that with a zero byte.
        return '\0';
    }
    return entry->second;
}
// Translate a Unicode code point to its MacRoman byte by looking it up
// in the unicode_to_mac_roman table.
//
// codepoint: the Unicode code point to translate.
// Returns the mapped byte, or '\0' when the table has no entry for
// the code point.
static unsigned char
encode_macroman(unsigned long codepoint)
{
    auto const entry = unicode_to_mac_roman.find(codepoint);
    if (entry == unicode_to_mac_roman.end()) {
        // No mapping in the table: report that with a zero byte.
        return '\0';
    }
    return entry->second;
}
// Translate a Unicode code point to its PDF doc encoding byte by
// looking it up in the unicode_to_pdf_doc table.
//
// codepoint: the Unicode code point to translate.
// Returns the mapped byte, or '\0' when the table has no entry for
// the code point.
static unsigned char
encode_pdfdoc(unsigned long codepoint)
{
    auto const entry = unicode_to_pdf_doc.find(codepoint);
    if (entry == unicode_to_pdf_doc.end()) {
        // No mapping in the table: report that with a zero byte.
        return '\0';
    }
    return entry->second;
}
unsigned long