From 5bbb0d4c307bff58e9928a1c757438d033687ce3 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Sat, 23 Apr 2022 16:03:02 -0400 Subject: [PATCH] Replace switch statements with static map initializers Character transcoding from Unicode to single-byte characters used hard-coded switch statements because the code predated our adoption of C++11. Now we have thread-safe, static initialization of map literals, so use that instead. --- TODO | 10 +- libqpdf/QUtil.cc | 630 +++++------------------------------------------ 2 files changed, 69 insertions(+), 571 deletions(-) diff --git a/TODO b/TODO index 956d1ef3..1cbf977f 100644 --- a/TODO +++ b/TODO @@ -11,9 +11,6 @@ In order: Other (do in any order): Misc -* Get rid of "ugly switch statements" in QUtil.cc -- replace with - static map initializers. (Search for "ugly switch statements" below - as well.) * Consider exposing get_next_utf8_codepoint in QUtil * Add QUtil::is_explicit_utf8 that does what QPDF_String::getUTF8Val does to detect UTF-8 encoded strings per PDF 2.0 spec. @@ -396,10 +393,9 @@ we might do about it. * When mapping characters to widths, we will need to care about character encoding. For built-in fonts, we can create a map from Unicode code point to width and then go from the font's encoding to - unicode to the width. Get rid of "ugly switch statements" in - QUtil.cc and replace with static map initializers. See - misc/character-encoding/ (not on github) and font metric information - for the 14 standard fonts in my local pdf-spec directory. + unicode to the width. See misc/character-encoding/ (not on github) + and font metric information for the 14 standard fonts in my local + pdf-spec directory. * Once we know about character widths, we can correctly support auto-sized variable text fields (0 Tf). 
If this is fixed, search for diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc index 51bd304c..a9e77777 100644 --- a/libqpdf/QUtil.cc +++ b/libqpdf/QUtil.cc @@ -16,6 +16,7 @@ #include #include #include +#include <map> #include #include #include @@ -251,6 +252,59 @@ static unsigned short mac_roman_to_unicode[] = { 0x02c7, // 0xff }; +static std::map<unsigned long, unsigned char> unicode_to_win_ansi = { + {0x20ac, 0x80}, {0x201a, 0x82}, {0x192, 0x83}, {0x201e, 0x84}, + {0x2026, 0x85}, {0x2020, 0x86}, {0x2021, 0x87}, {0x2c6, 0x88}, + {0x2030, 0x89}, {0x160, 0x8a}, {0x2039, 0x8b}, {0x152, 0x8c}, + {0x17d, 0x8e}, {0x2018, 0x91}, {0x2019, 0x92}, {0x201c, 0x93}, + {0x201d, 0x94}, {0x2022, 0x95}, {0x2013, 0x96}, {0x2014, 0x97}, + {0x303, 0x98}, {0x2122, 0x99}, {0x161, 0x9a}, {0x203a, 0x9b}, + {0x153, 0x9c}, {0x17e, 0x9e}, {0x178, 0x9f}, {0xa0, 0xa0}, +}; +static std::map<unsigned long, unsigned char> unicode_to_mac_roman = { + {0xc4, 0x80}, {0xc5, 0x81}, {0xc7, 0x82}, {0xc9, 0x83}, + {0xd1, 0x84}, {0xd6, 0x85}, {0xdc, 0x86}, {0xe1, 0x87}, + {0xe0, 0x88}, {0xe2, 0x89}, {0xe4, 0x8a}, {0xe3, 0x8b}, + {0xe5, 0x8c}, {0xe7, 0x8d}, {0xe9, 0x8e}, {0xe8, 0x8f}, + {0xea, 0x90}, {0xeb, 0x91}, {0xed, 0x92}, {0xec, 0x93}, + {0xee, 0x94}, {0xef, 0x95}, {0xf1, 0x96}, {0xf3, 0x97}, + {0xf2, 0x98}, {0xf4, 0x99}, {0xf6, 0x9a}, {0xf5, 0x9b}, + {0xfa, 0x9c}, {0xf9, 0x9d}, {0xfb, 0x9e}, {0xfc, 0x9f}, + {0x2020, 0xa0}, {0xb0, 0xa1}, {0xa2, 0xa2}, {0xa3, 0xa3}, + {0xa7, 0xa4}, {0x2022, 0xa5}, {0xb6, 0xa6}, {0xdf, 0xa7}, + {0xae, 0xa8}, {0xa9, 0xa9}, {0x2122, 0xaa}, {0x301, 0xab}, + {0x308, 0xac}, {0xc6, 0xae}, {0xd8, 0xaf}, {0xb1, 0xb1}, + {0xa5, 0xb4}, {0x3bc, 0xb5}, {0x1d43, 0xbb}, {0x1d52, 0xbc}, + {0xe6, 0xbe}, {0xf8, 0xbf}, {0xbf, 0xc0}, {0xa1, 0xc1}, + {0xac, 0xc2}, {0x192, 0xc4}, {0xab, 0xc7}, {0xbb, 0xc8}, + {0x2026, 0xc9}, {0xc0, 0xcb}, {0xc3, 0xcc}, {0xd5, 0xcd}, + {0x152, 0xce}, {0x153, 0xcf}, {0x2013, 0xd0}, {0x2014, 0xd1}, + {0x201c, 0xd2}, {0x201d, 0xd3}, {0x2018, 0xd4}, {0x2019, 0xd5}, + {0xf7, 0xd6}, {0xff, 0xd8}, {0x178, 0xd9}, {0x2044, 0xda}, 
+ {0xa4, 0xdb}, {0x2039, 0xdc}, {0x203a, 0xdd}, {0xfb01, 0xde}, + {0xfb02, 0xdf}, {0x2021, 0xe0}, {0xb7, 0xe1}, {0x201a, 0xe2}, + {0x201e, 0xe3}, {0x2030, 0xe4}, {0xc2, 0xe5}, {0xca, 0xe6}, + {0xc1, 0xe7}, {0xcb, 0xe8}, {0xc8, 0xe9}, {0xcd, 0xea}, + {0xce, 0xeb}, {0xcf, 0xec}, {0xcc, 0xed}, {0xd3, 0xee}, + {0xd4, 0xef}, {0xd2, 0xf1}, {0xda, 0xf2}, {0xdb, 0xf3}, + {0xd9, 0xf4}, {0x131, 0xf5}, {0x2c6, 0xf6}, {0x303, 0xf7}, + {0x304, 0xf8}, {0x306, 0xf9}, {0x307, 0xfa}, {0x30a, 0xfb}, + {0x327, 0xfc}, {0x30b, 0xfd}, {0x328, 0xfe}, {0x2c7, 0xff}, +}; +static std::map<unsigned long, unsigned char> unicode_to_pdf_doc = { + {0x02d8, 0x18}, {0x02c7, 0x19}, {0x02c6, 0x1a}, {0x02d9, 0x1b}, + {0x02dd, 0x1c}, {0x02db, 0x1d}, {0x02da, 0x1e}, {0x02dc, 0x1f}, + {0x2022, 0x80}, {0x2020, 0x81}, {0x2021, 0x82}, {0x2026, 0x83}, + {0x2014, 0x84}, {0x2013, 0x85}, {0x0192, 0x86}, {0x2044, 0x87}, + {0x2039, 0x88}, {0x203a, 0x89}, {0x2212, 0x8a}, {0x2030, 0x8b}, + {0x201e, 0x8c}, {0x201c, 0x8d}, {0x201d, 0x8e}, {0x2018, 0x8f}, + {0x2019, 0x90}, {0x201a, 0x91}, {0x2122, 0x92}, {0xfb01, 0x93}, + {0xfb02, 0x94}, {0x0141, 0x95}, {0x0152, 0x96}, {0x0160, 0x97}, + {0x0178, 0x98}, {0x017d, 0x99}, {0x0131, 0x9a}, {0x0142, 0x9b}, + {0x0153, 0x9c}, {0x0161, 0x9d}, {0x017e, 0x9e}, {0xfffd, 0x9f}, + {0x20ac, 0xa0}, +}; + namespace { class FileCloser @@ -1447,583 +1501,31 @@ enum encoding_e { e_utf16, e_ascii, e_winansi, e_macroman, e_pdfdoc }; static unsigned char encode_winansi(unsigned long codepoint) { - // Use this ugly switch statement to avoid a static, which is not - // thread-safe. 
- unsigned char ch = '\0'; - switch (codepoint) { - case 0x20ac: - ch = 0x80; - break; - case 0x201a: - ch = 0x82; - break; - case 0x192: - ch = 0x83; - break; - case 0x201e: - ch = 0x84; - break; - case 0x2026: - ch = 0x85; - break; - case 0x2020: - ch = 0x86; - break; - case 0x2021: - ch = 0x87; - break; - case 0x2c6: - ch = 0x88; - break; - case 0x2030: - ch = 0x89; - break; - case 0x160: - ch = 0x8a; - break; - case 0x2039: - ch = 0x8b; - break; - case 0x152: - ch = 0x8c; - break; - case 0x17d: - ch = 0x8e; - break; - case 0x2018: - ch = 0x91; - break; - case 0x2019: - ch = 0x92; - break; - case 0x201c: - ch = 0x93; - break; - case 0x201d: - ch = 0x94; - break; - case 0x2022: - ch = 0x95; - break; - case 0x2013: - ch = 0x96; - break; - case 0x2014: - ch = 0x97; - break; - case 0x303: - ch = 0x98; - break; - case 0x2122: - ch = 0x99; - break; - case 0x161: - ch = 0x9a; - break; - case 0x203a: - ch = 0x9b; - break; - case 0x153: - ch = 0x9c; - break; - case 0x17e: - ch = 0x9e; - break; - case 0x178: - ch = 0x9f; - break; - case 0xa0: - ch = 0xa0; - break; - default: - break; + auto i = unicode_to_win_ansi.find(codepoint); + if (i != unicode_to_win_ansi.end()) { + return i->second; } - return ch; + return '\0'; } static unsigned char encode_macroman(unsigned long codepoint) { - // Use this ugly switch statement to avoid a static, which is not - // thread-safe. 
- unsigned char ch = '\0'; - switch (codepoint) { - case 0xc4: - ch = 0x80; - break; - case 0xc5: - ch = 0x81; - break; - case 0xc7: - ch = 0x82; - break; - case 0xc9: - ch = 0x83; - break; - case 0xd1: - ch = 0x84; - break; - case 0xd6: - ch = 0x85; - break; - case 0xdc: - ch = 0x86; - break; - case 0xe1: - ch = 0x87; - break; - case 0xe0: - ch = 0x88; - break; - case 0xe2: - ch = 0x89; - break; - case 0xe4: - ch = 0x8a; - break; - case 0xe3: - ch = 0x8b; - break; - case 0xe5: - ch = 0x8c; - break; - case 0xe7: - ch = 0x8d; - break; - case 0xe9: - ch = 0x8e; - break; - case 0xe8: - ch = 0x8f; - break; - case 0xea: - ch = 0x90; - break; - case 0xeb: - ch = 0x91; - break; - case 0xed: - ch = 0x92; - break; - case 0xec: - ch = 0x93; - break; - case 0xee: - ch = 0x94; - break; - case 0xef: - ch = 0x95; - break; - case 0xf1: - ch = 0x96; - break; - case 0xf3: - ch = 0x97; - break; - case 0xf2: - ch = 0x98; - break; - case 0xf4: - ch = 0x99; - break; - case 0xf6: - ch = 0x9a; - break; - case 0xf5: - ch = 0x9b; - break; - case 0xfa: - ch = 0x9c; - break; - case 0xf9: - ch = 0x9d; - break; - case 0xfb: - ch = 0x9e; - break; - case 0xfc: - ch = 0x9f; - break; - case 0x2020: - ch = 0xa0; - break; - case 0xb0: - ch = 0xa1; - break; - case 0xa2: - ch = 0xa2; - break; - case 0xa3: - ch = 0xa3; - break; - case 0xa7: - ch = 0xa4; - break; - case 0x2022: - ch = 0xa5; - break; - case 0xb6: - ch = 0xa6; - break; - case 0xdf: - ch = 0xa7; - break; - case 0xae: - ch = 0xa8; - break; - case 0xa9: - ch = 0xa9; - break; - case 0x2122: - ch = 0xaa; - break; - case 0x301: - ch = 0xab; - break; - case 0x308: - ch = 0xac; - break; - case 0xc6: - ch = 0xae; - break; - case 0xd8: - ch = 0xaf; - break; - case 0xb1: - ch = 0xb1; - break; - case 0xa5: - ch = 0xb4; - break; - case 0x3bc: - ch = 0xb5; - break; - case 0x1d43: - ch = 0xbb; - break; - case 0x1d52: - ch = 0xbc; - break; - case 0xe6: - ch = 0xbe; - break; - case 0xf8: - ch = 0xbf; - break; - case 0xbf: - ch = 0xc0; - break; - case 
0xa1: - ch = 0xc1; - break; - case 0xac: - ch = 0xc2; - break; - case 0x192: - ch = 0xc4; - break; - case 0xab: - ch = 0xc7; - break; - case 0xbb: - ch = 0xc8; - break; - case 0x2026: - ch = 0xc9; - break; - case 0xc0: - ch = 0xcb; - break; - case 0xc3: - ch = 0xcc; - break; - case 0xd5: - ch = 0xcd; - break; - case 0x152: - ch = 0xce; - break; - case 0x153: - ch = 0xcf; - break; - case 0x2013: - ch = 0xd0; - break; - case 0x2014: - ch = 0xd1; - break; - case 0x201c: - ch = 0xd2; - break; - case 0x201d: - ch = 0xd3; - break; - case 0x2018: - ch = 0xd4; - break; - case 0x2019: - ch = 0xd5; - break; - case 0xf7: - ch = 0xd6; - break; - case 0xff: - ch = 0xd8; - break; - case 0x178: - ch = 0xd9; - break; - case 0x2044: - ch = 0xda; - break; - case 0xa4: - ch = 0xdb; - break; - case 0x2039: - ch = 0xdc; - break; - case 0x203a: - ch = 0xdd; - break; - case 0xfb01: - ch = 0xde; - break; - case 0xfb02: - ch = 0xdf; - break; - case 0x2021: - ch = 0xe0; - break; - case 0xb7: - ch = 0xe1; - break; - case 0x201a: - ch = 0xe2; - break; - case 0x201e: - ch = 0xe3; - break; - case 0x2030: - ch = 0xe4; - break; - case 0xc2: - ch = 0xe5; - break; - case 0xca: - ch = 0xe6; - break; - case 0xc1: - ch = 0xe7; - break; - case 0xcb: - ch = 0xe8; - break; - case 0xc8: - ch = 0xe9; - break; - case 0xcd: - ch = 0xea; - break; - case 0xce: - ch = 0xeb; - break; - case 0xcf: - ch = 0xec; - break; - case 0xcc: - ch = 0xed; - break; - case 0xd3: - ch = 0xee; - break; - case 0xd4: - ch = 0xef; - break; - case 0xd2: - ch = 0xf1; - break; - case 0xda: - ch = 0xf2; - break; - case 0xdb: - ch = 0xf3; - break; - case 0xd9: - ch = 0xf4; - break; - case 0x131: - ch = 0xf5; - break; - case 0x2c6: - ch = 0xf6; - break; - case 0x303: - ch = 0xf7; - break; - case 0x304: - ch = 0xf8; - break; - case 0x306: - ch = 0xf9; - break; - case 0x307: - ch = 0xfa; - break; - case 0x30a: - ch = 0xfb; - break; - case 0x327: - ch = 0xfc; - break; - case 0x30b: - ch = 0xfd; - break; - case 0x328: - ch = 0xfe; - break; 
- case 0x2c7: - ch = 0xff; - break; - default: - break; + auto i = unicode_to_mac_roman.find(codepoint); + if (i != unicode_to_mac_roman.end()) { + return i->second; } - return ch; + return '\0'; } static unsigned char encode_pdfdoc(unsigned long codepoint) { - // Use this ugly switch statement to avoid a static, which is not - // thread-safe. - unsigned char ch = '\0'; - switch (codepoint) { - case 0x02d8: - ch = 0x18; - break; - case 0x02c7: - ch = 0x19; - break; - case 0x02c6: - ch = 0x1a; - break; - case 0x02d9: - ch = 0x1b; - break; - case 0x02dd: - ch = 0x1c; - break; - case 0x02db: - ch = 0x1d; - break; - case 0x02da: - ch = 0x1e; - break; - case 0x02dc: - ch = 0x1f; - break; - case 0x2022: - ch = 0x80; - break; - case 0x2020: - ch = 0x81; - break; - case 0x2021: - ch = 0x82; - break; - case 0x2026: - ch = 0x83; - break; - case 0x2014: - ch = 0x84; - break; - case 0x2013: - ch = 0x85; - break; - case 0x0192: - ch = 0x86; - break; - case 0x2044: - ch = 0x87; - break; - case 0x2039: - ch = 0x88; - break; - case 0x203a: - ch = 0x89; - break; - case 0x2212: - ch = 0x8a; - break; - case 0x2030: - ch = 0x8b; - break; - case 0x201e: - ch = 0x8c; - break; - case 0x201c: - ch = 0x8d; - break; - case 0x201d: - ch = 0x8e; - break; - case 0x2018: - ch = 0x8f; - break; - case 0x2019: - ch = 0x90; - break; - case 0x201a: - ch = 0x91; - break; - case 0x2122: - ch = 0x92; - break; - case 0xfb01: - ch = 0x93; - break; - case 0xfb02: - ch = 0x94; - break; - case 0x0141: - ch = 0x95; - break; - case 0x0152: - ch = 0x96; - break; - case 0x0160: - ch = 0x97; - break; - case 0x0178: - ch = 0x98; - break; - case 0x017d: - ch = 0x99; - break; - case 0x0131: - ch = 0x9a; - break; - case 0x0142: - ch = 0x9b; - break; - case 0x0153: - ch = 0x9c; - break; - case 0x0161: - ch = 0x9d; - break; - case 0x017e: - ch = 0x9e; - break; - case 0xfffd: - ch = 0x9f; - break; - case 0x20ac: - ch = 0xa0; - break; - default: - break; + auto i = unicode_to_pdf_doc.find(codepoint); + if (i != 
unicode_to_pdf_doc.end()) { + return i->second; } - return ch; + return '\0'; } unsigned long