// Convert a UTF-8 encoded string to ASCII.
//
// Every byte below 0x80 is copied to the result unchanged. Every byte
// of the form 10xxxxxx (a UTF-8 continuation byte) is dropped. Any
// other byte with the high bit set is treated as the first byte of a
// multi-byte character and is replaced by a single unknown_char, so
// each well-formed non-ASCII character yields exactly one substitute.
//
// utf8: the input string, inspected one byte at a time
// unknown_char: the character emitted in place of each non-ASCII
//   character
//
// Returns the converted string, which is never longer than the input.
std::string
QUtil::utf8_to_ascii(std::string const& utf8, char unknown_char)
{
    std::string ascii_value;
    // Each input byte produces at most one output byte, so the input
    // length is an upper bound on the size of the result.
    ascii_value.reserve(utf8.length());
    for (std::string::const_iterator iter = utf8.begin();
         iter != utf8.end(); ++iter)
    {
        // Compare as unsigned so that bytes with the high bit set are
        // not seen as negative where plain char is signed.
        unsigned char const ch = static_cast<unsigned char>(*iter);
        if (ch < 128)
        {
            ascii_value.append(1, static_cast<char>(ch));
        }
        else if ((ch & 0xc0) == 0x80)
        {
            // Continuation byte: the substitute was already emitted
            // when the leading byte was seen, so emit nothing here.
        }
        else
        {
            ascii_value.append(1, unknown_char);
        }
    }
    return ascii_value;
}
fingers? +Does ? have fingers? +Does * have fingers? ---- whoami quack1 quack2 diff --git a/libtests/qutil.cc b/libtests/qutil.cc index 025f4e43..de51da58 100644 --- a/libtests/qutil.cc +++ b/libtests/qutil.cc @@ -220,6 +220,17 @@ void to_utf16_test() print_utf16(0x80000000UL); } +void utf8_to_ascii_test() +{ + char const* input = "Does \317\200 have fingers?"; + std::cout << input + << std::endl + << QUtil::utf8_to_ascii(input) + << std::endl + << QUtil::utf8_to_ascii(input, '*') + << std::endl; +} + void print_whoami(char const* str) { PointerHolder dup(true, QUtil::copy_string(str)); @@ -328,6 +339,8 @@ int main(int argc, char* argv[]) to_utf8_test(); std::cout << "---- utf16" << std::endl; to_utf16_test(); + std::cout << "---- utf8_to_ascii" << std::endl; + utf8_to_ascii_test(); std::cout << "---- whoami" << std::endl; get_whoami_test(); std::cout << "---- file" << std::endl;