diff --git a/ChangeLog b/ChangeLog
index 9a75f4ad..7ff658c7 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -14,6 +14,11 @@
 	the first bug in qpdf's history that could result in silent loss
 	of data when processing a correct input file. Fixes #276.
 
+2019-01-14  Jay Berkenbilt
+
+	* Add versions of utf8 to single-byte character transcoders that
+	return a success code.
+
 2019-01-13  Jay Berkenbilt
 
 	* Add several more string transcoding and analysis methods to
diff --git a/include/qpdf/QUtil.hh b/include/qpdf/QUtil.hh
index d9b0783e..5fe8e97c 100644
--- a/include/qpdf/QUtil.hh
+++ b/include/qpdf/QUtil.hh
@@ -178,6 +178,22 @@ namespace QUtil
     std::string utf8_to_pdf_doc(
         std::string const& utf8, char unknown_char = '?');
 
+    // These versions return true if the conversion was successful and
+    // false if any unrepresentable characters were found and had to
+    // be substituted with the unknown character.
+    QPDF_DLL
+    bool utf8_to_ascii(
+        std::string const& utf8, std::string& ascii, char unknown_char = '?');
+    QPDF_DLL
+    bool utf8_to_win_ansi(
+        std::string const& utf8, std::string& win, char unknown_char = '?');
+    QPDF_DLL
+    bool utf8_to_mac_roman(
+        std::string const& utf8, std::string& mac, char unknown_char = '?');
+    QPDF_DLL
+    bool utf8_to_pdf_doc(
+        std::string const& utf8, std::string& pdfdoc, char unknown_char = '?');
+
     // Convert a UTF-16 big-endian encoded string to UTF-8.
     // Unrepresentable code points are converted to U+FFFD.
     QPDF_DLL
diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc
index 19b6fdab..e645c4fc 100644
--- a/libqpdf/QUtil.cc
+++ b/libqpdf/QUtil.cc
@@ -1705,11 +1705,12 @@ unsigned long get_next_utf8_codepoint(
     return codepoint;
 }
 
-static std::string
-transcode_utf8(std::string const& utf8_val, encoding_e encoding,
-               char unknown)
+static bool
+transcode_utf8(std::string const& utf8_val, std::string& result,
+               encoding_e encoding, char unknown)
 {
-    std::string result;
+    bool okay = true;
+    result.clear();
     if (encoding == e_utf16)
     {
         result += "\xfe\xff";
@@ -1721,6 +1722,7 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding,
         unsigned long codepoint = get_next_utf8_codepoint(utf8_val, i, error);
         if (error)
         {
+            okay = false;
             if (encoding == e_utf16)
             {
                 result += "\xff\xfd";
@@ -1768,11 +1770,21 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding,
             }
             if (ch == '\0')
             {
+                okay = false;
                 ch = static_cast<unsigned char>(unknown);
             }
             result.append(1, ch);
         }
     }
+    return okay;
+}
+
+static std::string
+transcode_utf8(std::string const& utf8_val, encoding_e encoding,
+               char unknown)
+{
+    std::string result;
+    transcode_utf8(utf8_val, result, encoding, unknown);
     return result;
 }
 
@@ -1806,6 +1818,34 @@ QUtil::utf8_to_pdf_doc(std::string const& utf8, char unknown_char)
     return transcode_utf8(utf8, e_pdfdoc, unknown_char);
 }
 
+bool
+QUtil::utf8_to_ascii(std::string const& utf8, std::string& ascii,
+                     char unknown_char)
+{
+    return transcode_utf8(utf8, ascii, e_ascii, unknown_char);
+}
+
+bool
+QUtil::utf8_to_win_ansi(std::string const& utf8, std::string& win,
+                        char unknown_char)
+{
+    return transcode_utf8(utf8, win, e_winansi, unknown_char);
+}
+
+bool
+QUtil::utf8_to_mac_roman(std::string const& utf8, std::string& mac,
+                         char unknown_char)
+{
+    return transcode_utf8(utf8, mac, e_macroman, unknown_char);
+}
+
+bool
+QUtil::utf8_to_pdf_doc(std::string const& utf8, std::string& pdfdoc,
+                       char unknown_char)
+{
+    return transcode_utf8(utf8, pdfdoc, e_pdfdoc, unknown_char);
+}
+
 bool
 QUtil::is_utf16(std::string const& val)
 {
diff --git a/libtests/qutil.cc b/libtests/qutil.cc
index 91a656be..35877b9c 100644
--- a/libtests/qutil.cc
+++ b/libtests/qutil.cc
@@ -292,6 +292,22 @@ void transcoding_test()
     check_analyze("pi != 22/7", false, false, false);
     check_analyze(std::string("\xfe\xff\00\x51", 4), true, false, true);
     std::cout << "analysis done" << std::endl;
+    std::string input1("a\302\277b");
+    std::string input2("a\317\200b");
+    std::string input3("ab");
+    std::string output;
+    assert(! QUtil::utf8_to_ascii(input1, output));
+    assert(! QUtil::utf8_to_ascii(input2, output));
+    assert(QUtil::utf8_to_ascii(input3, output));
+    assert(QUtil::utf8_to_win_ansi(input1, output));
+    assert(! QUtil::utf8_to_win_ansi(input2, output));
+    assert(QUtil::utf8_to_win_ansi(input3, output));
+    assert(QUtil::utf8_to_mac_roman(input1, output));
+    assert(! QUtil::utf8_to_mac_roman(input2, output));
+    assert(QUtil::utf8_to_mac_roman(input3, output));
+    assert(QUtil::utf8_to_pdf_doc(input1, output));
+    assert(! QUtil::utf8_to_pdf_doc(input2, output));
+    assert(QUtil::utf8_to_pdf_doc(input3, output));
 }
 
 void print_whoami(char const* str)