2
1
mirror of https://github.com/qpdf/qpdf.git synced 2025-01-03 07:12:28 +00:00

Move utf8_to_utf16 into QUtil

This commit is contained in:
Jay Berkenbilt 2019-01-05 13:00:18 -05:00
parent ae18bfd142
commit 089ce5902e
4 changed files with 74 additions and 58 deletions

View File

@ -1,3 +1,7 @@
2019-01-05 Jay Berkenbilt <ejb@ql.org>
* Add method QUtil::utf8_to_utf16.
2019-01-04 Jay Berkenbilt <ejb@ql.org> 2019-01-04 Jay Berkenbilt <ejb@ql.org>
* Add new option --optimize-images, which recompresses every image * Add new option --optimize-images, which recompresses every image

View File

@ -152,8 +152,14 @@ namespace QUtil
QPDF_DLL QPDF_DLL
std::string toUTF16(unsigned long uval); std::string toUTF16(unsigned long uval);
// Convert a UTF-8 encoded string to ASCII by replacing all // Convert a UTF-8 encoded string to UTF-16. Unrepresentable code
// characters outside of ascii with the given unknown_char. // points are converted to U+FFFD.
QPDF_DLL
std::string utf8_to_utf16(std::string const& utf8);
// Convert a UTF-8 encoded string to the specified single-byte
// encoding system by replacing all unsupported characters with
// the given unknown_char.
QPDF_DLL QPDF_DLL
std::string utf8_to_ascii( std::string utf8_to_ascii(
std::string const& utf8, char unknown_char = '?'); std::string const& utf8, char unknown_char = '?');

View File

@ -64,65 +64,10 @@ QPDF_String::~QPDF_String()
{ {
} }
// Target encodings accepted by transcode_utf8. Only e_utf16 is defined.
enum encoding_e { e_utf16 };
// Transcode a UTF-8 byte string into a UTF-16 byte string.
//
// The result begins with the two bytes FE FF, followed by the output
// of QUtil::toUTF16 for each decoded code point. Input that cannot be
// decoded is replaced with the two bytes FF FD (U+FFFD) rather than
// being reported as an error.
//
// utf8_val: the input bytes to decode.
// encoding: requested target encoding; this body never reads it.
//
// NOTE(review): there is no lower-bound check on the decoded value, so
// overlong (non-shortest-form) sequences are accepted as written.
static
std::string
transcode_utf8(std::string const& utf8_val, encoding_e encoding)
{
// Every result starts with FE FF (the big-endian byte order mark).
std::string result = "\xfe\xff";
size_t len = utf8_val.length();
for (size_t i = 0; i < len; ++i)
{
unsigned char ch = static_cast<unsigned char>(utf8_val.at(i));
if (ch < 128)
{
// Single-byte (7-bit) value: the byte is the code point.
result += QUtil::toUTF16(ch);
}
else
{
// Count the 1 bits that follow the top bit of the lead byte. Each
// one stands for one continuation byte; to_clear accumulates the
// mask of marker bits to strip from the lead byte's payload.
size_t bytes_needed = 0;
unsigned bit_check = 0x40;
unsigned char to_clear = 0x80;
while (ch & bit_check)
{
++bytes_needed;
to_clear |= bit_check;
bit_check >>= 1;
}
// bytes_needed == 0 means a continuation byte showed up where a lead
// byte was expected; more than 5 means the lead byte was 0xFE or
// 0xFF. The last test rejects a sequence that would run past the
// end of the input.
if (((bytes_needed > 5) || (bytes_needed < 1)) ||
((i + bytes_needed) >= len))
{
result += "\xff\xfd";
}
else
{
// Start with the payload bits of the lead byte, then shift in six
// bits from each continuation byte.
unsigned long codepoint = (ch & ~to_clear);
while (bytes_needed > 0)
{
--bytes_needed;
ch = utf8_val.at(++i);
if ((ch & 0xc0) != 0x80)
{
// Not a continuation byte: step back so the outer loop examines
// this byte again as the start of a new sequence, and emit
// U+FFFD for the broken sequence.
--i;
codepoint = 0xfffd;
break;
}
codepoint <<= 6;
codepoint += (ch & 0x3f);
}
result += QUtil::toUTF16(codepoint);
}
}
}
return result;
}
QPDF_String* QPDF_String*
QPDF_String::new_utf16(std::string const& utf8_val) QPDF_String::new_utf16(std::string const& utf8_val)
{ {
return new QPDF_String(transcode_utf8(utf8_val, e_utf16)); return new QPDF_String(QUtil::utf8_to_utf16(utf8_val));
} }
std::string std::string

View File

@ -893,6 +893,67 @@ QUtil::parse_numrange(char const* range, int max)
return result; return result;
} }
// Target encodings that transcode_utf8 can produce. UTF-16 is the
// only one defined so far.
enum encoding_e { e_utf16 };

// Transcode a UTF-8 byte string into a UTF-16 byte string.
//
// The result begins with the bytes FE FF (big-endian byte order
// mark), followed by the output of QUtil::toUTF16 for each decoded
// code point. Malformed input is not treated as an error; each of the
// following is replaced with U+FFFD:
//   - a continuation byte where a lead byte is expected, or a lead
//     byte of 0xFE/0xFF;
//   - a multi-byte sequence that would run past the end of the input;
//   - a lead byte followed by a byte that is not a continuation byte
//     (decoding resumes at that byte);
//   - an overlong (non-shortest-form) sequence, e.g. C0 AF for '/'.
//
// utf8_val: the input bytes to decode.
// encoding: requested target encoding. Only e_utf16 exists, so the
//           value is not consulted yet.
static
std::string
transcode_utf8(std::string const& utf8_val, encoding_e encoding)
{
    // Only one encoding exists; avoid an unused-parameter warning.
    static_cast<void>(encoding);

    // min_codepoint[n] is the smallest code point whose shortest
    // UTF-8 form needs n continuation bytes. A smaller value decoded
    // from an n-continuation sequence is an overlong encoding.
    static unsigned long const min_codepoint[] = {
        0x0UL, 0x80UL, 0x800UL, 0x10000UL, 0x200000UL, 0x4000000UL
    };

    std::string result = "\xfe\xff";
    size_t len = utf8_val.length();
    for (size_t i = 0; i < len; ++i)
    {
        unsigned char ch = static_cast<unsigned char>(utf8_val.at(i));
        if (ch < 128)
        {
            // Single-byte (7-bit) value: the byte is the code point.
            result += QUtil::toUTF16(ch);
        }
        else
        {
            // Count the 1 bits that follow the top bit of the lead
            // byte. Each one stands for one continuation byte;
            // to_clear accumulates the marker bits to strip from the
            // lead byte's payload.
            size_t bytes_needed = 0;
            unsigned bit_check = 0x40;
            unsigned char to_clear = 0x80;
            while (ch & bit_check)
            {
                ++bytes_needed;
                to_clear |= bit_check;
                bit_check >>= 1;
            }
            if (((bytes_needed > 5) || (bytes_needed < 1)) ||
                ((i + bytes_needed) >= len))
            {
                // Stray continuation byte, impossible lead byte, or
                // truncated sequence.
                result += "\xff\xfd";
            }
            else
            {
                // The guard above ensures 1 <= sequence_length <= 5,
                // so it is a valid index into min_codepoint.
                size_t const sequence_length = bytes_needed;
                unsigned long codepoint = (ch & ~to_clear);
                bool well_formed = true;
                while (bytes_needed > 0)
                {
                    --bytes_needed;
                    ch = static_cast<unsigned char>(utf8_val.at(++i));
                    if ((ch & 0xc0) != 0x80)
                    {
                        // Not a continuation byte: step back so the
                        // outer loop examines it again as the start
                        // of a new sequence.
                        --i;
                        well_formed = false;
                        break;
                    }
                    codepoint <<= 6;
                    codepoint += (ch & 0x3f);
                }
                if ((! well_formed) ||
                    (codepoint < min_codepoint[sequence_length]))
                {
                    // Broken or overlong sequence: never let a
                    // disguised encoding of a smaller code point
                    // through.
                    codepoint = 0xfffd;
                }
                result += QUtil::toUTF16(codepoint);
            }
        }
    }
    return result;
}
// Convert a UTF-8 encoded string to UTF-16 by delegating to
// transcode_utf8 with the e_utf16 target; the returned string is
// whatever that helper produces.
std::string
QUtil::utf8_to_utf16(std::string const& utf8)
{
return transcode_utf8(utf8, e_utf16);
}
std::string std::string
QUtil::utf8_to_ascii(std::string const& utf8, char unknown_char) QUtil::utf8_to_ascii(std::string const& utf8, char unknown_char)
{ {