Handle odd PDFDoc codepoints in UTF-8 during transcoding (fixes #650)

There are codepoints in PDFDoc that are not valid UTF-8 but map to
valid UTF-8. We were handling those correctly with bidirectional
mapping.

However, if those same code points appeared in UTF-8, where they have
no meaning, they were left as fixed points when converting to PDFDoc,
where they do have meaning. This change recognizes them as errors.
This commit is contained in:
Jay Berkenbilt 2022-02-15 08:29:29 -05:00
parent 2b8d0f385b
commit 1065bbb016
4 changed files with 35 additions and 4 deletions

View File

@ -1,3 +1,9 @@
2022-02-15 Jay Berkenbilt <ejb@ql.org>
* Don't map 0x18 through 0x1f, 0x7f, 0x9f, or 0xad as fixed points
when transcoding UTF-8 to PDFDoc. These codepoints have different
meanings in those two encoding systems. Fixes #650.
2022-02-11 Jay Berkenbilt <ejb@ql.org>
* 10.6.1: release

View File

@ -2272,6 +2272,16 @@ transcode_utf8(std::string const& utf8_val, std::string& result,
{
result += QUtil::toUTF16(QIntC::to_ulong(ch));
}
else if ((encoding == e_pdfdoc) &&
(((ch >= 0x18) && (ch <= 0x1f)) || (ch == 127)))
{
// PDFDocEncoding maps some low characters to Unicode,
// so if we encounter those invalid UTF-8 code points,
// map them to unknown so reversing the mapping
// doesn't change them into other characters.
okay = false;
result.append(1, unknown);
}
else
{
result.append(1, ch);
@ -2281,6 +2291,13 @@ transcode_utf8(std::string const& utf8_val, std::string& result,
{
result += QUtil::toUTF16(codepoint);
}
else if ((codepoint == 0xad) && (encoding == e_pdfdoc))
{
// PDFDocEncoding omits 0x00ad (soft hyphen), but rather
// than treating it as undefined, map it to a regular
// hyphen.
result.append(1, '-');
}
else if ((codepoint > 160) && (codepoint < 256) &&
((encoding == e_winansi) || (encoding == e_pdfdoc)))
{

View File

@ -88,7 +88,8 @@ alternatives
2: 83a9e99e
0: 717561636b
done alternatives
w˘wˇwˆw˙w˝w˛w˚w˜w<EFBFBD>w<EFBFBD>w
w˘wˇwˆw˙w˝w˛w˚w˜w<EFBFBD>w<EFBFBD>w<EFBFBD>w
w?w?w?w?w?w?w?w?w?w?w-w
done other characters
---- whoami
quack1

View File

@ -418,9 +418,16 @@ void transcoding_test()
print_alternatives(utf8);
print_alternatives("quack");
std::cout << "done alternatives" << std::endl;
std::string other = QUtil::pdf_doc_to_utf8(
"w\030w\031w\032w\033w\034w\035w\036w\037w\177w\255w");
std::cout << other << std::endl;
// These characters are either valid in PDFDoc and invalid in
// UTF-8 or the other way around.
std::string other("w\x18w\x19w\x1aw\x1bw\x1cw\x1dw\x1ew\x1fw\x7fw");
std::string other_doc = other + "\x9fw\xadw";
std::cout << QUtil::pdf_doc_to_utf8(other_doc) << std::endl;
std::string other_utf8 =
other + QUtil::toUTF8(0x9f) + "w" + QUtil::toUTF8(0xad) + "w";
std::string other_to_utf8;
assert(! QUtil::utf8_to_pdf_doc(other_utf8, other_to_utf8));
std::cout << other_to_utf8 << std::endl;
std::cout << "done other characters" << std::endl;
}