mirror of
https://github.com/qpdf/qpdf.git
synced 2024-12-22 02:49:00 +00:00
Handle odd PDFDoc codepoints in UTF-8 during transcoding (fixes #650)
There are codepoints in PDFDoc that are not valid UTF-8 but map to valid UTF-8. We were handling those correctly with bidirectional mapping. However, if those same codepoints appeared in UTF-8, where they have no meaning, they were left as fixed points when converting to PDFDoc, where they do have meaning. This change recognizes them as errors.
This commit is contained in:
parent
2b8d0f385b
commit
1065bbb016
@ -1,3 +1,9 @@
|
||||
2022-02-15 Jay Berkenbilt <ejb@ql.org>
|
||||
|
||||
* Don't map 0x18 through 0x1f, 0x7f, 0x9f, or 0xad as fixed points
|
||||
when transcoding UTF-8 to PDFDoc. These codepoints have different
|
||||
meanings in those two encoding systems. Fixes #650.
|
||||
|
||||
2022-02-11 Jay Berkenbilt <ejb@ql.org>
|
||||
|
||||
* 10.6.1: release
|
||||
|
@ -2272,6 +2272,16 @@ transcode_utf8(std::string const& utf8_val, std::string& result,
|
||||
{
|
||||
result += QUtil::toUTF16(QIntC::to_ulong(ch));
|
||||
}
|
||||
else if ((encoding == e_pdfdoc) &&
|
||||
(((ch >= 0x18) && (ch <= 0x1f)) || (ch == 127)))
|
||||
{
|
||||
// PDFDocEncoding maps some low characters to Unicode,
|
||||
// so if we encounter those invalid UTF-8 code points,
|
||||
// map them to unknown so reversing the mapping
|
||||
// doesn't change them into other characters.
|
||||
okay = false;
|
||||
result.append(1, unknown);
|
||||
}
|
||||
else
|
||||
{
|
||||
result.append(1, ch);
|
||||
@ -2281,6 +2291,13 @@ transcode_utf8(std::string const& utf8_val, std::string& result,
|
||||
{
|
||||
result += QUtil::toUTF16(codepoint);
|
||||
}
|
||||
else if ((codepoint == 0xad) && (encoding == e_pdfdoc))
|
||||
{
|
||||
// PDFDocEncoding omits 0x00ad (soft hyphen), but rather
|
||||
// than treating it as undefined, map it to a regular
|
||||
// hyphen.
|
||||
result.append(1, '-');
|
||||
}
|
||||
else if ((codepoint > 160) && (codepoint < 256) &&
|
||||
((encoding == e_winansi) || (encoding == e_pdfdoc)))
|
||||
{
|
||||
|
@ -88,7 +88,8 @@ alternatives
|
||||
2: 83a9e99e
|
||||
0: 717561636b
|
||||
done alternatives
|
||||
w˘wˇwˆw˙w˝w˛w˚w˜w<EFBFBD>w<EFBFBD>w
|
||||
w˘wˇwˆw˙w˝w˛w˚w˜w<EFBFBD>w<EFBFBD>w<EFBFBD>w
|
||||
w?w?w?w?w?w?w?w?w?w?w-w
|
||||
done other characters
|
||||
---- whoami
|
||||
quack1
|
||||
|
@ -418,9 +418,16 @@ void transcoding_test()
|
||||
print_alternatives(utf8);
|
||||
print_alternatives("quack");
|
||||
std::cout << "done alternatives" << std::endl;
|
||||
std::string other = QUtil::pdf_doc_to_utf8(
|
||||
"w\030w\031w\032w\033w\034w\035w\036w\037w\177w\255w");
|
||||
std::cout << other << std::endl;
|
||||
// These characters are either valid in PDFDoc and invalid in
|
||||
// UTF-8 or the other way around.
|
||||
std::string other("w\x18w\x19w\x1aw\x1bw\x1cw\x1dw\x1ew\x1fw\x7fw");
|
||||
std::string other_doc = other + "\x9fw\xadw";
|
||||
std::cout << QUtil::pdf_doc_to_utf8(other_doc) << std::endl;
|
||||
std::string other_utf8 =
|
||||
other + QUtil::toUTF8(0x9f) + "w" + QUtil::toUTF8(0xad) + "w";
|
||||
std::string other_to_utf8;
|
||||
assert(! QUtil::utf8_to_pdf_doc(other_utf8, other_to_utf8));
|
||||
std::cout << other_to_utf8 << std::endl;
|
||||
std::cout << "done other characters" << std::endl;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user