mirror of
https://github.com/qpdf/qpdf.git
synced 2024-12-22 02:49:00 +00:00
Properly handle strings with PDF Doc Encoding (fixes #179)
The QPDF_String::getUTF8Val() method was not treating strings that weren't explicitly Unicode as PDF Doc Encoded. This only affects characters in the range 0x80 through 0xa0.
This commit is contained in:
parent
2780a1871d
commit
4bb3046f0b
@ -1,5 +1,8 @@
|
||||
2018-02-17 Jay Berkenbilt <ejb@ql.org>
|
||||
|
||||
* Fix QPDFObjectHandle::getUTF8Value() to properly handle strings
|
||||
that are encoded with PDF Doc Encoding. Fixes #179.
|
||||
|
||||
* Add qpdf_check_pdf to the "C" API. This method just attempts to
|
||||
read the entire file and produce no output, making it possible to
|
||||
assess whether the file has any errors that qpdf can detect.
|
||||
|
@ -48,4 +48,10 @@ $td->runtest("bookmarks deleted",
|
||||
$td->EXIT_STATUS => 0},
|
||||
$td->NORMALIZE_NEWLINES);
|
||||
|
||||
$td->report(10);
|
||||
$td->runtest("non-trivial pdf doc to unicode",
|
||||
{$td->COMMAND => "pdf-bookmarks issue-179.pdf"},
|
||||
{$td->FILE => "issue-179.out",
|
||||
$td->EXIT_STATUS => 0},
|
||||
$td->NORMALIZE_NEWLINES);
|
||||
|
||||
$td->report(11);
|
||||
|
12
examples/qtest/bookmarks/issue-179.out
Normal file
12
examples/qtest/bookmarks/issue-179.out
Normal file
@ -0,0 +1,12 @@
|
||||
ž
|
||||
žč
|
||||
žđ
|
||||
žć
|
||||
žš
|
||||
ž ajklyghvbnmxcseqwuioprtzdf
|
||||
š
|
||||
šč
|
||||
šđ
|
||||
šć
|
||||
šž
|
||||
š ajklyghvbnmxcseqwuioprtzdf
|
BIN
examples/qtest/bookmarks/issue-179.pdf
Normal file
BIN
examples/qtest/bookmarks/issue-179.pdf
Normal file
Binary file not shown.
@ -442,6 +442,13 @@ class QPDFObjectHandle
|
||||
// Methods for string objects
|
||||
QPDF_DLL
|
||||
std::string getStringValue();
|
||||
// If a string starts with the UTF-16 marker, it is converted from
|
||||
// UTF-16 to UTF-8. Otherwise, it is treated as a string encoded
|
||||
// with PDF Doc Encoding. PDF Doc Encoding is identical to
|
||||
// ISO-8859-1 except in the range from 0200 through 0240, where
|
||||
// there is a mapping of characters to Unicode. QPDF versions
|
||||
// prior to this fix erroneously left characters in that range
|
||||
// unmapped.
|
||||
QPDF_DLL
|
||||
std::string getUTF8Value();
|
||||
|
||||
|
@ -8,6 +8,43 @@
|
||||
// be used.
|
||||
#include <string.h>
|
||||
|
||||
// Unicode code points for the PDF Doc Encoding byte values 0x80 (128)
// through 0xa0 (160), inclusive. The first element corresponds to
// byte value 128, so look up a byte "ch" in that range with
// pdf_doc_to_unicode[ch - 128]. The table is read-only lookup data
// and is therefore declared const. The one byte in the range marked
// UNDEFINED (0x9f) is mapped to U+FFFD.
static unsigned short const pdf_doc_to_unicode[] = {
    0x2022,    // 0x80    BULLET
    0x2020,    // 0x81    DAGGER
    0x2021,    // 0x82    DOUBLE DAGGER
    0x2026,    // 0x83    HORIZONTAL ELLIPSIS
    0x2014,    // 0x84    EM DASH
    0x2013,    // 0x85    EN DASH
    0x0192,    // 0x86    SMALL LETTER F WITH HOOK
    0x2044,    // 0x87    FRACTION SLASH (solidus)
    0x2039,    // 0x88    SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    0x203a,    // 0x89    SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    0x2212,    // 0x8a    MINUS SIGN
    0x2030,    // 0x8b    PER MILLE SIGN
    0x201e,    // 0x8c    DOUBLE LOW-9 QUOTATION MARK (quotedblbase)
    0x201c,    // 0x8d    LEFT DOUBLE QUOTATION MARK (double quote left)
    0x201d,    // 0x8e    RIGHT DOUBLE QUOTATION MARK (quotedblright)
    0x2018,    // 0x8f    LEFT SINGLE QUOTATION MARK (quoteleft)
    0x2019,    // 0x90    RIGHT SINGLE QUOTATION MARK (quoteright)
    0x201a,    // 0x91    SINGLE LOW-9 QUOTATION MARK (quotesinglbase)
    0x2122,    // 0x92    TRADE MARK SIGN
    0xfb01,    // 0x93    LATIN SMALL LIGATURE FI
    0xfb02,    // 0x94    LATIN SMALL LIGATURE FL
    0x0141,    // 0x95    LATIN CAPITAL LETTER L WITH STROKE
    0x0152,    // 0x96    LATIN CAPITAL LIGATURE OE
    0x0160,    // 0x97    LATIN CAPITAL LETTER S WITH CARON
    0x0178,    // 0x98    LATIN CAPITAL LETTER Y WITH DIAERESIS
    0x017d,    // 0x99    LATIN CAPITAL LETTER Z WITH CARON
    0x0131,    // 0x9a    LATIN SMALL LETTER DOTLESS I
    0x0142,    // 0x9b    LATIN SMALL LETTER L WITH STROKE
    0x0153,    // 0x9c    LATIN SMALL LIGATURE OE
    0x0161,    // 0x9d    LATIN SMALL LETTER S WITH CARON
    0x017e,    // 0x9e    LATIN SMALL LETTER Z WITH CARON
    0xfffd,    // 0x9f    UNDEFINED
    0x20ac,    // 0xa0    EURO SIGN
};
|
||||
|
||||
// See above about ctype.
|
||||
static bool is_ascii_printable(unsigned char ch)
|
||||
{
|
||||
@ -209,7 +246,13 @@ QPDF_String::getUTF8Val() const
|
||||
{
|
||||
for (unsigned int i = 0; i < len; ++i)
|
||||
{
|
||||
result += QUtil::toUTF8(static_cast<unsigned char>(this->val.at(i)));
|
||||
unsigned char ch = static_cast<unsigned char>(this->val.at(i));
|
||||
unsigned short val = ch;
|
||||
if ((ch >= 128) && (ch <= 160))
|
||||
{
|
||||
val = pdf_doc_to_unicode[ch - 128];
|
||||
}
|
||||
result += QUtil::toUTF8(val);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
|
@ -1,4 +1,10 @@
|
||||
BINS_qpdf = qpdf test_driver pdf_from_scratch test_large_file test_tokenizer
|
||||
BINS_qpdf = \
|
||||
qpdf \
|
||||
pdf_from_scratch \
|
||||
test_driver \
|
||||
test_large_file \
|
||||
test_pdf_doc_encoding \
|
||||
test_tokenizer
|
||||
CBINS_qpdf = qpdf-ctest
|
||||
|
||||
TARGETS_qpdf = $(foreach B,$(BINS_qpdf) $(CBINS_qpdf),qpdf/$(OUTPUT_DIR)/$(call binname,$(B)))
|
||||
|
@ -82,6 +82,16 @@ compare_pdfs("p1-a.pdf", "p1-a-p2-b.pdf", 1);
|
||||
compare_pdfs("p1-a-p2-a.pdf", "p1-a-p2-b.pdf", 1);
|
||||
flush_tiff_cache();
|
||||
|
||||
show_ntests();
|
||||
# ----------
|
||||
$td->notify("--- PDF Doc Encoding ---");
|
||||
$n_tests += 1;
|
||||
|
||||
$td->runtest("PDF doc encoding to Unicode",
|
||||
{$td->COMMAND => "test_pdf_doc_encoding pdf-doc-to-utf8.in"},
|
||||
{$td->FILE => "pdf-doc-to-utf8.out", $td->EXIT_STATUS => 0},
|
||||
$td->NORMALIZE_NEWLINES);
|
||||
|
||||
show_ntests();
|
||||
# ----------
|
||||
$td->notify("--- Stream Replacement Tests ---");
|
||||
|
33
qpdf/qtest/qpdf/pdf-doc-to-utf8.in
Normal file
33
qpdf/qtest/qpdf/pdf-doc-to-utf8.in
Normal file
@ -0,0 +1,33 @@
|
||||
€ 128 0x80 0200 U+2022 BULLET
|
||||
<EFBFBD> 129 0x81 0201 U+2020 DAGGER
|
||||
‚ 130 0x82 0202 U+2021 DOUBLE DAGGER
|
||||
ƒ 131 0x83 0203 U+2026 HORIZONTAL ELLIPSIS
|
||||
„ 132 0x84 0204 U+2014 EM DASH
|
||||
… 133 0x85 0205 U+2013 EN DASH
|
||||
† 134 0x86 0206 U+0192 SMALL LETTER F WITH HOOK
|
||||
‡ 135 0x87 0207 U+2044 FRACTION SLASH (solidus)
|
||||
ˆ 136 0x88 0210 U+2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
|
||||
‰ 137 0x89 0211 U+203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
|
||||
Š 138 0x8a 0212 U+2212 MINUS SIGN
|
||||
‹ 139 0x8b 0213 U+2030 PER MILLE SIGN
|
||||
Œ 140 0x8c 0214 U+201E DOUBLE LOW-9 QUOTATION MARK (quotedblbase)
|
||||
<EFBFBD> 141 0x8d 0215 U+201C LEFT DOUBLE QUOTATION MARK (double quote left)
|
||||
Ž 142 0x8e 0216 U+201D RIGHT DOUBLE QUOTATION MARK (quotedblright)
|
||||
<EFBFBD> 143 0x8f 0217 U+2018 LEFT SINGLE QUOTATION MARK (quoteleft)
|
||||
<EFBFBD> 144 0x90 0220 U+2019 RIGHT SINGLE QUOTATION MARK (quoteright)
|
||||
‘ 145 0x91 0221 U+201A SINGLE LOW-9 QUOTATION MARK (quotesinglbase)
|
||||
’ 146 0x92 0222 U+2122 TRADE MARK SIGN
|
||||
“ 147 0x93 0223 U+FB01 LATIN SMALL LIGATURE FI
|
||||
” 148 0x94 0224 U+FB02 LATIN SMALL LIGATURE FL
|
||||
• 149 0x95 0225 U+0141 LATIN CAPITAL LETTER L WITH STROKE
|
||||
– 150 0x96 0226 U+0152 LATIN CAPITAL LIGATURE OE
|
||||
— 151 0x97 0227 U+0160 LATIN CAPITAL LETTER S WITH CARON
|
||||
˜ 152 0x98 0230 U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
|
||||
™ 153 0x99 0231 U+017D LATIN CAPITAL LETTER Z WITH CARON
|
||||
š 154 0x9a 0232 U+0131 LATIN SMALL LETTER DOTLESS I
|
||||
› 155 0x9b 0233 U+0142 LATIN SMALL LETTER L WITH STROKE
|
||||
œ 156 0x9c 0234 U+0153 LATIN SMALL LIGATURE OE
|
||||
<EFBFBD> 157 0x9d 0235 U+0161 LATIN SMALL LETTER S WITH CARON
|
||||
ž 158 0x9e 0236 U+017E LATIN SMALL LETTER Z WITH CARON
|
||||
Ÿ 159 0x9f 0237 U+FFFD UNDEFINED
|
||||
160 0xa0 0240 U+20AC EURO SIGN
|
33
qpdf/qtest/qpdf/pdf-doc-to-utf8.out
Normal file
33
qpdf/qtest/qpdf/pdf-doc-to-utf8.out
Normal file
@ -0,0 +1,33 @@
|
||||
• 128 0x80 0200 U+2022 BULLET
|
||||
† 129 0x81 0201 U+2020 DAGGER
|
||||
‡ 130 0x82 0202 U+2021 DOUBLE DAGGER
|
||||
… 131 0x83 0203 U+2026 HORIZONTAL ELLIPSIS
|
||||
— 132 0x84 0204 U+2014 EM DASH
|
||||
– 133 0x85 0205 U+2013 EN DASH
|
||||
ƒ 134 0x86 0206 U+0192 SMALL LETTER F WITH HOOK
|
||||
⁄ 135 0x87 0207 U+2044 FRACTION SLASH (solidus)
|
||||
‹ 136 0x88 0210 U+2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
|
||||
› 137 0x89 0211 U+203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
|
||||
− 138 0x8a 0212 U+2212 MINUS SIGN
|
||||
‰ 139 0x8b 0213 U+2030 PER MILLE SIGN
|
||||
„ 140 0x8c 0214 U+201E DOUBLE LOW-9 QUOTATION MARK (quotedblbase)
|
||||
“ 141 0x8d 0215 U+201C LEFT DOUBLE QUOTATION MARK (double quote left)
|
||||
” 142 0x8e 0216 U+201D RIGHT DOUBLE QUOTATION MARK (quotedblright)
|
||||
‘ 143 0x8f 0217 U+2018 LEFT SINGLE QUOTATION MARK (quoteleft)
|
||||
’ 144 0x90 0220 U+2019 RIGHT SINGLE QUOTATION MARK (quoteright)
|
||||
‚ 145 0x91 0221 U+201A SINGLE LOW-9 QUOTATION MARK (quotesinglbase)
|
||||
™ 146 0x92 0222 U+2122 TRADE MARK SIGN
|
||||
fi 147 0x93 0223 U+FB01 LATIN SMALL LIGATURE FI
|
||||
fl 148 0x94 0224 U+FB02 LATIN SMALL LIGATURE FL
|
||||
Ł 149 0x95 0225 U+0141 LATIN CAPITAL LETTER L WITH STROKE
|
||||
Œ 150 0x96 0226 U+0152 LATIN CAPITAL LIGATURE OE
|
||||
Š 151 0x97 0227 U+0160 LATIN CAPITAL LETTER S WITH CARON
|
||||
Ÿ 152 0x98 0230 U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
|
||||
Ž 153 0x99 0231 U+017D LATIN CAPITAL LETTER Z WITH CARON
|
||||
ı 154 0x9a 0232 U+0131 LATIN SMALL LETTER DOTLESS I
|
||||
ł 155 0x9b 0233 U+0142 LATIN SMALL LETTER L WITH STROKE
|
||||
œ 156 0x9c 0234 U+0153 LATIN SMALL LIGATURE OE
|
||||
š 157 0x9d 0235 U+0161 LATIN SMALL LETTER S WITH CARON
|
||||
ž 158 0x9e 0236 U+017E LATIN SMALL LETTER Z WITH CARON
|
||||
<EFBFBD> 159 0x9f 0237 U+FFFD UNDEFINED
|
||||
€ 160 0xa0 0240 U+20AC EURO SIGN
|
45
qpdf/test_pdf_doc_encoding.cc
Normal file
45
qpdf/test_pdf_doc_encoding.cc
Normal file
@ -0,0 +1,45 @@
|
||||
#include <qpdf/QUtil.hh>
|
||||
#include <qpdf/QPDFObjectHandle.hh>
|
||||
#include <iostream>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
// Name of the running program as shown in the usage message. It is
// null until main() points it into argv[0].
static char const* whoami = 0;
|
||||
|
||||
void usage()
|
||||
{
|
||||
std::cerr << "Usage: " << whoami << " infile" << std::endl;
|
||||
exit(2);
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
if ((whoami = strrchr(argv[0], '/')) == NULL)
|
||||
{
|
||||
whoami = argv[0];
|
||||
}
|
||||
else
|
||||
{
|
||||
++whoami;
|
||||
}
|
||||
// For libtool's sake....
|
||||
if (strncmp(whoami, "lt-", 3) == 0)
|
||||
{
|
||||
whoami += 3;
|
||||
}
|
||||
|
||||
if (argc != 2)
|
||||
{
|
||||
usage();
|
||||
}
|
||||
char const* infilename = argv[1];
|
||||
std::list<std::string> lines =
|
||||
QUtil::read_lines_from_file(infilename);
|
||||
for (std::list<std::string>::iterator iter = lines.begin();
|
||||
iter != lines.end(); ++iter)
|
||||
{
|
||||
QPDFObjectHandle str = QPDFObjectHandle::newString(*iter);
|
||||
std::cout << str.getUTF8Value() << std::endl;
|
||||
}
|
||||
return 0;
|
||||
}
|
Loading…
Reference in New Issue
Block a user