Properly handle strings with PDF Doc Encoding (fixes #179)

The QPDF_String::getUTF8Val() method was not treating strings that
weren't explicitly Unicode as PDF Doc Encoded. This only affects
characters in the range 0x80 through 0xa0.
This commit is contained in:
Jay Berkenbilt 2018-02-17 18:47:57 -05:00
parent 2780a1871d
commit 4bb3046f0b
11 changed files with 201 additions and 3 deletions

View File

@ -1,5 +1,8 @@
2018-02-17 Jay Berkenbilt <ejb@ql.org>
* Fix QPDFObjectHandle::getUTF8Value() to properly handle strings
that are encoded with PDF Doc Encoding. Fixes #179.
* Add qpdf_check_pdf to the "C" API. This method just attempts to
read the entire file and produce no output, making it possible to
assess whether the file has any errors that qpdf can detect.

View File

@ -48,4 +48,10 @@ $td->runtest("bookmarks deleted",
$td->EXIT_STATUS => 0},
$td->NORMALIZE_NEWLINES);
$td->report(10);
# Regression test for issue 179: run pdf-bookmarks on issue-179.pdf and
# require exit status 0 with output matching issue-179.out. Newlines are
# normalized before the comparison.
$td->runtest("non-trivial pdf doc to unicode",
{$td->COMMAND => "pdf-bookmarks issue-179.pdf"},
{$td->FILE => "issue-179.out",
$td->EXIT_STATUS => 0},
$td->NORMALIZE_NEWLINES);
# NOTE(review): presumably the total number of tests this script is
# expected to have run -- confirm against the test driver.
$td->report(11);

View File

@ -0,0 +1,12 @@
ž
žč
žđ
žć
žš
ž ajklyghvbnmxcseqwuioprtzdf
š
šč
šđ
šć
šž
š ajklyghvbnmxcseqwuioprtzdf

Binary file not shown.

View File

@ -442,6 +442,13 @@ class QPDFObjectHandle
// Methods for string objects
QPDF_DLL
std::string getStringValue();
// If a string starts with the UTF-16 marker, it is converted from
// UTF-16 to UTF-8. Otherwise, it is treated as a string encoded
// with PDF Doc Encoding. PDF Doc Encoding is identical to
// ISO-8859-1 except in the range from 0200 through 0240, where
// there is a mapping of characters to Unicode. QPDF versions
// prior to version 8.0.0 erroneously left characters in that range
// unmapped.
QPDF_DLL
std::string getUTF8Value();

View File

@ -8,6 +8,43 @@
// be used.
#include <string.h>
// Unicode code points for the PDF Doc Encoding bytes 0x80 through 0xa0,
// the range in which PDF Doc Encoding is remapped rather than passed
// through unchanged. Index the table with (byte - 128): the first element
// corresponds to byte 128 (0x80) and the last to byte 160 (0xa0). Byte
// 0x9f has no assigned character and maps to U+FFFD. The table is only
// ever read, so it is declared const.
static unsigned short const pdf_doc_to_unicode[] = {
    0x2022,    // 0x80    BULLET
    0x2020,    // 0x81    DAGGER
    0x2021,    // 0x82    DOUBLE DAGGER
    0x2026,    // 0x83    HORIZONTAL ELLIPSIS
    0x2014,    // 0x84    EM DASH
    0x2013,    // 0x85    EN DASH
    0x0192,    // 0x86    SMALL LETTER F WITH HOOK
    0x2044,    // 0x87    FRACTION SLASH (solidus)
    0x2039,    // 0x88    SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    0x203a,    // 0x89    SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    0x2212,    // 0x8a    MINUS SIGN
    0x2030,    // 0x8b    PER MILLE SIGN
    0x201e,    // 0x8c    DOUBLE LOW-9 QUOTATION MARK (quotedblbase)
    0x201c,    // 0x8d    LEFT DOUBLE QUOTATION MARK (double quote left)
    0x201d,    // 0x8e    RIGHT DOUBLE QUOTATION MARK (quotedblright)
    0x2018,    // 0x8f    LEFT SINGLE QUOTATION MARK (quoteleft)
    0x2019,    // 0x90    RIGHT SINGLE QUOTATION MARK (quoteright)
    0x201a,    // 0x91    SINGLE LOW-9 QUOTATION MARK (quotesinglbase)
    0x2122,    // 0x92    TRADE MARK SIGN
    0xfb01,    // 0x93    LATIN SMALL LIGATURE FI
    0xfb02,    // 0x94    LATIN SMALL LIGATURE FL
    0x0141,    // 0x95    LATIN CAPITAL LETTER L WITH STROKE
    0x0152,    // 0x96    LATIN CAPITAL LIGATURE OE
    0x0160,    // 0x97    LATIN CAPITAL LETTER S WITH CARON
    0x0178,    // 0x98    LATIN CAPITAL LETTER Y WITH DIAERESIS
    0x017d,    // 0x99    LATIN CAPITAL LETTER Z WITH CARON
    0x0131,    // 0x9a    LATIN SMALL LETTER DOTLESS I
    0x0142,    // 0x9b    LATIN SMALL LETTER L WITH STROKE
    0x0153,    // 0x9c    LATIN SMALL LIGATURE OE
    0x0161,    // 0x9d    LATIN SMALL LETTER S WITH CARON
    0x017e,    // 0x9e    LATIN SMALL LETTER Z WITH CARON
    0xfffd,    // 0x9f    UNDEFINED
    0x20ac,    // 0xa0    EURO SIGN
};
// See above about ctype.
static bool is_ascii_printable(unsigned char ch)
{
@ -209,7 +246,13 @@ QPDF_String::getUTF8Val() const
{
for (unsigned int i = 0; i < len; ++i)
{
result += QUtil::toUTF8(static_cast<unsigned char>(this->val.at(i)));
unsigned char ch = static_cast<unsigned char>(this->val.at(i));
unsigned short val = ch;
if ((ch >= 128) && (ch <= 160))
{
val = pdf_doc_to_unicode[ch - 128];
}
result += QUtil::toUTF8(val);
}
}
return result;

View File

@ -1,4 +1,10 @@
BINS_qpdf = qpdf test_driver pdf_from_scratch test_large_file test_tokenizer
BINS_qpdf = \
qpdf \
pdf_from_scratch \
test_driver \
test_large_file \
test_pdf_doc_encoding \
test_tokenizer
CBINS_qpdf = qpdf-ctest
TARGETS_qpdf = $(foreach B,$(BINS_qpdf) $(CBINS_qpdf),qpdf/$(OUTPUT_DIR)/$(call binname,$(B)))

View File

@ -82,6 +82,16 @@ compare_pdfs("p1-a.pdf", "p1-a-p2-b.pdf", 1);
compare_pdfs("p1-a-p2-a.pdf", "p1-a-p2-b.pdf", 1);
flush_tiff_cache();
show_ntests();
# ----------
# Announce the PDF Doc Encoding section of the test run.
$td->notify("--- PDF Doc Encoding ---");
# This section contributes one test to the running total.
$n_tests += 1;
# Run test_pdf_doc_encoding on pdf-doc-to-utf8.in; it must exit with
# status 0 and its output must match pdf-doc-to-utf8.out. Newlines are
# normalized before the comparison.
$td->runtest("PDF doc encoding to Unicode",
{$td->COMMAND => "test_pdf_doc_encoding pdf-doc-to-utf8.in"},
{$td->FILE => "pdf-doc-to-utf8.out", $td->EXIT_STATUS => 0},
$td->NORMALIZE_NEWLINES);
show_ntests();
# ----------
$td->notify("--- Stream Replacement Tests ---");

View File

@ -0,0 +1,33 @@
€ 128 0x80 0200 U+2022 BULLET
<EFBFBD> 129 0x81 0201 U+2020 DAGGER
130 0x82 0202 U+2021 DOUBLE DAGGER
ƒ 131 0x83 0203 U+2026 HORIZONTAL ELLIPSIS
„ 132 0x84 0204 U+2014 EM DASH
… 133 0x85 0205 U+2013 EN DASH
† 134 0x86 0206 U+0192 SMALL LETTER F WITH HOOK
‡ 135 0x87 0207 U+2044 FRACTION SLASH (solidus)
ˆ 136 0x88 0210 U+2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
‰ 137 0x89 0211 U+203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
Š 138 0x8a 0212 U+2212 MINUS SIGN
139 0x8b 0213 U+2030 PER MILLE SIGN
Π140 0x8c 0214 U+201E DOUBLE LOW-9 QUOTATION MARK (quotedblbase)
<EFBFBD> 141 0x8d 0215 U+201C LEFT DOUBLE QUOTATION MARK (double quote left)
Ž 142 0x8e 0216 U+201D RIGHT DOUBLE QUOTATION MARK (quotedblright)
<EFBFBD> 143 0x8f 0217 U+2018 LEFT SINGLE QUOTATION MARK (quoteleft)
<EFBFBD> 144 0x90 0220 U+2019 RIGHT SINGLE QUOTATION MARK (quoteright)
145 0x91 0221 U+201A SINGLE LOW-9 QUOTATION MARK (quotesinglbase)
146 0x92 0222 U+2122 TRADE MARK SIGN
“ 147 0x93 0223 U+FB01 LATIN SMALL LIGATURE FI
” 148 0x94 0224 U+FB02 LATIN SMALL LIGATURE FL
• 149 0x95 0225 U+0141 LATIN CAPITAL LETTER L WITH STROKE
150 0x96 0226 U+0152 LATIN CAPITAL LIGATURE OE
— 151 0x97 0227 U+0160 LATIN CAPITAL LETTER S WITH CARON
˜ 152 0x98 0230 U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
™ 153 0x99 0231 U+017D LATIN CAPITAL LETTER Z WITH CARON
š 154 0x9a 0232 U+0131 LATIN SMALL LETTER DOTLESS I
155 0x9b 0233 U+0142 LATIN SMALL LETTER L WITH STROKE
œ 156 0x9c 0234 U+0153 LATIN SMALL LIGATURE OE
<EFBFBD> 157 0x9d 0235 U+0161 LATIN SMALL LETTER S WITH CARON
ž 158 0x9e 0236 U+017E LATIN SMALL LETTER Z WITH CARON
Ÿ 159 0x9f 0237 U+FFFD UNDEFINED
  160 0xa0 0240 U+20AC EURO SIGN

View File

@ -0,0 +1,33 @@
• 128 0x80 0200 U+2022 BULLET
† 129 0x81 0201 U+2020 DAGGER
‡ 130 0x82 0202 U+2021 DOUBLE DAGGER
… 131 0x83 0203 U+2026 HORIZONTAL ELLIPSIS
— 132 0x84 0204 U+2014 EM DASH
133 0x85 0205 U+2013 EN DASH
ƒ 134 0x86 0206 U+0192 SMALL LETTER F WITH HOOK
135 0x87 0207 U+2044 FRACTION SLASH (solidus)
136 0x88 0210 U+2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
137 0x89 0211 U+203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
138 0x8a 0212 U+2212 MINUS SIGN
‰ 139 0x8b 0213 U+2030 PER MILLE SIGN
„ 140 0x8c 0214 U+201E DOUBLE LOW-9 QUOTATION MARK (quotedblbase)
“ 141 0x8d 0215 U+201C LEFT DOUBLE QUOTATION MARK (double quote left)
” 142 0x8e 0216 U+201D RIGHT DOUBLE QUOTATION MARK (quotedblright)
143 0x8f 0217 U+2018 LEFT SINGLE QUOTATION MARK (quoteleft)
144 0x90 0220 U+2019 RIGHT SINGLE QUOTATION MARK (quoteright)
145 0x91 0221 U+201A SINGLE LOW-9 QUOTATION MARK (quotesinglbase)
™ 146 0x92 0222 U+2122 TRADE MARK SIGN
fi 147 0x93 0223 U+FB01 LATIN SMALL LIGATURE FI
fl 148 0x94 0224 U+FB02 LATIN SMALL LIGATURE FL
Ł 149 0x95 0225 U+0141 LATIN CAPITAL LETTER L WITH STROKE
Π150 0x96 0226 U+0152 LATIN CAPITAL LIGATURE OE
Š 151 0x97 0227 U+0160 LATIN CAPITAL LETTER S WITH CARON
Ÿ 152 0x98 0230 U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
Ž 153 0x99 0231 U+017D LATIN CAPITAL LETTER Z WITH CARON
ı 154 0x9a 0232 U+0131 LATIN SMALL LETTER DOTLESS I
ł 155 0x9b 0233 U+0142 LATIN SMALL LETTER L WITH STROKE
œ 156 0x9c 0234 U+0153 LATIN SMALL LIGATURE OE
š 157 0x9d 0235 U+0161 LATIN SMALL LETTER S WITH CARON
ž 158 0x9e 0236 U+017E LATIN SMALL LETTER Z WITH CARON
<EFBFBD> 159 0x9f 0237 U+FFFD UNDEFINED
€ 160 0xa0 0240 U+20AC EURO SIGN

View File

@ -0,0 +1,45 @@
#include <qpdf/QUtil.hh>
#include <qpdf/QPDFObjectHandle.hh>
#include <iostream>
#include <stdlib.h>
#include <string.h>
static char const* whoami = 0;
// Write a one-line usage summary, prefixed with the program name held in
// whoami, to standard error and terminate the process with exit status 2.
void usage()
{
    std::cerr << "Usage: ";
    std::cerr << whoami;
    std::cerr << " infile" << std::endl;
    exit(2);
}
int main(int argc, char* argv[])
{
if ((whoami = strrchr(argv[0], '/')) == NULL)
{
whoami = argv[0];
}
else
{
++whoami;
}
// For libtool's sake....
if (strncmp(whoami, "lt-", 3) == 0)
{
whoami += 3;
}
if (argc != 2)
{
usage();
}
char const* infilename = argv[1];
std::list<std::string> lines =
QUtil::read_lines_from_file(infilename);
for (std::list<std::string>::iterator iter = lines.begin();
iter != lines.end(); ++iter)
{
QPDFObjectHandle str = QPDFObjectHandle::newString(*iter);
std::cout << str.getUTF8Value() << std::endl;
}
return 0;
}