mirror of
https://github.com/qpdf/qpdf.git
synced 2024-12-22 10:58:58 +00:00
Properly handle strings with PDF Doc Encoding (fixes #179)
The QPDF_String::getUTF8Val() method was not treating strings that weren't explicitly Unicode as PDF Doc Encoded. This only affects characters in the range 0x80 through 0xa0.
This commit is contained in:
parent
2780a1871d
commit
4bb3046f0b
@ -1,5 +1,8 @@
|
|||||||
2018-02-17 Jay Berkenbilt <ejb@ql.org>
|
2018-02-17 Jay Berkenbilt <ejb@ql.org>
|
||||||
|
|
||||||
|
* Fix QPDFObjectHandle::getUTF8Value() to properly handle strings
|
||||||
|
that are encoded with PDF Doc Encoding. Fixes #179.
|
||||||
|
|
||||||
* Add qpdf_check_pdf to the "C" API. This method just attempts to
|
* Add qpdf_check_pdf to the "C" API. This method just attempts to
|
||||||
read the entire file and produce no output, making it possible to
|
read the entire file and produce no output, making it possible to
|
||||||
assess whether the file has any errors that qpdf can detect.
|
assess whether the file has any errors that qpdf can detect.
|
||||||
|
@ -48,4 +48,10 @@ $td->runtest("bookmarks deleted",
|
|||||||
$td->EXIT_STATUS => 0},
|
$td->EXIT_STATUS => 0},
|
||||||
$td->NORMALIZE_NEWLINES);
|
$td->NORMALIZE_NEWLINES);
|
||||||
|
|
||||||
$td->report(10);
|
$td->runtest("non-trivial pdf doc to unicode",
|
||||||
|
{$td->COMMAND => "pdf-bookmarks issue-179.pdf"},
|
||||||
|
{$td->FILE => "issue-179.out",
|
||||||
|
$td->EXIT_STATUS => 0},
|
||||||
|
$td->NORMALIZE_NEWLINES);
|
||||||
|
|
||||||
|
$td->report(11);
|
||||||
|
12
examples/qtest/bookmarks/issue-179.out
Normal file
12
examples/qtest/bookmarks/issue-179.out
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
ž
|
||||||
|
žč
|
||||||
|
žđ
|
||||||
|
žć
|
||||||
|
žš
|
||||||
|
ž ajklyghvbnmxcseqwuioprtzdf
|
||||||
|
š
|
||||||
|
šč
|
||||||
|
šđ
|
||||||
|
šć
|
||||||
|
šž
|
||||||
|
š ajklyghvbnmxcseqwuioprtzdf
|
BIN
examples/qtest/bookmarks/issue-179.pdf
Normal file
BIN
examples/qtest/bookmarks/issue-179.pdf
Normal file
Binary file not shown.
@ -442,6 +442,13 @@ class QPDFObjectHandle
|
|||||||
// Methods for string objects
|
// Methods for string objects
|
||||||
QPDF_DLL
|
QPDF_DLL
|
||||||
std::string getStringValue();
|
std::string getStringValue();
|
||||||
|
// If a string starts with the UTF-16 marker, it is converted from
|
||||||
|
// UTF-16 to UTF-8. Otherwise, it is treated as a string encoded
|
||||||
|
// with PDF Doc Encoding. PDF Doc Encoding is identical to
|
||||||
|
// ISO-8859-1 except in the range from 0200 through 0240, where
|
||||||
|
// there is a mapping of characters to Unicode. QPDF versions
|
||||||
|
// prior to this fix erroneously left characters in that range
|
||||||
|
// unmapped.
|
||||||
QPDF_DLL
|
QPDF_DLL
|
||||||
std::string getUTF8Value();
|
std::string getUTF8Value();
|
||||||
|
|
||||||
|
@ -8,6 +8,43 @@
|
|||||||
// be used.
|
// be used.
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
|
// Maps PDF Doc Encoding bytes 0x80 (128) through 0xa0 (160) to their
// Unicode code points. The table is indexed by (byte - 128), so the
// first element corresponds to byte 128 and the table has 33 entries.
// Byte 0x9f has no assigned character and maps to U+FFFD (the Unicode
// replacement character). The table is read-only lookup data, so it
// is declared const.
static unsigned short const pdf_doc_to_unicode[] = {
    0x2022, // 0x80 BULLET
    0x2020, // 0x81 DAGGER
    0x2021, // 0x82 DOUBLE DAGGER
    0x2026, // 0x83 HORIZONTAL ELLIPSIS
    0x2014, // 0x84 EM DASH
    0x2013, // 0x85 EN DASH
    0x0192, // 0x86 SMALL LETTER F WITH HOOK
    0x2044, // 0x87 FRACTION SLASH (solidus)
    0x2039, // 0x88 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    0x203a, // 0x89 SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    0x2212, // 0x8a MINUS SIGN
    0x2030, // 0x8b PER MILLE SIGN
    0x201e, // 0x8c DOUBLE LOW-9 QUOTATION MARK (quotedblbase)
    0x201c, // 0x8d LEFT DOUBLE QUOTATION MARK (double quote left)
    0x201d, // 0x8e RIGHT DOUBLE QUOTATION MARK (quotedblright)
    0x2018, // 0x8f LEFT SINGLE QUOTATION MARK (quoteleft)
    0x2019, // 0x90 RIGHT SINGLE QUOTATION MARK (quoteright)
    0x201a, // 0x91 SINGLE LOW-9 QUOTATION MARK (quotesinglbase)
    0x2122, // 0x92 TRADE MARK SIGN
    0xfb01, // 0x93 LATIN SMALL LIGATURE FI
    0xfb02, // 0x94 LATIN SMALL LIGATURE FL
    0x0141, // 0x95 LATIN CAPITAL LETTER L WITH STROKE
    0x0152, // 0x96 LATIN CAPITAL LIGATURE OE
    0x0160, // 0x97 LATIN CAPITAL LETTER S WITH CARON
    0x0178, // 0x98 LATIN CAPITAL LETTER Y WITH DIAERESIS
    0x017d, // 0x99 LATIN CAPITAL LETTER Z WITH CARON
    0x0131, // 0x9a LATIN SMALL LETTER DOTLESS I
    0x0142, // 0x9b LATIN SMALL LETTER L WITH STROKE
    0x0153, // 0x9c LATIN SMALL LIGATURE OE
    0x0161, // 0x9d LATIN SMALL LETTER S WITH CARON
    0x017e, // 0x9e LATIN SMALL LETTER Z WITH CARON
    0xfffd, // 0x9f UNDEFINED
    0x20ac, // 0xa0 EURO SIGN
};
|
||||||
|
|
||||||
// See above about ctype.
|
// See above about ctype.
|
||||||
static bool is_ascii_printable(unsigned char ch)
|
static bool is_ascii_printable(unsigned char ch)
|
||||||
{
|
{
|
||||||
@ -209,7 +246,13 @@ QPDF_String::getUTF8Val() const
|
|||||||
{
|
{
|
||||||
for (unsigned int i = 0; i < len; ++i)
|
for (unsigned int i = 0; i < len; ++i)
|
||||||
{
|
{
|
||||||
result += QUtil::toUTF8(static_cast<unsigned char>(this->val.at(i)));
|
unsigned char ch = static_cast<unsigned char>(this->val.at(i));
|
||||||
|
unsigned short val = ch;
|
||||||
|
if ((ch >= 128) && (ch <= 160))
|
||||||
|
{
|
||||||
|
val = pdf_doc_to_unicode[ch - 128];
|
||||||
|
}
|
||||||
|
result += QUtil::toUTF8(val);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
|
@ -1,4 +1,10 @@
|
|||||||
BINS_qpdf = qpdf test_driver pdf_from_scratch test_large_file test_tokenizer
|
BINS_qpdf = \
|
||||||
|
qpdf \
|
||||||
|
pdf_from_scratch \
|
||||||
|
test_driver \
|
||||||
|
test_large_file \
|
||||||
|
test_pdf_doc_encoding \
|
||||||
|
test_tokenizer
|
||||||
CBINS_qpdf = qpdf-ctest
|
CBINS_qpdf = qpdf-ctest
|
||||||
|
|
||||||
TARGETS_qpdf = $(foreach B,$(BINS_qpdf) $(CBINS_qpdf),qpdf/$(OUTPUT_DIR)/$(call binname,$(B)))
|
TARGETS_qpdf = $(foreach B,$(BINS_qpdf) $(CBINS_qpdf),qpdf/$(OUTPUT_DIR)/$(call binname,$(B)))
|
||||||
|
@ -82,6 +82,16 @@ compare_pdfs("p1-a.pdf", "p1-a-p2-b.pdf", 1);
|
|||||||
compare_pdfs("p1-a-p2-a.pdf", "p1-a-p2-b.pdf", 1);
|
compare_pdfs("p1-a-p2-a.pdf", "p1-a-p2-b.pdf", 1);
|
||||||
flush_tiff_cache();
|
flush_tiff_cache();
|
||||||
|
|
||||||
|
show_ntests();
|
||||||
|
# ----------
|
||||||
|
$td->notify("--- PDF Doc Encoding ---");
|
||||||
|
$n_tests += 1;
|
||||||
|
|
||||||
|
$td->runtest("PDF doc encoding to Unicode",
|
||||||
|
{$td->COMMAND => "test_pdf_doc_encoding pdf-doc-to-utf8.in"},
|
||||||
|
{$td->FILE => "pdf-doc-to-utf8.out", $td->EXIT_STATUS => 0},
|
||||||
|
$td->NORMALIZE_NEWLINES);
|
||||||
|
|
||||||
show_ntests();
|
show_ntests();
|
||||||
# ----------
|
# ----------
|
||||||
$td->notify("--- Stream Replacement Tests ---");
|
$td->notify("--- Stream Replacement Tests ---");
|
||||||
|
33
qpdf/qtest/qpdf/pdf-doc-to-utf8.in
Normal file
33
qpdf/qtest/qpdf/pdf-doc-to-utf8.in
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
€ 128 0x80 0200 U+2022 BULLET
|
||||||
|
<EFBFBD> 129 0x81 0201 U+2020 DAGGER
|
||||||
|
‚ 130 0x82 0202 U+2021 DOUBLE DAGGER
|
||||||
|
ƒ 131 0x83 0203 U+2026 HORIZONTAL ELLIPSIS
|
||||||
|
„ 132 0x84 0204 U+2014 EM DASH
|
||||||
|
… 133 0x85 0205 U+2013 EN DASH
|
||||||
|
† 134 0x86 0206 U+0192 SMALL LETTER F WITH HOOK
|
||||||
|
‡ 135 0x87 0207 U+2044 FRACTION SLASH (solidus)
|
||||||
|
ˆ 136 0x88 0210 U+2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
|
||||||
|
‰ 137 0x89 0211 U+203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
|
||||||
|
Š 138 0x8a 0212 U+2212 MINUS SIGN
|
||||||
|
‹ 139 0x8b 0213 U+2030 PER MILLE SIGN
|
||||||
|
Œ 140 0x8c 0214 U+201E DOUBLE LOW-9 QUOTATION MARK (quotedblbase)
|
||||||
|
<EFBFBD> 141 0x8d 0215 U+201C LEFT DOUBLE QUOTATION MARK (double quote left)
|
||||||
|
Ž 142 0x8e 0216 U+201D RIGHT DOUBLE QUOTATION MARK (quotedblright)
|
||||||
|
<EFBFBD> 143 0x8f 0217 U+2018 LEFT SINGLE QUOTATION MARK (quoteleft)
|
||||||
|
<EFBFBD> 144 0x90 0220 U+2019 RIGHT SINGLE QUOTATION MARK (quoteright)
|
||||||
|
‘ 145 0x91 0221 U+201A SINGLE LOW-9 QUOTATION MARK (quotesinglbase)
|
||||||
|
’ 146 0x92 0222 U+2122 TRADE MARK SIGN
|
||||||
|
“ 147 0x93 0223 U+FB01 LATIN SMALL LIGATURE FI
|
||||||
|
” 148 0x94 0224 U+FB02 LATIN SMALL LIGATURE FL
|
||||||
|
• 149 0x95 0225 U+0141 LATIN CAPITAL LETTER L WITH STROKE
|
||||||
|
– 150 0x96 0226 U+0152 LATIN CAPITAL LIGATURE OE
|
||||||
|
— 151 0x97 0227 U+0160 LATIN CAPITAL LETTER S WITH CARON
|
||||||
|
˜ 152 0x98 0230 U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
|
||||||
|
™ 153 0x99 0231 U+017D LATIN CAPITAL LETTER Z WITH CARON
|
||||||
|
š 154 0x9a 0232 U+0131 LATIN SMALL LETTER DOTLESS I
|
||||||
|
› 155 0x9b 0233 U+0142 LATIN SMALL LETTER L WITH STROKE
|
||||||
|
œ 156 0x9c 0234 U+0153 LATIN SMALL LIGATURE OE
|
||||||
|
<EFBFBD> 157 0x9d 0235 U+0161 LATIN SMALL LETTER S WITH CARON
|
||||||
|
ž 158 0x9e 0236 U+017E LATIN SMALL LETTER Z WITH CARON
|
||||||
|
Ÿ 159 0x9f 0237 U+FFFD UNDEFINED
|
||||||
|
160 0xa0 0240 U+20AC EURO SIGN
|
33
qpdf/qtest/qpdf/pdf-doc-to-utf8.out
Normal file
33
qpdf/qtest/qpdf/pdf-doc-to-utf8.out
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
• 128 0x80 0200 U+2022 BULLET
|
||||||
|
† 129 0x81 0201 U+2020 DAGGER
|
||||||
|
‡ 130 0x82 0202 U+2021 DOUBLE DAGGER
|
||||||
|
… 131 0x83 0203 U+2026 HORIZONTAL ELLIPSIS
|
||||||
|
— 132 0x84 0204 U+2014 EM DASH
|
||||||
|
– 133 0x85 0205 U+2013 EN DASH
|
||||||
|
ƒ 134 0x86 0206 U+0192 SMALL LETTER F WITH HOOK
|
||||||
|
⁄ 135 0x87 0207 U+2044 FRACTION SLASH (solidus)
|
||||||
|
‹ 136 0x88 0210 U+2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
|
||||||
|
› 137 0x89 0211 U+203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
|
||||||
|
− 138 0x8a 0212 U+2212 MINUS SIGN
|
||||||
|
‰ 139 0x8b 0213 U+2030 PER MILLE SIGN
|
||||||
|
„ 140 0x8c 0214 U+201E DOUBLE LOW-9 QUOTATION MARK (quotedblbase)
|
||||||
|
“ 141 0x8d 0215 U+201C LEFT DOUBLE QUOTATION MARK (double quote left)
|
||||||
|
” 142 0x8e 0216 U+201D RIGHT DOUBLE QUOTATION MARK (quotedblright)
|
||||||
|
‘ 143 0x8f 0217 U+2018 LEFT SINGLE QUOTATION MARK (quoteleft)
|
||||||
|
’ 144 0x90 0220 U+2019 RIGHT SINGLE QUOTATION MARK (quoteright)
|
||||||
|
‚ 145 0x91 0221 U+201A SINGLE LOW-9 QUOTATION MARK (quotesinglbase)
|
||||||
|
™ 146 0x92 0222 U+2122 TRADE MARK SIGN
|
||||||
|
fi 147 0x93 0223 U+FB01 LATIN SMALL LIGATURE FI
|
||||||
|
fl 148 0x94 0224 U+FB02 LATIN SMALL LIGATURE FL
|
||||||
|
Ł 149 0x95 0225 U+0141 LATIN CAPITAL LETTER L WITH STROKE
|
||||||
|
Œ 150 0x96 0226 U+0152 LATIN CAPITAL LIGATURE OE
|
||||||
|
Š 151 0x97 0227 U+0160 LATIN CAPITAL LETTER S WITH CARON
|
||||||
|
Ÿ 152 0x98 0230 U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
|
||||||
|
Ž 153 0x99 0231 U+017D LATIN CAPITAL LETTER Z WITH CARON
|
||||||
|
ı 154 0x9a 0232 U+0131 LATIN SMALL LETTER DOTLESS I
|
||||||
|
ł 155 0x9b 0233 U+0142 LATIN SMALL LETTER L WITH STROKE
|
||||||
|
œ 156 0x9c 0234 U+0153 LATIN SMALL LIGATURE OE
|
||||||
|
š 157 0x9d 0235 U+0161 LATIN SMALL LETTER S WITH CARON
|
||||||
|
ž 158 0x9e 0236 U+017E LATIN SMALL LETTER Z WITH CARON
|
||||||
|
<EFBFBD> 159 0x9f 0237 U+FFFD UNDEFINED
|
||||||
|
€ 160 0xa0 0240 U+20AC EURO SIGN
|
45
qpdf/test_pdf_doc_encoding.cc
Normal file
45
qpdf/test_pdf_doc_encoding.cc
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
#include <qpdf/QUtil.hh>
#include <qpdf/QPDFObjectHandle.hh>
#include <exception>
#include <iostream>
#include <list>
#include <string>
#include <stdlib.h>
#include <string.h>
|
||||||
|
|
||||||
|
static char const* whoami = 0;
|
||||||
|
|
||||||
|
void usage()
|
||||||
|
{
|
||||||
|
std::cerr << "Usage: " << whoami << " infile" << std::endl;
|
||||||
|
exit(2);
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char* argv[])
|
||||||
|
{
|
||||||
|
if ((whoami = strrchr(argv[0], '/')) == NULL)
|
||||||
|
{
|
||||||
|
whoami = argv[0];
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
++whoami;
|
||||||
|
}
|
||||||
|
// For libtool's sake....
|
||||||
|
if (strncmp(whoami, "lt-", 3) == 0)
|
||||||
|
{
|
||||||
|
whoami += 3;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (argc != 2)
|
||||||
|
{
|
||||||
|
usage();
|
||||||
|
}
|
||||||
|
char const* infilename = argv[1];
|
||||||
|
std::list<std::string> lines =
|
||||||
|
QUtil::read_lines_from_file(infilename);
|
||||||
|
for (std::list<std::string>::iterator iter = lines.begin();
|
||||||
|
iter != lines.end(); ++iter)
|
||||||
|
{
|
||||||
|
QPDFObjectHandle str = QPDFObjectHandle::newString(*iter);
|
||||||
|
std::cout << str.getUTF8Value() << std::endl;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user