mirror of
https://github.com/qpdf/qpdf.git
synced 2024-12-22 19:08:59 +00:00
Better support for creating Unicode strings
This commit is contained in:
parent
e44c395c51
commit
952a665a4e
@ -1,3 +1,11 @@
|
|||||||
|
2018-06-21 Jay Berkenbilt <ejb@ql.org>
|
||||||
|
|
||||||
|
* Added QPDFObjectHandle::newUnicodeString and QPDFObjectHandle::unparseBinary
|
||||||
|
to allow for more convenient creation of strings that are
|
||||||
|
explicitly encoded in UTF-16 BE. This is useful for creating
|
||||||
|
Unicode strings that appear outside of content streams, such as in
|
||||||
|
page labels, outlines, form field values, etc.
|
||||||
|
|
||||||
2018-06-20 Jay Berkenbilt <ejb@ql.org>
|
2018-06-20 Jay Berkenbilt <ejb@ql.org>
|
||||||
|
|
||||||
* Added new classes QPDFAcroFormDocumentHelper,
|
* Added new classes QPDFAcroFormDocumentHelper,
|
||||||
|
@ -344,6 +344,12 @@ class QPDFObjectHandle
|
|||||||
static QPDFObjectHandle newName(std::string const& name);
|
static QPDFObjectHandle newName(std::string const& name);
|
||||||
QPDF_DLL
|
QPDF_DLL
|
||||||
static QPDFObjectHandle newString(std::string const& str);
|
static QPDFObjectHandle newString(std::string const& str);
|
||||||
|
// Create a string encoded in UTF-16 from the given utf8-encoded
|
||||||
|
// string. Such strings are appropriately encoded to appear in PDF
|
||||||
|
// files outside of content streams, such as in document metadata,
|
||||||
|
// form field values, page labels, outlines, and similar locations.
|
||||||
|
QPDF_DLL
|
||||||
|
static QPDFObjectHandle newUnicodeString(std::string const& utf8_str);
|
||||||
QPDF_DLL
|
QPDF_DLL
|
||||||
static QPDFObjectHandle newOperator(std::string const&);
|
static QPDFObjectHandle newOperator(std::string const&);
|
||||||
QPDF_DLL
|
QPDF_DLL
|
||||||
@ -715,6 +721,10 @@ class QPDFObjectHandle
|
|||||||
std::string unparse();
|
std::string unparse();
|
||||||
QPDF_DLL
|
QPDF_DLL
|
||||||
std::string unparseResolved();
|
std::string unparseResolved();
|
||||||
|
// For strings only, force binary representation. Otherwise, same
|
||||||
|
// as unparse.
|
||||||
|
QPDF_DLL
|
||||||
|
std::string unparseBinary();
|
||||||
|
|
||||||
// Legacy helper methods for commonly performed operations on
|
// Legacy helper methods for commonly performed operations on
|
||||||
// pages. Newer code should use QPDFPageObjectHelper instead. The
|
// pages. Newer code should use QPDFPageObjectHelper instead. The
|
||||||
|
@ -1221,6 +1221,20 @@ QPDFObjectHandle::unparseResolved()
|
|||||||
return this->m->obj->unparse();
|
return this->m->obj->unparse();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::string
|
||||||
|
QPDFObjectHandle::unparseBinary()
|
||||||
|
{
|
||||||
|
if (this->isString())
|
||||||
|
{
|
||||||
|
return dynamic_cast<QPDF_String*>(
|
||||||
|
this->m->obj.getPointer())->unparse(true);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
return unparse();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
QPDFObjectHandle
|
QPDFObjectHandle
|
||||||
QPDFObjectHandle::parse(std::string const& object_str,
|
QPDFObjectHandle::parse(std::string const& object_str,
|
||||||
std::string const& object_description)
|
std::string const& object_description)
|
||||||
@ -1845,6 +1859,12 @@ QPDFObjectHandle::newString(std::string const& str)
|
|||||||
return QPDFObjectHandle(new QPDF_String(str));
|
return QPDFObjectHandle(new QPDF_String(str));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Create a string object handle whose value is the UTF-16 encoding
// of the given UTF-8 string; the conversion itself is delegated to
// QPDF_String::new_utf16.
QPDFObjectHandle
QPDFObjectHandle::newUnicodeString(std::string const& utf8_str)
{
    QPDF_String* utf16_obj = QPDF_String::new_utf16(utf8_str);
    return QPDFObjectHandle(utf16_obj);
}
|
||||||
|
|
||||||
QPDFObjectHandle
|
QPDFObjectHandle
|
||||||
QPDFObjectHandle::newOperator(std::string const& value)
|
QPDFObjectHandle::newOperator(std::string const& value)
|
||||||
{
|
{
|
||||||
|
@ -64,6 +64,58 @@ QPDF_String::~QPDF_String()
|
|||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Build a new QPDF_String whose value is the UTF-16 big-endian
// encoding, preceded by a byte order mark, of the given UTF-8
// string. Each malformed piece of input -- a stray continuation
// byte, an invalid lead byte, a sequence cut short by the end of
// the input or by a non-continuation byte, or an overlong encoding
// -- is replaced by U+FFFD. Returns a newly allocated object.
QPDF_String*
QPDF_String::new_utf16(std::string const& utf8_val)
{
    // Smallest code point that genuinely requires the given number
    // of continuation bytes (the array index). A decoded value below
    // this bound is an overlong encoding, which is not well-formed
    // UTF-8 and must not be accepted as the shorter character.
    static unsigned long const min_codepoint[] = {
        0x0ul, 0x80ul, 0x800ul, 0x10000ul, 0x200000ul, 0x4000000ul
    };

    std::string result = "\xfe\xff";
    size_t len = utf8_val.length();
    for (size_t i = 0; i < len; ++i)
    {
        unsigned char ch = static_cast<unsigned char>(utf8_val.at(i));
        if (ch < 128)
        {
            // Plain ASCII maps directly to a single UTF-16 unit.
            result += QUtil::toUTF16(ch);
            continue;
        }

        // Count the 1 bits that follow the top bit of the lead byte;
        // that is the number of continuation bytes expected.
        // to_clear accumulates the marker bits to strip from the
        // lead byte.
        size_t bytes_needed = 0;
        unsigned bit_check = 0x40;
        unsigned char to_clear = 0x80;
        while (ch & bit_check)
        {
            ++bytes_needed;
            to_clear |= bit_check;
            bit_check >>= 1;
        }

        if (((bytes_needed > 5) || (bytes_needed < 1)) ||
            ((i + bytes_needed) >= len))
        {
            // Invalid lead byte, stray continuation byte, or not
            // enough input left to complete the sequence.
            result += "\xff\xfd";
            continue;
        }

        size_t const n_continuation = bytes_needed;
        bool well_formed = true;
        unsigned long codepoint = (ch & ~to_clear);
        while (bytes_needed > 0)
        {
            --bytes_needed;
            ch = static_cast<unsigned char>(utf8_val.at(++i));
            if ((ch & 0xc0) != 0x80)
            {
                // Not a continuation byte: back up so that it is
                // processed on its own by the next loop iteration.
                --i;
                well_formed = false;
                break;
            }
            codepoint <<= 6;
            codepoint += (ch & 0x3f);
        }
        if (well_formed && (codepoint < min_codepoint[n_continuation]))
        {
            // Overlong encoding of a smaller code point.
            well_formed = false;
        }
        result += QUtil::toUTF16(well_formed ? codepoint : 0xfffdul);
    }
    return new QPDF_String(result);
}
|
||||||
|
|
||||||
std::string
|
std::string
|
||||||
QPDF_String::unparse()
|
QPDF_String::unparse()
|
||||||
{
|
{
|
||||||
|
@ -9,6 +9,7 @@ class QPDF_String: public QPDFObject
|
|||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
QPDF_String(std::string const& val);
|
QPDF_String(std::string const& val);
|
||||||
|
static QPDF_String* new_utf16(std::string const& utf8_val);
|
||||||
virtual ~QPDF_String();
|
virtual ~QPDF_String();
|
||||||
virtual std::string unparse();
|
virtual std::string unparse();
|
||||||
virtual QPDFObject::object_type_e getTypeCode() const;
|
virtual QPDFObject::object_type_e getTypeCode() const;
|
||||||
|
@ -4,6 +4,7 @@ BINS_qpdf = \
|
|||||||
test_driver \
|
test_driver \
|
||||||
test_large_file \
|
test_large_file \
|
||||||
test_pdf_doc_encoding \
|
test_pdf_doc_encoding \
|
||||||
|
test_pdf_unicode \
|
||||||
test_tokenizer
|
test_tokenizer
|
||||||
CBINS_qpdf = qpdf-ctest
|
CBINS_qpdf = qpdf-ctest
|
||||||
|
|
||||||
|
@ -84,13 +84,21 @@ flush_tiff_cache();
|
|||||||
|
|
||||||
show_ntests();
|
show_ntests();
|
||||||
# ----------
|
# ----------
|
||||||
$td->notify("--- PDF Doc Encoding ---");
|
$td->notify("--- Character Encoding ---");
|
||||||
$n_tests += 1;
|
$n_tests += 3;
|
||||||
|
|
||||||
$td->runtest("PDF doc encoding to Unicode",
|
$td->runtest("PDF doc encoding to Unicode",
|
||||||
{$td->COMMAND => "test_pdf_doc_encoding pdf-doc-to-utf8.in"},
|
{$td->COMMAND => "test_pdf_doc_encoding pdf-doc-to-utf8.in"},
|
||||||
{$td->FILE => "pdf-doc-to-utf8.out", $td->EXIT_STATUS => 0},
|
{$td->FILE => "pdf-doc-to-utf8.out", $td->EXIT_STATUS => 0},
|
||||||
$td->NORMALIZE_NEWLINES);
|
$td->NORMALIZE_NEWLINES);
|
||||||
|
$td->runtest("UTF-16 encoding",
|
||||||
|
{$td->COMMAND => "test_pdf_unicode unicode.in"},
|
||||||
|
{$td->FILE => "unicode.out", $td->EXIT_STATUS => 0},
|
||||||
|
$td->NORMALIZE_NEWLINES);
|
||||||
|
$td->runtest("UTF-16 encoding errors",
|
||||||
|
{$td->COMMAND => "test_pdf_unicode unicode-errors.in"},
|
||||||
|
{$td->FILE => "unicode-errors.out", $td->EXIT_STATUS => 0},
|
||||||
|
$td->NORMALIZE_NEWLINES);
|
||||||
|
|
||||||
show_ntests();
|
show_ntests();
|
||||||
# ----------
|
# ----------
|
||||||
|
7
qpdf/qtest/qpdf/unicode-errors.in
Normal file
7
qpdf/qtest/qpdf/unicode-errors.in
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
This file has utf-8 encoding errors and should be edited as a binary file.
|
||||||
|
|
||||||
|
0: too many bytes: þafter
|
||||||
|
1: too few bytes: €after
|
||||||
|
2: invalid codepoint (U+DEAD): íºafter
|
||||||
|
3: not enough bytes for character: ð<>„!after (! included)
|
||||||
|
4: not enough bytes left in file ð
|
7
qpdf/qtest/qpdf/unicode-errors.out
Normal file
7
qpdf/qtest/qpdf/unicode-errors.out
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
This file has utf-8 encoding errors and should be edited as a binary file. // <feff0054006800690073002000660069006c006500200068006100730020007500740066002d003800200065006e0063006f00640069006e00670020006500720072006f0072007300200061006e0064002000730068006f0075006c0064002000620065002000650064006900740065006400200061007300200061002000620069006e006100720079002000660069006c0065002e>
|
||||||
|
// <feff>
|
||||||
|
0: too many bytes: <20>after // <feff0030003a00200074006f006f0020006d0061006e0079002000620079007400650073003a0020fffd00610066007400650072>
|
||||||
|
1: too few bytes: <20>after // <feff0031003a00200074006f006f0020006600650077002000620079007400650073003a0020fffd00610066007400650072>
|
||||||
|
2: invalid codepoint (U+DEAD): <20>after // <feff0032003a00200069006e00760061006c0069006400200063006f006400650070006f0069006e0074002000280055002b00440045004100440029003a0020fffd00610066007400650072>
|
||||||
|
3: not enough bytes for character: <20>!after (! included) // <feff0033003a0020006e006f007400200065006e006f00750067006800200062007900740065007300200066006f00720020006300680061007200610063007400650072003a0020fffd00210061006600740065007200200028002100200069006e0063006c00750064006500640029>
|
||||||
|
4: not enough bytes left in file <20> // <feff0034003a0020006e006f007400200065006e006f0075006700680020006200790074006500730020006c00650066007400200069006e002000660069006c00650020fffd>
|
5
qpdf/qtest/qpdf/unicode.in
Normal file
5
qpdf/qtest/qpdf/unicode.in
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
This is a potato: 🥔 (u+01f954).
|
||||||
|
If you wanted to, you could cook some sweet 🥔 π.
|
||||||
|
If you think wwwwww is good, you should try ʬʬʬʬʬʬ.
|
||||||
|
బంగాళాదుంప సలాడ్
|
||||||
|
𝄞 𝄢 𝄪 𝅂
|
5
qpdf/qtest/qpdf/unicode.out
Normal file
5
qpdf/qtest/qpdf/unicode.out
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
This is a potato: 🥔 (u+01f954). // <feff00540068006900730020006900730020006100200070006f007400610074006f003a0020d83edd54002000280075002b0030003100660039003500340029002e>
|
||||||
|
If you wanted to, you could cook some sweet 🥔 π. // <feff0049006600200079006f0075002000770061006e00740065006400200074006f002c00200079006f007500200063006f0075006c006400200063006f006f006b00200073006f006d00650020007300770065006500740020d83edd54002003c0002e>
|
||||||
|
If you think wwwwww is good, you should try ʬʬʬʬʬʬ. // <feff0049006600200079006f00750020007400680069006e006b002000770077007700770077007700200069007300200067006f006f0064002c00200079006f0075002000730068006f0075006c00640020007400720079002002ac02ac02ac02ac02ac02ac002e>
|
||||||
|
బంగాళాదుంప సలాడ్ // <feff0c2c0c020c170c3e0c330c3e0c260c410c020c2a00200c380c320c3e0c210c4d>
|
||||||
|
𝄞 𝄢 𝄪 𝅂 // <feffd834dd1e0020d834dd220020d834dd2a0020d834dd42>
|
46
qpdf/test_pdf_unicode.cc
Normal file
46
qpdf/test_pdf_unicode.cc
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
#include <qpdf/QUtil.hh>
|
||||||
|
#include <qpdf/QPDFObjectHandle.hh>
|
||||||
|
#include <iostream>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
static char const* whoami = 0;
|
||||||
|
|
||||||
|
void usage()
|
||||||
|
{
|
||||||
|
std::cerr << "Usage: " << whoami << " infile" << std::endl;
|
||||||
|
exit(2);
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char* argv[])
|
||||||
|
{
|
||||||
|
if ((whoami = strrchr(argv[0], '/')) == NULL)
|
||||||
|
{
|
||||||
|
whoami = argv[0];
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
++whoami;
|
||||||
|
}
|
||||||
|
// For libtool's sake....
|
||||||
|
if (strncmp(whoami, "lt-", 3) == 0)
|
||||||
|
{
|
||||||
|
whoami += 3;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (argc != 2)
|
||||||
|
{
|
||||||
|
usage();
|
||||||
|
}
|
||||||
|
char const* infilename = argv[1];
|
||||||
|
std::list<std::string> lines =
|
||||||
|
QUtil::read_lines_from_file(infilename);
|
||||||
|
for (std::list<std::string>::iterator iter = lines.begin();
|
||||||
|
iter != lines.end(); ++iter)
|
||||||
|
{
|
||||||
|
QPDFObjectHandle str = QPDFObjectHandle::newUnicodeString(*iter);
|
||||||
|
std::cout << str.getUTF8Value() << " // "
|
||||||
|
<< str.unparseBinary() << std::endl;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user