Move remaining existing transcoding to QUtil

2025-01-08 17:24:06 +00:00 · 2019-01-13 08:00:14 -05:00 · 2019-01-13 08:00:14 -05:00 · 698485468a
commit 698485468a
parent 5cfcd4f361
4 changed files with 288 additions and 103 deletions
--- a/include/qpdf/QUtil.hh
+++ b/include/qpdf/QUtil.hh
@ -147,13 +147,18 @@ namespace QUtil
    std::string toUTF8(unsigned long uval);

    // Return a string containing the byte representation of the
-    // UTF-16 BE encoding for the unicode value passed in.
+    // UTF-16 big-endian encoding for the unicode value passed in.
    // Unrepresentable code points are converted to U+FFFD.
    QPDF_DLL
    std::string toUTF16(unsigned long uval);

-    // Convert a UTF-8 encoded string to UTF-16. Unrepresentable code
-    // points are converted to U+FFFD.
+    // Test whether this is a UTF-16 big-endian string. This is
+    // indicated by the first two bytes being 0xFE 0xFF.
+    QPDF_DLL
+    bool is_utf16(std::string const&);
+
+    // Convert a UTF-8 encoded string to UTF-16 big-endian.
+    // Unrepresentable code points are converted to U+FFFD.
    QPDF_DLL
    std::string utf8_to_utf16(std::string const& utf8);

@ -169,6 +174,24 @@ namespace QUtil
    QPDF_DLL
    std::string utf8_to_mac_roman(
        std::string const& utf8, char unknown_char = '?');
+    QPDF_DLL
+    std::string utf8_to_pdf_doc(
+        std::string const& utf8, char unknown_char = '?');
+
+    // Convert a UTF-16 big-endian encoded string to UTF-8.
+    // Unrepresentable code points are converted to U+FFFD.
+    QPDF_DLL
+    std::string utf16_to_utf8(std::string const& utf16);
+
+    // Convert from the specified single-byte encoding system to
+    // UTF-8. There is no ascii_to_utf8 because all ASCII strings are
+    // already valid UTF-8.
+    QPDF_DLL
+    std::string win_ansi_to_utf8(std::string const& win);
+    QPDF_DLL
+    std::string mac_roman_to_utf8(std::string const& mac);
+    QPDF_DLL
+    std::string pdf_doc_to_utf8(std::string const& pdfdoc);

    // If secure random number generation is supported on your
    // platform and qpdf was not compiled with insecure random number
--- a/libqpdf/QPDF_String.cc
+++ b/libqpdf/QPDF_String.cc
@ -8,43 +8,6 @@
 // be used.
 #include <string.h>

-// First element is 128
-static unsigned short pdf_doc_to_unicode[] = {
-    0x2022,    // 0x80    BULLET
-    0x2020,    // 0x81    DAGGER
-    0x2021,    // 0x82    DOUBLE DAGGER
-    0x2026,    // 0x83    HORIZONTAL ELLIPSIS
-    0x2014,    // 0x84    EM DASH
-    0x2013,    // 0x85    EN DASH
-    0x0192,    // 0x86    SMALL LETTER F WITH HOOK
-    0x2044,    // 0x87    FRACTION SLASH (solidus)
-    0x2039,    // 0x88    SINGLE LEFT-POINTING ANGLE QUOTATION MARK
-    0x203a,    // 0x89    SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
-    0x2212,    // 0x8a    MINUS SIGN
-    0x2030,    // 0x8b    PER MILLE SIGN
-    0x201e,    // 0x8c    DOUBLE LOW-9 QUOTATION MARK (quotedblbase)
-    0x201c,    // 0x8d    LEFT DOUBLE QUOTATION MARK (double quote left)
-    0x201d,    // 0x8e    RIGHT DOUBLE QUOTATION MARK (quotedblright)
-    0x2018,    // 0x8f    LEFT SINGLE QUOTATION MARK (quoteleft)
-    0x2019,    // 0x90    RIGHT SINGLE QUOTATION MARK (quoteright)
-    0x201a,    // 0x91    SINGLE LOW-9 QUOTATION MARK (quotesinglbase)
-    0x2122,    // 0x92    TRADE MARK SIGN
-    0xfb01,    // 0x93    LATIN SMALL LIGATURE FI
-    0xfb02,    // 0x94    LATIN SMALL LIGATURE FL
-    0x0141,    // 0x95    LATIN CAPITAL LETTER L WITH STROKE
-    0x0152,    // 0x96    LATIN CAPITAL LIGATURE OE
-    0x0160,    // 0x97    LATIN CAPITAL LETTER S WITH CARON
-    0x0178,    // 0x98    LATIN CAPITAL LETTER Y WITH DIAERESIS
-    0x017d,    // 0x99    LATIN CAPITAL LETTER Z WITH CARON
-    0x0131,    // 0x9a    LATIN SMALL LETTER DOTLESS I
-    0x0142,    // 0x9b    LATIN SMALL LETTER L WITH STROKE
-    0x0153,    // 0x9c    LATIN SMALL LIGATURE OE
-    0x0161,    // 0x9d    LATIN SMALL LETTER S WITH CARON
-    0x017e,    // 0x9e    LATIN SMALL LETTER Z WITH CARON
-    0xfffd,    // 0x9f    UNDEFINED
-    0x20ac,    // 0xa0    EURO SIGN
-};
-
 // See above about ctype.
 static bool is_ascii_printable(unsigned char ch)
 {
@ -210,62 +173,12 @@ QPDF_String::getVal() const
 std::string
 QPDF_String::getUTF8Val() const
 {
-    std::string result;
-    size_t len = this->val.length();
-    if ((len >= 2) && (len % 2 == 0) &&
-	(this->val.at(0) == '\xfe') && (this->val.at(1) == '\xff'))
+    if (QUtil::is_utf16(this->val))
    {
-	// This is a Unicode string using big-endian UTF-16.  This
-	// code uses unsigned long and unsigned short to hold
-	// codepoint values.  It requires unsigned long to be at least
-	// 32 bits and unsigned short to be at least 16 bits, but it
-	// will work fine if they are larger.
-	unsigned long codepoint = 0L;
-	for (unsigned int i = 2; i < len; i += 2)
-	{
-	    // Convert from UTF16-BE.  If we get a malformed
-	    // codepoint, this code will generate incorrect output
-	    // without giving a warning.  Specifically, a high
-	    // codepoint not followed by a low codepoint will be
-	    // discarded, and a low codepoint not preceded by a high
-	    // codepoint will just get its low 10 bits output.
-	    unsigned short bits =
-		(static_cast<unsigned char>(this->val.at(i)) << 8) +
-		static_cast<unsigned char>(this->val.at(i+1));
-	    if ((bits & 0xFC00) == 0xD800)
-	    {
-		codepoint = 0x10000 + ((bits & 0x3FF) << 10);
-		continue;
-	    }
-	    else if ((bits & 0xFC00) == 0xDC00)
-	    {
-		if (codepoint != 0)
-		{
-		    QTC::TC("qpdf", "QPDF_String non-trivial UTF-16");
-		}
-		codepoint += bits & 0x3FF;
-	    }
-	    else
-	    {
-		codepoint = bits;
-	    }
-
-	    result += QUtil::toUTF8(codepoint);
-	    codepoint = 0;
-	}
+        return QUtil::utf16_to_utf8(this->val);
    }
    else
    {
-	for (unsigned int i = 0; i < len; ++i)
-	{
-            unsigned char ch = static_cast<unsigned char>(this->val.at(i));
-            unsigned short val = ch;
-            if ((ch >= 128) && (ch <= 160))
-            {
-                val = pdf_doc_to_unicode[ch - 128];
-            }
-	    result += QUtil::toUTF8(val);
-	}
+        return QUtil::pdf_doc_to_utf8(this->val);
    }
-    return result;
 }
--- a/libqpdf/QUtil.cc
+++ b/libqpdf/QUtil.cc
@ -8,6 +8,7 @@
 #endif
 #include <qpdf/SecureRandomDataProvider.hh>
 #include <qpdf/QPDFSystemError.hh>
+#include <qpdf/QTC.hh>

 #include <cmath>
 #include <iomanip>
@ -29,6 +30,43 @@
 #include <sys/stat.h>
 #endif

+// Unicode code points for PDFDocEncoding bytes 0x80 through 0xa0.
+// Index 0 corresponds to byte 128, so look entries up with
+// (byte - 128). This is lookup-only data, hence const.
+static unsigned short const pdf_doc_to_unicode[] = {
+    0x2022,    // 0x80    BULLET
+    0x2020,    // 0x81    DAGGER
+    0x2021,    // 0x82    DOUBLE DAGGER
+    0x2026,    // 0x83    HORIZONTAL ELLIPSIS
+    0x2014,    // 0x84    EM DASH
+    0x2013,    // 0x85    EN DASH
+    0x0192,    // 0x86    SMALL LETTER F WITH HOOK
+    0x2044,    // 0x87    FRACTION SLASH (solidus)
+    0x2039,    // 0x88    SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+    0x203a,    // 0x89    SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+    0x2212,    // 0x8a    MINUS SIGN
+    0x2030,    // 0x8b    PER MILLE SIGN
+    0x201e,    // 0x8c    DOUBLE LOW-9 QUOTATION MARK (quotedblbase)
+    0x201c,    // 0x8d    LEFT DOUBLE QUOTATION MARK (double quote left)
+    0x201d,    // 0x8e    RIGHT DOUBLE QUOTATION MARK (quotedblright)
+    0x2018,    // 0x8f    LEFT SINGLE QUOTATION MARK (quoteleft)
+    0x2019,    // 0x90    RIGHT SINGLE QUOTATION MARK (quoteright)
+    0x201a,    // 0x91    SINGLE LOW-9 QUOTATION MARK (quotesinglbase)
+    0x2122,    // 0x92    TRADE MARK SIGN
+    0xfb01,    // 0x93    LATIN SMALL LIGATURE FI
+    0xfb02,    // 0x94    LATIN SMALL LIGATURE FL
+    0x0141,    // 0x95    LATIN CAPITAL LETTER L WITH STROKE
+    0x0152,    // 0x96    LATIN CAPITAL LIGATURE OE
+    0x0160,    // 0x97    LATIN CAPITAL LETTER S WITH CARON
+    0x0178,    // 0x98    LATIN CAPITAL LETTER Y WITH DIAERESIS
+    0x017d,    // 0x99    LATIN CAPITAL LETTER Z WITH CARON
+    0x0131,    // 0x9a    LATIN SMALL LETTER DOTLESS I
+    0x0142,    // 0x9b    LATIN SMALL LETTER L WITH STROKE
+    0x0153,    // 0x9c    LATIN SMALL LIGATURE OE
+    0x0161,    // 0x9d    LATIN SMALL LETTER S WITH CARON
+    0x017e,    // 0x9e    LATIN SMALL LETTER Z WITH CARON
+    0xfffd,    // 0x9f    UNDEFINED
+    0x20ac,    // 0xa0    EURO SIGN
+};
+
 std::string
 QUtil::int_to_string(long long num, int length)
 {
@ -895,7 +933,7 @@ QUtil::parse_numrange(char const* range, int max)
    return result;
 }

-enum encoding_e { e_utf16, e_ascii, e_winansi, e_macroman };
+enum encoding_e { e_utf16, e_ascii, e_winansi, e_macroman, e_pdfdoc };

 static unsigned char
 encode_winansi(unsigned long codepoint)
@ -1342,6 +1380,119 @@ encode_macroman(unsigned long codepoint)
    return ch;
 }

+// Map a Unicode code point to the PDFDocEncoding byte (0x80-0xa0)
+// that represents it. Any code point without an entry below yields
+// '\0'; transcode_utf8 replaces a '\0' result with its unknown
+// character.
+static unsigned char
+encode_pdfdoc(unsigned long codepoint)
+{
+    // A switch is used instead of a static lookup structure because
+    // a static is not thread-safe.
+    switch (codepoint)
+    {
+      case 0x2022: return 0x80;
+      case 0x2020: return 0x81;
+      case 0x2021: return 0x82;
+      case 0x2026: return 0x83;
+      case 0x2014: return 0x84;
+      case 0x2013: return 0x85;
+      case 0x0192: return 0x86;
+      case 0x2044: return 0x87;
+      case 0x2039: return 0x88;
+      case 0x203a: return 0x89;
+      case 0x2212: return 0x8a;
+      case 0x2030: return 0x8b;
+      case 0x201e: return 0x8c;
+      case 0x201c: return 0x8d;
+      case 0x201d: return 0x8e;
+      case 0x2018: return 0x8f;
+      case 0x2019: return 0x90;
+      case 0x201a: return 0x91;
+      case 0x2122: return 0x92;
+      case 0xfb01: return 0x93;
+      case 0xfb02: return 0x94;
+      case 0x0141: return 0x95;
+      case 0x0152: return 0x96;
+      case 0x0160: return 0x97;
+      case 0x0178: return 0x98;
+      case 0x017d: return 0x99;
+      case 0x0131: return 0x9a;
+      case 0x0142: return 0x9b;
+      case 0x0153: return 0x9c;
+      case 0x0161: return 0x9d;
+      case 0x017e: return 0x9e;
+      case 0xfffd: return 0x9f;
+      case 0x20ac: return 0xa0;
+      default:
+        break;
+    }
+    // No PDFDocEncoding byte in this range corresponds to codepoint.
+    return '\0';
+}
+
 static std::string
 transcode_utf8(std::string const& utf8_val, encoding_e encoding,
               char unknown)
@ -1410,24 +1561,27 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding,
                {
                    result += QUtil::toUTF16(codepoint);
                }
+                else if ((codepoint >= 160) && (codepoint < 256) &&
+                         ((encoding == e_winansi) || (encoding == e_pdfdoc)))
+                {
+                    ch = static_cast<unsigned char>(codepoint & 0xff);
+                    result.append(1, ch);
+                }
                else
                {
                    ch = '\0';
                    if (encoding == e_winansi)
                    {
-                        if ((codepoint >= 160) && (codepoint < 256))
-                        {
-                            ch = static_cast<unsigned char>(codepoint & 0xff);
-                        }
-                        else
-                        {
-                            ch = encode_winansi(codepoint);
-                        }
+                        ch = encode_winansi(codepoint);
                    }
                    else if (encoding == e_macroman)
                    {
                        ch = encode_macroman(codepoint);
                    }
+                    else if (encoding == e_pdfdoc)
+                    {
+                        ch = encode_pdfdoc(codepoint);
+                    }
                    if (ch == '\0')
                    {
                        ch = static_cast<unsigned char>(unknown);
@ -1463,3 +1617,98 @@ QUtil::utf8_to_mac_roman(std::string const& utf8, char unknown_char)
 {
    return transcode_utf8(utf8, e_macroman, unknown_char);
 }
+
+// Transcode a UTF-8 string to PDFDocEncoding by delegating to
+// transcode_utf8 with the e_pdfdoc target; unknown_char is passed
+// through unchanged.
+std::string
+QUtil::utf8_to_pdf_doc(std::string const& utf8, char unknown_char)
+{
+    encoding_e const target = e_pdfdoc;
+    return transcode_utf8(utf8, target, unknown_char);
+}
+
+// Report whether val starts with the two bytes 0xFE 0xFF. A string
+// shorter than two bytes never matches.
+bool
+QUtil::is_utf16(std::string const& val)
+{
+    if (val.length() < 2)
+    {
+        return false;
+    }
+    bool const first_matches = (val.at(0) == '\xfe');
+    bool const second_matches = (val.at(1) == '\xff');
+    return first_matches && second_matches;
+}
+
+// Convert big-endian UTF-16 bytes to UTF-8. If the input starts
+// with the 0xFE 0xFF marker recognized by is_utf16, those two bytes
+// are skipped. Each 16-bit unit is decoded, surrogate pairs are
+// combined into one code point, and the result is appended via
+// QUtil::toUTF8.
+std::string
+QUtil::utf16_to_utf8(std::string const& val)
+{
+    std::string result;
+    // This code uses unsigned long and unsigned short to hold
+    // codepoint values. It requires unsigned long to be at least
+    // 32 bits and unsigned short to be at least 16 bits, but it
+    // will work fine if they are larger.
+    unsigned long codepoint = 0L;
+    size_t len = val.length();
+    size_t start = 0;
+    if (is_utf16(val))
+    {
+        start += 2;
+    }
+    // If the string has an odd number of bytes, the last byte is
+    // ignored. The condition tests i + 1 so that both bytes of the
+    // unit are known to exist; testing only i would let
+    // val.at(i + 1) throw std::out_of_range on odd-length input.
+    for (size_t i = start; i + 1 < len; i += 2)
+    {
+        // Convert from UTF16-BE.  If we get a malformed
+        // codepoint, this code will generate incorrect output
+        // without giving a warning.  Specifically, a high
+        // codepoint not followed by a low codepoint will be
+        // discarded, and a low codepoint not preceded by a high
+        // codepoint will just get its low 10 bits output.
+        unsigned short bits =
+            (static_cast<unsigned char>(val.at(i)) << 8) +
+            static_cast<unsigned char>(val.at(i+1));
+        if ((bits & 0xFC00) == 0xD800)
+        {
+            // High surrogate: remember its contribution and wait
+            // for the low surrogate in the next unit.
+            codepoint = 0x10000 + ((bits & 0x3FF) << 10);
+            continue;
+        }
+        else if ((bits & 0xFC00) == 0xDC00)
+        {
+            if (codepoint != 0)
+            {
+                QTC::TC("qpdf", "QUtil non-trivial UTF-16");
+            }
+            codepoint += bits & 0x3FF;
+        }
+        else
+        {
+            codepoint = bits;
+        }
+
+        result += QUtil::toUTF8(codepoint);
+        codepoint = 0;
+    }
+    return result;
+}
+
+// NOTE(review): placeholder implementation. The argument is not
+// examined and the fixed string "QXXXQ" is returned, so no WinAnsi
+// decoding is performed here yet.
+std::string
+QUtil::win_ansi_to_utf8(std::string const& val)
+{
+    static_cast<void>(val);
+    return std::string("QXXXQ");
+}
+
+// NOTE(review): placeholder implementation. The argument is not
+// examined and the fixed string "QXXXQ" is returned, so no MacRoman
+// decoding is performed here yet.
+std::string
+QUtil::mac_roman_to_utf8(std::string const& val)
+{
+    static_cast<void>(val);
+    return std::string("QXXXQ");
+}
+
+// Convert a PDFDocEncoding string to UTF-8. Bytes 128 through 160
+// are translated through pdf_doc_to_unicode; every other byte value
+// is used directly as the code point. Each code point is appended
+// via QUtil::toUTF8.
+std::string
+QUtil::pdf_doc_to_utf8(std::string const& val)
+{
+    std::string result;
+    size_t len = val.length();
+    for (size_t i = 0; i < len; ++i)
+    {
+        unsigned char ch = static_cast<unsigned char>(val.at(i));
+        // Named distinctly from the val parameter so the input
+        // string is not shadowed inside the loop body.
+        unsigned short codepoint = ch;
+        if ((ch >= 128) && (ch <= 160))
+        {
+            // The table's first element corresponds to byte 128.
+            codepoint = pdf_doc_to_unicode[ch - 128];
+        }
+        result += QUtil::toUTF8(codepoint);
+    }
+    return result;
+}
--- a/qpdf/qpdf.testcov
+++ b/qpdf/qpdf.testcov
@ -108,7 +108,7 @@ QPDF_Stream pipeStreamData with null pipeline 0
 QPDFWriter not recompressing /FlateDecode 0
 QPDF_encryption xref stream from encrypted file 0
 qpdf unable to filter 0
-QPDF_String non-trivial UTF-16 0
+QUtil non-trivial UTF-16 0
 QPDF xref overwrite object 0
 QPDF decoding error warning 0
 qpdf-c called qpdf_init 0