Improve Unicode filename testing

Remove dependency on the behavior of perl for reliable creation of Unicode file names on Windows.
2024-12-22 10:58:58 +00:00 · 2019-04-27 19:54:52 -04:00 · 2019-04-27 19:54:52 -04:00 · 03e27709f3
commit 03e27709f3
parent 7ff234a92f
5 changed files with 131 additions and 3 deletions
--- a/8
+++ b/8
@ -170,6 +170,14 @@ I find it useful to make reference to them in this list

 * Pl_TIFFPredictor is pretty slow.

+ * Support for handling file names with Unicode characters in Windows
+   is incomplete. qpdf seems to support them okay from a functionality
+   standpoint, and the right thing happens if you pass in UTF-8
+   encoded filenames to QPDF library routines in Windows (they are
+   converted internally to wchar_t*), but file names are encoded in
+   UTF-8 on output, which doesn't produce nice error messages or
+   output on Windows in some cases.
+
 * If we ever wanted to do anything more with character encoding, see
   ../misc/character-encoding/, which includes machine-readable dump
   of table D.2 in the ISO-32000 PDF spec. This shows the mapping
--- a/manual/qpdf-manual.xml
+++ b/manual/qpdf-manual.xml
@ -2612,6 +2612,31 @@ outfile.pdf</option>
    </varlistentry>
   </variablelist>
  </sect1>
+  <sect1 id="ref.unicode-files">
+   <title>A Note About Unicode File Names</title>
+   <para>
+    When strings are passed to qpdf library routines either as
+    <literal>char*</literal> or as <literal>std::string</literal>,
+    they are treated as byte arrays except where otherwise noted. When
+    Unicode is desired, qpdf wants UTF-8 unless otherwise noted in
+    comments in header files. In modern UNIX/Linux environments, this
+    generally does the right thing. In Windows, it's a bit more
+    complicated. Starting in qpdf 8.4.0, passwords that contain
+    Unicode characters are handled much better, and starting in qpdf
+    8.4.1, the library attempts to properly handle Unicode characters
+    in filenames. In particular, in Windows, if a UTF-8 encoded string
+    is used as a filename in either <classname>QPDF</classname> or
+    <classname>QPDFWriter</classname>, it is internally converted to
+    <literal>wchar_t*</literal>, and Unicode-aware Windows APIs are
+    used. As such, qpdf will generally operate properly on files with
+    non-ASCII characters in their names as long as the filenames are
+    UTF-8 encoded for passing into the qpdf library API, but there are
+    still some rough edges, such as the encoding of the filenames in
+    error messages or CLI output messages. Patches or bug reports are
+    welcome for any continuing issues with Unicode file names in
+    Windows.
+   </para>
+  </sect1>
 </chapter>
 <chapter id="ref.json">
  <title>QPDF JSON</title>
--- a/qpdf/build.mk
+++ b/qpdf/build.mk
@ -5,7 +5,8 @@ BINS_qpdf = \
    test_large_file \
    test_pdf_doc_encoding \
    test_pdf_unicode \
-    test_tokenizer
+    test_tokenizer \
+    test_unicode_filenames
 CBINS_qpdf = qpdf-ctest

 TARGETS_qpdf = $(foreach B,$(BINS_qpdf) $(CBINS_qpdf),qpdf/$(OUTPUT_DIR)/$(call binname,$(B)))
@ -20,6 +21,8 @@ TC_SRCS_qpdf = $(wildcard libqpdf/*.cc) $(wildcard qpdf/*.cc)

 XCXXFLAGS_qpdf_qpdf := $(WINDOWS_WMAIN_COMPILE)
 XLDFLAGS_qpdf_qpdf := $(WINDOWS_WMAIN_LINK)
+XCXXFLAGS_qpdf_test_unicode_filenames := $(WINDOWS_WMAIN_COMPILE)
+XLDFLAGS_qpdf_test_unicode_filenames := $(WINDOWS_WMAIN_LINK)

 $(foreach B,$(BINS_qpdf),$(eval \
  OBJS_$(B) = $(call src_to_obj,qpdf/$(B).cc)))
--- a/qpdf/qtest/qpdf.test
+++ b/qpdf/qtest/qpdf.test
@ -135,7 +135,7 @@ foreach my $c (@completion_tests)
 show_ntests();
 # ----------
 $td->notify("--- Argument Parsing ---");
-$n_tests += 8;
+$n_tests += 6;

 $td->runtest("required argument",
             {$td->COMMAND => "qpdf --password minimal.pdf"},
@ -167,10 +167,21 @@ $td->runtest("extra overlay filename",
             {$td->REGEXP => ".*overlay file already specified.*",
                  $td->EXIT_STATUS => 2},
             $td->NORMALIZE_NEWLINES);
+
+show_ntests();
+# ----------
+$td->notify("--- Unicode Filenames ---");
+$n_tests += 3;
+
+$td->runtest("create unicode filenames",
+             {$td->COMMAND => "test_unicode_filenames"},
+             {$td->STRING => "created Unicode filenames\n",
+                  $td->EXIT_STATUS => 0},
+             $td->NORMALIZE_NEWLINES);
+
 foreach my $d (['auto-ü', 1], ['auto-öπ', 2])
 {
    my ($u, $n) = @$d;
-    copy('minimal.pdf', "$u.pdf");
    $td->runtest("unicode filename $u",
                 {$td->COMMAND => "qpdf --check $u.pdf"},
                 {$td->FILE => "check-unicode-filename-$n.out",
--- a/qpdf/test_unicode_filenames.cc
+++ b/qpdf/test_unicode_filenames.cc
@ -0,0 +1,81 @@
+#ifdef _WIN32
+#include <windows.h>
+#include <direct.h>
+#include <io.h>
+#endif
+
+#include <iostream>
+#include <stdlib.h>
+#include <stdio.h>
+
+// Copy everything readable from `in` to `out`, then close both streams.
+// Exits with status 2 if either stream is null, or if a read, write,
+// or close of the output stream fails.
+static void do_copy(FILE* in, FILE* out)
+{
+    if ((in == 0) || (out == 0))
+    {
+        std::cerr << "errors opening files" << std::endl;
+        exit(2);
+    }
+    char buf[10240];
+    size_t len = 0;
+    bool failed = false;
+    while ((len = fread(buf, 1, sizeof(buf), in)) > 0)
+    {
+        // A short write means the output file is incomplete.
+        if (fwrite(buf, 1, len, out) != len)
+        {
+            failed = true;
+            break;
+        }
+    }
+    // fread returns 0 both at end of file and on error, so the stream's
+    // error indicator is the only way to tell the two apart.
+    if (ferror(in))
+    {
+        failed = true;
+    }
+    fclose(in);
+    // Buffered data is flushed on close, so a failure here also means
+    // the copy is incomplete.
+    if (fclose(out) != 0)
+    {
+        failed = true;
+    }
+    if (failed)
+    {
+        std::cerr << "errors reading or writing" << std::endl;
+        exit(2);
+    }
+}
+
+#ifdef WINDOWS_WMAIN
+
+// Copy minimal.pdf to the file named by the wide-character string
+// `outname`, opening both files in binary mode through the
+// wide-character fopen variants.
+void copy(wchar_t const* outname)
+{
+    FILE* source = 0;
+    FILE* dest = 0;
+#ifdef _MSC_VER
+    _wfopen_s(&source, L"minimal.pdf", L"rb");
+    _wfopen_s(&dest, outname, L"wb");
+#else
+    source = _wfopen(L"minimal.pdf", L"rb");
+    dest = _wfopen(outname, L"wb");
+#endif
+    // do_copy validates both handles and closes them when finished.
+    do_copy(source, dest);
+}
+
+// Wide-character entry point: writes a copy of minimal.pdf under each
+// of two file names containing non-ASCII characters, then reports
+// success on standard output.
+extern "C"
+int wmain(int argc, wchar_t* argv[])
+{
+    // Unicode
+    static wchar_t const* const names[] = {
+        L"auto-\xfc.pdf",
+        L"auto-\xf6\x03c0.pdf",
+    };
+    for (size_t i = 0; i < sizeof(names) / sizeof(names[0]); ++i)
+    {
+        copy(names[i]);
+    }
+    std::cout << "created Unicode filenames" << std::endl;
+    return 0;
+}
+
+#else
+
+// Copy minimal.pdf to the file named by `outname`, opening both files
+// in binary mode.
+void copy(char const* outname)
+{
+    FILE* source = fopen("minimal.pdf", "rb");
+    FILE* dest = fopen(outname, "wb");
+    // do_copy validates both handles and closes them when finished.
+    do_copy(source, dest);
+}
+
+// Entry point: writes a copy of minimal.pdf under each of two file
+// names containing non-ASCII characters, then reports success on
+// standard output.
+int main(int argc, char* argv[])
+{
+    // Explicit UTF-8 encoding
+    static char const* const names[] = {
+        "auto-\xc3\xbc.pdf",
+        "auto-\xc3\xb6\xcf\x80.pdf",
+    };
+    for (size_t i = 0; i < sizeof(names) / sizeof(names[0]); ++i)
+    {
+        copy(names[i]);
+    }
+    std::cout << "created Unicode filenames" << std::endl;
+    return 0;
+}
+
+#endif