From 03e27709f32ebc83b1c351da5c03ffb2d18f28da Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Sat, 27 Apr 2019 19:54:52 -0400 Subject: [PATCH] Improve Unicode filename testing Remove dependency on the behavior of perl for reliable creation of Unicode file names on Windows. --- TODO | 8 ++++ manual/qpdf-manual.xml | 25 +++++++++++ qpdf/build.mk | 5 ++- qpdf/qtest/qpdf.test | 15 ++++++- qpdf/test_unicode_filenames.cc | 81 ++++++++++++++++++++++++++++++++++ 5 files changed, 131 insertions(+), 3 deletions(-) create mode 100644 qpdf/test_unicode_filenames.cc diff --git a/TODO b/TODO index a6ff5baf..650ef834 100644 --- a/TODO +++ b/TODO @@ -170,6 +170,14 @@ I find it useful to make reference to them in this list * Pl_TIFFPredictor is pretty slow. + * Support for handling file names with Unicode characters in Windows + is incomplete. qpdf seems to support them okay from a functionality + standpoint, and the right thing happens if you pass in UTF-8 + encoded filenames to QPDF library routines in Windows (they are + converted internally to wchar_t*), but file names are encoded in + UTF-8 on output, which doesn't produce nice error messages or + output on Windows in some cases. + * If we ever wanted to do anything more with character encoding, see ../misc/character-encoding/, which includes machine-readable dump of table D.2 in the ISO-32000 PDF spec. This shows the mapping diff --git a/manual/qpdf-manual.xml b/manual/qpdf-manual.xml index dac5f00d..1df6e788 100644 --- a/manual/qpdf-manual.xml +++ b/manual/qpdf-manual.xml @@ -2612,6 +2612,31 @@ outfile.pdf + + A Note About Unicode File Names + + When strings are passed to qpdf library routines either as + char* or as std::string, + they are treated as byte arrays except where otherwise noted. When + Unicode is desired, qpdf wants UTF-8 unless otherwise noted in + comments in header files. In modern UNIX/Linux environments, this + generally does the right thing. In Windows, it's a bit more + complicated. 
Starting in qpdf 8.4.0, passwords that contain + Unicode characters are handled much better, and starting in qpdf + 8.4.1, the library attempts to properly handle Unicode characters + in filenames. In particular, in Windows, if a UTF-8 encoded string + is used as a filename in either QPDF or + QPDFWriter, it is internally converted to + wchar_t*, and Unicode-aware Windows APIs are + used. As such, qpdf will generally operate properly on files with + non-ASCII characters in their names as long as the filenames are + UTF-8 encoded for passing into the qpdf library API, but there are + still some rough edges, such as the encoding of the filenames in + error messages or CLI output messages. Patches or bug reports are + welcome for any continuing issues with Unicode file names in + Windows. + + QPDF JSON diff --git a/qpdf/build.mk b/qpdf/build.mk index 40de3617..87038c79 100644 --- a/qpdf/build.mk +++ b/qpdf/build.mk @@ -5,7 +5,8 @@ BINS_qpdf = \ test_large_file \ test_pdf_doc_encoding \ test_pdf_unicode \ - test_tokenizer + test_tokenizer \ + test_unicode_filenames CBINS_qpdf = qpdf-ctest TARGETS_qpdf = $(foreach B,$(BINS_qpdf) $(CBINS_qpdf),qpdf/$(OUTPUT_DIR)/$(call binname,$(B))) @@ -20,6 +21,8 @@ TC_SRCS_qpdf = $(wildcard libqpdf/*.cc) $(wildcard qpdf/*.cc) XCXXFLAGS_qpdf_qpdf := $(WINDOWS_WMAIN_COMPILE) XLDFLAGS_qpdf_qpdf := $(WINDOWS_WMAIN_LINK) +XCXXFLAGS_qpdf_test_unicode_filenames := $(WINDOWS_WMAIN_COMPILE) +XLDFLAGS_qpdf_test_unicode_filenames := $(WINDOWS_WMAIN_LINK) $(foreach B,$(BINS_qpdf),$(eval \ OBJS_$(B) = $(call src_to_obj,qpdf/$(B).cc))) diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test index ec5eb3c1..e95c22bc 100644 --- a/qpdf/qtest/qpdf.test +++ b/qpdf/qtest/qpdf.test @@ -135,7 +135,7 @@ foreach my $c (@completion_tests) show_ntests(); # ---------- $td->notify("--- Argument Parsing ---"); -$n_tests += 8; +$n_tests += 6; $td->runtest("required argument", {$td->COMMAND => "qpdf --password minimal.pdf"}, @@ -167,10 +167,21 @@ 
$td->runtest("extra overlay filename", {$td->REGEXP => ".*overlay file already specified.*", $td->EXIT_STATUS => 2}, $td->NORMALIZE_NEWLINES); + +show_ntests(); +# ---------- +$td->notify("--- Unicode Filenames ---"); +$n_tests += 3; + +$td->runtest("create unicode filenames", + {$td->COMMAND => "test_unicode_filenames"}, + {$td->STRING => "created Unicode filenames\n", + $td->EXIT_STATUS => 0}, + $td->NORMALIZE_NEWLINES); + foreach my $d (['auto-ü', 1], ['auto-öπ', 2]) { my ($u, $n) = @$d; - copy('minimal.pdf', "$u.pdf"); $td->runtest("unicode filename $u", {$td->COMMAND => "qpdf --check $u.pdf"}, {$td->FILE => "check-unicode-filename-$n.out", diff --git a/qpdf/test_unicode_filenames.cc b/qpdf/test_unicode_filenames.cc new file mode 100644 index 00000000..45701a9f --- /dev/null +++ b/qpdf/test_unicode_filenames.cc @@ -0,0 +1,81 @@ +#ifdef _WIN32 +#include +#include +#include +#endif + +#include +#include +#include + +static void do_copy(FILE* in, FILE* out) +{ + if ((in == 0) || (out == 0)) + { + std::cerr << "errors opening files" << std::endl; + exit(2); + } + char buf[10240]; + size_t len = 0; + while ((len = fread(buf, 1, sizeof(buf), in)) > 0) + { + fwrite(buf, 1, len, out); + } + if (len != 0) + { + std::cerr << "errors reading or writing" << std::endl; + exit(2); + } + fclose(in); + fclose(out); +} + +#ifdef WINDOWS_WMAIN + +void copy(wchar_t const* outname) +{ +#ifdef _MSC_VER + FILE* in = 0; + _wfopen_s(&in, L"minimal.pdf", L"rb"); + FILE* out = 0; + _wfopen_s(&out, outname, L"wb"); +#else + FILE* in = _wfopen(L"minimal.pdf", L"rb"); + FILE* out = _wfopen(outname, L"wb"); +#endif + do_copy(in, out); +} + +extern "C" +int wmain(int argc, wchar_t* argv[]) +{ + // Unicode + wchar_t const* f1 = L"auto-\xfc.pdf"; + wchar_t const* f2 = L"auto-\xf6\x03c0.pdf"; + copy(f1); + copy(f2); + std::cout << "created Unicode filenames" << std::endl; + return 0; +} + +#else + +void copy(char const* outname) +{ + FILE* in = fopen("minimal.pdf", "rb"); + FILE* out = 
fopen(outname, "wb"); + do_copy(in, out); +} + +int main(int argc, char* argv[]) +{ + // Explicit UTF-8 encoding + char const* f1 = "auto-\xc3\xbc.pdf"; + char const* f2 = "auto-\xc3\xb6\xcf\x80.pdf"; + copy(f1); + copy(f2); + std::cout << "created Unicode filenames" << std::endl; + return 0; +} + +#endif