Improve Unicode filename testing

Remove dependency on the behavior of perl for reliable creation of
Unicode file names on Windows.
This commit is contained in:
Jay Berkenbilt 2019-04-27 19:54:52 -04:00
parent 7ff234a92f
commit 03e27709f3
5 changed files with 131 additions and 3 deletions

8
TODO
View File

@ -170,6 +170,14 @@ I find it useful to make reference to them in this list
* Pl_TIFFPredictor is pretty slow.
* Support for handling file names with Unicode characters in Windows
is incomplete. qpdf seems to support them okay from a functionality
standpoint, and the right thing happens if you pass in UTF-8
encoded filenames to QPDF library routines in Windows (they are
converted internally to wchar_t*), but file names are encoded in
UTF-8 on output, which doesn't produce nice error messages or
output on Windows in some cases.
* If we ever wanted to do anything more with character encoding, see
../misc/character-encoding/, which includes machine-readable dump
of table D.2 in the ISO-32000 PDF spec. This shows the mapping

View File

@ -2612,6 +2612,31 @@ outfile.pdf</option>
</varlistentry>
</variablelist>
</sect1>
<sect1 id="ref.unicode-files">
<title>A Note About Unicode File Names</title>
<para>
When strings are passed to qpdf library routines either as
<literal>char*</literal> or as <literal>std::string</literal>,
they are treated as byte arrays except where otherwise noted. When
Unicode is desired, qpdf wants UTF-8 unless otherwise noted in
comments in header files. In modern UNIX/Linux environments, this
generally does the right thing. In Windows, it's a bit more
complicated. Starting in qpdf 8.4.0, passwords that contain
Unicode characters are handled much better, and starting in qpdf
8.4.1, the library attempts to properly handle Unicode characters
in filenames. In particular, in Windows, if a UTF-8 encoded string
is used as a filename in either <classname>QPDF</classname> or
<classname>QPDFWriter</classname>, it is internally converted to
<literal>wchar_t*</literal>, and Unicode-aware Windows APIs are
used. As such, qpdf will generally operate properly on files with
non-ASCII characters in their names as long as the filenames are
UTF-8 encoded for passing into the qpdf library API, but there are
still some rough edges, such as the encoding of the filenames in
error messages or CLI output messages. Patches or bug reports are
welcome for any continuing issues with Unicode file names in
Windows.
</para>
</sect1>
</chapter>
<chapter id="ref.json">
<title>QPDF JSON</title>

View File

@ -5,7 +5,8 @@ BINS_qpdf = \
test_large_file \
test_pdf_doc_encoding \
test_pdf_unicode \
test_tokenizer
test_tokenizer \
test_unicode_filenames
CBINS_qpdf = qpdf-ctest
TARGETS_qpdf = $(foreach B,$(BINS_qpdf) $(CBINS_qpdf),qpdf/$(OUTPUT_DIR)/$(call binname,$(B)))
@ -20,6 +21,8 @@ TC_SRCS_qpdf = $(wildcard libqpdf/*.cc) $(wildcard qpdf/*.cc)
XCXXFLAGS_qpdf_qpdf := $(WINDOWS_WMAIN_COMPILE)
XLDFLAGS_qpdf_qpdf := $(WINDOWS_WMAIN_LINK)
XCXXFLAGS_qpdf_test_unicode_filenames := $(WINDOWS_WMAIN_COMPILE)
XLDFLAGS_qpdf_test_unicode_filenames := $(WINDOWS_WMAIN_LINK)
$(foreach B,$(BINS_qpdf),$(eval \
OBJS_$(B) = $(call src_to_obj,qpdf/$(B).cc)))

View File

@ -135,7 +135,7 @@ foreach my $c (@completion_tests)
show_ntests();
# ----------
$td->notify("--- Argument Parsing ---");
$n_tests += 8;
$n_tests += 6;
$td->runtest("required argument",
{$td->COMMAND => "qpdf --password minimal.pdf"},
@ -167,10 +167,21 @@ $td->runtest("extra overlay filename",
{$td->REGEXP => ".*overlay file already specified.*",
$td->EXIT_STATUS => 2},
$td->NORMALIZE_NEWLINES);
show_ntests();
# ----------
$td->notify("--- Unicode Filenames ---");
$n_tests += 3;
$td->runtest("create unicode filenames",
{$td->COMMAND => "test_unicode_filenames"},
{$td->STRING => "created Unicode filenames\n",
$td->EXIT_STATUS => 0},
$td->NORMALIZE_NEWLINES);
foreach my $d (['auto-ü', 1], ['auto-öπ', 2])
{
my ($u, $n) = @$d;
copy('minimal.pdf', "$u.pdf");
$td->runtest("unicode filename $u",
{$td->COMMAND => "qpdf --check $u.pdf"},
{$td->FILE => "check-unicode-filename-$n.out",

View File

@ -0,0 +1,81 @@
#ifdef _WIN32
#include <windows.h>
#include <direct.h>
#include <io.h>
#endif
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
// Copy everything readable from `in` to `out`, then close both streams.
//
// Either argument may be a null pointer (i.e. a failed open by the
// caller); in that case an error is printed to stderr and the process
// exits with status 2. Any read error, short write, or failure while
// closing a stream is likewise reported on stderr followed by exit
// status 2.
static void do_copy(FILE* in, FILE* out)
{
    if ((in == 0) || (out == 0))
    {
        std::cerr << "errors opening files" << std::endl;
        exit(2);
    }
    char buf[10240];
    size_t len = 0;
    bool failed = false;
    while ((len = fread(buf, 1, sizeof(buf), in)) > 0)
    {
        // A short write means the data did not all reach the stream.
        if (fwrite(buf, 1, len, out) != len)
        {
            failed = true;
            break;
        }
    }
    // fread returns 0 for both end-of-file and error, so the stream's
    // error indicator is the only way to tell the two apart.
    if (ferror(in))
    {
        failed = true;
    }
    if (fclose(in) != 0)
    {
        failed = true;
    }
    // Buffered output is flushed on close, so a write error may only
    // show up here.
    if (fclose(out) != 0)
    {
        failed = true;
    }
    if (failed)
    {
        std::cerr << "errors reading or writing" << std::endl;
        exit(2);
    }
}
#ifdef WINDOWS_WMAIN
// Copy minimal.pdf to the file named by the wide-character string
// `outname`. Both files are opened in binary mode with the wide-character
// open routine for the compiler in use, and the handles are passed to
// do_copy, which deals with a failed open.
void copy(wchar_t const* outname)
{
    FILE* source = 0;
    FILE* dest = 0;
#ifdef _MSC_VER
    _wfopen_s(&source, L"minimal.pdf", L"rb");
    _wfopen_s(&dest, outname, L"wb");
#else
    source = _wfopen(L"minimal.pdf", L"rb");
    dest = _wfopen(outname, L"wb");
#endif
    do_copy(source, dest);
}
// Wide-character entry point: creates each test file from minimal.pdf
// and then prints a confirmation line to standard output.
extern "C"
int wmain(int argc, wchar_t* argv[])
{
    // Unicode file names written with wide-character escapes
    wchar_t const* const names[] = {
        L"auto-\xfc.pdf",
        L"auto-\xf6\x03c0.pdf",
    };
    size_t const n_names = sizeof(names) / sizeof(names[0]);
    for (size_t i = 0; i < n_names; ++i)
    {
        copy(names[i]);
    }
    std::cout << "created Unicode filenames" << std::endl;
    return 0;
}
#else
// Copy minimal.pdf to the file named by `outname`. Both files are opened
// in binary mode; the handles are passed to do_copy, which deals with a
// failed open.
void copy(char const* outname)
{
    FILE* const source = fopen("minimal.pdf", "rb");
    FILE* const dest = fopen(outname, "wb");
    do_copy(source, dest);
}
// Narrow-character entry point: creates each test file from minimal.pdf
// and then prints a confirmation line to standard output.
int main(int argc, char* argv[])
{
    // File names spelled out as explicit UTF-8 byte sequences
    char const* const names[] = {
        "auto-\xc3\xbc.pdf",
        "auto-\xc3\xb6\xcf\x80.pdf",
    };
    size_t const n_names = sizeof(names) / sizeof(names[0]);
    for (size_t i = 0; i < n_names; ++i)
    {
        copy(names[i]);
    }
    std::cout << "created Unicode filenames" << std::endl;
    return 0;
}
#endif