2
1
mirror of https://github.com/qpdf/qpdf.git synced 2024-06-19 18:32:21 +00:00
qpdf/libqpdf/QUtil.cc
Jay Berkenbilt 4f24617e1e Code clean up: use range-style for loops wherever possible
Where not possible, use "auto" to get the iterator type.

Editorial note: I have avoided this change for a long time because of
not wanting to make gratuitous changes to version history, which can
obscure when certain changes were made, but with having recently
touched every single file to apply automatic code formatting and with
making several broad changes to the API, I decided it was time to take
the plunge and get rid of the older (pre-C++11) verbose iterator
syntax. The new code is just easier to read and understand, and in
many cases, it will be more efficient as fewer temporary copies are
being made.

m-holger, if you're reading, you can see that I've finally come
around. :-)
2022-04-30 13:27:18 -04:00

1987 lines
55 KiB
C++

// Include qpdf-config.h first so off_t is guaranteed to have the right size.
#include <qpdf/qpdf-config.h>
#include <qpdf/QUtil.hh>
#include <qpdf/CryptoRandomDataProvider.hh>
#include <qpdf/Pipeline.hh>
#include <qpdf/QIntC.hh>
#include <qpdf/QPDFSystemError.hh>
#include <qpdf/QTC.hh>
#include <cmath>
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <fstream>
#include <iomanip>
#include <locale>
#include <map>
#include <memory>
#include <regex>
#include <set>
#include <sstream>
#include <stdexcept>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifndef QPDF_NO_WCHAR_T
# include <cwchar>
#endif
#ifdef _WIN32
# define WIN32_LEAN_AND_MEAN
# include <direct.h>
# include <io.h>
# include <windows.h>
#else
# include <sys/stat.h>
# include <unistd.h>
#endif
// Unicode code points for the eight byte values 0x18 through 0x1f,
// as labelled entry by entry below. The table is offset: index 0
// holds the value for byte 24 (0x18), so subtract 24 from the byte
// before indexing.
static unsigned short pdf_doc_low_to_unicode[] = {
0x02d8, // 0x18 BREVE
0x02c7, // 0x19 CARON
0x02c6, // 0x1a MODIFIER LETTER CIRCUMFLEX ACCENT
0x02d9, // 0x1b DOT ABOVE
0x02dd, // 0x1c DOUBLE ACUTE ACCENT
0x02db, // 0x1d OGONEK
0x02da, // 0x1e RING ABOVE
0x02dc, // 0x1f SMALL TILDE
};
// Unicode code points for byte values 0x7f through 0xa0, as labelled
// entry by entry below. The table is offset: index 0 holds the value
// for byte 127 (0x7f), so subtract 127 from the byte before indexing.
// Slots marked UNDEFINED hold 0xfffd.
static unsigned short pdf_doc_to_unicode[] = {
0xfffd, // 0x7f UNDEFINED
0x2022, // 0x80 BULLET
0x2020, // 0x81 DAGGER
0x2021, // 0x82 DOUBLE DAGGER
0x2026, // 0x83 HORIZONTAL ELLIPSIS
0x2014, // 0x84 EM DASH
0x2013, // 0x85 EN DASH
0x0192, // 0x86 SMALL LETTER F WITH HOOK
0x2044, // 0x87 FRACTION SLASH (solidus)
0x2039, // 0x88 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
0x203a, // 0x89 SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
0x2212, // 0x8a MINUS SIGN
0x2030, // 0x8b PER MILLE SIGN
0x201e, // 0x8c DOUBLE LOW-9 QUOTATION MARK (quotedblbase)
0x201c, // 0x8d LEFT DOUBLE QUOTATION MARK (double quote left)
0x201d, // 0x8e RIGHT DOUBLE QUOTATION MARK (quotedblright)
0x2018, // 0x8f LEFT SINGLE QUOTATION MARK (quoteleft)
0x2019, // 0x90 RIGHT SINGLE QUOTATION MARK (quoteright)
0x201a, // 0x91 SINGLE LOW-9 QUOTATION MARK (quotesinglbase)
0x2122, // 0x92 TRADE MARK SIGN
0xfb01, // 0x93 LATIN SMALL LIGATURE FI
0xfb02, // 0x94 LATIN SMALL LIGATURE FL
0x0141, // 0x95 LATIN CAPITAL LETTER L WITH STROKE
0x0152, // 0x96 LATIN CAPITAL LIGATURE OE
0x0160, // 0x97 LATIN CAPITAL LETTER S WITH CARON
0x0178, // 0x98 LATIN CAPITAL LETTER Y WITH DIAERESIS
0x017d, // 0x99 LATIN CAPITAL LETTER Z WITH CARON
0x0131, // 0x9a LATIN SMALL LETTER DOTLESS I
0x0142, // 0x9b LATIN SMALL LETTER L WITH STROKE
0x0153, // 0x9c LATIN SMALL LIGATURE OE
0x0161, // 0x9d LATIN SMALL LETTER S WITH CARON
0x017e, // 0x9e LATIN SMALL LETTER Z WITH CARON
0xfffd, // 0x9f UNDEFINED
0x20ac, // 0xa0 EURO SIGN
};
// Unicode code points for byte values 0x80 through 0xa0; each entry
// is labelled with the byte it belongs to, so index 0 corresponds to
// byte 0x80. Several slots hold 0xfffd -- presumably bytes that have
// no assigned character (TODO confirm against the users of this
// table).
static unsigned short win_ansi_to_unicode[] = {
0x20ac, // 0x80
0xfffd, // 0x81
0x201a, // 0x82
0x0192, // 0x83
0x201e, // 0x84
0x2026, // 0x85
0x2020, // 0x86
0x2021, // 0x87
0x02c6, // 0x88
0x2030, // 0x89
0x0160, // 0x8a
0x2039, // 0x8b
0x0152, // 0x8c
0xfffd, // 0x8d
0x017d, // 0x8e
0xfffd, // 0x8f
0xfffd, // 0x90
0x2018, // 0x91
0x2019, // 0x92
0x201c, // 0x93
0x201d, // 0x94
0x2022, // 0x95
0x2013, // 0x96
0x2014, // 0x97
0x0303, // 0x98
0x2122, // 0x99
0x0161, // 0x9a
0x203a, // 0x9b
0x0153, // 0x9c
0xfffd, // 0x9d
0x017e, // 0x9e
0x0178, // 0x9f
0x00a0, // 0xa0
};
// Unicode code points for byte values 0x80 through 0xff; each entry
// is labelled with the byte it belongs to, so index 0 corresponds to
// byte 0x80. Several slots hold 0xfffd -- presumably bytes that have
// no assigned character (TODO confirm against the users of this
// table).
static unsigned short mac_roman_to_unicode[] = {
0x00c4, // 0x80
0x00c5, // 0x81
0x00c7, // 0x82
0x00c9, // 0x83
0x00d1, // 0x84
0x00d6, // 0x85
0x00dc, // 0x86
0x00e1, // 0x87
0x00e0, // 0x88
0x00e2, // 0x89
0x00e4, // 0x8a
0x00e3, // 0x8b
0x00e5, // 0x8c
0x00e7, // 0x8d
0x00e9, // 0x8e
0x00e8, // 0x8f
0x00ea, // 0x90
0x00eb, // 0x91
0x00ed, // 0x92
0x00ec, // 0x93
0x00ee, // 0x94
0x00ef, // 0x95
0x00f1, // 0x96
0x00f3, // 0x97
0x00f2, // 0x98
0x00f4, // 0x99
0x00f6, // 0x9a
0x00f5, // 0x9b
0x00fa, // 0x9c
0x00f9, // 0x9d
0x00fb, // 0x9e
0x00fc, // 0x9f
0x2020, // 0xa0
0x00b0, // 0xa1
0x00a2, // 0xa2
0x00a3, // 0xa3
0x00a7, // 0xa4
0x2022, // 0xa5
0x00b6, // 0xa6
0x00df, // 0xa7
0x00ae, // 0xa8
0x00a9, // 0xa9
0x2122, // 0xaa
0x0301, // 0xab
0x0308, // 0xac
0xfffd, // 0xad
0x00c6, // 0xae
0x00d8, // 0xaf
0xfffd, // 0xb0
0x00b1, // 0xb1
0xfffd, // 0xb2
0xfffd, // 0xb3
0x00a5, // 0xb4
0x03bc, // 0xb5
0xfffd, // 0xb6
0xfffd, // 0xb7
0xfffd, // 0xb8
0xfffd, // 0xb9
0xfffd, // 0xba
0x1d43, // 0xbb
0x1d52, // 0xbc
0xfffd, // 0xbd
0x00e6, // 0xbe
0x00f8, // 0xbf
0x00bf, // 0xc0
0x00a1, // 0xc1
0x00ac, // 0xc2
0xfffd, // 0xc3
0x0192, // 0xc4
0xfffd, // 0xc5
0xfffd, // 0xc6
0x00ab, // 0xc7
0x00bb, // 0xc8
0x2026, // 0xc9
0xfffd, // 0xca
0x00c0, // 0xcb
0x00c3, // 0xcc
0x00d5, // 0xcd
0x0152, // 0xce
0x0153, // 0xcf
0x2013, // 0xd0
0x2014, // 0xd1
0x201c, // 0xd2
0x201d, // 0xd3
0x2018, // 0xd4
0x2019, // 0xd5
0x00f7, // 0xd6
0xfffd, // 0xd7
0x00ff, // 0xd8
0x0178, // 0xd9
0x2044, // 0xda
0x00a4, // 0xdb
0x2039, // 0xdc
0x203a, // 0xdd
0xfb01, // 0xde
0xfb02, // 0xdf
0x2021, // 0xe0
0x00b7, // 0xe1
0x201a, // 0xe2
0x201e, // 0xe3
0x2030, // 0xe4
0x00c2, // 0xe5
0x00ca, // 0xe6
0x00c1, // 0xe7
0x00cb, // 0xe8
0x00c8, // 0xe9
0x00cd, // 0xea
0x00ce, // 0xeb
0x00cf, // 0xec
0x00cc, // 0xed
0x00d3, // 0xee
0x00d4, // 0xef
0xfffd, // 0xf0
0x00d2, // 0xf1
0x00da, // 0xf2
0x00db, // 0xf3
0x00d9, // 0xf4
0x0131, // 0xf5
0x02c6, // 0xf6
0x0303, // 0xf7
0x0304, // 0xf8
0x0306, // 0xf9
0x0307, // 0xfa
0x030a, // 0xfb
0x0327, // 0xfc
0x030b, // 0xfd
0x0328, // 0xfe
0x02c7, // 0xff
};
// Reverse lookup for win_ansi_to_unicode: maps a Unicode code point
// to its single-byte value (0x80..0xa0). Bytes whose forward entry is
// 0xfffd (0x81, 0x8d, 0x8f, 0x90, 0x9d) have no entry here.
static std::map<unsigned long, unsigned char> unicode_to_win_ansi = {
{0x20ac, 0x80}, {0x201a, 0x82}, {0x192, 0x83}, {0x201e, 0x84},
{0x2026, 0x85}, {0x2020, 0x86}, {0x2021, 0x87}, {0x2c6, 0x88},
{0x2030, 0x89}, {0x160, 0x8a}, {0x2039, 0x8b}, {0x152, 0x8c},
{0x17d, 0x8e}, {0x2018, 0x91}, {0x2019, 0x92}, {0x201c, 0x93},
{0x201d, 0x94}, {0x2022, 0x95}, {0x2013, 0x96}, {0x2014, 0x97},
{0x303, 0x98}, {0x2122, 0x99}, {0x161, 0x9a}, {0x203a, 0x9b},
{0x153, 0x9c}, {0x17e, 0x9e}, {0x178, 0x9f}, {0xa0, 0xa0},
};
// Reverse lookup for mac_roman_to_unicode: maps a Unicode code point
// to its single-byte value (0x80..0xff). Bytes whose forward entry is
// 0xfffd have no entry here.
static std::map<unsigned long, unsigned char> unicode_to_mac_roman = {
{0xc4, 0x80}, {0xc5, 0x81}, {0xc7, 0x82}, {0xc9, 0x83},
{0xd1, 0x84}, {0xd6, 0x85}, {0xdc, 0x86}, {0xe1, 0x87},
{0xe0, 0x88}, {0xe2, 0x89}, {0xe4, 0x8a}, {0xe3, 0x8b},
{0xe5, 0x8c}, {0xe7, 0x8d}, {0xe9, 0x8e}, {0xe8, 0x8f},
{0xea, 0x90}, {0xeb, 0x91}, {0xed, 0x92}, {0xec, 0x93},
{0xee, 0x94}, {0xef, 0x95}, {0xf1, 0x96}, {0xf3, 0x97},
{0xf2, 0x98}, {0xf4, 0x99}, {0xf6, 0x9a}, {0xf5, 0x9b},
{0xfa, 0x9c}, {0xf9, 0x9d}, {0xfb, 0x9e}, {0xfc, 0x9f},
{0x2020, 0xa0}, {0xb0, 0xa1}, {0xa2, 0xa2}, {0xa3, 0xa3},
{0xa7, 0xa4}, {0x2022, 0xa5}, {0xb6, 0xa6}, {0xdf, 0xa7},
{0xae, 0xa8}, {0xa9, 0xa9}, {0x2122, 0xaa}, {0x301, 0xab},
{0x308, 0xac}, {0xc6, 0xae}, {0xd8, 0xaf}, {0xb1, 0xb1},
{0xa5, 0xb4}, {0x3bc, 0xb5}, {0x1d43, 0xbb}, {0x1d52, 0xbc},
{0xe6, 0xbe}, {0xf8, 0xbf}, {0xbf, 0xc0}, {0xa1, 0xc1},
{0xac, 0xc2}, {0x192, 0xc4}, {0xab, 0xc7}, {0xbb, 0xc8},
{0x2026, 0xc9}, {0xc0, 0xcb}, {0xc3, 0xcc}, {0xd5, 0xcd},
{0x152, 0xce}, {0x153, 0xcf}, {0x2013, 0xd0}, {0x2014, 0xd1},
{0x201c, 0xd2}, {0x201d, 0xd3}, {0x2018, 0xd4}, {0x2019, 0xd5},
{0xf7, 0xd6}, {0xff, 0xd8}, {0x178, 0xd9}, {0x2044, 0xda},
{0xa4, 0xdb}, {0x2039, 0xdc}, {0x203a, 0xdd}, {0xfb01, 0xde},
{0xfb02, 0xdf}, {0x2021, 0xe0}, {0xb7, 0xe1}, {0x201a, 0xe2},
{0x201e, 0xe3}, {0x2030, 0xe4}, {0xc2, 0xe5}, {0xca, 0xe6},
{0xc1, 0xe7}, {0xcb, 0xe8}, {0xc8, 0xe9}, {0xcd, 0xea},
{0xce, 0xeb}, {0xcf, 0xec}, {0xcc, 0xed}, {0xd3, 0xee},
{0xd4, 0xef}, {0xd2, 0xf1}, {0xda, 0xf2}, {0xdb, 0xf3},
{0xd9, 0xf4}, {0x131, 0xf5}, {0x2c6, 0xf6}, {0x303, 0xf7},
{0x304, 0xf8}, {0x306, 0xf9}, {0x307, 0xfa}, {0x30a, 0xfb},
{0x327, 0xfc}, {0x30b, 0xfd}, {0x328, 0xfe}, {0x2c7, 0xff},
};
// Reverse lookup for pdf_doc_low_to_unicode and pdf_doc_to_unicode:
// maps a Unicode code point to its single-byte value (0x18..0x1f and
// 0x80..0xa0).
// NOTE(review): unlike the other reverse maps, this one contains an
// entry for 0xfffd (mapped to 0x9f, an UNDEFINED slot in the forward
// table) -- confirm that mapping the replacement character to a byte
// is intended.
static std::map<unsigned long, unsigned char> unicode_to_pdf_doc = {
{0x02d8, 0x18}, {0x02c7, 0x19}, {0x02c6, 0x1a}, {0x02d9, 0x1b},
{0x02dd, 0x1c}, {0x02db, 0x1d}, {0x02da, 0x1e}, {0x02dc, 0x1f},
{0x2022, 0x80}, {0x2020, 0x81}, {0x2021, 0x82}, {0x2026, 0x83},
{0x2014, 0x84}, {0x2013, 0x85}, {0x0192, 0x86}, {0x2044, 0x87},
{0x2039, 0x88}, {0x203a, 0x89}, {0x2212, 0x8a}, {0x2030, 0x8b},
{0x201e, 0x8c}, {0x201c, 0x8d}, {0x201d, 0x8e}, {0x2018, 0x8f},
{0x2019, 0x90}, {0x201a, 0x91}, {0x2122, 0x92}, {0xfb01, 0x93},
{0xfb02, 0x94}, {0x0141, 0x95}, {0x0152, 0x96}, {0x0160, 0x97},
{0x0178, 0x98}, {0x017d, 0x99}, {0x0131, 0x9a}, {0x0142, 0x9b},
{0x0153, 0x9c}, {0x0161, 0x9d}, {0x017e, 0x9e}, {0xfffd, 0x9f},
{0x20ac, 0xa0},
};
namespace
{
    // Scope guard for a C stdio stream: the stream passed to the
    // constructor is closed with fclose() when the guard is
    // destroyed. A null stream is tolerated and simply ignored.
    class FileCloser
    {
      public:
        FileCloser(FILE* f) :
            f(f)
        {
        }
        // Copying the guard would close the same stream twice, so
        // copies are disallowed.
        FileCloser(FileCloser const&) = delete;
        FileCloser& operator=(FileCloser const&) = delete;
        ~FileCloser()
        {
            // fclose() on a null pointer is undefined behavior.
            if (f != nullptr) {
                fclose(f);
            }
        }

      private:
        FILE* f;
    };
} // namespace
// Render num as text in base 8, 10, or 16 and pad it to |length|
// characters. Any other base raises std::logic_error. Hexadecimal
// digits are lower case.
template <typename T>
static std::string
int_to_string_base_internal(T num, int base, int length)
{
    // Backward compatibility -- int_to_string, which calls this
    // function, used to use sprintf with %0*d, so a positive length
    // prepends zeroes and a negative length appends spaces.
    bool const supported = (base == 8) || (base == 10) || (base == 16);
    if (!supported) {
        throw std::logic_error(
            "int_to_string_base called with unsupported base");
    }

    std::string digits;
    if (base != 10) {
        // Format through a stream pinned to the classic locale so
        // no locale-specific digit grouping can appear.
        std::ostringstream os;
        os.imbue(std::locale::classic());
        os << std::setbase(base) << std::nouppercase << num;
        digits = os.str();
    } else {
        // std::to_string is the cheaper route for decimal
        digits = std::to_string(num);
    }

    int const width = QIntC::to_int(digits.length());
    std::string padded;
    if ((length > 0) && (width < length)) {
        padded.append(QIntC::to_size(length - width), '0');
    }
    padded += digits;
    if ((length < 0) && (width < -length)) {
        padded.append(QIntC::to_size(-length - width), ' ');
    }
    return padded;
}
// Decimal text for a signed value. length is interpreted as in
// int_to_string_base.
std::string
QUtil::int_to_string(long long num, int length)
{
    int const decimal = 10;
    return int_to_string_base(num, decimal, length);
}
// Decimal text for an unsigned value. length is interpreted as in
// uint_to_string_base.
std::string
QUtil::uint_to_string(unsigned long long num, int length)
{
    int const decimal = 10;
    return uint_to_string_base(num, decimal, length);
}
std::string
QUtil::int_to_string_base(long long num, int base, int length)
{
return int_to_string_base_internal(num, base, length);
}
std::string
QUtil::uint_to_string_base(unsigned long long num, int base, int length)
{
return int_to_string_base_internal(num, base, length);
}
// Fixed-point text for num with decimal_places digits after the
// point, optionally dropping trailing zeroes (and a then-dangling
// decimal point).
std::string
QUtil::double_to_string(
    double num, int decimal_places, bool trim_trailing_zeroes)
{
    // Backward compatibility -- this code used to use sprintf and
    // treated decimal_places <= 0 as "use the default", which was six
    // decimal places. Starting in 10.2, trailing zeroes are trimmed
    // by default.
    int const precision = (decimal_places > 0) ? decimal_places : 6;

    std::ostringstream os;
    os.imbue(std::locale::classic());
    os << std::setprecision(precision) << std::fixed << num;
    std::string text = os.str();
    if (!trim_trailing_zeroes) {
        return text;
    }

    // Never shrink the text below a single character.
    while ((text.length() > 1) && (text.back() == '0')) {
        text.pop_back();
    }
    if ((text.length() > 1) && (text.back() == '.')) {
        text.pop_back();
    }
    return text;
}
// Parse str as a base-10 signed 64-bit integer. Throws
// std::range_error when the conversion reports ERANGE.
long long
QUtil::string_to_ll(char const* str)
{
    // Clear errno first so a stale ERANGE is not misread as ours.
    errno = 0;
#ifdef _MSC_VER
    long long const value = _strtoi64(str, 0, 10);
#else
    long long const value = strtoll(str, 0, 10);
#endif
    if (errno != ERANGE) {
        return value;
    }
    throw std::range_error(
        std::string("overflow/underflow converting ") + str +
        " to 64-bit integer");
}
// Parse str as a base-10 integer and narrow it to int.
int
QUtil::string_to_int(char const* str)
{
    long long const wide = string_to_ll(str);
    // QIntC::to_int does range checking
    return QIntC::to_int(wide);
}
// Parse str as a base-10 unsigned 64-bit integer. Throws
// std::runtime_error for input whose first non-space character is a
// minus sign, or when the conversion reports ERANGE.
unsigned long long
QUtil::string_to_ull(char const* str)
{
    // Look past leading whitespace so a negative number is rejected
    // up front.
    char const* first = str;
    for (; *first && is_space(*first); ++first) {
    }
    if (*first == '-') {
        throw std::runtime_error(
            std::string("underflow converting ") + str +
            " to 64-bit unsigned integer");
    }

    errno = 0;
#ifdef _MSC_VER
    unsigned long long const value = _strtoui64(str, 0, 10);
#else
    unsigned long long const value = strtoull(str, 0, 10);
#endif
    if (errno != ERANGE) {
        return value;
    }
    throw std::runtime_error(
        std::string("overflow converting ") + str +
        " to 64-bit unsigned integer");
}
// Parse str as a base-10 unsigned integer and narrow it to
// unsigned int.
unsigned int
QUtil::string_to_uint(char const* str)
{
    unsigned long long const wide = string_to_ull(str);
    // QIntC::to_uint does range checking
    return QIntC::to_uint(wide);
}
// View the character data of str as mutable unsigned bytes. The
// constness of the string's buffer is cast away; the pointer is the
// one returned by str.c_str().
unsigned char*
QUtil::unsigned_char_pointer(std::string const& str)
{
    char* const chars = const_cast<char*>(str.c_str());
    return reinterpret_cast<unsigned char*>(chars);
}
// View a C string as mutable unsigned bytes; constness is cast away.
unsigned char*
QUtil::unsigned_char_pointer(char const* str)
{
    char* const chars = const_cast<char*>(str);
    return reinterpret_cast<unsigned char*>(chars);
}
// Throw a QPDFSystemError carrying description and the current value
// of errno.
void
QUtil::throw_system_error(std::string const& description)
{
    int const saved_errno = errno;
    throw QPDFSystemError(description, saved_errno);
}
// Pass through the status of a system call, converting the -1
// failure value into an exception via throw_system_error.
int
QUtil::os_wrapper(std::string const& description, int status)
{
    bool const failed = (status == -1);
    if (failed) {
        throw_system_error(description);
    }
    return status;
}
#ifdef _WIN32
// Convert a UTF-8 encoded file name to a null-terminated wchar_t
// array for the wide-character Windows file APIs.
static std::shared_ptr<wchar_t>
win_convert_filename(char const* filename)
{
    // Go through UTF-16 first. The UTF-16 string starts with the
    // two-byte UTF16 marker, which is skipped; each following byte
    // pair is combined high byte first.
    std::string const utf16 = QUtil::utf8_to_utf16(filename);
    size_t const nbytes = utf16.length();
    size_t const nchars = (nbytes / 2) - 1;
    auto holder = QUtil::make_shared_array<wchar_t>(nchars + 1);
    wchar_t* out = holder.get();
    out[nchars] = 0;
    for (unsigned int i = 2; i < nbytes; i += 2) {
        auto const hi = static_cast<unsigned char>(utf16.at(i));
        auto const lo = static_cast<unsigned char>(utf16.at(i + 1));
        out[(i / 2) - 1] = static_cast<wchar_t>((hi << 8) + lo);
    }
    return holder;
}
#endif
// Open filename with the given stdio mode, throwing a system error
// (described as "open <filename>") instead of returning null. On
// Windows the name is converted with win_convert_filename and the
// wide-character open functions are used.
FILE*
QUtil::safe_fopen(char const* filename, char const* mode)
{
    FILE* stream = 0;
#ifdef _WIN32
    std::shared_ptr<wchar_t> wide_name = win_convert_filename(filename);
    // Widen the mode string one character at a time.
    size_t const mode_len = strlen(mode);
    auto wide_mode = QUtil::make_shared_array<wchar_t>(mode_len + 1);
    wchar_t* wmode = wide_mode.get();
    for (size_t i = 0; i < mode_len; ++i) {
        wmode[i] = static_cast<wchar_t>(mode[i]);
    }
    wmode[mode_len] = 0;
# ifdef _MSC_VER
    errno_t err = _wfopen_s(&stream, wide_name.get(), wmode);
    if (err != 0) {
        // make the error code visible to throw_system_error
        errno = err;
    }
# else
    stream = _wfopen(wide_name.get(), wmode);
# endif
    if (stream == 0) {
        throw_system_error(std::string("open ") + filename);
    }
#else
    stream =
        fopen_wrapper(std::string("open ") + filename, fopen(filename, mode));
#endif
    return stream;
}
// Pass through the result of an fopen-style call, converting a null
// stream into an exception via throw_system_error.
FILE*
QUtil::fopen_wrapper(std::string const& description, FILE* f)
{
    bool const failed = (f == 0);
    if (failed) {
        throw_system_error(description);
    }
    return f;
}
// Report whether filename can be opened for binary reading. The file
// is opened and immediately closed again.
bool
QUtil::file_can_be_opened(char const* filename)
{
    bool opened = false;
    try {
        FILE* probe = safe_fopen(filename, "rb");
        fclose(probe);
        opened = true;
    } catch (std::runtime_error&) {
        // can't open the file
    }
    return opened;
}
// Reposition stream using whichever seek function the build
// configuration selects: fseeko, fseeko64, _fseeki64, or plain fseek.
// The return value is that of the underlying function.
int
QUtil::seek(FILE* stream, qpdf_offset_t offset, int whence)
{
#if HAVE_FSEEKO
    auto const native_offset =
        QIntC::IntConverter<qpdf_offset_t, off_t>::convert(offset);
    return fseeko(stream, native_offset, whence);
#elif HAVE_FSEEKO64
    return fseeko64(stream, offset, whence);
#elif defined _MSC_VER || defined __BORLANDC__
    return _fseeki64(stream, offset, whence);
#else
    return fseek(stream, QIntC::to_long(offset), whence);
#endif
}
// Report the current position of stream using the tell function that
// pairs with the one chosen in QUtil::seek.
qpdf_offset_t
QUtil::tell(FILE* stream)
{
#if HAVE_FSEEKO
    return QIntC::to_offset(ftello(stream));
#elif HAVE_FSEEKO64
    return QIntC::to_offset(ftello64(stream));
#elif defined _MSC_VER || defined __BORLANDC__
    return _ftelli64(stream);
#else
    return QIntC::to_offset(ftell(stream));
#endif
}
// Report whether name1 and name2 refer to the same file. Null or
// empty names never match. On Windows the volume serial number and
// file index are compared (always false when AVOID_WINDOWS_HANDLE is
// defined); elsewhere the device and inode numbers from stat() are
// compared.
bool
QUtil::same_file(char const* name1, char const* name2)
{
    bool const have1 = (name1 != 0) && (strlen(name1) != 0);
    bool const have2 = (name2 != 0) && (strlen(name2) != 0);
    if (!(have1 && have2)) {
        return false;
    }
#ifdef _WIN32
    bool same = false;
# ifndef AVOID_WINDOWS_HANDLE
    HANDLE handle1 = CreateFile(
        name1,
        GENERIC_READ,
        FILE_SHARE_READ,
        NULL,
        OPEN_EXISTING,
        FILE_ATTRIBUTE_NORMAL,
        NULL);
    HANDLE handle2 = CreateFile(
        name2,
        GENERIC_READ,
        FILE_SHARE_READ,
        NULL,
        OPEN_EXISTING,
        FILE_ATTRIBUTE_NORMAL,
        NULL);
    BY_HANDLE_FILE_INFORMATION info1;
    BY_HANDLE_FILE_INFORMATION info2;
    if ((handle1 != INVALID_HANDLE_VALUE) &&
        (handle2 != INVALID_HANDLE_VALUE) &&
        GetFileInformationByHandle(handle1, &info1) &&
        GetFileInformationByHandle(handle2, &info2) &&
        (info1.dwVolumeSerialNumber == info2.dwVolumeSerialNumber) &&
        (info1.nFileIndexLow == info2.nFileIndexLow) &&
        (info1.nFileIndexHigh == info2.nFileIndexHigh)) {
        same = true;
    }
    // Close whichever handles were actually opened.
    if (handle1 != INVALID_HANDLE_VALUE) {
        CloseHandle(handle1);
    }
    if (handle2 != INVALID_HANDLE_VALUE) {
        CloseHandle(handle2);
    }
# endif
    return same;
#else
    struct stat info1;
    struct stat info2;
    if ((stat(name1, &info1) != 0) || (stat(name2, &info2) != 0)) {
        return false;
    }
    return (info1.st_ino == info2.st_ino) && (info1.st_dev == info2.st_dev);
#endif
}
// Delete the file at path; a failure is reported through os_wrapper
// with the description "remove <path>".
void
QUtil::remove_file(char const* path)
{
    std::string const what = std::string("remove ") + path;
#ifdef _WIN32
    std::shared_ptr<wchar_t> wide_path = win_convert_filename(path);
    os_wrapper(what, _wunlink(wide_path.get()));
#else
    os_wrapper(what, unlink(path));
#endif
}
// Rename oldname to newname; a failure is reported through
// os_wrapper with the description "rename <oldname> <newname>". On
// Windows, newname is removed first and a failure of that removal is
// ignored.
void
QUtil::rename_file(char const* oldname, char const* newname)
{
    std::string const what =
        std::string("rename ") + oldname + " " + newname;
#ifdef _WIN32
    try {
        remove_file(newname);
    } catch (QPDFSystemError&) {
        // ignore
    }
    std::shared_ptr<wchar_t> wide_old = win_convert_filename(oldname);
    std::shared_ptr<wchar_t> wide_new = win_convert_filename(newname);
    os_wrapper(what, _wrename(wide_old.get(), wide_new.get()));
#else
    os_wrapper(what, rename(oldname, newname));
#endif
}
// Feed the contents of filename into pipeline p in chunks of up to
// 8192 bytes, then finish the pipeline. Throws std::runtime_error if
// the stream reports a read error.
void
QUtil::pipe_file(char const* filename, Pipeline* p)
{
    // Exercised in test suite by testing file_provider.
    FILE* in = safe_fopen(filename, "rb");
    FileCloser closer(in);
    int constexpr chunk = 8192;
    unsigned char buf[chunk];
    for (;;) {
        size_t const got = fread(buf, 1, chunk, in);
        if (got == 0) {
            break;
        }
        p->write(buf, got);
    }
    p->finish();
    // A zero-length read ends the loop for both end of file and
    // error, so the error indicator is checked afterwards.
    if (ferror(in)) {
        throw std::runtime_error(
            std::string("failure reading file ") + filename);
    }
}
// Build a callable that pipes the named file into whatever pipeline
// it is given. The file name is copied into the callable.
std::function<void(Pipeline*)>
QUtil::file_provider(std::string const& filename)
{
    auto provider = [filename](Pipeline* p) {
        pipe_file(filename.c_str(), p);
    };
    return provider;
}
// Return the last component of filename. Trailing path separators
// are dropped first, but the result is never shortened below one
// character. Both '/' and '\' count as separators on Windows; only
// '/' elsewhere.
std::string
QUtil::path_basename(std::string const& filename)
{
#ifdef _WIN32
    char const* separators = "/\\";
#else
    char const* separators = "/";
#endif
    std::string name = filename;
    while (name.length() > 1) {
        auto const sep = name.find_last_of(separators);
        if (sep != name.length() - 1) {
            // The name no longer ends in a separator: keep what
            // follows the last one, if there is one.
            if (sep != std::string::npos) {
                name.erase(0, sep + 1);
            }
            break;
        }
        name.pop_back();
    }
    return name;
}
// Return a copy of str, terminated with a null byte, in an array
// allocated with new[].
char*
QUtil::copy_string(std::string const& str)
{
    size_t const n = str.length();
    char* copy = new char[n + 1];
    // Use memcpy in case string contains nulls
    memcpy(copy, str.c_str(), n);
    copy[n] = '\0';
    return copy;
}
// Return a copy of str, terminated with a null byte, in a shared
// array obtained from make_shared_array.
std::shared_ptr<char>
QUtil::make_shared_cstr(std::string const& str)
{
    size_t const n = str.length();
    auto copy = QUtil::make_shared_array<char>(n + 1);
    char* raw = copy.get();
    // Use memcpy in case string contains nulls
    memcpy(raw, str.c_str(), n);
    raw[n] = '\0';
    return copy;
}
// Return a copy of str, terminated with a null byte, in a uniquely
// owned array.
std::unique_ptr<char[]>
QUtil::make_unique_cstr(std::string const& str)
{
    size_t const n = str.length();
    auto copy = std::make_unique<char[]>(n + 1);
    char* raw = copy.get();
    // Use memcpy in case string contains nulls
    memcpy(raw, str.c_str(), n);
    raw[n] = '\0';
    return copy;
}
// Return the lower-case hexadecimal encoding of input: two digits
// per input byte, high nibble first.
std::string
QUtil::hex_encode(std::string const& input)
{
    static char const hex_digits[] = "0123456789abcdef";
    std::string result;
    result.reserve(2 * input.length());
    // Iterate by range rather than with a counter so the loop cannot
    // wrap around on inputs longer than an unsigned int can index.
    for (char ch: input) {
        auto const byte = static_cast<unsigned char>(ch);
        result.push_back(hex_digits[byte >> 4]);
        result.push_back(hex_digits[byte & 0x0f]);
    }
    return result;
}
// Decode hexadecimal text into bytes. Characters that are not
// hexadecimal digits are skipped. If the number of digits is odd, the
// final digit becomes the high nibble of the last byte and the low
// nibble is zero.
std::string
QUtil::hex_decode(std::string const& input)
{
    std::string decoded;
    bool high_nibble_next = true;
    for (char ch: input) {
        int nibble = 0;
        if ((ch >= '0') && (ch <= '9')) {
            nibble = ch - '0';
        } else if ((ch >= 'a') && (ch <= 'f')) {
            nibble = ch - 'a' + 10;
        } else if ((ch >= 'A') && (ch <= 'F')) {
            nibble = ch - 'A' + 10;
        } else {
            continue;
        }
        if (high_nibble_next) {
            decoded.push_back(static_cast<char>(nibble << 4));
        } else {
            decoded.back() |= static_cast<char>(nibble);
        }
        high_nibble_next = !high_nibble_next;
    }
    return decoded;
}
// Put standard output into binary mode on Windows; does nothing on
// other platforms.
void
QUtil::binary_stdout()
{
#if defined(_WIN32)
# if defined(__BORLANDC__)
    setmode(_fileno(stdout), _O_BINARY);
# else
    _setmode(_fileno(stdout), _O_BINARY);
# endif
#endif
}
// Put standard input into binary mode on Windows; does nothing on
// other platforms.
void
QUtil::binary_stdin()
{
#if defined(_WIN32)
# if defined(__BORLANDC__)
    setmode(_fileno(stdin), _O_BINARY);
# else
    _setmode(_fileno(stdin), _O_BINARY);
# endif
#endif
}
// Switch f to line buffering, letting the library allocate the
// buffer. Does nothing on Windows.
void
QUtil::setLineBuf(FILE* f)
{
#ifndef _WIN32
    char* const library_buffer = reinterpret_cast<char*>(0);
    setvbuf(f, library_buffer, _IOLBF, 0);
#endif
}
// Derive a program name from argv0: the part after the last '/' (or,
// if there is no '/', after the last '\'), with a trailing ".exe"
// removed. The result points into argv0, which is modified in place
// when the suffix is removed.
char*
QUtil::getWhoami(char* argv0)
{
    char* base = strrchr(argv0, '/');
    if (base == NULL) {
        base = strrchr(argv0, '\\');
    }
    if (base == NULL) {
        base = argv0;
    } else {
        // step past the separator itself
        ++base;
    }
    size_t const n = strlen(base);
    if ((n > 4) && (strcmp(base + n - 4, ".exe") == 0)) {
        base[n - 4] = '\0';
    }
    return base;
}
// Look up environment variable var. Returns false if it is not set
// (always, on Windows builds with NO_GET_ENVIRONMENT); otherwise
// returns true and, when value is non-null, stores the variable's
// contents in *value.
bool
QUtil::get_env(std::string const& var, std::string* value)
{
    // This was basically ripped out of wxWindows.
#if defined(_WIN32) && defined(NO_GET_ENVIRONMENT)
    return false;
#elif defined(_WIN32)
    // first get the size of the buffer
    DWORD needed = ::GetEnvironmentVariable(var.c_str(), NULL, 0);
    if (needed == 0) {
        // this means that there is no such variable
        return false;
    }
    if (value) {
        auto buffer = QUtil::make_shared_array<char>(needed + 1);
        ::GetEnvironmentVariable(var.c_str(), buffer.get(), needed);
        *value = buffer.get();
    }
    return true;
#else
    char const* found = getenv(var.c_str());
    if (found == 0) {
        return false;
    }
    if (value) {
        *value = found;
    }
    return true;
#endif
}
// Return the current time as a time_t.
time_t
QUtil::get_current_time()
{
#ifdef _WIN32
    // The procedure to get the time at this resolution comes from
    // the Microsoft documentation: convert a SYSTEMTIME to a
    // FILETIME and copy the FILETIME to a ULARGE_INTEGER. The result
    // counts 100-nanosecond intervals since January 1, 1601. POSIX
    // threads wants a time based on January 1, 1970, so the number of
    // seconds between the two dates is subtracted.
    SYSTEMTIME system_now;
    GetSystemTime(&system_now);
    FILETIME file_now;
    SystemTimeToFileTime(&system_now, &file_now);
    ULARGE_INTEGER ticks;
    ticks.LowPart = file_now.dwLowDateTime;
    ticks.HighPart = file_now.dwHighDateTime;
    ULONGLONG const seconds_since_1601 = ticks.QuadPart / 10000000ULL;
    return static_cast<time_t>(seconds_since_1601 - 11644473600ULL);
#else
    return time(0);
#endif
}
// Return the current local time as a QPDFTime: year, month, day,
// hour, minute, second, and a time zone offset expressed as minutes
// before UTC (see the per-branch comments below). How the offset is
// obtained depends on the platform and on configure-time macros.
QUtil::QPDFTime
QUtil::get_current_qpdf_time()
{
#ifdef _WIN32
SYSTEMTIME ltime;
GetLocalTime(&ltime);
TIME_ZONE_INFORMATION tzinfo;
GetTimeZoneInformation(&tzinfo);
// NOTE(review): only tzinfo.Bias is used; it looks like this does
// not include any daylight-saving adjustment (DaylightBias) --
// confirm against the Win32 TIME_ZONE_INFORMATION documentation.
return QPDFTime(
static_cast<int>(ltime.wYear),
static_cast<int>(ltime.wMonth),
static_cast<int>(ltime.wDay),
static_cast<int>(ltime.wHour),
static_cast<int>(ltime.wMinute),
static_cast<int>(ltime.wSecond),
// tzinfo.Bias is minutes before UTC
static_cast<int>(tzinfo.Bias));
#else
struct tm ltime;
time_t now = time(0);
tzset();
# ifdef HAVE_LOCALTIME_R
localtime_r(&now, &ltime);
# else
// Fallback: copy the result of localtime() into ltime.
ltime = *localtime(&now);
# endif
# if HAVE_TM_GMTOFF
// tm_gmtoff is seconds after UTC
// Negate and convert to minutes so the sign matches "minutes before
// UTC".
int tzoff = -static_cast<int>(ltime.tm_gmtoff / 60);
# elif HAVE_EXTERN_LONG_TIMEZONE
// timezone is seconds before UTC, not adjusted for daylight saving time
int tzoff = static_cast<int>(timezone / 60);
# else
// Don't know how to get timezone on this platform
int tzoff = 0;
# endif
// struct tm counts years from 1900 and months from 0.
return QPDFTime(
static_cast<int>(ltime.tm_year + 1900),
static_cast<int>(ltime.tm_mon + 1),
static_cast<int>(ltime.tm_mday),
static_cast<int>(ltime.tm_hour),
static_cast<int>(ltime.tm_min),
static_cast<int>(ltime.tm_sec),
tzoff);
#endif
}
// Format qtm as a PDF date string: "D:" followed by YYYYMMDDhhmmss
// and a time zone suffix. The suffix is "Z" when tz_delta is zero;
// otherwise it is a sign followed by hh'mm', where a negative
// tz_delta is written with "+" and a positive one with "-".
std::string
QUtil::qpdf_time_to_pdf_time(QPDFTime const& qtm)
{
    int const delta = qtm.tz_delta;
    std::string zone;
    if (delta == 0) {
        zone = "Z";
    } else {
        int const minutes = (delta < 0) ? -delta : delta;
        zone += (delta < 0) ? "+" : "-";
        zone += QUtil::int_to_string(minutes / 60, 2) + "'" +
            QUtil::int_to_string(minutes % 60, 2) + "'";
    }

    std::string stamp = "D:";
    stamp += QUtil::int_to_string(qtm.year, 4);
    stamp += QUtil::int_to_string(qtm.month, 2);
    stamp += QUtil::int_to_string(qtm.day, 2);
    stamp += QUtil::int_to_string(qtm.hour, 2);
    stamp += QUtil::int_to_string(qtm.minute, 2);
    stamp += QUtil::int_to_string(qtm.second, 2);
    stamp += zone;
    return stamp;
}
// Parse a PDF date string of the form D:YYYYMMDDhhmmss followed by
// nothing, "Z", or a signed hh'mm' offset. Returns false if str does
// not have that form. Otherwise returns true and, when qtm is
// non-null, stores the parsed fields in *qtm. A "+" offset is stored
// as a negative tz_delta.
bool
QUtil::pdf_time_to_qpdf_time(std::string const& str, QPDFTime* qtm)
{
    static std::regex const pattern(
        "^D:([0-9]{4})([0-9]{2})([0-9]{2})"
        "([0-9]{2})([0-9]{2})([0-9]{2})"
        "(?:(Z?)|([\\+\\-])([0-9]{2})'([0-9]{2})')$");
    std::smatch fields;
    if (!std::regex_match(str, fields, pattern)) {
        return false;
    }

    auto as_int = [](std::string const& digits) {
        return QUtil::string_to_int(digits.c_str());
    };

    // Groups 8-10 (sign, hours, minutes) only take part in the match
    // when an explicit offset is present.
    int tz_delta = 0;
    std::string const sign = fields[8].str();
    if (!sign.empty()) {
        tz_delta = ((as_int(fields[9]) * 60) + as_int(fields[10]));
        if (sign == "+") {
            tz_delta = -tz_delta;
        }
    }

    if (qtm) {
        *qtm = QPDFTime(
            as_int(fields[1]),
            as_int(fields[2]),
            as_int(fields[3]),
            as_int(fields[4]),
            as_int(fields[5]),
            as_int(fields[6]),
            tz_delta);
    }
    return true;
}
// Encode uval as UTF-8. Values above 0x7fffffff raise
// std::runtime_error.
std::string
QUtil::toUTF8(unsigned long uval)
{
    // A UTF-8 encoding of a Unicode value is a single byte for
    // Unicode values <= 127. For larger values, the first byte of
    // the UTF-8 encoding has '1' as each of its n highest bits and
    // '0' for its (n+1)th highest bit where n is the total number of
    // bytes required. Subsequent bytes start with '10' and have the
    // remaining 6 bits free for encoding. For example, an 11-bit
    // Unicode value can be stored in two bytes where the first is
    // 110zzzzz, the second is 10zzzzzz, and the z's represent the
    // remaining bits.
    if (uval > 0x7fffffff) {
        throw std::runtime_error("bounds error in QUtil::toUTF8");
    }
    if (uval < 128) {
        return std::string(1, static_cast<char>(uval));
    }

    // The encoding is built from the last byte backwards in a
    // null-terminated scratch buffer.
    unsigned char encoded[7];
    encoded[6] = '\0';
    unsigned char* out = &encoded[5];
    // maximum value that will fit in the current number of bytes
    unsigned char lead_max = 0x3f; // six bits
    while (uval > QIntC::to_ulong(lead_max)) {
        // Assign low six bits plus 10000000 to lowest unused
        // byte position, then shift
        *out = static_cast<unsigned char>(0x80 + (uval & 0x3f));
        uval >>= 6;
        // Maximum that will fit in high byte now shrinks by one bit
        lead_max = static_cast<unsigned char>(lead_max >> 1);
        // Slide to the left one byte
        if (out <= encoded) {
            throw std::logic_error("QUtil::toUTF8: overflow error");
        }
        --out;
    }
    // If lead_max is k bits long, the high (7 - k) bits of the
    // resulting byte must be high.
    *out = static_cast<unsigned char>(
        QIntC::to_ulong(0xff - (1 + (lead_max << 1))) + uval);
    return std::string(reinterpret_cast<char*>(out));
}
// Encode uval as big-endian UTF-16 without a byte order mark: two
// bytes for values up to 0xffff, a four-byte surrogate pair up to
// 0x10ffff. Values in the surrogate range 0xd800..0xdfff and values
// above 0x10ffff produce the two bytes 0xff 0xfd.
std::string
QUtil::toUTF16(unsigned long uval)
{
    bool const is_surrogate = (uval >= 0xd800) && (uval <= 0xdfff);
    if (is_surrogate || (uval > 0x10ffff)) {
        return "\xff\xfd";
    }

    std::string encoded;
    if (uval <= 0xffff) {
        encoded.push_back(static_cast<char>((uval & 0xff00) >> 8));
        encoded.push_back(static_cast<char>(uval & 0xff));
        return encoded;
    }

    // Split the 20 bits above 0x10000 into a high and a low surrogate.
    unsigned long const offset = uval - 0x10000;
    unsigned short const high =
        static_cast<unsigned short>(((offset & 0xffc00) >> 10) + 0xd800);
    unsigned short const low =
        static_cast<unsigned short>((offset & 0x3ff) + 0xdc00);
    encoded.push_back(static_cast<char>((high & 0xff00) >> 8));
    encoded.push_back(static_cast<char>(high & 0xff));
    encoded.push_back(static_cast<char>((low & 0xff00) >> 8));
    encoded.push_back(static_cast<char>(low & 0xff));
    return encoded;
}
// Random data support
namespace
{
class RandomDataProviderProvider
{
public:
RandomDataProviderProvider();
void setProvider(RandomDataProvider*);
RandomDataProvider* getProvider();
private:
RandomDataProvider* default_provider;
RandomDataProvider* current_provider;
};
} // namespace
RandomDataProviderProvider::RandomDataProviderProvider() :
default_provider(CryptoRandomDataProvider::getInstance()),
current_provider(0)
{
this->current_provider = default_provider;
}
RandomDataProvider*
RandomDataProviderProvider::getProvider()
{
return this->current_provider;
}
void
RandomDataProviderProvider::setProvider(RandomDataProvider* p)
{
this->current_provider = p ? p : this->default_provider;
}
// Return the single RandomDataProviderProvider, created on first
// use.
static RandomDataProviderProvider*
getRandomDataProviderProvider()
{
    // Thread-safe static initializer
    static RandomDataProviderProvider instance;
    RandomDataProviderProvider* const holder = &instance;
    return holder;
}
// Install p as the provider used for random data.
void
QUtil::setRandomDataProvider(RandomDataProvider* p)
{
    RandomDataProviderProvider* const holder =
        getRandomDataProviderProvider();
    holder->setProvider(p);
}
// Return the provider currently used for random data.
RandomDataProvider*
QUtil::getRandomDataProvider()
{
    RandomDataProviderProvider* const holder =
        getRandomDataProviderProvider();
    return holder->getProvider();
}
void
QUtil::initializeWithRandomBytes(unsigned char* data, size_t len)
{
getRandomDataProvider()->provideRandomData(data, len);
}
// Return a long whose bytes all come from the current random data
// provider.
long
QUtil::random()
{
    long value = 0L;
    auto* const raw = reinterpret_cast<unsigned char*>(&value);
    initializeWithRandomBytes(raw, sizeof(value));
    return value;
}
// True for the characters 0-9, a-f and A-F.
bool
QUtil::is_hex_digit(char ch)
{
    bool const decimal = (ch >= '0') && (ch <= '9');
    bool const lower = (ch >= 'a') && (ch <= 'f');
    bool const upper = (ch >= 'A') && (ch <= 'F');
    return decimal || lower || upper;
}
// True for space, form feed, newline, carriage return, horizontal
// tab and vertical tab; false for everything else, including the
// null character.
bool
QUtil::is_space(char ch)
{
    switch (ch) {
    case ' ':
    case '\f':
    case '\n':
    case '\r':
    case '\t':
    case '\v':
        return true;
    default:
        return false;
    }
}
// True for the characters 0-9.
bool
QUtil::is_digit(char ch)
{
    return !((ch < '0') || (ch > '9'));
}
// Report whether p is a number: an optional sign followed by digits
// with at most one decimal point and at least one digit.
bool
QUtil::is_number(char const* p)
{
    // ^[\+\-]?(\.\d*|\d+(\.\d*)?)$
    if (*p == '\0') {
        return false;
    }
    char const* cur = p;
    if ((*cur == '+') || (*cur == '-')) {
        ++cur;
    }
    bool seen_dot = false;
    bool seen_digit = false;
    while (*cur) {
        char const c = *cur++;
        if (QUtil::is_digit(c)) {
            seen_digit = true;
        } else if ((c == '.') && (!seen_dot)) {
            seen_dot = true;
        } else {
            // a second dot or any other character
            return false;
        }
    }
    return seen_digit;
}
void
QUtil::read_file_into_memory(
char const* filename, std::shared_ptr<char>& file_buf, size_t& size)
{
FILE* f = safe_fopen(filename, "rb");
FileCloser fc(f);
fseek(f, 0, SEEK_END);
size = QIntC::to_size(QUtil::tell(f));
fseek(f, 0, SEEK_SET);
file_buf = QUtil::make_shared_array<char>(size);
char* buf_p = file_buf.get();
size_t bytes_read = 0;
size_t len = 0;
while ((len = fread(buf_p + bytes_read, 1, size - bytes_read, f)) > 0) {
bytes_read += len;
}
if (bytes_read != size) {
if (ferror(f)) {
throw std::runtime_error(
std::string("failure reading file ") + filename +
" into memory: read " + uint_to_string(bytes_read) +
"; wanted " + uint_to_string(size));
} else {
throw std::runtime_error(
std::string("premature eof reading file ") + filename +
" into memory: read " + uint_to_string(bytes_read) +
"; wanted " + uint_to_string(size));
}
}
}
// Read one character from f into ch. Returns true on success and
// false when nothing could be read without an error being flagged
// (end of file); throws std::runtime_error if the stream reports an
// error.
static bool
read_char_from_FILE(char& ch, FILE* f)
{
    if (fread(&ch, 1, 1, f) != 0) {
        return true;
    }
    if (ferror(f)) {
        throw std::runtime_error("failure reading character from file");
    }
    return false;
}
// Open filename for binary reading and return its contents split
// into lines; the file is closed again before returning.
std::list<std::string>
QUtil::read_lines_from_file(char const* filename, bool preserve_eol)
{
    FILE* in = safe_fopen(filename, "rb");
    FileCloser closer(in);
    std::list<std::string> result;
    read_lines_from_file(
        [in](char& ch) { return read_char_from_FILE(ch, in); },
        result,
        preserve_eol);
    return result;
}
std::list<std::string>
QUtil::read_lines_from_file(std::istream& in, bool preserve_eol)
{
std::list<std::string> lines;
auto next_char = [&in](char& ch) { return (in.get(ch)) ? true : false; };
read_lines_from_file(next_char, lines, preserve_eol);
return lines;
}
// Return the remaining contents of an already open stdio stream
// split into lines. The stream is not closed.
std::list<std::string>
QUtil::read_lines_from_file(FILE* f, bool preserve_eol)
{
    std::list<std::string> result;
    read_lines_from_file(
        [f](char& ch) { return read_char_from_FILE(ch, f); },
        result,
        preserve_eol);
    return result;
}
// Pull characters from next_char until it returns false and append
// the lines they form to lines. A line ends at '\n'. With
// preserve_eol the newline is kept in the line; without it the
// newline and a carriage return immediately before it are dropped. A
// final line without a newline is still added.
void
QUtil::read_lines_from_file(
    std::function<bool(char&)> next_char,
    std::list<std::string>& lines,
    bool preserve_eol)
{
    // Points at the line being built, or null between lines.
    std::string* line = nullptr;
    char ch;
    while (next_char(ch)) {
        if (line == nullptr) {
            lines.emplace_back();
            line = &lines.back();
            line->reserve(80);
        }
        // grow the buffer geometrically
        if (line->size() == line->capacity()) {
            line->reserve(2 * line->capacity());
        }
        bool const at_eol = (ch == '\n');
        if ((!at_eol) || preserve_eol) {
            line->push_back(ch);
        } else if ((!line->empty()) && (line->back() == '\r')) {
            // Remove any carriage return that preceded the newline
            // and discard the newline
            line->pop_back();
        }
        if (at_eol) {
            line = nullptr;
        }
    }
}
// Compare two C strings ignoring case, using the platform's
// case-insensitive comparison function; the result is that
// function's return value.
int
QUtil::str_compare_nocase(char const* s1, char const* s2)
{
#if defined(_WIN32)
# if defined(__BORLANDC__)
    return stricmp(s1, s2);
# else
    return _stricmp(s1, s2);
# endif
#else
    return strcasecmp(s1, s2);
#endif
}
// Translate num into a position counted from the start. When
// from_end is false, num is returned unchanged. When it is true, num
// counts back from max (1 is max itself), and a num larger than max
// yields 0.
static int
maybe_from_end(int num, bool from_end, int max)
{
    if (!from_end) {
        return num;
    }
    return (num > max) ? 0 : (max + 1 - num);
}
// Parse a numeric range specification and return the numbers it
// denotes, in order.
//
// The range is a list of groups separated by commas. Each group is a
// single number or two numbers joined by a dash; a dash group expands
// to every number between its endpoints, counting down when the
// second endpoint is not greater than the first. A number is a run of
// decimal digits, "z" (meaning max), or "r" followed by digits
// (counted back from max, so that r1 is max). The whole range may be
// followed by ":odd" or ":even", which keeps only the 1st, 3rd,
// ... or 2nd, 4th, ... entries of the expanded list.
//
// If max is greater than 0, every number must lie in [1, max]. If max
// is 0, no bounds check is made, so the call only validates syntax.
//
// Throws std::runtime_error describing the problem when the range is
// malformed, contains a number too large for an int, or contains a
// number that is out of bounds. For syntax errors the message marks
// the offending position in the range with "*".
std::vector<int>
QUtil::parse_numrange(char const* range, int max)
{
    std::vector<int> result;
    // While scanning, p points at the character being examined so
    // that the error handler can report the position. It is cleared
    // once scanning has finished.
    char const* p = range;
    try {
        // work alternates between numbers (even indexes) and
        // separator markers (odd indexes).
        std::vector<int> work;
        static int const comma = -1;
        static int const dash = -2;
        size_t start_idx = 0;
        size_t skip = 1;
        enum { st_top, st_in_number, st_after_number } state = st_top;
        bool last_separator_was_dash = false;
        int cur_number = 0;
        bool from_end = false;
        while (*p) {
            char ch = *p;
            // The <ctype.h> classification functions have undefined
            // behavior for negative arguments, which a plain char
            // holding a non-ASCII byte may be, so convert to
            // unsigned char first.
            if (isdigit(static_cast<unsigned char>(ch))) {
                if (!((state == st_top) || (state == st_in_number))) {
                    throw std::runtime_error("digit not expected");
                }
                state = st_in_number;
                // Accumulate in a wider type and verify that the
                // value survives a round trip through int so that a
                // long run of digits is reported as an error instead
                // of overflowing a signed int. This relies on long
                // long being wider than int.
                long long const widened =
                    static_cast<long long>(cur_number) * 10 + (ch - '0');
                int const narrowed = static_cast<int>(widened);
                if (static_cast<long long>(narrowed) != widened) {
                    throw std::runtime_error("number too large");
                }
                cur_number = narrowed;
            } else if (ch == 'z') {
                // z represents max
                if (!(state == st_top)) {
                    throw std::runtime_error("z not expected");
                }
                state = st_after_number;
                cur_number = max;
            } else if (ch == 'r') {
                if (!(state == st_top)) {
                    throw std::runtime_error("r not expected");
                }
                state = st_in_number;
                from_end = true;
            } else if ((ch == ',') || (ch == '-')) {
                if (!((state == st_in_number) || (state == st_after_number))) {
                    throw std::runtime_error("unexpected separator");
                }
                cur_number = maybe_from_end(cur_number, from_end, max);
                work.push_back(cur_number);
                cur_number = 0;
                from_end = false;
                if (ch == ',') {
                    state = st_top;
                    last_separator_was_dash = false;
                    work.push_back(comma);
                } else if (ch == '-') {
                    // A group may contain only one dash.
                    if (last_separator_was_dash) {
                        throw std::runtime_error("unexpected dash");
                    }
                    state = st_top;
                    last_separator_was_dash = true;
                    work.push_back(dash);
                }
            } else if (ch == ':') {
                if (!((state == st_in_number) || (state == st_after_number))) {
                    throw std::runtime_error("unexpected colon");
                }
                // Leave p on the colon; the modifier is checked below.
                break;
            } else {
                throw std::runtime_error("unexpected character");
            }
            ++p;
        }
        if ((state == st_in_number) || (state == st_after_number)) {
            cur_number = maybe_from_end(cur_number, from_end, max);
            work.push_back(cur_number);
        } else {
            throw std::runtime_error("number expected");
        }
        if (*p == ':') {
            if (strcmp(p, ":odd") == 0) {
                skip = 2;
            } else if (strcmp(p, ":even") == 0) {
                skip = 2;
                start_idx = 1;
            } else {
                throw std::runtime_error("unexpected even/odd modifier");
            }
        }
        // From here on errors are not tied to a character position.
        p = nullptr;
        for (size_t i = 0; i < work.size(); i += 2) {
            int num = work.at(i);
            // max == 0 means we don't know the max and are just
            // testing for valid syntax.
            if ((max > 0) && ((num < 1) || (num > max))) {
                throw std::runtime_error(
                    "number " + QUtil::int_to_string(num) + " out of range");
            }
            if (i == 0) {
                result.push_back(work.at(i));
            } else {
                int separator = work.at(i - 1);
                if (separator == comma) {
                    result.push_back(num);
                } else if (separator == dash) {
                    // The first endpoint is already in result; add
                    // the rest of the run in either direction.
                    int lastnum = result.back();
                    if (num > lastnum) {
                        for (int j = lastnum + 1; j <= num; ++j) {
                            result.push_back(j);
                        }
                    } else {
                        for (int j = lastnum - 1; j >= num; --j) {
                            result.push_back(j);
                        }
                    }
                } else {
                    throw std::logic_error(
                        "INTERNAL ERROR parsing numeric range");
                }
            }
        }
        if ((start_idx > 0) || (skip != 1)) {
            auto t = result;
            result.clear();
            for (size_t i = start_idx; i < t.size(); i += skip) {
                result.push_back(t.at(i));
            }
        }
    } catch (std::runtime_error const& e) {
        std::string message;
        if (p) {
            message = "error at * in numeric range " +
                std::string(range, QIntC::to_size(p - range)) + "*" + p + ": " +
                e.what();
        } else {
            message = "error in numeric range " + std::string(range) + ": " +
                e.what();
        }
        throw std::runtime_error(message);
    }
    return result;
}
// Target encodings that transcode_utf8 can produce from UTF-8 input.
enum encoding_e {
    e_utf16,
    e_ascii,
    e_winansi,
    e_macroman,
    e_pdfdoc
};
// Look up a Unicode code point in unicode_to_win_ansi and return the
// byte stored for it, or '\0' if the table has no entry.
static unsigned char
encode_winansi(unsigned long codepoint)
{
    auto const entry = unicode_to_win_ansi.find(codepoint);
    if (entry == unicode_to_win_ansi.end()) {
        return '\0';
    }
    return entry->second;
}
// Look up a Unicode code point in unicode_to_mac_roman and return the
// byte stored for it, or '\0' if the table has no entry.
static unsigned char
encode_macroman(unsigned long codepoint)
{
    auto const entry = unicode_to_mac_roman.find(codepoint);
    if (entry == unicode_to_mac_roman.end()) {
        return '\0';
    }
    return entry->second;
}
// Look up a Unicode code point in unicode_to_pdf_doc and return the
// byte stored for it, or '\0' if the table has no entry.
static unsigned char
encode_pdfdoc(unsigned long codepoint)
{
    auto const entry = unicode_to_pdf_doc.find(codepoint);
    if (entry == unicode_to_pdf_doc.end()) {
        return '\0';
    }
    return entry->second;
}
// Decode one code point from utf8_val starting at byte offset pos.
//
// On return, pos has been advanced past the bytes that were consumed
// and error tells whether the sequence was well formed. On success
// the decoded code point is returned. On failure error is set to true
// and 0xfffd (the Unicode replacement character) is returned. A
// sequence is rejected when:
//   - the lead byte is a continuation byte or announces more than
//     five continuation bytes,
//   - the string ends before the announced length,
//   - a byte that should be a continuation byte is not one (pos is
//     left on that byte so that it is examined again by the next
//     call), or
//   - the sequence is overlong, i.e., it encodes a value that fits in
//     a shorter sequence.
//
// pos must be less than utf8_val.length(); otherwise std::string::at
// throws std::out_of_range.
unsigned long
QUtil::get_next_utf8_codepoint(
    std::string const& utf8_val, size_t& pos, bool& error)
{
    size_t len = utf8_val.length();
    unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos++));
    error = false;
    if (ch < 128) {
        return static_cast<unsigned long>(ch);
    }
    // Each 1 bit that follows the top bit of the lead byte stands for
    // one continuation byte. to_clear collects those marker bits so
    // that they can be masked out of the lead byte's payload.
    size_t bytes_needed = 0;
    unsigned bit_check = 0x40;
    unsigned char to_clear = 0x80;
    while (ch & bit_check) {
        ++bytes_needed;
        to_clear = static_cast<unsigned char>(to_clear | bit_check);
        bit_check >>= 1;
    }
    if (((bytes_needed > 5) || (bytes_needed < 1)) ||
        ((pos + bytes_needed) > len)) {
        error = true;
        return 0xfffd;
    }
    size_t const continuation_bytes = bytes_needed;
    unsigned long codepoint = static_cast<unsigned long>(ch & ~to_clear);
    while (bytes_needed > 0) {
        --bytes_needed;
        ch = static_cast<unsigned char>(utf8_val.at(pos++));
        if ((ch & 0xc0) != 0x80) {
            --pos;
            error = true;
            return 0xfffd;
        }
        codepoint <<= 6;
        codepoint += (ch & 0x3f);
    }
    // Smallest value that genuinely requires the given number of
    // continuation bytes. Anything smaller is an overlong encoding,
    // which is not valid UTF-8 and must not be treated as equivalent
    // to the shorter form.
    static unsigned long const smallest_for_length[] = {
        0x0UL, 0x80UL, 0x800UL, 0x10000UL, 0x200000UL, 0x4000000UL};
    if (codepoint < smallest_for_length[continuation_bytes]) {
        error = true;
        return 0xfffd;
    }
    return codepoint;
}
static bool
transcode_utf8(
std::string const& utf8_val,
std::string& result,
encoding_e encoding,
char unknown)
{
bool okay = true;
result.clear();
if (encoding == e_utf16) {
result += "\xfe\xff";
}
size_t len = utf8_val.length();
size_t pos = 0;
while (pos < len) {
bool error = false;
unsigned long codepoint =
QUtil::get_next_utf8_codepoint(utf8_val, pos, error);
if (error) {
okay = false;
if (encoding == e_utf16) {
result += "\xff\xfd";
} else {
result.append(1, unknown);
}
} else if (codepoint < 128) {
char ch = static_cast<char>(codepoint);
if (encoding == e_utf16) {
result += QUtil::toUTF16(QIntC::to_ulong(ch));
} else if (
(encoding == e_pdfdoc) &&
(((ch >= 0x18) && (ch <= 0x1f)) || (ch == 127))) {
// PDFDocEncoding maps some low characters to Unicode,
// so if we encounter those invalid UTF-8 code points,
// map them to unknown so reversing the mapping
// doesn't change them into other characters.
okay = false;
result.append(1, unknown);
} else {
result.append(1, ch);
}
} else if (encoding == e_utf16) {
result += QUtil::toUTF16(codepoint);
} else if ((codepoint == 0xad) && (encoding == e_pdfdoc)) {
// PDFDocEncoding omits 0x00ad (soft hyphen).
okay = false;
result.append(1, unknown);
} else if (
(codepoint > 160) && (codepoint < 256) &&
((encoding == e_winansi) || (encoding == e_pdfdoc))) {
result.append(1, static_cast<char>(codepoint & 0xff));
} else {
unsigned char ch = '\0';
if (encoding == e_winansi) {
ch = encode_winansi(codepoint);
} else if (encoding == e_macroman) {
ch = encode_macroman(codepoint);
} else if (encoding == e_pdfdoc) {
ch = encode_pdfdoc(codepoint);
}
if (ch == '\0') {
okay = false;
ch = static_cast<unsigned char>(unknown);
}
result.append(1, static_cast<char>(ch));
}
}
return okay;
}
// Convenience form of transcode_utf8 that returns the converted
// string by value. The flag reporting whether any substitution took
// place is discarded.
static std::string
transcode_utf8(std::string const& utf8_val, encoding_e encoding, char unknown)
{
    std::string converted;
    static_cast<void>(transcode_utf8(utf8_val, converted, encoding, unknown));
    return converted;
}
// Convert a UTF-8 string to UTF-16 using transcode_utf8 with the
// e_utf16 target. A null character is passed as the substitution
// character.
std::string
QUtil::utf8_to_utf16(std::string const& utf8)
{
    char const no_substitute = 0;
    return transcode_utf8(utf8, e_utf16, no_substitute);
}
std::string
QUtil::utf8_to_ascii(std::string const& utf8, char unknown_char)
{
return transcode_utf8(utf8, e_ascii, unknown_char);
}
// Convert a UTF-8 string using transcode_utf8 with the e_winansi
// target, passing unknown_char as the substitution character, and
// return the converted string.
std::string
QUtil::utf8_to_win_ansi(std::string const& utf8, char unknown_char)
{
    std::string win = transcode_utf8(utf8, e_winansi, unknown_char);
    return win;
}
// Convert a UTF-8 string using transcode_utf8 with the e_macroman
// target, passing unknown_char as the substitution character, and
// return the converted string.
std::string
QUtil::utf8_to_mac_roman(std::string const& utf8, char unknown_char)
{
    std::string mac = transcode_utf8(utf8, e_macroman, unknown_char);
    return mac;
}
std::string
QUtil::utf8_to_pdf_doc(std::string const& utf8, char unknown_char)
{
return transcode_utf8(utf8, e_pdfdoc, unknown_char);
}
// Convert a UTF-8 string into "ascii" using transcode_utf8 with the
// e_ascii target and unknown_char as the substitution character.
// Returns the success flag reported by transcode_utf8.
bool
QUtil::utf8_to_ascii(
    std::string const& utf8, std::string& ascii, char unknown_char)
{
    bool const lossless = transcode_utf8(utf8, ascii, e_ascii, unknown_char);
    return lossless;
}
// Convert a UTF-8 string into "win" using transcode_utf8 with the
// e_winansi target and unknown_char as the substitution character.
// Returns the success flag reported by transcode_utf8.
bool
QUtil::utf8_to_win_ansi(
    std::string const& utf8, std::string& win, char unknown_char)
{
    bool const lossless = transcode_utf8(utf8, win, e_winansi, unknown_char);
    return lossless;
}
// Convert a UTF-8 string into "mac" using transcode_utf8 with the
// e_macroman target and unknown_char as the substitution character.
// Returns the success flag reported by transcode_utf8.
bool
QUtil::utf8_to_mac_roman(
    std::string const& utf8, std::string& mac, char unknown_char)
{
    bool const lossless = transcode_utf8(utf8, mac, e_macroman, unknown_char);
    return lossless;
}
// Convert a UTF-8 string into "pdfdoc" using transcode_utf8 with the
// e_pdfdoc target and unknown_char as the substitution character.
// Returns the success flag reported by transcode_utf8.
bool
QUtil::utf8_to_pdf_doc(
    std::string const& utf8, std::string& pdfdoc, char unknown_char)
{
    bool const lossless = transcode_utf8(utf8, pdfdoc, e_pdfdoc, unknown_char);
    return lossless;
}
// Return true if val starts with a UTF-16 byte order mark in either
// byte order: 0xfe 0xff or 0xff 0xfe.
bool
QUtil::is_utf16(std::string const& val)
{
    if (val.length() < 2) {
        return false;
    }
    char const first = val.at(0);
    char const second = val.at(1);
    bool const big_endian_mark = (first == '\xfe') && (second == '\xff');
    bool const little_endian_mark = (first == '\xff') && (second == '\xfe');
    return big_endian_mark || little_endian_mark;
}
// Return true if val starts with the three bytes 0xef 0xbb 0xbf, the
// UTF-8 encoding of the byte order mark.
bool
QUtil::is_explicit_utf8(std::string const& val)
{
    // QPDF_String.cc knows that this is a 3-byte sequence.
    if (val.length() < 3) {
        return false;
    }
    return val.compare(0, 3, "\xef\xbb\xbf") == 0;
}
// Convert a UTF-16 string, held as a sequence of bytes, to UTF-8.
//
// If val starts with a byte order mark, the mark selects the byte
// order and is not included in the output. Without a mark the input
// is read as big-endian. A trailing odd byte is ignored.
//
// Malformed input is not reported: a high surrogate that is not
// followed by a low surrogate is dropped, and a low surrogate that is
// not preceded by a high surrogate contributes only its low 10 bits.
std::string
QUtil::utf16_to_utf8(std::string const& val)
{
    // Code points are held in unsigned long and code units in
    // unsigned short. That needs at least 32 and 16 bits
    // respectively; wider types work just as well.
    std::string utf8;
    size_t const total = val.length();
    size_t first = 0;
    bool little_endian = false;
    if (is_utf16(val)) {
        if (static_cast<unsigned char>(val.at(0)) == 0xff) {
            little_endian = true;
        }
        first = 2;
    }
    // Holds the contribution of a high surrogate until the matching
    // low surrogate arrives; zero when no surrogate is pending.
    unsigned long pending = 0L;
    for (size_t i = first; i + 1 < total; i += 2) {
        size_t const high_index = little_endian ? i + 1 : i;
        size_t const low_index = little_endian ? i : i + 1;
        unsigned char const high_byte =
            static_cast<unsigned char>(val.at(high_index));
        unsigned char const low_byte =
            static_cast<unsigned char>(val.at(low_index));
        unsigned short const unit =
            QIntC::to_ushort((high_byte << 8) + low_byte);
        if ((unit & 0xFC00) == 0xD800) {
            // High surrogate: remember it and wait for the next unit.
            pending = 0x10000U + ((unit & 0x3FFU) << 10U);
            continue;
        }
        if ((unit & 0xFC00) == 0xDC00) {
            if (pending != 0) {
                QTC::TC("qpdf", "QUtil non-trivial UTF-16");
            }
            pending += unit & 0x3FF;
        } else {
            pending = unit;
        }
        utf8 += QUtil::toUTF8(pending);
        pending = 0;
    }
    return utf8;
}
// Convert a string of single-byte characters to UTF-8, treating it as
// Windows ANSI. Byte values from 128 through 160 are translated
// through win_ansi_to_unicode; every other byte value is used as the
// code point directly.
std::string
QUtil::win_ansi_to_utf8(std::string const& val)
{
    std::string result;
    // Iterate over the characters rather than indexing with an
    // unsigned int, which could never reach the end of a string whose
    // length does not fit in that type.
    for (char const raw: val) {
        unsigned char const ch = static_cast<unsigned char>(raw);
        unsigned short ch_short = ch;
        if ((ch >= 128) && (ch <= 160)) {
            ch_short = win_ansi_to_unicode[ch - 128];
        }
        result += QUtil::toUTF8(ch_short);
    }
    return result;
}
// Convert a string of single-byte characters to UTF-8, treating it as
// Mac Roman. Byte values of 128 and above are translated through
// mac_roman_to_unicode; lower byte values are used as the code point
// directly.
std::string
QUtil::mac_roman_to_utf8(std::string const& val)
{
    std::string result;
    // Iterate over the characters rather than indexing with an
    // unsigned int, which could never reach the end of a string whose
    // length does not fit in that type.
    for (char const raw: val) {
        unsigned char const ch = static_cast<unsigned char>(raw);
        unsigned short ch_short = ch;
        if (ch >= 128) {
            ch_short = mac_roman_to_unicode[ch - 128];
        }
        result += QUtil::toUTF8(ch_short);
    }
    return result;
}
// Convert a string of single-byte characters to UTF-8, treating it as
// PDFDocEncoding. Byte values from 127 through 160 are translated
// through pdf_doc_to_unicode and those from 24 through 31 through
// pdf_doc_low_to_unicode. Byte value 173 becomes U+FFFD, the
// replacement character. Every other byte value is used as the code
// point directly.
std::string
QUtil::pdf_doc_to_utf8(std::string const& val)
{
    std::string result;
    // Iterate over the characters rather than indexing with an
    // unsigned int, which could never reach the end of a string whose
    // length does not fit in that type.
    for (char const raw: val) {
        unsigned char const ch = static_cast<unsigned char>(raw);
        unsigned short ch_short = ch;
        if ((ch >= 127) && (ch <= 160)) {
            ch_short = pdf_doc_to_unicode[ch - 127];
        } else if ((ch >= 24) && (ch <= 31)) {
            ch_short = pdf_doc_low_to_unicode[ch - 24];
        } else if (ch == 173) {
            ch_short = 0xfffd;
        }
        result += QUtil::toUTF8(ch_short);
    }
    return result;
}
// Examine val and report what can be said about its encoding through
// the three output flags.
//
// If val starts with a UTF-16 byte order mark, has_8bit_chars and
// is_utf16 are set to true, is_valid_utf8 to false, and nothing else
// is examined. Otherwise is_utf16 is false and val is decoded as
// UTF-8: has_8bit_chars is true if any decoded value is 128 or
// greater, and is_valid_utf8 is true only if has_8bit_chars is true
// and no decoding error was reported.
void
QUtil::analyze_encoding(
    std::string const& val,
    bool& has_8bit_chars,
    bool& is_valid_utf8,
    bool& is_utf16)
{
    is_valid_utf8 = false;
    is_utf16 = false;
    has_8bit_chars = false;
    if (QUtil::is_utf16(val)) {
        has_8bit_chars = true;
        is_utf16 = true;
        return;
    }
    bool saw_error = false;
    size_t const total = val.length();
    for (size_t offset = 0; offset < total;) {
        bool bad_sequence = false;
        unsigned long const cp =
            get_next_utf8_codepoint(val, offset, bad_sequence);
        saw_error = saw_error || bad_sequence;
        if (cp >= 128) {
            has_8bit_chars = true;
        }
    }
    if (has_8bit_chars && (!saw_error)) {
        is_valid_utf8 = true;
    }
}
// Given a string whose encoding may have been misidentified, return
// the list of alternative byte strings worth trying.
//
// The supplied string is always the first entry. If it contains no
// 8-bit characters, it is the only entry. If it is UTF-16 or valid
// UTF-8, its conversions to PDFDocEncoding, Windows ANSI, and Mac
// Roman are added, each only when the conversion needed no
// substitution. Otherwise the string is read as each of those three
// single-byte encodings in turn: the UTF-8 form of each reading is
// added, followed by each reading re-encoded in the other two
// single-byte encodings where that needed no substitution. Duplicates
// are removed, keeping the first occurrence.
std::vector<std::string>
QUtil::possible_repaired_encodings(std::string supplied)
{
    // Always include the original string
    std::vector<std::string> candidates;
    candidates.push_back(supplied);
    bool has_8bit_chars = false;
    bool is_valid_utf8 = false;
    bool is_utf16 = false;
    analyze_encoding(supplied, has_8bit_chars, is_valid_utf8, is_utf16);
    if (!has_8bit_chars) {
        return candidates;
    }
    if (is_utf16) {
        // Convert to UTF-8 and pretend we got a UTF-8 string.
        supplied = utf16_to_utf8(supplied);
        is_utf16 = false;
        is_valid_utf8 = true;
    }
    std::string scratch;
    // Record the most recent conversion result if the conversion
    // reported that nothing had to be substituted.
    auto keep_if_exact = [&candidates, &scratch](bool exact) {
        if (exact) {
            candidates.push_back(scratch);
        }
    };
    if (is_valid_utf8) {
        // Maybe we were given UTF-8 but wanted one of the single-byte
        // encodings.
        keep_if_exact(utf8_to_pdf_doc(supplied, scratch));
        keep_if_exact(utf8_to_win_ansi(supplied, scratch));
        keep_if_exact(utf8_to_mac_roman(supplied, scratch));
    } else {
        // Maybe we were given one of the single-byte encodings but
        // wanted UTF-8.
        std::string const as_pdf_doc = pdf_doc_to_utf8(supplied);
        candidates.push_back(as_pdf_doc);
        std::string const as_win_ansi = win_ansi_to_utf8(supplied);
        candidates.push_back(as_win_ansi);
        std::string const as_mac_roman = mac_roman_to_utf8(supplied);
        candidates.push_back(as_mac_roman);
        // Maybe we were given one of the other single-byte encodings
        // but wanted one of the other ones.
        keep_if_exact(utf8_to_win_ansi(as_pdf_doc, scratch));
        keep_if_exact(utf8_to_mac_roman(as_pdf_doc, scratch));
        keep_if_exact(utf8_to_pdf_doc(as_win_ansi, scratch));
        keep_if_exact(utf8_to_mac_roman(as_win_ansi, scratch));
        keep_if_exact(utf8_to_pdf_doc(as_mac_roman, scratch));
        keep_if_exact(utf8_to_win_ansi(as_mac_roman, scratch));
    }
    // De-duplicate, preserving the order of first appearance
    std::vector<std::string> unique;
    std::set<std::string> seen;
    for (auto const& candidate: candidates) {
        if (seen.insert(candidate).second) {
            unique.push_back(candidate);
        }
    }
    return unique;
}
#ifndef QPDF_NO_WCHAR_T
// Shared implementation behind the QUtil::call_main_from_wmain
// overloads: convert wide-character command-line arguments to UTF-8
// and pass them to realmain, returning whatever realmain returns.
//
// The unnamed bool parameter is not used by the body. The converted
// strings and the argument array handed to realmain are owned by this
// function and are released when it returns. The array has a null
// pointer after the last argument.
static int
call_main_from_wmain(
    bool,
    int argc,
    wchar_t const* const argv[],
    std::function<int(int, char*[])> realmain)
{
    // argv contains UTF-16-encoded strings with a 16-bit wchar_t.
    // Convert this to UTF-8-encoded strings for compatibility with
    // other systems. That way the rest of qpdf.cc can just act like
    // arguments are UTF-8.
    std::vector<std::unique_ptr<char[]>> utf8_argv;
    for (int i = 0; i < argc; ++i) {
        wchar_t const* const arg = argv[i];
        // Measure the argument once rather than on every pass through
        // the loop below.
        size_t const arg_len = std::wcslen(arg);
        std::string utf16;
        utf16.reserve(2 * arg_len);
        for (size_t j = 0; j < arg_len; ++j) {
            // Each character is stored high byte first.
            unsigned short codepoint = static_cast<unsigned short>(arg[j]);
            utf16.append(1, static_cast<char>(QIntC::to_uchar(codepoint >> 8)));
            utf16.append(
                1, static_cast<char>(QIntC::to_uchar(codepoint & 0xff)));
        }
        std::string utf8 = QUtil::utf16_to_utf8(utf16);
        utf8_argv.push_back(QUtil::make_unique_cstr(utf8));
    }
    auto utf8_argv_sp = std::make_unique<char*[]>(1 + utf8_argv.size());
    char** new_argv = utf8_argv_sp.get();
    for (size_t i = 0; i < utf8_argv.size(); ++i) {
        new_argv[i] = utf8_argv.at(i).get();
    }
    argc = QIntC::to_int(utf8_argv.size());
    new_argv[argc] = nullptr;
    return realmain(argc, new_argv);
}
// Run realmain with the wide-character arguments converted by the
// file-local call_main_from_wmain helper, and return realmain's
// result. This overload is for a realmain that takes a mutable
// argument array.
int
QUtil::call_main_from_wmain(
    int argc, wchar_t* argv[], std::function<int(int, char*[])> realmain)
{
    int const status = ::call_main_from_wmain(true, argc, argv, realmain);
    return status;
}
// Run realmain with the wide-character arguments converted by the
// file-local call_main_from_wmain helper, and return realmain's
// result. This overload is for a realmain that takes a read-only
// argument array; a small adapter bridges it to the helper, which
// supplies a mutable one.
int
QUtil::call_main_from_wmain(
    int argc,
    wchar_t const* const argv[],
    std::function<int(int, char const* const[])> realmain)
{
    auto adapter = [realmain](int new_argc, char* new_argv[]) {
        return realmain(new_argc, new_argv);
    };
    return ::call_main_from_wmain(true, argc, argv, adapter);
}
#endif // QPDF_NO_WCHAR_T