From 431987475b392daf4094570565881e1ebfc9528a Mon Sep 17 00:00:00 2001 From: m-holger Date: Sat, 10 Feb 2024 12:03:28 +0000 Subject: [PATCH] Add new method QPDF_Name::analyzeJSONEncoding Provide a custom method to check whether a name is valid utf8. Integrate checking for characters that need to be escaped in JSON. --- libqpdf/QPDF_Dictionary.cc | 31 +++++++--------- libqpdf/QPDF_Name.cc | 73 +++++++++++++++++++++++++++++++------- libqpdf/qpdf/QPDF_Name.hh | 5 +++ 3 files changed, 78 insertions(+), 31 deletions(-) diff --git a/libqpdf/QPDF_Dictionary.cc b/libqpdf/QPDF_Dictionary.cc index 53d78a2b..ca7fa04a 100644 --- a/libqpdf/QPDF_Dictionary.cc +++ b/libqpdf/QPDF_Dictionary.cc @@ -77,15 +77,11 @@ QPDF_Dictionary::getJSON(int json_version) if (json_version == 1) { j.addDictionaryMember( QPDF_Name::normalizeName(iter.first), iter.second.getJSON(json_version)); + } else if (auto res = QPDF_Name::analyzeJSONEncoding(iter.first); res.first) { + j.addDictionaryMember(iter.first, iter.second.getJSON(json_version)); } else { - bool has_8bit_chars; - bool is_valid_utf8; - bool is_utf16; - QUtil::analyze_encoding(iter.first, has_8bit_chars, is_valid_utf8, is_utf16); - std::string key = !has_8bit_chars || is_valid_utf8 - ? 
iter.first - : "n:" + QPDF_Name::normalizeName(iter.first); - j.addDictionaryMember(key, iter.second.getJSON(json_version)); + j.addDictionaryMember( + "n:" + QPDF_Name::normalizeName(iter.first), iter.second.getJSON(json_version)); } } } @@ -100,18 +96,17 @@ QPDF_Dictionary::writeJSON(int json_version, JSON::Writer& p) if (!iter.second.isNull()) { p.writeNext(); if (json_version == 1) { - p << "\"" << JSON::Writer::encode_string(QPDF_Name::normalizeName(iter.first)) << "\": "; - } else { - bool has_8bit_chars; - bool is_valid_utf8; - bool is_utf16; - QUtil::analyze_encoding(iter.first, has_8bit_chars, is_valid_utf8, is_utf16); - if (!has_8bit_chars || is_valid_utf8) { - p << "\"" << JSON::Writer::encode_string(iter.first) << "\": "; + p << "\"" << JSON::Writer::encode_string(QPDF_Name::normalizeName(iter.first)) + << "\": "; + } else if (auto res = QPDF_Name::analyzeJSONEncoding(iter.first); res.first) { + if (res.second) { + p << "\"" << iter.first << "\": "; } else { - p << "\"n:" << JSON::Writer::encode_string(QPDF_Name::normalizeName(iter.first)) - << "\": "; + p << "\"" << JSON::Writer::encode_string(iter.first) << "\": "; } + } else { + p << "\"n:" << JSON::Writer::encode_string(QPDF_Name::normalizeName(iter.first)) + << "\": "; } iter.second.writeJSON(json_version, p); } diff --git a/libqpdf/QPDF_Name.cc b/libqpdf/QPDF_Name.cc index 458b1428..04614769 100644 --- a/libqpdf/QPDF_Name.cc +++ b/libqpdf/QPDF_Name.cc @@ -3,6 +3,8 @@ #include #include +#include + QPDF_Name::QPDF_Name(std::string const& name) : QPDFValue(::ot_name, "name"), name(name) @@ -52,20 +54,65 @@ QPDF_Name::unparse() return normalizeName(this->name); } +std::pair +QPDF_Name::analyzeJSONEncoding(const std::string& name) +{ + std::basic_string_view view{ + reinterpret_cast(name.data()), name.size()}; + + int tail = 0; // Number of continuation characters expected. + bool tail2 = false; // Potential overlong 3 octet utf-8. 
+ bool tail3 = false; // Potential overlong 4 octet utf-8. + bool needs_escaping = false; + for (auto const& c: view) { + if (tail) { + if ((c & 0xc0) != 0x80) { + return {false, false}; + } + if (tail2) { + if ((c & 0xe0) == 0x80) { + return {false, false}; + } + tail2 = false; + } else if (tail3) { + if ((c & 0xf0) == 0x80) { + return {false, false}; + } + tail3 = false; + } + tail--; + } else if (c < 0x80) { + if (!needs_escaping) { + needs_escaping = !((c > 34 && c != '\\') || c == ' ' || c == 33); + } + } else if ((c & 0xe0) == 0xc0) { + if ((c & 0xfe) == 0xc0) { + return {false, false}; + } + tail = 1; + } else if ((c & 0xf0) == 0xe0) { + tail2 = (c == 0xe0); + tail = 2; + } else if ((c & 0xf8) == 0xf0) { + tail3 = (c == 0xf0); + tail = 3; + } else { + return {false, false}; + } + } + return {tail == 0, !needs_escaping}; +} + JSON QPDF_Name::getJSON(int json_version) { if (json_version == 1) { return JSON::makeString(normalizeName(this->name)); } else { - bool has_8bit_chars; - bool is_valid_utf8; - bool is_utf16; - QUtil::analyze_encoding(this->name, has_8bit_chars, is_valid_utf8, is_utf16); - if (!has_8bit_chars || is_valid_utf8) { - return JSON::makeString(this->name); + if (auto res = analyzeJSONEncoding(name); res.first) { + return JSON::makeString(name); } else { - return JSON::makeString("n:" + normalizeName(this->name)); + return JSON::makeString("n:" + normalizeName(name)); } } } @@ -76,12 +123,12 @@ QPDF_Name::writeJSON(int json_version, JSON::Writer& p) if (json_version == 1) { p << "\"" << JSON::Writer::encode_string(normalizeName(name)) << "\""; } else { - bool has_8bit_chars; - bool is_valid_utf8; - bool is_utf16; - QUtil::analyze_encoding(this->name, has_8bit_chars, is_valid_utf8, is_utf16); - if (!has_8bit_chars || is_valid_utf8) { - p << "\"" << JSON::Writer::encode_string(name) << "\""; + if (auto res = analyzeJSONEncoding(name); res.first) { + if (res.second) { + p << "\"" << name << "\""; + } else { + p << "\"" << 
JSON::Writer::encode_string(name) << "\""; + } } else { p << "\"n:" << JSON::Writer::encode_string(normalizeName(name)) << "\""; } diff --git a/libqpdf/qpdf/QPDF_Name.hh b/libqpdf/qpdf/QPDF_Name.hh index 167ddef5..fd4ac458 100644 --- a/libqpdf/qpdf/QPDF_Name.hh +++ b/libqpdf/qpdf/QPDF_Name.hh @@ -15,6 +15,11 @@ class QPDF_Name: public QPDFValue // Put # into strings with characters unsuitable for name token static std::string normalizeName(std::string const& name); + + // Check whether name is valid utf-8 and whether it contains characters that require escaping. + // Return {false, false} if the name is not valid utf-8, otherwise return {true, true} if no + // characters require escaping or {true, false} if escaping is required. + static std::pair analyzeJSONEncoding(std::string const& name); std::string getStringValue() const override {