Add "n:/pdf-name" to qpdf JSON for binary names (fixes #1072)

This commit is contained in:
Jay Berkenbilt 2023-12-21 17:14:28 -05:00
parent bb12a7ff8d
commit 4400ce84ee
8 changed files with 234 additions and 2 deletions

View File

@ -1,5 +1,12 @@
2023-12-21 Jay Berkenbilt <ejb@ql.org>
* Fix to QPDF JSON: the syntax "n:/pdf-syntax" is now accepted as
an alternative way to represent names. This can be used for any
name (e.g. "n:/text#2fplain"), but it is necessary when the name
contains binary characters. For example, /one#a0two must be
represented as "n:/one#a0two" since the single byte a0 is not
valid in JSON. Fixes #1072.
* From M. Holger: Refactor QPDFParser for performance. See #1059
for a discussion.

View File

@ -57,6 +57,14 @@ QPDF_Name::getJSON(int json_version)
if (json_version == 1) {
return JSON::makeString(normalizeName(this->name));
} else {
return JSON::makeString(this->name);
bool has_8bit_chars;
bool is_valid_utf8;
bool is_utf16;
QUtil::analyze_encoding(this->name, has_8bit_chars, is_valid_utf8, is_utf16);
if (!has_8bit_chars || is_valid_utf8) {
return JSON::makeString(this->name);
} else {
return JSON::makeString("n:" + normalizeName(this->name));
}
}
}

View File

@ -144,6 +144,12 @@ is_name(std::string const& v)
return ((v.length() > 1) && (v.at(0) == '/'));
}
// Return true when v is written in the "n:/pdf-syntax" form: it must begin
// with the literal prefix "n:/" and carry at least one character after it.
static bool
is_pdf_name(std::string const& v)
{
    // A bare "n:/" (or anything shorter) is not accepted.
    if (v.length() <= 3) {
        return false;
    }
    return v.compare(0, 3, "n:/") == 0;
}
bool
QPDF::test_json_validators()
{
@ -740,6 +746,8 @@ QPDF::JSONReactor::makeObject(JSON const& value)
result = QPDFObjectHandle::newString(QUtil::hex_decode(str));
} else if (is_name(str_v)) {
result = QPDFObjectHandle::newName(str_v);
} else if (is_pdf_name(str_v)) {
result = QPDFObjectHandle::parse(str_v.substr(2));
} else {
QTC::TC("qpdf", "QPDF_json unrecognized string value");
error(value.getStart(), "unrecognized string value");

View File

@ -258,6 +258,12 @@ Object Values
syntax resolved. For example, the name whose canonical form (per
the PDF specification) is ``text/plain`` would be represented in
JSON as ``"/text/plain"`` and in PDF as ``"/text#2fplain"``.
Starting with qpdf 11.7.0, the syntax ``"n:/pdf-syntax"`` is
accepted as an alternative. This can be used for any name (e.g.
``"n:/text#2fplain"``), but it is necessary when the name contains
binary characters, i.e., bytes that are not valid UTF-8. For example,
``/one#a0two`` must be represented as ``"n:/one#a0two"`` since the
single byte ``a0`` is not valid UTF-8 and so cannot appear in JSON.
- Indirect object references are represented as JSON strings that
look like a PDF indirect object reference and have the form
@ -824,7 +830,8 @@ version 2.
- Names are shown in qpdf's canonical form rather than in PDF
syntax. (Example: the PDF-syntax name ``/text#2fplain`` appeared
as ``"/text#2fplain"`` in v1 but appears as ``"/text/plain"`` in
v2.
v2. In qpdf 11.7.0, a fix was made to accept ``"n:/pdf-syntax"``
for names containing binary characters.
- The top-level representation of an object in ``"objects"`` is a
dictionary containing either a ``"value"`` key or a ``"stream"``

View File

@ -45,6 +45,13 @@ Planned changes for future 12.x (subject to change):
reference streams, linearization hint streams, and object
streams. This has been fixed.
- Fix to QPDF JSON: the syntax ``"n:/pdf-syntax"`` is now accepted
as an alternative way to represent names. This can be used for
any name (e.g. ``"n:/text#2fplain"``), but it is necessary when
the name contains binary characters (bytes that are not valid
UTF-8). For example, ``/one#a0two`` must be represented as
``"n:/one#a0two"`` since the single byte ``a0`` is not valid UTF-8
and so cannot appear in JSON.
- Build Enhancements:
- The qpdf test suite now passes when qpdf is linked with an

View File

@ -61,6 +61,7 @@ my @goodfiles = (
'form-fields-and-annotations.pdf',
'need-appearances.pdf',
'fxo-blue.pdf',
'weird-tokens.pdf',
);
$n_tests += 6 * scalar(@goodfiles);
@ -341,5 +342,21 @@ $td->runtest("check C API write to JSON stream",
{$td->FILE => "auto-4"},
{$td->FILE => "qpdf-ctest-47-4"});
# Bugs #1072 and #1079 illustrate cases that qpdf-json got wrong. In
# #1072, it was noticed that name tokens containing binary characters
# (using #xx) would generate invalid JSON, even though qpdf's own JSON
# parser would accept it. Also, the JSON spec allows real numbers in
# scientific notation, but the PDF spec does not.
$n_tests += 2;
$td->runtest("handle binary names",
{$td->COMMAND =>
"qpdf --json-output weird-tokens.pdf a.json"},
{$td->STRING => "", $td->EXIT_STATUS => 0});
# Round-trip is tested above.
$td->runtest("check json",
{$td->FILE => "a.json"},
{$td->FILE => "weird-tokens.json"},
$td->NORMALIZE_NEWLINES);
cleanup();
$td->report($n_tests);

View File

@ -0,0 +1,83 @@
{
"qpdf": [
{
"jsonversion": 2,
"pdfversion": "2.0",
"pushedinheritedpageresources": false,
"calledgetallpages": false,
"maxobjectid": 6
},
{
"obj:1 0 R": {
"value": {
"/Extra": [
"u:Names with binary data",
"n:/ABCDEF+#ba#da#cc#e5",
"/ABCEDEF+π",
"n:/one+#a0two",
"/text/plain",
"u:Very small/large reals",
0.00001,
1000000000000
],
"/Pages": "2 0 R",
"/Type": "/Catalog"
}
},
"obj:2 0 R": {
"value": {
"/Count": 1,
"/Kids": [
"3 0 R"
],
"/Type": "/Pages"
}
},
"obj:3 0 R": {
"value": {
"/Contents": "4 0 R",
"/MediaBox": [
0,
0,
612,
792
],
"/Parent": "2 0 R",
"/Resources": {
"/Font": {
"/F1": "6 0 R"
}
},
"/Type": "/Page"
}
},
"obj:4 0 R": {
"stream": {
"data": "QlQKICAvRjEgMjQgVGYKICA3MiA3MjAgVGQKICAoUG90YXRvKSBUagpFVAo=",
"dict": {}
}
},
"obj:5 0 R": {
"value": 44
},
"obj:6 0 R": {
"value": {
"/BaseFont": "/Helvetica",
"/Encoding": "/WinAnsiEncoding",
"/Subtype": "/Type1",
"/Type": "/Font"
}
},
"trailer": {
"value": {
"/ID": [
"b:42841c13bbf709d79a200fa1691836f8",
"b:728c020f464c3cf7e02c12605fa7d88b"
],
"/Root": "1 0 R",
"/Size": 7
}
}
}
]
}

View File

@ -0,0 +1,95 @@
%PDF-2.0
%¿÷¢þ
%QDF-1.0
1 0 obj
<<
/Extra [
(Names with binary data)
/ABCDEF+#ba#da#cc#e5
/ABCEDEF+#cf#80
/one+#a0two
/text#2fplain
(Very small/large reals)
0.00001
1000000000000
]
/Pages 2 0 R
/Type /Catalog
>>
endobj
2 0 obj
<<
/Count 1
/Kids [
3 0 R
]
/Type /Pages
>>
endobj
%% Page 1
3 0 obj
<<
/Contents 4 0 R
/MediaBox [
0
0
612
792
]
/Parent 2 0 R
/Resources <<
/Font <<
/F1 6 0 R
>>
>>
/Type /Page
>>
endobj
%% Contents for page 1
4 0 obj
<<
/Length 5 0 R
>>
stream
BT
/F1 24 Tf
72 720 Td
(Potato) Tj
ET
endstream
endobj
5 0 obj
44
endobj
6 0 obj
<<
/BaseFont /Helvetica
/Encoding /WinAnsiEncoding
/Subtype /Type1
/Type /Font
>>
endobj
xref
0 7
0000000000 65535 f
0000000025 00000 n
0000000261 00000 n
0000000343 00000 n
0000000539 00000 n
0000000638 00000 n
0000000657 00000 n
trailer <<
/Root 1 0 R
/Size 7
/ID [<42841c13bbf709d79a200fa1691836f8><728c020f464c3cf7e02c12605fa7d88b>]
>>
startxref
763
%%EOF