This includes the output PDF, streams from --show-object and
attachments from --save-attachment. This also enables --verbose and
--progress to work with saving to stdout.
* native UTF-8 strings
* names whose PDF and canonical syntax differ in both dictionary key
positions and other positions
For json, names are converted both as names and directly when used as
dictionary keys.
* Replace --create-from-json=file with --json-input, which causes the
regular input to be treated as json.
* Eliminate --to-json
* In --json=2, bring back "objects" and eliminate "objectinfo". Stream
data is never present.
* In --json-output=2, write "qpdf-v2" with "objects" and include
stream data.
There is one unexpected pass in this commit. This script was applied
to the files changed in this commit:
#!/usr/bin/env python3
import json
import sys
def json_dumps(data):
    """Serialize *data* to pretty-printed JSON text.

    Non-ASCII characters are written as-is (``ensure_ascii=False``),
    nested levels are indented by two spaces, and items are separated
    by ``,`` with ``: `` between keys and values.
    """
    return json.dumps(data, ensure_ascii=False,
                      indent=2, separators=(',', ': '))
# Rewrite each JSON file named on the command line: set "version" to 2,
# drop "objectinfo", and move "objects" under a new top-level "qpdf" key.
#
# NOTE(review): the `continue`, the `else:` and the final write-back were
# missing from the text this script was recovered from and have been
# restored from context -- confirm against the original script.
for filename in sys.argv[1:]:
    with open(filename, 'r', encoding='utf-8') as f:
        data = json.loads(f.read())
    data['version'] = 2
    # "objectinfo" is only consulted to find out which objects are
    # streams; it is removed from the output.
    objectinfo = {}
    if 'objectinfo' in data:
        objectinfo = data['objectinfo']
        del data['objectinfo']
    if 'objects' not in data:
        # Nothing to convert; leave the file untouched.
        continue
    qpdf = {'jsonversion': 2, 'pdfversion': '1.3', 'objects': {}}
    for k, v in data['objects'].items():
        is_stream = objectinfo.get(k, {}).get('stream', {}).get('is', False)
        # Keys ending in " R" get an "obj:" prefix; any other key keeps
        # its name.
        if k.endswith(' R'):
            k = 'obj:' + k
        # NOTE(review): assumes every entry is wrapped, including keys
        # that do not end in " R" -- confirm.
        if is_stream:
            v = {'stream': {'dict': v}}
        else:
            v = {'value': v}
        qpdf['objects'][k] = v
    data['qpdf'] = qpdf
    del data['objects']
    # The surrounding text says the script "was applied to the files", so
    # the result is written back in place.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(json_dumps(data) + '\n')
moddify -> modify. Also carefully spell checked all remaining keys by
splitting them into words and running a spell checker, not just
relying on visual proofreading. That was the only one.
This script was used on test data:
#!/usr/bin/env python3
import json
import sys
import re
def json_dumps(data):
    """Return *data* rendered as human-readable JSON.

    Uses a two-space indent, ``,`` / ``: `` separators, and leaves
    non-ASCII characters unescaped (``ensure_ascii=False``).
    """
    return json.dumps(data, ensure_ascii=False,
                      indent=2, separators=(',', ': '))
# For each JSON file named on the command line, rebuild "objectinfo" so
# that its "<num> <gen> R" keys appear in ascending numeric order of
# <num>, with "trailer" (when present) placed last.
#
# NOTE(review): the `continue` after the guard and the final write-back
# were missing from the text this script was recovered from and have been
# restored from context -- confirm against the original script.
for filename in sys.argv[1:]:
    with open(filename, 'r', encoding='utf-8') as f:
        data = json.loads(f.read())
    if 'objectinfo' not in data:
        # Nothing to reorder in this file.
        continue
    trailer = None
    to_sort = []
    for k, v in data['objectinfo'].items():
        if k == 'trailer':
            trailer = v
        m = re.match(r'^(\d+) \d+ R', k)
        if m:
            # Sort on the integer object number, not on the string key,
            # so that "10 0 R" follows "9 0 R".
            to_sort.append([int(m.group(1)), k, v])
    # Keys that are neither "trailer" nor "<num> <gen> R" are not carried
    # over into the rebuilt mapping.
    newobjectinfo = {x[1]: x[2] for x in sorted(to_sort)}
    if trailer is not None:
        newobjectinfo['trailer'] = trailer
    data['objectinfo'] = newobjectinfo
    # The surrounding text says the script "was used on test data", so the
    # result is written back in place.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(json_dumps(data) + '\n')
The following script was used to adjust test data:
#!/usr/bin/env python3
import json
import sys
import re
def json_dumps(data):
    """Format *data* as indented JSON text.

    Output uses two-space indentation and ``,`` / ``: `` separators;
    ``ensure_ascii=False`` keeps non-ASCII characters unescaped.
    """
    return json.dumps(data, ensure_ascii=False,
                      indent=2, separators=(',', ': '))
# For each JSON file named on the command line, rebuild "objects" so that
# its "<num> <gen> R" keys appear in ascending numeric order of <num>,
# with "trailer" (when present) placed last.
#
# NOTE(review): the `continue` after the guard and the final write-back
# were missing from the text this script was recovered from and have been
# restored from context -- confirm against the original script.
for filename in sys.argv[1:]:
    with open(filename, 'r', encoding='utf-8') as f:
        data = json.loads(f.read())
    if 'objects' not in data:
        # Nothing to reorder in this file.
        continue
    trailer = None
    to_sort = []
    for k, v in data['objects'].items():
        if k == 'trailer':
            trailer = v
        m = re.match(r'^(\d+) \d+ R', k)
        if m:
            # Sort on the integer object number, not on the string key,
            # so that "10 0 R" follows "9 0 R".
            to_sort.append([int(m.group(1)), k, v])
    # Keys that are neither "trailer" nor "<num> <gen> R" are not carried
    # over into the rebuilt mapping.
    newobjects = {x[1]: x[2] for x in sorted(to_sort)}
    if trailer is not None:
        newobjects['trailer'] = trailer
    data['objects'] = newobjects
    # The surrounding text says the script "was used to adjust test data",
    # so the result is written back in place.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(json_dumps(data) + '\n')
This commit just changes the order in which fields are written to the
json without changing their content. All the json files in the test
suite were modified with this script to ensure that we didn't get any
changes other than ordering.
#!/usr/bin/env python3
import json
import sys
def json_dumps(data):
    """Dump *data* as JSON text with a two-space indent.

    Separators are ``,`` and ``: ``; non-ASCII characters are emitted
    unescaped because ``ensure_ascii`` is disabled.
    """
    return json.dumps(data, ensure_ascii=False,
                      indent=2, separators=(',', ': '))
for filename in sys.argv[1:]:
with open(filename, 'r') as f:
data = json.loads(
newdata = {}
for i in ('version', 'parameters', 'pages', 'pagelabels',
'acroform', 'attachments', 'encrypt', 'outlines',
'objects', 'objectinfo'):
if i in data:
newdata[i] = data[i]