diff --git a/qpdf/build.mk b/qpdf/build.mk index 893bbbd1..1bc21836 100644 --- a/qpdf/build.mk +++ b/qpdf/build.mk @@ -1,4 +1,4 @@ -BINS_qpdf = qpdf test_driver pdf_from_scratch test_large_file +BINS_qpdf = qpdf test_driver pdf_from_scratch test_large_file test_tokenizer CBINS_qpdf = qpdf-ctest TARGETS_qpdf = $(foreach B,$(BINS_qpdf) $(CBINS_qpdf),qpdf/$(OUTPUT_DIR)/$(call binname,$(B))) diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test index f0dde70f..ddf25d73 100644 --- a/qpdf/qtest/qpdf.test +++ b/qpdf/qtest/qpdf.test @@ -240,7 +240,7 @@ foreach my $d (@bug_tests) show_ntests(); # ---------- $td->notify("--- Miscellaneous Tests ---"); -$n_tests += 96; +$n_tests += 97; $td->runtest("qpdf version", {$td->COMMAND => "qpdf --version"}, @@ -263,6 +263,11 @@ $td->runtest("check pass1 file", {$td->FILE => "b.pdf"}, {$td->FILE => "minimal-linearize-pass1.pdf"}); +$td->runtest("tokenizer", + {$td->COMMAND => "test_tokenizer tokens.pdf"}, + {$td->FILE => "tokens.out", $td->EXIT_STATUS => 0}, + $td->NORMALIZE_NEWLINES); + foreach (my $i = 1; $i <= 3; ++$i) { $td->runtest("misc tests", diff --git a/qpdf/qtest/qpdf/tokens.out b/qpdf/qtest/qpdf/tokens.out new file mode 100644 index 00000000..f52e619b --- /dev/null +++ b/qpdf/qtest/qpdf/tokens.out @@ -0,0 +1,1120 @@ +--- BEGIN FILE --- +60: integer: 1 +62: integer: 0 +64: word: obj +68: dict_open: << +73: name: /Type +79: name: /ObjStm +89: name: /Length +97: integer: 6020 +104: name: /N +107: integer: 35 +112: name: /First +119: integer: 323 +123: dict_close: >> +126: word: stream +skipping to endstream +6153: word: endstream +6163: word: endobj +6222: integer: 37 +6225: integer: 0 +6227: word: obj +6231: dict_open: << +6236: name: /Length +6244: integer: 38 +6247: integer: 0 +6249: word: R +6251: dict_close: >> +6254: word: stream +skipping to endstream +6305: word: endstream +6315: word: endobj +6323: integer: 38 +6326: integer: 0 +6328: word: obj +6332: integer: 44 +6335: word: endobj +6394: integer: 39 +6397: integer: 0 +6399: word: obj +6403: dict_open: << +6408: name: /Length +6416: integer: 40 +6419: integer: 0 +6421: word: R +6423: dict_close: >> +6426: word: stream +skipping to endstream +6832: word: endstream +6842: word: endobj +6850: integer: 40 +6853: integer: 0 +6855: word: obj +6859: integer: 399 +6863: word: endobj +6922: integer: 41 +6925: integer: 0 +6927: word: obj +6931: dict_open: << +6936: name: /Length +6944: integer: 42 +6947: integer: 0 +6949: word: R +6951: dict_close: >> +6954: word: stream +skipping to endstream +7001: word: endstream +7011: word: endobj +7019: integer: 42 +7022: integer: 0 +7024: word: obj +7028: integer: 40 +7031: word: endobj +7090: integer: 43 +7093: integer: 0 +7095: word: obj +7099: dict_open: << +7104: name: /Length +7112: integer: 44 +7115: integer: 0 +7117: word: R +7119: dict_close: >> +7122: word: stream +skipping to endstream +7404: word: endstream +7414: word: endobj +7422: integer: 44 +7425: integer: 0 +7427: word: obj +7431: integer: 275 +7435: word: endobj +7494: integer: 45 +7497: integer: 0 +7499: word: obj +7503: dict_open: << +7508: name: /Length +7516: integer: 46 +7519: integer: 0 +7521: word: R +7523: dict_close: >> +7526: word: stream +skipping to endstream +7601: word: endstream +7611: word: endobj +7619: integer: 46 +7622: integer: 0 +7624: word: obj +7628: integer: 68 +7631: word: endobj +7690: integer: 47 +7693: integer: 0 +7695: word: obj +7699: dict_open: << +7704: name: /Length +7712: integer: 48 +7715: integer: 0 +7717: word: R +7719: dict_close: >> +7722: word: stream +skipping to endstream +7773: word: endstream +7783: word: endobj +7791: integer: 48 +7794: integer: 0 +7796: word: obj +7800: integer: 44 +7803: word: endobj +7862: integer: 49 +7865: integer: 0 +7867: word: obj +7871: dict_open: << +7876: name: /Length +7884: integer: 50 +7887: integer: 0 +7889: word: R +7891: dict_close: >> +7894: word: stream +skipping to endstream +7945: word: endstream +7955: word: endobj +7963: integer: 50 +7966: integer: 0 +7968: word: obj +7972: integer: 44 +7975: word: endobj +8034: integer: 51 +8037: integer: 0 +8039: word: obj +8043: dict_open: << +8048: name: /Length +8056: integer: 52 +8059: integer: 0 +8061: word: R +8063: dict_close: >> +8066: word: stream +skipping to endstream +8117: word: endstream +8127: word: endobj +8135: integer: 52 +8138: integer: 0 +8140: word: obj +8144: integer: 44 +8147: word: endobj +8206: integer: 53 +8209: integer: 0 +8211: word: obj +8215: dict_open: << +8220: name: /Length +8228: integer: 54 +8231: integer: 0 +8233: word: R +8235: dict_close: >> +8238: word: stream +skipping to endstream +8289: word: endstream +8299: word: endobj +8307: integer: 54 +8310: integer: 0 +8312: word: obj +8316: integer: 44 +8319: word: endobj +8379: integer: 55 +8382: integer: 0 +8384: word: obj +8388: dict_open: << +8393: name: /Length +8401: integer: 56 +8404: integer: 0 +8406: word: R +8408: dict_close: >> +8411: word: stream +skipping to endstream +8462: word: endstream +8472: word: endobj +8480: integer: 56 +8483: integer: 0 +8485: word: obj +8489: integer: 44 +8492: word: endobj +8552: integer: 57 +8555: integer: 0 +8557: word: obj +8561: dict_open: << +8566: name: /Length +8574: integer: 58 +8577: integer: 0 +8579: word: R +8581: dict_close: >> +8584: word: stream +skipping to endstream +8635: word: endstream +8645: word: endobj +8653: integer: 58 +8656: integer: 0 +8658: word: obj +8662: integer: 44 +8665: word: endobj +8673: integer: 59 +8676: integer: 0 +8678: word: obj +8682: dict_open: << +8687: name: /Type +8693: name: /XRef +8701: name: /Length +8709: integer: 240 +8715: name: /W +8718: array_open: [ +8720: integer: 1 +8722: integer: 2 +8724: integer: 1 +8726: array_close: ] +8730: name: /Root +8736: integer: 2 +8738: integer: 0 +8740: word: R +8744: name: /Size +8750: integer: 60 +8755: name: /ID +8759: array_open: [ +8760: string: \x88\x04\x8e\x17\xc9a\xe0\x94\xff\xec\xe9\x8c\xb8\x8cF\xd0 (raw: <88048e17c961e094ffece98cb88c46d0>) +8794: string: \xed\xd6\x0f\xe8\xee\x87\xf8\x871\xa8o\x81\x9f\xe6Q\x99 (raw: ) +8828: array_close: ] +8830: dict_close: >> +8833: word: stream +skipping to endstream +9081: word: endstream +9091: word: endobj +9099: word: startxref +9109: integer: 8673 +9120: eof +--- END FILE --- +--- BEGIN PAGE 1 --- +0: word: BT +5: name: /F1 +9: integer: 24 +12: word: Tf +17: integer: 72 +20: integer: 720 +24: word: Td +29: string: Potato (raw: (Potato)) +38: word: Tj +41: word: ET +44: eof +--- END PAGE 1 --- +--- BEGIN PAGE 2 --- +0: word: BT +5: name: /F1 +9: integer: 24 +12: word: Tf +17: integer: 72 +20: integer: 720 +24: word: Td +29: string: Potato (raw: (Potato)) +38: word: Tj +41: word: ET +44: word: BI +47: name: /CS +51: name: /G +53: name: /W +56: integer: 66 +58: name: /H +61: integer: 47 +63: name: /BPC +68: integer: 8 +69: name: /F +71: name: /Fl +74: name: /DP +77: dict_open: << +79: name: /Predictor +90: integer: 15 +92: name: /Columns +101: integer: 66 +103: dict_close: >> +106: word: ID +skipping to EI +352: word: EI +355: word: BT +360: name: /F1 +364: integer: 24 +367: word: Tf +372: integer: 72 +375: integer: 720 +379: word: Td +384: string: Potato (raw: (Potato)) +393: word: Tj +396: word: ET +399: eof +--- END PAGE 2 --- +--- BEGIN PAGE 3 --- +0: word: BT +5: name: /F1 +9: integer: 24 +12: word: Tf +17: integer: 72 +20: integer: 720 +24: word: Td +29: bad: Potato\x0aET\x0a (raw: (Potato\x0aET\x0a) (EOF while reading token) +40: eof +--- END PAGE 3 --- +--- BEGIN PAGE 4 --- +0: word: BT +5: name: /F1 +9: integer: 24 +12: word: Tf +17: string: \xfe\xeb (raw: ) +26: string: \xab\xcd (raw: ) +36: string: quack (raw: (qu\\x0d\x0aack)) +49: string: quack (raw: (qu\\x0aack)) +61: string: quack (raw: (qu\\x0dack)) +73: integer: 72 +76: integer: 720 +80: word: Td +85: real: 3.14 +92: real: 3. +97: real: .14 +103: real: +3.14 +111: real: +3. +117: real: +.14 +124: real: -3.14 +132: real: -3. +138: real: -.14 +145: integer: +16059 +154: integer: -16059 +163: word: +. +168: bad: fadeE (raw: (unexpected >) +179: word: quack +185: bad: /name#oops (invalid name token) +196: name: /name (raw: /n#61me) +204: word: one +208: bool: true +213: word: two +217: bool: false +223: word: three +229: null: null +234: word: four +239: word: !@#$^& +245: brace_open: { +246: brace_close: } +247: word: *-_+= +253: word: abc123def3.14true +271: bad: ff (raw: > +420: dict_open: << +425: name: /Count +432: integer: 11 +437: name: /Kids +443: array_open: [ +449: integer: 4 +451: integer: 0 +453: word: R +459: integer: 5 +461: integer: 0 +463: word: R +469: integer: 6 +471: integer: 0 +473: word: R +479: integer: 7 +481: integer: 0 +483: word: R +489: integer: 8 +491: integer: 0 +493: word: R +499: integer: 9 +501: integer: 0 +503: word: R +509: integer: 10 +512: integer: 0 +514: word: R +520: integer: 11 +523: integer: 0 +525: word: R +531: integer: 12 +534: integer: 0 +536: word: R +542: integer: 13 +545: integer: 0 +547: word: R +553: integer: 14 +556: integer: 0 +558: word: R +562: array_close: ] +566: name: /Type +572: name: /Pages +579: dict_close: >> +651: dict_open: << +656: name: /Contents +666: integer: 37 +669: integer: 0 +671: word: R +675: name: /MediaBox +685: array_open: [ +691: integer: 0 +697: integer: 0 +703: integer: 612 +711: integer: 792 +717: array_close: ] +721: name: /Parent +729: integer: 3 +731: integer: 0 +733: word: R +737: name: /Resources +748: dict_open: << +755: name: /Font +761: dict_open: << +770: name: /F1 +774: integer: 15 +777: integer: 0 +779: word: R +785: dict_close: >> +792: name: /ProcSet +801: integer: 16 +804: integer: 0 +806: word: R +810: dict_close: >> +815: name: /Type +821: name: /Page +827: dict_close: >> +899: dict_open: << +904: name: /Contents +914: integer: 39 +917: integer: 0 +919: word: R +923: name: /MediaBox +933: array_open: [ +939: integer: 0 +945: integer: 0 +951: integer: 612 +959: integer: 792 +965: array_close: ] +969: name: /Parent +977: integer: 3 +979: integer: 0 +981: word: R +985: name: /Resources +996: dict_open: << +1003: name: /Font +1009: dict_open: << +1018: name: /F1 +1022: integer: 17 +1025: integer: 0 +1027: word: R +1033: dict_close: >> +1040: name: /ProcSet +1049: integer: 18 +1052: integer: 0 +1054: word: R +1058: dict_close: >> +1063: name: /Type +1069: name: /Page +1075: dict_close: >> +1147: dict_open: << +1152: name: /Contents +1162: integer: 41 +1165: integer: 0 +1167: word: R +1171: name: /MediaBox +1181: array_open: [ +1187: integer: 0 +1193: integer: 0 +1199: integer: 612 +1207: integer: 792 +1213: array_close: ] +1217: name: /Parent +1225: integer: 3 +1227: integer: 0 +1229: word: R +1233: name: /Resources +1244: dict_open: << +1251: name: /Font +1257: dict_open: << +1266: name: /F1 +1270: integer: 19 +1273: integer: 0 +1275: word: R +1281: dict_close: >> +1288: name: /ProcSet +1297: integer: 20 +1300: integer: 0 +1302: word: R +1306: dict_close: >> +1311: name: /Type +1317: name: /Page +1323: dict_close: >> +1395: dict_open: << +1400: name: /Contents +1410: integer: 43 +1413: integer: 0 +1415: word: R +1419: name: /MediaBox +1429: array_open: [ +1435: integer: 0 +1441: integer: 0 +1447: integer: 612 +1455: integer: 792 +1461: array_close: ] +1465: name: /Parent +1473: integer: 3 +1475: integer: 0 +1477: word: R +1481: name: /Resources +1492: dict_open: << +1499: name: /Font +1505: dict_open: << +1514: name: /F1 +1518: integer: 21 +1521: integer: 0 +1523: word: R +1529: dict_close: >> +1536: name: /ProcSet +1545: integer: 22 +1548: integer: 0 +1550: word: R +1554: dict_close: >> +1559: name: /Type +1565: name: /Page +1571: dict_close: >> +1643: dict_open: << +1648: name: /Contents +1658: integer: 45 +1661: integer: 0 +1663: word: R +1667: name: /MediaBox +1677: array_open: [ +1683: integer: 0 +1689: integer: 0 +1695: integer: 612 +1703: integer: 792 +1709: array_close: ] +1713: name: /Parent +1721: integer: 3 +1723: integer: 0 +1725: word: R +1729: name: /Resources +1740: dict_open: << +1747: name: /Font +1753: dict_open: << +1762: name: /F1 +1766: integer: 23 +1769: integer: 0 +1771: word: R +1777: dict_close: >> +1784: name: /ProcSet +1793: integer: 24 +1796: integer: 0 +1798: word: R +1802: dict_close: >> +1807: name: /Type +1813: name: /Page +1819: dict_close: >> +1891: dict_open: << +1896: name: /Contents +1906: integer: 47 +1909: integer: 0 +1911: word: R +1915: name: /MediaBox +1925: array_open: [ +1931: integer: 0 +1937: integer: 0 +1943: integer: 612 +1951: integer: 792 +1957: array_close: ] +1961: name: /Parent +1969: integer: 3 +1971: integer: 0 +1973: word: R +1977: name: /Resources +1988: dict_open: << +1995: name: /Font +2001: dict_open: << +2010: name: /F1 +2014: integer: 25 +2017: integer: 0 +2019: word: R +2025: dict_close: >> +2032: name: /ProcSet +2041: integer: 26 +2044: integer: 0 +2046: word: R +2050: dict_close: >> +2055: name: /Type +2061: name: /Page +2067: dict_close: >> +2141: dict_open: << +2146: name: /Contents +2156: integer: 49 +2159: integer: 0 +2161: word: R +2165: name: /MediaBox +2175: array_open: [ +2181: integer: 0 +2187: integer: 0 +2193: integer: 612 +2201: integer: 792 +2207: array_close: ] +2211: name: /Parent +2219: integer: 3 +2221: integer: 0 +2223: word: R +2227: name: /Resources +2238: dict_open: << +2245: name: /Font +2251: dict_open: << +2260: name: /F1 +2264: integer: 27 +2267: integer: 0 +2269: word: R +2275: dict_close: >> +2282: name: /ProcSet +2291: integer: 28 +2294: integer: 0 +2296: word: R +2300: dict_close: >> +2305: name: /Type +2311: name: /Page +2317: dict_close: >> +2391: dict_open: << +2396: name: /Contents +2406: integer: 51 +2409: integer: 0 +2411: word: R +2415: name: /MediaBox +2425: array_open: [ +2431: integer: 0 +2437: integer: 0 +2443: integer: 612 +2451: integer: 792 +2457: array_close: ] +2461: name: /Parent +2469: integer: 3 +2471: integer: 0 +2473: word: R +2477: name: /Resources +2488: dict_open: << +2495: name: /Font +2501: dict_open: << +2510: name: /F1 +2514: integer: 29 +2517: integer: 0 +2519: word: R +2525: dict_close: >> +2532: name: /ProcSet +2541: integer: 30 +2544: integer: 0 +2546: word: R +2550: dict_close: >> +2555: name: /Type +2561: name: /Page +2567: dict_close: >> +2642: dict_open: << +2647: name: /Contents +2657: integer: 53 +2660: integer: 0 +2662: word: R +2666: name: /MediaBox +2676: array_open: [ +2682: integer: 0 +2688: integer: 0 +2694: integer: 612 +2702: integer: 792 +2708: array_close: ] +2712: name: /Parent +2720: integer: 3 +2722: integer: 0 +2724: word: R +2728: name: /Resources +2739: dict_open: << +2746: name: /Font +2752: dict_open: << +2761: name: /F1 +2765: integer: 31 +2768: integer: 0 +2770: word: R +2776: dict_close: >> +2783: name: /ProcSet +2792: integer: 32 +2795: integer: 0 +2797: word: R +2801: dict_close: >> +2806: name: /Type +2812: name: /Page +2818: dict_close: >> +2894: dict_open: << +2899: name: /Contents +2909: integer: 55 +2912: integer: 0 +2914: word: R +2918: name: /MediaBox +2928: array_open: [ +2934: integer: 0 +2940: integer: 0 +2946: integer: 612 +2954: integer: 792 +2960: array_close: ] +2964: name: /Parent +2972: integer: 3 +2974: integer: 0 +2976: word: R +2980: name: /Resources +2991: dict_open: << +2998: name: /Font +3004: dict_open: << +3013: name: /F1 +3017: integer: 33 +3020: integer: 0 +3022: word: R +3028: dict_close: >> +3035: name: /ProcSet +3044: integer: 34 +3047: integer: 0 +3049: word: R +3053: dict_close: >> +3058: name: /Type +3064: name: /Page +3070: dict_close: >> +3146: dict_open: << +3151: name: /Contents +3161: integer: 57 +3164: integer: 0 +3166: word: R +3170: name: /MediaBox +3180: array_open: [ +3186: integer: 0 +3192: integer: 0 +3198: integer: 612 +3206: integer: 792 +3212: array_close: ] +3216: name: /Parent +3224: integer: 3 +3226: integer: 0 +3228: word: R +3232: name: /Resources +3243: dict_open: << +3250: name: /Font +3256: dict_open: << +3265: name: /F1 +3269: integer: 35 +3272: integer: 0 +3274: word: R +3280: dict_close: >> +3287: name: /ProcSet +3296: integer: 36 +3299: integer: 0 +3301: word: R +3305: dict_close: >> +3310: name: /Type +3316: name: /Page +3322: dict_close: >> +3387: dict_open: << +3392: name: /BaseFont +3402: name: /Helvetica +3415: name: /Encoding +3425: name: /WinAnsiEncoding +3444: name: /Name +3450: name: /F1 +3456: name: /Subtype +3465: name: /Type1 +3474: name: /Type +3480: name: /Font +3486: dict_close: >> +3551: array_open: [ +3555: name: /PDF +3562: name: /Text +3568: array_close: ] +3632: dict_open: << +3637: name: /BaseFont +3647: name: /Helvetica +3660: name: /Encoding +3670: name: /WinAnsiEncoding +3689: name: /Name +3695: name: /F1 +3701: name: /Subtype +3710: name: /Type1 +3719: name: /Type +3725: name: /Font +3731: dict_close: >> +3796: array_open: [ +3800: name: /PDF +3807: name: /Text +3813: array_close: ] +3877: dict_open: << +3882: name: /BaseFont +3892: name: /Helvetica +3905: name: /Encoding +3915: name: /WinAnsiEncoding +3934: name: /Name +3940: name: /F1 +3946: name: /Subtype +3955: name: /Type1 +3964: name: /Type +3970: name: /Font +3976: dict_close: >> +4041: array_open: [ +4045: name: /PDF +4052: name: /Text +4058: array_close: ] +4122: dict_open: << +4127: name: /BaseFont +4137: name: /Helvetica +4150: name: /Encoding +4160: name: /WinAnsiEncoding +4179: name: /Name +4185: name: /F1 +4191: name: /Subtype +4200: name: /Type1 +4209: name: /Type +4215: name: /Font +4221: dict_close: >> +4286: array_open: [ +4290: name: /PDF +4297: name: /Text +4303: array_close: ] +4367: dict_open: << +4372: name: /BaseFont +4382: name: /Helvetica +4395: name: /Encoding +4405: name: /WinAnsiEncoding +4424: name: /Name +4430: name: /F1 +4436: name: /Subtype +4445: name: /Type1 +4454: name: /Type +4460: name: /Font +4466: dict_close: >> +4531: array_open: [ +4535: name: /PDF +4542: name: /Text +4548: array_close: ] +4612: dict_open: << +4617: name: /BaseFont +4627: name: /Helvetica +4640: name: /Encoding +4650: name: /WinAnsiEncoding +4669: name: /Name +4675: name: /F1 +4681: name: /Subtype +4690: name: /Type1 +4699: name: /Type +4705: name: /Font +4711: dict_close: >> +4776: array_open: [ +4780: name: /PDF +4787: name: /Text +4793: array_close: ] +4857: dict_open: << +4862: name: /BaseFont +4872: name: /Helvetica +4885: name: /Encoding +4895: name: /WinAnsiEncoding +4914: name: /Name +4920: name: /F1 +4926: name: /Subtype +4935: name: /Type1 +4944: name: /Type +4950: name: /Font +4956: dict_close: >> +5021: array_open: [ +5025: name: /PDF +5032: name: /Text +5038: array_close: ] +5102: dict_open: << +5107: name: /BaseFont +5117: name: /Helvetica +5130: name: /Encoding +5140: name: /WinAnsiEncoding +5159: name: /Name +5165: name: /F1 +5171: name: /Subtype +5180: name: /Type1 +5189: name: /Type +5195: name: /Font +5201: dict_close: >> +5266: array_open: [ +5270: name: /PDF +5277: name: /Text +5283: array_close: ] +5347: dict_open: << +5352: name: /BaseFont +5362: name: /Helvetica +5375: name: /Encoding +5385: name: /WinAnsiEncoding +5404: name: /Name +5410: name: /F1 +5416: name: /Subtype +5425: name: /Type1 +5434: name: /Type +5440: name: /Font +5446: dict_close: >> +5511: array_open: [ +5515: name: /PDF +5522: name: /Text +5528: array_close: ] +5592: dict_open: << +5597: name: /BaseFont +5607: name: /Helvetica +5620: name: /Encoding +5630: name: /WinAnsiEncoding +5649: name: /Name +5655: name: /F1 +5661: name: /Subtype +5670: name: /Type1 +5679: name: /Type +5685: name: /Font +5691: dict_close: >> +5756: array_open: [ +5760: name: /PDF +5767: name: /Text +5773: array_close: ] +5837: dict_open: << +5842: name: /BaseFont +5852: name: /Helvetica +5865: name: /Encoding +5875: name: /WinAnsiEncoding +5894: name: /Name +5900: name: /F1 +5906: name: /Subtype +5915: name: /Type1 +5924: name: /Type +5930: name: /Font +5936: dict_close: >> +6001: array_open: [ +6005: name: /PDF +6012: name: /Text +6018: array_close: ] +6020: eof +--- END OBJECT STREAM 1 --- diff --git a/qpdf/qtest/qpdf/tokens.pdf b/qpdf/qtest/qpdf/tokens.pdf new file mode 100644 index 00000000..b444db5f Binary files /dev/null and b/qpdf/qtest/qpdf/tokens.pdf differ diff --git a/qpdf/test_tokenizer.cc b/qpdf/test_tokenizer.cc new file mode 100644 index 00000000..de079195 --- /dev/null +++ b/qpdf/test_tokenizer.cc @@ -0,0 +1,261 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static char const* whoami = 0; + +void usage() +{ + std::cerr << "Usage: " << whoami << " filename" + << std::endl; + exit(2); +} + +class Finder: public InputSource::Finder +{ + public: + Finder(PointerHolder is, std::string const& str) : + is(is), + str(str) + { + } + virtual ~Finder() + { + } + virtual bool check(); + + private: + PointerHolder is; + std::string str; +}; + +bool +Finder::check() +{ + QPDFTokenizer tokenizer; + QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true); + qpdf_offset_t offset = this->is->tell(); + bool result = (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str)); + this->is->seek(offset - this->str.length(), SEEK_SET); + return result; +} + +static char const* tokenTypeName(QPDFTokenizer::token_type_e ttype) +{ + // Do this is a case statement instead of a lookup so the compiler + // will warn if we miss any. + switch (ttype) + { + case QPDFTokenizer::tt_bad: + return "bad"; + case QPDFTokenizer::tt_array_close: + return "array_close"; + case QPDFTokenizer::tt_array_open: + return "array_open"; + case QPDFTokenizer::tt_brace_close: + return "brace_close"; + case QPDFTokenizer::tt_brace_open: + return "brace_open"; + case QPDFTokenizer::tt_dict_close: + return "dict_close"; + case QPDFTokenizer::tt_dict_open: + return "dict_open"; + case QPDFTokenizer::tt_integer: + return "integer"; + case QPDFTokenizer::tt_name: + return "name"; + case QPDFTokenizer::tt_real: + return "real"; + case QPDFTokenizer::tt_string: + return "string"; + case QPDFTokenizer::tt_null: + return "null"; + case QPDFTokenizer::tt_bool: + return "bool"; + case QPDFTokenizer::tt_word: + return "word"; + case QPDFTokenizer::tt_eof: + return "eof"; + } + return 0; +} + +static std::string +sanitize(std::string const& value) +{ + std::string result; + for (std::string::const_iterator iter = value.begin(); iter != value.end(); + ++iter) + { + if ((*iter >= 32) && (*iter <= 126)) + { + result.append(1, *iter); + } + else + { + result += "\\x" + QUtil::int_to_string_base( + static_cast(*iter), 16, 2); + } + } + return result; +} + +static void +try_skipping(PointerHolder is, char const* what, Finder& f) +{ + std::cout << "skipping to " << what << std::endl; + qpdf_offset_t offset = is->tell(); + if (! is->findFirst(what, offset, 0, f)) + { + std::cout << what << " not found" << std::endl; + is->seek(offset, SEEK_SET); + } +} + +static void +dump_tokens(PointerHolder is, std::string const& label, + bool skip_streams, bool skip_inline_images) +{ + Finder f1(is, "endstream"); + Finder f2(is, "EI"); + std::cout << "--- BEGIN " << label << " ---" << std::endl; + bool done = false; + QPDFTokenizer tokenizer; + tokenizer.allowEOF(); + while (! done) + { + QPDFTokenizer::Token token = tokenizer.readToken(is, "test", true); + + qpdf_offset_t offset = is->tell() - token.getRawValue().length(); + std::cout << offset << ": " + << tokenTypeName(token.getType()); + if (token.getType() != QPDFTokenizer::tt_eof) + { + std::cout << ": " + << sanitize(token.getValue()); + if (token.getValue() != token.getRawValue()) + { + std::cout << " (raw: " << sanitize(token.getRawValue()) << ")"; + } + } + if (token.getType() == QPDFTokenizer::tt_bad) + { + std::cout << " (" << token.getErrorMessage() << ")"; + } + std::cout << std::endl; + if (skip_streams && + (token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "stream"))) + { + try_skipping(is, "endstream", f1); + } + else if (skip_inline_images && + (token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "ID"))) + { + try_skipping(is, "EI", f2); + } + else if (token.getType() == QPDFTokenizer::tt_eof) + { + done = true; + } + } + std::cout << "--- END " << label << " ---" << std::endl; +} + +static void process(char const* filename) +{ + PointerHolder is; + QPDFTokenizer tokenizer; + tokenizer.allowEOF(); + + // Tokenize file, skipping streams + FileInputSource* fis = new FileInputSource(); + fis->setFilename(filename); + is = fis; + dump_tokens(is, "FILE", true, false); + + // Tokenize content streams, skipping inline images + QPDF qpdf; + qpdf.processFile(filename); + std::vector pages = qpdf.getAllPages(); + int pageno = 0; + for (std::vector::iterator iter = pages.begin(); + iter != pages.end(); ++iter) + { + ++pageno; + Pl_Buffer plb("buffer"); + std::vector contents = (*iter).getPageContents(); + for (std::vector::iterator citer = contents.begin(); + citer != contents.end(); ++citer) + { + (*citer).pipeStreamData(&plb, 0, qpdf_dl_specialized); + } + plb.finish(); + PointerHolder content_data = plb.getBuffer(); + BufferInputSource* bis = new BufferInputSource( + "content data", content_data.getPointer()); + is = bis; + dump_tokens(is, "PAGE " + QUtil::int_to_string(pageno), false, true); + } + + // Tokenize object streams + std::vector all = qpdf.getAllObjects(); + for (std::vector::iterator iter = all.begin(); + iter != all.end(); ++iter) + { + if ((*iter).isStream() && + (*iter).getDict().getKey("/Type").isName() && + (*iter).getDict().getKey("/Type").getName() == "/ObjStm") + { + PointerHolder b = + (*iter).getStreamData(qpdf_dl_specialized); + BufferInputSource* bis = new BufferInputSource( + "object stream data", b.getPointer()); + is = bis; + dump_tokens(is, "OBJECT STREAM " + + QUtil::int_to_string((*iter).getObjectID()), + false, false); + } + } +} + +int main(int argc, char* argv[]) +{ + QUtil::setLineBuf(stdout); + if ((whoami = strrchr(argv[0], '/')) == NULL) + { + whoami = argv[0]; + } + else + { + ++whoami; + } + // For libtool's sake.... + if (strncmp(whoami, "lt-", 3) == 0) + { + whoami += 3; + } + + if (argc != 2) + { + usage(); + } + + char const* filename = argv[1]; + try + { + process(filename); + } + catch (std::exception& e) + { + std::cerr << whoami << ": exception: " << e.what(); + exit(2); + } + return 0; +}