From 3d9bac43da5937235c962a53e68475f796c370aa Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Mon, 2 May 2022 15:46:07 -0400 Subject: [PATCH] Add internal Pl_Base64 Bidirectional base64; will be used by JSON v2. --- TODO | 1 - libqpdf/CMakeLists.txt | 1 + libqpdf/Pl_Base64.cc | 191 ++++++++++++++++++++++++++++++++++++ libqpdf/qpdf/Pl_Base64.hh | 30 ++++++ libtests/CMakeLists.txt | 1 + libtests/base64.cc | 81 +++++++++++++++ libtests/qtest/base64.test | 58 +++++++++++ libtests/qtest/base64/1.dec | 1 + libtests/qtest/base64/1.enc | 1 + libtests/qtest/base64/2.dec | 1 + libtests/qtest/base64/2.enc | 1 + libtests/qtest/base64/3.dec | 1 + libtests/qtest/base64/3.enc | 1 + libtests/qtest/base64/4.dec | 1 + libtests/qtest/base64/4.enc | 1 + libtests/qtest/base64/5.dec | Bin 0 -> 350 bytes libtests/qtest/base64/5.enc | 1 + 17 files changed, 371 insertions(+), 1 deletion(-) create mode 100644 libqpdf/Pl_Base64.cc create mode 100644 libqpdf/qpdf/Pl_Base64.hh create mode 100644 libtests/base64.cc create mode 100644 libtests/qtest/base64.test create mode 100644 libtests/qtest/base64/1.dec create mode 100644 libtests/qtest/base64/1.enc create mode 100644 libtests/qtest/base64/2.dec create mode 100644 libtests/qtest/base64/2.enc create mode 100644 libtests/qtest/base64/3.dec create mode 100644 libtests/qtest/base64/3.enc create mode 100644 libtests/qtest/base64/4.dec create mode 100644 libtests/qtest/base64/4.enc create mode 100644 libtests/qtest/base64/5.dec create mode 100644 libtests/qtest/base64/5.enc diff --git a/TODO b/TODO index ef70b7ac..14b7cfcb 100644 --- a/TODO +++ b/TODO @@ -45,7 +45,6 @@ notes from 5/2: Need new pipelines: * Pl_OStream(std::ostream) with semantics like Pl_StdioFile * Pl_String to std::string with semantics like Pl_Buffer -* Pl_Base64 New Pipeline methods: * writeString(std::string const&) diff --git a/libqpdf/CMakeLists.txt b/libqpdf/CMakeLists.txt index 305977de..72b87975 100644 --- a/libqpdf/CMakeLists.txt +++ b/libqpdf/CMakeLists.txt @@ -35,6 
// Small conversion helpers used by the base64 encoder/decoder. Each one
// spells out its target type in exactly one place so that call sites can
// convert between byte values and the int arithmetic used for bit
// packing without repeating casts.

// Convert an unsigned character code to plain char so it can be compared
// against character literals.
static char
to_c(unsigned int ch)
{
    return static_cast<char>(ch);
}

// Reduce an int to a single byte; only the low eight bits are kept.
static unsigned char
to_uc(int ch)
{
    return static_cast<unsigned char>(ch);
}

// Produce an int from a narrower value (callers pass a byte, which is
// promoted at the call) so range comparisons and offsets are done in int.
static int
to_i(int i)
{
    return static_cast<int>(i);
}
if ((ch >= '0') && (ch <= '9')) { + v = ch - '0' + 52; + } else if ((ch == '+') || (ch == '-')) { + v = 62; + } else if ((ch == '/') || (ch == '_')) { + v = 63; + } else if ( + (ch == '=') && ((i == 3) || ((i == 2) && (this->buf[3] == '=')))) { + ++pad; + this->end_of_data = true; + v = 0; + } else { + throw std::runtime_error( + getIdentifier() + ": base64 decode: invalid input"); + } + outval |= v << shift; + shift -= 6; + } + unsigned char out[3] = { + to_uc(outval >> 16), + to_uc(0xff & (outval >> 8)), + to_uc(0xff & outval), + }; + + getNext()->write(out, QIntC::to_size(3 - pad)); +} + +void +Pl_Base64::flush_encode() +{ + int outval = ((this->buf[0] << 16) | (this->buf[1] << 8) | (this->buf[2])); + unsigned char out[4] = { + to_uc(outval >> 18), + to_uc(0x3f & (outval >> 12)), + to_uc(0x3f & (outval >> 6)), + to_uc(0x3f & outval), + }; + for (size_t i = 0; i < 4; ++i) { + int ch = to_i(out[i]); + if (ch < 26) { + ch += 'A'; + } else if (ch < 52) { + ch -= 26; + ch += 'a'; + } else if (ch < 62) { + ch -= 52; + ch += '0'; + } else if (ch == 62) { + ch = '+'; + } else if (ch == 63) { + ch = '/'; + } + out[i] = to_uc(ch); + } + for (size_t i = 0; i < 3 - this->pos; ++i) { + out[3 - i] = '='; + } + getNext()->write(out, 4); +} + +void +Pl_Base64::finish() +{ + if (this->pos > 0) { + if (finished) { + throw std::logic_error("Pl_Base64 used after finished"); + } + if (this->action == a_decode) { + for (size_t i = this->pos; i < 4; ++i) { + this->buf[i] = '='; + } + } + flush(); + } + this->finished = true; + getNext()->finish(); +} + +void +Pl_Base64::reset() +{ + this->pos = 0; + memset(buf, 0, 4); +} diff --git a/libqpdf/qpdf/Pl_Base64.hh b/libqpdf/qpdf/Pl_Base64.hh new file mode 100644 index 00000000..313bd2cb --- /dev/null +++ b/libqpdf/qpdf/Pl_Base64.hh @@ -0,0 +1,30 @@ +#ifndef PL_BASE64_HH +#define PL_BASE64_HH + +#include + +class Pl_Base64: public Pipeline +{ + public: + enum action_e { a_encode, a_decode }; + Pl_Base64(char const* identifier, Pipeline* 
next, action_e); + virtual ~Pl_Base64() = default; + virtual void write(unsigned char* buf, size_t len) override; + virtual void finish() override; + + private: + void decode(unsigned char* buf, size_t len); + void encode(unsigned char* buf, size_t len); + void flush(); + void flush_decode(); + void flush_encode(); + void reset(); + + action_e action; + unsigned char buf[4]; + size_t pos; + bool end_of_data; + bool finished; +}; + +#endif // PL_BASE64_HH diff --git a/libtests/CMakeLists.txt b/libtests/CMakeLists.txt index 96f93482..9eb9a490 100644 --- a/libtests/CMakeLists.txt +++ b/libtests/CMakeLists.txt @@ -3,6 +3,7 @@ set(TEST_PROGRAMS aes arg_parser ascii85 + base64 bits buffer closed_file_input_source diff --git a/libtests/base64.cc b/libtests/base64.cc new file mode 100644 index 00000000..66f2d828 --- /dev/null +++ b/libtests/base64.cc @@ -0,0 +1,81 @@ +#include + +#include +#include +#include +#include +#include +#include +#include + +static bool +write_some(FILE* f, size_t bytes, Pipeline* p) +{ + unsigned char buf[1000]; + assert(bytes <= sizeof(buf)); + size_t len = fread(buf, 1, bytes, f); + if (len > 0) { + p->write(buf, len); + } + if (len < bytes) { + if (ferror(f)) { + std::cerr << "error reading file" << std::endl; + exit(2); + } + p->finish(); + return false; + } + return (len == bytes); +} + +static void +usage() +{ + std::cerr << "Usage: base64 encode|decode" << std::endl; + exit(2); +} + +int +main(int argc, char* argv[]) +{ + if (argc != 2) { + usage(); + } + QUtil::binary_stdout(); + QUtil::binary_stdin(); + Pl_Base64::action_e action = Pl_Base64::a_decode; + if (strcmp(argv[1], "encode") == 0) { + action = Pl_Base64::a_encode; + } else if (strcmp(argv[1], "decode") != 0) { + usage(); + } + + try { + Pl_StdioFile out("stdout", stdout); + Pl_Base64 decode("decode", &out, action); + // The comments are "n: n%4 n%3", where n is the number of + // bytes read at the end of the call, and are there to + // indicate that we are reading in chunks that 
exercise + // various boundary conditions around subsequent writes and + // the state of buf and pos. There are some writes that don't + // do flush at all, some that call flush multiple times, and + // some that start in the middle and do flush, and this is + // true for both encode and decode. + if (write_some(stdin, 1, &decode) && // 1: 1 1 + write_some(stdin, 4, &decode) && // 5: 1 2 + write_some(stdin, 2, &decode) && // 7: 3 1 + write_some(stdin, 2, &decode) && // 9: 1 0 + write_some(stdin, 7, &decode) && // 16: 0 1 + write_some(stdin, 1, &decode) && // 17: 1 2 + write_some(stdin, 9, &decode) && // 26: 2 2 + write_some(stdin, 2, &decode)) { // 28: 0 1 + while (write_some(stdin, 1000, &decode)) { + } + } + } catch (std::exception& e) { + std::cout << "exception: " << e.what() << std::endl; + exit(2); + } + + return 0; +} diff --git a/libtests/qtest/base64.test b/libtests/qtest/base64.test new file mode 100644 index 00000000..9e709c73 --- /dev/null +++ b/libtests/qtest/base64.test @@ -0,0 +1,58 @@ +#!/usr/bin/env perl +require 5.008; +use warnings; +use strict; + +chdir("base64") or die "chdir testdir failed: $!\n"; + +require TestDriver; + +my $td = new TestDriver('base64'); + +cleanup(); + +# ** Do not use normalize newlines on these tests. 
** + +my $n = 5; +for (my $i = 1; $i <= $n; ++$i) +{ + $td->runtest("encode $i", + {$td->COMMAND => "base64 encode < $i.dec"}, + {$td->FILE => "$i.enc", $td->EXIT_STATUS => 0}); + $td->runtest("code $i", + {$td->COMMAND => "base64 decode < $i.enc"}, + {$td->FILE => "$i.dec", $td->EXIT_STATUS => 0}); +} + +$td->runtest("non-zero discard bits", + {$td->COMMAND => "echo c2FsYWR= | base64 decode"}, + {$td->STRING => "salad", $td->EXIT_STATUS => 0}); +$td->runtest("write with +/", + {$td->COMMAND => "echo +/== | base64 decode > a"}, + {$td->STRING => "", $td->EXIT_STATUS => 0}); +$td->runtest("write with -_", + {$td->COMMAND => "echo -_== | base64 decode > b"}, + {$td->STRING => "", $td->EXIT_STATUS => 0}); +$td->runtest("interchangeability of +/ and -_", + {$td->FILE => "a"}, + {$td->FILE => "b"}); + +$td->runtest("invalid characters", + {$td->COMMAND => "echo aaaaa! | base64 decode"}, + {$td->REGEXP => ".*invalid input.*", $td->EXIT_STATUS => 2}); +$td->runtest("invalid pad", + {$td->COMMAND => "echo a= | base64 decode"}, + {$td->REGEXP => ".*invalid input.*", $td->EXIT_STATUS => 2}); +$td->runtest("data after pad", + {$td->COMMAND => "echo aa==potato | base64 decode"}, + {$td->REGEXP => ".*data follows pad characters.*", + $td->EXIT_STATUS => 2}); + +cleanup(); + +$td->report(7 + (2 * $n)); + +sub cleanup +{ + unlink('a', 'b'); +} diff --git a/libtests/qtest/base64/1.dec b/libtests/qtest/base64/1.dec new file mode 100644 index 00000000..d800886d --- /dev/null +++ b/libtests/qtest/base64/1.dec @@ -0,0 +1 @@ +123 \ No newline at end of file diff --git a/libtests/qtest/base64/1.enc b/libtests/qtest/base64/1.enc new file mode 100644 index 00000000..e644af9e --- /dev/null +++ b/libtests/qtest/base64/1.enc @@ -0,0 +1 @@ +MTIz \ No newline at end of file diff --git a/libtests/qtest/base64/2.dec b/libtests/qtest/base64/2.dec new file mode 100644 index 00000000..274c0052 --- /dev/null +++ b/libtests/qtest/base64/2.dec @@ -0,0 +1 @@ +1234 \ No newline at end of file diff --git 
a/libtests/qtest/base64/2.enc b/libtests/qtest/base64/2.enc new file mode 100644 index 00000000..9c06465b --- /dev/null +++ b/libtests/qtest/base64/2.enc @@ -0,0 +1 @@ +MTIzNA== \ No newline at end of file diff --git a/libtests/qtest/base64/3.dec b/libtests/qtest/base64/3.dec new file mode 100644 index 00000000..11d0d991 --- /dev/null +++ b/libtests/qtest/base64/3.dec @@ -0,0 +1 @@ +This file has a multiple of four bytes and is longer than four bytes... diff --git a/libtests/qtest/base64/3.enc b/libtests/qtest/base64/3.enc new file mode 100644 index 00000000..6dd9347a --- /dev/null +++ b/libtests/qtest/base64/3.enc @@ -0,0 +1 @@ +VGhpcyBmaWxlIGhhcyBhIG11bHRpcGxlIG9mIGZvdXIgYnl0ZXMgYW5kIGlzIGxvbmdlciB0aGFuIGZvdXIgYnl0ZXMuLi4K \ No newline at end of file diff --git a/libtests/qtest/base64/4.dec b/libtests/qtest/base64/4.dec new file mode 100644 index 00000000..a5e2af49 --- /dev/null +++ b/libtests/qtest/base64/4.dec @@ -0,0 +1 @@ +This file has a non-multiple of four bytes and is longer than four bytes. diff --git a/libtests/qtest/base64/4.enc b/libtests/qtest/base64/4.enc new file mode 100644 index 00000000..e43b9c1f --- /dev/null +++ b/libtests/qtest/base64/4.enc @@ -0,0 +1 @@ +VGhpcyBmaWxlIGhhcyBhIG5vbi1tdWx0aXBsZSBvZiBmb3VyIGJ5dGVzIGFuZCBpcyBsb25nZXIgdGhhbiBmb3VyIGJ5dGVzLgo= \ No newline at end of file diff --git a/libtests/qtest/base64/5.dec b/libtests/qtest/base64/5.dec new file mode 100644 index 0000000000000000000000000000000000000000..ed99eb1ae37dbcfbf90e7a7eec9be7a8d0955dd3 GIT binary patch literal 350 zcmV-k0ipg>XlZjGW@&6?AYpSLZ*FBEZ)PB6c4cyTAYyrRWguy8AZc_iAWm;|AarP9 zbRcPTAaZ44Y;1WTZDDI=b0A@Ec?x7{W@cq_Wo~0-EFfZabRc(Vc_40YbS?@C009C6 z1O)~M2nh-c3=Iws5D^j+6crX17#SKH9337XAR!_nBqb&%C@Cr{EG;fCFflSSG&MFi zI5|2yJUu=?KtV!7L`6nNNJ&adOifNtP*GA-R8>}2SXo+ITwPvYU}0ioWMyV&XlZI| zY;A6DaB*^Tbai%jczJqze0_d@fPsR8goTEOh>41ejE#