diff --git a/internal/textfile/read.go b/internal/textfile/read.go
new file mode 100644
index 000000000..3129ba8fe
--- /dev/null
+++ b/internal/textfile/read.go
@@ -0,0 +1,43 @@
+// Package textfile reads files that contain text. It detects a leading Byte
+// Order Mark (BOM), removes it and converts UTF-16 encoded content to UTF-8.
+package textfile
+
+import (
+	"bytes"
+	"io/ioutil"
+
+	"golang.org/x/text/encoding/unicode"
+)
+
+// Byte Order Marks (BOMs) recognized by Decode.
+var (
+	bomUTF8              = []byte{0xef, 0xbb, 0xbf}
+	bomUTF16BigEndian    = []byte{0xfe, 0xff}
+	bomUTF16LittleEndian = []byte{0xff, 0xfe}
+)
+
+// Decode removes a leading byte order mark from data and returns the content
+// as UTF-8. Data that does not start with a known mark is returned unchanged.
+func Decode(data []byte) ([]byte, error) {
+	switch {
+	case bytes.HasPrefix(data, bomUTF8):
+		return data[len(bomUTF8):], nil
+	case bytes.HasPrefix(data, bomUTF16BigEndian), bytes.HasPrefix(data, bomUTF16LittleEndian):
+		// UseBOM means the endianness is selected automatically from the mark
+		utf16 := unicode.UTF16(unicode.BigEndian, unicode.UseBOM)
+		return utf16.NewDecoder().Bytes(data)
+	}
+
+	// no encoding specified, let's assume UTF-8
+	return data, nil
+}
+
+// Read returns the contents of the file, converted to UTF-8, stripped of any BOM.
+func Read(filename string) ([]byte, error) {
+	raw, err := ioutil.ReadFile(filename)
+	if err != nil {
+		// the file could not be read, so there is nothing to decode
+		return nil, err
+	}
+	return Decode(raw)
+}
diff --git a/internal/textfile/read_test.go b/internal/textfile/read_test.go
new file mode 100644
index 000000000..572a33ebe
--- /dev/null
+++ b/internal/textfile/read_test.go
@@ -0,0 +1,76 @@
+package textfile
+
+import (
+	"bytes"
+	"encoding/hex"
+	"testing"
+
+	"github.com/restic/restic/internal/fs"
+)
+
+// writeTempfile writes data to a temporary file obtained from fs.TestTempFile
+// and closes it. It returns the file and the cleanup function received from
+// fs.TestTempFile. The test is aborted when writing or closing fails.
+func writeTempfile(t testing.TB, data []byte) (fs.File, func()) {
+	tempfile, cleanup := fs.TestTempFile(t, "restic-test-textfile-read-")
+
+	if _, err := tempfile.Write(data); err != nil {
+		t.Fatal(err)
+	}
+
+	if err := tempfile.Close(); err != nil {
+		t.Fatal(err)
+	}
+
+	return tempfile, cleanup
+}
+
+// dec converts the hex string s to bytes, it panics when s is not valid hex.
+func dec(s string) []byte {
+	decoded, err := hex.DecodeString(s)
+	if err != nil {
+		panic(err)
+	}
+	return decoded
+}
+
+// TestRead stores differently encoded inputs in temporary files and checks
+// that Read returns them as UTF-8 without a byte order mark.
+func TestRead(t *testing.T) {
+	var tests = []struct {
+		data []byte
+		want []byte
+	}{
+		// no BOM, want is left nil: the data must come back unmodified
+		{data: []byte("foo bar baz")},
+		{data: []byte("Ööbär")},
+		// UTF-8 with BOM
+		{data: []byte("\xef\xbb\xbffööbär"), want: []byte("fööbär")},
+		// UTF-16 big endian with BOM
+		{data: dec("feff006600f600f6006200e40072"), want: []byte("fööbär")},
+		// UTF-16 little endian with BOM
+		{data: dec("fffe6600f600f6006200e4007200"), want: []byte("fööbär")},
+	}
+
+	for _, test := range tests {
+		t.Run("", func(t *testing.T) {
+			expected := test.data
+			if test.want != nil {
+				expected = test.want
+			}
+
+			f, cleanup := writeTempfile(t, test.data)
+			defer cleanup()
+
+			got, err := Read(f.Name())
+			if err != nil {
+				t.Fatal(err)
+			}
+
+			if bytes.Equal(expected, got) {
+				return
+			}
+			t.Errorf("invalid data returned, want:\n %q\ngot:\n %q", expected, got)
+		})
+	}
+}