From 6bd8a2faaa3186dcd6a4ed5d87b3cc3a029d3c4e Mon Sep 17 00:00:00 2001 From: greatroar <@> Date: Wed, 8 Jul 2020 09:59:00 +0200 Subject: [PATCH] backup: Add --ignore-ctime option and document change detection --- changelog/unreleased/pull-2823 | 26 ++++++++++ cmd/restic/cmd_backup.go | 12 ++++- doc/040_backup.rst | 70 ++++++++++++++++++++------- internal/archiver/archiver.go | 57 +++++++++++----------- internal/archiver/archiver_test.go | 77 ++++++++++++++++++++++-------- 5 files changed, 174 insertions(+), 68 deletions(-) create mode 100644 changelog/unreleased/pull-2823 diff --git a/changelog/unreleased/pull-2823 b/changelog/unreleased/pull-2823 new file mode 100644 index 000000000..ce4a3dfb2 --- /dev/null +++ b/changelog/unreleased/pull-2823 @@ -0,0 +1,26 @@ +Enhancement: Add option to let backup trust mtime without checking ctime + +The backup command used to require that both ctime and mtime of a file matched +with a previously backed up version to determine that the file was unchanged. +In other words, if either ctime or mtime of the file had changed, it would be +considered changed and restic would read the file's content again to back up +the relevant (changed) parts of it. + +The new option --ignore-ctime makes restic look at mtime only, such that ctime +changes for a file do not cause restic to read the file's contents again. + +The check for both ctime and mtime was introduced in restic 0.9.6 to make +backups more reliable in the face of programs that reset mtime (some Unix +archivers do that), but it turned out to often be expensive because it made +restic read file contents even if only the metadata (owner, permissions) of +a file had changed. The new --ignore-ctime option lets the user restore the +0.9.5 behavior when needed. The existing --ignore-inode option already turned +off this behavior, but also removed a different check. 
+ +Please note that changes in files' metadata are still recorded, regardless of +the command line options provided to the backup command. + +https://github.com/restic/restic/issues/2495 +https://github.com/restic/restic/issues/2558 +https://github.com/restic/restic/issues/2819 +https://github.com/restic/restic/pull/2823 diff --git a/cmd/restic/cmd_backup.go b/cmd/restic/cmd_backup.go index b8d700964..d8500b263 100644 --- a/cmd/restic/cmd_backup.go +++ b/cmd/restic/cmd_backup.go @@ -90,6 +90,7 @@ type BackupOptions struct { TimeStamp string WithAtime bool IgnoreInode bool + IgnoreCtime bool UseFsSnapshot bool } @@ -126,6 +127,7 @@ func init() { f.StringVar(&backupOptions.TimeStamp, "time", "", "`time` of the backup (ex. '2012-11-01 22:08:41') (default: now)") f.BoolVar(&backupOptions.WithAtime, "with-atime", false, "store the atime for all files and directories") f.BoolVar(&backupOptions.IgnoreInode, "ignore-inode", false, "ignore inode number changes when checking for modified files") + f.BoolVar(&backupOptions.IgnoreCtime, "ignore-ctime", false, "ignore ctime changes when checking for modified files") if runtime.GOOS == "windows" { f.BoolVar(&backupOptions.UseFsSnapshot, "use-fs-snapshot", false, "use filesystem snapshot where possible (currently only Windows VSS)") } @@ -665,7 +667,15 @@ func runBackup(opts BackupOptions, gopts GlobalOptions, term *termstatus.Termina arch.CompleteItem = p.CompleteItem arch.StartFile = p.StartFile arch.CompleteBlob = p.CompleteBlob - arch.IgnoreInode = opts.IgnoreInode + + if opts.IgnoreInode { + // --ignore-inode implies --ignore-ctime: on FUSE, the ctime is not + // reliable either. 
+ arch.ChangeIgnoreFlags |= archiver.ChangeIgnoreCtime | archiver.ChangeIgnoreInode + } + if opts.IgnoreCtime { + arch.ChangeIgnoreFlags |= archiver.ChangeIgnoreCtime + } if parentSnapshotID == nil { parentSnapshotID = &restic.ID{} diff --git a/doc/040_backup.rst b/doc/040_backup.rst index 182999781..b79e2b70b 100644 --- a/doc/040_backup.rst +++ b/doc/040_backup.rst @@ -131,24 +131,62 @@ restic encounters: In fact several hosts may use the same repository to backup directories and files leading to a greater de-duplication. -Please be aware that when you backup different directories (or the -directories to be saved have a variable name component like a -time/date), restic always needs to read all files and only afterwards -can compute which parts of the files need to be saved. When you backup -the same directory again (maybe with new or changed files) restic will -find the old snapshot in the repo and by default only reads those files -that are new or have been modified since the last snapshot. This is -decided based on the following attributes of the file in the file system: - - * Type (file, symlink, or directory?) - * Modification time - * Size - * Inode number (internal number used to reference a file in a file system) - Now is a good time to run ``restic check`` to verify that all data is properly stored in the repository. You should run this command regularly to make sure the internal structure of the repository is free of errors. +File change detection +********************* + +When restic encounters a file that has already been backed up, whether in the +current backup or a previous one, it makes sure the file's contents are only +stored once in the repository. To do so, it normally has to scan the entire +contents of every file. Because this can be very expensive, restic also uses a +change detection rule based on file metadata to determine whether a file is +likely unchanged since a previous backup. If it is, the file is not scanned +again. 
+ +Change detection is only performed for regular files (not special files, +symlinks or directories) that have the exact same path as they did in a +previous backup of the same location. If a file or one of its containing +directories was renamed, it is considered a different file and its entire +contents will be scanned again. + +Metadata changes (permissions, ownership, etc.) are always included in the +backup, even if file contents are considered unchanged. + +On **Unix** (including Linux and Mac), given that a file lives at the same +location as a file in a previous backup, the following file metadata +attributes have to match for its contents to be presumed unchanged: + + * Modification timestamp (mtime). + * Metadata change timestamp (ctime). + * File size. + * Inode number (internal number used to reference a file in a filesystem). + +The reason for requiring both mtime and ctime to match is that Unix programs +can freely change mtime (and some do). In such cases, a ctime change may be +the only hint that a file did change. + +The following ``restic backup`` command line flags modify the change detection +rules: + + * ``--force``: turn off change detection and rescan all files. + * ``--ignore-ctime``: require mtime to match, but allow ctime to differ. + * ``--ignore-inode``: require mtime to match, but allow inode number + and ctime to differ. + +The option ``--ignore-inode`` exists to support FUSE-based filesystems and +pCloud, which do not assign stable inodes to files. + +Note that the device id of the containing mount point is never taken into +account. Device numbers are not stable for removable devices and ZFS snapshots. +If you want to force a re-scan in such a case, you can change the mountpoint. + +On **Windows**, a file is considered unchanged when its path and modification +time match, and only ``--force`` has any effect. The other options are +recognized but ignored. 
+ Excluding Files *************** @@ -372,10 +410,6 @@ written, and the next backup needs to write new metadata again. If you really want to save the access time for files and directories, you can pass the ``--with-atime`` option to the ``backup`` command. -In filesystems that do not support inode consistency, like FUSE-based ones and pCloud, it is -possible to ignore inode on changed files comparison by passing ``--ignore-inode`` to -``backup`` command. - Reading data from stdin *********************** diff --git a/internal/archiver/archiver.go b/internal/archiver/archiver.go index 2fd170a3a..2b0ac4ada 100644 --- a/internal/archiver/archiver.go +++ b/internal/archiver/archiver.go @@ -78,10 +78,18 @@ type Archiver struct { // WithAtime configures if the access time for files and directories should // be saved. Enabling it may result in much metadata, so it's off by // default. - WithAtime bool - IgnoreInode bool + WithAtime bool + + // Flags controlling change detection. See doc/040_backup.rst for details. + ChangeIgnoreFlags uint } +// Flags for the ChangeIgnoreFlags bitfield. +const ( + ChangeIgnoreCtime = 1 << iota + ChangeIgnoreInode +) + // Options is used to configure the archiver. type Options struct { // FileReadConcurrency sets how many files are read in concurrently. 
If @@ -134,7 +142,6 @@ func New(repo restic.Repository, fs fs.FS, opts Options) *Archiver { CompleteItem: func(string, *restic.Node, *restic.Node, ItemStats, time.Duration) {}, StartFile: func(string) {}, CompleteBlob: func(string, uint64) {}, - IgnoreInode: false, } return arch @@ -379,7 +386,7 @@ func (arch *Archiver) Save(ctx context.Context, snPath, target string, previous // check if the file has not changed before performing a fopen operation (more expensive, specially // in network filesystems) - if previous != nil && !fileChanged(fi, previous, arch.IgnoreInode) { + if previous != nil && !fileChanged(fi, previous, arch.ChangeIgnoreFlags) { if arch.allBlobsPresent(previous) { debug.Log("%v hasn't changed, using old list of blobs", target) arch.CompleteItem(snPath, previous, previous, ItemStats{}, time.Since(start)) @@ -481,36 +488,30 @@ func (arch *Archiver) Save(ctx context.Context, snPath, target string, previous return fn, false, nil } -// fileChanged returns true if the file's content has changed since the node -// was created. -func fileChanged(fi os.FileInfo, node *restic.Node, ignoreInode bool) bool { - if node == nil { +// fileChanged tries to detect whether a file's content has changed compared +// to the contents of node, which describes the same path in the parent backup. +// It should only be run for regular files. +func fileChanged(fi os.FileInfo, node *restic.Node, ignoreFlags uint) bool { + switch { + case node == nil: + return true + case node.Type != "file": + // We're only called for regular files, so this is a type change. 
+ return true + case uint64(fi.Size()) != node.Size: + return true + case !fi.ModTime().Equal(node.ModTime): return true } - // check type change - if node.Type != "file" { - return true - } + checkCtime := ignoreFlags&ChangeIgnoreCtime == 0 + checkInode := ignoreFlags&ChangeIgnoreInode == 0 - // check modification timestamp - if !fi.ModTime().Equal(node.ModTime) { - return true - } - - // check status change timestamp extFI := fs.ExtendedStat(fi) - if !ignoreInode && !extFI.ChangeTime.Equal(node.ChangeTime) { + switch { + case checkCtime && !extFI.ChangeTime.Equal(node.ChangeTime): return true - } - - // check size - if uint64(fi.Size()) != node.Size || uint64(extFI.Size) != node.Size { - return true - } - - // check inode - if !ignoreInode && node.Inode != extFI.Inode { + case checkInode && node.Inode != extFI.Inode: return true } diff --git a/internal/archiver/archiver_test.go b/internal/archiver/archiver_test.go index afd8e0f8a..3bd4d630b 100644 --- a/internal/archiver/archiver_test.go +++ b/internal/archiver/archiver_test.go @@ -505,6 +505,18 @@ func save(t testing.TB, filename string, data []byte) { } } +func chmodTwice(t testing.TB, name string) { + // POSIX says that ctime is updated "even if the file status does not + // change", but let's make sure it does change, just in case. 
+ err := os.Chmod(name, 0700) + restictest.OK(t, err) + + sleep() + + err = os.Chmod(name, 0600) + restictest.OK(t, err) +} + func lstat(t testing.TB, name string) os.FileInfo { fi, err := os.Lstat(name) if err != nil { @@ -533,6 +545,13 @@ func remove(t testing.TB, filename string) { } } +func rename(t testing.TB, oldname, newname string) { + err := os.Rename(oldname, newname) + if err != nil { + t.Fatal(err) + } +} + func nodeFromFI(t testing.TB, filename string, fi os.FileInfo) *restic.Node { node, err := restic.NodeFromFileInfo(filename, fi) if err != nil { @@ -542,26 +561,26 @@ func nodeFromFI(t testing.TB, filename string, fi os.FileInfo) *restic.Node { return node } +// sleep sleeps long enough to ensure a timestamp change. +func sleep() { + d := 50 * time.Millisecond + if runtime.GOOS == "darwin" { + // On older Darwin instances, the file system only supports one second + // granularity. + d = 1500 * time.Millisecond + } + time.Sleep(d) +} + func TestFileChanged(t *testing.T) { var defaultContent = []byte("foobar") - var d = 50 * time.Millisecond - if runtime.GOOS == "darwin" { - // on older darwin instances the file system only supports one second - // granularity - d = time.Second - } - - sleep := func() { - time.Sleep(d) - } - var tests = []struct { Name string SkipForWindows bool Content []byte Modify func(t testing.TB, filename string) - IgnoreInode bool + ChangeIgnore uint SameFile bool }{ { @@ -618,17 +637,33 @@ func TestFileChanged(t *testing.T) { save(t, filename, defaultContent) }, }, + { + Name: "ctime-change", + Modify: chmodTwice, + SameFile: false, + SkipForWindows: true, // No ctime on Windows, so this test would fail. + }, + { + Name: "ignore-ctime-change", + Modify: chmodTwice, + ChangeIgnore: ChangeIgnoreCtime, + SameFile: true, + SkipForWindows: true, // No ctime on Windows, so this test is meaningless. 
+ }, { Name: "ignore-inode", Modify: func(t testing.TB, filename string) { fi := lstat(t, filename) - remove(t, filename) - sleep() + // First create the new file, then remove the old one, + // so that the old file retains its inode number. + tempname := filename + ".old" + rename(t, filename, tempname) save(t, filename, defaultContent) + remove(t, tempname) setTimestamp(t, filename, fi.ModTime(), fi.ModTime()) }, - IgnoreInode: true, - SameFile: true, + ChangeIgnore: ChangeIgnoreCtime | ChangeIgnoreInode, + SameFile: true, }, } @@ -651,7 +686,7 @@ func TestFileChanged(t *testing.T) { fiBefore := lstat(t, filename) node := nodeFromFI(t, filename, fiBefore) - if fileChanged(fiBefore, node, false) { + if fileChanged(fiBefore, node, 0) { t.Fatalf("unchanged file detected as changed") } @@ -661,12 +696,12 @@ func TestFileChanged(t *testing.T) { if test.SameFile { // file should be detected as unchanged - if fileChanged(fiAfter, node, test.IgnoreInode) { + if fileChanged(fiAfter, node, test.ChangeIgnore) { t.Fatalf("unmodified file detected as changed") } } else { // file should be detected as changed - if !fileChanged(fiAfter, node, test.IgnoreInode) && !test.SameFile { + if !fileChanged(fiAfter, node, test.ChangeIgnore) && !test.SameFile { t.Fatalf("modified file detected as unchanged") } } @@ -684,7 +719,7 @@ func TestFilChangedSpecialCases(t *testing.T) { t.Run("nil-node", func(t *testing.T) { fi := lstat(t, filename) - if !fileChanged(fi, nil, false) { + if !fileChanged(fi, nil, 0) { t.Fatal("nil node detected as unchanged") } }) @@ -693,7 +728,7 @@ func TestFilChangedSpecialCases(t *testing.T) { fi := lstat(t, filename) node := nodeFromFI(t, filename, fi) node.Type = "symlink" - if !fileChanged(fi, node, false) { + if !fileChanged(fi, node, 0) { t.Fatal("node with changed type detected as unchanged") } })