diff --git a/changelog/unreleased/issue-1909 b/changelog/unreleased/issue-1909 new file mode 100644 index 000000000..869d532bf --- /dev/null +++ b/changelog/unreleased/issue-1909 @@ -0,0 +1,14 @@ +Enhancement: Reject files/dirs by name first + +The current scanner/archiver code had an architectural limitation: it always +ran the `lstat()` system call on all files and directories before a decision to +include/exclude the file/dir was made. This lead to a lot of unnecessary system +calls for items that could have been rejected by their name or path only. + +We've changed the archiver/scanner implementation so that it now first rejects +by name/path, and only runs the system call on the remaining items. This +reduces the number of `lstat()` system calls a lot (depending on the exclude +settings). + +https://github.com/restic/restic/issues/1909 +https://github.com/restic/restic/pull/1912 diff --git a/cmd/restic/cmd_backup.go b/cmd/restic/cmd_backup.go index a1b5981ea..290a31ba1 100644 --- a/cmd/restic/cmd_backup.go +++ b/cmd/restic/cmd_backup.go @@ -186,18 +186,9 @@ func (opts BackupOptions) Check(gopts GlobalOptions, args []string) error { return nil } -// collectRejectFuncs returns a list of all functions which may reject data -// from being saved in a snapshot -func collectRejectFuncs(opts BackupOptions, repo *repository.Repository, targets []string) (fs []RejectFunc, err error) { - // allowed devices - if opts.ExcludeOtherFS && !opts.Stdin { - f, err := rejectByDevice(targets) - if err != nil { - return nil, err - } - fs = append(fs, f) - } - +// collectRejectByNameFuncs returns a list of all functions which may reject data +// from being saved in a snapshot based on path only +func collectRejectByNameFuncs(opts BackupOptions, repo *repository.Repository, targets []string) (fs []RejectByNameFunc, err error) { // exclude restic cache if repo.Cache != nil { f, err := rejectResticCache(repo) @@ -237,6 +228,21 @@ func collectRejectFuncs(opts BackupOptions, repo *repository.Repository, targets return fs, nil } +// collectRejectFuncs returns a list of all functions which may reject data +// from being saved in a snapshot based on path and file info +func collectRejectFuncs(opts BackupOptions, repo *repository.Repository, targets []string) (fs []RejectFunc, err error) { + // allowed devices + if opts.ExcludeOtherFS && !opts.Stdin { + f, err := rejectByDevice(targets) + if err != nil { + return nil, err + } + fs = append(fs, f) + } + + return fs, nil +} + // readExcludePatternsFromFiles reads all exclude files and returns the list of // exclude patterns. For each line, leading and trailing white space is removed // and comment lines are ignored. For each remaining pattern, environment @@ -393,7 +399,13 @@ func runBackup(opts BackupOptions, gopts GlobalOptions, term *termstatus.Termina return err } - // rejectFuncs collect functions that can reject items from the backup + // rejectByNameFuncs collect functions that can reject items from the backup based on path only + rejectByNameFuncs, err := collectRejectByNameFuncs(opts, repo, targets) + if err != nil { + return err + } + + // rejectFuncs collect functions that can reject items from the backup based on path and file info rejectFuncs, err := collectRejectFuncs(opts, repo, targets) if err != nil { return err @@ -414,6 +426,15 @@ func runBackup(opts BackupOptions, gopts GlobalOptions, term *termstatus.Termina p.V("using parent snapshot %v\n", parentSnapshotID.Str()) } + selectByNameFilter := func(item string) bool { + for _, reject := range rejectByNameFuncs { + if reject(item) { + return false + } + } + return true + } + selectFilter := func(item string, fi os.FileInfo) bool { for _, reject := range rejectFuncs { if reject(item, fi) { @@ -436,6 +457,7 @@ func runBackup(opts BackupOptions, gopts GlobalOptions, term *termstatus.Termina } sc := archiver.NewScanner(targetFS) + sc.SelectByName = selectByNameFilter sc.Select = selectFilter sc.Error = p.ScannerError sc.Result = p.ReportTotal @@ -444,6 +466,7 @@ func runBackup(opts BackupOptions, gopts GlobalOptions, term *termstatus.Termina t.Go(func() error { return sc.Scan(t.Context(gopts.ctx), targets) }) arch := archiver.New(repo, targetFS, archiver.Options{}) + arch.SelectByName = selectByNameFilter arch.Select = selectFilter arch.WithAtime = opts.WithAtime arch.Error = p.Error diff --git a/cmd/restic/exclude.go b/cmd/restic/exclude.go index 537bdc344..479f8a308 100644 --- a/cmd/restic/exclude.go +++ b/cmd/restic/exclude.go @@ -60,15 +60,20 @@ func (rc *rejectionCache) Store(dir string, rejected bool) { rc.m[dir] = rejected } +// RejectByNameFunc is a function that takes a filename of a +// file that would be included in the backup. The function returns true if it +// should be excluded (rejected) from the backup. +type RejectByNameFunc func(path string) bool + // RejectFunc is a function that takes a filename and os.FileInfo of a // file that would be included in the backup. The function returns true if it // should be excluded (rejected) from the backup. type RejectFunc func(path string, fi os.FileInfo) bool -// rejectByPattern returns a RejectFunc which rejects files that match +// rejectByPattern returns a RejectByNameFunc which rejects files that match // one of the patterns. -func rejectByPattern(patterns []string) RejectFunc { - return func(item string, fi os.FileInfo) bool { +func rejectByPattern(patterns []string) RejectByNameFunc { + return func(item string) bool { matched, _, err := filter.List(patterns, item) if err != nil { Warnf("error for exclude pattern: %v", err) @@ -83,14 +88,14 @@ func rejectByPattern(patterns []string) RejectFunc { } } -// rejectIfPresent returns a RejectFunc which itself returns whether a path -// should be excluded. The RejectFunc considers a file to be excluded when +// rejectIfPresent returns a RejectByNameFunc which itself returns whether a path +// should be excluded. The RejectByNameFunc considers a file to be excluded when // it resides in a directory with an exclusion file, that is specified by // excludeFileSpec in the form "filename[:content]". The returned error is // non-nil if the filename component of excludeFileSpec is empty. If rc is -// non-nil, it is going to be used in the RejectFunc to expedite the evaluation +// non-nil, it is going to be used in the RejectByNameFunc to expedite the evaluation // of a directory based on previous visits. -func rejectIfPresent(excludeFileSpec string) (RejectFunc, error) { +func rejectIfPresent(excludeFileSpec string) (RejectByNameFunc, error) { if excludeFileSpec == "" { return nil, errors.New("name for exclusion tagfile is empty") } @@ -107,7 +112,7 @@ func rejectIfPresent(excludeFileSpec string) (RejectFunc, error) { } debug.Log("using %q as exclusion tagfile", tf) rc := &rejectionCache{} - fn := func(filename string, _ os.FileInfo) bool { + fn := func(filename string) bool { return isExcludedByFile(filename, tf, tc, rc) } return fn, nil @@ -252,11 +257,11 @@ func rejectByDevice(samples []string) (RejectFunc, error) { }, nil } -// rejectResticCache returns a RejectFunc that rejects the restic cache +// rejectResticCache returns a RejectByNameFunc that rejects the restic cache // directory (if set). -func rejectResticCache(repo *repository.Repository) (RejectFunc, error) { +func rejectResticCache(repo *repository.Repository) (RejectByNameFunc, error) { if repo.Cache == nil { - return func(string, os.FileInfo) bool { + return func(string) bool { return false }, nil } @@ -266,7 +271,7 @@ func rejectResticCache(repo *repository.Repository) (RejectFunc, error) { return nil, errors.New("cacheBase is empty string") } - return func(item string, _ os.FileInfo) bool { + return func(item string) bool { if fs.HasPathPrefix(cacheBase, item) { debug.Log("rejecting restic cache directory %v", item) return true diff --git a/cmd/restic/exclude_test.go b/cmd/restic/exclude_test.go index eeba76e74..741dbdb64 100644 --- a/cmd/restic/exclude_test.go +++ b/cmd/restic/exclude_test.go @@ -27,7 +27,7 @@ func TestRejectByPattern(t *testing.T) { for _, tc := range tests { t.Run("", func(t *testing.T) { reject := rejectByPattern(patterns) - res := reject(tc.filename, nil) + res := reject(tc.filename) if res != tc.reject { t.Fatalf("wrong result for filename %v: want %v, got %v", tc.filename, tc.reject, res) @@ -140,8 +140,8 @@ func TestMultipleIsExcludedByFile(t *testing.T) { if err != nil { return err } - excludedByFoo := fooExclude(p, fi) - excludedByBar := barExclude(p, fi) + excludedByFoo := fooExclude(p) + excludedByBar := barExclude(p) excluded := excludedByFoo || excludedByBar // the log message helps debugging in case the test fails t.Logf("%q: %v || %v = %v", p, excludedByFoo, excludedByBar, excluded) diff --git a/internal/archiver/archiver.go b/internal/archiver/archiver.go index 0bd839b65..4ce9ef597 100644 --- a/internal/archiver/archiver.go +++ b/internal/archiver/archiver.go @@ -16,6 +16,10 @@ import ( tomb "gopkg.in/tomb.v2" ) +// SelectByNameFunc returns true for all items that should be included (files and +// dirs). If false is returned, files are ignored and dirs are not even walked. +type SelectByNameFunc func(item string) bool + // SelectFunc returns true for all items that should be included (files and // dirs). If false is returned, files are ignored and dirs are not even walked. type SelectFunc func(item string, fi os.FileInfo) bool @@ -43,10 +47,11 @@ func (s *ItemStats) Add(other ItemStats) { // Archiver saves a directory structure to the repo. type Archiver struct { - Repo restic.Repository - Select SelectFunc - FS fs.FS - Options Options + Repo restic.Repository + SelectByName SelectByNameFunc + Select SelectFunc + FS fs.FS + Options Options blobSaver *BlobSaver fileSaver *FileSaver @@ -119,10 +124,11 @@ func (o Options) ApplyDefaults() Options { // New initializes a new archiver. func New(repo restic.Repository, fs fs.FS, opts Options) *Archiver { arch := &Archiver{ - Repo: repo, - Select: func(string, os.FileInfo) bool { return true }, - FS: fs, - Options: opts.ApplyDefaults(), + Repo: repo, + SelectByName: func(item string) bool { return true }, + Select: func(item string, fi os.FileInfo) bool { return true }, + FS: fs, + Options: opts.ApplyDefaults(), CompleteItem: func(string, *restic.Node, *restic.Node, ItemStats, time.Duration) {}, StartFile: func(string) {}, @@ -294,10 +300,10 @@ func (fn *FutureNode) wait(ctx context.Context) { } // Save saves a target (file or directory) to the repo. If the item is -// excluded,this function returns a nil node and error, with excluded set to +// excluded, this function returns a nil node and error, with excluded set to // true. // -// Errors and completion is needs to be handled by the caller. +// Errors and completion needs to be handled by the caller. // // snPath is the path within the current snapshot. func (arch *Archiver) Save(ctx context.Context, snPath, target string, previous *restic.Node) (fn FutureNode, excluded bool, err error) { @@ -316,6 +322,13 @@ func (arch *Archiver) Save(ctx context.Context, snPath, target string, previous fn.absTarget = abstarget + // exclude files by path before running Lstat to reduce number of lstat calls + if !arch.SelectByName(abstarget) { + debug.Log("%v is excluded by path", target) + return FutureNode{}, true, nil + } + + // get file info and run remaining select functions that require file information fi, err := arch.FS.Lstat(target) if !arch.Select(abstarget, fi) { debug.Log("%v is excluded", target) diff --git a/internal/archiver/scanner.go b/internal/archiver/scanner.go index 40181291c..ba3d5c60b 100644 --- a/internal/archiver/scanner.go +++ b/internal/archiver/scanner.go @@ -12,23 +12,21 @@ import ( // stats concerning the files and folders found. Select is used to decide which // items should be included. Error is called when an error occurs. type Scanner struct { - FS fs.FS - Select SelectFunc - Error ErrorFunc - Result func(item string, s ScanStats) + FS fs.FS + SelectByName SelectByNameFunc + Select SelectFunc + Error ErrorFunc + Result func(item string, s ScanStats) } // NewScanner initializes a new Scanner. func NewScanner(fs fs.FS) *Scanner { return &Scanner{ - FS: fs, - Select: func(item string, fi os.FileInfo) bool { - return true - }, - Error: func(item string, fi os.FileInfo, err error) error { - return err - }, - Result: func(item string, s ScanStats) {}, + FS: fs, + SelectByName: func(item string) bool { return true }, + Select: func(item string, fi os.FileInfo) bool { return true }, + Error: func(item string, fi os.FileInfo, err error) error { return err }, + Result: func(item string, s ScanStats) {}, } } @@ -70,17 +68,18 @@ func (s *Scanner) scan(ctx context.Context, stats ScanStats, target string) (Sca return stats, ctx.Err() } + // exclude files by path before running stat to reduce number of lstat calls + if !s.SelectByName(target) { + return stats, nil + } + + // get file information fi, err := s.FS.Lstat(target) if err != nil { - // ignore error if the target is to be excluded anyway - if !s.Select(target, nil) { - return stats, nil - } - - // else return filtered error return stats, s.Error(target, fi, err) } + // run remaining select functions that require file information if !s.Select(target, fi) { return stats, nil }