From b07bb3d8c315b0e06dec9ca5bd4643a44c4b6cf7 Mon Sep 17 00:00:00 2001 From: Andreas Skielboe Date: Tue, 31 Jul 2018 17:25:25 +0200 Subject: [PATCH 1/2] Reject files excluded by name before calling lstat to improve scan speed Adds a SelectByName method to the archive and scanner which only require the filename as input, and can thus be run before calling lstat on the file. Can speed up scanning significantly if a lot of filename excludes are used. --- cmd/restic/cmd_backup.go | 49 +++++++++++++++++++++++++---------- cmd/restic/exclude.go | 29 ++++++++++++--------- cmd/restic/exclude_test.go | 6 ++--- internal/archiver/archiver.go | 33 ++++++++++++++++------- internal/archiver/scanner.go | 35 ++++++++++++------------- 5 files changed, 96 insertions(+), 56 deletions(-) diff --git a/cmd/restic/cmd_backup.go b/cmd/restic/cmd_backup.go index a1b5981ea..290a31ba1 100644 --- a/cmd/restic/cmd_backup.go +++ b/cmd/restic/cmd_backup.go @@ -186,18 +186,9 @@ func (opts BackupOptions) Check(gopts GlobalOptions, args []string) error { return nil } -// collectRejectFuncs returns a list of all functions which may reject data -// from being saved in a snapshot -func collectRejectFuncs(opts BackupOptions, repo *repository.Repository, targets []string) (fs []RejectFunc, err error) { - // allowed devices - if opts.ExcludeOtherFS && !opts.Stdin { - f, err := rejectByDevice(targets) - if err != nil { - return nil, err - } - fs = append(fs, f) - } - +// collectRejectByNameFuncs returns a list of all functions which may reject data +// from being saved in a snapshot based on path only +func collectRejectByNameFuncs(opts BackupOptions, repo *repository.Repository, targets []string) (fs []RejectByNameFunc, err error) { // exclude restic cache if repo.Cache != nil { f, err := rejectResticCache(repo) @@ -237,6 +228,21 @@ func collectRejectFuncs(opts BackupOptions, repo *repository.Repository, targets return fs, nil } +// collectRejectFuncs returns a list of all functions which may reject data +// from being saved in a snapshot based on path and file info +func collectRejectFuncs(opts BackupOptions, repo *repository.Repository, targets []string) (fs []RejectFunc, err error) { + // allowed devices + if opts.ExcludeOtherFS && !opts.Stdin { + f, err := rejectByDevice(targets) + if err != nil { + return nil, err + } + fs = append(fs, f) + } + + return fs, nil +} + // readExcludePatternsFromFiles reads all exclude files and returns the list of // exclude patterns. For each line, leading and trailing white space is removed // and comment lines are ignored. For each remaining pattern, environment @@ -393,7 +399,13 @@ func runBackup(opts BackupOptions, gopts GlobalOptions, term *termstatus.Termina return err } - // rejectFuncs collect functions that can reject items from the backup + // rejectByNameFuncs collect functions that can reject items from the backup based on path only + rejectByNameFuncs, err := collectRejectByNameFuncs(opts, repo, targets) + if err != nil { + return err + } + + // rejectFuncs collect functions that can reject items from the backup based on path and file info rejectFuncs, err := collectRejectFuncs(opts, repo, targets) if err != nil { return err @@ -414,6 +426,15 @@ func runBackup(opts BackupOptions, gopts GlobalOptions, term *termstatus.Termina p.V("using parent snapshot %v\n", parentSnapshotID.Str()) } + selectByNameFilter := func(item string) bool { + for _, reject := range rejectByNameFuncs { + if reject(item) { + return false + } + } + return true + } + selectFilter := func(item string, fi os.FileInfo) bool { for _, reject := range rejectFuncs { if reject(item, fi) { @@ -436,6 +457,7 @@ func runBackup(opts BackupOptions, gopts GlobalOptions, term *termstatus.Termina } sc := archiver.NewScanner(targetFS) + sc.SelectByName = selectByNameFilter sc.Select = selectFilter sc.Error = p.ScannerError sc.Result = p.ReportTotal @@ -444,6 +466,7 @@ func runBackup(opts BackupOptions, gopts GlobalOptions, term *termstatus.Termina t.Go(func() error { return sc.Scan(t.Context(gopts.ctx), targets) }) arch := archiver.New(repo, targetFS, archiver.Options{}) + arch.SelectByName = selectByNameFilter arch.Select = selectFilter arch.WithAtime = opts.WithAtime arch.Error = p.Error diff --git a/cmd/restic/exclude.go b/cmd/restic/exclude.go index 537bdc344..479f8a308 100644 --- a/cmd/restic/exclude.go +++ b/cmd/restic/exclude.go @@ -60,15 +60,20 @@ func (rc *rejectionCache) Store(dir string, rejected bool) { rc.m[dir] = rejected } +// RejectByNameFunc is a function that takes a filename of a +// file that would be included in the backup. The function returns true if it +// should be excluded (rejected) from the backup. +type RejectByNameFunc func(path string) bool + // RejectFunc is a function that takes a filename and os.FileInfo of a // file that would be included in the backup. The function returns true if it // should be excluded (rejected) from the backup. type RejectFunc func(path string, fi os.FileInfo) bool -// rejectByPattern returns a RejectFunc which rejects files that match +// rejectByPattern returns a RejectByNameFunc which rejects files that match // one of the patterns. -func rejectByPattern(patterns []string) RejectFunc { - return func(item string, fi os.FileInfo) bool { +func rejectByPattern(patterns []string) RejectByNameFunc { + return func(item string) bool { matched, _, err := filter.List(patterns, item) if err != nil { Warnf("error for exclude pattern: %v", err) @@ -83,14 +88,14 @@ func rejectByPattern(patterns []string) RejectFunc { } } -// rejectIfPresent returns a RejectFunc which itself returns whether a path -// should be excluded. The RejectFunc considers a file to be excluded when +// rejectIfPresent returns a RejectByNameFunc which itself returns whether a path +// should be excluded. The RejectByNameFunc considers a file to be excluded when // it resides in a directory with an exclusion file, that is specified by // excludeFileSpec in the form "filename[:content]". The returned error is // non-nil if the filename component of excludeFileSpec is empty. If rc is -// non-nil, it is going to be used in the RejectFunc to expedite the evaluation +// non-nil, it is going to be used in the RejectByNameFunc to expedite the evaluation // of a directory based on previous visits. -func rejectIfPresent(excludeFileSpec string) (RejectFunc, error) { +func rejectIfPresent(excludeFileSpec string) (RejectByNameFunc, error) { if excludeFileSpec == "" { return nil, errors.New("name for exclusion tagfile is empty") } @@ -107,7 +112,7 @@ func rejectIfPresent(excludeFileSpec string) (RejectFunc, error) { } debug.Log("using %q as exclusion tagfile", tf) rc := &rejectionCache{} - fn := func(filename string, _ os.FileInfo) bool { + fn := func(filename string) bool { return isExcludedByFile(filename, tf, tc, rc) } return fn, nil @@ -252,11 +257,11 @@ func rejectByDevice(samples []string) (RejectFunc, error) { }, nil } -// rejectResticCache returns a RejectFunc that rejects the restic cache +// rejectResticCache returns a RejectByNameFunc that rejects the restic cache // directory (if set). -func rejectResticCache(repo *repository.Repository) (RejectFunc, error) { +func rejectResticCache(repo *repository.Repository) (RejectByNameFunc, error) { if repo.Cache == nil { - return func(string, os.FileInfo) bool { + return func(string) bool { return false }, nil } @@ -266,7 +271,7 @@ func rejectResticCache(repo *repository.Repository) (RejectFunc, error) { return nil, errors.New("cacheBase is empty string") } - return func(item string, _ os.FileInfo) bool { + return func(item string) bool { if fs.HasPathPrefix(cacheBase, item) { debug.Log("rejecting restic cache directory %v", item) return true diff --git a/cmd/restic/exclude_test.go b/cmd/restic/exclude_test.go index eeba76e74..741dbdb64 100644 --- a/cmd/restic/exclude_test.go +++ b/cmd/restic/exclude_test.go @@ -27,7 +27,7 @@ func TestRejectByPattern(t *testing.T) { for _, tc := range tests { t.Run("", func(t *testing.T) { reject := rejectByPattern(patterns) - res := reject(tc.filename, nil) + res := reject(tc.filename) if res != tc.reject { t.Fatalf("wrong result for filename %v: want %v, got %v", tc.filename, tc.reject, res) @@ -140,8 +140,8 @@ func TestMultipleIsExcludedByFile(t *testing.T) { if err != nil { return err } - excludedByFoo := fooExclude(p, fi) - excludedByBar := barExclude(p, fi) + excludedByFoo := fooExclude(p) + excludedByBar := barExclude(p) excluded := excludedByFoo || excludedByBar // the log message helps debugging in case the test fails t.Logf("%q: %v || %v = %v", p, excludedByFoo, excludedByBar, excluded) diff --git a/internal/archiver/archiver.go b/internal/archiver/archiver.go index 0bd839b65..4ce9ef597 100644 --- a/internal/archiver/archiver.go +++ b/internal/archiver/archiver.go @@ -16,6 +16,10 @@ import ( tomb "gopkg.in/tomb.v2" ) +// SelectByNameFunc returns true for all items that should be included (files and +// dirs). If false is returned, files are ignored and dirs are not even walked. +type SelectByNameFunc func(item string) bool + // SelectFunc returns true for all items that should be included (files and // dirs). If false is returned, files are ignored and dirs are not even walked. type SelectFunc func(item string, fi os.FileInfo) bool @@ -43,10 +47,11 @@ func (s *ItemStats) Add(other ItemStats) { // Archiver saves a directory structure to the repo. type Archiver struct { - Repo restic.Repository - Select SelectFunc - FS fs.FS - Options Options + Repo restic.Repository + SelectByName SelectByNameFunc + Select SelectFunc + FS fs.FS + Options Options blobSaver *BlobSaver fileSaver *FileSaver @@ -119,10 +124,11 @@ func (o Options) ApplyDefaults() Options { // New initializes a new archiver. func New(repo restic.Repository, fs fs.FS, opts Options) *Archiver { arch := &Archiver{ - Repo: repo, - Select: func(string, os.FileInfo) bool { return true }, - FS: fs, - Options: opts.ApplyDefaults(), + Repo: repo, + SelectByName: func(item string) bool { return true }, + Select: func(item string, fi os.FileInfo) bool { return true }, + FS: fs, + Options: opts.ApplyDefaults(), CompleteItem: func(string, *restic.Node, *restic.Node, ItemStats, time.Duration) {}, StartFile: func(string) {}, @@ -294,10 +300,10 @@ func (fn *FutureNode) wait(ctx context.Context) { } // Save saves a target (file or directory) to the repo. If the item is -// excluded,this function returns a nil node and error, with excluded set to +// excluded, this function returns a nil node and error, with excluded set to // true. // -// Errors and completion is needs to be handled by the caller. +// Errors and completion needs to be handled by the caller. // // snPath is the path within the current snapshot. func (arch *Archiver) Save(ctx context.Context, snPath, target string, previous *restic.Node) (fn FutureNode, excluded bool, err error) { @@ -316,6 +322,13 @@ func (arch *Archiver) Save(ctx context.Context, snPath, target string, previous fn.absTarget = abstarget + // exclude files by path before running Lstat to reduce number of lstat calls + if !arch.SelectByName(abstarget) { + debug.Log("%v is excluded by path", target) + return FutureNode{}, true, nil + } + + // get file info and run remaining select functions that require file information fi, err := arch.FS.Lstat(target) if !arch.Select(abstarget, fi) { debug.Log("%v is excluded", target) diff --git a/internal/archiver/scanner.go b/internal/archiver/scanner.go index 40181291c..ba3d5c60b 100644 --- a/internal/archiver/scanner.go +++ b/internal/archiver/scanner.go @@ -12,23 +12,21 @@ import ( // stats concerning the files and folders found. Select is used to decide which // items should be included. Error is called when an error occurs. type Scanner struct { - FS fs.FS - Select SelectFunc - Error ErrorFunc - Result func(item string, s ScanStats) + FS fs.FS + SelectByName SelectByNameFunc + Select SelectFunc + Error ErrorFunc + Result func(item string, s ScanStats) } // NewScanner initializes a new Scanner. func NewScanner(fs fs.FS) *Scanner { return &Scanner{ - FS: fs, - Select: func(item string, fi os.FileInfo) bool { - return true - }, - Error: func(item string, fi os.FileInfo, err error) error { - return err - }, - Result: func(item string, s ScanStats) {}, + FS: fs, + SelectByName: func(item string) bool { return true }, + Select: func(item string, fi os.FileInfo) bool { return true }, + Error: func(item string, fi os.FileInfo, err error) error { return err }, + Result: func(item string, s ScanStats) {}, } } @@ -70,17 +68,18 @@ func (s *Scanner) scan(ctx context.Context, stats ScanStats, target string) (Sca return stats, ctx.Err() } + // exclude files by path before running stat to reduce number of lstat calls + if !s.SelectByName(target) { + return stats, nil + } + + // get file information fi, err := s.FS.Lstat(target) if err != nil { - // ignore error if the target is to be excluded anyway - if !s.Select(target, nil) { - return stats, nil - } - - // else return filtered error return stats, s.Error(target, fi, err) } + // run remaining select functions that require file information if !s.Select(target, fi) { return stats, nil } From c145b618d442c78c7a3bd45923e1ed1a49fcafa1 Mon Sep 17 00:00:00 2001 From: Alexander Neumann Date: Sun, 12 Aug 2018 17:51:08 +0200 Subject: [PATCH 2/2] Add entry to changelog --- changelog/unreleased/issue-1909 | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 changelog/unreleased/issue-1909 diff --git a/changelog/unreleased/issue-1909 b/changelog/unreleased/issue-1909 new file mode 100644 index 000000000..869d532bf --- /dev/null +++ b/changelog/unreleased/issue-1909 @@ -0,0 +1,14 @@ +Enhancement: Reject files/dirs by name first + +The current scanner/archiver code had an architectural limitation: it always +ran the `lstat()` system call on all files and directories before a decision to +include/exclude the file/dir was made. This lead to a lot of unnecessary system +calls for items that could have been rejected by their name or path only. + +We've changed the archiver/scanner implementation so that it now first rejects +by name/path, and only runs the system call on the remaining items. This +reduces the number of `lstat()` system calls a lot (depending on the exclude +settings). + +https://github.com/restic/restic/issues/1909 +https://github.com/restic/restic/pull/1912