From 4a501d71185d328291053a36d10b6235630724c5 Mon Sep 17 00:00:00 2001 From: Kyle Brennan Date: Thu, 19 Mar 2020 00:44:34 +0000 Subject: [PATCH 1/2] backup: add option for file read concurrency --- changelog/unreleased/pull-2750 | 5 +++++ cmd/restic/cmd_backup.go | 20 ++++++++++++++++---- doc/047_tuning_backup_parameters.rst | 10 ++++++++++ doc/manual_rest.rst | 2 ++ 4 files changed, 33 insertions(+), 4 deletions(-) create mode 100644 changelog/unreleased/pull-2750 diff --git a/changelog/unreleased/pull-2750 b/changelog/unreleased/pull-2750 new file mode 100644 index 000000000..381c980ba --- /dev/null +++ b/changelog/unreleased/pull-2750 @@ -0,0 +1,5 @@ +Enhancement: Make backup file read concurrency configurable + +In order to tune restic for special situations we need to be able to configure `backup --file-read-concurrency`. + +https://github.com/restic/restic/pull/2750 diff --git a/cmd/restic/cmd_backup.go b/cmd/restic/cmd_backup.go index 67af4397a..49c71c21d 100644 --- a/cmd/restic/cmd_backup.go +++ b/cmd/restic/cmd_backup.go @@ -11,6 +11,7 @@ import ( "path" "path/filepath" "runtime" + "strconv" "strings" "sync" "time" @@ -100,6 +101,7 @@ type BackupOptions struct { IgnoreCtime bool UseFsSnapshot bool DryRun bool + FileReadConcurrency uint } var backupOptions BackupOptions @@ -108,6 +110,12 @@ var backupOptions BackupOptions var ErrInvalidSourceData = errors.New("at least one source file could not be read") func init() { + //set FileReadConcurrency to 2 if not set in env + fileReadConcurrency, err := strconv.ParseUint(os.Getenv("RESTIC_FILE_READ_CONCURRENCY"), 10, 32) + if err != nil || fileReadConcurrency < 1 { + fileReadConcurrency = 2 + } + cmdRoot.AddCommand(cmdBackup) f := cmdBackup.Flags() @@ -124,15 +132,14 @@ func init() { f.BoolVar(&backupOptions.Stdin, "stdin", false, "read backup from stdin") f.StringVar(&backupOptions.StdinFilename, "stdin-filename", "stdin", "`filename` to use when reading from stdin") f.Var(&backupOptions.Tags, "tag", "add `tags` 
for the new snapshot in the format `tag[,tag,...]` (can be specified multiple times)") - + f.UintVar(&backupOptions.FileReadConcurrency, "file-read-concurrency", uint(fileReadConcurrency), "set concurrency on file reads. (default: $RESTIC_FILE_READ_CONCURRENCY or 2)") f.StringVarP(&backupOptions.Host, "host", "H", "", "set the `hostname` for the snapshot manually. To prevent an expensive rescan use the \"parent\" flag") f.StringVar(&backupOptions.Host, "hostname", "", "set the `hostname` for the snapshot manually") - err := f.MarkDeprecated("hostname", "use --host") + err = f.MarkDeprecated("hostname", "use --host") if err != nil { // MarkDeprecated only returns an error when the flag could not be found panic(err) } - f.StringArrayVar(&backupOptions.FilesFrom, "files-from", nil, "read the files to backup from `file` (can be combined with file args; can be specified multiple times)") f.StringArrayVar(&backupOptions.FilesFromVerbatim, "files-from-verbatim", nil, "read the files to backup from `file` (can be combined with file args; can be specified multiple times)") f.StringArrayVar(&backupOptions.FilesFromRaw, "files-from-raw", nil, "read the files to backup from `file` (can be combined with file args; can be specified multiple times)") @@ -144,6 +151,7 @@ func init() { if runtime.GOOS == "windows" { f.BoolVar(&backupOptions.UseFsSnapshot, "use-fs-snapshot", false, "use filesystem snapshot where possible (currently only Windows VSS)") } + } // filterExisting returns a slice of all existing items, or an error if no @@ -284,6 +292,10 @@ func (opts BackupOptions) Check(gopts GlobalOptions, args []string) error { } } + if backupOptions.FileReadConcurrency == 0 { + return errors.Fatal("--file-read-concurrency must be a positive, nonzero integer") + } + return nil } @@ -685,7 +697,7 @@ func runBackup(opts BackupOptions, gopts GlobalOptions, term *termstatus.Termina } wg.Go(func() error { return sc.Scan(cancelCtx, targets) }) - arch := archiver.New(repo, targetFS, 
archiver.Options{}) + arch := archiver.New(repo, targetFS, archiver.Options{FileReadConcurrency: backupOptions.FileReadConcurrency}) arch.SelectByName = selectByNameFilter arch.Select = selectFilter arch.WithAtime = opts.WithAtime diff --git a/doc/047_tuning_backup_parameters.rst b/doc/047_tuning_backup_parameters.rst index ecf2bebbb..989eea738 100644 --- a/doc/047_tuning_backup_parameters.rst +++ b/doc/047_tuning_backup_parameters.rst @@ -51,6 +51,16 @@ only applied for the single run of restic. The option can also be set via the en variable ``RESTIC_COMPRESSION``. +File Read Concurrency +===================== + +In some instances, such as backing up traditional spinning disks, reducing the file read +concurrency is desired. This will help reduce the amount of time spent seeking around +the disk and can increase the overall performance of the backup operation. You can specify +the concurrency of file reads with the ``RESTIC_FILE_READ_CONCURRENCY`` environment variable +or the ``--file-read-concurrency`` flag for the ``backup`` subcommand. + + Pack Size ========= diff --git a/doc/manual_rest.rst b/doc/manual_rest.rst index 41e1ca264..6c44acd89 100644 --- a/doc/manual_rest.rst +++ b/doc/manual_rest.rst @@ -103,6 +103,7 @@ command: --files-from file read the files to backup from file (can be combined with file args; can be specified multiple times) --files-from-raw file read the files to backup from file (can be combined with file args; can be specified multiple times) --files-from-verbatim file read the files to backup from file (can be combined with file args; can be specified multiple times) + --file-read-concurrency uint set concurrency on file reads. (default: $RESTIC_FILE_READ_CONCURRENCY or 2) -f, --force force re-reading the target files/directories (overrides the "parent" flag) -h, --help help for backup -H, --host hostname set the hostname for the snapshot manually. 
To prevent an expensive rescan use the "parent" flag @@ -442,3 +443,4 @@ time it is used, so by looking at the timestamps of the sub directories of the cache directory it can decide which sub directories are old and probably not needed any more. You can either remove these directories manually, or run a restic command with the ``--cleanup-cache`` flag. + From 2e606ca70b0e77b24b2453701f10cca93885f1b4 Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Sat, 24 Sep 2022 11:57:16 +0200 Subject: [PATCH 2/2] backup: rework read concurrency --- changelog/unreleased/pull-2750 | 3 ++- cmd/restic/cmd_backup.go | 21 +++++++-------------- doc/040_backup.rst | 1 + doc/047_tuning_backup_parameters.rst | 10 +++++----- doc/manual_rest.rst | 2 +- internal/archiver/archiver.go | 12 ++++++------ internal/archiver/archiver_test.go | 2 +- 7 files changed, 23 insertions(+), 28 deletions(-) diff --git a/changelog/unreleased/pull-2750 b/changelog/unreleased/pull-2750 index 381c980ba..4dcec2204 100644 --- a/changelog/unreleased/pull-2750 +++ b/changelog/unreleased/pull-2750 @@ -1,5 +1,6 @@ Enhancement: Make backup file read concurrency configurable -In order to tune restic for special situations we need to be able to configure `backup --file-read-concurrency`. +The `backup` command now supports a `--read-concurrency` flag to allow +tuning restic for very fast storage like NVME disks. 
https://github.com/restic/restic/pull/2750 diff --git a/cmd/restic/cmd_backup.go b/cmd/restic/cmd_backup.go index 49c71c21d..e36471762 100644 --- a/cmd/restic/cmd_backup.go +++ b/cmd/restic/cmd_backup.go @@ -101,7 +101,7 @@ type BackupOptions struct { IgnoreCtime bool UseFsSnapshot bool DryRun bool - FileReadConcurrency uint + ReadConcurrency uint } var backupOptions BackupOptions @@ -110,12 +110,6 @@ var backupOptions BackupOptions var ErrInvalidSourceData = errors.New("at least one source file could not be read") func init() { - //set FileReadConcurrency to 2 if not set in env - fileReadConcurrency, err := strconv.ParseUint(os.Getenv("RESTIC_FILE_READ_CONCURRENCY"), 10, 32) - if err != nil || fileReadConcurrency < 1 { - fileReadConcurrency = 2 - } - cmdRoot.AddCommand(cmdBackup) f := cmdBackup.Flags() @@ -132,10 +126,10 @@ func init() { f.BoolVar(&backupOptions.Stdin, "stdin", false, "read backup from stdin") f.StringVar(&backupOptions.StdinFilename, "stdin-filename", "stdin", "`filename` to use when reading from stdin") f.Var(&backupOptions.Tags, "tag", "add `tags` for the new snapshot in the format `tag[,tag,...]` (can be specified multiple times)") - f.UintVar(&backupOptions.FileReadConcurrency, "file-read-concurrency", uint(fileReadConcurrency), "set concurrency on file reads. (default: $RESTIC_FILE_READ_CONCURRENCY or 2)") + f.UintVar(&backupOptions.ReadConcurrency, "read-concurrency", 0, "read `n` files concurrently. (default: $RESTIC_READ_CONCURRENCY or 2)") f.StringVarP(&backupOptions.Host, "host", "H", "", "set the `hostname` for the snapshot manually. 
To prevent an expensive rescan use the \"parent\" flag") f.StringVar(&backupOptions.Host, "hostname", "", "set the `hostname` for the snapshot manually") - err = f.MarkDeprecated("hostname", "use --host") + err := f.MarkDeprecated("hostname", "use --host") if err != nil { // MarkDeprecated only returns an error when the flag could not be found panic(err) @@ -152,6 +146,9 @@ func init() { f.BoolVar(&backupOptions.UseFsSnapshot, "use-fs-snapshot", false, "use filesystem snapshot where possible (currently only Windows VSS)") } + // parse read concurrency from env, on error the default value will be used + readConcurrency, _ := strconv.ParseUint(os.Getenv("RESTIC_READ_CONCURRENCY"), 10, 32) + backupOptions.ReadConcurrency = uint(readConcurrency) } // filterExisting returns a slice of all existing items, or an error if no @@ -292,10 +289,6 @@ func (opts BackupOptions) Check(gopts GlobalOptions, args []string) error { } } - if backupOptions.FileReadConcurrency == 0 { - return errors.Fatal("--file-read-concurrency must be a positive, nonzero integer") - } - return nil } @@ -697,7 +690,7 @@ func runBackup(opts BackupOptions, gopts GlobalOptions, term *termstatus.Termina } wg.Go(func() error { return sc.Scan(cancelCtx, targets) }) - arch := archiver.New(repo, targetFS, archiver.Options{FileReadConcurrency: backupOptions.FileReadConcurrency}) + arch := archiver.New(repo, targetFS, archiver.Options{ReadConcurrency: backupOptions.ReadConcurrency}) arch.SelectByName = selectByNameFilter arch.Select = selectFilter arch.WithAtime = opts.WithAtime diff --git a/doc/040_backup.rst b/doc/040_backup.rst index 891c6820d..cdd8302c9 100644 --- a/doc/040_backup.rst +++ b/doc/040_backup.rst @@ -555,6 +555,7 @@ environment variables. 
The following lists these environment variables: RESTIC_COMPRESSION Compression mode (only available for repository format version 2) RESTIC_PROGRESS_FPS Frames per second by which the progress bar is updated RESTIC_PACK_SIZE Target size for pack files + RESTIC_READ_CONCURRENCY Concurrency for file reads TMPDIR Location for temporary files diff --git a/doc/047_tuning_backup_parameters.rst b/doc/047_tuning_backup_parameters.rst index 989eea738..0de846e88 100644 --- a/doc/047_tuning_backup_parameters.rst +++ b/doc/047_tuning_backup_parameters.rst @@ -54,11 +54,11 @@ variable ``RESTIC_COMPRESSION``. File Read Concurrency ===================== -In some instances, such as backing up traditional spinning disks, reducing the file read -concurrency is desired. This will help reduce the amount of time spent seeking around -the disk and can increase the overall performance of the backup operation. You can specify -the concurrency of file reads with the ``RESTIC_FILE_READ_CONCURRENCY`` environment variable -or the ``--file-read-concurrency`` flag for the ``backup`` subcommand. +When backing up fast storage like NVME disks, it can be beneficial to increase the read +concurrency. This can increase the overall performance of the backup operation by reading +more files in parallel. You can specify the concurrency of file reads with the +``RESTIC_READ_CONCURRENCY`` environment variable or the ``--read-concurrency`` flag for +the ``backup`` command. 
Pack Size diff --git a/doc/manual_rest.rst b/doc/manual_rest.rst index 6c44acd89..f2d090209 100644 --- a/doc/manual_rest.rst +++ b/doc/manual_rest.rst @@ -103,7 +103,6 @@ command: --files-from file read the files to backup from file (can be combined with file args; can be specified multiple times) --files-from-raw file read the files to backup from file (can be combined with file args; can be specified multiple times) --files-from-verbatim file read the files to backup from file (can be combined with file args; can be specified multiple times) - --file-read-concurrency uint set concurrency on file reads. (default: $RESTIC_FILE_READ_CONCURRENCY or 2) -f, --force force re-reading the target files/directories (overrides the "parent" flag) -h, --help help for backup -H, --host hostname set the hostname for the snapshot manually. To prevent an expensive rescan use the "parent" flag @@ -113,6 +112,7 @@ command: --ignore-inode ignore inode number changes when checking for modified files -x, --one-file-system exclude other file systems, don't cross filesystem boundaries and subvolumes --parent snapshot use this parent snapshot (default: last snapshot in the repository that has the same target files/directories, and is not newer than the snapshot time) + --read-concurrency n read n files concurrently. (default: $RESTIC_READ_CONCURRENCY or 2) --stdin read backup from stdin --stdin-filename filename filename to use when reading from stdin (default "stdin") --tag tags add tags for the new snapshot in the format `tag[,tag,...]` (can be specified multiple times) (default []) diff --git a/internal/archiver/archiver.go b/internal/archiver/archiver.go index 4fcc8e30c..776aedb53 100644 --- a/internal/archiver/archiver.go +++ b/internal/archiver/archiver.go @@ -95,10 +95,10 @@ const ( // Options is used to configure the archiver. type Options struct { - // FileReadConcurrency sets how many files are read in concurrently. 
If + // ReadConcurrency sets how many files are read in concurrently. If // it's set to zero, at most two files are read in concurrently (which // turned out to be a good default for most situations). - FileReadConcurrency uint + ReadConcurrency uint // SaveBlobConcurrency sets how many blobs are hashed and saved // concurrently. If it's set to zero, the default is the number of CPUs @@ -113,11 +113,11 @@ type Options struct { // ApplyDefaults returns a copy of o with the default options set for all unset // fields. func (o Options) ApplyDefaults() Options { - if o.FileReadConcurrency == 0 { + if o.ReadConcurrency == 0 { // two is a sweet spot for almost all situations. We've done some // experiments documented here: // https://github.com/borgbackup/borg/issues/3500 - o.FileReadConcurrency = 2 + o.ReadConcurrency = 2 } if o.SaveBlobConcurrency == 0 { @@ -132,7 +132,7 @@ func (o Options) ApplyDefaults() Options { // Also allow waiting for FileReadConcurrency files, this is the maximum of FutureFiles // which currently can be in progress. The main backup loop blocks when trying to queue // more files to read. 
- o.SaveTreeConcurrency = uint(runtime.GOMAXPROCS(0)) + o.FileReadConcurrency + o.SaveTreeConcurrency = uint(runtime.GOMAXPROCS(0)) + o.ReadConcurrency } return o @@ -782,7 +782,7 @@ func (arch *Archiver) runWorkers(ctx context.Context, wg *errgroup.Group) { arch.fileSaver = NewFileSaver(ctx, wg, arch.blobSaver.Save, arch.Repo.Config().ChunkerPolynomial, - arch.Options.FileReadConcurrency, arch.Options.SaveBlobConcurrency) + arch.Options.ReadConcurrency, arch.Options.SaveBlobConcurrency) arch.fileSaver.CompleteBlob = arch.CompleteBlob arch.fileSaver.NodeFromFileInfo = arch.nodeFromFileInfo diff --git a/internal/archiver/archiver_test.go b/internal/archiver/archiver_test.go index a6485234f..b5650d1b6 100644 --- a/internal/archiver/archiver_test.go +++ b/internal/archiver/archiver_test.go @@ -2040,7 +2040,7 @@ func TestArchiverAbortEarlyOnError(t *testing.T) { // at most two files may be queued arch := New(testRepo, testFS, Options{ - FileReadConcurrency: 2, + ReadConcurrency: 2, }) _, _, err := arch.Snapshot(ctx, []string{"."}, SnapshotOptions{Time: time.Now()})