Refactor max-unused calculation, add `unlimited` option

Add a callback to the PruneOptions struct which calculates the number of
bytes allowed to be unused after prune is done. This way, the logic is
closer to the option parsing code.

Also, add an explicit option `unlimited` for the use case when storage
does not matter but bandwidth and time do. Internally, this sets the
maximum number of unused bytes to MaxUint64.

Rework the documentation slightly so that it no longer mentions "packs"
and talks about "files" instead.

Make it clear in the documentation that the percentage given to
`--max-unused` is relative to the whole repository size after pruning is
done. If specified, it must be below 100%, otherwise the repository
would contain 100% of unused data, which is pointless.

I had a hard time coming up with the correct formula to calculate the
maximum number of unused bytes based on the number of used bytes. For a
fraction `p` (0 ≤ p < 1), a repo with `u` bytes used, and `x` unused
bytes, the following holds:

      x ≤ p * (u+x)
    ⇔ x ≤ p*u + p*x
    ⇔ x - p*x ≤ p*u
    ⇔ x * (1-p) ≤ p*u
    ⇔ x ≤ p/(1-p) * u
This commit is contained in:
Alexander Neumann 2020-11-03 10:53:38 +01:00
parent f8c4dd7b1a
commit c1a3de4a6e
3 changed files with 94 additions and 64 deletions

View File

@ -1,8 +1,10 @@
package main
import (
"math"
"sort"
"strconv"
"strings"
"github.com/restic/restic/internal/debug"
"github.com/restic/restic/internal/errors"
@ -39,9 +41,8 @@ Exit status is 0 if the command was successful, and non-zero if there was any er
type PruneOptions struct {
DryRun bool
MaxUnused string
MaxUnusedPercent float64 // set if MaxUnused is a percentage
MaxUnusedBytes uint64 // set if MaxUnused is an absolute number of bytes
MaxUnused string
maxUnusedBytes func(used uint64) (unused uint64) // calculates the number of unused bytes after repacking, according to MaxUnused
MaxRepackSize string
MaxRepackBytes uint64
@ -60,7 +61,7 @@ func init() {
// addPruneOptions registers the prune-related command-line flags on c and
// binds each one to the corresponding field of the package-level pruneOptions:
// --max-unused (string, default "5%"), --max-repack-size (string, default
// empty) and --repack-cacheable-only (bool, default false).
func addPruneOptions(c *cobra.Command) {
f := c.Flags()
// NOTE(review): "max-unused" is registered twice below with different help
// texts; these look like the before/after lines of a diff hunk rather than
// two live calls — confirm which one is current.
f.StringVar(&pruneOptions.MaxUnused, "max-unused", "5%", "tolerate given `limit` of unused space (allowed suffixes: k/K, m/M, g/G, t/T or value in %)")
f.StringVar(&pruneOptions.MaxUnused, "max-unused", "5%", "tolerate given `limit` of unused data (absolute value in bytes with suffixes k/K, m/M, g/G, t/T, a value in % or the word 'unlimited')")
f.StringVar(&pruneOptions.MaxRepackSize, "max-repack-size", "", "maximum `size` to repack (allowed suffixes: k/K, m/M, g/G, t/T)")
f.BoolVar(&pruneOptions.RepackCachableOnly, "repack-cacheable-only", false, "only repack packs which are cacheable")
}
@ -74,27 +75,46 @@ func verifyPruneOptions(opts *PruneOptions) error {
opts.MaxRepackBytes = uint64(size)
}
length := len(opts.MaxUnused)
if length == 0 {
return nil
maxUnused := strings.TrimSpace(opts.MaxUnused)
if maxUnused == "" {
return errors.Fatalf("invalid value for --max-unused: %q", opts.MaxUnused)
}
var err error
if opts.MaxUnused[length-1] == '%' {
opts.MaxUnusedPercent, err = strconv.ParseFloat(opts.MaxUnused[:length-1], 64)
opts.MaxUnusedBytes = ^uint64(0)
} else {
var size int64
size, err = parseSizeStr(opts.MaxUnused)
opts.MaxUnusedPercent = 100.0
opts.MaxUnusedBytes = uint64(size)
}
if err != nil {
return err
}
// parse MaxUnused either as unlimited, a percentage, or an absolute number of bytes
switch {
case maxUnused == "unlimited":
opts.maxUnusedBytes = func(used uint64) uint64 {
return math.MaxUint64
}
if opts.MaxUnusedPercent < 0.0 || opts.MaxUnusedPercent > 100.0 {
return errors.Fatalf("--max-unused-percent should be between 0 and 100. Given value: %f", opts.MaxUnusedPercent)
case strings.HasSuffix(maxUnused, "%"):
maxUnused = strings.TrimSuffix(maxUnused, "%")
p, err := strconv.ParseFloat(maxUnused, 64)
if err != nil {
return errors.Fatalf("invalid percentage %q passed for --max-unused: %v", opts.MaxUnused, err)
}
if p < 0 {
return errors.Fatal("percentage for --max-unused must be positive")
}
if p >= 100 {
return errors.Fatal("percentage for --max-unused must be below 100%")
}
opts.maxUnusedBytes = func(used uint64) uint64 {
return uint64(p / (100 - p) * float64(used))
}
default:
size, err := parseSizeStr(maxUnused)
if err != nil {
return errors.Fatalf("invalid number of bytes %q for --max-unused: %v", opts.MaxUnused, err)
}
opts.maxUnusedBytes = func(used uint64) uint64 {
return uint64(size)
}
}
return nil
@ -344,13 +364,8 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
repackAllPacksWithDuplicates := true
maxUnusedSizeAfter := opts.MaxUnusedBytes
if opts.MaxUnusedPercent < 100.0 {
maxUnusedSizePercent := uint64(opts.MaxUnusedPercent / (100.0 - opts.MaxUnusedPercent) * float64(stats.size.used))
if maxUnusedSizePercent < maxUnusedSizeAfter {
maxUnusedSizeAfter = maxUnusedSizePercent
}
}
// calculate limit for number of unused bytes in the repo after repacking
maxUnusedSizeAfter := opts.maxUnusedBytes(stats.size.used)
// Sort repackCandidates such that packs with highest ratio unused/used space are picked first.
// This is equivalent to sorting by unused / total space.

View File

@ -1387,25 +1387,25 @@ func TestCheckRestoreNoLock(t *testing.T) {
func TestPrune(t *testing.T) {
t.Run("0", func(t *testing.T) {
opts := PruneOptions{MaxUnusedPercent: 0.0}
opts := PruneOptions{MaxUnused: "0%"}
checkOpts := CheckOptions{ReadData: true, CheckUnused: true}
testPrune(t, opts, checkOpts)
})
t.Run("50", func(t *testing.T) {
opts := PruneOptions{MaxUnusedPercent: 50.0}
opts := PruneOptions{MaxUnused: "50%"}
checkOpts := CheckOptions{ReadData: true}
testPrune(t, opts, checkOpts)
})
t.Run("100", func(t *testing.T) {
opts := PruneOptions{MaxUnusedPercent: 100.0}
t.Run("unlimited", func(t *testing.T) {
opts := PruneOptions{MaxUnused: "unlimited"}
checkOpts := CheckOptions{ReadData: true}
testPrune(t, opts, checkOpts)
})
t.Run("CachableOnly", func(t *testing.T) {
opts := PruneOptions{RepackCachableOnly: true}
opts := PruneOptions{MaxUnused: "5%", RepackCachableOnly: true}
checkOpts := CheckOptions{ReadData: true}
testPrune(t, opts, checkOpts)
})
@ -1436,7 +1436,7 @@ func testPrune(t *testing.T, pruneOpts PruneOptions, checkOpts CheckOptions) {
rtest.OK(t, runCheck(checkOpts, env.gopts, nil))
}
var pruneDefaultOptions = PruneOptions{MaxUnusedPercent: 1.5}
var pruneDefaultOptions = PruneOptions{MaxUnused: "5%"}
func listPacks(gopts GlobalOptions, t *testing.T) restic.IDSet {
r, err := OpenRepository(gopts)

View File

@ -310,39 +310,54 @@ Customize pruning
To understand the custom options, we first explain how the pruning process works:
- First all snapshots and directories within snapshots are scanned to determine
which data is still in use.
- Then for all pack files ``prune`` finds out if the file is fully used, partly
used or completely unused.
- Completely unused packs are marked for deletion. Fully used packs are kept.
A partially used pack is either kept or marked for repacking depending on user
options.
Note that for repacking, restic must download the file from the repository
storage and reupload the needed data in the repository. This can be very
time-consuming for remote repositories.
- After deciding what to do, ``prune`` will actually perform the repack, modify
the index according to the changes and delete the obsolete files.
1. All snapshots and directories within snapshots are scanned to determine
which data is still in use.
2. For all files in the repository, restic finds out if the file is fully
used, partly used or completely unused.
3. Completely unused files are marked for deletion. Fully used files are kept.
A partially used file is either kept or marked for repacking depending on user
options.
Note that for repacking, restic must download the file from the repository
storage and re-upload the needed data in the repository. This can be very
time-consuming for remote repositories.
4. After deciding what to do, ``prune`` will actually perform the repack, modify
the index according to the changes and delete the obsolete files.
The ``prune`` command accepts the following options:
- ``--max-unused limit`` allow unused data up to the specified limit within the repository.
This allows restic to keep partly used packs instead of repacking them.
The limit can be specified as size, e.g. "200M" or in percentage with respect to the total
repository size, e.g. "0.5%".
``prune`` tries to repack as little data as possible while still ensuring this
This allows restic to keep partly used files instead of repacking them.
The limit can be specified in several ways:
* As an absolute size (e.g. ``200M``). If you want to minimize the space
used by your repository, pass ``0`` to this option.
* As a size relative to the total repo size (e.g. ``10%``). This means that
after prune, at most ``10%`` of the total data stored in the repo may be
unused data. If the repo after prune has a size of 500MB, then at most
50MB may be unused.
* If the string ``unlimited`` is passed, there is no limit for partly
unused files. This means that as long as some data is still used within
a file stored in the repo, restic will just leave it there. Use this if
you want to minimize the time and bandwidth used by the ``prune``
operation.
Restic tries to repack as little data as possible while still ensuring this
limit for unused data.
If you want to minimize the space used by your repository, use a value of 0%.
If you want to minimize the time and bandwidth used by the ``prune`` command, use a
high value. A value of 100% will not require any pack file to be repacked.
The default value is 5%.
- ``--max-repack-size size`` if set limits the total size of packs to repack.
As ``prune`` first stores all repacked packs and deletes the obsolete packs at the end,
this option might be handy if you expect many packs to be repacked and fear to run low
on storage.
- ``--repack-cacheable-only`` if set to true only pack files which are cacheable are repacked.
Other pack files are not repacked, if this option is set.
This allows a very fast repacking using only cached data. It can, however, imply that the
unused data in your repository exceeds the value given by ``--max-unused-percent``.
The default value is false.
- ``--max-repack-size size`` if set, limits the total size of files to repack.
As ``prune`` first stores all repacked files and deletes the obsolete files at the end,
this option might be handy if you expect many files to be repacked and are afraid of running low
on storage.
- ``--repack-cacheable-only`` if set to true only files which contain
metadata and would be stored in the cache are repacked. Other pack files are
not repacked if this option is set. This allows a very fast repacking
using only cached data. It can, however, imply that the unused data in
your repository exceeds the value given by ``--max-unused``.
The default value is false.
- ``--dry-run`` only show what ``prune`` would do.
- ``--verbose`` increased verbosity shows additional statistics for ``prune``.