Check data subset: check random percentage subset

2024-12-22 10:58:55 +00:00 · 2020-10-24 17:30:42 +02:00 · 2020-10-24 17:30:42 +02:00 · 8f9cea8cc0
commit 8f9cea8cc0
parent 3c0c0c132b
4 changed files with 257 additions and 35 deletions
--- a/changelog/unreleased/pull-3038
+++ b/changelog/unreleased/pull-3038
@ -0,0 +1,8 @@
 Enhancement: Allow specifying percentage in check's --read-data-subset
 We've enhanced the command-line option --read-data-subset to also accept a
 percentage. This will check the given percentage of pack files which are
 randomly selected on each run.
 https://github.com/restic/restic/pull/3038
 https://github.com/restic/restic/issues/2186
--- a/cmd/restic/cmd_check.go
+++ b/cmd/restic/cmd_check.go
@ -1,8 +1,8 @@
 package main
 import (
 	"fmt"
 	"io/ioutil"
 	"math/rand"
 	"strconv"
 	"strings"
@ -53,25 +53,38 @@ func init() {
 	f := cmdCheck.Flags()
 	f.BoolVar(&checkOptions.ReadData, "read-data", false, "read all data blobs")
-	f.StringVar(&checkOptions.ReadDataSubset, "read-data-subset", "", "read subset n of m data packs (format: `n/m`)")
+	f.StringVar(&checkOptions.ReadDataSubset, "read-data-subset", "", "read a `subset` of data packs, specified as 'n/t' for specific subset or either 'x%' or 'x.y%' for random subset")
 	f.BoolVar(&checkOptions.CheckUnused, "check-unused", false, "find unused blobs")
 	f.BoolVar(&checkOptions.WithCache, "with-cache", false, "use the cache")
 }
 func checkFlags(opts CheckOptions) error {
 	if opts.ReadData && opts.ReadDataSubset != "" {
-		return errors.Fatalf("check flags --read-data and --read-data-subset cannot be used together")
+		return errors.Fatal("check flags --read-data and --read-data-subset cannot be used together")
 	}
 	if opts.ReadDataSubset != "" {
 		dataSubset, err := stringToIntSlice(opts.ReadDataSubset)
-		if err != nil || len(dataSubset) != 2 {
+		argumentError := errors.Fatal("check flag --read-data-subset must have two positive integer values or a percentage, e.g. --read-data-subset=1/2 or --read-data-subset=2.5%%")
-			return errors.Fatalf("check flag --read-data-subset must have two positive integer values, e.g. --read-data-subset=1/2")
+		if err == nil {
-		}
+			if len(dataSubset) != 2 {
-		if dataSubset[0] == 0 || dataSubset[1] == 0 || dataSubset[0] > dataSubset[1] {
+				return argumentError
-			return errors.Fatalf("check flag --read-data-subset=n/t values must be positive integers, and n <= t, e.g. --read-data-subset=1/2")
+			}
-		}
+			if dataSubset[0] == 0 || dataSubset[1] == 0 || dataSubset[0] > dataSubset[1] {
-		if dataSubset[1] > totalBucketsMax {
+				return errors.Fatal("check flag --read-data-subset=n/t values must be positive integers, and n <= t, e.g. --read-data-subset=1/2")
-			return errors.Fatalf("check flag --read-data-subset=n/t t must be at most %d", totalBucketsMax)
+			}
 			if dataSubset[1] > totalBucketsMax {
 				return errors.Fatalf("check flag --read-data-subset=n/t t must be at most %d", totalBucketsMax)
 			}
 		} else {
 			percentage, err := parsePercentage(opts.ReadDataSubset)
 			if err != nil {
 				return argumentError
 			}
 			if percentage <= 0.0 || percentage > 100.0 {
 				return errors.Fatal(
 					"check flag --read-data-subset=n% n must be above 0.0% and 100.0%")
 			}
 		}
 	}
@ -98,6 +111,21 @@ func stringToIntSlice(param string) (split []uint, err error) {
 	return result, nil
 }
 // ParsePercentage parses a percentage string of the form "X%" where X is a float constant,
 // and returns the value of that constant. It does not check the range of the value.
 func parsePercentage(s string) (float64, error) {
 	if !strings.HasSuffix(s, "%") {
 		return 0, errors.Errorf(`parsePercentage: %q does not end in "%%"`, s)
 	}
 	s = s[:len(s)-1]
 	p, err := strconv.ParseFloat(s, 64)
 	if err != nil {
 		return 0, errors.Errorf("parsePercentage: %v", err)
 	}
 	return p, nil
 }
 // prepareCheckCache configures a special cache directory for check.
 //
 //  * if --with-cache is specified, the default cache is used
@ -233,23 +261,9 @@ func runCheck(opts CheckOptions, gopts GlobalOptions, args []string) error {
 		}
 	}
-	doReadData := func(bucket, totalBuckets uint) {
+	doReadData := func(packs map[restic.ID]int64) {
 		packs := make(map[restic.ID]int64)
 		for pack, size := range chkr.GetPacks() {
 			// If we ever check more than the first byte
 			// of pack, update totalBucketsMax.
 			if (uint(pack[0]) % totalBuckets) == (bucket - 1) {
 				packs[pack] = size
 			}
 		}
 		packCount := uint64(len(packs))
 		if packCount < chkr.CountPacks() {
 			Verbosef(fmt.Sprintf("read group #%d of %d data packs (out of total %d packs in %d groups)\n", bucket, packCount, chkr.CountPacks(), totalBuckets))
 		} else {
 			Verbosef("read all data\n")
 		}
 		p := newProgressMax(!gopts.Quiet, packCount, "packs")
 		errChan := make(chan error)
@ -264,10 +278,26 @@ func runCheck(opts CheckOptions, gopts GlobalOptions, args []string) error {
 	switch {
 	case opts.ReadData:
-		doReadData(1, 1)
+		Verbosef("read all data\n")
 		doReadData(selectPacksByBucket(chkr.GetPacks(), 1, 1))
 	case opts.ReadDataSubset != "":
-		dataSubset, _ := stringToIntSlice(opts.ReadDataSubset)
+		var packs map[restic.ID]int64
-		doReadData(dataSubset[0], dataSubset[1])
+		dataSubset, err := stringToIntSlice(opts.ReadDataSubset)
 		if err == nil {
 			bucket := dataSubset[0]
 			totalBuckets := dataSubset[1]
 			packs = selectPacksByBucket(chkr.GetPacks(), bucket, totalBuckets)
 			packCount := uint64(len(packs))
 			Verbosef("read group #%d of %d data packs (out of total %d packs in %d groups)\n", bucket, packCount, chkr.CountPacks(), totalBuckets)
 		} else {
 			percentage, _ := parsePercentage(opts.ReadDataSubset)
 			packs = selectRandomPacksByPercentage(chkr.GetPacks(), percentage)
 			Verbosef("read %.1f%% of data packs\n", percentage)
 		}
 		if packs == nil {
 			return errors.Fatal("internal error: failed to select packs to check")
 		}
 		doReadData(packs)
 	}
 	if errorsFound {
@ -278,3 +308,40 @@ func runCheck(opts CheckOptions, gopts GlobalOptions, args []string) error {
 	return nil
 }
 // selectPacksByBucket selects subsets of packs by ranges of buckets.
 func selectPacksByBucket(allPacks map[restic.ID]int64, bucket, totalBuckets uint) map[restic.ID]int64 {
 	packs := make(map[restic.ID]int64)
 	for pack, size := range allPacks {
 		// If we ever check more than the first byte
 		// of pack, update totalBucketsMax.
 		if (uint(pack[0]) % totalBuckets) == (bucket - 1) {
 			packs[pack] = size
 		}
 	}
 	return packs
 }
 // selectRandomPacksByPercentage selects the given percentage of packs which are randomly choosen.
 func selectRandomPacksByPercentage(allPacks map[restic.ID]int64, percentage float64) map[restic.ID]int64 {
 	packCount := len(allPacks)
 	packsToCheck := int(float64(packCount) * (percentage / 100.0))
 	if packsToCheck < 1 {
 		packsToCheck = 1
 	}
 	idx := rand.Perm(packCount)
 	var keys []restic.ID
 	for k := range allPacks {
 		keys = append(keys, k)
 	}
 	packs := make(map[restic.ID]int64)
 	for i := 0; i < packsToCheck; i++ {
 		id := keys[idx[i]]
 		packs[id] = allPacks[id]
 	}
 	return packs
 }
--- a/cmd/restic/cmd_check_test.go
+++ b/cmd/restic/cmd_check_test.go
@ -0,0 +1,124 @@
 package main
 import (
 	"math"
 	"reflect"
 	"testing"
 	"github.com/restic/restic/internal/restic"
 	rtest "github.com/restic/restic/internal/test"
 )
 func TestParsePercentage(t *testing.T) {
 	testCases := []struct {
 		input       string
 		output      float64
 		expectError bool
 	}{
 		{"0%", 0.0, false},
 		{"1%", 1.0, false},
 		{"100%", 100.0, false},
 		{"123%", 123.0, false},
 		{"123.456%", 123.456, false},
 		{"0.742%", 0.742, false},
 		{"-100%", -100.0, false},
 		{" 1%", 0.0, true},
 		{"1 %", 0.0, true},
 		{"1% ", 0.0, true},
 	}
 	for _, testCase := range testCases {
 		output, err := parsePercentage(testCase.input)
 		if testCase.expectError {
 			rtest.Assert(t, err != nil, "Expected error for case %s", testCase.input)
 			rtest.Assert(t, output == 0.0, "Expected output to be 0.0, got %s", output)
 		} else {
 			rtest.Assert(t, err == nil, "Expected no error for case %s", testCase.input)
 			rtest.Assert(t, math.Abs(testCase.output-output) < 0.00001, "Expected %f, got %f",
 				testCase.output, output)
 		}
 	}
 }
 func TestStringToIntSlice(t *testing.T) {
 	testCases := []struct {
 		input       string
 		output      []uint
 		expectError bool
 	}{
 		{"3/5", []uint{3, 5}, false},
 		{"1/100", []uint{1, 100}, false},
 		{"abc", nil, true},
 		{"1/a", nil, true},
 		{"/", nil, true},
 	}
 	for _, testCase := range testCases {
 		output, err := stringToIntSlice(testCase.input)
 		if testCase.expectError {
 			rtest.Assert(t, err != nil, "Expected error for case %s", testCase.input)
 			rtest.Assert(t, output == nil, "Expected output to be nil, got %s", output)
 		} else {
 			rtest.Assert(t, err == nil, "Expected no error for case %s", testCase.input)
 			rtest.Assert(t, len(output) == 2, "Invalid output length for case %s", testCase.input)
 			rtest.Assert(t, reflect.DeepEqual(output, testCase.output), "Expected %f, got %f",
 				testCase.output, output)
 		}
 	}
 }
 func TestSelectPacksByBucket(t *testing.T) {
 	var testPacks = make(map[restic.ID]int64)
 	for i := 1; i <= 10; i++ {
 		id := restic.NewRandomID()
 		// ensure relevant part of generated id is reproducable
 		id[0] = byte(i)
 		testPacks[id] = 0
 	}
 	selectedPacks := selectPacksByBucket(testPacks, 0, 10)
 	rtest.Assert(t, len(selectedPacks) == 0, "Expected 0 selected packs")
 	for i := uint(1); i <= 5; i++ {
 		selectedPacks = selectPacksByBucket(testPacks, i, 5)
 		rtest.Assert(t, len(selectedPacks) == 2, "Expected 2 selected packs")
 	}
 	selectedPacks = selectPacksByBucket(testPacks, 1, 1)
 	rtest.Assert(t, len(selectedPacks) == 10, "Expected 10 selected packs")
 	for testPack := range testPacks {
 		_, ok := selectedPacks[testPack]
 		rtest.Assert(t, ok, "Expected input and output to be equal")
 	}
 }
 func TestSelectRandomPacksByPercentage(t *testing.T) {
 	var testPacks = make(map[restic.ID]int64)
 	for i := 1; i <= 10; i++ {
 		testPacks[restic.NewRandomID()] = 0
 	}
 	selectedPacks := selectRandomPacksByPercentage(testPacks, 0.0)
 	rtest.Assert(t, len(selectedPacks) == 1, "Expected 1 selected packs")
 	selectedPacks = selectRandomPacksByPercentage(testPacks, 10.0)
 	rtest.Assert(t, len(selectedPacks) == 1, "Expected 1 selected pack")
 	for pack := range selectedPacks {
 		_, ok := testPacks[pack]
 		rtest.Assert(t, ok, "Unexpected selection")
 	}
 	selectedPacks = selectRandomPacksByPercentage(testPacks, 50.0)
 	rtest.Assert(t, len(selectedPacks) == 5, "Expected 5 selected packs")
 	for pack := range selectedPacks {
 		_, ok := testPacks[pack]
 		rtest.Assert(t, ok, "Unexpected item in selection")
 	}
 	selectedPacks = selectRandomPacksByPercentage(testPacks, 100.0)
 	rtest.Assert(t, len(selectedPacks) == 10, "Expected 10 selected packs")
 	for testPack := range testPacks {
 		_, ok := selectedPacks[testPack]
 		rtest.Assert(t, ok, "Expected input and output to be equal")
 	}
 }
--- a/doc/045_working_with_repos.rst
+++ b/doc/045_working_with_repos.rst
@ -238,12 +238,17 @@ integrity of the pack files in the repository, use the ``--read-data`` flag:
    repository, beware that it might incur higher bandwidth costs than usual
    and also that it takes more time than the default ``check``.
-Alternatively, use the ``--read-data-subset=n/t`` parameter to check only a
+Alternatively, use the ``--read-data-subset`` parameter to check only a
-subset of the repository pack files at a time. The parameter takes two values,
+subset of the repository pack files at a time. It supports two ways to select a
-``n`` and ``t``. When the check command runs, all pack files in the repository
+subset. One selects a specific range of pack files, the other selects a random
-are logically divided in ``t`` (roughly equal) groups, and only files that
+percentage of pack files.
-belong to group number ``n`` are checked. For example, the following commands
+
-check all repository pack files over 5 separate invocations:
+Use ``--read-data-subset=n/t`` to check only a subset of the repository pack
 files at a time. The parameter takes two values, ``n`` and ``t``. When the check
 command runs, all pack files in the repository are logically divided in ``t``
 (roughly equal) groups, and only files that belong to group number ``n`` are
 checked. For example, the following commands check all repository pack files
 over 5 separate invocations:
 .. code-block:: console
@ -252,3 +257,21 @@ check all repository pack files over 5 separate invocations:
    $ restic -r /srv/restic-repo check --read-data-subset=3/5
    $ restic -r /srv/restic-repo check --read-data-subset=4/5
    $ restic -r /srv/restic-repo check --read-data-subset=5/5
 Use ``--read-data-subset=n%`` to check a randomly choosen subset of the
 repository pack files. It takes one parameter, ``n``, the percentage of pack
 files to check as an integer or floating point number. This will not guarantee
 to cover all available pack files after sufficient runs, but it is easy to
 automate checking a small subset of data after each backup. For a floating point
 value the following command may be used:
 .. code-block:: console
    $ restic -r /srv/restic-repo check --read-data-subset=2.5%
 When checking bigger subsets you most likely specify the percentage as an
 integer:
 .. code-block:: console
    $ restic -r /srv/restic-repo check --read-data-subset=10%