Check data subset: check random percentage subset

2024-12-22 02:48:55 +00:00 · 2020-10-24 17:30:42 +02:00 · 2020-10-24 17:30:42 +02:00 · 8f9cea8cc0
commit 8f9cea8cc0
parent 3c0c0c132b
4 changed files with 257 additions and 35 deletions
--- a/changelog/unreleased/pull-3038
+++ b/changelog/unreleased/pull-3038
@ -0,0 +1,8 @@
+Enhancement: Allow specifying percentage in check's --read-data-subset
+
+We've enhanced the command-line option --read-data-subset to also accept a
+percentage. This will check the given percentage of pack files which are
+randomly selected on each run.
+
+https://github.com/restic/restic/pull/3038
+https://github.com/restic/restic/issues/2186
--- a/cmd/restic/cmd_check.go
+++ b/cmd/restic/cmd_check.go
@ -1,8 +1,8 @@
 package main

 import (
-	"fmt"
 	"io/ioutil"
+	"math/rand"
 	"strconv"
 	"strings"

@ -53,25 +53,38 @@ func init() {

 	f := cmdCheck.Flags()
 	f.BoolVar(&checkOptions.ReadData, "read-data", false, "read all data blobs")
-	f.StringVar(&checkOptions.ReadDataSubset, "read-data-subset", "", "read subset n of m data packs (format: `n/m`)")
+	f.StringVar(&checkOptions.ReadDataSubset, "read-data-subset", "", "read a `subset` of data packs, specified as 'n/t' for specific subset or either 'x%' or 'x.y%' for random subset")
 	f.BoolVar(&checkOptions.CheckUnused, "check-unused", false, "find unused blobs")
 	f.BoolVar(&checkOptions.WithCache, "with-cache", false, "use the cache")
 }

 func checkFlags(opts CheckOptions) error {
 	if opts.ReadData && opts.ReadDataSubset != "" {
-		return errors.Fatalf("check flags --read-data and --read-data-subset cannot be used together")
+		return errors.Fatal("check flags --read-data and --read-data-subset cannot be used together")
 	}
 	if opts.ReadDataSubset != "" {
 		dataSubset, err := stringToIntSlice(opts.ReadDataSubset)
-		if err != nil || len(dataSubset) != 2 {
-			return errors.Fatalf("check flag --read-data-subset must have two positive integer values, e.g. --read-data-subset=1/2")
-		}
-		if dataSubset[0] == 0 || dataSubset[1] == 0 || dataSubset[0] > dataSubset[1] {
-			return errors.Fatalf("check flag --read-data-subset=n/t values must be positive integers, and n <= t, e.g. --read-data-subset=1/2")
-		}
-		if dataSubset[1] > totalBucketsMax {
-			return errors.Fatalf("check flag --read-data-subset=n/t t must be at most %d", totalBucketsMax)
+		argumentError := errors.Fatal("check flag --read-data-subset must have two positive integer values or a percentage, e.g. --read-data-subset=1/2 or --read-data-subset=2.5%%")
+		if err == nil {
+			if len(dataSubset) != 2 {
+				return argumentError
+			}
+			if dataSubset[0] == 0 || dataSubset[1] == 0 || dataSubset[0] > dataSubset[1] {
+				return errors.Fatal("check flag --read-data-subset=n/t values must be positive integers, and n <= t, e.g. --read-data-subset=1/2")
+			}
+			if dataSubset[1] > totalBucketsMax {
+				return errors.Fatalf("check flag --read-data-subset=n/t t must be at most %d", totalBucketsMax)
+			}
+		} else {
+			percentage, err := parsePercentage(opts.ReadDataSubset)
+			if err != nil {
+				return argumentError
+			}
+
+			if percentage <= 0.0 || percentage > 100.0 {
+				return errors.Fatal(
+					"check flag --read-data-subset=n% n must be above 0.0% and 100.0%")
+			}
 		}
 	}

@ -98,6 +111,21 @@ func stringToIntSlice(param string) (split []uint, err error) {
 	return result, nil
 }

+// ParsePercentage parses a percentage string of the form "X%" where X is a float constant,
+// and returns the value of that constant. It does not check the range of the value.
+func parsePercentage(s string) (float64, error) {
+	if !strings.HasSuffix(s, "%") {
+		return 0, errors.Errorf(`parsePercentage: %q does not end in "%%"`, s)
+	}
+	s = s[:len(s)-1]
+
+	p, err := strconv.ParseFloat(s, 64)
+	if err != nil {
+		return 0, errors.Errorf("parsePercentage: %v", err)
+	}
+	return p, nil
+}
+
 // prepareCheckCache configures a special cache directory for check.
 //
 //  * if --with-cache is specified, the default cache is used
@ -233,23 +261,9 @@ func runCheck(opts CheckOptions, gopts GlobalOptions, args []string) error {
 		}
 	}

-	doReadData := func(bucket, totalBuckets uint) {
-		packs := make(map[restic.ID]int64)
-		for pack, size := range chkr.GetPacks() {
-			// If we ever check more than the first byte
-			// of pack, update totalBucketsMax.
-			if (uint(pack[0]) % totalBuckets) == (bucket - 1) {
-				packs[pack] = size
-			}
-		}
+	doReadData := func(packs map[restic.ID]int64) {
 		packCount := uint64(len(packs))

-		if packCount < chkr.CountPacks() {
-			Verbosef(fmt.Sprintf("read group #%d of %d data packs (out of total %d packs in %d groups)\n", bucket, packCount, chkr.CountPacks(), totalBuckets))
-		} else {
-			Verbosef("read all data\n")
-		}
-
 		p := newProgressMax(!gopts.Quiet, packCount, "packs")
 		errChan := make(chan error)

@ -264,10 +278,26 @@ func runCheck(opts CheckOptions, gopts GlobalOptions, args []string) error {

 	switch {
 	case opts.ReadData:
-		doReadData(1, 1)
+		Verbosef("read all data\n")
+		doReadData(selectPacksByBucket(chkr.GetPacks(), 1, 1))
 	case opts.ReadDataSubset != "":
-		dataSubset, _ := stringToIntSlice(opts.ReadDataSubset)
-		doReadData(dataSubset[0], dataSubset[1])
+		var packs map[restic.ID]int64
+		dataSubset, err := stringToIntSlice(opts.ReadDataSubset)
+		if err == nil {
+			bucket := dataSubset[0]
+			totalBuckets := dataSubset[1]
+			packs = selectPacksByBucket(chkr.GetPacks(), bucket, totalBuckets)
+			packCount := uint64(len(packs))
+			Verbosef("read group #%d of %d data packs (out of total %d packs in %d groups)\n", bucket, packCount, chkr.CountPacks(), totalBuckets)
+		} else {
+			percentage, _ := parsePercentage(opts.ReadDataSubset)
+			packs = selectRandomPacksByPercentage(chkr.GetPacks(), percentage)
+			Verbosef("read %.1f%% of data packs\n", percentage)
+		}
+		if packs == nil {
+			return errors.Fatal("internal error: failed to select packs to check")
+		}
+		doReadData(packs)
 	}

 	if errorsFound {
@ -278,3 +308,40 @@ func runCheck(opts CheckOptions, gopts GlobalOptions, args []string) error {

 	return nil
 }
+
+// selectPacksByBucket selects subsets of packs by ranges of buckets.
+func selectPacksByBucket(allPacks map[restic.ID]int64, bucket, totalBuckets uint) map[restic.ID]int64 {
+	packs := make(map[restic.ID]int64)
+	for pack, size := range allPacks {
+		// If we ever check more than the first byte
+		// of pack, update totalBucketsMax.
+		if (uint(pack[0]) % totalBuckets) == (bucket - 1) {
+			packs[pack] = size
+		}
+	}
+	return packs
+}
+
+// selectRandomPacksByPercentage selects the given percentage of packs which are randomly choosen.
+func selectRandomPacksByPercentage(allPacks map[restic.ID]int64, percentage float64) map[restic.ID]int64 {
+	packCount := len(allPacks)
+	packsToCheck := int(float64(packCount) * (percentage / 100.0))
+	if packsToCheck < 1 {
+		packsToCheck = 1
+	}
+	idx := rand.Perm(packCount)
+
+	var keys []restic.ID
+	for k := range allPacks {
+		keys = append(keys, k)
+	}
+
+	packs := make(map[restic.ID]int64)
+
+	for i := 0; i < packsToCheck; i++ {
+		id := keys[idx[i]]
+		packs[id] = allPacks[id]
+	}
+
+	return packs
+}
--- a/cmd/restic/cmd_check_test.go
+++ b/cmd/restic/cmd_check_test.go
@ -0,0 +1,124 @@
+package main
+
+import (
+	"math"
+	"reflect"
+	"testing"
+
+	"github.com/restic/restic/internal/restic"
+	rtest "github.com/restic/restic/internal/test"
+)
+
+func TestParsePercentage(t *testing.T) {
+	testCases := []struct {
+		input       string
+		output      float64
+		expectError bool
+	}{
+		{"0%", 0.0, false},
+		{"1%", 1.0, false},
+		{"100%", 100.0, false},
+		{"123%", 123.0, false},
+		{"123.456%", 123.456, false},
+		{"0.742%", 0.742, false},
+		{"-100%", -100.0, false},
+		{" 1%", 0.0, true},
+		{"1 %", 0.0, true},
+		{"1% ", 0.0, true},
+	}
+	for _, testCase := range testCases {
+		output, err := parsePercentage(testCase.input)
+
+		if testCase.expectError {
+			rtest.Assert(t, err != nil, "Expected error for case %s", testCase.input)
+			rtest.Assert(t, output == 0.0, "Expected output to be 0.0, got %s", output)
+		} else {
+			rtest.Assert(t, err == nil, "Expected no error for case %s", testCase.input)
+			rtest.Assert(t, math.Abs(testCase.output-output) < 0.00001, "Expected %f, got %f",
+				testCase.output, output)
+		}
+	}
+}
+
+func TestStringToIntSlice(t *testing.T) {
+	testCases := []struct {
+		input       string
+		output      []uint
+		expectError bool
+	}{
+		{"3/5", []uint{3, 5}, false},
+		{"1/100", []uint{1, 100}, false},
+		{"abc", nil, true},
+		{"1/a", nil, true},
+		{"/", nil, true},
+	}
+	for _, testCase := range testCases {
+		output, err := stringToIntSlice(testCase.input)
+
+		if testCase.expectError {
+			rtest.Assert(t, err != nil, "Expected error for case %s", testCase.input)
+			rtest.Assert(t, output == nil, "Expected output to be nil, got %s", output)
+		} else {
+			rtest.Assert(t, err == nil, "Expected no error for case %s", testCase.input)
+			rtest.Assert(t, len(output) == 2, "Invalid output length for case %s", testCase.input)
+			rtest.Assert(t, reflect.DeepEqual(output, testCase.output), "Expected %f, got %f",
+				testCase.output, output)
+		}
+	}
+}
+
+func TestSelectPacksByBucket(t *testing.T) {
+	var testPacks = make(map[restic.ID]int64)
+	for i := 1; i <= 10; i++ {
+		id := restic.NewRandomID()
+		// ensure relevant part of generated id is reproducable
+		id[0] = byte(i)
+		testPacks[id] = 0
+	}
+
+	selectedPacks := selectPacksByBucket(testPacks, 0, 10)
+	rtest.Assert(t, len(selectedPacks) == 0, "Expected 0 selected packs")
+
+	for i := uint(1); i <= 5; i++ {
+		selectedPacks = selectPacksByBucket(testPacks, i, 5)
+		rtest.Assert(t, len(selectedPacks) == 2, "Expected 2 selected packs")
+	}
+
+	selectedPacks = selectPacksByBucket(testPacks, 1, 1)
+	rtest.Assert(t, len(selectedPacks) == 10, "Expected 10 selected packs")
+	for testPack := range testPacks {
+		_, ok := selectedPacks[testPack]
+		rtest.Assert(t, ok, "Expected input and output to be equal")
+	}
+}
+
+func TestSelectRandomPacksByPercentage(t *testing.T) {
+	var testPacks = make(map[restic.ID]int64)
+	for i := 1; i <= 10; i++ {
+		testPacks[restic.NewRandomID()] = 0
+	}
+
+	selectedPacks := selectRandomPacksByPercentage(testPacks, 0.0)
+	rtest.Assert(t, len(selectedPacks) == 1, "Expected 1 selected packs")
+
+	selectedPacks = selectRandomPacksByPercentage(testPacks, 10.0)
+	rtest.Assert(t, len(selectedPacks) == 1, "Expected 1 selected pack")
+	for pack := range selectedPacks {
+		_, ok := testPacks[pack]
+		rtest.Assert(t, ok, "Unexpected selection")
+	}
+
+	selectedPacks = selectRandomPacksByPercentage(testPacks, 50.0)
+	rtest.Assert(t, len(selectedPacks) == 5, "Expected 5 selected packs")
+	for pack := range selectedPacks {
+		_, ok := testPacks[pack]
+		rtest.Assert(t, ok, "Unexpected item in selection")
+	}
+
+	selectedPacks = selectRandomPacksByPercentage(testPacks, 100.0)
+	rtest.Assert(t, len(selectedPacks) == 10, "Expected 10 selected packs")
+	for testPack := range testPacks {
+		_, ok := selectedPacks[testPack]
+		rtest.Assert(t, ok, "Expected input and output to be equal")
+	}
+}
--- a/doc/045_working_with_repos.rst
+++ b/doc/045_working_with_repos.rst
@ -238,12 +238,17 @@ integrity of the pack files in the repository, use the ``--read-data`` flag:
    repository, beware that it might incur higher bandwidth costs than usual
    and also that it takes more time than the default ``check``.

-Alternatively, use the ``--read-data-subset=n/t`` parameter to check only a
-subset of the repository pack files at a time. The parameter takes two values,
-``n`` and ``t``. When the check command runs, all pack files in the repository
-are logically divided in ``t`` (roughly equal) groups, and only files that
-belong to group number ``n`` are checked. For example, the following commands
-check all repository pack files over 5 separate invocations:
+Alternatively, use the ``--read-data-subset`` parameter to check only a
+subset of the repository pack files at a time. It supports two ways to select a
+subset. One selects a specific range of pack files, the other selects a random
+percentage of pack files.
+
+Use ``--read-data-subset=n/t`` to check only a subset of the repository pack
+files at a time. The parameter takes two values, ``n`` and ``t``. When the check
+command runs, all pack files in the repository are logically divided in ``t``
+(roughly equal) groups, and only files that belong to group number ``n`` are
+checked. For example, the following commands check all repository pack files
+over 5 separate invocations:

 .. code-block:: console

@ -252,3 +257,21 @@ check all repository pack files over 5 separate invocations:
    $ restic -r /srv/restic-repo check --read-data-subset=3/5
    $ restic -r /srv/restic-repo check --read-data-subset=4/5
    $ restic -r /srv/restic-repo check --read-data-subset=5/5
+
+Use ``--read-data-subset=n%`` to check a randomly choosen subset of the
+repository pack files. It takes one parameter, ``n``, the percentage of pack
+files to check as an integer or floating point number. This will not guarantee
+to cover all available pack files after sufficient runs, but it is easy to
+automate checking a small subset of data after each backup. For a floating point
+value the following command may be used:
+
+.. code-block:: console
+
+    $ restic -r /srv/restic-repo check --read-data-subset=2.5%
+
+When checking bigger subsets you most likely specify the percentage as an
+integer:
+
+.. code-block:: console
+
+    $ restic -r /srv/restic-repo check --read-data-subset=10%