Merge pull request #2731 from dionorgua/rewrite-snapshot

Implement 'rewrite' command to exclude files from existing snapshots
This commit is contained in:
Michael Eischer 2022-11-12 20:06:35 +01:00 committed by GitHub
commit 726a1969cd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 676 additions and 3 deletions

View File

@ -0,0 +1,7 @@
Enhancement: Implement rewrite command
We've added a new command which allows to rewrite existing snapshots to remove
unwanted files.
https://github.com/restic/restic/issues/14
https://github.com/restic/restic/pull/2731

View File

@ -306,7 +306,7 @@ func collectRejectByNameFuncs(opts BackupOptions, repo *repository.Repository, t
fs = append(fs, f)
}
fsPatterns, err := collectExcludePatterns(opts.excludePatternOptions)
fsPatterns, err := opts.excludePatternOptions.CollectPatterns()
if err != nil {
return nil, err
}

218
cmd/restic/cmd_rewrite.go Normal file
View File

@ -0,0 +1,218 @@
package main
import (
"context"
"fmt"
"github.com/spf13/cobra"
"golang.org/x/sync/errgroup"
"github.com/restic/restic/internal/backend"
"github.com/restic/restic/internal/debug"
"github.com/restic/restic/internal/errors"
"github.com/restic/restic/internal/repository"
"github.com/restic/restic/internal/restic"
"github.com/restic/restic/internal/walker"
)
var cmdRewrite = &cobra.Command{
Use: "rewrite [flags] [snapshotID ...]",
Short: "Rewrite snapshots to exclude unwanted files",
Long: `
The "rewrite" command excludes files from existing snapshots. It creates new
snapshots containing the same data as the original ones, but without the files
you specify to exclude. All metadata (time, host, tags) will be preserved.
The snapshots to rewrite are specified using the --host, --tag and --path options,
or by providing a list of snapshot IDs. Please note that specifying neither any of
these options nor a snapshot ID will cause the command to rewrite all snapshots.
The special tag 'rewrite' will be added to the new snapshots to distinguish
them from the original ones, unless --forget is used. If the --forget option is
used, the original snapshots will instead be directly removed from the repository.
Please note that the --forget option only removes the snapshots and not the actual
data stored in the repository. In order to delete the no longer referenced data,
use the "prune" command.
EXIT STATUS
===========
Exit status is 0 if the command was successful, and non-zero if there was any error.
`,
DisableAutoGenTag: true,
RunE: func(cmd *cobra.Command, args []string) error {
return runRewrite(cmd.Context(), rewriteOptions, globalOptions, args)
},
}
// RewriteOptions collects all options for the rewrite command.
type RewriteOptions struct {
Forget bool
DryRun bool
snapshotFilterOptions
excludePatternOptions
}
var rewriteOptions RewriteOptions
func init() {
cmdRoot.AddCommand(cmdRewrite)
f := cmdRewrite.Flags()
f.BoolVarP(&rewriteOptions.Forget, "forget", "", false, "remove original snapshots after creating new ones")
f.BoolVarP(&rewriteOptions.DryRun, "dry-run", "n", false, "do not do anything, just print what would be done")
initMultiSnapshotFilterOptions(f, &rewriteOptions.snapshotFilterOptions, true)
initExcludePatternOptions(f, &rewriteOptions.excludePatternOptions)
}
func rewriteSnapshot(ctx context.Context, repo *repository.Repository, sn *restic.Snapshot, opts RewriteOptions) (bool, error) {
if sn.Tree == nil {
return false, errors.Errorf("snapshot %v has nil tree", sn.ID().Str())
}
rejectByNameFuncs, err := opts.excludePatternOptions.CollectPatterns()
if err != nil {
return false, err
}
selectByName := func(nodepath string) bool {
for _, reject := range rejectByNameFuncs {
if reject(nodepath) {
return false
}
}
return true
}
wg, wgCtx := errgroup.WithContext(ctx)
repo.StartPackUploader(wgCtx, wg)
var filteredTree restic.ID
wg.Go(func() error {
filteredTree, err = walker.FilterTree(wgCtx, repo, "/", *sn.Tree, &walker.TreeFilterVisitor{
SelectByName: selectByName,
PrintExclude: func(path string) { Verbosef(fmt.Sprintf("excluding %s\n", path)) },
})
if err != nil {
return err
}
return repo.Flush(wgCtx)
})
err = wg.Wait()
if err != nil {
return false, err
}
if filteredTree == *sn.Tree {
debug.Log("Snapshot %v not modified", sn)
return false, nil
}
debug.Log("Snapshot %v modified", sn)
if opts.DryRun {
Verbosef("would save new snapshot\n")
if opts.Forget {
Verbosef("would remove old snapshot\n")
}
return true, nil
}
// Retain the original snapshot id over all tag changes.
if sn.Original == nil {
sn.Original = sn.ID()
}
*sn.Tree = filteredTree
if !opts.Forget {
sn.AddTags([]string{"rewrite"})
}
// Save the new snapshot.
id, err := restic.SaveSnapshot(ctx, repo, sn)
if err != nil {
return false, err
}
if opts.Forget {
h := restic.Handle{Type: restic.SnapshotFile, Name: sn.ID().String()}
if err = repo.Backend().Remove(ctx, h); err != nil {
return false, err
}
debug.Log("removed old snapshot %v", sn.ID())
Verbosef("removed old snapshot %v\n", sn.ID().Str())
}
Verbosef("saved new snapshot %v\n", id.Str())
return true, nil
}
func runRewrite(ctx context.Context, opts RewriteOptions, gopts GlobalOptions, args []string) error {
if opts.excludePatternOptions.Empty() {
return errors.Fatal("Nothing to do: no excludes provided")
}
repo, err := OpenRepository(ctx, gopts)
if err != nil {
return err
}
if !opts.DryRun {
var lock *restic.Lock
var err error
if opts.Forget {
Verbosef("create exclusive lock for repository\n")
lock, ctx, err = lockRepoExclusive(ctx, repo)
} else {
lock, ctx, err = lockRepo(ctx, repo)
}
defer unlockRepo(lock)
if err != nil {
return err
}
} else {
repo.SetDryRun()
}
snapshotLister, err := backend.MemorizeList(ctx, repo.Backend(), restic.SnapshotFile)
if err != nil {
return err
}
if err = repo.LoadIndex(ctx); err != nil {
return err
}
changedCount := 0
for sn := range FindFilteredSnapshots(ctx, snapshotLister, repo, opts.Hosts, opts.Tags, opts.Paths, args) {
Verbosef("\nsnapshot %s of %v at %s)\n", sn.ID().Str(), sn.Paths, sn.Time)
changed, err := rewriteSnapshot(ctx, repo, sn, opts)
if err != nil {
return errors.Fatalf("unable to rewrite snapshot ID %q: %v", sn.ID().Str(), err)
}
if changed {
changedCount++
}
}
Verbosef("\n")
if changedCount == 0 {
if !opts.DryRun {
Verbosef("no snapshots were modified\n")
} else {
Verbosef("no snapshots would be modified\n")
}
} else {
if !opts.DryRun {
Verbosef("modified %v snapshots\n", changedCount)
} else {
Verbosef("would modify %v snapshots\n", changedCount)
}
}
return nil
}

View File

@ -475,7 +475,11 @@ func initExcludePatternOptions(f *pflag.FlagSet, opts *excludePatternOptions) {
f.StringArrayVar(&opts.InsensitiveExcludeFiles, "iexclude-file", nil, "same as --exclude-file but ignores casing of `file`names in patterns")
}
func collectExcludePatterns(opts excludePatternOptions) ([]RejectByNameFunc, error) {
func (opts *excludePatternOptions) Empty() bool {
return len(opts.Excludes) == 0 && len(opts.InsensitiveExcludes) == 0 && len(opts.ExcludeFiles) == 0 && len(opts.InsensitiveExcludeFiles) == 0
}
func (opts excludePatternOptions) CollectPatterns() ([]RejectByNameFunc, error) {
var fs []RejectByNameFunc
// add patterns from file
if len(opts.ExcludeFiles) > 0 {

View File

@ -0,0 +1,73 @@
package main
import (
"context"
"path/filepath"
"testing"
"github.com/restic/restic/internal/restic"
rtest "github.com/restic/restic/internal/test"
)
func testRunRewriteExclude(t testing.TB, gopts GlobalOptions, excludes []string, forget bool) {
opts := RewriteOptions{
excludePatternOptions: excludePatternOptions{
Excludes: excludes,
},
Forget: forget,
}
rtest.OK(t, runRewrite(context.TODO(), opts, gopts, nil))
}
func createBasicRewriteRepo(t testing.TB, env *testEnvironment) restic.ID {
testSetupBackupData(t, env)
// create backup
testRunBackup(t, filepath.Dir(env.testdata), []string{"testdata"}, BackupOptions{}, env.gopts)
snapshotIDs := testRunList(t, "snapshots", env.gopts)
rtest.Assert(t, len(snapshotIDs) == 1, "expected one snapshot, got %v", snapshotIDs)
testRunCheck(t, env.gopts)
return snapshotIDs[0]
}
func TestRewrite(t *testing.T) {
env, cleanup := withTestEnvironment(t)
defer cleanup()
createBasicRewriteRepo(t, env)
// exclude some data
testRunRewriteExclude(t, env.gopts, []string{"3"}, false)
snapshotIDs := testRunList(t, "snapshots", env.gopts)
rtest.Assert(t, len(snapshotIDs) == 2, "expected two snapshots, got %v", snapshotIDs)
testRunCheck(t, env.gopts)
}
func TestRewriteUnchanged(t *testing.T) {
env, cleanup := withTestEnvironment(t)
defer cleanup()
snapshotID := createBasicRewriteRepo(t, env)
// use an exclude that will not exclude anything
testRunRewriteExclude(t, env.gopts, []string{"3dflkhjgdflhkjetrlkhjgfdlhkj"}, false)
newSnapshotIDs := testRunList(t, "snapshots", env.gopts)
rtest.Assert(t, len(newSnapshotIDs) == 1, "expected one snapshot, got %v", newSnapshotIDs)
rtest.Assert(t, snapshotID == newSnapshotIDs[0], "snapshot id changed unexpectedly")
testRunCheck(t, env.gopts)
}
func TestRewriteReplace(t *testing.T) {
env, cleanup := withTestEnvironment(t)
defer cleanup()
snapshotID := createBasicRewriteRepo(t, env)
// exclude some data
testRunRewriteExclude(t, env.gopts, []string{"3"}, true)
newSnapshotIDs := testRunList(t, "snapshots", env.gopts)
rtest.Assert(t, len(newSnapshotIDs) == 1, "expected one snapshot, got %v", newSnapshotIDs)
rtest.Assert(t, snapshotID != newSnapshotIDs[0], "snapshot id should have changed")
// check forbids unused blobs, thus remove them first
testRunPrune(t, env.gopts, PruneOptions{MaxUnused: "0"})
testRunCheck(t, env.gopts)
}

View File

@ -204,6 +204,7 @@ Combined with ``--verbose``, you can see a list of changes:
modified /archive.tar.gz, saved in 0.140s (25.542 MiB added)
Would be added to the repository: 25.551 MiB
.. _backup-excluding-files:
Excluding Files
***************

View File

@ -136,11 +136,12 @@ or the environment variable ``$RESTIC_FROM_KEY_HINT``.
repository. You can avoid this limitation by using the rclone backend
along with remotes which are configured in rclone.
.. _copy-filtering-snapshots:
Filtering snapshots to copy
---------------------------
The list of snapshots to copy can be filtered by host, path in the backup
and / or a comma-separated tag list:
and/or a comma-separated tag list:
.. code-block:: console
@ -175,6 +176,61 @@ using the same chunker parameters as the source repository:
Note that it is not possible to change the chunker parameters of an existing repository.
Removing files from snapshots
=============================
Snapshots sometimes turn out to include more files that intended. Instead of
removing the snapshots entirely and running the corresponding backup commands
again (which is not always practical after the fact) it is possible to remove
the unwanted files from affected snapshots by rewriting them using the
``rewrite`` command:
.. code-block:: console
$ restic -r /srv/restic-repo rewrite --exclude secret-file
repository c881945a opened (repository version 2) successfully, password is correct
snapshot 6160ddb2 of [/home/user/work] at 2022-06-12 16:01:28.406630608 +0200 CEST)
excluding /home/user/work/secret-file
saved new snapshot b6aee1ff
snapshot 4fbaf325 of [/home/user/work] at 2022-05-01 11:22:26.500093107 +0200 CEST)
modified 1 snapshots
$ restic -r /srv/restic-repo rewrite --exclude secret-file 6160ddb2
repository c881945a opened (repository version 2) successfully, password is correct
snapshot 6160ddb2 of [/home/user/work] at 2022-06-12 16:01:28.406630608 +0200 CEST)
excluding /home/user/work/secret-file
new snapshot saved as b6aee1ff
modified 1 snapshots
The options ``--exclude``, ``--exclude-file``, ``--iexclude`` and
``--iexclude-file`` are supported. They behave the same way as for the backup
command, see :ref:`backup-excluding-files` for details.
It is possible to rewrite only a subset of snapshots by filtering them the same
way as for the ``copy`` command, see :ref:`copy-filtering-snapshots`.
By default, the ``rewrite`` command will keep the original snapshots and create
new ones for every snapshot which was modified during rewriting. The new
snapshots are marked with the tag ``rewrite`` to differentiate them from the
original, rewritten snapshots.
Alternatively, you can use the ``--forget`` option to immediately remove the
original snapshots. In this case, no tag is added to the new snapshots. Please
note that this only removes the snapshots and not the actual data stored in the
repository. Run the ``prune`` command afterwards to remove the now unreferenced
data (just like when having used the ``forget`` command).
In order to preview the changes which ``rewrite`` would make, you can use the
``--dry-run`` option. This will simulate the rewriting process without actually
modifying the repository. Instead restic will only print the actions it would
perform.
Checking integrity and consistency
==================================

View File

@ -38,6 +38,7 @@ Usage help is available:
rebuild-index Build a new index
recover Recover data from the repository not referenced by snapshots
restore Extract the data from a snapshot
rewrite Rewrite snapshots to exclude unwanted files
self-update Update the restic binary
snapshots List all snapshots
stats Scan the repository and show basic statistics

View File

@ -0,0 +1,91 @@
package walker
import (
"context"
"fmt"
"path"
"github.com/restic/restic/internal/debug"
"github.com/restic/restic/internal/restic"
)
// SelectByNameFunc returns true for all items that should be included (files and
// dirs). If false is returned, files are ignored and dirs are not even walked.
type SelectByNameFunc func(item string) bool
type TreeFilterVisitor struct {
SelectByName SelectByNameFunc
PrintExclude func(string)
}
type BlobLoadSaver interface {
restic.BlobSaver
restic.BlobLoader
}
func FilterTree(ctx context.Context, repo BlobLoadSaver, nodepath string, nodeID restic.ID, visitor *TreeFilterVisitor) (newNodeID restic.ID, err error) {
curTree, err := restic.LoadTree(ctx, repo, nodeID)
if err != nil {
return restic.ID{}, err
}
// check that we can properly encode this tree without losing information
// The alternative of using json/Decoder.DisallowUnknownFields() doesn't work as we use
// a custom UnmarshalJSON to decode trees, see also https://github.com/golang/go/issues/41144
testID, err := restic.SaveTree(ctx, repo, curTree)
if err != nil {
return restic.ID{}, err
}
if nodeID != testID {
return restic.ID{}, fmt.Errorf("cannot encode tree at %q without loosing information", nodepath)
}
debug.Log("filterTree: %s, nodeId: %s\n", nodepath, nodeID.Str())
changed := false
tb := restic.NewTreeJSONBuilder()
for _, node := range curTree.Nodes {
path := path.Join(nodepath, node.Name)
if !visitor.SelectByName(path) {
if visitor.PrintExclude != nil {
visitor.PrintExclude(path)
}
changed = true
continue
}
if node.Subtree == nil {
err = tb.AddNode(node)
if err != nil {
return restic.ID{}, err
}
continue
}
newID, err := FilterTree(ctx, repo, path, *node.Subtree, visitor)
if err != nil {
return restic.ID{}, err
}
if !node.Subtree.Equal(newID) {
changed = true
}
node.Subtree = &newID
err = tb.AddNode(node)
if err != nil {
return restic.ID{}, err
}
}
if changed {
tree, err := tb.Finalize()
if err != nil {
return restic.ID{}, err
}
// Save new tree
newTreeID, _, _, err := repo.SaveBlob(ctx, restic.TreeBlob, tree, restic.ID{}, false)
debug.Log("filterTree: save new tree for %s as %v\n", nodepath, newTreeID)
return newTreeID, err
}
return nodeID, nil
}

View File

@ -0,0 +1,222 @@
package walker
import (
"context"
"fmt"
"testing"
"github.com/google/go-cmp/cmp"
"github.com/pkg/errors"
"github.com/restic/restic/internal/restic"
)
// WritableTreeMap also support saving
type WritableTreeMap struct {
TreeMap
}
func (t WritableTreeMap) SaveBlob(ctx context.Context, tpe restic.BlobType, buf []byte, id restic.ID, storeDuplicate bool) (newID restic.ID, known bool, size int, err error) {
if tpe != restic.TreeBlob {
return restic.ID{}, false, 0, errors.New("can only save trees")
}
if id.IsNull() {
id = restic.Hash(buf)
}
_, ok := t.TreeMap[id]
if ok {
return id, false, 0, nil
}
t.TreeMap[id] = append([]byte{}, buf...)
return id, true, len(buf), nil
}
func (t WritableTreeMap) Dump() {
for k, v := range t.TreeMap {
fmt.Printf("%v: %v", k, string(v))
}
}
type checkRewriteFunc func(t testing.TB) (visitor TreeFilterVisitor, final func(testing.TB))
// checkRewriteItemOrder ensures that the order of the 'path' arguments is the one passed in as 'want'.
func checkRewriteItemOrder(want []string) checkRewriteFunc {
pos := 0
return func(t testing.TB) (visitor TreeFilterVisitor, final func(testing.TB)) {
vis := TreeFilterVisitor{
SelectByName: func(path string) bool {
if pos >= len(want) {
t.Errorf("additional unexpected path found: %v", path)
return false
}
if path != want[pos] {
t.Errorf("wrong path found, want %q, got %q", want[pos], path)
}
pos++
return true
},
}
final = func(t testing.TB) {
if pos != len(want) {
t.Errorf("not enough items returned, want %d, got %d", len(want), pos)
}
}
return vis, final
}
}
// checkRewriteSkips excludes nodes if path is in skipFor, it checks that all excluded entries are printed.
func checkRewriteSkips(skipFor map[string]struct{}, want []string) checkRewriteFunc {
var pos int
printed := make(map[string]struct{})
return func(t testing.TB) (visitor TreeFilterVisitor, final func(testing.TB)) {
vis := TreeFilterVisitor{
SelectByName: func(path string) bool {
if pos >= len(want) {
t.Errorf("additional unexpected path found: %v", path)
return false
}
if path != want[pos] {
t.Errorf("wrong path found, want %q, got %q", want[pos], path)
}
pos++
_, ok := skipFor[path]
return !ok
},
PrintExclude: func(s string) {
if _, ok := printed[s]; ok {
t.Errorf("path was already printed %v", s)
}
printed[s] = struct{}{}
},
}
final = func(t testing.TB) {
if !cmp.Equal(skipFor, printed) {
t.Errorf("unexpected paths skipped: %s", cmp.Diff(skipFor, printed))
}
if pos != len(want) {
t.Errorf("not enough items returned, want %d, got %d", len(want), pos)
}
}
return vis, final
}
}
func TestRewriter(t *testing.T) {
var tests = []struct {
tree TestTree
newTree TestTree
check checkRewriteFunc
}{
{ // don't change
tree: TestTree{
"foo": TestFile{},
"subdir": TestTree{
"subfile": TestFile{},
},
},
check: checkRewriteItemOrder([]string{
"/foo",
"/subdir",
"/subdir/subfile",
}),
},
{ // exclude file
tree: TestTree{
"foo": TestFile{},
"subdir": TestTree{
"subfile": TestFile{},
},
},
newTree: TestTree{
"foo": TestFile{},
"subdir": TestTree{},
},
check: checkRewriteSkips(
map[string]struct{}{
"/subdir/subfile": {},
},
[]string{
"/foo",
"/subdir",
"/subdir/subfile",
},
),
},
{ // exclude dir
tree: TestTree{
"foo": TestFile{},
"subdir": TestTree{
"subfile": TestFile{},
},
},
newTree: TestTree{
"foo": TestFile{},
},
check: checkRewriteSkips(
map[string]struct{}{
"/subdir": {},
},
[]string{
"/foo",
"/subdir",
},
),
},
}
for _, test := range tests {
t.Run("", func(t *testing.T) {
repo, root := BuildTreeMap(test.tree)
if test.newTree == nil {
test.newTree = test.tree
}
expRepo, expRoot := BuildTreeMap(test.newTree)
modrepo := WritableTreeMap{repo}
ctx, cancel := context.WithCancel(context.TODO())
defer cancel()
vis, last := test.check(t)
newRoot, err := FilterTree(ctx, modrepo, "/", root, &vis)
if err != nil {
t.Error(err)
}
last(t)
// verifying against the expected tree root also implicitly checks the structural integrity
if newRoot != expRoot {
t.Error("hash mismatch")
fmt.Println("Got")
modrepo.Dump()
fmt.Println("Expected")
WritableTreeMap{expRepo}.Dump()
}
})
}
}
func TestRewriterFailOnUnknownFields(t *testing.T) {
tm := WritableTreeMap{TreeMap{}}
node := []byte(`{"nodes":[{"name":"subfile","type":"file","mtime":"0001-01-01T00:00:00Z","atime":"0001-01-01T00:00:00Z","ctime":"0001-01-01T00:00:00Z","uid":0,"gid":0,"content":null,"unknown_field":42}]}`)
id := restic.Hash(node)
tm.TreeMap[id] = node
ctx, cancel := context.WithCancel(context.TODO())
defer cancel()
// use nil visitor to crash if the tree loading works unexpectedly
_, err := FilterTree(ctx, tm, "/", id, nil)
if err == nil {
t.Error("missing error on unknown field")
}
}