diff --git a/changelog/unreleased/issue-14 b/changelog/unreleased/issue-14 new file mode 100644 index 000000000..93f83686d --- /dev/null +++ b/changelog/unreleased/issue-14 @@ -0,0 +1,7 @@ +Enhancement: Implement rewrite command + +We've added a new command which allows to rewrite existing snapshots to remove +unwanted files. + +https://github.com/restic/restic/issues/14 +https://github.com/restic/restic/pull/2731 diff --git a/cmd/restic/cmd_backup.go b/cmd/restic/cmd_backup.go index 8b1f13b55..2b64217c2 100644 --- a/cmd/restic/cmd_backup.go +++ b/cmd/restic/cmd_backup.go @@ -306,7 +306,7 @@ func collectRejectByNameFuncs(opts BackupOptions, repo *repository.Repository, t fs = append(fs, f) } - fsPatterns, err := collectExcludePatterns(opts.excludePatternOptions) + fsPatterns, err := opts.excludePatternOptions.CollectPatterns() if err != nil { return nil, err } diff --git a/cmd/restic/cmd_rewrite.go b/cmd/restic/cmd_rewrite.go new file mode 100644 index 000000000..2a750b969 --- /dev/null +++ b/cmd/restic/cmd_rewrite.go @@ -0,0 +1,218 @@ +package main + +import ( + "context" + "fmt" + + "github.com/spf13/cobra" + "golang.org/x/sync/errgroup" + + "github.com/restic/restic/internal/backend" + "github.com/restic/restic/internal/debug" + "github.com/restic/restic/internal/errors" + "github.com/restic/restic/internal/repository" + "github.com/restic/restic/internal/restic" + "github.com/restic/restic/internal/walker" +) + +var cmdRewrite = &cobra.Command{ + Use: "rewrite [flags] [snapshotID ...]", + Short: "Rewrite snapshots to exclude unwanted files", + Long: ` +The "rewrite" command excludes files from existing snapshots. It creates new +snapshots containing the same data as the original ones, but without the files +you specify to exclude. All metadata (time, host, tags) will be preserved. + +The snapshots to rewrite are specified using the --host, --tag and --path options, +or by providing a list of snapshot IDs. 
Please note that specifying neither any of +these options nor a snapshot ID will cause the command to rewrite all snapshots. + +The special tag 'rewrite' will be added to the new snapshots to distinguish +them from the original ones, unless --forget is used. If the --forget option is +used, the original snapshots will instead be directly removed from the repository. + +Please note that the --forget option only removes the snapshots and not the actual +data stored in the repository. In order to delete the no longer referenced data, +use the "prune" command. + +EXIT STATUS +=========== + +Exit status is 0 if the command was successful, and non-zero if there was any error. +`, + DisableAutoGenTag: true, + RunE: func(cmd *cobra.Command, args []string) error { + return runRewrite(cmd.Context(), rewriteOptions, globalOptions, args) + }, +} + +// RewriteOptions collects all options for the rewrite command. +type RewriteOptions struct { + Forget bool + DryRun bool + + snapshotFilterOptions + excludePatternOptions +} + +var rewriteOptions RewriteOptions + +func init() { + cmdRoot.AddCommand(cmdRewrite) + + f := cmdRewrite.Flags() + f.BoolVarP(&rewriteOptions.Forget, "forget", "", false, "remove original snapshots after creating new ones") + f.BoolVarP(&rewriteOptions.DryRun, "dry-run", "n", false, "do not do anything, just print what would be done") + + initMultiSnapshotFilterOptions(f, &rewriteOptions.snapshotFilterOptions, true) + initExcludePatternOptions(f, &rewriteOptions.excludePatternOptions) +} + +func rewriteSnapshot(ctx context.Context, repo *repository.Repository, sn *restic.Snapshot, opts RewriteOptions) (bool, error) { + if sn.Tree == nil { + return false, errors.Errorf("snapshot %v has nil tree", sn.ID().Str()) + } + + rejectByNameFuncs, err := opts.excludePatternOptions.CollectPatterns() + if err != nil { + return false, err + } + + selectByName := func(nodepath string) bool { + for _, reject := range rejectByNameFuncs { + if reject(nodepath) { + return false 
+ } + } + return true + } + + wg, wgCtx := errgroup.WithContext(ctx) + repo.StartPackUploader(wgCtx, wg) + + var filteredTree restic.ID + wg.Go(func() error { + filteredTree, err = walker.FilterTree(wgCtx, repo, "/", *sn.Tree, &walker.TreeFilterVisitor{ + SelectByName: selectByName, + PrintExclude: func(path string) { Verbosef(fmt.Sprintf("excluding %s\n", path)) }, + }) + if err != nil { + return err + } + + return repo.Flush(wgCtx) + }) + err = wg.Wait() + if err != nil { + return false, err + } + + if filteredTree == *sn.Tree { + debug.Log("Snapshot %v not modified", sn) + return false, nil + } + + debug.Log("Snapshot %v modified", sn) + if opts.DryRun { + Verbosef("would save new snapshot\n") + + if opts.Forget { + Verbosef("would remove old snapshot\n") + } + + return true, nil + } + + // Retain the original snapshot id over all tag changes. + if sn.Original == nil { + sn.Original = sn.ID() + } + *sn.Tree = filteredTree + + if !opts.Forget { + sn.AddTags([]string{"rewrite"}) + } + + // Save the new snapshot. 
+ id, err := restic.SaveSnapshot(ctx, repo, sn) + if err != nil { + return false, err + } + + if opts.Forget { + h := restic.Handle{Type: restic.SnapshotFile, Name: sn.ID().String()} + if err = repo.Backend().Remove(ctx, h); err != nil { + return false, err + } + debug.Log("removed old snapshot %v", sn.ID()) + Verbosef("removed old snapshot %v\n", sn.ID().Str()) + } + Verbosef("saved new snapshot %v\n", id.Str()) + return true, nil +} + +func runRewrite(ctx context.Context, opts RewriteOptions, gopts GlobalOptions, args []string) error { + if opts.excludePatternOptions.Empty() { + return errors.Fatal("Nothing to do: no excludes provided") + } + + repo, err := OpenRepository(ctx, gopts) + if err != nil { + return err + } + + if !opts.DryRun { + var lock *restic.Lock + var err error + if opts.Forget { + Verbosef("create exclusive lock for repository\n") + lock, ctx, err = lockRepoExclusive(ctx, repo) + } else { + lock, ctx, err = lockRepo(ctx, repo) + } + defer unlockRepo(lock) + if err != nil { + return err + } + } else { + repo.SetDryRun() + } + + snapshotLister, err := backend.MemorizeList(ctx, repo.Backend(), restic.SnapshotFile) + if err != nil { + return err + } + + if err = repo.LoadIndex(ctx); err != nil { + return err + } + + changedCount := 0 + for sn := range FindFilteredSnapshots(ctx, snapshotLister, repo, opts.Hosts, opts.Tags, opts.Paths, args) { + Verbosef("\nsnapshot %s of %v at %s)\n", sn.ID().Str(), sn.Paths, sn.Time) + changed, err := rewriteSnapshot(ctx, repo, sn, opts) + if err != nil { + return errors.Fatalf("unable to rewrite snapshot ID %q: %v", sn.ID().Str(), err) + } + if changed { + changedCount++ + } + } + + Verbosef("\n") + if changedCount == 0 { + if !opts.DryRun { + Verbosef("no snapshots were modified\n") + } else { + Verbosef("no snapshots would be modified\n") + } + } else { + if !opts.DryRun { + Verbosef("modified %v snapshots\n", changedCount) + } else { + Verbosef("would modify %v snapshots\n", changedCount) + } + } + + return nil 
+} diff --git a/cmd/restic/exclude.go b/cmd/restic/exclude.go index 86f85f133..efe6f41e4 100644 --- a/cmd/restic/exclude.go +++ b/cmd/restic/exclude.go @@ -475,7 +475,11 @@ func initExcludePatternOptions(f *pflag.FlagSet, opts *excludePatternOptions) { f.StringArrayVar(&opts.InsensitiveExcludeFiles, "iexclude-file", nil, "same as --exclude-file but ignores casing of `file`names in patterns") } -func collectExcludePatterns(opts excludePatternOptions) ([]RejectByNameFunc, error) { +func (opts *excludePatternOptions) Empty() bool { + return len(opts.Excludes) == 0 && len(opts.InsensitiveExcludes) == 0 && len(opts.ExcludeFiles) == 0 && len(opts.InsensitiveExcludeFiles) == 0 +} + +func (opts excludePatternOptions) CollectPatterns() ([]RejectByNameFunc, error) { var fs []RejectByNameFunc // add patterns from file if len(opts.ExcludeFiles) > 0 { diff --git a/cmd/restic/integration_rewrite_test.go b/cmd/restic/integration_rewrite_test.go new file mode 100644 index 000000000..e6007973b --- /dev/null +++ b/cmd/restic/integration_rewrite_test.go @@ -0,0 +1,73 @@ +package main + +import ( + "context" + "path/filepath" + "testing" + + "github.com/restic/restic/internal/restic" + rtest "github.com/restic/restic/internal/test" +) + +func testRunRewriteExclude(t testing.TB, gopts GlobalOptions, excludes []string, forget bool) { + opts := RewriteOptions{ + excludePatternOptions: excludePatternOptions{ + Excludes: excludes, + }, + Forget: forget, + } + + rtest.OK(t, runRewrite(context.TODO(), opts, gopts, nil)) +} + +func createBasicRewriteRepo(t testing.TB, env *testEnvironment) restic.ID { + testSetupBackupData(t, env) + + // create backup + testRunBackup(t, filepath.Dir(env.testdata), []string{"testdata"}, BackupOptions{}, env.gopts) + snapshotIDs := testRunList(t, "snapshots", env.gopts) + rtest.Assert(t, len(snapshotIDs) == 1, "expected one snapshot, got %v", snapshotIDs) + testRunCheck(t, env.gopts) + + return snapshotIDs[0] +} + +func TestRewrite(t *testing.T) { + env, 
cleanup := withTestEnvironment(t) + defer cleanup() + createBasicRewriteRepo(t, env) + + // exclude some data + testRunRewriteExclude(t, env.gopts, []string{"3"}, false) + snapshotIDs := testRunList(t, "snapshots", env.gopts) + rtest.Assert(t, len(snapshotIDs) == 2, "expected two snapshots, got %v", snapshotIDs) + testRunCheck(t, env.gopts) +} + +func TestRewriteUnchanged(t *testing.T) { + env, cleanup := withTestEnvironment(t) + defer cleanup() + snapshotID := createBasicRewriteRepo(t, env) + + // use an exclude that will not exclude anything + testRunRewriteExclude(t, env.gopts, []string{"3dflkhjgdflhkjetrlkhjgfdlhkj"}, false) + newSnapshotIDs := testRunList(t, "snapshots", env.gopts) + rtest.Assert(t, len(newSnapshotIDs) == 1, "expected one snapshot, got %v", newSnapshotIDs) + rtest.Assert(t, snapshotID == newSnapshotIDs[0], "snapshot id changed unexpectedly") + testRunCheck(t, env.gopts) +} + +func TestRewriteReplace(t *testing.T) { + env, cleanup := withTestEnvironment(t) + defer cleanup() + snapshotID := createBasicRewriteRepo(t, env) + + // exclude some data + testRunRewriteExclude(t, env.gopts, []string{"3"}, true) + newSnapshotIDs := testRunList(t, "snapshots", env.gopts) + rtest.Assert(t, len(newSnapshotIDs) == 1, "expected one snapshot, got %v", newSnapshotIDs) + rtest.Assert(t, snapshotID != newSnapshotIDs[0], "snapshot id should have changed") + // check forbids unused blobs, thus remove them first + testRunPrune(t, env.gopts, PruneOptions{MaxUnused: "0"}) + testRunCheck(t, env.gopts) +} diff --git a/doc/040_backup.rst b/doc/040_backup.rst index c1cd940c7..b9996311d 100644 --- a/doc/040_backup.rst +++ b/doc/040_backup.rst @@ -204,6 +204,7 @@ Combined with ``--verbose``, you can see a list of changes: modified /archive.tar.gz, saved in 0.140s (25.542 MiB added) Would be added to the repository: 25.551 MiB +.. 
_backup-excluding-files: Excluding Files *************** diff --git a/doc/045_working_with_repos.rst b/doc/045_working_with_repos.rst index 6da4b707b..860d01ae3 100644 --- a/doc/045_working_with_repos.rst +++ b/doc/045_working_with_repos.rst @@ -136,11 +136,12 @@ or the environment variable ``$RESTIC_FROM_KEY_HINT``. repository. You can avoid this limitation by using the rclone backend along with remotes which are configured in rclone. +.. _copy-filtering-snapshots: Filtering snapshots to copy --------------------------- The list of snapshots to copy can be filtered by host, path in the backup -and / or a comma-separated tag list: +and/or a comma-separated tag list: .. code-block:: console @@ -175,6 +176,61 @@ using the same chunker parameters as the source repository: Note that it is not possible to change the chunker parameters of an existing repository. +Removing files from snapshots +============================= + +Snapshots sometimes turn out to include more files than intended. Instead of +removing the snapshots entirely and running the corresponding backup commands +again (which is not always practical after the fact) it is possible to remove +the unwanted files from affected snapshots by rewriting them using the +``rewrite`` command: + +.. 
code-block:: console + + $ restic -r /srv/restic-repo rewrite --exclude secret-file + repository c881945a opened (repository version 2) successfully, password is correct + + snapshot 6160ddb2 of [/home/user/work] at 2022-06-12 16:01:28.406630608 +0200 CEST) + excluding /home/user/work/secret-file + saved new snapshot b6aee1ff + + snapshot 4fbaf325 of [/home/user/work] at 2022-05-01 11:22:26.500093107 +0200 CEST) + + modified 1 snapshots + + $ restic -r /srv/restic-repo rewrite --exclude secret-file 6160ddb2 + repository c881945a opened (repository version 2) successfully, password is correct + + snapshot 6160ddb2 of [/home/user/work] at 2022-06-12 16:01:28.406630608 +0200 CEST) + excluding /home/user/work/secret-file + saved new snapshot b6aee1ff + + modified 1 snapshots + +The options ``--exclude``, ``--exclude-file``, ``--iexclude`` and +``--iexclude-file`` are supported. They behave the same way as for the backup +command, see :ref:`backup-excluding-files` for details. + +It is possible to rewrite only a subset of snapshots by filtering them the same +way as for the ``copy`` command, see :ref:`copy-filtering-snapshots`. + +By default, the ``rewrite`` command will keep the original snapshots and create +new ones for every snapshot which was modified during rewriting. The new +snapshots are marked with the tag ``rewrite`` to differentiate them from the +original, rewritten snapshots. + +Alternatively, you can use the ``--forget`` option to immediately remove the +original snapshots. In this case, no tag is added to the new snapshots. Please +note that this only removes the snapshots and not the actual data stored in the +repository. Run the ``prune`` command afterwards to remove the now unreferenced +data (just as you would after using the ``forget`` command). + +In order to preview the changes which ``rewrite`` would make, you can use the +``--dry-run`` option. This will simulate the rewriting process without actually +modifying the repository. 
Instead restic will only print the actions it would +perform. + + Checking integrity and consistency ================================== diff --git a/doc/manual_rest.rst b/doc/manual_rest.rst index f2d090209..1aa9a434d 100644 --- a/doc/manual_rest.rst +++ b/doc/manual_rest.rst @@ -38,6 +38,7 @@ Usage help is available: rebuild-index Build a new index recover Recover data from the repository not referenced by snapshots restore Extract the data from a snapshot + rewrite Rewrite snapshots to exclude unwanted files self-update Update the restic binary snapshots List all snapshots stats Scan the repository and show basic statistics diff --git a/internal/walker/rewriter.go b/internal/walker/rewriter.go new file mode 100644 index 000000000..6f063831e --- /dev/null +++ b/internal/walker/rewriter.go @@ -0,0 +1,91 @@ +package walker + +import ( + "context" + "fmt" + "path" + + "github.com/restic/restic/internal/debug" + "github.com/restic/restic/internal/restic" +) + +// SelectByNameFunc returns true for all items that should be included (files and +// dirs). If false is returned, files are ignored and dirs are not even walked. 
+type SelectByNameFunc func(item string) bool + +type TreeFilterVisitor struct { + SelectByName SelectByNameFunc + PrintExclude func(string) +} + +type BlobLoadSaver interface { + restic.BlobSaver + restic.BlobLoader +} + +func FilterTree(ctx context.Context, repo BlobLoadSaver, nodepath string, nodeID restic.ID, visitor *TreeFilterVisitor) (newNodeID restic.ID, err error) { + curTree, err := restic.LoadTree(ctx, repo, nodeID) + if err != nil { + return restic.ID{}, err + } + + // check that we can properly encode this tree without losing information + // The alternative of using json/Decoder.DisallowUnknownFields() doesn't work as we use + // a custom UnmarshalJSON to decode trees, see also https://github.com/golang/go/issues/41144 + testID, err := restic.SaveTree(ctx, repo, curTree) + if err != nil { + return restic.ID{}, err + } + if nodeID != testID { + return restic.ID{}, fmt.Errorf("cannot encode tree at %q without loosing information", nodepath) + } + + debug.Log("filterTree: %s, nodeId: %s\n", nodepath, nodeID.Str()) + + changed := false + tb := restic.NewTreeJSONBuilder() + for _, node := range curTree.Nodes { + path := path.Join(nodepath, node.Name) + if !visitor.SelectByName(path) { + if visitor.PrintExclude != nil { + visitor.PrintExclude(path) + } + changed = true + continue + } + + if node.Subtree == nil { + err = tb.AddNode(node) + if err != nil { + return restic.ID{}, err + } + continue + } + newID, err := FilterTree(ctx, repo, path, *node.Subtree, visitor) + if err != nil { + return restic.ID{}, err + } + if !node.Subtree.Equal(newID) { + changed = true + } + node.Subtree = &newID + err = tb.AddNode(node) + if err != nil { + return restic.ID{}, err + } + } + + if changed { + tree, err := tb.Finalize() + if err != nil { + return restic.ID{}, err + } + + // Save new tree + newTreeID, _, _, err := repo.SaveBlob(ctx, restic.TreeBlob, tree, restic.ID{}, false) + debug.Log("filterTree: save new tree for %s as %v\n", nodepath, newTreeID) + return 
newTreeID, err + } + + return nodeID, nil +} diff --git a/internal/walker/rewriter_test.go b/internal/walker/rewriter_test.go new file mode 100644 index 000000000..3dcf0ac9e --- /dev/null +++ b/internal/walker/rewriter_test.go @@ -0,0 +1,222 @@ +package walker + +import ( + "context" + "fmt" + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/pkg/errors" + "github.com/restic/restic/internal/restic" +) + +// WritableTreeMap also support saving +type WritableTreeMap struct { + TreeMap +} + +func (t WritableTreeMap) SaveBlob(ctx context.Context, tpe restic.BlobType, buf []byte, id restic.ID, storeDuplicate bool) (newID restic.ID, known bool, size int, err error) { + if tpe != restic.TreeBlob { + return restic.ID{}, false, 0, errors.New("can only save trees") + } + + if id.IsNull() { + id = restic.Hash(buf) + } + _, ok := t.TreeMap[id] + if ok { + return id, false, 0, nil + } + + t.TreeMap[id] = append([]byte{}, buf...) + return id, true, len(buf), nil +} + +func (t WritableTreeMap) Dump() { + for k, v := range t.TreeMap { + fmt.Printf("%v: %v", k, string(v)) + } +} + +type checkRewriteFunc func(t testing.TB) (visitor TreeFilterVisitor, final func(testing.TB)) + +// checkRewriteItemOrder ensures that the order of the 'path' arguments is the one passed in as 'want'. 
+func checkRewriteItemOrder(want []string) checkRewriteFunc { + pos := 0 + return func(t testing.TB) (visitor TreeFilterVisitor, final func(testing.TB)) { + vis := TreeFilterVisitor{ + SelectByName: func(path string) bool { + if pos >= len(want) { + t.Errorf("additional unexpected path found: %v", path) + return false + } + + if path != want[pos] { + t.Errorf("wrong path found, want %q, got %q", want[pos], path) + } + pos++ + return true + }, + } + + final = func(t testing.TB) { + if pos != len(want) { + t.Errorf("not enough items returned, want %d, got %d", len(want), pos) + } + } + + return vis, final + } +} + +// checkRewriteSkips excludes nodes if path is in skipFor, it checks that all excluded entries are printed. +func checkRewriteSkips(skipFor map[string]struct{}, want []string) checkRewriteFunc { + var pos int + printed := make(map[string]struct{}) + + return func(t testing.TB) (visitor TreeFilterVisitor, final func(testing.TB)) { + vis := TreeFilterVisitor{ + SelectByName: func(path string) bool { + if pos >= len(want) { + t.Errorf("additional unexpected path found: %v", path) + return false + } + + if path != want[pos] { + t.Errorf("wrong path found, want %q, got %q", want[pos], path) + } + pos++ + + _, ok := skipFor[path] + return !ok + }, + PrintExclude: func(s string) { + if _, ok := printed[s]; ok { + t.Errorf("path was already printed %v", s) + } + printed[s] = struct{}{} + }, + } + + final = func(t testing.TB) { + if !cmp.Equal(skipFor, printed) { + t.Errorf("unexpected paths skipped: %s", cmp.Diff(skipFor, printed)) + } + if pos != len(want) { + t.Errorf("not enough items returned, want %d, got %d", len(want), pos) + } + } + + return vis, final + } +} + +func TestRewriter(t *testing.T) { + var tests = []struct { + tree TestTree + newTree TestTree + check checkRewriteFunc + }{ + { // don't change + tree: TestTree{ + "foo": TestFile{}, + "subdir": TestTree{ + "subfile": TestFile{}, + }, + }, + check: checkRewriteItemOrder([]string{ + "/foo", + 
"/subdir", + "/subdir/subfile", + }), + }, + { // exclude file + tree: TestTree{ + "foo": TestFile{}, + "subdir": TestTree{ + "subfile": TestFile{}, + }, + }, + newTree: TestTree{ + "foo": TestFile{}, + "subdir": TestTree{}, + }, + check: checkRewriteSkips( + map[string]struct{}{ + "/subdir/subfile": {}, + }, + []string{ + "/foo", + "/subdir", + "/subdir/subfile", + }, + ), + }, + { // exclude dir + tree: TestTree{ + "foo": TestFile{}, + "subdir": TestTree{ + "subfile": TestFile{}, + }, + }, + newTree: TestTree{ + "foo": TestFile{}, + }, + check: checkRewriteSkips( + map[string]struct{}{ + "/subdir": {}, + }, + []string{ + "/foo", + "/subdir", + }, + ), + }, + } + + for _, test := range tests { + t.Run("", func(t *testing.T) { + repo, root := BuildTreeMap(test.tree) + if test.newTree == nil { + test.newTree = test.tree + } + expRepo, expRoot := BuildTreeMap(test.newTree) + modrepo := WritableTreeMap{repo} + + ctx, cancel := context.WithCancel(context.TODO()) + defer cancel() + + vis, last := test.check(t) + newRoot, err := FilterTree(ctx, modrepo, "/", root, &vis) + if err != nil { + t.Error(err) + } + last(t) + + // verifying against the expected tree root also implicitly checks the structural integrity + if newRoot != expRoot { + t.Error("hash mismatch") + fmt.Println("Got") + modrepo.Dump() + fmt.Println("Expected") + WritableTreeMap{expRepo}.Dump() + } + }) + } +} + +func TestRewriterFailOnUnknownFields(t *testing.T) { + tm := WritableTreeMap{TreeMap{}} + node := []byte(`{"nodes":[{"name":"subfile","type":"file","mtime":"0001-01-01T00:00:00Z","atime":"0001-01-01T00:00:00Z","ctime":"0001-01-01T00:00:00Z","uid":0,"gid":0,"content":null,"unknown_field":42}]}`) + id := restic.Hash(node) + tm.TreeMap[id] = node + + ctx, cancel := context.WithCancel(context.TODO()) + defer cancel() + // use nil visitor to crash if the tree loading works unexpectedly + _, err := FilterTree(ctx, tm, "/", id, nil) + + if err == nil { + t.Error("missing error on unknown field") + } +}