From 7478cbf70e0513c11160961586e23f0c04f0d943 Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Fri, 19 Feb 2021 16:57:51 +0100 Subject: [PATCH 1/3] prune: Enhance treatment of duplicates --- changelog/unreleased/issue-3114 | 12 ++++ cmd/restic/cmd_prune.go | 113 ++++++++++++++++++++++---------- 2 files changed, 91 insertions(+), 34 deletions(-) create mode 100644 changelog/unreleased/issue-3114 diff --git a/changelog/unreleased/issue-3114 b/changelog/unreleased/issue-3114 new file mode 100644 index 000000000..68b2556c8 --- /dev/null +++ b/changelog/unreleased/issue-3114 @@ -0,0 +1,12 @@ +Enhancement: Improve `prune` in presence of duplicate blobs + +Restic `prune` always used to repack all pack files containing duplicate +blobs. This effectively removed all duplicates during prune. However, one +of the consequences was that all those pack files were downloadeded and +duplicate blobs did not contribute to the threshold for unused repository +space. +This is now changed and `prune` works nice and fast also if there are lots +of duplicates. 
+ +https://github.com/restic/restic/issues/3114 +https://github.com/restic/restic/pull/3290 diff --git a/cmd/restic/cmd_prune.go b/cmd/restic/cmd_prune.go index 115f48557..96e115b66 100644 --- a/cmd/restic/cmd_prune.go +++ b/cmd/restic/cmd_prune.go @@ -195,13 +195,12 @@ func runPruneWithRepo(opts PruneOptions, gopts GlobalOptions, repo *repository.R } type packInfo struct { - usedBlobs uint - unusedBlobs uint - duplicateBlobs uint - usedSize uint64 - unusedSize uint64 - tpe restic.BlobType - uncompressed bool + usedBlobs uint + unusedBlobs uint + usedSize uint64 + unusedSize uint64 + tpe restic.BlobType + uncompressed bool } type packInfoWithID struct { @@ -243,7 +242,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB Verbosef("searching used packs...\n") keepBlobs := restic.NewBlobSet() - duplicateBlobs := restic.NewBlobSet() + duplicateBlobs := make(map[restic.BlobHandle]uint8) // iterate over all blobs in index to find out which blobs are duplicates for blob := range repo.Index().Each(ctx) { @@ -256,7 +255,17 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB stats.size.used += size stats.blobs.used++ case keepBlobs.Has(bh): // duplicate blob - duplicateBlobs.Insert(bh) + count, ok := duplicateBlobs[bh] + if !ok { + count = 2 // this one is already the second blob! 
+ } else { + count++ + if count == 0 { + // catch uint8 overflow + panic("too many duplicates, prune can only handly up to 255!") + } + } + duplicateBlobs[bh] = count stats.size.duplicate += size stats.blobs.duplicate++ default: @@ -299,10 +308,9 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB bh := blob.BlobHandle size := uint64(blob.Length) + _, isDuplicate := duplicateBlobs[bh] switch { - case duplicateBlobs.Has(bh): // duplicate blob - ip.usedSize += size - ip.duplicateBlobs++ + case isDuplicate: // duplicate blobs will be handled later case keepBlobs.Has(bh): // used blob, not duplicate ip.usedSize += size ip.usedBlobs++ @@ -317,19 +325,52 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB indexPack[blob.PackID] = ip } + // if duplicate blobs exist, those will be set to either "used" or "unused": + // - mark only one occurency of duplicate blobs as used + // - if there are already some used blobs in a pack, possibly mark duplicates in this pack as "used" + // - if there are no used blobs in a pack, possibly mark duplicates as "usused" + if len(duplicateBlobs) > 0 { + // iterate again over all blobs in index (this is pretty cheap, all in-mem) + for blob := range repo.Index().Each(ctx) { + bh := blob.BlobHandle + count, isDuplicate := duplicateBlobs[bh] + if !isDuplicate { + continue + } + + ip := indexPack[blob.PackID] + size := uint64(blob.Length) + switch { + case count == 0: + // used duplicate exists -> mark as unused + ip.unusedSize += size + ip.unusedBlobs++ + case ip.usedBlobs > 0, count == 1: + // other used blobs in pack or "last" occurrence -> mark as used + ip.usedSize += size + ip.usedBlobs++ + // let other occurrences be marked as unused + duplicateBlobs[bh] = 0 + default: + // mark as unused and decrease counter + ip.unusedSize += size + ip.unusedBlobs++ + duplicateBlobs[bh] = count - 1 + } + // update indexPack + indexPack[blob.PackID] = ip + } + } + Verbosef("collecting packs for
deletion and repacking\n") removePacksFirst := restic.NewIDSet() removePacks := restic.NewIDSet() repackPacks := restic.NewIDSet() var repackCandidates []packInfoWithID - repackAllPacksWithDuplicates := true keep := func(p packInfo) { stats.packs.keep++ - if p.duplicateBlobs > 0 { - repackAllPacksWithDuplicates = false - } } repoVersion := repo.Config().Version @@ -347,7 +388,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB } if p.unusedSize+p.usedSize != uint64(packSize) && - !(p.usedBlobs == 0 && p.duplicateBlobs == 0) { + p.usedBlobs != 0 { // Pack size does not fit and pack is needed => error // If the pack is not needed, this is no error, the pack can // and will be simply removed, see below. @@ -358,7 +399,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB // statistics switch { - case p.usedBlobs == 0 && p.duplicateBlobs == 0: + case p.usedBlobs == 0: stats.packs.unused++ case p.unusedBlobs == 0: stats.packs.used++ @@ -377,7 +418,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB // decide what to do switch { - case p.usedBlobs == 0 && p.duplicateBlobs == 0: + case p.usedBlobs == 0: // All blobs in pack are no longer used => remove pack! removePacks.Insert(id) stats.blobs.remove += p.unusedBlobs @@ -387,8 +428,8 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB // if this is a data pack and --repack-cacheable-only is set => keep pack! keep(p) - case p.unusedBlobs == 0 && p.duplicateBlobs == 0 && p.tpe != restic.InvalidBlob && !mustCompress: - // All blobs in pack are used and not duplicates/mixed => keep pack! + case p.unusedBlobs == 0 && p.tpe != restic.InvalidBlob && !mustCompress: + // All blobs in pack are used and not mixed => keep pack! 
+ keep(p) default: @@ -410,7 +451,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB // missing packs that are not needed can be ignored ignorePacks := restic.NewIDSet() for id, p := range indexPack { - if p.usedBlobs == 0 && p.duplicateBlobs == 0 { + if p.usedBlobs == 0 { ignorePacks.Insert(id) stats.blobs.remove += p.unusedBlobs stats.size.remove += p.unusedSize @@ -439,15 +480,11 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB // This is equivalent to sorting by unused / total space. // Instead of unused[i] / used[i] > unused[j] / used[j] we use // unused[i] * used[j] > unused[j] * used[i] as uint32*uint32 < uint64 - // Morover duplicates and packs containing trees are sorted to the beginning + // Moreover packs containing trees are sorted to the beginning sort.Slice(repackCandidates, func(i, j int) bool { pi := repackCandidates[i].packInfo pj := repackCandidates[j].packInfo switch { - case pi.duplicateBlobs > 0 && pj.duplicateBlobs == 0: - return true - case pj.duplicateBlobs > 0 && pi.duplicateBlobs == 0: - return false case pi.tpe != restic.DataBlob && pj.tpe == restic.DataBlob: return true case pj.tpe != restic.DataBlob && pi.tpe == restic.DataBlob: @@ -458,7 +495,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB repack := func(id restic.ID, p packInfo) { repackPacks.Insert(id) - stats.blobs.repack += p.unusedBlobs + p.duplicateBlobs + p.usedBlobs + stats.blobs.repack += p.unusedBlobs + p.usedBlobs stats.size.repack += p.unusedSize + p.usedSize stats.blobs.repackrm += p.unusedBlobs stats.size.repackrm += p.unusedSize @@ -472,8 +509,8 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB case reachedRepackSize: keep(p.packInfo) - case p.duplicateBlobs > 0, p.tpe != restic.DataBlob, p.uncompressed: - // repacking duplicates/non-data/uncompressed-trees is only limited by repackSize + case p.tpe != restic.DataBlob, p.uncompressed:
+ // repacking non-data packs / uncompressed-trees is only limited by repackSize repack(p.ID, p.packInfo) case reachedUnusedSizeAfter: @@ -485,10 +522,18 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB } } - // if all duplicates are repacked, print out correct statistics - if repackAllPacksWithDuplicates { - stats.blobs.repackrm += stats.blobs.duplicate - stats.size.repackrm += stats.size.duplicate + if len(repackPacks) != 0 { + // when repacking, we do not want to keep blobs which are + // already contained in kept packs, so delete them from keepBlobs + for blob := range repo.Index().Each(ctx) { + if removePacks.Has(blob.PackID) || repackPacks.Has(blob.PackID) { + continue + } + keepBlobs.Delete(blob.BlobHandle) + } + } else { + // keepBlobs is only needed if packs are repacked + keepBlobs = nil } Verboseff("\nused: %10d blobs / %s\n", stats.blobs.used, formatBytes(stats.size.used)) From 9be1bd2accbaadd7119d51f7a989ea37e2ee3b92 Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Sun, 17 Jul 2022 00:27:40 +0200 Subject: [PATCH 2/3] prune: handle very high duplication of some blobs Suggested-By: Alexander Weiss --- changelog/unreleased/issue-3114 | 12 ++++++------ cmd/restic/cmd_prune.go | 13 ++++++------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/changelog/unreleased/issue-3114 b/changelog/unreleased/issue-3114 index 68b2556c8..c7cf8c7b9 100644 --- a/changelog/unreleased/issue-3114 +++ b/changelog/unreleased/issue-3114 @@ -1,10 +1,10 @@ -Enhancement: Improve `prune` in presence of duplicate blobs +Enhancement: Optimize handling of duplicate blobs in `prune` + +Restic `prune` always used to repack all data files containing duplicate +blobs. This effectively removed all duplicates during prune. However, as a +consequence all these data files were repacked even if the unused repository +space threshold could be reached with less work. 
-Restic `prune` always used to repack all pack files containing duplicate -blobs. This effectively removed all duplicates during prune. However, one -of the consequences was that all those pack files were downloadeded and -duplicate blobs did not contribute to the threshold for unused repository -space. This is now changed and `prune` works nice and fast also if there are lots of duplicates. diff --git a/cmd/restic/cmd_prune.go b/cmd/restic/cmd_prune.go index 96e115b66..1c63e0755 100644 --- a/cmd/restic/cmd_prune.go +++ b/cmd/restic/cmd_prune.go @@ -258,12 +258,11 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB count, ok := duplicateBlobs[bh] if !ok { count = 2 // this one is already the second blob! - } else { + } else if count < math.MaxUint8 { + // don't overflow, but saturate count at 255 + // this can lead to a non-optimal pack selection, but won't cause + // problems otherwise count++ - if count == 0 { - // catch uint8 overflow - panic("too many duplicates, prune can only handly up to 255!") - } } duplicateBlobs[bh] = count stats.size.duplicate += size @@ -326,9 +325,9 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB } // if duplicate blobs exist, those will be set to either "used" or "unused": - // - mark only one occurency of duplicate blobs as used + // - mark only one occurrence of duplicate blobs as used // - if there are already some used blobs in a pack, possibly mark duplicates in this pack as "used" - // - if there are no used blobs in a pack, possibly mark duplicates as "usused" + // - if there are no used blobs in a pack, possibly mark duplicates as "unused" if len(duplicateBlobs) > 0 { // iterate again over all blobs in index (this is pretty cheap, all in-mem) for blob := range repo.Index().Each(ctx) { From 715d457aad2c867a07ee1439a9b2f9db6e210729 Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Sun, 17 Jul 2022 11:41:56 +0200 Subject: [PATCH 3/3] prune: code cleanups ---
cmd/restic/cmd_prune.go | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/cmd/restic/cmd_prune.go b/cmd/restic/cmd_prune.go index 1c63e0755..47d70de9d 100644 --- a/cmd/restic/cmd_prune.go +++ b/cmd/restic/cmd_prune.go @@ -367,11 +367,6 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB repackPacks := restic.NewIDSet() var repackCandidates []packInfoWithID - - keep := func(p packInfo) { - stats.packs.keep++ - } - repoVersion := repo.Config().Version // loop over all packs and decide what to do @@ -386,8 +381,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB return nil } - if p.unusedSize+p.usedSize != uint64(packSize) && - p.usedBlobs != 0 { + if p.unusedSize+p.usedSize != uint64(packSize) && p.usedBlobs != 0 { // Pack size does not fit and pack is needed => error // If the pack is not needed, this is no error, the pack can // and will be simply removed, see below. @@ -425,11 +419,11 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB case opts.RepackCachableOnly && p.tpe == restic.DataBlob: // if this is a data pack and --repack-cacheable-only is set => keep pack! - keep(p) + stats.packs.keep++ case p.unusedBlobs == 0 && p.tpe != restic.InvalidBlob && !mustCompress: // All blobs in pack are used and not mixed => keep pack! - keep(p) + stats.packs.keep++ default: // all other packs are candidates for repacking @@ -506,7 +500,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB switch { case reachedRepackSize: - keep(p.packInfo) + stats.packs.keep++ case p.tpe != restic.DataBlob, p.uncompressed: // repacking non-data packs / uncompressed-trees is only limited by repackSize @@ -514,7 +508,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB case reachedUnusedSizeAfter: // for all other packs stop repacking if tolerated unused size is reached. 
- keep(p.packInfo) + stats.packs.keep++ default: repack(p.ID, p.packInfo)