diff --git a/changelog/unreleased/issue-3114 b/changelog/unreleased/issue-3114 index 68b2556c8..c7cf8c7b9 100644 --- a/changelog/unreleased/issue-3114 +++ b/changelog/unreleased/issue-3114 @@ -1,10 +1,10 @@ -Enhancement: Improve `prune` in presence of duplicate blobs +Enhancement: Optimize handling of duplicate blobs in `prune` + +Restic `prune` always used to repack all data files containing duplicate +blobs. This effectively removed all duplicates during prune. However, as a +consequence all these data files were repacked even if the unused repository +space threshold could be reached with less work. -Restic `prune` always used to repack all pack files containing duplicate -blobs. This effectively removed all duplicates during prune. However, one -of the consequences was that all those pack files were downloadeded and -duplicate blobs did not contribute to the threshold for unused repository -space. This is now changed and `prune` works nice and fast also if there are lots of duplicates. diff --git a/cmd/restic/cmd_prune.go b/cmd/restic/cmd_prune.go index 96e115b66..1c63e0755 100644 --- a/cmd/restic/cmd_prune.go +++ b/cmd/restic/cmd_prune.go @@ -258,12 +258,11 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB count, ok := duplicateBlobs[bh] if !ok { count = 2 // this one is already the second blob! 
- } else { + } else if count < math.MaxUint8 { + // don't overflow, but saturate count at 255 + // this can lead to a non-optimal pack selection, but won't cause + // problems otherwise count++ - if count == 0 { - // catch uint8 overflow - panic("too many duplicates, prune can only handly up to 255!") - } } duplicateBlobs[bh] = count stats.size.duplicate += size @@ -326,9 +325,9 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB } // if duplicate blobs exist, those will be set to either "used" or "unused": - // - mark only one occurency of duplicate blobs as used + // - mark only one occurrence of duplicate blobs as used // - if there are already some used blobs in a pack, possibly mark duplicates in this pack as "used" - // - if there are no used blobs in a pack, possibly mark duplicates as "usused" + // - if there are no used blobs in a pack, possibly mark duplicates as "unused" if len(duplicateBlobs) > 0 { // iterate again over all blobs in index (this is pretty cheap, all in-mem) for blob := range repo.Index().Each(ctx) {