From 8bdfcf779fb4e7260fc05649beb7c524d7518bbe Mon Sep 17 00:00:00 2001 From: greatroar <61184462+greatroar@users.noreply.github.com> Date: Sun, 5 Jun 2022 10:14:32 +0200 Subject: [PATCH] restic prune: Merge three loops over the index There were three loops over the index in restic prune, to find duplicates, to determine sizes (in pack.Size) and to generate packInfos. These three are now one loop. This way, prune doesn't need to construct a set of duplicate blobs, pack.Size doesn't need to contain special logic for prune's use case (the onlyHdr argument) and pack.Size doesn't need to construct a map only to have it immediately transformed into a different map. Some quick testing on a 160GiB local repo doesn't show running time or memory use of restic prune --dry-run changing significantly. --- cmd/restic/cmd_prune.go | 80 ++++++++++++++------------------- cmd/restic/cmd_rebuild_index.go | 2 +- internal/checker/checker.go | 2 +- internal/pack/pack.go | 17 +++---- 4 files changed, 42 insertions(+), 59 deletions(-) diff --git a/cmd/restic/cmd_prune.go b/cmd/restic/cmd_prune.go index 76801dea4..e0273122e 100644 --- a/cmd/restic/cmd_prune.go +++ b/cmd/restic/cmd_prune.go @@ -242,11 +242,26 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB Verbosef("searching used packs...\n") + indexPack := make(map[restic.ID]packInfo) keepBlobs := restic.NewBlobSet() - duplicateBlobs := restic.NewBlobSet() - // iterate over all blobs in index to find out which blobs are duplicates + // iterate over all blobs in index to generate packInfo and find duplicates for blob := range repo.Index().Each(ctx) { + ip, seen := indexPack[blob.PackID] + + if seen { + // mark mixed packs with "Invalid blob type" + if ip.tpe != blob.Type { + ip.tpe = restic.InvalidBlob + } + } else { + ip = packInfo{ + tpe: blob.Type, + usedSize: pack.HeaderSize, + } + } + ip.usedSize += uint64(pack.CalculateEntrySize(blob.Blob)) + bh := blob.BlobHandle size := uint64(blob.Length) switch { @@ -255,14 +270,27 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB keepBlobs.Insert(bh) stats.size.used += size stats.blobs.used++ - case keepBlobs.Has(bh): // duplicate blob - duplicateBlobs.Insert(bh) + ip.usedSize += size + ip.usedBlobs++ + + case keepBlobs.Has(bh): // duplicate of a blob that we want to keep stats.size.duplicate += size stats.blobs.duplicate++ - default: + ip.usedSize += size + ip.duplicateBlobs++ + + default: // unused, don't care if it's a duplicate stats.size.unused += size stats.blobs.unused++ + ip.unusedSize += size + ip.unusedBlobs++ } + + if !blob.IsCompressed() { + ip.uncompressed = true + } + // update indexPack + indexPack[blob.PackID] = ip } // Check if all used blobs have been found in index @@ -275,48 +303,6 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB return errorIndexIncomplete } - indexPack := make(map[restic.ID]packInfo) - - // save computed pack header size - for pid, hdrSize := range pack.Size(ctx, repo.Index(), true) { - // initialize tpe with NumBlobTypes to indicate it's not set - indexPack[pid] = packInfo{tpe: restic.NumBlobTypes, usedSize: uint64(hdrSize)} - } - - // iterate over all blobs in index to generate packInfo - for blob := range repo.Index().Each(ctx) { - ip := indexPack[blob.PackID] - - // Set blob type if not yet set - if ip.tpe == restic.NumBlobTypes { - ip.tpe = blob.Type - } - - // mark mixed packs with "Invalid blob type" - if ip.tpe != blob.Type { - ip.tpe = restic.InvalidBlob - } - - bh := blob.BlobHandle - size := uint64(blob.Length) - switch { - case duplicateBlobs.Has(bh): // duplicate blob - ip.usedSize += size - ip.duplicateBlobs++ - case keepBlobs.Has(bh): // used blob, not duplicate - ip.usedSize += size - ip.usedBlobs++ - default: // unused blob - ip.unusedSize += size - ip.unusedBlobs++ - } - if !blob.IsCompressed() { - ip.uncompressed = true - } - // update indexPack - indexPack[blob.PackID] = ip - } - Verbosef("collecting packs for deletion and repacking\n") removePacksFirst := restic.NewIDSet() removePacks := restic.NewIDSet() diff --git a/cmd/restic/cmd_rebuild_index.go b/cmd/restic/cmd_rebuild_index.go index 0b3274ec4..5611fa939 100644 --- a/cmd/restic/cmd_rebuild_index.go +++ b/cmd/restic/cmd_rebuild_index.go @@ -98,7 +98,7 @@ func rebuildIndex(opts RebuildIndexOptions, gopts GlobalOptions, repo *repositor if err != nil { return err } - packSizeFromIndex = pack.Size(ctx, repo.Index(), false) + packSizeFromIndex = pack.Size(ctx, repo.Index()) } Verbosef("getting pack files to read...\n") diff --git a/internal/checker/checker.go b/internal/checker/checker.go index 2ecd1469c..670459479 100644 --- a/internal/checker/checker.go +++ b/internal/checker/checker.go @@ -131,7 +131,7 @@ func (c *Checker) LoadIndex(ctx context.Context) (hints []error, errs []error) { } // compute pack size using index entries - c.packs = pack.Size(ctx, c.masterIndex, false) + c.packs = pack.Size(ctx, c.masterIndex) debug.Log("checking for duplicate packs") for packID := range c.packs { diff --git a/internal/pack/pack.go b/internal/pack/pack.go index 2d7a5c3fb..196d882cd 100644 --- a/internal/pack/pack.go +++ b/internal/pack/pack.go @@ -177,8 +177,8 @@ var ( const ( // size of the header-length field at the end of the file; it is a uint32 headerLengthSize = 4 - // headerSize is the header's constant overhead (independent of #entries) - headerSize = headerLengthSize + crypto.Extension + // HeaderSize is the header's constant overhead (independent of #entries) + HeaderSize = headerLengthSize + crypto.Extension // MaxHeaderSize is the max size of header including header-length field MaxHeaderSize = 16*1024*1024 + headerLengthSize @@ -242,7 +242,7 @@ func readHeader(rd io.ReaderAt, size int64) ([]byte, error) { // eagerly download eagerEntries header entries as part of header-length request. // only make second request if actual number of entries is greater than eagerEntries - eagerSize := eagerEntries*int(entrySize) + headerSize + eagerSize := eagerEntries*int(entrySize) + HeaderSize b, c, err := readRecords(rd, size, eagerSize) if err != nil { return nil, err @@ -349,7 +349,7 @@ func CalculateEntrySize(blob restic.Blob) int { } func CalculateHeaderSize(blobs []restic.Blob) int { - size := headerSize + size := HeaderSize for _, blob := range blobs { size += CalculateEntrySize(blob) } @@ -357,20 +357,17 @@ func CalculateHeaderSize(blobs []restic.Blob) int { } // Size returns the size of all packs computed by index information. -// If onlyHdr is set to true, only the size of the header is returned // Note that this function only gives correct sizes, if there are no // duplicates in the index. -func Size(ctx context.Context, mi restic.MasterIndex, onlyHdr bool) map[restic.ID]int64 { +func Size(ctx context.Context, mi restic.MasterIndex) map[restic.ID]int64 { packSize := make(map[restic.ID]int64) for blob := range mi.Each(ctx) { size, ok := packSize[blob.PackID] if !ok { - size = headerSize - } - if !onlyHdr { - size += int64(blob.Length) + size = HeaderSize } + size += int64(blob.Length) packSize[blob.PackID] = size + int64(CalculateEntrySize(blob.Blob)) }