2
2
mirror of https://github.com/octoleo/restic.git synced 2024-11-26 14:56:29 +00:00

checker: Unify blobs, processed trees and referenced blobs map

The blobRefs map and the processedTrees IDSet are merged to reduce
memory usage. The blobRefs map now uses separate flags to track whether
a blob is used as a data blob or as a tree blob. This prevents trees from
being skipped when their content is identical to an already processed
data blob. A third flag tracks whether a blob exists in the index, which
removes the need for the blobs IDSet.
This commit is contained in:
Michael Eischer 2019-07-13 18:34:55 +02:00
parent 35d8413639
commit 36c69e3ca7

View File

@ -22,10 +22,10 @@ import (
// repository (e.g. missing blobs), and needs a valid Repository to work on. // repository (e.g. missing blobs), and needs a valid Repository to work on.
type Checker struct { type Checker struct {
packs restic.IDSet packs restic.IDSet
blobs restic.IDSet
blobRefs struct { blobRefs struct {
sync.Mutex sync.Mutex
M map[restic.ID]bool // see flags below
M map[restic.ID]blobStatus
} }
masterIndex *repository.MasterIndex masterIndex *repository.MasterIndex
@ -33,16 +33,23 @@ type Checker struct {
repo restic.Repository repo restic.Repository
} }
// blobStatus is a set of bit flags describing what the checker knows about a
// single blob. It is the value type of the Checker.blobRefs.M map, so several
// flags can be combined for the same blob ID.
type blobStatus uint8
const (
// blobExists (bit 0) is set by LoadIndex for every blob listed in an index.
blobExists blobStatus = 1 << iota
// blobReferenced (bit 1) is set when a tree references the blob (checkTree)
// or when the blob is queued for loading as a tree (filterTrees).
// UnusedBlobs reports the entries that lack this flag.
blobReferenced
// treeProcessed (bit 2) is set by filterTrees when the ID is handed to the
// tree loader, so the same tree is not queued twice. It is kept separate
// from blobReferenced so that a tree is still checked even if the same blob
// ID was already referenced as file data.
treeProcessed
)
// New returns a new checker which runs on repo. // New returns a new checker which runs on repo.
func New(repo restic.Repository) *Checker { func New(repo restic.Repository) *Checker {
c := &Checker{ c := &Checker{
packs: restic.NewIDSet(), packs: restic.NewIDSet(),
blobs: restic.NewIDSet(),
masterIndex: repository.NewMasterIndex(), masterIndex: repository.NewMasterIndex(),
repo: repo, repo: repo,
} }
c.blobRefs.M = make(map[restic.ID]bool) c.blobRefs.M = make(map[restic.ID]blobStatus)
return c return c
} }
@ -156,7 +163,7 @@ func (c *Checker) LoadIndex(ctx context.Context) (hints []error, errs []error) {
cnt := 0 cnt := 0
for blob := range res.Index.Each(ctx) { for blob := range res.Index.Each(ctx) {
c.packs.Insert(blob.PackID) c.packs.Insert(blob.PackID)
c.blobs.Insert(blob.ID) c.blobRefs.M[blob.ID] = blobExists
cnt++ cnt++
if _, ok := packToIndex[blob.PackID]; !ok { if _, ok := packToIndex[blob.PackID]; !ok {
@ -441,10 +448,6 @@ func (c *Checker) checkTreeWorker(ctx context.Context, in <-chan treeJob, out ch
return return
} }
c.blobRefs.Lock()
c.blobRefs.M[job.ID] = true
c.blobRefs.Unlock()
debug.Log("check tree %v (tree %v, err %v)", job.ID, job.Tree, job.error) debug.Log("check tree %v (tree %v, err %v)", job.ID, job.Tree, job.error)
var errs []error var errs []error
@ -469,7 +472,7 @@ func (c *Checker) checkTreeWorker(ctx context.Context, in <-chan treeJob, out ch
} }
} }
func filterTrees(ctx context.Context, backlog restic.IDs, loaderChan chan<- restic.ID, in <-chan treeJob, out chan<- treeJob) { func (c *Checker) filterTrees(ctx context.Context, backlog restic.IDs, loaderChan chan<- restic.ID, in <-chan treeJob, out chan<- treeJob) {
defer func() { defer func() {
debug.Log("closing output channels") debug.Log("closing output channels")
close(loaderChan) close(loaderChan)
@ -483,7 +486,6 @@ func filterTrees(ctx context.Context, backlog restic.IDs, loaderChan chan<- rest
job treeJob job treeJob
nextTreeID restic.ID nextTreeID restic.ID
outstandingLoadTreeJobs = 0 outstandingLoadTreeJobs = 0
processedTrees = restic.NewIDSet()
) )
outCh = nil outCh = nil
@ -492,9 +494,16 @@ func filterTrees(ctx context.Context, backlog restic.IDs, loaderChan chan<- rest
for { for {
if loadCh == nil && len(backlog) > 0 { if loadCh == nil && len(backlog) > 0 {
nextTreeID, backlog = backlog[0], backlog[1:] nextTreeID, backlog = backlog[0], backlog[1:]
if processedTrees.Has(nextTreeID) {
// use a separate flag for processed trees to ensure that check still processes trees
// even when a file references a tree blob
c.blobRefs.Lock()
blobFlags := c.blobRefs.M[nextTreeID]
c.blobRefs.Unlock()
if (blobFlags & treeProcessed) != 0 {
continue continue
} }
loadCh = loaderChan loadCh = loaderChan
} }
@ -510,7 +519,9 @@ func filterTrees(ctx context.Context, backlog restic.IDs, loaderChan chan<- rest
case loadCh <- nextTreeID: case loadCh <- nextTreeID:
outstandingLoadTreeJobs++ outstandingLoadTreeJobs++
loadCh = nil loadCh = nil
processedTrees.Insert(nextTreeID) c.blobRefs.Lock()
c.blobRefs.M[nextTreeID] |= treeProcessed | blobReferenced
c.blobRefs.Unlock()
case j, ok := <-inCh: case j, ok := <-inCh:
if !ok { if !ok {
@ -591,7 +602,7 @@ func (c *Checker) Structure(ctx context.Context, errChan chan<- error) {
go c.checkTreeWorker(ctx, treeJobChan2, errChan, &wg) go c.checkTreeWorker(ctx, treeJobChan2, errChan, &wg)
} }
filterTrees(ctx, trees, treeIDChan, treeJobChan1, treeJobChan2) c.filterTrees(ctx, trees, treeIDChan, treeJobChan1, treeJobChan2)
wg.Wait() wg.Wait()
} }
@ -646,15 +657,13 @@ func (c *Checker) checkTree(id restic.ID, tree *restic.Tree) (errs []error) {
for _, blobID := range blobs { for _, blobID := range blobs {
c.blobRefs.Lock() c.blobRefs.Lock()
c.blobRefs.M[blobID] = true if (c.blobRefs.M[blobID] & blobExists) == 0 {
debug.Log("blob %v is referenced", blobID)
c.blobRefs.Unlock()
if !c.blobs.Has(blobID) {
debug.Log("tree %v references blob %v which isn't contained in index", id, blobID) debug.Log("tree %v references blob %v which isn't contained in index", id, blobID)
errs = append(errs, Error{TreeID: id, BlobID: blobID, Err: errors.New("not found in index")}) errs = append(errs, Error{TreeID: id, BlobID: blobID, Err: errors.New("not found in index")})
} }
c.blobRefs.M[blobID] |= blobReferenced
debug.Log("blob %v is referenced", blobID)
c.blobRefs.Unlock()
} }
return errs return errs
@ -665,9 +674,9 @@ func (c *Checker) UnusedBlobs() (blobs restic.IDs) {
c.blobRefs.Lock() c.blobRefs.Lock()
defer c.blobRefs.Unlock() defer c.blobRefs.Unlock()
debug.Log("checking %d blobs", len(c.blobs)) debug.Log("checking %d blobs", len(c.blobRefs.M))
for id := range c.blobs { for id, flags := range c.blobRefs.M {
if !c.blobRefs.M[id] { if (flags & blobReferenced) == 0 {
debug.Log("blob %v not referenced", id) debug.Log("blob %v not referenced", id)
blobs = append(blobs, id) blobs = append(blobs, id)
} }