From 0372c7ef04bded9c231a27bcafaaf8fe64c3da1f Mon Sep 17 00:00:00 2001
From: Matt Armstrong <matt@rfc20.org>
Date: Mon, 1 Nov 2021 15:13:23 -0700
Subject: [PATCH] Improve the ETA displayed during backup

The ETA restic displays was based on a rate computed across the entire
backup operation. Often restic can progress at uneven rates. In the worst
case, restic progresses over most of the backup at a very high rate and
then finds new data to back up. The displayed ETA is then unrealistic and
never adapts.

Restic now estimates the transfer rate based on a sliding window, with the
goal of adapting to observed changes in rate. To avoid wild changes in the
estimate, several heuristics are used to keep the sliding window wide
enough to be relatively stable.
---
 changelog/unreleased/issue-3397           |  17 ++
 internal/ui/backup/progress.go            |  15 +-
 internal/ui/backup/rate_estimator.go      |  98 +++++++++
 internal/ui/backup/rate_estimator_test.go | 233 ++++++++++++++++++++++
 4 files changed, 359 insertions(+), 4 deletions(-)
 create mode 100644 changelog/unreleased/issue-3397
 create mode 100644 internal/ui/backup/rate_estimator.go
 create mode 100644 internal/ui/backup/rate_estimator_test.go

diff --git a/changelog/unreleased/issue-3397 b/changelog/unreleased/issue-3397
new file mode 100644
index 000000000..170305642
--- /dev/null
+++ b/changelog/unreleased/issue-3397
@@ -0,0 +1,17 @@
+# The first line must start with Bugfix:, Enhancement: or Change:,
+# including the colon. Use present use. Remove lines starting with '#'
+# from this template.
+Enhancement: Improve the ETA displayed during backup
+
+# Describe the problem in the past tense, the new behavior in the present
+# tense. Mention the affected commands, backends, operating systems, etc.
+# Focus on user-facing behavior, not the implementation.
+
+Restic's backup command displayed an ETA that did not adapt when the rate
+of progress made during the backup changed during the course of the
+backup. Restic now uses recent progress when computing the ETA. It is
+important to realize that the estimate may still be wrong, because restic
+cannot predict the future, but the hope is that the ETA will be more
+accurate in most cases.
+
+https://github.com/restic/restic/issues/3397
diff --git a/internal/ui/backup/progress.go b/internal/ui/backup/progress.go
index 8e15662a6..7ad6cb6c6 100644
--- a/internal/ui/backup/progress.go
+++ b/internal/ui/backup/progress.go
@@ -43,7 +43,8 @@ type Progress struct {
 	progress.Updater
 	mu sync.Mutex
 
-	start time.Time
+	start     time.Time
+	estimator rateEstimator
 
 	scanStarted, scanFinished bool
 
@@ -60,6 +61,7 @@ func NewProgress(printer ProgressPrinter, interval time.Duration) *Progress {
 		start:        time.Now(),
 		currentFiles: make(map[string]struct{}),
 		printer:      printer,
+		estimator:    *newRateEstimator(time.Now()),
 	}
 	p.Updater = *progress.NewUpdater(interval, func(runtime time.Duration, final bool) {
 		if final {
@@ -73,9 +75,13 @@ func NewProgress(printer ProgressPrinter, interval time.Duration) *Progress {
 
 			var secondsRemaining uint64
 			if p.scanFinished {
-				secs := float64(runtime / time.Second)
-				todo := float64(p.total.Bytes - p.processed.Bytes)
-				secondsRemaining = uint64(secs / float64(p.processed.Bytes) * todo)
+				rate := p.estimator.rate(time.Now())
+				if rate <= 0 {
+					secondsRemaining = 0
+				} else {
+					todo := float64(p.total.Bytes - p.processed.Bytes)
+					secondsRemaining = uint64(todo / rate)
+				}
 			}
 
 			p.printer.Update(p.total, p.processed, p.errors, p.currentFiles, p.start, secondsRemaining)
@@ -105,6 +111,7 @@ func (p *Progress) addProcessed(c Counter) {
 	p.processed.Files += c.Files
 	p.processed.Dirs += c.Dirs
 	p.processed.Bytes += c.Bytes
+	p.estimator.recordBytes(time.Now(), c.Bytes)
 	p.scanStarted = true
 }
 
diff --git a/internal/ui/backup/rate_estimator.go b/internal/ui/backup/rate_estimator.go
new file mode 100644
index 000000000..c37e243d1
--- /dev/null
+++ b/internal/ui/backup/rate_estimator.go
@@ -0,0 +1,98 @@
+package backup
+
+import (
+	"container/list"
+	"time"
+)
+
+// rateBucket represents a one second window of recorded progress.
+type rateBucket struct {
+	totalBytes uint64
+	end        time.Time // the end of the time window, exclusive
+}
+
+// rateEstimator represents an estimate of the time to complete an operation.
+type rateEstimator struct {
+	buckets    *list.List
+	start      time.Time
+	totalBytes uint64
+}
+
+// newRateEstimator returns an esimator initialized to a presumed start time.
+func newRateEstimator(start time.Time) *rateEstimator {
+	return &rateEstimator{buckets: list.New(), start: start}
+}
+
+// See trim(), below.
+const (
+	bucketWidth             = time.Second
+	minRateEstimatorBytes   = 100 * 1000 * 1000
+	minRateEstimatorBuckets = 20
+	minRateEstimatorMinutes = 2
+)
+
+// trim removes the oldest history from the estimator assuming a given
+// current time.
+func (r *rateEstimator) trim(now time.Time) {
+	// The estimator retains byte transfer counts over a two minute window.
+	// However, to avoid removing too much history when transfer rates are
+	// low, the estimator also retains a minimum number of processed bytes
+	// across a minimum number of buckets. An operation that is processing a
+	// significant number of bytes per second will typically retain only a
+	// two minute window's worth of information. One that is making slow
+	// progress, such as one being over a rate limited connection, typically
+	// observes bursts of updates as infrequently as every ten or twenty
+	// seconds, in which case the other limiters will kick in. This heuristic
+	// avoids wildly fluctuating estimates over rate limited connections.
+	start := now.Add(-minRateEstimatorMinutes * time.Minute)
+
+	for e := r.buckets.Front(); e != nil; e = r.buckets.Front() {
+		if r.buckets.Len() <= minRateEstimatorBuckets {
+			break
+		}
+		b := e.Value.(*rateBucket)
+		if b.end.After(start) {
+			break
+		}
+		total := r.totalBytes - b.totalBytes
+		if total < minRateEstimatorBytes {
+			break
+		}
+		r.start = b.end
+		r.totalBytes = total
+		r.buckets.Remove(e)
+	}
+}
+
+// recordBytes records the transfer of a number of bytes at a given
+// time. Times passed in successive calls should advance monotonically (as
+// is the case with time.Now().
+func (r *rateEstimator) recordBytes(now time.Time, bytes uint64) {
+	if bytes == 0 {
+		return
+	}
+	var tail *rateBucket
+	if r.buckets.Len() > 0 {
+		tail = r.buckets.Back().Value.(*rateBucket)
+	}
+	if tail == nil || !tail.end.After(now) {
+		// The new bucket holds measurements in the time range [now .. now+1sec).
+		tail = &rateBucket{end: now.Add(bucketWidth)}
+		r.buckets.PushBack(tail)
+	}
+	tail.totalBytes += bytes
+	r.totalBytes += bytes
+	r.trim(now)
+}
+
+// rate returns an estimated bytes per second rate at a given time, or zero
+// if there is not enough data to compute a rate.
+func (r *rateEstimator) rate(now time.Time) float64 {
+	r.trim(now)
+	if !r.start.Before(now) {
+		return 0
+	}
+	elapsed := float64(now.Sub(r.start)) / float64(time.Second)
+	rate := float64(r.totalBytes) / elapsed
+	return rate
+}
diff --git a/internal/ui/backup/rate_estimator_test.go b/internal/ui/backup/rate_estimator_test.go
new file mode 100644
index 000000000..3dad42b49
--- /dev/null
+++ b/internal/ui/backup/rate_estimator_test.go
@@ -0,0 +1,233 @@
+package backup
+
+import (
+	"math"
+	"testing"
+	"time"
+)
+
+const float64EqualityThreshold = 1e-6
+
+func almostEqual(a, b float64) bool {
+	if math.IsNaN(a) || math.IsNaN(b) {
+		panic("almostEqual passed a NaN")
+	}
+	return math.Abs(a-b) <= float64EqualityThreshold
+}
+
+func TestEstimatorDefault(t *testing.T) {
+	var start time.Time
+	e := newRateEstimator(start)
+	r := e.rate(start)
+	if math.IsNaN(r) || r != 0 {
+		t.Fatalf("e.Rate == %v, want zero", r)
+	}
+	r = e.rate(start.Add(time.Hour))
+	if math.IsNaN(r) || r != 0 {
+		t.Fatalf("e.Rate == %v, want zero", r)
+	}
+}
+
+func TestEstimatorSimple(t *testing.T) {
+	var when time.Time
+	type testcase struct {
+		bytes uint64
+		when  time.Duration
+		rate  float64
+	}
+
+	cases := []testcase{
+		{0, 0, 0},
+		{1, time.Second, 1},
+		{60, time.Second, 60},
+		{60, time.Minute, 1},
+	}
+	for _, c := range cases {
+		e := newRateEstimator(when)
+		e.recordBytes(when.Add(time.Second), c.bytes)
+		rate := e.rate(when.Add(c.when))
+		if !almostEqual(rate, c.rate) {
+			t.Fatalf("e.Rate == %v, want %v (testcase %+v)", rate, c.rate, c)
+		}
+	}
+}
+
+func TestBucketWidth(t *testing.T) {
+	var when time.Time
+
+	// Recording byte transfers within a bucket width's time window uses one
+	// bucket.
+	e := newRateEstimator(when)
+	e.recordBytes(when, 1)
+	e.recordBytes(when.Add(bucketWidth-time.Nanosecond), 1)
+	if e.buckets.Len() != 1 {
+		t.Fatalf("e.buckets.Len() is %d, want 1", e.buckets.Len())
+	}
+	b := e.buckets.Back().Value.(*rateBucket)
+	if b.totalBytes != 2 {
+		t.Fatalf("b.totalBytes is %d, want 2", b.totalBytes)
+	}
+	if b.end != when.Add(bucketWidth) {
+		t.Fatalf("b.end is %v, want %v", b.end, when.Add(bucketWidth))
+	}
+
+	// Recording a byte outside the bucket width causes another bucket.
+	e.recordBytes(when.Add(bucketWidth), 1)
+	if e.buckets.Len() != 2 {
+		t.Fatalf("e.buckets.Len() is %d, want 2", e.buckets.Len())
+	}
+	b = e.buckets.Back().Value.(*rateBucket)
+	if b.totalBytes != 1 {
+		t.Fatalf("b.totalBytes is %d, want 1", b.totalBytes)
+	}
+	if b.end != when.Add(2*bucketWidth) {
+		t.Fatalf("b.end is %v, want %v", b.end, when.Add(bucketWidth))
+	}
+
+	// Recording a byte after a longer delay creates a sparse bucket list.
+	e.recordBytes(when.Add(time.Hour+time.Millisecond), 7)
+	if e.buckets.Len() != 3 {
+		t.Fatalf("e.buckets.Len() is %d, want 3", e.buckets.Len())
+	}
+	b = e.buckets.Back().Value.(*rateBucket)
+	if b.totalBytes != 7 {
+		t.Fatalf("b.totalBytes is %d, want 7", b.totalBytes)
+	}
+	if b.end != when.Add(time.Hour+time.Millisecond+time.Second) {
+		t.Fatalf("b.end is %v, want %v", b.end, when.Add(time.Hour+time.Millisecond+time.Second))
+	}
+}
+
+type chunk struct {
+	repetitions uint64 // repetition count
+	bytes       uint64 // byte count (every second)
+}
+
+func applyChunk(c chunk, t time.Time, e *rateEstimator) time.Time {
+	for i := uint64(0); i < c.repetitions; i++ {
+		e.recordBytes(t, c.bytes)
+		t = t.Add(time.Second)
+	}
+	return t
+}
+
+func applyChunks(chunks []chunk, t time.Time, e *rateEstimator) time.Time {
+	for _, c := range chunks {
+		t = applyChunk(c, t, e)
+	}
+	return t
+}
+
+func TestEstimatorResponsiveness(t *testing.T) {
+	type testcase struct {
+		description string
+		chunks      []chunk
+		rate        float64
+	}
+
+	cases := []testcase{
+		{
+			"1000 bytes/sec over one second",
+			[]chunk{
+				{1, 1000},
+			},
+			1000,
+		},
+		{
+			"1000 bytes/sec over one minute",
+			[]chunk{
+				{60, 1000},
+			},
+			1000,
+		},
+		{
+			"1000 bytes/sec for 10 seconds, then 2000 bytes/sec for 10 seconds",
+			[]chunk{
+				{10, 1000},
+				{10, 2000},
+			},
+			1500,
+		},
+		{
+			"1000 bytes/sec for one minute, then 2000 bytes/sec for one minute",
+			[]chunk{
+				{60, 1000},
+				{60, 2000},
+			},
+			1500,
+		},
+		{
+			"rate doubles after 30 seconds",
+			[]chunk{
+				{30, minRateEstimatorBytes},
+				{90, 2 * minRateEstimatorBytes},
+			},
+			minRateEstimatorBytes * 1.75,
+		},
+		{
+			"rate doubles after 31 seconds",
+			[]chunk{
+				{31, minRateEstimatorBytes},
+				{90, 2 * minRateEstimatorBytes},
+			},
+			// The expected rate is the same as the prior test case because the
+			// first second has rolled off the estimator.
+			minRateEstimatorBytes * 1.75,
+		},
+		{
+			"rate doubles after 90 seconds",
+			[]chunk{
+				{90, minRateEstimatorBytes},
+				{90, 2 * minRateEstimatorBytes},
+			},
+			// The expected rate is the same as the prior test case because the
+			// first 60 seconds have rolled off the estimator.
+			minRateEstimatorBytes * 1.75,
+		},
+		{
+			"rate doubles for two full minutes",
+			[]chunk{
+				{60, minRateEstimatorBytes},
+				{120, 2 * minRateEstimatorBytes},
+			},
+			2 * minRateEstimatorBytes,
+		},
+		{
+			"rate falls to zero",
+			[]chunk{
+				{30, minRateEstimatorBytes},
+				{30, 0},
+			},
+			minRateEstimatorBytes / 2,
+		},
+		{
+			"rate falls to zero for extended time",
+			[]chunk{
+				{60, 1000},
+				{300, 0},
+			},
+			1000 * 60 / (60 + 300.0),
+		},
+		{
+			"rate falls to zero for extended time (from high rate)",
+			[]chunk{
+				{2 * minRateEstimatorBuckets, minRateEstimatorBytes},
+				{300, 0},
+			},
+			// Expect that only minRateEstimatorBuckets buckets are used in the
+			// rate estimate.
+			minRateEstimatorBytes * minRateEstimatorBuckets /
+				(minRateEstimatorBuckets + 300.0),
+		},
+	}
+
+	for i, c := range cases {
+		var w time.Time
+		e := newRateEstimator(w)
+		w = applyChunks(c.chunks, w, e)
+		r := e.rate(w)
+		if !almostEqual(r, c.rate) {
+			t.Fatalf("e.Rate == %f, want %f (testcase %d %+v)", r, c.rate, i, c)
+		}
+	}
+}