diff --git a/doc/command-line-flags.md b/doc/command-line-flags.md index 827ce96..52c88cf 100644 --- a/doc/command-line-flags.md +++ b/doc/command-line-flags.md @@ -43,6 +43,16 @@ password=123456 See `exact-rowcount` +### critical-load-interval-millis + +`--critical-load` defines a threshold that, when met, causes `gh-ost` to panic and bail out. The default behavior is to bail out immediately when meeting this threshold. + +This may sometimes lead to migrations bailing out on a very short spike that, while it does impact production and is worth investigating, isn't reason enough to kill a 10-hour migration. + +When `--critical-load-interval-millis` is specified (e.g. `--critical-load-interval-millis=2500`), `gh-ost` gives a second chance: when it meets the `critical-load` threshold, it doesn't bail out. Instead, it starts a timer (in this example: `2.5` seconds) and re-checks `critical-load` when the timer expires. If `critical-load` is met again, `gh-ost` panics and bails out. If not, execution continues. + +This is somewhat similar to a Nagios `n`-times test, where `n` in our case is always `2`. + ### cut-over Optional. Default is `safe`. See more discussion in [cut-over](cut-over.md) diff --git a/go/base/context.go b/go/base/context.go index 41aa915..c3a0255 100644 --- a/go/base/context.go +++ b/go/base/context.go @@ -90,6 +90,7 @@ type MigrationContext struct { ThrottleCommandedByUser int64 maxLoad LoadMap criticalLoad LoadMap + CriticalLoadIntervalMilliseconds int64 PostponeCutOverFlagFile string CutOverLockTimeoutSeconds int64 ForceNamedCutOverCommand bool diff --git a/go/cmd/gh-ost/main.go b/go/cmd/gh-ost/main.go index 9231269..9ae9cd8 100644 --- a/go/cmd/gh-ost/main.go +++ b/go/cmd/gh-ost/main.go @@ -99,6 +99,7 @@ func main() { maxLoad := flag.String("max-load", "", "Comma delimited status-name=threshold. e.g: 'Threads_running=100,Threads_connected=500'. 
When status exceeds threshold, app throttles writes") criticalLoad := flag.String("critical-load", "", "Comma delimited status-name=threshold, same format as `--max-load`. When status exceeds threshold, app panics and quits") + flag.Int64Var(&migrationContext.CriticalLoadIntervalMilliseconds, "critical-load-interval-millis", 0, "When 0, migration bails out upon meeting critical-load immediately. When non-zero, a second check is done after given interval, and migration only bails out if 2nd check still meets critical load") quiet := flag.Bool("quiet", false, "quiet") verbose := flag.Bool("verbose", false, "verbose") debug := flag.Bool("debug", false, "debug mode (very verbose)") diff --git a/go/logic/throttler.go b/go/logic/throttler.go index 08cf6c3..ee22931 100644 --- a/go/logic/throttler.go +++ b/go/logic/throttler.go @@ -130,6 +130,20 @@ func (this *Throttler) collectControlReplicasLag() { } } +func (this *Throttler) criticalLoadIsMet() (met bool, variableName string, value int64, threshold int64, err error) { + criticalLoad := this.migrationContext.GetCriticalLoad() + for variableName, threshold = range criticalLoad { + value, err = this.applier.ShowStatusVariable(variableName) + if err != nil { + return false, variableName, value, threshold, err + } + if value >= threshold { + return true, variableName, value, threshold, nil + } + } + return false, variableName, value, threshold, nil +} + // collectGeneralThrottleMetrics reads the once-per-sec metrics, and stores them onto this.migrationContext func (this *Throttler) collectGeneralThrottleMetrics() error { @@ -144,15 +158,23 @@ func (this *Throttler) collectGeneralThrottleMetrics() error { this.migrationContext.PanicAbort <- fmt.Errorf("Found panic-file %s. 
Aborting without cleanup", this.migrationContext.PanicFlagFile) } } - criticalLoad := this.migrationContext.GetCriticalLoad() - for variableName, threshold := range criticalLoad { - value, err := this.applier.ShowStatusVariable(variableName) - if err != nil { - return setThrottle(true, fmt.Sprintf("%s %s", variableName, err)) - } - if value >= threshold { - this.migrationContext.PanicAbort <- fmt.Errorf("critical-load met: %s=%d, >=%d", variableName, value, threshold) - } + + criticalLoadMet, variableName, value, threshold, err := this.criticalLoadIsMet() + if err != nil { + return setThrottle(true, fmt.Sprintf("%s %s", variableName, err)) + } + if criticalLoadMet && this.migrationContext.CriticalLoadIntervalMilliseconds == 0 { + this.migrationContext.PanicAbort <- fmt.Errorf("critical-load met: %s=%d, >=%d", variableName, value, threshold) + } + if criticalLoadMet && this.migrationContext.CriticalLoadIntervalMilliseconds > 0 { + log.Errorf("critical-load met once: %s=%d, >=%d. Will check again in %d millis", variableName, value, threshold, this.migrationContext.CriticalLoadIntervalMilliseconds) + go func() { + timer := time.NewTimer(time.Millisecond * time.Duration(this.migrationContext.CriticalLoadIntervalMilliseconds)) + <-timer.C + if criticalLoadMetAgain, variableName, value, threshold, _ := this.criticalLoadIsMet(); criticalLoadMetAgain { + this.migrationContext.PanicAbort <- fmt.Errorf("critical-load met again after %d millis: %s=%d, >=%d", this.migrationContext.CriticalLoadIntervalMilliseconds, variableName, value, threshold) + } + }() } // Back to throttle considerations