diff --git a/go/base/context.go b/go/base/context.go index c300df1..d82b22e 100644 --- a/go/base/context.go +++ b/go/base/context.go @@ -40,8 +40,9 @@ const ( type ThrottleReasonHint string const ( - NoThrottleReasonHint ThrottleReasonHint = "NoThrottleReasonHint" - UserCommandThrottleReasonHint = "UserCommandThrottleReasonHint" + NoThrottleReasonHint ThrottleReasonHint = "NoThrottleReasonHint" + UserCommandThrottleReasonHint = "UserCommandThrottleReasonHint" + LeavingHibernationThrottleReasonHint = "LeavingHibernationThrottleReasonHint" ) const ( @@ -105,9 +106,11 @@ type MigrationContext struct { throttleQuery string throttleHTTP string ThrottleCommandedByUser int64 + HibernateUntil int64 maxLoad LoadMap criticalLoad LoadMap CriticalLoadIntervalMilliseconds int64 + CriticalLoadHibernateSeconds int64 PostponeCutOverFlagFile string CutOverLockTimeoutSeconds int64 ForceNamedCutOverCommand bool diff --git a/go/cmd/gh-ost/main.go b/go/cmd/gh-ost/main.go index f27e12b..a4f4f3e 100644 --- a/go/cmd/gh-ost/main.go +++ b/go/cmd/gh-ost/main.go @@ -112,6 +112,7 @@ func main() { maxLoad := flag.String("max-load", "", "Comma delimited status-name=threshold. e.g: 'Threads_running=100,Threads_connected=500'. When status exceeds threshold, app throttles writes") criticalLoad := flag.String("critical-load", "", "Comma delimited status-name=threshold, same format as --max-load. When status exceeds threshold, app panics and quits") flag.Int64Var(&migrationContext.CriticalLoadIntervalMilliseconds, "critical-load-interval-millis", 0, "When 0, migration immediately bails out upon meeting critical-load. When non-zero, a second check is done after given interval, and migration only bails out if 2nd check still meets critical load") + flag.Int64Var(&migrationContext.CriticalLoadHibernateSeconds, "critical-load-hibernate-seconds", 0, "When nonzero, critical-load does not panic and bail out; instead, gh-ost goes into hibernate for the specified duration. It will not read/write anything to from/to any server") quiet := flag.Bool("quiet", false, "quiet") verbose := flag.Bool("verbose", false, "verbose") debug := flag.Bool("debug", false, "debug mode (very verbose)") diff --git a/go/logic/applier.go b/go/logic/applier.go index 4e3f783..b167de8 100644 --- a/go/logic/applier.go +++ b/go/logic/applier.go @@ -293,6 +293,9 @@ func (this *Applier) WriteChangelogState(value string) (string, error) { func (this *Applier) InitiateHeartbeat() { var numSuccessiveFailures int64 injectHeartbeat := func() error { + if atomic.LoadInt64(&this.migrationContext.HibernateUntil) > 0 { + return nil + } if _, err := this.WriteChangelog("heartbeat", time.Now().Format(time.RFC3339Nano)); err != nil { numSuccessiveFailures++ if numSuccessiveFailures > this.migrationContext.MaxRetries() { diff --git a/go/logic/throttler.go b/go/logic/throttler.go index 1c2c62a..ecc1f2b 100644 --- a/go/logic/throttler.go +++ b/go/logic/throttler.go @@ -38,6 +38,10 @@ func NewThrottler(applier *Applier, inspector *Inspector) *Throttler { // It merely observes the metrics collected by other components, it does not issue // its own metric collection. func (this *Throttler) shouldThrottle() (result bool, reason string, reasonHint base.ThrottleReasonHint) { + if hibernateUntil := atomic.LoadInt64(&this.migrationContext.HibernateUntil); hibernateUntil > 0 { + hibernateUntilTime := time.Unix(0, hibernateUntil) + return true, fmt.Sprintf("critical-load-hibernate until %+v", hibernateUntilTime), base.NoThrottleReasonHint + } generalCheckResult := this.migrationContext.GetThrottleGeneralCheckResult() if generalCheckResult.ShouldThrottle { return generalCheckResult.ShouldThrottle, generalCheckResult.Reason, generalCheckResult.ReasonHint @@ -96,6 +100,9 @@ func (this *Throttler) collectReplicationLag(firstThrottlingCollected chan<- boo if atomic.LoadInt64(&this.migrationContext.CleanupImminentFlag) > 0 { return nil } + if atomic.LoadInt64(&this.migrationContext.HibernateUntil) > 0 { + return nil + } if this.migrationContext.TestOnReplica || this.migrationContext.MigrateOnReplica { // when running on replica, the heartbeat injection is also done on the replica. @@ -128,6 +135,10 @@ func (this *Throttler) collectReplicationLag(firstThrottlingCollected chan<- boo // collectControlReplicasLag polls all the control replicas to get maximum lag value func (this *Throttler) collectControlReplicasLag() { + if atomic.LoadInt64(&this.migrationContext.HibernateUntil) > 0 { + return + } + replicationLagQuery := fmt.Sprintf(` select value from %s.%s where hint = 'heartbeat' and id <= 255 `, @@ -222,6 +233,9 @@ func (this *Throttler) criticalLoadIsMet() (met bool, variableName string, value // collectReplicationLag reads the latest changelog heartbeat value func (this *Throttler) collectThrottleHTTPStatus(firstThrottlingCollected chan<- bool) { collectFunc := func() (sleep bool, err error) { + if atomic.LoadInt64(&this.migrationContext.HibernateUntil) > 0 { + return true, nil + } url := this.migrationContext.GetThrottleHTTP() if url == "" { return true, nil @@ -247,6 +261,9 @@ func (this *Throttler) collectThrottleHTTPStatus(firstThrottlingCollected chan<- // collectGeneralThrottleMetrics reads the once-per-sec metrics, and stores them onto this.migrationContext func (this *Throttler) collectGeneralThrottleMetrics() error { + if atomic.LoadInt64(&this.migrationContext.HibernateUntil) > 0 { + return nil + } setThrottle := func(throttle bool, reason string, reasonHint base.ThrottleReasonHint) error { this.migrationContext.SetThrottleGeneralCheckResult(base.NewThrottleCheckResult(throttle, reason, reasonHint)) @@ -264,6 +281,20 @@ func (this *Throttler) collectGeneralThrottleMetrics() error { if err != nil { return setThrottle(true, fmt.Sprintf("%s %s", variableName, err), base.NoThrottleReasonHint) } + + if criticalLoadMet && this.migrationContext.CriticalLoadHibernateSeconds > 0 { + hibernateDuration := time.Duration(this.migrationContext.CriticalLoadHibernateSeconds) * time.Second + hibernateUntilTime := time.Now().Add(hibernateDuration) + atomic.StoreInt64(&this.migrationContext.HibernateUntil, hibernateUntilTime.UnixNano()) + log.Errorf("critical-load met: %s=%d, >=%d. Will hibernate for the duration of %+v, until %+v", variableName, value, threshold, hibernateDuration, hibernateUntilTime) + go func() { + time.Sleep(hibernateDuration) + this.migrationContext.SetThrottleGeneralCheckResult(base.NewThrottleCheckResult(true, "leaving hibernation", base.LeavingHibernationThrottleReasonHint)) + atomic.StoreInt64(&this.migrationContext.HibernateUntil, 0) + }() + return nil + } + if criticalLoadMet && this.migrationContext.CriticalLoadIntervalMilliseconds == 0 { this.migrationContext.PanicAbort <- fmt.Errorf("critical-load met: %s=%d, >=%d", variableName, value, threshold) }