Merge pull request #424 from github/critical-load-hibernate

hibernate on critical-load
This commit is contained in:
Shlomi Noach 2017-06-05 09:37:47 +03:00 committed by GitHub
commit 8335f13721
4 changed files with 40 additions and 2 deletions

View File

@ -40,8 +40,9 @@ const (
type ThrottleReasonHint string type ThrottleReasonHint string
const ( const (
NoThrottleReasonHint ThrottleReasonHint = "NoThrottleReasonHint" NoThrottleReasonHint ThrottleReasonHint = "NoThrottleReasonHint"
UserCommandThrottleReasonHint = "UserCommandThrottleReasonHint" UserCommandThrottleReasonHint = "UserCommandThrottleReasonHint"
LeavingHibernationThrottleReasonHint = "LeavingHibernationThrottleReasonHint"
) )
const ( const (
@ -105,9 +106,11 @@ type MigrationContext struct {
throttleQuery string throttleQuery string
throttleHTTP string throttleHTTP string
ThrottleCommandedByUser int64 ThrottleCommandedByUser int64
HibernateUntil int64
maxLoad LoadMap maxLoad LoadMap
criticalLoad LoadMap criticalLoad LoadMap
CriticalLoadIntervalMilliseconds int64 CriticalLoadIntervalMilliseconds int64
CriticalLoadHibernateSeconds int64
PostponeCutOverFlagFile string PostponeCutOverFlagFile string
CutOverLockTimeoutSeconds int64 CutOverLockTimeoutSeconds int64
ForceNamedCutOverCommand bool ForceNamedCutOverCommand bool

View File

@ -112,6 +112,7 @@ func main() {
maxLoad := flag.String("max-load", "", "Comma delimited status-name=threshold. e.g: 'Threads_running=100,Threads_connected=500'. When status exceeds threshold, app throttles writes") maxLoad := flag.String("max-load", "", "Comma delimited status-name=threshold. e.g: 'Threads_running=100,Threads_connected=500'. When status exceeds threshold, app throttles writes")
criticalLoad := flag.String("critical-load", "", "Comma delimited status-name=threshold, same format as --max-load. When status exceeds threshold, app panics and quits") criticalLoad := flag.String("critical-load", "", "Comma delimited status-name=threshold, same format as --max-load. When status exceeds threshold, app panics and quits")
flag.Int64Var(&migrationContext.CriticalLoadIntervalMilliseconds, "critical-load-interval-millis", 0, "When 0, migration immediately bails out upon meeting critical-load. When non-zero, a second check is done after given interval, and migration only bails out if 2nd check still meets critical load") flag.Int64Var(&migrationContext.CriticalLoadIntervalMilliseconds, "critical-load-interval-millis", 0, "When 0, migration immediately bails out upon meeting critical-load. When non-zero, a second check is done after given interval, and migration only bails out if 2nd check still meets critical load")
flag.Int64Var(&migrationContext.CriticalLoadHibernateSeconds, "critical-load-hibernate-seconds", 0, "When nonzero, critical-load does not panic and bail out; instead, gh-ost goes into hibernate for the specified duration. It will not read/write anything to from/to any server")
quiet := flag.Bool("quiet", false, "quiet") quiet := flag.Bool("quiet", false, "quiet")
verbose := flag.Bool("verbose", false, "verbose") verbose := flag.Bool("verbose", false, "verbose")
debug := flag.Bool("debug", false, "debug mode (very verbose)") debug := flag.Bool("debug", false, "debug mode (very verbose)")

View File

@ -293,6 +293,9 @@ func (this *Applier) WriteChangelogState(value string) (string, error) {
func (this *Applier) InitiateHeartbeat() { func (this *Applier) InitiateHeartbeat() {
var numSuccessiveFailures int64 var numSuccessiveFailures int64
injectHeartbeat := func() error { injectHeartbeat := func() error {
if atomic.LoadInt64(&this.migrationContext.HibernateUntil) > 0 {
return nil
}
if _, err := this.WriteChangelog("heartbeat", time.Now().Format(time.RFC3339Nano)); err != nil { if _, err := this.WriteChangelog("heartbeat", time.Now().Format(time.RFC3339Nano)); err != nil {
numSuccessiveFailures++ numSuccessiveFailures++
if numSuccessiveFailures > this.migrationContext.MaxRetries() { if numSuccessiveFailures > this.migrationContext.MaxRetries() {

View File

@ -38,6 +38,10 @@ func NewThrottler(applier *Applier, inspector *Inspector) *Throttler {
// It merely observes the metrics collected by other components, it does not issue // It merely observes the metrics collected by other components, it does not issue
// its own metric collection. // its own metric collection.
func (this *Throttler) shouldThrottle() (result bool, reason string, reasonHint base.ThrottleReasonHint) { func (this *Throttler) shouldThrottle() (result bool, reason string, reasonHint base.ThrottleReasonHint) {
if hibernateUntil := atomic.LoadInt64(&this.migrationContext.HibernateUntil); hibernateUntil > 0 {
hibernateUntilTime := time.Unix(0, hibernateUntil)
return true, fmt.Sprintf("critical-load-hibernate until %+v", hibernateUntilTime), base.NoThrottleReasonHint
}
generalCheckResult := this.migrationContext.GetThrottleGeneralCheckResult() generalCheckResult := this.migrationContext.GetThrottleGeneralCheckResult()
if generalCheckResult.ShouldThrottle { if generalCheckResult.ShouldThrottle {
return generalCheckResult.ShouldThrottle, generalCheckResult.Reason, generalCheckResult.ReasonHint return generalCheckResult.ShouldThrottle, generalCheckResult.Reason, generalCheckResult.ReasonHint
@ -96,6 +100,9 @@ func (this *Throttler) collectReplicationLag(firstThrottlingCollected chan<- boo
if atomic.LoadInt64(&this.migrationContext.CleanupImminentFlag) > 0 { if atomic.LoadInt64(&this.migrationContext.CleanupImminentFlag) > 0 {
return nil return nil
} }
if atomic.LoadInt64(&this.migrationContext.HibernateUntil) > 0 {
return nil
}
if this.migrationContext.TestOnReplica || this.migrationContext.MigrateOnReplica { if this.migrationContext.TestOnReplica || this.migrationContext.MigrateOnReplica {
// when running on replica, the heartbeat injection is also done on the replica. // when running on replica, the heartbeat injection is also done on the replica.
@ -128,6 +135,10 @@ func (this *Throttler) collectReplicationLag(firstThrottlingCollected chan<- boo
// collectControlReplicasLag polls all the control replicas to get maximum lag value // collectControlReplicasLag polls all the control replicas to get maximum lag value
func (this *Throttler) collectControlReplicasLag() { func (this *Throttler) collectControlReplicasLag() {
if atomic.LoadInt64(&this.migrationContext.HibernateUntil) > 0 {
return
}
replicationLagQuery := fmt.Sprintf(` replicationLagQuery := fmt.Sprintf(`
select value from %s.%s where hint = 'heartbeat' and id <= 255 select value from %s.%s where hint = 'heartbeat' and id <= 255
`, `,
@ -222,6 +233,9 @@ func (this *Throttler) criticalLoadIsMet() (met bool, variableName string, value
// collectReplicationLag reads the latest changelog heartbeat value // collectReplicationLag reads the latest changelog heartbeat value
func (this *Throttler) collectThrottleHTTPStatus(firstThrottlingCollected chan<- bool) { func (this *Throttler) collectThrottleHTTPStatus(firstThrottlingCollected chan<- bool) {
collectFunc := func() (sleep bool, err error) { collectFunc := func() (sleep bool, err error) {
if atomic.LoadInt64(&this.migrationContext.HibernateUntil) > 0 {
return true, nil
}
url := this.migrationContext.GetThrottleHTTP() url := this.migrationContext.GetThrottleHTTP()
if url == "" { if url == "" {
return true, nil return true, nil
@ -247,6 +261,9 @@ func (this *Throttler) collectThrottleHTTPStatus(firstThrottlingCollected chan<-
// collectGeneralThrottleMetrics reads the once-per-sec metrics, and stores them onto this.migrationContext // collectGeneralThrottleMetrics reads the once-per-sec metrics, and stores them onto this.migrationContext
func (this *Throttler) collectGeneralThrottleMetrics() error { func (this *Throttler) collectGeneralThrottleMetrics() error {
if atomic.LoadInt64(&this.migrationContext.HibernateUntil) > 0 {
return nil
}
setThrottle := func(throttle bool, reason string, reasonHint base.ThrottleReasonHint) error { setThrottle := func(throttle bool, reason string, reasonHint base.ThrottleReasonHint) error {
this.migrationContext.SetThrottleGeneralCheckResult(base.NewThrottleCheckResult(throttle, reason, reasonHint)) this.migrationContext.SetThrottleGeneralCheckResult(base.NewThrottleCheckResult(throttle, reason, reasonHint))
@ -264,6 +281,20 @@ func (this *Throttler) collectGeneralThrottleMetrics() error {
if err != nil { if err != nil {
return setThrottle(true, fmt.Sprintf("%s %s", variableName, err), base.NoThrottleReasonHint) return setThrottle(true, fmt.Sprintf("%s %s", variableName, err), base.NoThrottleReasonHint)
} }
if criticalLoadMet && this.migrationContext.CriticalLoadHibernateSeconds > 0 {
hibernateDuration := time.Duration(this.migrationContext.CriticalLoadHibernateSeconds) * time.Second
hibernateUntilTime := time.Now().Add(hibernateDuration)
atomic.StoreInt64(&this.migrationContext.HibernateUntil, hibernateUntilTime.UnixNano())
log.Errorf("critical-load met: %s=%d, >=%d. Will hibernate for the duration of %+v, until %+v", variableName, value, threshold, hibernateDuration, hibernateUntilTime)
go func() {
time.Sleep(hibernateDuration)
this.migrationContext.SetThrottleGeneralCheckResult(base.NewThrottleCheckResult(true, "leaving hibernation", base.LeavingHibernationThrottleReasonHint))
atomic.StoreInt64(&this.migrationContext.HibernateUntil, 0)
}()
return nil
}
if criticalLoadMet && this.migrationContext.CriticalLoadIntervalMilliseconds == 0 { if criticalLoadMet && this.migrationContext.CriticalLoadIntervalMilliseconds == 0 {
this.migrationContext.PanicAbort <- fmt.Errorf("critical-load met: %s=%d, >=%d", variableName, value, threshold) this.migrationContext.PanicAbort <- fmt.Errorf("critical-load met: %s=%d, >=%d", variableName, value, threshold)
} }