Merge pull request #563 from kurtzur/master
Support exponential backoff for cutover attempts
commit 731dfab9fc
@@ -118,6 +118,8 @@ type MigrationContext struct {
 	CriticalLoadHibernateSeconds int64
 	PostponeCutOverFlagFile string
 	CutOverLockTimeoutSeconds int64
+	CutOverExponentialBackoff bool
+	ExponentialBackoffMaxInterval int64
 	ForceNamedCutOverCommand bool
 	PanicFlagFile string
 	HooksPath string
@@ -341,6 +343,14 @@ func (this *MigrationContext) SetCutOverLockTimeoutSeconds(timeoutSeconds int64)
 	return nil
 }
 
+func (this *MigrationContext) SetExponentialBackoffMaxInterval(intervalSeconds int64) error {
+	if intervalSeconds < 2 {
+		return fmt.Errorf("Minimal maximum interval is 2sec. Timeout remains at %d", this.ExponentialBackoffMaxInterval)
+	}
+	this.ExponentialBackoffMaxInterval = intervalSeconds
+	return nil
+}
+
 func (this *MigrationContext) SetDefaultNumRetries(retries int64) {
 	this.throttleMutex.Lock()
 	defer this.throttleMutex.Unlock()
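For illustration only (this block is not part of the commit): a self-contained sketch of the rule the new setter enforces, using a stand-in type rather than the real MigrationContext. Anything below 2 seconds is rejected and the previously stored maximum is kept.

package main

import "fmt"

// exampleContext stands in for MigrationContext; illustration only.
type exampleContext struct {
	ExponentialBackoffMaxInterval int64
}

// setExponentialBackoffMaxInterval mirrors the validation added in this commit.
func (c *exampleContext) setExponentialBackoffMaxInterval(intervalSeconds int64) error {
	if intervalSeconds < 2 {
		return fmt.Errorf("Minimal maximum interval is 2sec. Timeout remains at %d", c.ExponentialBackoffMaxInterval)
	}
	c.ExponentialBackoffMaxInterval = intervalSeconds
	return nil
}

func main() {
	c := &exampleContext{ExponentialBackoffMaxInterval: 64}
	fmt.Println(c.setExponentialBackoffMaxInterval(1))   // rejected: maximum stays at 64
	fmt.Println(c.setExponentialBackoffMaxInterval(120)) // accepted
	fmt.Println(c.ExponentialBackoffMaxInterval)         // 120
}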
@@ -82,6 +82,8 @@ func main() {
 
 	flag.BoolVar(&migrationContext.SwitchToRowBinlogFormat, "switch-to-rbr", false, "let this tool automatically switch binary log format to 'ROW' on the replica, if needed. The format will NOT be switched back. I'm too scared to do that, and wish to protect you if you happen to execute another migration while this one is running")
 	flag.BoolVar(&migrationContext.AssumeRBR, "assume-rbr", false, "set to 'true' when you know for certain your server uses 'ROW' binlog_format. gh-ost is unable to tell, event after reading binlog_format, whether the replication process does indeed use 'ROW', and restarts replication to be certain RBR setting is applied. Such operation requires SUPER privileges which you might not have. Setting this flag avoids restarting replication and you can proceed to use gh-ost without SUPER privileges")
+	flag.BoolVar(&migrationContext.CutOverExponentialBackoff, "cut-over-exponential-backoff", false, "Wait exponentially longer intervals between failed cut-over attempts. Wait intervals obey a maximum configurable with 'exponential-backoff-max-interval').")
+	exponentialBackoffMaxInterval := flag.Int64("exponential-backoff-max-interval", 64, "Maximum number of seconds to wait between attempts when performing various operations with exponential backoff.")
 	chunkSize := flag.Int64("chunk-size", 1000, "amount of rows to handle in each iteration (allowed range: 100-100,000)")
 	dmlBatchSize := flag.Int64("dml-batch-size", 10, "batch size for DML events to apply in a single transaction (range 1-100)")
 	defaultRetries := flag.Int64("default-retries", 60, "Default number of retries for various operations before panicking")
@@ -237,6 +239,9 @@ func main() {
 	if err := migrationContext.SetCutOverLockTimeoutSeconds(*cutOverLockTimeoutSeconds); err != nil {
 		log.Errore(err)
 	}
+	if err := migrationContext.SetExponentialBackoffMaxInterval(*exponentialBackoffMaxInterval); err != nil {
+		log.Errore(err)
+	}
 
 	log.Infof("starting gh-ost %+v", AppVersion)
 	acceptSignals(migrationContext)
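(Usage note, not part of the diff.) A run that opts into the new behavior passes both flags, e.g. --cut-over-exponential-backoff together with --exponential-backoff-max-interval=120; the latter defaults to 64 seconds and, per the setter above, must be at least 2. As with the other limits parsed here, an invalid value is reported via log.Errore and the previously stored maximum remains in effect.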
@@ -149,6 +149,34 @@ func (this *Migrator) retryOperation(operation func() error, notFatalHint ...boo
 	return err
 }
 
+// `retryOperationWithExponentialBackoff` attempts running given function, waiting 2^(n-1)
+// seconds between each attempt, where `n` is the running number of attempts. Exits
+// as soon as the function returns with non-error, or as soon as `MaxRetries`
+// attempts are reached. Wait intervals between attempts obey a maximum of
+// `ExponentialBackoffMaxInterval`.
+func (this *Migrator) retryOperationWithExponentialBackoff(operation func() error, notFatalHint ...bool) (err error) {
+	var interval int64
+	maxRetries := int(this.migrationContext.MaxRetries())
+	maxInterval := this.migrationContext.ExponentialBackoffMaxInterval
+	for i := 0; i < maxRetries; i++ {
+		newInterval := int64(math.Exp2(float64(i - 1)))
+		if newInterval <= maxInterval {
+			interval = newInterval
+		}
+		if i != 0 {
+			time.Sleep(time.Duration(interval) * time.Second)
+		}
+		err = operation()
+		if err == nil {
+			return nil
+		}
+	}
+	if len(notFatalHint) == 0 {
+		this.migrationContext.PanicAbort <- err
+	}
+	return err
+}
+
 // executeAndThrottleOnError executes a given function. If it errors, it
 // throttles.
 func (this *Migrator) executeAndThrottleOnError(operation func() error) (err error) {
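As a side note (not part of the diff), the wait schedule this loop produces is easy to see with a small standalone program: with the default 64-second cap, the sleeps before attempts 2, 3, 4, ... are 1, 2, 4, 8, 16, 32, 64, 64, ... seconds, because newInterval stops being adopted once it exceeds the cap.

package main

import (
	"fmt"
	"math"
)

func main() {
	// Reproduces the interval computation from retryOperationWithExponentialBackoff,
	// assuming 10 attempts and the default 64-second maximum.
	var interval int64
	maxInterval := int64(64)
	for i := 0; i < 10; i++ {
		newInterval := int64(math.Exp2(float64(i - 1)))
		if newInterval <= maxInterval {
			interval = newInterval
		}
		if i != 0 {
			fmt.Printf("before attempt %d: sleep %ds\n", i+1, interval)
		}
	}
	// Prints sleeps of 1, 2, 4, 8, 16, 32, 64, 64, 64 seconds.
}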
@@ -372,7 +400,13 @@ func (this *Migrator) Migrate() (err error) {
 	if err := this.hooksExecutor.onBeforeCutOver(); err != nil {
 		return err
 	}
-	if err := this.retryOperation(this.cutOver); err != nil {
+	var retrier func(func() error, ...bool) error
+	if this.migrationContext.CutOverExponentialBackoff {
+		retrier = this.retryOperationWithExponentialBackoff
+	} else {
+		retrier = this.retryOperation
+	}
+	if err := retrier(this.cutOver); err != nil {
 		return err
 	}
 	atomic.StoreInt64(&this.migrationContext.CutOverCompleteFlag, 1)
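(Commentary, not part of the diff.) Selecting the retry strategy through a retrier function value keeps the cut-over call site unchanged: the existing retryOperation remains the default, and the exponential variant is only used when the operator passes --cut-over-exponential-backoff.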