Merge pull request #14 from ccoffey/cathal/safer_cut_over
Cut-over should wait for heartbeat lag to be low enough to succeed
This commit is contained in:
commit
4a36e246c0
@ -66,6 +66,7 @@ The following variables are available on all hooks:
|
|||||||
- `GH_OST_ESTIMATED_ROWS` - estimated total rows in table
|
- `GH_OST_ESTIMATED_ROWS` - estimated total rows in table
|
||||||
- `GH_OST_COPIED_ROWS` - number of rows copied by `gh-ost`
|
- `GH_OST_COPIED_ROWS` - number of rows copied by `gh-ost`
|
||||||
- `GH_OST_INSPECTED_LAG` - lag in seconds (floating point) of inspected server
|
- `GH_OST_INSPECTED_LAG` - lag in seconds (floating point) of inspected server
|
||||||
|
- `GH_OST_HEARTBEAT_LAG` - lag in seconds (floating point) of heartbeat
|
||||||
- `GH_OST_PROGRESS` - progress pct ([0..100], floating point) of migration
|
- `GH_OST_PROGRESS` - progress pct ([0..100], floating point) of migration
|
||||||
- `GH_OST_MIGRATED_HOST`
|
- `GH_OST_MIGRATED_HOST`
|
||||||
- `GH_OST_INSPECTED_HOST`
|
- `GH_OST_INSPECTED_HOST`
|
||||||
|
@ -178,6 +178,8 @@ type MigrationContext struct {
|
|||||||
RenameTablesEndTime time.Time
|
RenameTablesEndTime time.Time
|
||||||
pointOfInterestTime time.Time
|
pointOfInterestTime time.Time
|
||||||
pointOfInterestTimeMutex *sync.Mutex
|
pointOfInterestTimeMutex *sync.Mutex
|
||||||
|
lastHeartbeatOnChangelogTime time.Time
|
||||||
|
lastHeartbeatOnChangelogMutex *sync.Mutex
|
||||||
CurrentLag int64
|
CurrentLag int64
|
||||||
currentProgress uint64
|
currentProgress uint64
|
||||||
ThrottleHTTPStatusCode int64
|
ThrottleHTTPStatusCode int64
|
||||||
@ -272,6 +274,7 @@ func NewMigrationContext() *MigrationContext {
|
|||||||
throttleControlReplicaKeys: mysql.NewInstanceKeyMap(),
|
throttleControlReplicaKeys: mysql.NewInstanceKeyMap(),
|
||||||
configMutex: &sync.Mutex{},
|
configMutex: &sync.Mutex{},
|
||||||
pointOfInterestTimeMutex: &sync.Mutex{},
|
pointOfInterestTimeMutex: &sync.Mutex{},
|
||||||
|
lastHeartbeatOnChangelogMutex: &sync.Mutex{},
|
||||||
ColumnRenameMap: make(map[string]string),
|
ColumnRenameMap: make(map[string]string),
|
||||||
PanicAbort: make(chan error),
|
PanicAbort: make(chan error),
|
||||||
Log: NewDefaultLogger(),
|
Log: NewDefaultLogger(),
|
||||||
@ -455,6 +458,10 @@ func (this *MigrationContext) MarkRowCopyEndTime() {
|
|||||||
this.RowCopyEndTime = time.Now()
|
this.RowCopyEndTime = time.Now()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (this *MigrationContext) TimeSinceLastHeartbeatOnChangelog() time.Duration {
|
||||||
|
return time.Since(this.GetLastHeartbeatOnChangelogTime())
|
||||||
|
}
|
||||||
|
|
||||||
func (this *MigrationContext) GetCurrentLagDuration() time.Duration {
|
func (this *MigrationContext) GetCurrentLagDuration() time.Duration {
|
||||||
return time.Duration(atomic.LoadInt64(&this.CurrentLag))
|
return time.Duration(atomic.LoadInt64(&this.CurrentLag))
|
||||||
}
|
}
|
||||||
@ -494,6 +501,20 @@ func (this *MigrationContext) TimeSincePointOfInterest() time.Duration {
|
|||||||
return time.Since(this.pointOfInterestTime)
|
return time.Since(this.pointOfInterestTime)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (this *MigrationContext) SetLastHeartbeatOnChangelogTime(t time.Time) {
|
||||||
|
this.lastHeartbeatOnChangelogMutex.Lock()
|
||||||
|
defer this.lastHeartbeatOnChangelogMutex.Unlock()
|
||||||
|
|
||||||
|
this.lastHeartbeatOnChangelogTime = t
|
||||||
|
}
|
||||||
|
|
||||||
|
func (this *MigrationContext) GetLastHeartbeatOnChangelogTime() time.Time {
|
||||||
|
this.lastHeartbeatOnChangelogMutex.Lock()
|
||||||
|
defer this.lastHeartbeatOnChangelogMutex.Unlock()
|
||||||
|
|
||||||
|
return this.lastHeartbeatOnChangelogTime
|
||||||
|
}
|
||||||
|
|
||||||
func (this *MigrationContext) SetHeartbeatIntervalMilliseconds(heartbeatIntervalMilliseconds int64) {
|
func (this *MigrationContext) SetHeartbeatIntervalMilliseconds(heartbeatIntervalMilliseconds int64) {
|
||||||
if heartbeatIntervalMilliseconds < 100 {
|
if heartbeatIntervalMilliseconds < 100 {
|
||||||
heartbeatIntervalMilliseconds = 100
|
heartbeatIntervalMilliseconds = 100
|
||||||
|
@ -64,6 +64,7 @@ func (this *HooksExecutor) applyEnvironmentVariables(extraVariables ...string) [
|
|||||||
env = append(env, fmt.Sprintf("GH_OST_INSPECTED_HOST=%s", this.migrationContext.GetInspectorHostname()))
|
env = append(env, fmt.Sprintf("GH_OST_INSPECTED_HOST=%s", this.migrationContext.GetInspectorHostname()))
|
||||||
env = append(env, fmt.Sprintf("GH_OST_EXECUTING_HOST=%s", this.migrationContext.Hostname))
|
env = append(env, fmt.Sprintf("GH_OST_EXECUTING_HOST=%s", this.migrationContext.Hostname))
|
||||||
env = append(env, fmt.Sprintf("GH_OST_INSPECTED_LAG=%f", this.migrationContext.GetCurrentLagDuration().Seconds()))
|
env = append(env, fmt.Sprintf("GH_OST_INSPECTED_LAG=%f", this.migrationContext.GetCurrentLagDuration().Seconds()))
|
||||||
|
env = append(env, fmt.Sprintf("GH_OST_HEARTBEAT_LAG=%f", this.migrationContext.TimeSinceLastHeartbeatOnChangelog().Seconds()))
|
||||||
env = append(env, fmt.Sprintf("GH_OST_PROGRESS=%f", this.migrationContext.GetProgressPct()))
|
env = append(env, fmt.Sprintf("GH_OST_PROGRESS=%f", this.migrationContext.GetProgressPct()))
|
||||||
env = append(env, fmt.Sprintf("GH_OST_HOOKS_HINT=%s", this.migrationContext.HooksHintMessage))
|
env = append(env, fmt.Sprintf("GH_OST_HOOKS_HINT=%s", this.migrationContext.HooksHintMessage))
|
||||||
env = append(env, fmt.Sprintf("GH_OST_HOOKS_HINT_OWNER=%s", this.migrationContext.HooksHintOwner))
|
env = append(env, fmt.Sprintf("GH_OST_HOOKS_HINT_OWNER=%s", this.migrationContext.HooksHintOwner))
|
||||||
|
@ -207,12 +207,20 @@ func (this *Migrator) canStopStreaming() bool {
|
|||||||
return atomic.LoadInt64(&this.migrationContext.CutOverCompleteFlag) != 0
|
return atomic.LoadInt64(&this.migrationContext.CutOverCompleteFlag) != 0
|
||||||
}
|
}
|
||||||
|
|
||||||
// onChangelogStateEvent is called when a binlog event operation on the changelog table is intercepted.
|
// onChangelogEvent is called when a binlog event operation on the changelog table is intercepted.
|
||||||
func (this *Migrator) onChangelogStateEvent(dmlEvent *binlog.BinlogDMLEvent) (err error) {
|
func (this *Migrator) onChangelogEvent(dmlEvent *binlog.BinlogDMLEvent) (err error) {
|
||||||
// Hey, I created the changelog table, I know the type of columns it has!
|
// Hey, I created the changelog table, I know the type of columns it has!
|
||||||
if hint := dmlEvent.NewColumnValues.StringColumn(2); hint != "state" {
|
switch hint := dmlEvent.NewColumnValues.StringColumn(2); hint {
|
||||||
|
case "state":
|
||||||
|
return this.onChangelogStateEvent(dmlEvent)
|
||||||
|
case "heartbeat":
|
||||||
|
return this.onChangelogHeartbeatEvent(dmlEvent)
|
||||||
|
default:
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (this *Migrator) onChangelogStateEvent(dmlEvent *binlog.BinlogDMLEvent) (err error) {
|
||||||
changelogStateString := dmlEvent.NewColumnValues.StringColumn(3)
|
changelogStateString := dmlEvent.NewColumnValues.StringColumn(3)
|
||||||
changelogState := ReadChangelogState(changelogStateString)
|
changelogState := ReadChangelogState(changelogStateString)
|
||||||
this.migrationContext.Log.Infof("Intercepted changelog state %s", changelogState)
|
this.migrationContext.Log.Infof("Intercepted changelog state %s", changelogState)
|
||||||
@ -245,6 +253,18 @@ func (this *Migrator) onChangelogStateEvent(dmlEvent *binlog.BinlogDMLEvent) (er
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (this *Migrator) onChangelogHeartbeatEvent(dmlEvent *binlog.BinlogDMLEvent) (err error) {
|
||||||
|
changelogHeartbeatString := dmlEvent.NewColumnValues.StringColumn(3)
|
||||||
|
|
||||||
|
heartbeatTime, err := time.Parse(time.RFC3339Nano, changelogHeartbeatString)
|
||||||
|
if err != nil {
|
||||||
|
return this.migrationContext.Log.Errore(err)
|
||||||
|
} else {
|
||||||
|
this.migrationContext.SetLastHeartbeatOnChangelogTime(heartbeatTime)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// listenOnPanicAbort aborts on abort request
|
// listenOnPanicAbort aborts on abort request
|
||||||
func (this *Migrator) listenOnPanicAbort() {
|
func (this *Migrator) listenOnPanicAbort() {
|
||||||
err := <-this.migrationContext.PanicAbort
|
err := <-this.migrationContext.PanicAbort
|
||||||
@ -476,6 +496,13 @@ func (this *Migrator) cutOver() (err error) {
|
|||||||
this.migrationContext.Log.Debugf("checking for cut-over postpone")
|
this.migrationContext.Log.Debugf("checking for cut-over postpone")
|
||||||
this.sleepWhileTrue(
|
this.sleepWhileTrue(
|
||||||
func() (bool, error) {
|
func() (bool, error) {
|
||||||
|
heartbeatLag := this.migrationContext.TimeSinceLastHeartbeatOnChangelog()
|
||||||
|
maxLagMillisecondsThrottle := time.Duration(atomic.LoadInt64(&this.migrationContext.MaxLagMillisecondsThrottleThreshold)) * time.Millisecond
|
||||||
|
cutOverLockTimeout := time.Duration(this.migrationContext.CutOverLockTimeoutSeconds) * time.Second
|
||||||
|
if heartbeatLag > maxLagMillisecondsThrottle || heartbeatLag > cutOverLockTimeout {
|
||||||
|
this.migrationContext.Log.Debugf("current HeartbeatLag (%.2fs) is too high, it needs to be less than both --max-lag-millis (%.2fs) and --cut-over-lock-timeout-seconds (%.2fs) to continue", heartbeatLag.Seconds(), maxLagMillisecondsThrottle.Seconds(), cutOverLockTimeout.Seconds())
|
||||||
|
return true, nil
|
||||||
|
}
|
||||||
if this.migrationContext.PostponeCutOverFlagFile == "" {
|
if this.migrationContext.PostponeCutOverFlagFile == "" {
|
||||||
return false, nil
|
return false, nil
|
||||||
}
|
}
|
||||||
@ -962,13 +989,14 @@ func (this *Migrator) printStatus(rule PrintStatusRule, writers ...io.Writer) {
|
|||||||
|
|
||||||
currentBinlogCoordinates := *this.eventsStreamer.GetCurrentBinlogCoordinates()
|
currentBinlogCoordinates := *this.eventsStreamer.GetCurrentBinlogCoordinates()
|
||||||
|
|
||||||
status := fmt.Sprintf("Copy: %d/%d %.1f%%; Applied: %d; Backlog: %d/%d; Time: %+v(total), %+v(copy); streamer: %+v; Lag: %.2fs, State: %s; ETA: %s",
|
status := fmt.Sprintf("Copy: %d/%d %.1f%%; Applied: %d; Backlog: %d/%d; Time: %+v(total), %+v(copy); streamer: %+v; Lag: %.2fs, HeartbeatLag: %.2fs, State: %s; ETA: %s",
|
||||||
totalRowsCopied, rowsEstimate, progressPct,
|
totalRowsCopied, rowsEstimate, progressPct,
|
||||||
atomic.LoadInt64(&this.migrationContext.TotalDMLEventsApplied),
|
atomic.LoadInt64(&this.migrationContext.TotalDMLEventsApplied),
|
||||||
len(this.applyEventsQueue), cap(this.applyEventsQueue),
|
len(this.applyEventsQueue), cap(this.applyEventsQueue),
|
||||||
base.PrettifyDurationOutput(elapsedTime), base.PrettifyDurationOutput(this.migrationContext.ElapsedRowCopyTime()),
|
base.PrettifyDurationOutput(elapsedTime), base.PrettifyDurationOutput(this.migrationContext.ElapsedRowCopyTime()),
|
||||||
currentBinlogCoordinates,
|
currentBinlogCoordinates,
|
||||||
this.migrationContext.GetCurrentLagDuration().Seconds(),
|
this.migrationContext.GetCurrentLagDuration().Seconds(),
|
||||||
|
this.migrationContext.TimeSinceLastHeartbeatOnChangelog().Seconds(),
|
||||||
state,
|
state,
|
||||||
eta,
|
eta,
|
||||||
)
|
)
|
||||||
@ -995,7 +1023,7 @@ func (this *Migrator) initiateStreaming() error {
|
|||||||
this.migrationContext.DatabaseName,
|
this.migrationContext.DatabaseName,
|
||||||
this.migrationContext.GetChangelogTableName(),
|
this.migrationContext.GetChangelogTableName(),
|
||||||
func(dmlEvent *binlog.BinlogDMLEvent) error {
|
func(dmlEvent *binlog.BinlogDMLEvent) error {
|
||||||
return this.onChangelogStateEvent(dmlEvent)
|
return this.onChangelogEvent(dmlEvent)
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user