Merge pull request #370 from github/throttle-race-condition
Migration only starting after first replication lag metric collected
This commit is contained in:
commit
315072690b
@ -940,7 +940,9 @@ func (this *Migrator) initiateThrottler() error {
|
|||||||
|
|
||||||
go this.throttler.initiateThrottlerCollection(this.firstThrottlingCollected)
|
go this.throttler.initiateThrottlerCollection(this.firstThrottlingCollected)
|
||||||
log.Infof("Waiting for first throttle metrics to be collected")
|
log.Infof("Waiting for first throttle metrics to be collected")
|
||||||
<-this.firstThrottlingCollected
|
<-this.firstThrottlingCollected // replication lag
|
||||||
|
<-this.firstThrottlingCollected // other metrics
|
||||||
|
log.Infof("First throttle metrics collected")
|
||||||
go this.throttler.initiateThrottlerChecks()
|
go this.throttler.initiateThrottlerChecks()
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
|
@ -85,32 +85,37 @@ func (this *Throttler) parseChangelogHeartbeat(heartbeatValue string) (err error
|
|||||||
}
|
}
|
||||||
|
|
||||||
// collectReplicationLag reads the latest changelog heartbeat value
|
// collectReplicationLag reads the latest changelog heartbeat value
|
||||||
func (this *Throttler) collectReplicationLag() {
|
func (this *Throttler) collectReplicationLag(firstThrottlingCollected chan<- bool) {
|
||||||
|
collectFunc := func() error {
|
||||||
|
if atomic.LoadInt64(&this.migrationContext.CleanupImminentFlag) > 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
if this.migrationContext.TestOnReplica || this.migrationContext.MigrateOnReplica {
|
||||||
|
// when running on replica, the heartbeat injection is also done on the replica.
|
||||||
|
// This means we will always get a good heartbeat value.
|
||||||
|
// When running on replica, we should instead check the `SHOW SLAVE STATUS` output.
|
||||||
|
if lag, err := mysql.GetReplicationLag(this.inspector.connectionConfig); err != nil {
|
||||||
|
return log.Errore(err)
|
||||||
|
} else {
|
||||||
|
atomic.StoreInt64(&this.migrationContext.CurrentLag, int64(lag))
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if heartbeatValue, err := this.inspector.readChangelogState("heartbeat"); err != nil {
|
||||||
|
return log.Errore(err)
|
||||||
|
} else {
|
||||||
|
this.parseChangelogHeartbeat(heartbeatValue)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
collectFunc()
|
||||||
|
firstThrottlingCollected <- true
|
||||||
|
|
||||||
ticker := time.Tick(time.Duration(this.migrationContext.HeartbeatIntervalMilliseconds) * time.Millisecond)
|
ticker := time.Tick(time.Duration(this.migrationContext.HeartbeatIntervalMilliseconds) * time.Millisecond)
|
||||||
for range ticker {
|
for range ticker {
|
||||||
go func() error {
|
go collectFunc()
|
||||||
if atomic.LoadInt64(&this.migrationContext.CleanupImminentFlag) > 0 {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
if this.migrationContext.TestOnReplica || this.migrationContext.MigrateOnReplica {
|
|
||||||
// when running on replica, the heartbeat injection is also done on the replica.
|
|
||||||
// This means we will always get a good heartbeat value.
|
|
||||||
// When running on replica, we should instead check the `SHOW SLAVE STATUS` output.
|
|
||||||
if lag, err := mysql.GetReplicationLag(this.inspector.connectionConfig); err != nil {
|
|
||||||
return log.Errore(err)
|
|
||||||
} else {
|
|
||||||
atomic.StoreInt64(&this.migrationContext.CurrentLag, int64(lag))
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if heartbeatValue, err := this.inspector.readChangelogState("heartbeat"); err != nil {
|
|
||||||
return log.Errore(err)
|
|
||||||
} else {
|
|
||||||
this.parseChangelogHeartbeat(heartbeatValue)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}()
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -285,13 +290,14 @@ func (this *Throttler) collectGeneralThrottleMetrics() error {
|
|||||||
// that may affect throttling. There are several components, all running independently,
|
// that may affect throttling. There are several components, all running independently,
|
||||||
// that collect such metrics.
|
// that collect such metrics.
|
||||||
func (this *Throttler) initiateThrottlerCollection(firstThrottlingCollected chan<- bool) {
|
func (this *Throttler) initiateThrottlerCollection(firstThrottlingCollected chan<- bool) {
|
||||||
go this.collectReplicationLag()
|
go this.collectReplicationLag(firstThrottlingCollected)
|
||||||
go this.collectControlReplicasLag()
|
go this.collectControlReplicasLag()
|
||||||
|
|
||||||
go func() {
|
go func() {
|
||||||
throttlerMetricsTick := time.Tick(1 * time.Second)
|
|
||||||
this.collectGeneralThrottleMetrics()
|
this.collectGeneralThrottleMetrics()
|
||||||
firstThrottlingCollected <- true
|
firstThrottlingCollected <- true
|
||||||
|
|
||||||
|
throttlerMetricsTick := time.Tick(1 * time.Second)
|
||||||
for range throttlerMetricsTick {
|
for range throttlerMetricsTick {
|
||||||
this.collectGeneralThrottleMetrics()
|
this.collectGeneralThrottleMetrics()
|
||||||
}
|
}
|
||||||
|
@ -32,9 +32,11 @@ func GetReplicationLag(connectionConfig *ConnectionConfig) (replicationLag time.
|
|||||||
}
|
}
|
||||||
|
|
||||||
err = sqlutils.QueryRowsMap(db, `show slave status`, func(m sqlutils.RowMap) error {
|
err = sqlutils.QueryRowsMap(db, `show slave status`, func(m sqlutils.RowMap) error {
|
||||||
|
slaveIORunning := m.GetString("Slave_IO_Running")
|
||||||
|
slaveSQLRunning := m.GetString("Slave_SQL_Running")
|
||||||
secondsBehindMaster := m.GetNullInt64("Seconds_Behind_Master")
|
secondsBehindMaster := m.GetNullInt64("Seconds_Behind_Master")
|
||||||
if !secondsBehindMaster.Valid {
|
if !secondsBehindMaster.Valid {
|
||||||
return fmt.Errorf("replication not running")
|
return fmt.Errorf("replication not running; Slave_IO_Running=%+v, Slave_SQL_Running=%+v", slaveIORunning, slaveSQLRunning)
|
||||||
}
|
}
|
||||||
replicationLag = time.Duration(secondsBehindMaster.Int64) * time.Second
|
replicationLag = time.Duration(secondsBehindMaster.Int64) * time.Second
|
||||||
return nil
|
return nil
|
||||||
|
Loading…
Reference in New Issue
Block a user