From 779e9fdd8302dc4f2e8ae94b5bc31d9522002a8b Mon Sep 17 00:00:00 2001 From: Shlomi Noach Date: Sun, 29 Jan 2017 09:25:29 +0200 Subject: [PATCH 1/7] question (?) as argument in interactive commands --- go/logic/server.go | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/go/logic/server.go b/go/logic/server.go index f616f60..b1246c2 100644 --- a/go/logic/server.go +++ b/go/logic/server.go @@ -126,7 +126,7 @@ func (this *Server) applyServerCommand(command string, writer *bufio.Writer) (pr if len(tokens) > 1 { arg = strings.TrimSpace(tokens[1]) } - + argIsQuestion := (arg == "?") throttleHint := "# Note: you may only throttle for as long as your binary logs are not purged\n" if err := this.hooksExecutor.onInteractiveCommand(command); err != nil { @@ -152,6 +152,7 @@ no-throttle # End forced throttling (other throttling m unpostpone # Bail out a cut-over postpone; proceed to cut-over panic # panic and quit without cleanup help # This message +- use '?' (question mark) as argument to get info rather than set. e.g. "max-load=?" will just print out current max-load. `) } case "sup": @@ -160,6 +161,10 @@ help # This message return ForcePrintStatusAndHintRule, nil case "chunk-size": { + if argIsQuestion { + fmt.Fprintf(writer, "%+v\n", atomic.LoadInt64(&this.migrationContext.ChunkSize)) + return NoPrintStatusRule, nil + } if chunkSize, err := strconv.Atoi(arg); err != nil { return NoPrintStatusRule, err } else { @@ -169,6 +174,10 @@ help # This message } case "max-lag-millis": { + if argIsQuestion { + fmt.Fprintf(writer, "%+v\n", atomic.LoadInt64(&this.migrationContext.MaxLagMillisecondsThrottleThreshold)) + return NoPrintStatusRule, nil + } if maxLagMillis, err := strconv.Atoi(arg); err != nil { return NoPrintStatusRule, err } else { @@ -182,6 +191,10 @@ help # This message } case "nice-ratio": { + if argIsQuestion { + fmt.Fprintf(writer, "%+v\n", this.migrationContext.GetNiceRatio()) + return NoPrintStatusRule, nil + } if niceRatio, err := strconv.ParseFloat(arg, 64); err != nil { return NoPrintStatusRule, err } else { @@ -191,6 +204,11 @@ help # This message } case "max-load": { + if argIsQuestion { + maxLoad := this.migrationContext.GetMaxLoad() + fmt.Fprintf(writer, "%s\n", maxLoad.String()) + return NoPrintStatusRule, nil + } if err := this.migrationContext.ReadMaxLoad(arg); err != nil { return NoPrintStatusRule, err } @@ -198,6 +216,11 @@ help # This message } case "critical-load": { + if argIsQuestion { + criticalLoad := this.migrationContext.GetCriticalLoad() + fmt.Fprintf(writer, "%s\n", criticalLoad.String()) + return NoPrintStatusRule, nil + } if err := this.migrationContext.ReadCriticalLoad(arg); err != nil { return NoPrintStatusRule, err } @@ -205,12 +228,20 @@ help # This message } case "throttle-query": { + if argIsQuestion { + fmt.Fprintf(writer, "%+v\n", this.migrationContext.GetThrottleQuery()) + return NoPrintStatusRule, nil + } this.migrationContext.SetThrottleQuery(arg) fmt.Fprintf(writer, throttleHint) return ForcePrintStatusAndHintRule, nil } case "throttle-control-replicas": { + if argIsQuestion { + fmt.Fprintf(writer, "%s\n", this.migrationContext.GetThrottleControlReplicaKeys().ToCommaDelimitedList()) + return NoPrintStatusRule, nil + } if err := this.migrationContext.ReadThrottleControlReplicaKeys(arg); err != nil { return NoPrintStatusRule, err } From 599f4323a2aceafadacb14f7b56c0324199c15f1 Mon Sep 17 00:00:00 2001 From: Shlomi Noach Date: Sun, 29 Jan 2017 09:25:41 +0200 Subject: [PATCH 2/7] documenting the question argument --- doc/interactive-commands.md | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/doc/interactive-commands.md b/doc/interactive-commands.md index 84c887d..c6398c5 100644 --- a/doc/interactive-commands.md +++ b/doc/interactive-commands.md @@ -26,9 +26,9 @@ Both interfaces may serve at the same time. Both respond to simple text command, - The `critical-load` format must be: `some_status=[,some_status=...]`' - For example: `Threads_running=1000,threads_connected=5000`, and you would then write/echo `critical-load=Threads_running=1000,threads_connected=5000` to the socket. - `nice-ratio=`: change _nice_ ratio: 0 for aggressive (not nice, not sleeping), positive integer `n`: - - For any `1ms` spent copying rows, spend `n*1ms` units of time sleeping. - - Examples: assume a single rows chunk copy takes `100ms` to complete. - - `nice-ratio=0.5` will cause `gh-ost` to sleep for `50ms` immediately following. + - For any `1ms` spent copying rows, spend `n*1ms` units of time sleeping. + - Examples: assume a single rows chunk copy takes `100ms` to complete. + - `nice-ratio=0.5` will cause `gh-ost` to sleep for `50ms` immediately following. - `nice-ratio=1` will cause `gh-ost` to sleep for `100ms`, effectively doubling runtime - value of `2` will effectively triple the runtime; etc. - `throttle-query`: change throttle query @@ -38,6 +38,10 @@ Both interfaces may serve at the same time. Both respond to simple text command, - `unpostpone`: at a time where `gh-ost` is postponing the [cut-over](cut-over.md) phase, instruct `gh-ost` to stop postponing and proceed immediately to cut-over. - `panic`: immediately panic and abort operation +### Querying for data + +For commands that accept an argumetn as value, pass `?` (question mark) to _get_ current value rather than _set_ a new one. + ### Examples While migration is running: @@ -63,6 +67,11 @@ $ echo "chunk-size=250" | nc -U /tmp/gh-ost.test.sample_data_0.sock # Serving on TCP port: 10001 ``` +```shell +$ echo "chunk-size=?" | nc -U /tmp/gh-ost.test.sample_data_0.sock +250 +``` + ```shell $ echo throttle | nc -U /tmp/gh-ost.test.sample_data_0.sock From 8219209c8d30540a32a007c11200a91ef6ce4319 Mon Sep 17 00:00:00 2001 From: Shlomi Noach Date: Sun, 29 Jan 2017 09:25:58 +0200 Subject: [PATCH 3/7] tests accept postpone flag file --- localtests/test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/localtests/test.sh b/localtests/test.sh index c0b97a5..8bbcf6f 100755 --- a/localtests/test.sh +++ b/localtests/test.sh @@ -88,7 +88,7 @@ test_single() { --throttle-query='select timestampdiff(second, min(last_update), now()) < 5 from _gh_ost_test_ghc' \ --serve-socket-file=/tmp/gh-ost.test.sock \ --initially-drop-socket-file \ - --postpone-cut-over-flag-file="" \ + --postpone-cut-over-flag-file=/tmp/gh-ost.test.postpone.flag \ --test-on-replica \ --default-retries=1 \ --verbose \ From be1ab175c77df6cea97052130d699a5b96ee60cd Mon Sep 17 00:00:00 2001 From: Shlomi Noach Date: Sun, 29 Jan 2017 09:56:25 +0200 Subject: [PATCH 4/7] status presents with '# throttle-control-replicas count:' --- go/logic/migrator.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/go/logic/migrator.go b/go/logic/migrator.go index 61db6db..05ab99a 100644 --- a/go/logic/migrator.go +++ b/go/logic/migrator.go @@ -761,6 +761,12 @@ func (this *Migrator) printMigrationStatusHint(writers ...io.Writer) { throttleQuery, )) } + if throttleControlReplicaKeys := this.migrationContext.GetThrottleControlReplicaKeys(); throttleControlReplicaKeys.Len() > 0 { + fmt.Fprintln(w, fmt.Sprintf("# throttle-control-replicas count: %+v", + throttleControlReplicaKeys.Len(), + )) + } + if this.migrationContext.PostponeCutOverFlagFile != "" { setIndicator := "" if base.FileExists(this.migrationContext.PostponeCutOverFlagFile) { From baee4f69f90f2c55e68a45166934ca9e768ce573 Mon Sep 17 00:00:00 2001 From: Shlomi Noach Date: Sun, 29 Jan 2017 10:18:39 +0200 Subject: [PATCH 5/7] fixing phantom throttle-control-replicas lag result --- go/base/context.go | 6 +++++- go/logic/throttler.go | 4 +--- go/mysql/utils.go | 8 ++++++++ 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/go/base/context.go b/go/base/context.go index 6142f9c..7b196af 100644 --- a/go/base/context.go +++ b/go/base/context.go @@ -559,7 +559,11 @@ func (this *MigrationContext) GetControlReplicasLagResult() mysql.ReplicationLag func (this *MigrationContext) SetControlReplicasLagResult(lagResult *mysql.ReplicationLagResult) { this.throttleMutex.Lock() defer this.throttleMutex.Unlock() - this.controlReplicasLagResult = *lagResult + if lagResult == nil { + this.controlReplicasLagResult = *mysql.NewNoReplicationLagResult() + } else { + this.controlReplicasLagResult = *lagResult + } } func (this *MigrationContext) GetThrottleControlReplicaKeys() *mysql.InstanceKeyMap { diff --git a/go/logic/throttler.go b/go/logic/throttler.go index 1635b5e..beb4f47 100644 --- a/go/logic/throttler.go +++ b/go/logic/throttler.go @@ -158,9 +158,7 @@ func (this *Throttler) collectControlReplicasLag() { // No need to read lag return } - if result := readControlReplicasLag(); result != nil { - this.migrationContext.SetControlReplicasLagResult(result) - } + this.migrationContext.SetControlReplicasLagResult(readControlReplicasLag()) } aggressiveTicker := time.Tick(100 * time.Millisecond) relaxedFactor := 10 diff --git a/go/mysql/utils.go b/go/mysql/utils.go index bfd3c24..fb70dc9 100644 --- a/go/mysql/utils.go +++ b/go/mysql/utils.go @@ -22,6 +22,14 @@ type ReplicationLagResult struct { Err error } +func NewNoReplicationLagResult() *ReplicationLagResult { + return &ReplicationLagResult{Lag: 0, Err: nil} +} + +func (this *ReplicationLagResult) HasLag() bool { + return this.Lag > 0 +} + // GetReplicationLag returns replication lag for a given connection config; either by explicit query // or via SHOW SLAVE STATUS func GetReplicationLag(connectionConfig *ConnectionConfig) (replicationLag time.Duration, err error) { From 10edf3c063691c320df4bb2b0d7cddd2a7480576 Mon Sep 17 00:00:00 2001 From: Shlomi Noach Date: Tue, 7 Feb 2017 12:13:19 +0200 Subject: [PATCH 6/7] Migration only starting after first replication lag metric collected --- go/logic/migrator.go | 4 ++- go/logic/throttler.go | 58 ++++++++++++++++++++++++------------------- go/mysql/utils.go | 4 ++- 3 files changed, 38 insertions(+), 28 deletions(-) diff --git a/go/logic/migrator.go b/go/logic/migrator.go index 61db6db..5880c99 100644 --- a/go/logic/migrator.go +++ b/go/logic/migrator.go @@ -940,7 +940,9 @@ func (this *Migrator) initiateThrottler() error { go this.throttler.initiateThrottlerCollection(this.firstThrottlingCollected) log.Infof("Waiting for first throttle metrics to be collected") - <-this.firstThrottlingCollected + <-this.firstThrottlingCollected // replication lag + <-this.firstThrottlingCollected // other metrics + log.Infof("First throttle metrics collected") go this.throttler.initiateThrottlerChecks() return nil diff --git a/go/logic/throttler.go b/go/logic/throttler.go index 41ece40..2430c13 100644 --- a/go/logic/throttler.go +++ b/go/logic/throttler.go @@ -85,32 +85,37 @@ func (this *Throttler) parseChangelogHeartbeat(heartbeatValue string) (err error } // collectReplicationLag reads the latest changelog heartbeat value -func (this *Throttler) collectReplicationLag() { +func (this *Throttler) collectReplicationLag(firstThrottlingCollected chan<- bool) { + collectFunc := func() error { + if atomic.LoadInt64(&this.migrationContext.CleanupImminentFlag) > 0 { + return nil + } + + if this.migrationContext.TestOnReplica || this.migrationContext.MigrateOnReplica { + // when running on replica, the heartbeat injection is also done on the replica. + // This means we will always get a good heartbeat value. + // When runnign on replica, we should instead check the `SHOW SLAVE STATUS` output. + if lag, err := mysql.GetReplicationLag(this.inspector.connectionConfig); err != nil { + return log.Errore(err) + } else { + atomic.StoreInt64(&this.migrationContext.CurrentLag, int64(lag)) + } + } else { + if heartbeatValue, err := this.inspector.readChangelogState("heartbeat"); err != nil { + return log.Errore(err) + } else { + this.parseChangelogHeartbeat(heartbeatValue) + } + } + return nil + } + + collectFunc() + firstThrottlingCollected <- true + ticker := time.Tick(time.Duration(this.migrationContext.HeartbeatIntervalMilliseconds) * time.Millisecond) for range ticker { - go func() error { - if atomic.LoadInt64(&this.migrationContext.CleanupImminentFlag) > 0 { - return nil - } - - if this.migrationContext.TestOnReplica || this.migrationContext.MigrateOnReplica { - // when running on replica, the heartbeat injection is also done on the replica. - // This means we will always get a good heartbeat value. - // When runnign on replica, we should instead check the `SHOW SLAVE STATUS` output. - if lag, err := mysql.GetReplicationLag(this.inspector.connectionConfig); err != nil { - return log.Errore(err) - } else { - atomic.StoreInt64(&this.migrationContext.CurrentLag, int64(lag)) - } - } else { - if heartbeatValue, err := this.inspector.readChangelogState("heartbeat"); err != nil { - return log.Errore(err) - } else { - this.parseChangelogHeartbeat(heartbeatValue) - } - } - return nil - }() + go collectFunc() } } @@ -285,13 +290,14 @@ func (this *Throttler) collectGeneralThrottleMetrics() error { // that may affect throttling. There are several components, all running independently, // that collect such metrics. func (this *Throttler) initiateThrottlerCollection(firstThrottlingCollected chan<- bool) { - go this.collectReplicationLag() + go this.collectReplicationLag(firstThrottlingCollected) go this.collectControlReplicasLag() go func() { - throttlerMetricsTick := time.Tick(1 * time.Second) this.collectGeneralThrottleMetrics() firstThrottlingCollected <- true + + throttlerMetricsTick := time.Tick(1 * time.Second) for range throttlerMetricsTick { this.collectGeneralThrottleMetrics() } diff --git a/go/mysql/utils.go b/go/mysql/utils.go index bfd3c24..80bcf1b 100644 --- a/go/mysql/utils.go +++ b/go/mysql/utils.go @@ -32,9 +32,11 @@ func GetReplicationLag(connectionConfig *ConnectionConfig) (replicationLag time. } err = sqlutils.QueryRowsMap(db, `show slave status`, func(m sqlutils.RowMap) error { + slaveIORunning := m.GetString("Slave_IO_Running") + slaveSQLRunning := m.GetString("Slave_SQL_Running") secondsBehindMaster := m.GetNullInt64("Seconds_Behind_Master") if !secondsBehindMaster.Valid { - return fmt.Errorf("replication not running") + return fmt.Errorf("replication not running; Slave_IO_Running=%+v, Slave_SQL_Running=", slaveIORunning, slaveSQLRunning) } replicationLag = time.Duration(secondsBehindMaster.Int64) * time.Second return nil From 57409c21986dce4ed6c026ddec60470d84ecc247 Mon Sep 17 00:00:00 2001 From: Shlomi Noach Date: Wed, 8 Feb 2017 12:24:44 +0200 Subject: [PATCH 7/7] added mising slaveSQLRunning value --- go/mysql/utils.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/mysql/utils.go b/go/mysql/utils.go index 80bcf1b..cb3ae36 100644 --- a/go/mysql/utils.go +++ b/go/mysql/utils.go @@ -36,7 +36,7 @@ func GetReplicationLag(connectionConfig *ConnectionConfig) (replicationLag time. slaveSQLRunning := m.GetString("Slave_SQL_Running") secondsBehindMaster := m.GetNullInt64("Seconds_Behind_Master") if !secondsBehindMaster.Valid { - return fmt.Errorf("replication not running; Slave_IO_Running=%+v, Slave_SQL_Running=", slaveIORunning, slaveSQLRunning) + return fmt.Errorf("replication not running; Slave_IO_Running=%+v, Slave_SQL_Running=%+v", slaveIORunning, slaveSQLRunning) } replicationLag = time.Duration(secondsBehindMaster.Int64) * time.Second return nil