From 78698899880b45c6e93a715b190317b55d07eabc Mon Sep 17 00:00:00 2001 From: Shlomi Noach <shlomi-noach@github.com> Date: Sun, 6 Oct 2019 17:08:35 +0300 Subject: [PATCH 1/8] context, status and hooks: progressPct and CurrentLag --- go/base/context.go | 5 +++++ go/logic/hooks.go | 3 +++ go/logic/migrator.go | 7 +++++-- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/go/base/context.go b/go/base/context.go index f12cd43..84e6264 100644 --- a/go/base/context.go +++ b/go/base/context.go @@ -174,6 +174,7 @@ type MigrationContext struct { pointOfInterestTime time.Time pointOfInterestTimeMutex *sync.Mutex CurrentLag int64 + CurrentProgress uint64 // math.Float64bits([f=0..100]) ThrottleHTTPStatusCode int64 controlReplicasLagResult mysql.ReplicationLagResult TotalRowsCopied int64 @@ -428,6 +429,10 @@ func (this *MigrationContext) MarkRowCopyEndTime() { this.RowCopyEndTime = time.Now() } +func (this *MigrationContext) GetCurrentLagDuration() time.Duration { + return time.Duration(atomic.LoadInt64(&this.CurrentLag)) +} + // GetTotalRowsCopied returns the accurate number of rows being copied (affected) // This is not exactly the same as the rows being iterated via chunks, but potentially close enough func (this *MigrationContext) GetTotalRowsCopied() int64 { diff --git a/go/logic/hooks.go b/go/logic/hooks.go index be11130..b56b0b4 100644 --- a/go/logic/hooks.go +++ b/go/logic/hooks.go @@ -8,6 +8,7 @@ package logic import ( "fmt" + "math" "os" "os/exec" "path/filepath" @@ -63,6 +64,8 @@ func (this *HooksExecutor) applyEnvironmentVariables(extraVariables ...string) [ env = append(env, fmt.Sprintf("GH_OST_MIGRATED_HOST=%s", this.migrationContext.GetApplierHostname())) env = append(env, fmt.Sprintf("GH_OST_INSPECTED_HOST=%s", this.migrationContext.GetInspectorHostname())) env = append(env, fmt.Sprintf("GH_OST_EXECUTING_HOST=%s", this.migrationContext.Hostname)) + env = append(env, fmt.Sprintf("GH_OST_INSPECTED_LAG=%f", 
this.migrationContext.GetCurrentLagDuration().Seconds())) + env = append(env, fmt.Sprintf("GH_OST_PROGRESS=%f", math.Float64frombits(atomic.LoadUint64(&this.migrationContext.CurrentProgress)))) env = append(env, fmt.Sprintf("GH_OST_HOOKS_HINT=%s", this.migrationContext.HooksHintMessage)) env = append(env, fmt.Sprintf("GH_OST_HOOKS_HINT_OWNER=%s", this.migrationContext.HooksHintOwner)) env = append(env, fmt.Sprintf("GH_OST_HOOKS_HINT_TOKEN=%s", this.migrationContext.HooksHintToken)) diff --git a/go/logic/migrator.go b/go/logic/migrator.go index 86de5ac..985bef1 100644 --- a/go/logic/migrator.go +++ b/go/logic/migrator.go @@ -895,6 +895,8 @@ func (this *Migrator) printStatus(rule PrintStatusRule, writers ...io.Writer) { } else { progressPct = 100.0 * float64(totalRowsCopied) / float64(rowsEstimate) } + // we take the opportunity to update migration context with progressPct + atomic.StoreUint64(&this.migrationContext.CurrentProgress, math.Float64bits(progressPct)) // Before status, let's see if we should print a nice reminder for what exactly we're doing here. 
shouldPrintMigrationStatusHint := (elapsedSeconds%600 == 0) if rule == ForcePrintStatusAndHintRule { @@ -911,7 +913,7 @@ func (this *Migrator) printStatus(rule PrintStatusRule, writers ...io.Writer) { eta := "N/A" if progressPct >= 100.0 { eta = "due" - } else if progressPct >= 1.0 { + } else if progressPct >= 0.1 { elapsedRowCopySeconds := this.migrationContext.ElapsedRowCopyTime().Seconds() totalExpectedSeconds := elapsedRowCopySeconds * float64(rowsEstimate) / float64(totalRowsCopied) etaSeconds = totalExpectedSeconds - elapsedRowCopySeconds @@ -958,12 +960,13 @@ func (this *Migrator) printStatus(rule PrintStatusRule, writers ...io.Writer) { currentBinlogCoordinates := *this.eventsStreamer.GetCurrentBinlogCoordinates() - status := fmt.Sprintf("Copy: %d/%d %.1f%%; Applied: %d; Backlog: %d/%d; Time: %+v(total), %+v(copy); streamer: %+v; State: %s; ETA: %s", + status := fmt.Sprintf("Copy: %d/%d %.1f%%; Applied: %d; Backlog: %d/%d; Time: %+v(total), %+v(copy); streamer: %+v; Lag: %+v, State: %s; ETA: %s", totalRowsCopied, rowsEstimate, progressPct, atomic.LoadInt64(&this.migrationContext.TotalDMLEventsApplied), len(this.applyEventsQueue), cap(this.applyEventsQueue), base.PrettifyDurationOutput(elapsedTime), base.PrettifyDurationOutput(this.migrationContext.ElapsedRowCopyTime()), currentBinlogCoordinates, + this.migrationContext.GetCurrentLagDuration().Seconds(), state, eta, ) From 8893b2207b4bbadedf80f428dc43de28810dc4b4 Mon Sep 17 00:00:00 2001 From: Shlomi Noach <shlomi-noach@github.com> Date: Sun, 6 Oct 2019 17:12:20 +0300 Subject: [PATCH 2/8] documenting new hook env variables --- doc/hooks.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/hooks.md b/doc/hooks.md index 93f500f..4c49c85 100644 --- a/doc/hooks.md +++ b/doc/hooks.md @@ -65,6 +65,8 @@ The following variables are available on all hooks: - `GH_OST_ELAPSED_COPY_SECONDS` - row-copy time (excluding startup, row-count and postpone time) - `GH_OST_ESTIMATED_ROWS` - estimated total rows in table - 
`GH_OST_COPIED_ROWS` - number of rows copied by `gh-ost` +- `GH_OST_INSPECTED_LAG` - lag in seconds (floating point) of the inspected server +- `GH_OST_PROGRESS` - progress percentage ([0..100], floating point) of the migration - `GH_OST_MIGRATED_HOST` - `GH_OST_INSPECTED_HOST` - `GH_OST_EXECUTING_HOST` From 271c7274a3dbac3f82c8939ccfbbc22ec8bcd964 Mon Sep 17 00:00:00 2001 From: Shlomi Noach <shlomi-noach@github.com> Date: Mon, 7 Oct 2019 06:59:56 +0300 Subject: [PATCH 3/8] refactor progressPct into migrationContext --- go/base/context.go | 13 ++++++++++++- go/logic/hooks.go | 3 +-- go/logic/migrator.go | 2 +- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/go/base/context.go b/go/base/context.go index 84e6264..5ebf092 100644 --- a/go/base/context.go +++ b/go/base/context.go @@ -7,6 +7,7 @@ package base import ( "fmt" + "math" "os" "regexp" "strings" @@ -174,7 +175,7 @@ type MigrationContext struct { pointOfInterestTime time.Time pointOfInterestTimeMutex *sync.Mutex CurrentLag int64 - CurrentProgress uint64 // math.Float64bits([f=0..100]) + currentProgress uint64 ThrottleHTTPStatusCode int64 controlReplicasLagResult mysql.ReplicationLagResult TotalRowsCopied int64 @@ -433,6 +434,16 @@ func (this *MigrationContext) GetCurrentLagDuration() time.Duration { return time.Duration(atomic.LoadInt64(&this.CurrentLag)) } +func (this *MigrationContext) GetProgressPct() float64 { + return math.Float64frombits(atomic.LoadUint64(&this.currentProgress)) +} + +func (this *MigrationContext) SetProgressPct(progressPct float64) { + atomic.StoreUint64(&this.currentProgress, math.Float64bits(progressPct)) +} + +// Note: currentProgress holds math.Float64bits(f), where f is the progress percentage in [0..100]. + // GetTotalRowsCopied returns the accurate number of rows being copied (affected) // This is not exactly the same as the rows being iterated via chunks, but potentially close enough func (this *MigrationContext) GetTotalRowsCopied() int64 { diff --git a/go/logic/hooks.go b/go/logic/hooks.go index b56b0b4..fa5011e 100644 --- a/go/logic/hooks.go 
+++ b/go/logic/hooks.go @@ -8,7 +8,6 @@ package logic import ( "fmt" - "math" "os" "os/exec" "path/filepath" @@ -65,7 +64,7 @@ func (this *HooksExecutor) applyEnvironmentVariables(extraVariables ...string) [ env = append(env, fmt.Sprintf("GH_OST_INSPECTED_HOST=%s", this.migrationContext.GetInspectorHostname())) env = append(env, fmt.Sprintf("GH_OST_EXECUTING_HOST=%s", this.migrationContext.Hostname)) env = append(env, fmt.Sprintf("GH_OST_INSPECTED_LAG=%f", this.migrationContext.GetCurrentLagDuration().Seconds())) - env = append(env, fmt.Sprintf("GH_OST_PROGRESS=%f", math.Float64frombits(atomic.LoadUint64(&this.migrationContext.CurrentProgress)))) + env = append(env, fmt.Sprintf("GH_OST_PROGRESS=%f", this.migrationContext.GetProgressPct())) env = append(env, fmt.Sprintf("GH_OST_HOOKS_HINT=%s", this.migrationContext.HooksHintMessage)) env = append(env, fmt.Sprintf("GH_OST_HOOKS_HINT_OWNER=%s", this.migrationContext.HooksHintOwner)) env = append(env, fmt.Sprintf("GH_OST_HOOKS_HINT_TOKEN=%s", this.migrationContext.HooksHintToken)) diff --git a/go/logic/migrator.go b/go/logic/migrator.go index 985bef1..1446a65 100644 --- a/go/logic/migrator.go +++ b/go/logic/migrator.go @@ -896,7 +896,7 @@ func (this *Migrator) printStatus(rule PrintStatusRule, writers ...io.Writer) { progressPct = 100.0 * float64(totalRowsCopied) / float64(rowsEstimate) } // we take the opportunity to update migration context with progressPct - atomic.StoreUint64(&this.migrationContext.CurrentProgress, math.Float64bits(progressPct)) + this.migrationContext.SetProgressPct(progressPct) // Before status, let's see if we should print a nice reminder for what exactly we're doing here. shouldPrintMigrationStatusHint := (elapsedSeconds%600 == 0) if rule == ForcePrintStatusAndHintRule { From 21e7ec6b7c005d713eae354f76f6791a9f539e50 Mon Sep 17 00:00:00 2001 From: Shlomi Noach <shlomi-noach@github.com> Date: Mon, 7 Oct 2019 07:06:11 +0300 Subject: [PATCH 4/8] lag: %+vs notation, e.g. 
1s --- go/logic/migrator.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/logic/migrator.go b/go/logic/migrator.go index 1446a65..b0bec68 100644 --- a/go/logic/migrator.go +++ b/go/logic/migrator.go @@ -960,7 +960,7 @@ func (this *Migrator) printStatus(rule PrintStatusRule, writers ...io.Writer) { currentBinlogCoordinates := *this.eventsStreamer.GetCurrentBinlogCoordinates() - status := fmt.Sprintf("Copy: %d/%d %.1f%%; Applied: %d; Backlog: %d/%d; Time: %+v(total), %+v(copy); streamer: %+v; Lag: %+v, State: %s; ETA: %s", + status := fmt.Sprintf("Copy: %d/%d %.1f%%; Applied: %d; Backlog: %d/%d; Time: %+v(total), %+v(copy); streamer: %+v; Lag: %+vs, State: %s; ETA: %s", totalRowsCopied, rowsEstimate, progressPct, atomic.LoadInt64(&this.migrationContext.TotalDMLEventsApplied), len(this.applyEventsQueue), cap(this.applyEventsQueue), From d0ce7c014ee7fd4f1692574527506c0f8ce9b258 Mon Sep 17 00:00:00 2001 From: Shlomi Noach <shlomi-noach@github.com> Date: Wed, 23 Oct 2019 10:18:56 +0300 Subject: [PATCH 5/8] truncate lag digits --- go/logic/migrator.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/logic/migrator.go b/go/logic/migrator.go index b0bec68..b1a238f 100644 --- a/go/logic/migrator.go +++ b/go/logic/migrator.go @@ -960,7 +960,7 @@ func (this *Migrator) printStatus(rule PrintStatusRule, writers ...io.Writer) { currentBinlogCoordinates := *this.eventsStreamer.GetCurrentBinlogCoordinates() - status := fmt.Sprintf("Copy: %d/%d %.1f%%; Applied: %d; Backlog: %d/%d; Time: %+v(total), %+v(copy); streamer: %+v; Lag: %+vs, State: %s; ETA: %s", + status := fmt.Sprintf("Copy: %d/%d %.1f%%; Applied: %d; Backlog: %d/%d; Time: %+v(total), %+v(copy); streamer: %+v; Lag: %.2fs, State: %s; ETA: %s", totalRowsCopied, rowsEstimate, progressPct, atomic.LoadInt64(&this.migrationContext.TotalDMLEventsApplied), len(this.applyEventsQueue), cap(this.applyEventsQueue), From 991cdc51f0f882a917d3b559945595f485a1d697 Mon Sep 17 00:00:00 2001 From: 
Mohit Maroliya <35539313+mohitm15@users.noreply.github.com> Date: Thu, 31 Oct 2019 19:28:54 +0530 Subject: [PATCH 6/8] Update command-line-flags.md --- doc/command-line-flags.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/command-line-flags.md b/doc/command-line-flags.md index 61ffdbc..ee0051c 100644 --- a/doc/command-line-flags.md +++ b/doc/command-line-flags.md @@ -18,7 +18,7 @@ If, for some reason, you do not wish `gh-ost` to connect to a replica, you may c ### approve-renamed-columns -When your migration issues a column rename (`change column old_name new_name ...`) `gh-ost` analyzes the statement to try an associate the old column name with new column name. Otherwise the new structure may also look like some column was dropped and another was added. +When your migration issues a column rename (`change column old_name new_name ...`) `gh-ost` analyzes the statement to try and associate the old column name with new column name. Otherwise the new structure may also look like some column was dropped and another was added. `gh-ost` will print out what it thinks the _rename_ implied, but will not issue the migration unless you provide with `--approve-renamed-columns`. @@ -161,7 +161,7 @@ List of metrics and threshold values; topping the threshold of any will cause th ### migrate-on-replica -Typically `gh-ost` is used to migrate tables on a master. If you wish to only perform the migration in full on a replica, connect `gh-ost` to said replica and pass `--migrate-on-replica`. `gh-ost` will briefly connect to the master but other issue no changes on the master. Migration will be fully executed on the replica, while making sure to maintain a small replication lag. +Typically `gh-ost` is used to migrate tables on a master. If you wish to only perform the migration in full on a replica, connect `gh-ost` to said replica and pass `--migrate-on-replica`. 
`gh-ost` will briefly connect to the master but other issue will make no changes on the master. Migration will be fully executed on the replica, while making sure to maintain a small replication lag. ### postpone-cut-over-flag-file From 859800079bfb73a3e86ee778d51bfcdc1095b739 Mon Sep 17 00:00:00 2001 From: Mohit Maroliya <35539313+mohitm15@users.noreply.github.com> Date: Thu, 31 Oct 2019 20:13:54 +0530 Subject: [PATCH 7/8] Update doc/command-line-flags.md Thank you for the suggestion @shlomi-noach Co-Authored-By: Shlomi Noach <shlomi-noach@github.com> --- doc/command-line-flags.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/command-line-flags.md b/doc/command-line-flags.md index ee0051c..629e9f9 100644 --- a/doc/command-line-flags.md +++ b/doc/command-line-flags.md @@ -161,7 +161,7 @@ List of metrics and threshold values; topping the threshold of any will cause th ### migrate-on-replica -Typically `gh-ost` is used to migrate tables on a master. If you wish to only perform the migration in full on a replica, connect `gh-ost` to said replica and pass `--migrate-on-replica`. `gh-ost` will briefly connect to the master but other issue will make no changes on the master. Migration will be fully executed on the replica, while making sure to maintain a small replication lag. +Typically `gh-ost` is used to migrate tables on a master. If you wish to only perform the migration in full on a replica, connect `gh-ost` to said replica and pass `--migrate-on-replica`. `gh-ost` will briefly connect to the master but otherwise will make no changes on the master. Migration will be fully executed on the replica, while making sure to maintain a small replication lag. 
### postpone-cut-over-flag-file From 19215b268834c8d909f866f7561a89ebad0900b6 Mon Sep 17 00:00:00 2001 From: Tobias Johansson <tobias.johansson@fasttrack-solutions.com> Date: Tue, 10 Dec 2019 13:57:18 +0100 Subject: [PATCH 8/8] Update rds docs --- doc/rds.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/doc/rds.md b/doc/rds.md index 37d1d80..da59abb 100644 --- a/doc/rds.md +++ b/doc/rds.md @@ -26,6 +26,14 @@ If you use `pt-table-checksum` as a part of your data integrity checks, you migh This tool requires binlog_format=STATEMENT, but the current binlog_format is set to ROW and an error occurred while attempting to change it. If running MySQL 5.1.29 or newer, setting binlog_format requires the SUPER privilege. You will need to manually set binlog_format to 'STATEMENT' before running this tool. ``` +#### Binlog filtering + +In Aurora, the [binlog filtering feature][aws_replication_docs_bin_log_filtering] is enabled by default. This becomes an issue when `gh-ost` attempts the cut-over: `gh-ost` waits for a specific entry to appear in the binlog before proceeding, but the binlog filtering feature filters that entry out, so it never reaches the binlog. +You need to turn this feature off during the migration process. +Set the `aurora_enable_repl_bin_log_filtering` parameter to 0 in the Parameter Group for your cluster. +When the migration is done, set it back to 1 (default). + + #### Preflight checklist Before trying to run any `gh-ost` migrations you will want to confirm the following: @@ -35,6 +43,7 @@ Before trying to run any `gh-ost` migrations you will want to confirm the follow - [ ] Executing `SHOW SLAVE STATUS\G` on your replica cluster displays the correct master host, binlog position, etc. - [ ] Database backup retention is greater than 1 day to enable binlogs - [ ] You have setup [`hooks`][ghost_hooks] to issue RDS procedures for stopping and starting replication. 
(see [github/gh-ost#163][ghost_rds_issue_tracking] for examples) +- [ ] The parameter `aurora_enable_repl_bin_log_filtering` is set to 0 [new_issue]: https://github.com/github/gh-ost/issues/new [assume_rbr_docs]: https://github.com/github/gh-ost/blob/master/doc/command-line-flags.md#assume-rbr @@ -43,3 +52,4 @@ Before trying to run any `gh-ost` migrations you will want to confirm the follow [percona_toolkit_patch]: https://github.com/jacobbednarz/percona-toolkit/commit/0271ba6a094da446a5e5bb8d99b5c26f1777f2b9 [ghost_hooks]: https://github.com/github/gh-ost/blob/master/doc/hooks.md [ghost_rds_issue_tracking]: https://github.com/github/gh-ost/issues/163 +[aws_replication_docs_bin_log_filtering]: https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/AuroraMySQL.Replication.html#AuroraMySQL.Replication.Performance \ No newline at end of file