From 6fa32f6d54144803e54cee52e10f97478ff769a4 Mon Sep 17 00:00:00 2001 From: Shlomi Noach Date: Tue, 7 Feb 2017 09:31:52 +0200 Subject: [PATCH 1/7] cut-over failure on test-on-replica starts replication again --- go/logic/applier.go | 27 ++++++++++++++++++++++++-- go/logic/hooks.go | 5 +++++ go/logic/migrator.go | 46 +++++++++++++++++++++++++++++++++++++------- 3 files changed, 69 insertions(+), 9 deletions(-) diff --git a/go/logic/applier.go b/go/logic/applier.go index d1ad331..45e92f3 100644 --- a/go/logic/applier.go +++ b/go/logic/applier.go @@ -588,11 +588,22 @@ func (this *Applier) RenameTablesRollback() (renameError error) { // and have them written to the binary log, so that we can then read them via streamer. func (this *Applier) StopSlaveIOThread() error { query := `stop /* gh-ost */ slave io_thread` - log.Infof("Stopping replication") + log.Infof("Stopping replication IO thread") if _, err := sqlutils.ExecNoPrepare(this.db, query); err != nil { return err } - log.Infof("Replication stopped") + log.Infof("Replication IO thread stopped") + return nil +} + +// StartSlaveIOThread is applicable with --test-on-replica +func (this *Applier) StartSlaveIOThread() error { + query := `start /* gh-ost */ slave io_thread` + log.Infof("Starting replication IO thread") + if _, err := sqlutils.ExecNoPrepare(this.db, query); err != nil { + return err + } + log.Infof("Replication IO thread started") return nil } @@ -635,6 +646,18 @@ func (this *Applier) StopReplication() error { return nil } +// StartReplication is used by `--test-on-replica` on cut-over failure +func (this *Applier) StartReplication() error { + if err := this.StartSlaveIOThread(); err != nil { + return err + } + if err := this.StartSlaveSQLThread(); err != nil { + return err + } + log.Infof("Replication started") + return nil +} + // GetSessionLockName returns a name for the special hint session voluntary lock func (this *Applier) GetSessionLockName(sessionId int64) string { return fmt.Sprintf("gh-ost.%d.lock", sessionId) diff --git a/go/logic/hooks.go b/go/logic/hooks.go index 25d745a..89dbd65 100644 --- a/go/logic/hooks.go +++ b/go/logic/hooks.go @@ -30,6 +30,7 @@ const ( onFailure = "gh-ost-on-failure" onStatus = "gh-ost-on-status" onStopReplication = "gh-ost-on-stop-replication" + onStartReplication = "gh-ost-on-start-replication" ) type HooksExecutor struct { @@ -152,3 +153,7 @@ func (this *HooksExecutor) onStatus(statusMessage string) error { func (this *HooksExecutor) onStopReplication() error { return this.executeHooks(onStopReplication) } + +func (this *HooksExecutor) onStartReplication() error { + return this.executeHooks(onStartReplication) +} diff --git a/go/logic/migrator.go b/go/logic/migrator.go index 61db6db..58b5e79 100644 --- a/go/logic/migrator.go +++ b/go/logic/migrator.go @@ -385,8 +385,38 @@ func (this *Migrator) ExecOnFailureHook() (err error) { return this.hooksExecutor.onFailure() } +func (this *Migrator) handleCutOverResult(cutOverError error) (err error) { + if this.migrationContext.TestOnReplica { + // We're merly testing, we don't want to keep this state. Rollback the renames as possible + this.applier.RenameTablesRollback() + } + if cutOverError == nil { + return nil + } + // Only on error: + + if this.migrationContext.TestOnReplica { + // With `--test-on-replica` we stop replication thread, and then proceed to use + // the same cut-over phase as the master would use. That means we take locks + // and swap the tables. + // The difference is that we will later swap the tables back. + if err := this.hooksExecutor.onStartReplication(); err != nil { + return log.Errore(err) + } + if this.migrationContext.TestOnReplicaSkipReplicaStop { + log.Warningf("--test-on-replica-skip-replica-stop enabled, we are not starting replication.") + } else { + log.Debugf("testing on replica. Starting replication IO thread after cut-over failure") + if err := this.retryOperation(this.applier.StartReplication); err != nil { + return log.Errore(err) + } + } + } + return nil +} + // cutOver performs the final step of migration, based on migration -// type (on replica? bumpy? safe?) +// type (on replica? atomic? safe?) func (this *Migrator) cutOver() (err error) { if this.migrationContext.Noop { log.Debugf("Noop operation; not really swapping tables") @@ -441,18 +471,18 @@ func (this *Migrator) cutOver() (err error) { return err } } - // We're merly testing, we don't want to keep this state. Rollback the renames as possible - defer this.applier.RenameTablesRollback() - // We further proceed to do the cutover by normal means; the 'defer' above will rollback the swap } if this.migrationContext.CutOverType == base.CutOverAtomic { // Atomic solution: we use low timeout and multiple attempts. But for // each failed attempt, we throttle until replication lag is back to normal err := this.atomicCutOver() + this.handleCutOverResult(err) return err } if this.migrationContext.CutOverType == base.CutOverTwoStep { - return this.cutOverTwoStep() + err := this.cutOverTwoStep() + this.handleCutOverResult(err) + return err } return log.Fatalf("Unknown cut-over type: %d; should never get here!", this.migrationContext.CutOverType) } @@ -1043,8 +1073,10 @@ func (this *Migrator) onApplyEventStruct(eventStruct *applyEventStruct) error { availableEvents := len(this.applyEventsQueue) batchSize := int(atomic.LoadInt64(&this.migrationContext.DMLBatchSize)) - if availableEvents > batchSize { - availableEvents = batchSize + if availableEvents > batchSize-1 { + // The "- 1" is because we already consumed one event: the original event that led to this function getting called. + // So, if DMLBatchSize==1 we wish to not process any further events + availableEvents = batchSize - 1 } for i := 0; i < availableEvents; i++ { additionalStruct := <-this.applyEventsQueue From 8db8e127ebba537a5d86c5c7224fd7796597702c Mon Sep 17 00:00:00 2001 From: Shlomi Noach Date: Tue, 21 Feb 2017 10:34:24 -0700 Subject: [PATCH 2/7] supporting --timestamp-old-table --- go/base/context.go | 12 +++++++----- go/cmd/gh-ost/main.go | 1 + localtests/test.sh | 1 + 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/go/base/context.go b/go/base/context.go index 314a1d5..d6cd6ce 100644 --- a/go/base/context.go +++ b/go/base/context.go @@ -121,6 +121,7 @@ type MigrationContext struct { OkToDropTable bool InitiallyDropOldTable bool InitiallyDropGhostTable bool + TimestampOldTable bool // Should old table name include a timestamp CutOverType CutOver ReplicaServerId uint @@ -234,11 +235,12 @@ func (this *MigrationContext) GetGhostTableName() string { // GetOldTableName generates the name of the "old" table, into which the original table is renamed. func (this *MigrationContext) GetOldTableName() string { - if this.TestOnReplica { - return fmt.Sprintf("_%s_ght", this.OriginalTableName) - } - if this.MigrateOnReplica { - return fmt.Sprintf("_%s_ghr", this.OriginalTableName) + if this.TimestampOldTable { + t := this.StartTime + timestamp := fmt.Sprintf("%d%02d%02d%02d%02d%02d", + t.Year(), t.Month(), t.Day(), + t.Hour(), t.Minute(), t.Second()) + return fmt.Sprintf("_%s_%s_del", this.OriginalTableName, timestamp) } return fmt.Sprintf("_%s_del", this.OriginalTableName) } diff --git a/go/cmd/gh-ost/main.go b/go/cmd/gh-ost/main.go index 8e5e20c..238dc81 100644 --- a/go/cmd/gh-ost/main.go +++ b/go/cmd/gh-ost/main.go @@ -77,6 +77,7 @@ func main() { flag.BoolVar(&migrationContext.OkToDropTable, "ok-to-drop-table", false, "Shall the tool drop the old table at end of operation. DROPping tables can be a long locking operation, which is why I'm not doing it by default. I'm an online tool, yes?") flag.BoolVar(&migrationContext.InitiallyDropOldTable, "initially-drop-old-table", false, "Drop a possibly existing OLD table (remains from a previous run?) before beginning operation. Default is to panic and abort if such table exists") flag.BoolVar(&migrationContext.InitiallyDropGhostTable, "initially-drop-ghost-table", false, "Drop a possibly existing Ghost table (remains from a previous run?) before beginning operation. Default is to panic and abort if such table exists") + flag.BoolVar(&migrationContext.TimestampOldTable, "timestamp-old-table", false, "Use a timestamp in old table name. This makes old table names unique and non conflicting cross migrations") cutOver := flag.String("cut-over", "atomic", "choose cut-over type (default|atomic, two-step)") flag.BoolVar(&migrationContext.ForceNamedCutOverCommand, "force-named-cut-over", false, "When true, the 'unpostpone|cut-over' interactive command must name the migrated table") diff --git a/localtests/test.sh b/localtests/test.sh index 8bbcf6f..12ec77b 100755 --- a/localtests/test.sh +++ b/localtests/test.sh @@ -85,6 +85,7 @@ test_single() { --switch-to-rbr \ --initially-drop-old-table \ --initially-drop-ghost-table \ + --timestamp-old-table \ --throttle-query='select timestampdiff(second, min(last_update), now()) < 5 from _gh_ost_test_ghc' \ --serve-socket-file=/tmp/gh-ost.test.sock \ --initially-drop-socket-file \ From b848b78fcb89ef628286439c1deba0a0aebd5920 Mon Sep 17 00:00:00 2001 From: Shlomi Noach Date: Tue, 21 Feb 2017 11:25:29 -0700 Subject: [PATCH 3/7] localtests should not be using --timestamp-old-table --- localtests/test.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/localtests/test.sh b/localtests/test.sh index 12ec77b..8bbcf6f 100755 --- a/localtests/test.sh +++ b/localtests/test.sh @@ -85,7 +85,6 @@ test_single() { --switch-to-rbr \ --initially-drop-old-table \ --initially-drop-ghost-table \ - --timestamp-old-table \ --throttle-query='select timestampdiff(second, min(last_update), now()) < 5 from _gh_ost_test_ghc' \ --serve-socket-file=/tmp/gh-ost.test.sock \ --initially-drop-socket-file \ From 06c909bd100ea50f35232b33b844a400fbc0c307 Mon Sep 17 00:00:00 2001 From: Shlomi Noach Date: Tue, 21 Feb 2017 17:34:49 -0700 Subject: [PATCH 4/7] Validating table name length --- go/logic/applier.go | 4 ++++ go/mysql/utils.go | 2 ++ 2 files changed, 6 insertions(+) diff --git a/go/logic/applier.go b/go/logic/applier.go index d1d7970..e84b5cf 100644 --- a/go/logic/applier.go +++ b/go/logic/applier.go @@ -142,6 +142,10 @@ func (this *Applier) ValidateOrDropExistingTables() error { return err } } + if len(this.migrationContext.GetOldTableName()) > mysql.MaxTableNameLength { + log.Fatalf("--timestamp-old-table defined, but resulting table name (%s) is too long (only %d characters allowed)", this.migrationContext.GetOldTableName(), mysql.MaxTableNameLength) + } + if this.tableExists(this.migrationContext.GetOldTableName()) { return fmt.Errorf("Table %s already exists. Panicking. Use --initially-drop-old-table to force dropping it, though I really prefer that you drop it or rename it away", sql.EscapeName(this.migrationContext.GetOldTableName())) } diff --git a/go/mysql/utils.go b/go/mysql/utils.go index c48e07e..4df7639 100644 --- a/go/mysql/utils.go +++ b/go/mysql/utils.go @@ -16,6 +16,8 @@ import ( "github.com/outbrain/golib/sqlutils" ) +const MaxTableNameLength = 64 + type ReplicationLagResult struct { Key InstanceKey Lag time.Duration From d15594824ad051aa80fe3b5ddbdd74539ac047a5 Mon Sep 17 00:00:00 2001 From: Shlomi Noach Date: Tue, 21 Feb 2017 17:34:58 -0700 Subject: [PATCH 5/7] documenting --timestamp-old-table --- doc/command-line-flags.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/command-line-flags.md b/doc/command-line-flags.md index 83bfe05..d266f2c 100644 --- a/doc/command-line-flags.md +++ b/doc/command-line-flags.md @@ -130,3 +130,7 @@ See `approve-renamed-columns` ### test-on-replica Issue the migration on a replica; do not modify data on master. Useful for validating, testing and benchmarking. See [testing-on-replica](testing-on-replica.md) + +### timestamp-old-table + +Makes the _old_ table include a timestamp value. The _old_ table is what the original table is renamed to at the end of a successful migration. For example, if the table is `gh_ost_test`, then the _old_ table would normally be `_gh_ost_test_del`. With `--timestamp-old-table` it would be, for example, `_gh_ost_test_20170221103147_del`. From f11f6f978ffe3a1de46e4e6302c5834340b9a496 Mon Sep 17 00:00:00 2001 From: Shlomi Noach Date: Fri, 24 Feb 2017 14:48:10 -0700 Subject: [PATCH 6/7] mitigating cut-over/write race condition --- go/logic/inspect.go | 4 ++-- go/logic/migrator.go | 15 ++++++++++++++- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/go/logic/inspect.go b/go/logic/inspect.go index 862be16..1d30cb5 100644 --- a/go/logic/inspect.go +++ b/go/logic/inspect.go @@ -348,7 +348,7 @@ func (this *Inspector) validateLogSlaveUpdates() error { } if this.migrationContext.IsTungsten { - log.Warning("log_slave_updates not found on %s:%d, but --tungsten provided, so I'm proceeding", this.connectionConfig.Key.Hostname, this.connectionConfig.Key.Port) + log.Warningf("log_slave_updates not found on %s:%d, but --tungsten provided, so I'm proceeding", this.connectionConfig.Key.Hostname, this.connectionConfig.Key.Port) return nil } @@ -357,7 +357,7 @@ func (this *Inspector) validateLogSlaveUpdates() error { } if this.migrationContext.InspectorIsAlsoApplier() { - log.Warning("log_slave_updates not found on %s:%d, but executing directly on master, so I'm proceeeding", this.connectionConfig.Key.Hostname, this.connectionConfig.Key.Port) + log.Warningf("log_slave_updates not found on %s:%d, but executing directly on master, so I'm proceeeding", this.connectionConfig.Key.Hostname, this.connectionConfig.Key.Port) return nil } diff --git a/go/logic/migrator.go b/go/logic/migrator.go index de3a018..94b5e54 100644 --- a/go/logic/migrator.go +++ b/go/logic/migrator.go @@ -1000,11 +1000,13 @@ func (this *Migrator) iterateChunks() error { for { if atomic.LoadInt64(&this.rowCopyCompleteFlag) == 1 { // Done + // There's another such check down the line return nil } copyRowsFunc := func() error { if atomic.LoadInt64(&this.rowCopyCompleteFlag) == 1 { - // Done + // Done. + // There's another such check down the line return nil } hasFurtherRange, err := this.applier.CalculateNextIterationRangeEndValues() @@ -1016,6 +1018,17 @@ func (this *Migrator) iterateChunks() error { } // Copy task: applyCopyRowsFunc := func() error { + if atomic.LoadInt64(&this.rowCopyCompleteFlag) == 1 { + // No need for more writes. + // This is the de-facto place where we avoid writing in the event of completed cut-over. + // There could _still_ be a race condition, but that's as close as we can get. + // What about the race condition? Well, there's actually no data integrity issue. + // when rowCopyCompleteFlag==1 that means **guaranteed** all necessary rows have been copied. + // But some are still then collected at the binary log, and these are the ones we're trying to + // not apply here. If the race condition wins over us, then we just attempt to apply onto the + // _ghost_ table, which no longer exists. So, bothering error messages and all, but no damage. + return nil + } _, rowsAffected, _, err := this.applier.ApplyIterationInsertQuery() if err != nil { return terminateRowIteration(err) From 7f728729e3352af80f0338a46ad9be376bf27035 Mon Sep 17 00:00:00 2001 From: Shlomi Noach Date: Tue, 7 Mar 2017 08:18:38 +0200 Subject: [PATCH 7/7] documenting gh-ost-on-start-replication hook --- doc/hooks.md | 1 + resources/hooks-sample/gh-ost-on-start-replication-hook | 6 ++++++ resources/hooks-sample/gh-ost-on-stop-replication-hook | 1 + 3 files changed, 8 insertions(+) create mode 100644 resources/hooks-sample/gh-ost-on-start-replication-hook diff --git a/doc/hooks.md b/doc/hooks.md index 03b1501..43ddb7e 100644 --- a/doc/hooks.md +++ b/doc/hooks.md @@ -44,6 +44,7 @@ The full list of supported hooks is best found in code: [hooks.go](https://githu - `gh-ost-on-interactive-command` - `gh-ost-on-row-copy-complete` - `gh-ost-on-stop-replication` +- `gh-ost-on-start-replication` - `gh-ost-on-begin-postponed` - `gh-ost-on-before-cut-over` - `gh-ost-on-success` diff --git a/resources/hooks-sample/gh-ost-on-start-replication-hook b/resources/hooks-sample/gh-ost-on-start-replication-hook new file mode 100644 index 0000000..b476b99 --- /dev/null +++ b/resources/hooks-sample/gh-ost-on-start-replication-hook @@ -0,0 +1,6 @@ +#!/bin/bash + +# Sample hook file for gh-ost-on-start-replication +# Useful for RDS/Aurora setups, see https://github.com/github/gh-ost/issues/163 + +echo "$(date) gh-ost-on-start-replication $GH_OST_DATABASE_NAME.$GH_OST_TABLE_NAME $GH_OST_MIGRATED_HOST" >> /tmp/gh-ost.log diff --git a/resources/hooks-sample/gh-ost-on-stop-replication-hook b/resources/hooks-sample/gh-ost-on-stop-replication-hook index 232ad95..629e4dc 100755 --- a/resources/hooks-sample/gh-ost-on-stop-replication-hook +++ b/resources/hooks-sample/gh-ost-on-stop-replication-hook @@ -1,5 +1,6 @@ #!/bin/bash # Sample hook file for gh-ost-on-stop-replication +# Useful for RDS/Aurora setups, see https://github.com/github/gh-ost/issues/163 echo "$(date) gh-ost-on-stop-replication $GH_OST_DATABASE_NAME.$GH_OST_TABLE_NAME $GH_OST_MIGRATED_HOST" >> /tmp/gh-ost.log