Removed legacy 'safe cut-over'

Now that the atomic cut-over is available, the legacy 'safe cut-over' is redundant.
This commit is contained in:
Shlomi Noach 2016-07-16 08:12:19 -06:00
parent 53508af84a
commit ef59a866d8
5 changed files with 3 additions and 292 deletions

View File

@ -1,7 +1,7 @@
#!/bin/bash
#
#
RELEASE_VERSION="1.0.2"
RELEASE_VERSION="1.0.3"
buildpath=/tmp/gh-ost
target=gh-ost

View File

@ -31,7 +31,6 @@ type CutOver int
// Cut-over strategies selectable for a migration. All three constants are of
// type CutOver and take the values 0, 1 and 2 in declaration order.
//
// The type is stated once on the first constant; the following lines rely on
// implicit repetition of `CutOver = iota`. Writing `= iota` without the type
// on each line would instead declare untyped integer constants.
const (
	// CutOverAtomic is the strategy selected by the "atomic" and "default"
	// values of the cut-over option.
	CutOverAtomic CutOver = iota
	// CutOverSafe is the lock-based strategy selected by the "safe" value.
	CutOverSafe
	// CutOverTwoStep is the strategy selected by the "two-step" value.
	CutOverTwoStep
)

View File

@ -66,7 +66,7 @@ func main() {
flag.BoolVar(&migrationContext.OkToDropTable, "ok-to-drop-table", false, "Shall the tool drop the old table at end of operation. DROPping tables can be a long locking operation, which is why I'm not doing it by default. I'm an online tool, yes?")
flag.BoolVar(&migrationContext.InitiallyDropOldTable, "initially-drop-old-table", false, "Drop a possibly existing OLD table (remains from a previous run?) before beginning operation. Default is to panic and abort if such table exists")
flag.BoolVar(&migrationContext.InitiallyDropGhostTable, "initially-drop-ghost-table", false, "Drop a possibly existing Ghost table (remains from a previous run?) before beginning operation. Default is to panic and abort if such table exists")
cutOver := flag.String("cut-over", "atomic", "choose cut-over type (atomic, two-step, voluntary-lock)")
cutOver := flag.String("cut-over", "atomic", "choose cut-over type (default|atomic, two-step)")
flag.BoolVar(&migrationContext.SwitchToRowBinlogFormat, "switch-to-rbr", false, "let this tool automatically switch binary log format to 'ROW' on the replica, if needed. The format will NOT be switched back. I'm too scared to do that, and wish to protect you if you happen to execute another migration while this one is running")
chunkSize := flag.Int64("chunk-size", 1000, "amount of rows to handle in each iteration (allowed range: 100-100,000)")
@ -148,8 +148,6 @@ func main() {
switch *cutOver {
case "atomic", "default", "":
migrationContext.CutOverType = base.CutOverAtomic
case "safe":
migrationContext.CutOverType = base.CutOverSafe
case "two-step":
migrationContext.CutOverType = base.CutOverTwoStep
default:

View File

@ -488,22 +488,6 @@ func (this *Applier) SwapTablesQuickAndBumpy() error {
return nil
}
// RenameTable issues a single `rename table` statement that moves fromName to
// toName inside the migrated database. The database name and both table names
// go through sql.EscapeName before being placed in the statement.
// On execution failure the error is passed through log.Errore and returned;
// on success the function returns nil.
func (this *Applier) RenameTable(fromName, toName string) (err error) {
	fromDatabase := sql.EscapeName(this.migrationContext.DatabaseName)
	fromTable := sql.EscapeName(fromName)
	toDatabase := sql.EscapeName(this.migrationContext.DatabaseName)
	toTable := sql.EscapeName(toName)

	renameQuery := fmt.Sprintf(`rename /* gh-ost */ table %s.%s to %s.%s`,
		fromDatabase, fromTable, toDatabase, toTable,
	)

	log.Infof("Renaming %s to %s", fromName, toName)
	_, err = sqlutils.ExecNoPrepare(this.db, renameQuery)
	if err != nil {
		return log.Errore(err)
	}
	log.Infof("Table renamed")
	return nil
}
// RenameTablesRollback renames back both table: original back to ghost,
// _old back to original. This is used by `--test-on-replica`
func (this *Applier) RenameTablesRollback() (renameError error) {
@ -603,151 +587,6 @@ func (this *Applier) GetSessionLockName(sessionId int64) string {
return fmt.Sprintf("gh-ost.%d.lock", sessionId)
}
// LockOriginalTableAndWait locks the original table, notifies the lock is in
// place, and awaits further instruction.
//
// All statements are issued through one transaction handle (tx). Sequence:
//  1. read connection_id() and publish it on sessionIdChan;
//  2. acquire the voluntary lock named by GetSessionLockName(sessionId);
//  3. set the session lock_wait_timeout to twice CutOverLockTimeoutSeconds;
//  4. `lock tables ... write` on the original table, then send nil on tableLocked;
//  5. block until a value arrives on okToUnlockTable, issue `unlock tables`
//     and report the outcome (nil or error) on tableUnlocked.
//
// A failure to begin the transaction, or in any of steps 1-4, is sent on
// tableLocked and returned; in those cases tableUnlocked is never written.
// sessionIdChan is only written once the connection id has been read.
func (this *Applier) LockOriginalTableAndWait(sessionIdChan chan int64, tableLocked chan<- error, okToUnlockTable <-chan bool, tableUnlocked chan<- error) error {
tx, err := this.db.Begin()
if err != nil {
tableLocked <- err
return err
}
// The transaction is always rolled back on exit; the Rollback result is ignored.
defer func() {
tx.Rollback()
}()
var sessionId int64
if err := tx.QueryRow(`select connection_id()`).Scan(&sessionId); err != nil {
tableLocked <- err
return err
}
// Published before tableLocked, so a caller that first waits on tableLocked
// can read the session id afterwards.
sessionIdChan <- sessionId
// get_lock with a zero timeout: do not wait if the lock is unavailable.
query := `select get_lock(?, 0)`
lockResult := 0
lockName := this.GetSessionLockName(sessionId)
log.Infof("Grabbing voluntary lock: %s", lockName)
// NOTE(review): when Scan itself fails, the underlying error is discarded and
// replaced by the generic message below.
if err := tx.QueryRow(query, lockName).Scan(&lockResult); err != nil || lockResult != 1 {
err := fmt.Errorf("Unable to acquire lock %s", lockName)
tableLocked <- err
return err
}
// NOTE(review): the timeout is doubled relative to CutOverLockTimeoutSeconds;
// presumably so this session outlasts the RENAME sessions — confirm intent.
tableLockTimeoutSeconds := this.migrationContext.CutOverLockTimeoutSeconds * 2
log.Infof("Setting LOCK timeout as %d seconds", tableLockTimeoutSeconds)
query = fmt.Sprintf(`set session lock_wait_timeout:=%d`, tableLockTimeoutSeconds)
if _, err := tx.Exec(query); err != nil {
tableLocked <- err
return err
}
query = fmt.Sprintf(`lock /* gh-ost */ tables %s.%s write`,
sql.EscapeName(this.migrationContext.DatabaseName),
sql.EscapeName(this.migrationContext.OriginalTableName),
)
log.Infof("Locking %s.%s",
sql.EscapeName(this.migrationContext.DatabaseName),
sql.EscapeName(this.migrationContext.OriginalTableName),
)
// Recorded just before the lock statement is issued, whether or not it succeeds.
this.migrationContext.LockTablesStartTime = time.Now()
if _, err := tx.Exec(query); err != nil {
tableLocked <- err
return err
}
log.Infof("Table locked")
tableLocked <- nil // No error.
// The cut-over phase will proceed to apply remaining backlog onto ghost table,
// and issue RENAMEs. We wait here until told to proceed.
<-okToUnlockTable
// Release
query = `unlock tables`
log.Infof("Releasing lock from %s.%s",
sql.EscapeName(this.migrationContext.DatabaseName),
sql.EscapeName(this.migrationContext.OriginalTableName),
)
if _, err := tx.Exec(query); err != nil {
tableUnlocked <- err
return log.Errore(err)
}
log.Infof("Table unlocked")
tableUnlocked <- nil
return nil
}
// RenameOriginalTable will attempt renaming the original table into _old
//
// The connection id of the session issuing the rename is published on
// sessionIdChan before the rename is attempted, so that the caller can look
// the session up while the statement is blocked.
//
// originalTableRenamed is a completion signal only: once the transaction has
// begun, the deferred function always sends nil on it, whatever the outcome.
// The actual error, if any, is available solely through the return value.
//
// NOTE(review): if this.db.Begin() fails, the function returns before the
// deferred function is registered, so neither channel is written. If reading
// connection_id() fails, sessionIdChan is never written. A caller blocking on
// either channel in those cases will not be woken by this function.
func (this *Applier) RenameOriginalTable(sessionIdChan chan int64, originalTableRenamed chan<- error) error {
tx, err := this.db.Begin()
if err != nil {
return err
}
defer func() {
// Rollback result is ignored; nil is sent even when the rename failed.
tx.Rollback()
originalTableRenamed <- nil
}()
var sessionId int64
if err := tx.QueryRow(`select connection_id()`).Scan(&sessionId); err != nil {
return err
}
sessionIdChan <- sessionId
log.Infof("Setting RENAME timeout as %d seconds", this.migrationContext.CutOverLockTimeoutSeconds)
query := fmt.Sprintf(`set session lock_wait_timeout:=%d`, this.migrationContext.CutOverLockTimeoutSeconds)
if _, err := tx.Exec(query); err != nil {
return err
}
query = fmt.Sprintf(`rename /* gh-ost */ table %s.%s to %s.%s`,
sql.EscapeName(this.migrationContext.DatabaseName),
sql.EscapeName(this.migrationContext.OriginalTableName),
sql.EscapeName(this.migrationContext.DatabaseName),
sql.EscapeName(this.migrationContext.GetOldTableName()),
)
// This Exec call returns only once the rename has completed or failed.
log.Infof("Issuing and expecting this to block: %s", query)
if _, err := tx.Exec(query); err != nil {
return log.Errore(err)
}
log.Infof("Original table renamed")
return nil
}
// RenameGhostTable will attempt renaming the ghost table into original
//
// The connection id of the session issuing the rename is published on
// sessionIdChan before the rename is attempted. The outcome of the rename
// statement itself (nil on success, the error otherwise) is sent on
// ghostTableRenamed and also reflected in the return value.
//
// NOTE(review): ghostTableRenamed is written only after the rename statement
// has been executed. Failures of this.db.Begin(), of reading connection_id(),
// or of setting lock_wait_timeout return an error without writing to it; the
// first two also leave sessionIdChan unwritten. A caller blocking on either
// channel in those cases will not be woken by this function.
func (this *Applier) RenameGhostTable(sessionIdChan chan int64, ghostTableRenamed chan<- error) error {
tx, err := this.db.Begin()
if err != nil {
return err
}
// The transaction is always rolled back on exit; the Rollback result is ignored.
defer func() {
tx.Rollback()
}()
var sessionId int64
if err := tx.QueryRow(`select connection_id()`).Scan(&sessionId); err != nil {
return err
}
sessionIdChan <- sessionId
log.Infof("Setting RENAME timeout as %d seconds", this.migrationContext.CutOverLockTimeoutSeconds)
query := fmt.Sprintf(`set session lock_wait_timeout:=%d`, this.migrationContext.CutOverLockTimeoutSeconds)
if _, err := tx.Exec(query); err != nil {
return err
}
query = fmt.Sprintf(`rename /* gh-ost */ table %s.%s to %s.%s`,
sql.EscapeName(this.migrationContext.DatabaseName),
sql.EscapeName(this.migrationContext.GetGhostTableName()),
sql.EscapeName(this.migrationContext.DatabaseName),
sql.EscapeName(this.migrationContext.OriginalTableName),
)
// This Exec call returns only once the rename has completed or failed.
log.Infof("Issuing and expecting this to block: %s", query)
if _, err := tx.Exec(query); err != nil {
ghostTableRenamed <- err
return log.Errore(err)
}
log.Infof("Ghost table renamed")
ghostTableRenamed <- nil
return nil
}
// ExpectUsedLock expects the special hint voluntary lock to exist on given session
func (this *Applier) ExpectUsedLock(sessionId int64) error {
var result int64
@ -931,7 +770,7 @@ func (this *Applier) AtomicCutOverMagicLock(sessionIdChan chan int64, tableLocke
return nil
}
// RenameOriginalTable will attempt renaming the original table into _old
// AtomicCutoverRename
func (this *Applier) AtomicCutoverRename(sessionIdChan chan int64, tablesRenamed chan<- error) error {
tx, err := this.db.Begin()
if err != nil {

View File

@ -486,16 +486,6 @@ func (this *Migrator) cutOver() (err error) {
)
return err
}
if this.migrationContext.CutOverType == base.CutOverSafe {
// Lock-based solution: we use low timeout and multiple attempts. But for
// each failed attempt, we throttle until replication lag is back to normal
err := this.retryOperation(
func() error {
return this.executeAndThrottleOnError(this.safeCutOver)
},
)
return err
}
if this.migrationContext.CutOverType == base.CutOverTwoStep {
err := this.retryOperation(
func() error {
@ -643,121 +633,6 @@ func (this *Migrator) atomicCutOver() (err error) {
return nil
}
// safeCutOver performs a safe cut over, where normally (no failure) the original table
// is being locked until swapped, hence DML queries being locked and unaware of the cut-over.
// In the worst case, there will be a minor outage, where the original table would not exist.
//
// Outline:
//  1. a goroutine locks the original table and waits (LockOriginalTableAndWait);
//  2. a second goroutine issues RENAME original->old, which is expected to block;
//  3. a third goroutine issues RENAME ghost->original, also expected to block;
//  4. the table lock is released, letting both renames proceed.
//
// A deferred function always signals the locking goroutine to unlock and, if the
// original table is found missing afterwards, renames the _old table back.
// Returns nil when the unlock and the ghost rename both report success.
func (this *Migrator) safeCutOver() (err error) {
// Flag is raised for the whole duration of this function and cleared on exit.
atomic.StoreInt64(&this.inCutOverCriticalActionFlag, 1)
defer atomic.StoreInt64(&this.inCutOverCriticalActionFlag, 0)
// Buffer of 2: one write from Step 4 plus one from the deferred function below.
okToUnlockTable := make(chan bool, 2)
originalTableRenamed := make(chan error, 1)
// Set to 1 right before the original-rename goroutine is started.
var originalTableRenameIntended int64
defer func() {
log.Infof("Checking to see if we need to roll back")
// The following is to make sure we unlock the table no-matter-what!
// There's enough buffer in the channel to support a redundant write here.
okToUnlockTable <- true
if atomic.LoadInt64(&originalTableRenameIntended) == 1 {
log.Infof("Waiting for original table rename result")
// We need to make sure we wait for the original-rename, successful or not,
// so as to be able to rollback in case the ghost-rename fails.
// But we only wait on this queue if there's actually going to be a rename.
// As an example, what happens should the initial `lock tables` fail? We would
// never proceed to rename the table, hence this queue is never written to.
<-originalTableRenamed
}
// Rollback operation
if !this.applier.tableExists(this.migrationContext.OriginalTableName) {
log.Infof("Cannot find %s, rolling back", this.migrationContext.OriginalTableName)
err := this.applier.RenameTable(this.migrationContext.GetOldTableName(), this.migrationContext.OriginalTableName)
// NOTE(review): this err shadows the named result, so a failed rollback is
// only passed to log.Errore and never returned to the caller.
log.Errore(err)
} else {
log.Info("No need for rollback")
}
}()
lockOriginalSessionIdChan := make(chan int64, 1)
tableLocked := make(chan error, 1)
tableUnlocked := make(chan error, 1)
go func() {
if err := this.applier.LockOriginalTableAndWait(lockOriginalSessionIdChan, tableLocked, okToUnlockTable, tableUnlocked); err != nil {
log.Errore(err)
}
}()
if err := <-tableLocked; err != nil {
return log.Errore(err)
}
lockOriginalSessionId := <-lockOriginalSessionIdChan
log.Infof("Session locking original table is %+v", lockOriginalSessionId)
// At this point we know the table is locked.
// We know any newly incoming DML on original table is blocked.
this.waitForEventsUpToLock()
// Step 2
// We now attempt a RENAME on the original table, and expect it to block
renameOriginalSessionIdChan := make(chan int64, 1)
this.migrationContext.RenameTablesStartTime = time.Now()
atomic.StoreInt64(&originalTableRenameIntended, 1)
go func() {
// The returned error is discarded here.
this.applier.RenameOriginalTable(renameOriginalSessionIdChan, originalTableRenamed)
}()
// NOTE(review): unconditional receive; this blocks until the goroutine above
// publishes a session id — verify it does so on every path.
renameOriginalSessionId := <-renameOriginalSessionIdChan
log.Infof("Session renaming original table is %+v", renameOriginalSessionId)
if err := this.retryOperation(
func() error {
return this.applier.ExpectProcess(renameOriginalSessionId, "metadata lock", "rename")
}); err != nil {
return err
}
log.Infof("Found RENAME on original table to be blocking, as expected. Double checking original is still being locked")
if err := this.applier.ExpectUsedLock(lockOriginalSessionId); err != nil {
// Abort operation; but make sure to unlock table!
return log.Errore(err)
}
log.Infof("Connection holding lock on original table still exists")
// Now that we've found the RENAME blocking, AND the locking connection still alive,
// we know it is safe to proceed to renaming ghost table.
// Step 3
// We now attempt a RENAME on the ghost table, and expect it to block
renameGhostSessionIdChan := make(chan int64, 1)
ghostTableRenamed := make(chan error, 1)
go func() {
// The returned error is discarded here; the rename outcome is read from
// ghostTableRenamed further below.
this.applier.RenameGhostTable(renameGhostSessionIdChan, ghostTableRenamed)
}()
// NOTE(review): unconditional receive; this blocks until the goroutine above
// publishes a session id — verify it does so on every path.
renameGhostSessionId := <-renameGhostSessionIdChan
log.Infof("Session renaming ghost table is %+v", renameGhostSessionId)
if err := this.retryOperation(
func() error {
return this.applier.ExpectProcess(renameGhostSessionId, "metadata lock", "rename")
}); err != nil {
return err
}
log.Infof("Found RENAME on ghost table to be blocking, as expected. Will next release lock on original table")
// Step 4
okToUnlockTable <- true
// BAM! original table lock is released, RENAME original->old released,
// RENAME ghost->original is released, queries on original are unblocked.
// (that is, assuming all went well)
if err := <-tableUnlocked; err != nil {
return log.Errore(err)
}
if err := <-ghostTableRenamed; err != nil {
return log.Errore(err)
}
this.migrationContext.RenameTablesEndTime = time.Now()
// ooh nice! We're actually truly and thankfully done
// Measured from LockTablesStartTime (not RenameTablesStartTime), so it covers
// the whole period in which the original table was locked.
lockAndRenameDuration := this.migrationContext.RenameTablesEndTime.Sub(this.migrationContext.LockTablesStartTime)
log.Infof("Lock & rename duration: %s. During this time, queries on %s were blocked", lockAndRenameDuration, sql.EscapeName(this.migrationContext.OriginalTableName))
return nil
}
// stopWritesAndCompleteMigrationOnReplica will stop replication IO thread, apply
// what DML events are left, and that's it.
// This only applies in --test-on-replica. It leaves replication stopped, with both tables