/*
   Copyright 2016 GitHub Inc.
   See https://github.com/github/gh-ost/blob/master/LICENSE
*/

package logic

import (
	"fmt"
	"math"
	"os"
	"os/signal"
	"sync/atomic"
	"syscall"
	"time"

	"github.com/github/gh-ost/go/base"
	"github.com/github/gh-ost/go/binlog"
	"github.com/github/gh-ost/go/mysql"
	"github.com/github/gh-ost/go/sql"

	"github.com/outbrain/golib/log"
)
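
// ChangelogState is a state hint written to, and read back from, the changelog
// table; it is used to coordinate the stages of the migration flow.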
type ChangelogState string

const (
	TablesInPlace              ChangelogState = "TablesInPlace"
	AllEventsUpToLockProcessed ChangelogState = "AllEventsUpToLockProcessed"
)
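
// tableWriteFunc is a deferred write task: either a chunk of row copy or the
// application of a binlog event, queued for execution by executeWriteFuncs().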
type tableWriteFunc func() error

const (
	applyEventsQueueBuffer        = 100
	heartbeatIntervalMilliseconds = 1000
)

// Migrator is the main schema migration flow manager.
type Migrator struct {
	inspector        *Inspector
	applier          *Applier
	eventsStreamer   *EventsStreamer
	migrationContext *base.MigrationContext

	tablesInPlace              chan bool
	rowCopyComplete            chan bool
	allEventsUpToLockProcessed chan bool
	voluntaryLockAcquired      chan bool
	panicAbort                 chan error

	rowCopyCompleteFlag                    int64
	allEventsUpToLockProcessedInjectedFlag int64

	// copyRowsQueue should not be buffered; if buffered, some non-damaging but
	// excessive work happens at the end of the iteration as new copy-jobs arrive before
	// we realize the copy is complete
	copyRowsQueue    chan tableWriteFunc
	applyEventsQueue chan tableWriteFunc

	handledChangelogStates map[string]bool
}
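
// NewMigrator instantiates a Migrator, setting up its synchronization channels
// and work queues.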
func NewMigrator() *Migrator {
	migrator := &Migrator{
		migrationContext:           base.GetMigrationContext(),
		tablesInPlace:              make(chan bool),
		rowCopyComplete:            make(chan bool),
		allEventsUpToLockProcessed: make(chan bool),
		voluntaryLockAcquired:      make(chan bool, 1),
		panicAbort:                 make(chan error),

		allEventsUpToLockProcessedInjectedFlag: 0,

		copyRowsQueue:          make(chan tableWriteFunc),
		applyEventsQueue:       make(chan tableWriteFunc, applyEventsQueueBuffer),
		handledChangelogStates: make(map[string]bool),
	}
	return migrator
}

// acceptSignals registers for OS signals
func (this *Migrator) acceptSignals() {
	c := make(chan os.Signal, 1)

	signal.Notify(c, syscall.SIGHUP)
	go func() {
		for sig := range c {
			switch sig {
			case syscall.SIGHUP:
				log.Debugf("Received SIGHUP. Reloading configuration")
			}
		}
	}()
}
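
// shouldThrottle evaluates the throttling conditions (flag files, replication
// lag, max-load status variables) and returns whether throttling is in order,
// along with a textual reason.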
func (this *Migrator) shouldThrottle() (result bool, reason string) {
	// User-based throttle
	if this.migrationContext.ThrottleFlagFile != "" {
		if base.FileExists(this.migrationContext.ThrottleFlagFile) {
			// Throttle file defined and exists!
			return true, "flag-file"
		}
	}
	if this.migrationContext.ThrottleAdditionalFlagFile != "" {
		if base.FileExists(this.migrationContext.ThrottleAdditionalFlagFile) {
			// 2nd throttle file defined and exists!
			return true, "flag-file"
		}
	}
	// Replication lag throttle
	lag := atomic.LoadInt64(&this.migrationContext.CurrentLag)
	if time.Duration(lag) > time.Duration(this.migrationContext.MaxLagMillisecondsThrottleThreshold)*time.Millisecond {
		return true, fmt.Sprintf("lag=%fs", time.Duration(lag).Seconds())
	}
	if this.migrationContext.TestOnReplica && (atomic.LoadInt64(&this.allEventsUpToLockProcessedInjectedFlag) == 0) {
		replicationLag, err := mysql.GetMaxReplicationLag(this.migrationContext.InspectorConnectionConfig, this.migrationContext.ThrottleControlReplicaKeys, this.migrationContext.ReplictionLagQuery)
		if err != nil {
			return true, err.Error()
		}
		if replicationLag > time.Duration(this.migrationContext.MaxLagMillisecondsThrottleThreshold)*time.Millisecond {
			return true, fmt.Sprintf("replica-lag=%fs", replicationLag.Seconds())
		}
	}

	for variableName, threshold := range this.migrationContext.MaxLoad {
		value, err := this.applier.ShowStatusVariable(variableName)
		if err != nil {
			return true, fmt.Sprintf("%s %s", variableName, err)
		}
		if value > threshold {
			return true, fmt.Sprintf("%s=%d", variableName, value)
		}
	}

	return false, ""
}
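
// initiateThrottler evaluates the throttle status once per second, writing a
// changelog entry whenever the throttling state or reason changes.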
func (this *Migrator) initiateThrottler() error {
	throttlerTick := time.Tick(1 * time.Second)

	throttlerFunction := func() {
		alreadyThrottling, currentReason := this.migrationContext.IsThrottled()
		shouldThrottle, throttleReason := this.shouldThrottle()
		if shouldThrottle && !alreadyThrottling {
			// New throttling
			this.applier.WriteAndLogChangelog("throttle", throttleReason)
		} else if shouldThrottle && alreadyThrottling && (currentReason != throttleReason) {
			// Change of reason
			this.applier.WriteAndLogChangelog("throttle", throttleReason)
		} else if alreadyThrottling && !shouldThrottle {
			// End of throttling
			this.applier.WriteAndLogChangelog("throttle", "done throttling")
		}
		this.migrationContext.SetThrottled(shouldThrottle, throttleReason)
	}
	throttlerFunction()
	for range throttlerTick {
		throttlerFunction()
	}

	return nil
}

// throttle blocks while throttling is in effect: it sleeps, invoking the
// onThrottled callback (if given) once per second, and returns once the
// context reports throttling is over.
func (this *Migrator) throttle(onThrottled func()) {
	for {
		if shouldThrottle, _ := this.migrationContext.IsThrottled(); !shouldThrottle {
			return
		}
		if onThrottled != nil {
			onThrottled()
		}
		time.Sleep(time.Second)
	}
}

// sleepWhileTrue sleeps indefinitely until the given function returns 'false'
// (or fails with error)
func (this *Migrator) sleepWhileTrue(operation func() (bool, error)) error {
	for {
		shouldSleep, err := operation()
		if err != nil {
			return err
		}
		if !shouldSleep {
			return nil
		}
		time.Sleep(time.Second)
	}
}

// retryOperation attempts up to MaxRetries() runs of the given function,
// returning as soon as one of them succeeds. If all attempts fail, the last
// error is pushed onto panicAbort, terminating the migration.
func (this *Migrator) retryOperation(operation func() error) (err error) {
	maxRetries := this.migrationContext.MaxRetries()
	for i := 0; i < maxRetries; i++ {
		if i != 0 {
			// sleep after previous iteration
			time.Sleep(1 * time.Second)
		}
		err = operation()
		if err == nil {
			return nil
		}
		// there's an error. Let's try again.
	}
	this.panicAbort <- err
	return err
}

// executeAndThrottleOnError executes a given function. If it errors, it
// throttles.
func (this *Migrator) executeAndThrottleOnError(operation func() error) (err error) {
	if err := operation(); err != nil {
		this.throttle(nil)
		return err
	}
	return nil
}

// consumeRowCopyComplete blocks on the rowCopyComplete channel once, and then
// consumes and drops any further incoming events that may be left hanging.
func (this *Migrator) consumeRowCopyComplete() {
	<-this.rowCopyComplete
	atomic.StoreInt64(&this.rowCopyCompleteFlag, 1)

	go func() {
		for <-this.rowCopyComplete {
		}
	}()
}
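
// canStopStreaming always returns false for the time being: event streaming is
// never voluntarily stopped.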
func (this *Migrator) canStopStreaming() bool {
	return false
}
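
// onChangelogStateEvent reacts to a state-change hint read back from the
// changelog table, signalling the appropriate channel or queue.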
func (this *Migrator) onChangelogStateEvent(dmlEvent *binlog.BinlogDMLEvent) (err error) {
	// Hey, I created the changelog table, I know the type of columns it has!
	if hint := dmlEvent.NewColumnValues.StringColumn(2); hint != "state" {
		return nil
	}
	changelogState := ChangelogState(dmlEvent.NewColumnValues.StringColumn(3))
	switch changelogState {
	case TablesInPlace:
		{
			this.tablesInPlace <- true
		}
	case AllEventsUpToLockProcessed:
		{
			applyEventFunc := func() error {
				this.allEventsUpToLockProcessed <- true
				return nil
			}
			// at this point we know all events up to lock have been read from the streamer,
			// because the streamer works sequentially. So those events are either already handled,
			// or have event functions in applyEventsQueue.
			// So as not to create a potential deadlock, we write this func to applyEventsQueue
			// asynchronously, understanding it doesn't really matter.
			go func() {
				this.applyEventsQueue <- applyEventFunc
			}()
		}
	default:
		{
			return fmt.Errorf("Unknown changelog state: %+v", changelogState)
		}
	}
	log.Debugf("Received state %+v", changelogState)
	return nil
}
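
// onChangelogHeartbeat parses a heartbeat timestamp read from the changelog
// table and updates the current replication lag estimate accordingly.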
func (this *Migrator) onChangelogHeartbeat(heartbeatValue string) (err error) {
	heartbeatTime, err := time.Parse(time.RFC3339Nano, heartbeatValue)
	if err != nil {
		return log.Errore(err)
	}
	lag := time.Now().Sub(heartbeatTime)

	atomic.StoreInt64(&this.migrationContext.CurrentLag, int64(lag))

	return nil
}

// listenOnPanicAbort blocks until a panic-abort error arrives, then fatally
// terminates the migration.
func (this *Migrator) listenOnPanicAbort() {
	err := <-this.panicAbort
	log.Fatale(err)
}
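
// Migrate executes the complete migration flow: inspection, streaming and
// applying of events, row copy, and the final cut-over.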
func (this *Migrator) Migrate() (err error) {
	log.Infof("Migrating %s.%s", sql.EscapeName(this.migrationContext.DatabaseName), sql.EscapeName(this.migrationContext.OriginalTableName))
	this.migrationContext.StartTime = time.Now()

	go this.listenOnPanicAbort()
	if err := this.initiateInspector(); err != nil {
		return err
	}
	if err := this.initiateStreaming(); err != nil {
		return err
	}
	if err := this.initiateApplier(); err != nil {
		return err
	}

	log.Debugf("Waiting for tables to be in place")
	<-this.tablesInPlace
	log.Debugf("Tables are in place")
	// Yay! We now know the ghost and changelog tables are good to examine!
	// When running on replica, this means the replica has those tables. When running
	// on master this is always true, of course, and yet it also implies this knowledge
	// is in the binlogs.

	if err := this.inspector.InspectOriginalAndGhostTables(); err != nil {
		return err
	}
	if err := this.addDMLEventsListener(); err != nil {
		return err
	}
	go this.initiateHeartbeatListener()

	if err := this.applier.ReadMigrationRangeValues(); err != nil {
		return err
	}
	go this.initiateThrottler()
	go this.executeWriteFuncs()
	go this.iterateChunks()
	this.migrationContext.RowCopyStartTime = time.Now()
	go this.initiateStatus()

	log.Debugf("Operating until row copy is complete")
	this.consumeRowCopyComplete()
	log.Infof("Row copy complete")
	this.printStatus()

	if err := this.stopWritesAndCompleteMigration(); err != nil {
		return err
	}

	log.Infof("Done migrating %s.%s", sql.EscapeName(this.migrationContext.DatabaseName), sql.EscapeName(this.migrationContext.OriginalTableName))
	return nil
}

// stopWritesAndCompleteMigration performs the final step of migration, based on migration
// type (on replica? bumpy? safe?)
func (this *Migrator) stopWritesAndCompleteMigration() (err error) {
	if this.migrationContext.Noop {
		log.Debugf("Noop operation; not really swapping tables")
		return nil
	}
	this.throttle(func() {
		log.Debugf("throttling before swapping tables")
	})

	this.sleepWhileTrue(
		func() (bool, error) {
			if this.migrationContext.PostponeSwapTablesFlagFile == "" {
				return false, nil
			}
			if base.FileExists(this.migrationContext.PostponeSwapTablesFlagFile) {
				// Postpone file defined and exists!
				log.Debugf("Postponing final table swap as flag file exists: %+v", this.migrationContext.PostponeSwapTablesFlagFile)
				return true, nil
			}
			return false, nil
		},
	)

	if this.migrationContext.TestOnReplica {
		return this.stopWritesAndCompleteMigrationOnReplica()
	}
	// Running on master
	if this.migrationContext.QuickAndBumpySwapTables {
		return this.stopWritesAndCompleteMigrationOnMasterQuickAndBumpy()
	}

	{
		// Lock-based solution: we use low timeout and multiple attempts. But for
		// each failed attempt, we throttle until replication lag is back to normal
		if err := this.retryOperation(
			func() error {
				return this.executeAndThrottleOnError(this.stopWritesAndCompleteMigrationOnMasterViaLock)
			}); err != nil {
			return err
		}
		if err := this.dropOldTableIfRequired(); err != nil {
			return err
		}
	}

	return
}
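
// dropOldTableIfRequired drops the old (renamed-away original) table, but only
// if the OkToDropTable option is set.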
func (this *Migrator) dropOldTableIfRequired() (err error) {
	if !this.migrationContext.OkToDropTable {
		return nil
	}
	dropTableFunc := func() error {
		return this.applier.dropTable(this.migrationContext.GetOldTableName())
	}
	if err := this.retryOperation(dropTableFunc); err != nil {
		return err
	}
	return nil
}

// waitForEventsUpToLock injects the "AllEventsUpToLockProcessed" state hint,
// waits for it to appear in the binary logs, and thus makes sure the events
// queue is drained.
func (this *Migrator) waitForEventsUpToLock() (err error) {
	log.Infof("Writing changelog state: %+v", AllEventsUpToLockProcessed)
	if _, err := this.applier.WriteChangelogState(string(AllEventsUpToLockProcessed)); err != nil {
		return err
	}
	log.Infof("Waiting for events up to lock")
	atomic.StoreInt64(&this.allEventsUpToLockProcessedInjectedFlag, 1)
	<-this.allEventsUpToLockProcessed
	log.Infof("Done waiting for events up to lock")
	this.printStatus()

	return nil
}

// stopWritesAndCompleteMigrationOnMasterQuickAndBumpy will lock down the original table, execute
// what's left of last DML entries, and **non-atomically** swap original->old, then new->original.
// There is a point in time during which the "original" table does not exist, and queries are
// not blocked but instead fail.
func (this *Migrator) stopWritesAndCompleteMigrationOnMasterQuickAndBumpy() (err error) {
	if err := this.retryOperation(this.applier.LockTables); err != nil {
		return err
	}
	if err := this.retryOperation(this.waitForEventsUpToLock); err != nil {
		return err
	}
	if err := this.retryOperation(this.applier.SwapTablesQuickAndBumpy); err != nil {
		return err
	}
	if err := this.retryOperation(this.applier.UnlockTables); err != nil {
		return err
	}
	if err := this.dropOldTableIfRequired(); err != nil {
		return err
	}

	lockAndRenameDuration := this.migrationContext.RenameTablesEndTime.Sub(this.migrationContext.LockTablesStartTime)
	renameDuration := this.migrationContext.RenameTablesEndTime.Sub(this.migrationContext.RenameTablesStartTime)
	log.Debugf("Lock & rename duration: %s (rename only: %s). During this time, queries on %s were locked or failing", lockAndRenameDuration, renameDuration, sql.EscapeName(this.migrationContext.OriginalTableName))
	return nil
}

// stopWritesAndCompleteMigrationOnMasterViaLock will lock down the original table, execute
// what's left of last DML entries, and atomically swap & unlock (original->old && new->original)
func (this *Migrator) stopWritesAndCompleteMigrationOnMasterViaLock() (err error) {
	lockGrabbed := make(chan error, 1)
	okToReleaseLock := make(chan bool, 1)
	swapResult := make(chan error, 1)
	go func() {
		if err := this.applier.GrabVoluntaryLock(lockGrabbed, okToReleaseLock); err != nil {
			log.Errore(err)
		}
	}()
	if err := <-lockGrabbed; err != nil {
		return log.Errore(err)
	}
	blockingQuerySessionIdChan := make(chan int64, 1)
	go func() {
		this.applier.IssueBlockingQueryOnVoluntaryLock(blockingQuerySessionIdChan)
	}()
	blockingQuerySessionId := <-blockingQuerySessionIdChan
	log.Infof("Intentional blocking query connection id is %+v", blockingQuerySessionId)

	if err := this.retryOperation(
		func() error {
			return this.applier.ExpectProcess(blockingQuerySessionId, "User lock", this.migrationContext.GetVoluntaryLockName())
		}); err != nil {
		return err
	}
	log.Infof("Found blocking query to be executing")

	swapSessionIdChan := make(chan int64, 1)
	go func() {
		swapResult <- this.applier.SwapTablesAtomic(swapSessionIdChan)
	}()
	swapSessionId := <-swapSessionIdChan
	log.Infof("RENAME connection id is %+v", swapSessionId)
	if err := this.retryOperation(
		func() error {
			return this.applier.ExpectProcess(swapSessionId, "metadata lock", "rename")
		}); err != nil {
		return err
	}
	log.Infof("Found RENAME to be executing")

	// OK, at this time we know any newly incoming DML on original table is blocked.
	this.waitForEventsUpToLock()

	okToReleaseLock <- true
	// BAM: voluntary lock is released, blocking query is released, rename is released.
	// We now check RENAME result. We have lock_wait_timeout. We put it on purpose, to avoid
	// locking the tables for too long. If lock time exceeds said timeout, the RENAME fails
	// and returns a non-nil error, in which case tables have not been swapped, and we are
	// not really done. We are, however, good to go for more retries.
	if err := <-swapResult; err != nil {
		// Bummer. We shall rest a while and try again
		return err
	}

	// ooh nice! We're actually truly and thankfully done
	lockAndRenameDuration := this.migrationContext.RenameTablesEndTime.Sub(this.migrationContext.LockTablesStartTime)
	renameDuration := this.migrationContext.RenameTablesEndTime.Sub(this.migrationContext.RenameTablesStartTime)
	log.Debugf("Lock & rename duration: %s. Of this, rename time was %s. During rename time, queries on %s were blocked", lockAndRenameDuration, renameDuration, sql.EscapeName(this.migrationContext.OriginalTableName))
	return nil
}

// stopWritesAndCompleteMigrationOnReplica will stop replication IO thread, apply
// what DML events are left, and that's it.
// This only applies in --test-on-replica. It leaves replication stopped, with both tables
// in sync. There is no table swap.
func (this *Migrator) stopWritesAndCompleteMigrationOnReplica() (err error) {
	log.Debugf("testing on replica. Instead of LOCK tables I will STOP SLAVE")
	if err := this.retryOperation(this.applier.StopSlaveNicely); err != nil {
		return err
	}
	this.waitForEventsUpToLock()

	log.Info("Table duplicated with new schema. Am not touching the original table. Replication is stopped. You may now compare the two tables to gain trust into this tool's operation")
	return nil
}
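
// initiateInspector connects to the inspected server, validates and inspects
// the original table, and resolves the applier (master) connection config,
// honoring --test-on-replica and --allow-on-master semantics.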
func (this *Migrator) initiateInspector() (err error) {
	this.inspector = NewInspector()
	if err := this.inspector.InitDBConnections(); err != nil {
		return err
	}
	if err := this.inspector.ValidateOriginalTable(); err != nil {
		return err
	}
	if err := this.inspector.InspectOriginalTable(); err != nil {
		return err
	}
	// So far so good, table is accessible and valid.
	// Let's get master connection config
	if this.migrationContext.ApplierConnectionConfig, err = this.inspector.getMasterConnectionConfig(); err != nil {
		return err
	}
	if this.migrationContext.TestOnReplica {
		if this.migrationContext.InspectorIsAlsoApplier() {
			return fmt.Errorf("Instructed to --test-on-replica, but the server we connect to doesn't seem to be a replica")
		}
		log.Infof("--test-on-replica given. Will not execute on master %+v but rather on replica %+v itself",
			this.migrationContext.ApplierConnectionConfig.Key, this.migrationContext.InspectorConnectionConfig.Key,
		)
		this.migrationContext.ApplierConnectionConfig = this.migrationContext.InspectorConnectionConfig.Duplicate()
		if this.migrationContext.ThrottleControlReplicaKeys.Len() == 0 {
			this.migrationContext.ThrottleControlReplicaKeys.AddKey(this.migrationContext.InspectorConnectionConfig.Key)
		}
	} else if this.migrationContext.InspectorIsAlsoApplier() && !this.migrationContext.AllowedRunningOnMaster {
		return fmt.Errorf("It seems like this migration attempts to run directly on master. Preferably it would be executed on a replica (and this reduces load from the master). To proceed please provide --allow-on-master")
	}

	log.Infof("Master found to be %+v", this.migrationContext.ApplierConnectionConfig.Key)
	return nil
}
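
// initiateStatus invokes printStatus once immediately, then on every tick of a
// one-second ticker; printStatus itself decides whether to actually emit output.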
func (this *Migrator) initiateStatus() error {
	this.printStatus()
	statusTick := time.Tick(1 * time.Second)
	for range statusTick {
		go this.printStatus()
	}
	return nil
}
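
// printStatus prints a detailed progress status line: rows copied, events
// applied, backlog, elapsed time and ETA. Print frequency adapts to elapsed
// time, ETA and recent points of interest.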
func (this *Migrator) printStatus() {
	elapsedTime := this.migrationContext.ElapsedTime()
	elapsedSeconds := int64(elapsedTime.Seconds())
	totalRowsCopied := this.migrationContext.GetTotalRowsCopied()
	rowsEstimate := atomic.LoadInt64(&this.migrationContext.RowsEstimate)
	var progressPct float64
	if rowsEstimate > 0 {
		progressPct = 100.0 * float64(totalRowsCopied) / float64(rowsEstimate)
	}
	var etaSeconds float64 = math.MaxFloat64

	eta := "N/A"
	if isThrottled, throttleReason := this.migrationContext.IsThrottled(); isThrottled {
		eta = fmt.Sprintf("throttled, %s", throttleReason)
	} else if progressPct > 100.0 {
		eta = "Due"
	} else if progressPct >= 1.0 {
		elapsedRowCopySeconds := this.migrationContext.ElapsedRowCopyTime().Seconds()
		totalExpectedSeconds := elapsedRowCopySeconds * float64(rowsEstimate) / float64(totalRowsCopied)
		etaSeconds = totalExpectedSeconds - elapsedRowCopySeconds
		if etaSeconds >= 0 {
			etaDuration := time.Duration(etaSeconds) * time.Second
			eta = base.PrettifyDurationOutput(etaDuration)
		} else {
			eta = "Due"
		}
	}

	shouldPrintStatus := false
	if elapsedSeconds <= 60 {
		shouldPrintStatus = true
	} else if etaSeconds <= 60 {
		shouldPrintStatus = true
	} else if etaSeconds <= 180 {
		shouldPrintStatus = (elapsedSeconds%5 == 0)
	} else if elapsedSeconds <= 180 {
		shouldPrintStatus = (elapsedSeconds%5 == 0)
	} else if this.migrationContext.TimeSincePointOfInterest().Seconds() <= 60 {
		shouldPrintStatus = (elapsedSeconds%5 == 0)
	} else {
		shouldPrintStatus = (elapsedSeconds%30 == 0)
	}
	if !shouldPrintStatus {
		return
	}

	currentBinlogCoordinates := *this.eventsStreamer.GetCurrentBinlogCoordinates()

	status := fmt.Sprintf("Copy: %d/%d %.1f%%; Applied: %d; Backlog: %d/%d; Elapsed: %+v(copy), %+v(total); streamer: %+v; ETA: %s",
		totalRowsCopied, rowsEstimate, progressPct,
		atomic.LoadInt64(&this.migrationContext.TotalDMLEventsApplied),
		len(this.applyEventsQueue), cap(this.applyEventsQueue),
		base.PrettifyDurationOutput(this.migrationContext.ElapsedRowCopyTime()), base.PrettifyDurationOutput(elapsedTime),
		currentBinlogCoordinates,
		eta,
	)
	this.applier.WriteChangelog(
		fmt.Sprintf("copy iteration %d at %d", this.migrationContext.GetIteration(), time.Now().Unix()),
		status,
	)
	fmt.Println(status)
}
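
// initiateHeartbeatListener reads the changelog state at half the heartbeat
// interval, feeding any heartbeat values to onChangelogHeartbeat.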
func (this *Migrator) initiateHeartbeatListener() {
	ticker := time.Tick((heartbeatIntervalMilliseconds * time.Millisecond) / 2)
	for range ticker {
		go func() error {
			changelogState, err := this.inspector.readChangelogState()
			if err != nil {
				return log.Errore(err)
			}
			for hint, value := range changelogState {
				switch hint {
				case "heartbeat":
					{
						this.onChangelogHeartbeat(value)
					}
				}
			}
			return nil
		}()
	}
}

// initiateStreaming begins streaming of binary log events and registers listeners for such events
func (this *Migrator) initiateStreaming() error {
	this.eventsStreamer = NewEventsStreamer()
	if err := this.eventsStreamer.InitDBConnections(); err != nil {
		return err
	}
	this.eventsStreamer.AddListener(
		false,
		this.migrationContext.DatabaseName,
		this.migrationContext.GetChangelogTableName(),
		func(dmlEvent *binlog.BinlogDMLEvent) error {
			return this.onChangelogStateEvent(dmlEvent)
		},
	)

	go func() {
		log.Debugf("Beginning streaming")
		err := this.eventsStreamer.StreamEvents(this.canStopStreaming)
		if err != nil {
			this.panicAbort <- err
		}
		log.Debugf("Done streaming")
	}()
	return nil
}

// addDMLEventsListener registers a listener for binlog DML events on the
// original table; each event is wrapped as a write function and pushed onto
// the applyEventsQueue.
func (this *Migrator) addDMLEventsListener() error {
	err := this.eventsStreamer.AddListener(
		false,
		this.migrationContext.DatabaseName,
		this.migrationContext.OriginalTableName,
		func(dmlEvent *binlog.BinlogDMLEvent) error {
			applyEventFunc := func() error {
				return this.applier.ApplyDMLEventQuery(dmlEvent)
			}
			this.applyEventsQueue <- applyEventFunc
			return nil
		},
	)
	return err
}
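
// initiateApplier connects the applier, creates the ghost and changelog tables,
// writes the TablesInPlace state hint and kicks off the heartbeat.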
func (this *Migrator) initiateApplier() error {
	this.applier = NewApplier()
	if err := this.applier.InitDBConnections(); err != nil {
		return err
	}
	if err := this.applier.ValidateOrDropExistingTables(); err != nil {
		return err
	}
	if err := this.applier.CreateGhostTable(); err != nil {
		log.Errorf("Unable to create ghost table, see further error details. Perhaps a previous migration failed without dropping the table? Bailing out")
		return err
	}
	if err := this.applier.AlterGhost(); err != nil {
		log.Errorf("Unable to ALTER ghost table, see further error details. Bailing out")
		return err
	}
	if err := this.applier.CreateChangelogTable(); err != nil {
		log.Errorf("Unable to create changelog table, see further error details. Perhaps a previous migration failed without dropping the table? OR is there a running migration? Bailing out")
		return err
	}

	this.applier.WriteChangelogState(string(TablesInPlace))
	go this.applier.InitiateHeartbeat(heartbeatIntervalMilliseconds)
	return nil
}
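
// iterateChunks keeps pushing row copy tasks onto copyRowsQueue, one chunk at a
// time, until the iteration range is exhausted or row copy is flagged complete.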
func (this *Migrator) iterateChunks() error {
	terminateRowIteration := func(err error) error {
		this.rowCopyComplete <- true
		return log.Errore(err)
	}
	if this.migrationContext.Noop {
		log.Debugf("Noop operation; not really copying data")
		return terminateRowIteration(nil)
	}
	if this.migrationContext.MigrationRangeMinValues == nil {
		log.Debugf("No rows found in table. Rowcopy will be implicitly empty")
		return terminateRowIteration(nil)
	}
	for {
		if atomic.LoadInt64(&this.rowCopyCompleteFlag) == 1 {
			// Done
			return nil
		}
		copyRowsFunc := func() error {
			hasFurtherRange, err := this.applier.CalculateNextIterationRangeEndValues()
			if err != nil {
				return terminateRowIteration(err)
			}
			if !hasFurtherRange {
				return terminateRowIteration(nil)
			}
			applyCopyRowsFunc := func() error {
				_, rowsAffected, _, err := this.applier.ApplyIterationInsertQuery()
				if err != nil {
					return terminateRowIteration(err)
				}
				atomic.AddInt64(&this.migrationContext.TotalRowsCopied, rowsAffected)
				atomic.AddInt64(&this.migrationContext.Iteration, 1)
				return nil
			}
			return this.retryOperation(applyCopyRowsFunc)
		}
		this.copyRowsQueue <- copyRowsFunc
	}
	return nil
}
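
// executeWriteFuncs serves the write queues: it pops and executes tasks from
// the event-apply and row-copy queues, giving priority to event application
// over row copy.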
func (this *Migrator) executeWriteFuncs() error {
	if this.migrationContext.Noop {
		log.Debugf("Noop operation; not really executing write funcs")
		return nil
	}
	for {
		this.throttle(nil)

		// We give higher priority to event processing, then secondary priority to
		// rowcopy
		select {
		case applyEventFunc := <-this.applyEventsQueue:
			{
				if err := this.retryOperation(applyEventFunc); err != nil {
					return log.Errore(err)
				}
			}
		default:
			{
				select {
				case copyRowsFunc := <-this.copyRowsQueue:
					{
						// Retries are handled within the copyRowsFunc
						if err := copyRowsFunc(); err != nil {
							return log.Errore(err)
						}
					}
				default:
					{
						// Hmmmmm... nothing in the queue; no events, but also no row copy.
						// This is possible upon load. Let's just sleep it over.
						log.Debugf("Getting nothing in the write queue. Sleeping...")
						time.Sleep(time.Second)
					}
				}
			}
		}
	}
	return nil
}