Merge branch 'master' into fix-infinite-cutover-loop

This commit is contained in:
Shlomi Noach 2017-02-16 14:16:32 +02:00 committed by GitHub
commit 289ce46a2b
11 changed files with 115 additions and 28 deletions

View File

@ -146,8 +146,12 @@ gh-ost --allow-master-master --assume-master-host=a.specific.master.com
Topologies using _tungsten replicator_ are peculiar in that the participating servers are not actually aware they are replicating. The _tungsten replicator_ looks just like another app issuing queries on those hosts. `gh-ost` is unable to identify that a server participates in a _tungsten_ topology.
If you choose to migrate directly on master (see above), there's nothing special you need to do. If you choose to migrate via replica, then you must supply the identity of the master, and indicate this is a tungsten setup, as follows:
If you choose to migrate directly on master (see above), there's nothing special you need to do.
If you choose to migrate via replica, then you need to make sure Tungsten is configured with log-slave-updates parameter (note this is different from MySQL's own log-slave-updates parameter), otherwise changes will not be in the replica's binlog, causing data to be corrupted after table swap. You must also supply the identity of the master, and indicate this is a tungsten setup, as follows:
```
gh-ost --tungsten --assume-master-host=the.topology.master.com
```
Also note that `--switch-to-rbr` does not work for a Tungsten setup as the replication process is external, so you need to make sure `binlog_format` is set to ROW before Tungsten Replicator connects to the server and starts applying events from the master.

View File

@ -26,9 +26,9 @@ Both interfaces may serve at the same time. Both respond to simple text command,
- The `critical-load` format must be: `some_status=<numeric-threshold>[,some_status=<numeric-threshold>...]`'
- For example: `Threads_running=1000,threads_connected=5000`, and you would then write/echo `critical-load=Threads_running=1000,threads_connected=5000` to the socket.
- `nice-ratio=<ratio>`: change _nice_ ratio: 0 for aggressive (not nice, not sleeping), positive integer `n`:
- For any `1ms` spent copying rows, spend `n*1ms` units of time sleeping.
- Examples: assume a single rows chunk copy takes `100ms` to complete.
- `nice-ratio=0.5` will cause `gh-ost` to sleep for `50ms` immediately following.
- For any `1ms` spent copying rows, spend `n*1ms` units of time sleeping.
- Examples: assume a single rows chunk copy takes `100ms` to complete.
- `nice-ratio=0.5` will cause `gh-ost` to sleep for `50ms` immediately following.
- `nice-ratio=1` will cause `gh-ost` to sleep for `100ms`, effectively doubling runtime
- value of `2` will effectively triple the runtime; etc.
- `throttle-query`: change throttle query
@ -38,6 +38,10 @@ Both interfaces may serve at the same time. Both respond to simple text command,
- `unpostpone`: at a time where `gh-ost` is postponing the [cut-over](cut-over.md) phase, instruct `gh-ost` to stop postponing and proceed immediately to cut-over.
- `panic`: immediately panic and abort operation
### Querying for data
For commands that accept an argumetn as value, pass `?` (question mark) to _get_ current value rather than _set_ a new one.
### Examples
While migration is running:
@ -63,6 +67,11 @@ $ echo "chunk-size=250" | nc -U /tmp/gh-ost.test.sample_data_0.sock
# Serving on TCP port: 10001
```
```shell
$ echo "chunk-size=?" | nc -U /tmp/gh-ost.test.sample_data_0.sock
250
```
```shell
$ echo throttle | nc -U /tmp/gh-ost.test.sample_data_0.sock

View File

@ -135,7 +135,9 @@ type MigrationContext struct {
OriginalBinlogFormat string
OriginalBinlogRowImage string
InspectorConnectionConfig *mysql.ConnectionConfig
InspectorMySQLVersion string
ApplierConnectionConfig *mysql.ConnectionConfig
ApplierMySQLVersion string
StartTime time.Time
RowCopyStartTime time.Time
RowCopyEndTime time.Time
@ -559,7 +561,11 @@ func (this *MigrationContext) GetControlReplicasLagResult() mysql.ReplicationLag
func (this *MigrationContext) SetControlReplicasLagResult(lagResult *mysql.ReplicationLagResult) {
this.throttleMutex.Lock()
defer this.throttleMutex.Unlock()
this.controlReplicasLagResult = *lagResult
if lagResult == nil {
this.controlReplicasLagResult = *mysql.NewNoReplicationLagResult()
} else {
this.controlReplicasLagResult = *lagResult
}
}
func (this *MigrationContext) GetThrottleControlReplicaKeys() *mysql.InstanceKeyMap {

View File

@ -242,5 +242,5 @@ func main() {
migrator.ExecOnFailureHook()
log.Fatale(err)
}
log.Info("Done")
fmt.Fprintf(os.Stdout, "# Done\n")
}

View File

@ -70,14 +70,15 @@ func (this *Applier) InitDBConnections() (err error) {
if err := this.readTableColumns(); err != nil {
return err
}
log.Infof("Applier initiated on %+v, version %+v", this.connectionConfig.ImpliedKey, this.migrationContext.ApplierMySQLVersion)
return nil
}
// validateConnection issues a simple can-connect to MySQL
func (this *Applier) validateConnection(db *gosql.DB) error {
query := `select @@global.port`
query := `select @@global.port, @@global.version`
var port int
if err := db.QueryRow(query).Scan(&port); err != nil {
if err := db.QueryRow(query).Scan(&port, &this.migrationContext.ApplierMySQLVersion); err != nil {
return err
}
if port != this.connectionConfig.Key.Port {

View File

@ -60,6 +60,7 @@ func (this *Inspector) InitDBConnections() (err error) {
if err := this.applyBinlogFormat(); err != nil {
return err
}
log.Infof("Inspector initiated on %+v, version %+v", this.connectionConfig.ImpliedKey, this.migrationContext.InspectorMySQLVersion)
return nil
}
@ -168,9 +169,9 @@ func (this *Inspector) inspectOriginalAndGhostTables() (err error) {
// validateConnection issues a simple can-connect to MySQL
func (this *Inspector) validateConnection() error {
query := `select @@global.port`
query := `select @@global.port, @@global.version`
var port int
if err := this.db.QueryRow(query).Scan(&port); err != nil {
if err := this.db.QueryRow(query).Scan(&port, &this.migrationContext.InspectorMySQLVersion); err != nil {
return err
}
if port != this.connectionConfig.Key.Port {

View File

@ -791,6 +791,12 @@ func (this *Migrator) printMigrationStatusHint(writers ...io.Writer) {
throttleQuery,
))
}
if throttleControlReplicaKeys := this.migrationContext.GetThrottleControlReplicaKeys(); throttleControlReplicaKeys.Len() > 0 {
fmt.Fprintln(w, fmt.Sprintf("# throttle-control-replicas count: %+v",
throttleControlReplicaKeys.Len(),
))
}
if this.migrationContext.PostponeCutOverFlagFile != "" {
setIndicator := ""
if base.FileExists(this.migrationContext.PostponeCutOverFlagFile) {
@ -970,7 +976,9 @@ func (this *Migrator) initiateThrottler() error {
go this.throttler.initiateThrottlerCollection(this.firstThrottlingCollected)
log.Infof("Waiting for first throttle metrics to be collected")
<-this.firstThrottlingCollected
<-this.firstThrottlingCollected // replication lag
<-this.firstThrottlingCollected // other metrics
log.Infof("First throttle metrics collected")
go this.throttler.initiateThrottlerChecks()
return nil

View File

@ -126,7 +126,7 @@ func (this *Server) applyServerCommand(command string, writer *bufio.Writer) (pr
if len(tokens) > 1 {
arg = strings.TrimSpace(tokens[1])
}
argIsQuestion := (arg == "?")
throttleHint := "# Note: you may only throttle for as long as your binary logs are not purged\n"
if err := this.hooksExecutor.onInteractiveCommand(command); err != nil {
@ -152,6 +152,7 @@ no-throttle # End forced throttling (other throttling m
unpostpone # Bail out a cut-over postpone; proceed to cut-over
panic # panic and quit without cleanup
help # This message
- use '?' (question mark) as argument to get info rather than set. e.g. "max-load=?" will just print out current max-load.
`)
}
case "sup":
@ -160,6 +161,10 @@ help # This message
return ForcePrintStatusAndHintRule, nil
case "chunk-size":
{
if argIsQuestion {
fmt.Fprintf(writer, "%+v\n", atomic.LoadInt64(&this.migrationContext.ChunkSize))
return NoPrintStatusRule, nil
}
if chunkSize, err := strconv.Atoi(arg); err != nil {
return NoPrintStatusRule, err
} else {
@ -169,6 +174,10 @@ help # This message
}
case "max-lag-millis":
{
if argIsQuestion {
fmt.Fprintf(writer, "%+v\n", atomic.LoadInt64(&this.migrationContext.MaxLagMillisecondsThrottleThreshold))
return NoPrintStatusRule, nil
}
if maxLagMillis, err := strconv.Atoi(arg); err != nil {
return NoPrintStatusRule, err
} else {
@ -182,6 +191,10 @@ help # This message
}
case "nice-ratio":
{
if argIsQuestion {
fmt.Fprintf(writer, "%+v\n", this.migrationContext.GetNiceRatio())
return NoPrintStatusRule, nil
}
if niceRatio, err := strconv.ParseFloat(arg, 64); err != nil {
return NoPrintStatusRule, err
} else {
@ -191,6 +204,11 @@ help # This message
}
case "max-load":
{
if argIsQuestion {
maxLoad := this.migrationContext.GetMaxLoad()
fmt.Fprintf(writer, "%s\n", maxLoad.String())
return NoPrintStatusRule, nil
}
if err := this.migrationContext.ReadMaxLoad(arg); err != nil {
return NoPrintStatusRule, err
}
@ -198,6 +216,11 @@ help # This message
}
case "critical-load":
{
if argIsQuestion {
criticalLoad := this.migrationContext.GetCriticalLoad()
fmt.Fprintf(writer, "%s\n", criticalLoad.String())
return NoPrintStatusRule, nil
}
if err := this.migrationContext.ReadCriticalLoad(arg); err != nil {
return NoPrintStatusRule, err
}
@ -205,12 +228,20 @@ help # This message
}
case "throttle-query":
{
if argIsQuestion {
fmt.Fprintf(writer, "%+v\n", this.migrationContext.GetThrottleQuery())
return NoPrintStatusRule, nil
}
this.migrationContext.SetThrottleQuery(arg)
fmt.Fprintf(writer, throttleHint)
return ForcePrintStatusAndHintRule, nil
}
case "throttle-control-replicas":
{
if argIsQuestion {
fmt.Fprintf(writer, "%s\n", this.migrationContext.GetThrottleControlReplicaKeys().ToCommaDelimitedList())
return NoPrintStatusRule, nil
}
if err := this.migrationContext.ReadThrottleControlReplicaKeys(arg); err != nil {
return NoPrintStatusRule, err
}

View File

@ -84,21 +84,38 @@ func (this *Throttler) parseChangelogHeartbeat(heartbeatValue string) (err error
}
}
// collectHeartbeat reads the latest changelog heartbeat value
func (this *Throttler) collectHeartbeat() {
ticker := time.Tick(time.Duration(this.migrationContext.HeartbeatIntervalMilliseconds) * time.Millisecond)
for range ticker {
go func() error {
if atomic.LoadInt64(&this.migrationContext.CleanupImminentFlag) > 0 {
return nil
// collectReplicationLag reads the latest changelog heartbeat value
func (this *Throttler) collectReplicationLag(firstThrottlingCollected chan<- bool) {
collectFunc := func() error {
if atomic.LoadInt64(&this.migrationContext.CleanupImminentFlag) > 0 {
return nil
}
if this.migrationContext.TestOnReplica || this.migrationContext.MigrateOnReplica {
// when running on replica, the heartbeat injection is also done on the replica.
// This means we will always get a good heartbeat value.
// When runnign on replica, we should instead check the `SHOW SLAVE STATUS` output.
if lag, err := mysql.GetReplicationLag(this.inspector.connectionConfig); err != nil {
return log.Errore(err)
} else {
atomic.StoreInt64(&this.migrationContext.CurrentLag, int64(lag))
}
} else {
if heartbeatValue, err := this.inspector.readChangelogState("heartbeat"); err != nil {
return log.Errore(err)
} else {
this.parseChangelogHeartbeat(heartbeatValue)
}
return nil
}()
}
return nil
}
collectFunc()
firstThrottlingCollected <- true
ticker := time.Tick(time.Duration(this.migrationContext.HeartbeatIntervalMilliseconds) * time.Millisecond)
for range ticker {
go collectFunc()
}
}
@ -114,6 +131,7 @@ func (this *Throttler) collectControlReplicasLag() {
readReplicaLag := func(connectionConfig *mysql.ConnectionConfig) (lag time.Duration, err error) {
dbUri := connectionConfig.GetDBUri("information_schema")
var heartbeatValue string
if db, _, err := sqlutils.GetDB(dbUri); err != nil {
return lag, err
@ -158,9 +176,7 @@ func (this *Throttler) collectControlReplicasLag() {
// No need to read lag
return
}
if result := readControlReplicasLag(); result != nil {
this.migrationContext.SetControlReplicasLagResult(result)
}
this.migrationContext.SetControlReplicasLagResult(readControlReplicasLag())
}
aggressiveTicker := time.Tick(100 * time.Millisecond)
relaxedFactor := 10
@ -272,13 +288,14 @@ func (this *Throttler) collectGeneralThrottleMetrics() error {
// that may affect throttling. There are several components, all running independently,
// that collect such metrics.
func (this *Throttler) initiateThrottlerCollection(firstThrottlingCollected chan<- bool) {
go this.collectHeartbeat()
go this.collectReplicationLag(firstThrottlingCollected)
go this.collectControlReplicasLag()
go func() {
throttlerMetricsTick := time.Tick(1 * time.Second)
this.collectGeneralThrottleMetrics()
firstThrottlingCollected <- true
throttlerMetricsTick := time.Tick(1 * time.Second)
for range throttlerMetricsTick {
this.collectGeneralThrottleMetrics()
}

View File

@ -22,6 +22,14 @@ type ReplicationLagResult struct {
Err error
}
func NewNoReplicationLagResult() *ReplicationLagResult {
return &ReplicationLagResult{Lag: 0, Err: nil}
}
func (this *ReplicationLagResult) HasLag() bool {
return this.Lag > 0
}
// GetReplicationLag returns replication lag for a given connection config; either by explicit query
// or via SHOW SLAVE STATUS
func GetReplicationLag(connectionConfig *ConnectionConfig) (replicationLag time.Duration, err error) {
@ -32,9 +40,11 @@ func GetReplicationLag(connectionConfig *ConnectionConfig) (replicationLag time.
}
err = sqlutils.QueryRowsMap(db, `show slave status`, func(m sqlutils.RowMap) error {
slaveIORunning := m.GetString("Slave_IO_Running")
slaveSQLRunning := m.GetString("Slave_SQL_Running")
secondsBehindMaster := m.GetNullInt64("Seconds_Behind_Master")
if !secondsBehindMaster.Valid {
return fmt.Errorf("replication not running")
return fmt.Errorf("replication not running; Slave_IO_Running=%+v, Slave_SQL_Running=%+v", slaveIORunning, slaveSQLRunning)
}
replicationLag = time.Duration(secondsBehindMaster.Int64) * time.Second
return nil

View File

@ -88,7 +88,7 @@ test_single() {
--throttle-query='select timestampdiff(second, min(last_update), now()) < 5 from _gh_ost_test_ghc' \
--serve-socket-file=/tmp/gh-ost.test.sock \
--initially-drop-socket-file \
--postpone-cut-over-flag-file="" \
--postpone-cut-over-flag-file=/tmp/gh-ost.test.postpone.flag \
--test-on-replica \
--default-retries=1 \
--verbose \