revision-id: fb619954867232b3a733213a2703b2a425dff8e9 (mariadb-5.5.64-19-gfb619954867) parent(s): 7a7d9904e12335ee8b1eea9671138b3c469a3829 author: Sujatha committer: Sujatha timestamp: 2019-06-19 14:48:18 +0530 message: MDEV-15010: Wrong Seconds_Behind_Master when only starting the SQL_Thread Problem: ======= When I run replication normally, I get a value in "Seconds_Behind_Master" from "SHOW SLAVE STATUS". However, when I run only the SQL Thread (on local relay logs that have been downloaded previously), I have "NULL" in "Seconds_Behind_Master". I would expect to have a numeric value. Fix: === Implemented following changes. case 1: "Seconds_Behind_Master" shows 0, when SQL thread is in sync with IO thread and IO thread is running. case 2: "Seconds_Behind_Master" reports NULL, when SQL thread is in sync with IO thread and IO thread is stopped. case 3: "Seconds_Behind_Master" reports NULL, when SQL thread is stopped while IO thread is up and running. case 4: "Seconds_Behind_Master" reports a valid numerical value when IO thread is stopped and SQL thread is consuming existing relay log. --- .../suite/rpl/r/rpl_seconds_behind_master.result | 40 ++++++++++ .../suite/rpl/t/rpl_seconds_behind_master.test | 92 ++++++++++++++++++++++ sql/slave.cc | 82 ++++++++++++------- 3 files changed, 187 insertions(+), 27 deletions(-) diff --git a/mysql-test/suite/rpl/r/rpl_seconds_behind_master.result b/mysql-test/suite/rpl/r/rpl_seconds_behind_master.result new file mode 100644 index 00000000000..1c0ce7c1b19 --- /dev/null +++ b/mysql-test/suite/rpl/r/rpl_seconds_behind_master.result @@ -0,0 +1,40 @@ +include/master-slave.inc +[connection master] +CREATE TABLE t1 (a INT PRIMARY KEY, b INT); +****************************************************************** +* Case1: SQL thread is in sync with IO thread and IO thread is UP +* Seconds_Behind_Master should be '0'. +****************************************************************** +include/assert.inc [Seconds_Behind_Master should be 0] +****************************************************************** +* Case2: SQL thread is in sync with IO thread and IO thread is +* stopped. +* Seconds_Behind_Master should be 'NULL'. +****************************************************************** +include/stop_slave_io.inc +include/assert.inc [Seconds_Behind_Master should be NULL] +****************************************************************** +* Case3: SQL thread is stopped while IO thread is up and running. +* Seconds_Behind_Master should be 'NULL'. +****************************************************************** +START SLAVE IO_THREAD; +include/wait_for_slave_io_to_start.inc +include/stop_slave_sql.inc +include/assert.inc [Seconds_Behind_Master should be NULL] +****************************************************************** +* Case4: IO thread is stopped. SQL thread is consuming existing relay +* log. Hence the Seconds_Behind_Master should have a numeric +* value > 0 +****************************************************************** +INSERT INTO t1 VALUES (1,sleep(2)); +INSERT INTO t1 VALUES (2,sleep(2)); +include/sync_slave_io_with_master.inc +include/stop_slave_io.inc +START SLAVE SQL_THREAD; +include/wait_for_slave_sql_to_start.inc +include/assert.inc [Seconds_Behind_Master should be > 0] +START SLAVE IO_THREAD; +include/wait_for_slave_io_to_start.inc +******* CLEANUP ********* +DROP TABLE t1; +include/rpl_end.inc diff --git a/mysql-test/suite/rpl/t/rpl_seconds_behind_master.test b/mysql-test/suite/rpl/t/rpl_seconds_behind_master.test new file mode 100644 index 00000000000..6f9ed2939e1 --- /dev/null +++ b/mysql-test/suite/rpl/t/rpl_seconds_behind_master.test @@ -0,0 +1,92 @@ +# ==== Purpose ==== +# +# Test verifies that Seconds_Behind_Master(SBM) values are as expected in various +# scenario listed below. +# +# case 1: SBM=0 if SQL thread is in sync with IO thread and IO thread is UP +# case 2: SBM=NULL if SQL thread is in sync with IO thread and IO thread is +# stopped. +# case 3: SBM=NULL if SQL thread is stopped while IO thread is up and running. +# case 4: SBM > 0 and SBM != NULL if IO thread is stopped. SQL thread is +# consuming existing relay log. +# +# ==== Implementation ==== +# +# Steps: +# 1 - Create a table and sync it with slave and verify SBM value. +# 2 - Now stop IO thread and verify SBM +# 3 - Start IO thread, Stop SQL thread and verify SBM +# 4 - Execute slow INSERT statements with SLEEP(2) on master. Ensure that +# the IO thread has read these INSERT statements and then bring it down. +# 5 - Start the SQL thread and verify that while it is trying to consume +# these slow INSERT statements in relay log the SBM value is > 0. +# +# ==== References ==== +# +# MDEV-15010: Wrong Seconds_Behind_Master when only starting the SQL_Thread. + +--source include/have_binlog_format_statement.inc +--source include/master-slave.inc + +--disable_query_log +CALL mtr.add_suppression("Unsafe statement written to the binary log using statement format since BINLOG_FORMAT = STATEMENT"); +--enable_query_log + +CREATE TABLE t1 (a INT PRIMARY KEY, b INT); +--sync_slave_with_master + +-- echo ****************************************************************** +-- echo * Case1: SQL thread is in sync with IO thread and IO thread is UP +-- echo * Seconds_Behind_Master should be '0'. +-- echo ****************************************************************** +--let $assert_cond= [SHOW SLAVE STATUS, Seconds_Behind_Master, 1] = 0 +--let $assert_text= Seconds_Behind_Master should be 0 +--source include/assert.inc + +-- echo ****************************************************************** +-- echo * Case2: SQL thread is in sync with IO thread and IO thread is +-- echo * stopped. +-- echo * Seconds_Behind_Master should be 'NULL'. +-- echo ****************************************************************** +--source include/stop_slave_io.inc +--let $assert_cond= "[SHOW SLAVE STATUS, Seconds_Behind_Master, 1]" = "NULL" +--let $assert_text= Seconds_Behind_Master should be NULL +--source include/assert.inc + +-- echo ****************************************************************** +-- echo * Case3: SQL thread is stopped while IO thread is up and running. +-- echo * Seconds_Behind_Master should be 'NULL'. +-- echo ****************************************************************** +START SLAVE IO_THREAD; +--source include/wait_for_slave_io_to_start.inc +--source include/stop_slave_sql.inc +--let $assert_cond= "[SHOW SLAVE STATUS, Seconds_Behind_Master, 1]" = "NULL" +--let $assert_text= Seconds_Behind_Master should be NULL +--source include/assert.inc + +-- echo ****************************************************************** +-- echo * Case4: IO thread is stopped. SQL thread is consuming existing relay +-- echo * log. Hence the Seconds_Behind_Master should have a numeric +-- echo * value > 0 +-- echo ****************************************************************** +--connection master +--disable_warnings +INSERT INTO t1 VALUES (1,sleep(2)); +INSERT INTO t1 VALUES (2,sleep(2)); +--enable_warnings +--source include/sync_slave_io_with_master.inc +--source include/stop_slave_io.inc + +START SLAVE SQL_THREAD; +--source include/wait_for_slave_sql_to_start.inc +--let $assert_cond= [SHOW SLAVE STATUS, Seconds_Behind_Master, 1] > 0 +--let $assert_text= Seconds_Behind_Master should be > 0 +--source include/assert.inc +START SLAVE IO_THREAD; +--source include/wait_for_slave_io_to_start.inc + +--echo ******* CLEANUP ********* +--connection master +DROP TABLE t1; + +--source include/rpl_end.inc diff --git a/sql/slave.cc b/sql/slave.cc index a8946c69d18..7b4b7bd857e 100644 --- a/sql/slave.cc +++ b/sql/slave.cc @@ -2192,38 +2192,66 @@ bool show_master_info(THD* thd, Master_info* mi) protocol->store(mi->ssl_cert, &my_charset_bin); protocol->store(mi->ssl_cipher, &my_charset_bin); protocol->store(mi->ssl_key, &my_charset_bin); - /* - Seconds_Behind_Master: if SQL thread is running and I/O thread is - connected, we can compute it otherwise show NULL (i.e. unknown). + The pseudo code to compute Seconds_Behind_Master: + if (SQL thread is running) + { + if (SQL thread processed all the available relay log) + { + if (IO thread is running) + print 0; + else + print NULL; + } + else + compute Seconds_Behind_Master; + } + else + print NULL; */ - if ((mi->slave_running == MYSQL_SLAVE_RUN_CONNECT) && - mi->rli.slave_running) + if (mi->rli.slave_running) { - long time_diff= ((long)(time(0) - mi->rli.last_master_timestamp) - - mi->clock_diff_with_master); /* - Apparently on some systems time_diff can be <0. Here are possible - reasons related to MySQL: - - the master is itself a slave of another master whose time is ahead. - - somebody used an explicit SET TIMESTAMP on the master. - Possible reason related to granularity-to-second of time functions - (nothing to do with MySQL), which can explain a value of -1: - assume the master's and slave's time are perfectly synchronized, and - that at slave's connection time, when the master's timestamp is read, - it is at the very end of second 1, and (a very short time later) when - the slave's timestamp is read it is at the very beginning of second - 2. Then the recorded value for master is 1 and the recorded value for - slave is 2. At SHOW SLAVE STATUS time, assume that the difference - between timestamp of slave and rli->last_master_timestamp is 0 - (i.e. they are in the same second), then we get 0-(2-1)=-1 as a result. - This confuses users, so we don't go below 0: hence the max(). - - last_master_timestamp == 0 (an "impossible" timestamp 1970) is a - special marker to say "consider we have caught up". + Check if SQL thread is at the end of relay log + Checking should be done using two conditions + condition1: compare the log positions and + condition2: compare the file names (to handle rotation case) */ - protocol->store((longlong)(mi->rli.last_master_timestamp ? - max(0, time_diff) : 0)); + if ((mi->master_log_pos == mi->rli.group_master_log_pos) && + (!strcmp(mi->master_log_name, mi->rli.group_master_log_name))) + { + if (mi->slave_running == MYSQL_SLAVE_RUN_CONNECT) + protocol->store(0LL); + else + protocol->store_null(); + } + else + { + long time_diff = ((long)(time(0) - mi->rli.last_master_timestamp) + - mi->clock_diff_with_master); + /* + Apparently on some systems time_diff can be <0. Here are possible + reasons related to MySQL: + - the master is itself a slave of another master whose time is ahead. + - somebody used an explicit SET TIMESTAMP on the master. + Possible reason related to granularity-to-second of time functions + (nothing to do with MySQL), which can explain a value of -1: + assume the master's and slave's time are perfectly synchronized, and + that at slave's connection time, when the master's timestamp is read, + it is at the very end of second 1, and (a very short time later) when + the slave's timestamp is read it is at the very beginning of second + 2. Then the recorded value for master is 1 and the recorded value for + slave is 2. At SHOW SLAVE STATUS time, assume that the difference + between timestamp of slave and rli->last_master_timestamp is 0 + (i.e. they are in the same second), then we get 0-(2-1)=-1 as a result. + This confuses users, so we don't go below 0: hence the max(). + + last_master_timestamp == 0 (an "impossible" timestamp 1970) is a + special marker to say "consider we have caught up". + */ + protocol->store((longlong)(mi->rli.last_master_timestamp ? + max(0, time_diff) : 0)); + } } else {