[Commits] 90a9c4cae74: MDEV-20217: Semi_sync: Last_IO_Error: Fatal error: Failed to run 'after_queue_event' hook
revision-id: 90a9c4cae74d2ef1008e3f216026b7fd2697e46b (mariadb-10.3.18-15-g90a9c4cae74) parent(s): bfbf0f225179d79c1c568b93ddc8b5dd6a670072 author: Sujatha committer: Sujatha timestamp: 2019-09-16 15:45:24 +0530 message: MDEV-20217: Semi_sync: Last_IO_Error: Fatal error: Failed to run 'after_queue_event' hook Fix: === Implemented upstream fix. commit 7d3d0fc303183ef50a343680ce70df71d5675cd1 Author: He Zhenxing <zhenxing.he@sun.com> Backport Bug#45852 Semisynch: Last_IO_Error: Fatal error: Failed to run 'after_queue_event' hook Errors when send reply to master should never cause the IO thread to stop, because master can fall back to async replication if it does not get reply from slave. The problem is fixed by deliberately ignoring the return value of slave_reply. --- .../rpl/r/rpl_semi_sync_slave_reply_fail.result | 40 ++++++++++ .../rpl/t/rpl_semi_sync_slave_reply_fail.test | 87 ++++++++++++++++++++++ sql/slave.cc | 13 ++-- 3 files changed, 134 insertions(+), 6 deletions(-) diff --git a/mysql-test/suite/rpl/r/rpl_semi_sync_slave_reply_fail.result b/mysql-test/suite/rpl/r/rpl_semi_sync_slave_reply_fail.result new file mode 100644 index 00000000000..6b39b296cdf --- /dev/null +++ b/mysql-test/suite/rpl/r/rpl_semi_sync_slave_reply_fail.result @@ -0,0 +1,40 @@ +include/master-slave.inc +[connection master] +connection slave; +include/stop_slave.inc +connection master; +call mtr.add_suppression("Timeout waiting for reply of binlog*"); +set global rpl_semi_sync_master_enabled = ON; +SET @@GLOBAL.rpl_semi_sync_master_timeout=100; +create table t1 (i int); +connection slave; +set global rpl_semi_sync_slave_enabled = ON; +CALL mtr.add_suppression("Semi-sync slave net_flush*"); +SET @save_debug= @@global.debug; +SET GLOBAL debug_dbug="+d,semislave_failed_net_flush"; +include/start_slave.inc +connection master; +connection slave; +"Assert that the net_fulsh() reply failed is present in slave error log. +FOUND 1 /Semi-sync slave net_flush\(\) reply failed/ in mysqld.2.err +"Assert that Slave IO thread is up and running." +SHOW STATUS LIKE 'Slave_running'; +Variable_name Value +Slave_running ON +Slave_IO_Running= Yes +"Clear the network failure simulation." +SET GLOBAL debug_dbug= @save_debug; +connection master; +insert into t1 values (10); +connection slave; +connection slave; +# Compare the tables on master and slave. +include/diff_tables.inc [master:t1, slave:t1] +connection master; +drop table t1; +connection slave; +set global rpl_semi_sync_slave_enabled = OFF; +connection master; +set global rpl_semi_sync_master_enabled = OFF; +SET @@GLOBAL.rpl_semi_sync_master_timeout = 10000; +include/rpl_end.inc diff --git a/mysql-test/suite/rpl/t/rpl_semi_sync_slave_reply_fail.test b/mysql-test/suite/rpl/t/rpl_semi_sync_slave_reply_fail.test new file mode 100644 index 00000000000..f0eb474f00e --- /dev/null +++ b/mysql-test/suite/rpl/t/rpl_semi_sync_slave_reply_fail.test @@ -0,0 +1,87 @@ +# ==== Purpose ==== +# +# Test verifies that slave IO thread doesn't report an error, when slave fails +# to send an acknowledgment to master with semi sync replication in use. +# +# ==== Implementation ==== +# +# Steps: +# 0 - Have semi synchronous replication in use. +# 1 - Enable a debug simulation point which simulates network flush failure +# at the time of slave reply operation. +# 2 - Do some operation on master and wait for it to be replicated. Master +# will timeout waiting for reply from slave. +# 3 - Check the slave error log for appropriate error message regarding +# net_flush operation failure. +# 4 - Remove the debug simulation and do some more DML operations on master +# and wait for them to be replicated. +# 5 - Slave will be able to replicate and data is consistent on both master +# and slave. Semi sync will be automatically turned on. +# +# ==== References ==== +# +# MDEV-20217: Semi_sync: Last_IO_Error: Fatal error: Failed to run +# 'after_queue_event' hook +# +--source include/have_debug.inc +--source include/master-slave.inc + +--connection slave +--source include/stop_slave.inc + +--connection master +call mtr.add_suppression("Timeout waiting for reply of binlog*"); +--let $sav_timeout_master=`SELECT @@GLOBAL.rpl_semi_sync_master_timeout` +set global rpl_semi_sync_master_enabled = ON; +SET @@GLOBAL.rpl_semi_sync_master_timeout=100; +create table t1 (i int); + +--connection slave +set global rpl_semi_sync_slave_enabled = ON; +CALL mtr.add_suppression("Semi-sync slave net_flush*"); +SET @save_debug= @@global.debug; +SET GLOBAL debug_dbug="+d,semislave_failed_net_flush"; +--source include/start_slave.inc + +--connection master +--sync_slave_with_master + +# Check error log for correct messages. +let $log_error_= `SELECT @@GLOBAL.log_error`; +if(!$log_error_) +{ + # MySQL Server on windows is started with --console and thus + # does not know the location of its .err log, use default location + let $log_error_ = $MYSQLTEST_VARDIR/log/mysqld.2.err; +} +--echo "Assert that the net_fulsh() reply failed is present in slave error log. +--let SEARCH_FILE=$log_error_ +--let SEARCH_PATTERN=Semi-sync slave net_flush\(\) reply failed +--source include/search_pattern_in_file.inc + +--echo "Assert that Slave IO thread is up and running." +SHOW STATUS LIKE 'Slave_running'; +let $status= query_get_value("show slave status", Slave_IO_Running, 1); +echo Slave_IO_Running= $status; + +--echo "Clear the network failure simulation." +SET GLOBAL debug_dbug= @save_debug; + +--connection master +insert into t1 values (10); +--sync_slave_with_master + +--connection slave +--echo # Compare the tables on master and slave. +--let $diff_tables= master:t1, slave:t1 +--source include/diff_tables.inc + +--connection master +drop table t1; +--sync_slave_with_master +set global rpl_semi_sync_slave_enabled = OFF; + +--connection master +set global rpl_semi_sync_master_enabled = OFF; +--eval SET @@GLOBAL.rpl_semi_sync_master_timeout = $sav_timeout_master +--source include/rpl_end.inc diff --git a/sql/slave.cc b/sql/slave.cc index 4007f323a6e..165aa20b1ee 100644 --- a/sql/slave.cc +++ b/sql/slave.cc @@ -4911,13 +4911,14 @@ Stopping slave I/O thread due to out-of-memory error from master"); goto err; } - if (rpl_semi_sync_slave_status && (mi->semi_ack & SEMI_SYNC_NEED_ACK) && - repl_semisync_slave.slave_reply(mi)) + if (rpl_semi_sync_slave_status && (mi->semi_ack & SEMI_SYNC_NEED_ACK)) { - mi->report(ERROR_LEVEL, ER_SLAVE_FATAL_ERROR, NULL, - ER_THD(thd, ER_SLAVE_FATAL_ERROR), - "Failed to run 'after_queue_event' hook"); - goto err; + /* + We deliberately ignore the error in slave_reply, such error should + not cause the slave IO thread to stop, and the error messages are + already reported. + */ + (void)repl_semisync_slave.slave_reply(mi); } if (mi->using_gtid == Master_info::USE_GTID_NO &&
participants (1)
-
sujatha