[PATCH 1/2] MDEV-33798: ROW base optimistic deadlock with concurrent writes on same table
One case is conflicting transactions T1 and T2 with different domain id, in optimistic parallel replication in non-GTID mode. Then T2 will wait_for_prior_commit on T1; and if T1 got a row lock wait on T2 it would hang, as different domains caused the deadlock kill to be skipped in thd_rpl_deadlock_check(). More generally, if we have transactions T1 and T2 in one domain/master connection, and independently transactions U1 and U2 in another, then we can still deadlock like this: T1 row low wait on U2 U2 wait_for_prior_commit on U1 U1 row lock wait on T2 T2 wait_for_prior_commit on T1 This commit enforces the deadlock kill in these cases. If the waited-for transaction is speculatively applied, then it will be deadlock killed in case of a conflict, even if the two transactions are in different domains or master connections. Signed-off-by: Kristian Nielsen <knielsen@knielsen-hq.org> --- mysql-test/suite/rpl/r/rpl_mdev33798.result | 143 +++++++++++++++ mysql-test/suite/rpl/t/rpl_mdev33798.cnf | 17 ++ mysql-test/suite/rpl/t/rpl_mdev33798.test | 182 ++++++++++++++++++++ sql/sql_class.cc | 42 ++++- 4 files changed, 376 insertions(+), 8 deletions(-) create mode 100644 mysql-test/suite/rpl/r/rpl_mdev33798.result create mode 100644 mysql-test/suite/rpl/t/rpl_mdev33798.cnf create mode 100644 mysql-test/suite/rpl/t/rpl_mdev33798.test diff --git a/mysql-test/suite/rpl/r/rpl_mdev33798.result b/mysql-test/suite/rpl/r/rpl_mdev33798.result new file mode 100644 index 00000000000..8796e948546 --- /dev/null +++ b/mysql-test/suite/rpl/r/rpl_mdev33798.result @@ -0,0 +1,143 @@ +include/rpl_init.inc [topology=1->2,1->3] +connect server_2b,127.0.0.1,root,,,$SERVER_MYPORT_2; +connection server_2; +SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads; +SET @old_parallel_mode= @@GLOBAL.slave_parallel_mode; +SET @old_timeout= @@GLOBAL.lock_wait_timeout; +SET @old_innodb_timeout= @@GLOBAL.innodb_lock_wait_timeout; +include/stop_slave.inc +SET GLOBAL slave_parallel_threads=5; +set global slave_parallel_mode= aggressive; +SET GLOBAL lock_wait_timeout= 86400; +SET GLOBAL innodb_lock_wait_timeout= 86400; +SET STATEMENT sql_log_bin=0 FOR ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; +include/start_slave.inc +connection server_1; +CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB; +INSERT INTO t1 VALUES (1, 0), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0), (7, 0), (8, 0); +connection server_2; +include/stop_slave.inc +connection server_2b; +BEGIN; +SELECT * FROM t1 WHERE a=1 FOR UPDATE; +a b +1 0 +SELECT * FROM t1 WHERE a=5 FOR UPDATE; +a b +5 0 +connection server_1; +SET SESSION gtid_domain_id= 1; +BEGIN; +UPDATE t1 SET b=1 WHERE a=1; +UPDATE t1 SET b=1 WHERE a=7; +COMMIT; +UPDATE t1 SET b=2 WHERE a=3; +SET SESSION gtid_domain_id=2; +BEGIN; +UPDATE t1 SET b=3 WHERE a=5; +UPDATE t1 SET b=3 WHERE a=3; +COMMIT; +UPDATE t1 SET b=4 WHERE a=7; +SET SESSION gtid_domain_id= 0; +include/save_master_gtid.inc +connection server_2; +include/start_slave.inc +connection server_2b; +ROLLBACK; +connection server_2; +include/sync_with_master_gtid.inc +SELECT a, ( +(a=1 AND b=1) OR +(a=3 AND (b=2 OR b=3)) OR +(a=5 AND b=3) OR +(a=7 AND (b=1 OR b=4)) OR +((a MOD 2)=0 AND b=0)) AS `ok` + FROM t1 +ORDER BY a; +a ok +1 1 +2 1 +3 1 +4 1 +5 1 +6 1 +7 1 +8 1 +connection server_3; +include/sync_with_master_gtid.inc +include/stop_slave.inc +connection server_2; +include/stop_slave.inc +CHANGE MASTER 'm2' to master_port=MYPORT_3 , master_host='127.0.0.1', master_user='root', master_use_gtid=slave_pos; +connection server_1; +SET SESSION gtid_domain_id= 1; +BEGIN; +UPDATE t1 SET b=11 WHERE a=1; +UPDATE t1 SET b=11 WHERE a=7; +COMMIT; +UPDATE t1 SET b=12 WHERE a=3; +SET SESSION gtid_domain_id= 1; +connection server_3; +SET SESSION gtid_domain_id=3; +BEGIN; +UPDATE t1 SET b=13 WHERE a=5; +UPDATE t1 SET b=13 WHERE a=3; +COMMIT; +UPDATE t1 SET b=14 WHERE a=7; +include/save_master_gtid.inc +connection server_2b; +BEGIN; +SELECT * FROM t1 WHERE a=1 FOR UPDATE; +a b +1 1 +SELECT * FROM t1 WHERE a=5 FOR UPDATE; +a b +5 3 +START ALL SLAVES; +Warnings: +Note 1937 SLAVE 'm2' started +Note 1937 SLAVE '' started +connection server_2b; +ROLLBACK; +connection server_1; +include/save_master_gtid.inc +connection server_2; +include/sync_with_master_gtid.inc +connection server_3; +include/save_master_gtid.inc +connection server_2; +include/sync_with_master_gtid.inc +SELECT a, ( +(a=1 AND b=11) OR +(a=3 AND (b=12 OR b=13)) OR +(a=5 AND b=13) OR +(a=7 AND (b=11 OR b=14)) OR +((a MOD 2)=0 AND b=0)) AS `ok` + FROM t1 +ORDER BY a; +a ok +1 1 +2 1 +3 1 +4 1 +5 1 +6 1 +7 1 +8 1 +SET default_master_connection = 'm2'; +include/stop_slave.inc +RESET SLAVE 'm2' ALL; +SET default_master_connection = ''; +connection server_3; +include/start_slave.inc +disconnect server_2b; +connection server_1; +DROP TABLE t1; +connection server_2; +include/stop_slave.inc +SET GLOBAL slave_parallel_threads=@old_parallel_threads; +set global slave_parallel_mode= @old_parallel_mode; +SET GLOBAL lock_wait_timeout= @old_timeout; +SET GLOBAL innodb_lock_wait_timeout= @old_innodb_timeout; +include/start_slave.inc +include/rpl_end.inc diff --git a/mysql-test/suite/rpl/t/rpl_mdev33798.cnf b/mysql-test/suite/rpl/t/rpl_mdev33798.cnf new file mode 100644 index 00000000000..8e5125ea6ca --- /dev/null +++ b/mysql-test/suite/rpl/t/rpl_mdev33798.cnf @@ -0,0 +1,17 @@ +!include suite/rpl/my.cnf + +[mysqld.1] +log-slave-updates +loose-innodb + +[mysqld.2] +log-slave-updates +loose-innodb + +[mysqld.3] +log-slave-updates +loose-innodb + +[ENV] +SERVER_MYPORT_3= @mysqld.3.port +SERVER_MYSOCK_3= @mysqld.3.socket diff --git a/mysql-test/suite/rpl/t/rpl_mdev33798.test b/mysql-test/suite/rpl/t/rpl_mdev33798.test new file mode 100644 index 00000000000..1448ed91133 --- /dev/null +++ b/mysql-test/suite/rpl/t/rpl_mdev33798.test @@ -0,0 +1,182 @@ +--source include/have_innodb.inc +--source include/have_log_bin.inc +--let $rpl_topology=1->2,1->3 +--source include/rpl_init.inc +--connect (server_2b,127.0.0.1,root,,,$SERVER_MYPORT_2) + +--connection server_2 +SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads; +SET @old_parallel_mode= @@GLOBAL.slave_parallel_mode; +SET @old_timeout= @@GLOBAL.lock_wait_timeout; +SET @old_innodb_timeout= @@GLOBAL.innodb_lock_wait_timeout; +--source include/stop_slave.inc +SET GLOBAL slave_parallel_threads=5; +set global slave_parallel_mode= aggressive; +# High timeout so we get replication sync error and test failure if the +# conflict handling is insufficient and lock wait timeout occurs. +SET GLOBAL lock_wait_timeout= 86400; +SET GLOBAL innodb_lock_wait_timeout= 86400; +SET STATEMENT sql_log_bin=0 FOR ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; +--source include/start_slave.inc + +--connection server_1 +CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB; +INSERT INTO t1 VALUES (1, 0), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0), (7, 0), (8, 0); +--save_master_pos + +--connection server_2 +--sync_with_master +--source include/stop_slave.inc + +# Test the following scenario: +# +# Transactions T1, T2 in domain 1, U1, U2 in domain 2. +# Wait cycle T1->U2->U1->T2->T1 as follows: +# T1 row lock wait on U2 +# U2 wait_for_prior_commit on U1 +# U1 row lock wait on T2 +# T2 wait_for_prior_commit on T1 +# +# Test that the wait cycle is broken correctly with deadlock kill. + +--connection server_2b +# Temporarily block T1 and U1. +BEGIN; +SELECT * FROM t1 WHERE a=1 FOR UPDATE; +SELECT * FROM t1 WHERE a=5 FOR UPDATE; + +--connection server_1 + +SET SESSION gtid_domain_id= 1; +# T1 in domain 1 +BEGIN; +UPDATE t1 SET b=1 WHERE a=1; +UPDATE t1 SET b=1 WHERE a=7; +COMMIT; +# T2 in domain 1 +UPDATE t1 SET b=2 WHERE a=3; + +SET SESSION gtid_domain_id=2; +# U1 in domain 2 +BEGIN; +UPDATE t1 SET b=3 WHERE a=5; +UPDATE t1 SET b=3 WHERE a=3; +COMMIT; +# U2 in domain 2 +UPDATE t1 SET b=4 WHERE a=7; +SET SESSION gtid_domain_id= 0; +--source include/save_master_gtid.inc + +--connection server_2 +--source include/start_slave.inc +# Wait until T2, U2 are holding the row locks. +--let $wait_condition= SELECT COUNT(*)=2 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE state LIKE '%Waiting for prior transaction to commit%' +--source include/wait_condition.inc + +# Then let T1, U1 continue to conflict on the row locks, and check that +# replication correctly handles the conflict. +--connection server_2b +ROLLBACK; + +--connection server_2 +--source include/sync_with_master_gtid.inc + +# Allow either domain to "win" on the conflicting updates. +SELECT a, ( + (a=1 AND b=1) OR + (a=3 AND (b=2 OR b=3)) OR + (a=5 AND b=3) OR + (a=7 AND (b=1 OR b=4)) OR + ((a MOD 2)=0 AND b=0)) AS `ok` + FROM t1 + ORDER BY a; + +# Now try the same thing with multi-source replication. + +# Make server_3 a second master +--connection server_3 +--source include/sync_with_master_gtid.inc +--source include/stop_slave.inc + +--connection server_2 +--source include/stop_slave.inc +--replace_result $SERVER_MYPORT_3 MYPORT_3 +eval CHANGE MASTER 'm2' to master_port=$SERVER_MYPORT_3 , master_host='127.0.0.1', master_user='root', master_use_gtid=slave_pos; + +--connection server_1 + +SET SESSION gtid_domain_id= 1; +# T1 in domain 1 +BEGIN; +UPDATE t1 SET b=11 WHERE a=1; +UPDATE t1 SET b=11 WHERE a=7; +COMMIT; +# T2 in domain 1 +UPDATE t1 SET b=12 WHERE a=3; +SET SESSION gtid_domain_id= 1; + +--connection server_3 +SET SESSION gtid_domain_id=3; +# U1 in domain 3 +BEGIN; +UPDATE t1 SET b=13 WHERE a=5; +UPDATE t1 SET b=13 WHERE a=3; +COMMIT; +# U2 in domain 3 +UPDATE t1 SET b=14 WHERE a=7; +--source include/save_master_gtid.inc + +--connection server_2b +# Temporarily block T1 and U1. +BEGIN; +SELECT * FROM t1 WHERE a=1 FOR UPDATE; +SELECT * FROM t1 WHERE a=5 FOR UPDATE; + +START ALL SLAVES; +# Wait until T2, U2 are holding the row locks. +--let $wait_condition= SELECT COUNT(*)=2 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE state LIKE '%Waiting for prior transaction to commit%' +--source include/wait_condition.inc + +--connection server_2b +ROLLBACK; + +--connection server_1 +--source include/save_master_gtid.inc +--connection server_2 +--source include/sync_with_master_gtid.inc +--connection server_3 +--source include/save_master_gtid.inc +--connection server_2 +--source include/sync_with_master_gtid.inc + +SELECT a, ( + (a=1 AND b=11) OR + (a=3 AND (b=12 OR b=13)) OR + (a=5 AND b=13) OR + (a=7 AND (b=11 OR b=14)) OR + ((a MOD 2)=0 AND b=0)) AS `ok` + FROM t1 + ORDER BY a; + +SET default_master_connection = 'm2'; +--source include/stop_slave.inc +RESET SLAVE 'm2' ALL; +SET default_master_connection = ''; + +--connection server_3 +--source include/start_slave.inc + +# Cleanup + +--disconnect server_2b +--connection server_1 +DROP TABLE t1; +--connection server_2 +--source include/stop_slave.inc +SET GLOBAL slave_parallel_threads=@old_parallel_threads; +set global slave_parallel_mode= @old_parallel_mode; +SET GLOBAL lock_wait_timeout= @old_timeout; +SET GLOBAL innodb_lock_wait_timeout= @old_innodb_timeout; +--source include/start_slave.inc + +--source include/rpl_end.inc diff --git a/sql/sql_class.cc b/sql/sql_class.cc index 0d39f359a09..6026e1e13bb 100644 --- a/sql/sql_class.cc +++ b/sql/sql_class.cc @@ -5384,14 +5384,40 @@ thd_rpl_deadlock_check(MYSQL_THD thd, MYSQL_THD other_thd) return 0; if (!rgi->is_parallel_exec) return 0; - if (rgi->rli != other_rgi->rli) - return 0; - if (!rgi->gtid_sub_id || !other_rgi->gtid_sub_id) - return 0; - if (rgi->current_gtid.domain_id != other_rgi->current_gtid.domain_id) - return 0; - if (rgi->gtid_sub_id > other_rgi->gtid_sub_id) - return 0; + if (rgi->rli == other_rgi->rli) + { + /* + Within the same master connection, we can compare transaction order on + the GTID sub_id, and rollback the later transaction to allow the earlier + transaction to commit first. + */ + if (!rgi->gtid_sub_id || !other_rgi->gtid_sub_id) + return 0; + if (rgi->current_gtid.domain_id != other_rgi->current_gtid.domain_id && + other_rgi->speculation != rpl_group_info::SPECULATE_OPTIMISTIC) + return 0; + if (rgi->gtid_sub_id > other_rgi->gtid_sub_id) + return 0; + } + else + { + /* + Lock conflicts between different master connection should usually not + occur, but could still happen if user is running some special setup that + tolerates conflicting updates (or in case of user error). We do not have a + pre-defined ordering of transactions in this case, but we still need to + handle conflicts in _some_ way to avoid undetected deadlocks and hangs. + + We do this by rolling back and retrying any transaction that is being + _optimistically_ applied. This can be overly conservative in some cases, + but should be fine as conflicts between different master connections are + not expected to be common. And it ensures that we won't end up in a + deadlock and hang due to a transaction doing wait_for_prior_commit while + holding locks that block something in another master connection. + */ + if (other_rgi->speculation != rpl_group_info::SPECULATE_OPTIMISTIC) + return 0; + } if (rgi->finish_event_group_called || other_rgi->finish_event_group_called) { /* -- 2.30.2
Clear any pending deadlock kill after completing XA PREPARE, and before updating the mysql.gtid_slave_pos table in a separate transaction. Signed-off-by: Kristian Nielsen <knielsen@knielsen-hq.org> --- sql/log_event_server.cc | 13 +++++++++++++ sql/rpl_parallel.cc | 2 +- sql/rpl_parallel.h | 1 + 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/sql/log_event_server.cc b/sql/log_event_server.cc index 003774c24aa..7aa43a14b4d 100644 --- a/sql/log_event_server.cc +++ b/sql/log_event_server.cc @@ -4547,6 +4547,19 @@ int XA_prepare_log_event::do_commit() else res= trans_xa_commit(thd); + if (thd->rgi_slave->is_parallel_exec) + { + /* + Since the transaction is prepared/committed without updating the GTID pos + (MDEV-32020...), we need here to clear any pending deadlock kill. + Otherwise if the kill happened after the prepare/commit completed, it + might end up killing the subsequent GTID position update, causing the + slave to fail with error. + */ + wait_for_pending_deadlock_kill(thd, thd->rgi_slave); + thd->reset_killed(); + } + return res; } #endif // HAVE_REPLICATION diff --git a/sql/rpl_parallel.cc b/sql/rpl_parallel.cc index b63642a2f0d..bbfc02110a7 100644 --- a/sql/rpl_parallel.cc +++ b/sql/rpl_parallel.cc @@ -131,7 +131,7 @@ handle_queued_pos_update(THD *thd, rpl_parallel_thread::queued_event *qev) asynchronously, we need to be sure they will be completed before starting a new transaction. Otherwise the new transaction might suffer a spurious kill. */ -static void +void wait_for_pending_deadlock_kill(THD *thd, rpl_group_info *rgi) { PSI_stage_info old_stage; diff --git a/sql/rpl_parallel.h b/sql/rpl_parallel.h index d8ee98f91c6..ef872dec66f 100644 --- a/sql/rpl_parallel.h +++ b/sql/rpl_parallel.h @@ -518,6 +518,7 @@ struct rpl_parallel { extern struct rpl_parallel_thread_pool global_rpl_thread_pool; +extern void wait_for_pending_deadlock_kill(THD *thd, rpl_group_info *rgi); extern int rpl_parallel_resize_pool_if_no_slaves(void); extern int rpl_parallel_activate_pool(rpl_parallel_thread_pool *pool); extern int rpl_parallel_inactivate_pool(rpl_parallel_thread_pool *pool); -- 2.30.2
participants (1)
-
Kristian Nielsen