One case is conflicting transactions T1 and T2 with different domain id, in
optimistic parallel replication in non-GTID mode. Then T2 will
wait_for_prior_commit on T1; and if T1 got a row lock wait on T2 it would
hang, as different domains caused the deadlock kill to be skipped in
thd_rpl_deadlock_check().
More generally, if we have transactions T1 and T2 in one domain/master
connection, and independently transactions U1 and U2 in another, then we can
still deadlock like this:
T1 row low wait on U2
U2 wait_for_prior_commit on U1
U1 row lock wait on T2
T2 wait_for_prior_commit on T1
This commit enforces the deadlock kill in these cases. If the waited-for
transaction is speculatively applied, then it will be deadlock killed in
case of a conflict, even if the two transactions are in different domains
or master connections.
Signed-off-by: Kristian Nielsen
---
mysql-test/suite/rpl/r/rpl_mdev33798.result | 143 +++++++++++++++
mysql-test/suite/rpl/t/rpl_mdev33798.cnf | 17 ++
mysql-test/suite/rpl/t/rpl_mdev33798.test | 182 ++++++++++++++++++++
sql/sql_class.cc | 42 ++++-
4 files changed, 376 insertions(+), 8 deletions(-)
create mode 100644 mysql-test/suite/rpl/r/rpl_mdev33798.result
create mode 100644 mysql-test/suite/rpl/t/rpl_mdev33798.cnf
create mode 100644 mysql-test/suite/rpl/t/rpl_mdev33798.test
diff --git a/mysql-test/suite/rpl/r/rpl_mdev33798.result b/mysql-test/suite/rpl/r/rpl_mdev33798.result
new file mode 100644
index 00000000000..8796e948546
--- /dev/null
+++ b/mysql-test/suite/rpl/r/rpl_mdev33798.result
@@ -0,0 +1,143 @@
+include/rpl_init.inc [topology=1->2,1->3]
+connect server_2b,127.0.0.1,root,,,$SERVER_MYPORT_2;
+connection server_2;
+SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
+SET @old_parallel_mode= @@GLOBAL.slave_parallel_mode;
+SET @old_timeout= @@GLOBAL.lock_wait_timeout;
+SET @old_innodb_timeout= @@GLOBAL.innodb_lock_wait_timeout;
+include/stop_slave.inc
+SET GLOBAL slave_parallel_threads=5;
+set global slave_parallel_mode= aggressive;
+SET GLOBAL lock_wait_timeout= 86400;
+SET GLOBAL innodb_lock_wait_timeout= 86400;
+SET STATEMENT sql_log_bin=0 FOR ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
+include/start_slave.inc
+connection server_1;
+CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (1, 0), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0), (7, 0), (8, 0);
+connection server_2;
+include/stop_slave.inc
+connection server_2b;
+BEGIN;
+SELECT * FROM t1 WHERE a=1 FOR UPDATE;
+a b
+1 0
+SELECT * FROM t1 WHERE a=5 FOR UPDATE;
+a b
+5 0
+connection server_1;
+SET SESSION gtid_domain_id= 1;
+BEGIN;
+UPDATE t1 SET b=1 WHERE a=1;
+UPDATE t1 SET b=1 WHERE a=7;
+COMMIT;
+UPDATE t1 SET b=2 WHERE a=3;
+SET SESSION gtid_domain_id=2;
+BEGIN;
+UPDATE t1 SET b=3 WHERE a=5;
+UPDATE t1 SET b=3 WHERE a=3;
+COMMIT;
+UPDATE t1 SET b=4 WHERE a=7;
+SET SESSION gtid_domain_id= 0;
+include/save_master_gtid.inc
+connection server_2;
+include/start_slave.inc
+connection server_2b;
+ROLLBACK;
+connection server_2;
+include/sync_with_master_gtid.inc
+SELECT a, (
+(a=1 AND b=1) OR
+(a=3 AND (b=2 OR b=3)) OR
+(a=5 AND b=3) OR
+(a=7 AND (b=1 OR b=4)) OR
+((a MOD 2)=0 AND b=0)) AS `ok`
+ FROM t1
+ORDER BY a;
+a ok
+1 1
+2 1
+3 1
+4 1
+5 1
+6 1
+7 1
+8 1
+connection server_3;
+include/sync_with_master_gtid.inc
+include/stop_slave.inc
+connection server_2;
+include/stop_slave.inc
+CHANGE MASTER 'm2' to master_port=MYPORT_3 , master_host='127.0.0.1', master_user='root', master_use_gtid=slave_pos;
+connection server_1;
+SET SESSION gtid_domain_id= 1;
+BEGIN;
+UPDATE t1 SET b=11 WHERE a=1;
+UPDATE t1 SET b=11 WHERE a=7;
+COMMIT;
+UPDATE t1 SET b=12 WHERE a=3;
+SET SESSION gtid_domain_id= 1;
+connection server_3;
+SET SESSION gtid_domain_id=3;
+BEGIN;
+UPDATE t1 SET b=13 WHERE a=5;
+UPDATE t1 SET b=13 WHERE a=3;
+COMMIT;
+UPDATE t1 SET b=14 WHERE a=7;
+include/save_master_gtid.inc
+connection server_2b;
+BEGIN;
+SELECT * FROM t1 WHERE a=1 FOR UPDATE;
+a b
+1 1
+SELECT * FROM t1 WHERE a=5 FOR UPDATE;
+a b
+5 3
+START ALL SLAVES;
+Warnings:
+Note 1937 SLAVE 'm2' started
+Note 1937 SLAVE '' started
+connection server_2b;
+ROLLBACK;
+connection server_1;
+include/save_master_gtid.inc
+connection server_2;
+include/sync_with_master_gtid.inc
+connection server_3;
+include/save_master_gtid.inc
+connection server_2;
+include/sync_with_master_gtid.inc
+SELECT a, (
+(a=1 AND b=11) OR
+(a=3 AND (b=12 OR b=13)) OR
+(a=5 AND b=13) OR
+(a=7 AND (b=11 OR b=14)) OR
+((a MOD 2)=0 AND b=0)) AS `ok`
+ FROM t1
+ORDER BY a;
+a ok
+1 1
+2 1
+3 1
+4 1
+5 1
+6 1
+7 1
+8 1
+SET default_master_connection = 'm2';
+include/stop_slave.inc
+RESET SLAVE 'm2' ALL;
+SET default_master_connection = '';
+connection server_3;
+include/start_slave.inc
+disconnect server_2b;
+connection server_1;
+DROP TABLE t1;
+connection server_2;
+include/stop_slave.inc
+SET GLOBAL slave_parallel_threads=@old_parallel_threads;
+set global slave_parallel_mode= @old_parallel_mode;
+SET GLOBAL lock_wait_timeout= @old_timeout;
+SET GLOBAL innodb_lock_wait_timeout= @old_innodb_timeout;
+include/start_slave.inc
+include/rpl_end.inc
diff --git a/mysql-test/suite/rpl/t/rpl_mdev33798.cnf b/mysql-test/suite/rpl/t/rpl_mdev33798.cnf
new file mode 100644
index 00000000000..8e5125ea6ca
--- /dev/null
+++ b/mysql-test/suite/rpl/t/rpl_mdev33798.cnf
@@ -0,0 +1,17 @@
+!include suite/rpl/my.cnf
+
+[mysqld.1]
+log-slave-updates
+loose-innodb
+
+[mysqld.2]
+log-slave-updates
+loose-innodb
+
+[mysqld.3]
+log-slave-updates
+loose-innodb
+
+[ENV]
+SERVER_MYPORT_3= @mysqld.3.port
+SERVER_MYSOCK_3= @mysqld.3.socket
diff --git a/mysql-test/suite/rpl/t/rpl_mdev33798.test b/mysql-test/suite/rpl/t/rpl_mdev33798.test
new file mode 100644
index 00000000000..1448ed91133
--- /dev/null
+++ b/mysql-test/suite/rpl/t/rpl_mdev33798.test
@@ -0,0 +1,182 @@
+--source include/have_innodb.inc
+--source include/have_log_bin.inc
+--let $rpl_topology=1->2,1->3
+--source include/rpl_init.inc
+--connect (server_2b,127.0.0.1,root,,,$SERVER_MYPORT_2)
+
+--connection server_2
+SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
+SET @old_parallel_mode= @@GLOBAL.slave_parallel_mode;
+SET @old_timeout= @@GLOBAL.lock_wait_timeout;
+SET @old_innodb_timeout= @@GLOBAL.innodb_lock_wait_timeout;
+--source include/stop_slave.inc
+SET GLOBAL slave_parallel_threads=5;
+set global slave_parallel_mode= aggressive;
+# High timeout so we get replication sync error and test failure if the
+# conflict handling is insufficient and lock wait timeout occurs.
+SET GLOBAL lock_wait_timeout= 86400;
+SET GLOBAL innodb_lock_wait_timeout= 86400;
+SET STATEMENT sql_log_bin=0 FOR ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
+--source include/start_slave.inc
+
+--connection server_1
+CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (1, 0), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0), (7, 0), (8, 0);
+--save_master_pos
+
+--connection server_2
+--sync_with_master
+--source include/stop_slave.inc
+
+# Test the following scenario:
+#
+# Transactions T1, T2 in domain 1, U1, U2 in domain 2.
+# Wait cycle T1->U2->U1->T2->T1 as follows:
+# T1 row lock wait on U2
+# U2 wait_for_prior_commit on U1
+# U1 row lock wait on T2
+# T2 wait_for_prior_commit on T1
+#
+# Test that the wait cycle is broken correctly with deadlock kill.
+
+--connection server_2b
+# Temporarily block T1 and U1.
+BEGIN;
+SELECT * FROM t1 WHERE a=1 FOR UPDATE;
+SELECT * FROM t1 WHERE a=5 FOR UPDATE;
+
+--connection server_1
+
+SET SESSION gtid_domain_id= 1;
+# T1 in domain 1
+BEGIN;
+UPDATE t1 SET b=1 WHERE a=1;
+UPDATE t1 SET b=1 WHERE a=7;
+COMMIT;
+# T2 in domain 1
+UPDATE t1 SET b=2 WHERE a=3;
+
+SET SESSION gtid_domain_id=2;
+# U1 in domain 2
+BEGIN;
+UPDATE t1 SET b=3 WHERE a=5;
+UPDATE t1 SET b=3 WHERE a=3;
+COMMIT;
+# U2 in domain 2
+UPDATE t1 SET b=4 WHERE a=7;
+SET SESSION gtid_domain_id= 0;
+--source include/save_master_gtid.inc
+
+--connection server_2
+--source include/start_slave.inc
+# Wait until T2, U2 are holding the row locks.
+--let $wait_condition= SELECT COUNT(*)=2 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE state LIKE '%Waiting for prior transaction to commit%'
+--source include/wait_condition.inc
+
+# Then let T1, U1 continue to conflict on the row locks, and check that
+# replication correctly handles the conflict.
+--connection server_2b
+ROLLBACK;
+
+--connection server_2
+--source include/sync_with_master_gtid.inc
+
+# Allow either domain to "win" on the conflicting updates.
+SELECT a, (
+ (a=1 AND b=1) OR
+ (a=3 AND (b=2 OR b=3)) OR
+ (a=5 AND b=3) OR
+ (a=7 AND (b=1 OR b=4)) OR
+ ((a MOD 2)=0 AND b=0)) AS `ok`
+ FROM t1
+ ORDER BY a;
+
+# Now try the same thing with multi-source replication.
+
+# Make server_3 a second master
+--connection server_3
+--source include/sync_with_master_gtid.inc
+--source include/stop_slave.inc
+
+--connection server_2
+--source include/stop_slave.inc
+--replace_result $SERVER_MYPORT_3 MYPORT_3
+eval CHANGE MASTER 'm2' to master_port=$SERVER_MYPORT_3 , master_host='127.0.0.1', master_user='root', master_use_gtid=slave_pos;
+
+--connection server_1
+
+SET SESSION gtid_domain_id= 1;
+# T1 in domain 1
+BEGIN;
+UPDATE t1 SET b=11 WHERE a=1;
+UPDATE t1 SET b=11 WHERE a=7;
+COMMIT;
+# T2 in domain 1
+UPDATE t1 SET b=12 WHERE a=3;
+SET SESSION gtid_domain_id= 1;
+
+--connection server_3
+SET SESSION gtid_domain_id=3;
+# U1 in domain 3
+BEGIN;
+UPDATE t1 SET b=13 WHERE a=5;
+UPDATE t1 SET b=13 WHERE a=3;
+COMMIT;
+# U2 in domain 3
+UPDATE t1 SET b=14 WHERE a=7;
+--source include/save_master_gtid.inc
+
+--connection server_2b
+# Temporarily block T1 and U1.
+BEGIN;
+SELECT * FROM t1 WHERE a=1 FOR UPDATE;
+SELECT * FROM t1 WHERE a=5 FOR UPDATE;
+
+START ALL SLAVES;
+# Wait until T2, U2 are holding the row locks.
+--let $wait_condition= SELECT COUNT(*)=2 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE state LIKE '%Waiting for prior transaction to commit%'
+--source include/wait_condition.inc
+
+--connection server_2b
+ROLLBACK;
+
+--connection server_1
+--source include/save_master_gtid.inc
+--connection server_2
+--source include/sync_with_master_gtid.inc
+--connection server_3
+--source include/save_master_gtid.inc
+--connection server_2
+--source include/sync_with_master_gtid.inc
+
+SELECT a, (
+ (a=1 AND b=11) OR
+ (a=3 AND (b=12 OR b=13)) OR
+ (a=5 AND b=13) OR
+ (a=7 AND (b=11 OR b=14)) OR
+ ((a MOD 2)=0 AND b=0)) AS `ok`
+ FROM t1
+ ORDER BY a;
+
+SET default_master_connection = 'm2';
+--source include/stop_slave.inc
+RESET SLAVE 'm2' ALL;
+SET default_master_connection = '';
+
+--connection server_3
+--source include/start_slave.inc
+
+# Cleanup
+
+--disconnect server_2b
+--connection server_1
+DROP TABLE t1;
+--connection server_2
+--source include/stop_slave.inc
+SET GLOBAL slave_parallel_threads=@old_parallel_threads;
+set global slave_parallel_mode= @old_parallel_mode;
+SET GLOBAL lock_wait_timeout= @old_timeout;
+SET GLOBAL innodb_lock_wait_timeout= @old_innodb_timeout;
+--source include/start_slave.inc
+
+--source include/rpl_end.inc
diff --git a/sql/sql_class.cc b/sql/sql_class.cc
index 0d39f359a09..6026e1e13bb 100644
--- a/sql/sql_class.cc
+++ b/sql/sql_class.cc
@@ -5384,14 +5384,40 @@ thd_rpl_deadlock_check(MYSQL_THD thd, MYSQL_THD other_thd)
return 0;
if (!rgi->is_parallel_exec)
return 0;
- if (rgi->rli != other_rgi->rli)
- return 0;
- if (!rgi->gtid_sub_id || !other_rgi->gtid_sub_id)
- return 0;
- if (rgi->current_gtid.domain_id != other_rgi->current_gtid.domain_id)
- return 0;
- if (rgi->gtid_sub_id > other_rgi->gtid_sub_id)
- return 0;
+ if (rgi->rli == other_rgi->rli)
+ {
+ /*
+ Within the same master connection, we can compare transaction order on
+ the GTID sub_id, and rollback the later transaction to allow the earlier
+ transaction to commit first.
+ */
+ if (!rgi->gtid_sub_id || !other_rgi->gtid_sub_id)
+ return 0;
+ if (rgi->current_gtid.domain_id != other_rgi->current_gtid.domain_id &&
+ other_rgi->speculation != rpl_group_info::SPECULATE_OPTIMISTIC)
+ return 0;
+ if (rgi->gtid_sub_id > other_rgi->gtid_sub_id)
+ return 0;
+ }
+ else
+ {
+ /*
+ Lock conflicts between different master connection should usually not
+ occur, but could still happen if user is running some special setup that
+ tolerates conflicting updates (or in case of user error). We do not have a
+ pre-defined ordering of transactions in this case, but we still need to
+ handle conflicts in _some_ way to avoid undetected deadlocks and hangs.
+
+ We do this by rolling back and retrying any transaction that is being
+ _optimistically_ applied. This can be overly conservative in some cases,
+ but should be fine as conflicts between different master connections are
+ not expected to be common. And it ensures that we won't end up in a
+ deadlock and hang due to a transaction doing wait_for_prior_commit while
+ holding locks that block something in another master connection.
+ */
+ if (other_rgi->speculation != rpl_group_info::SPECULATE_OPTIMISTIC)
+ return 0;
+ }
if (rgi->finish_event_group_called || other_rgi->finish_event_group_called)
{
/*
--
2.30.2