Hi Monty,
Here's the patch for GTID indexes; thanks for offering to review it.
We should aim to get this done well in time to be included in 11.3.
The main goal of this patch is to eliminate a bottleneck when a slave
connects to the master: currently the master must scan the binlog file from
the beginning up to the slave's start position. This can be costly, especially
if the binlog file is not cached in memory (I/O cost), or if it is encrypted
or many slaves connect simultaneously (CPU cost).
The size of the index files is generally less than 1% of the binlog data, so
it is not expected to be an issue.
I wanted to minimise the impact of maintaining the indexes on the performance
and scalability of binlog commit. Slave connect is a relatively infrequent
operation, so the index lookup is not considered performance-critical;
avoiding a scan of half a gigabyte of binlog file already gives a huge saving.
This is why index writing is done asynchronously from the binlog background
thread, and why a single global mutex is deemed sufficient to protect index
access.
Here are the user-visible options and status variables. The feature is on by
default and is expected to need no tuning or configuration for most users.
binlog_gtid_index
On by default. Can be used to disable the indexes for testing purposes.
binlog_gtid_index_page_size (default 4096)
Page size to use for the binlog GTID index. This is the size of the nodes
in the B+-tree used internally in the index. A very small page size (64 is
the minimum) will be less efficient, but can be used to stress the B+-tree
code during testing.
binlog_gtid_index_sparse (default 10)
Control sparseness of the binlog GTID index. If set to N, only every
Nth GTID will be recorded in the index, to reduce the size of the
index. Having a sparseness > 1 greatly reduces the number of records in
the index, at the cost only of having to scan a few more events in the
binlog file before finding the target position.
binlog_gtid_index_span_min (default 4096)
Control sparseness of the binlog GTID index. If set to N, at most one
index record will be added for every N bytes of binlog file written.
binlog_gtid_index_span_max (default 65536)
Control sparseness of the binlog GTID index. If set to N, an index
record will be added after N bytes have been written to the binlog
file, even if this would normally be skipped due to the setting of
--binlog-gtid-index-sparse.
With binlog_gtid_index_span_min, we can reduce the number of records in the
index when the binlog has lots of very small transactions, since the
sequential scan is fast in this case. With binlog_gtid_index_span_max, we
ensure the index does not become too sparse when binlog transactions are very
large, keeping the more expensive sequential scan short in this case.
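As a quick illustration (mirroring what the test cases do; the values shown
are simply the defaults), the index is tuned through the normal dynamic
system variables, and a rotation makes the new settings take effect for the
next binlog file:

  SET GLOBAL binlog_gtid_index= 1;
  SET GLOBAL binlog_gtid_index_page_size= 4096;
  SET GLOBAL binlog_gtid_index_sparse= 10;
  SET GLOBAL binlog_gtid_index_span_min= 4096;
  SET GLOBAL binlog_gtid_index_span_max= 65536;
  FLUSH BINARY LOGS;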
Two status variables are available to monitor the use of the GTID indexes:
Binlog_gtid_index_hit
Binlog_gtid_index_miss
The "hit" status increments for each successful lookup in a GTID index.
The "miss" increments when a lookup is not possible. This indicates that the
index file is missing (eg. after upgrade), or corrupt. Thus in normal
operation the "miss" counter is expected to be small/zero. A "Note"-level
message is logged in the error log when an index is corrupt and fallback to
sequential scan is needed.
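For example, one can check whether a lookup was served from the index by
watching the counters around a BINLOG_GTID_POS() call (the file name and
offset below are just placeholders):

  SHOW STATUS LIKE 'binlog_gtid_index_%';
  SELECT BINLOG_GTID_POS('master-bin.000001', 4);
  SHOW STATUS LIKE 'binlog_gtid_index_%';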
- Kristian.
commit 5bb97895a32836127acd2bff093340dd8e0939fb (HEAD -> tmp_test2, origin/bb-11.3-knielsen)
Author: Kristian Nielsen <knielsen@knielsen-hq.org>
Date: Fri Sep 8 13:12:49 2023 +0200
MDEV-4991: GTID binlog indexing
Improve the performance of slave connect using B+-Tree indexes on each binlog
file. The index allows fast lookup of a GTID position to the corresponding
offset in the binlog file, as well as lookup of a position to find the
corresponding GTID position.
Signed-off-by: Kristian Nielsen <knielsen@knielsen-hq.org>
---
libmysqld/CMakeLists.txt | 2 +-
.../suite/binlog/r/binlog_gtid_index.result | 138 ++
.../suite/binlog/t/binlog_gtid_index.test | 208 +++
...d_master_switch_to_unencrypted_gtid.result | 6 +-
...ted_master_switch_to_unencrypted_gtid.test | 10 +-
.../perfschema/r/dml_setup_instruments.result | 2 +-
mysql-test/suite/perfschema/r/relaylog.result | 2 +
.../suite/rpl/include/rpl_gtid_index.inc | 196 +++
.../rpl/r/rpl_gtid_glle_no_terminate.result | 1 -
mysql-test/suite/rpl/r/rpl_gtid_index.result | 346 ++++
.../rpl/t/rpl_gtid_glle_no_terminate.test | 1 +
mysql-test/suite/rpl/t/rpl_gtid_index.test | 98 ++
sql/CMakeLists.txt | 2 +-
sql/gtid_index.cc | 1440 +++++++++++++++++
sql/gtid_index.h | 430 +++++
sql/log.cc | 534 +++++-
sql/log.h | 20 +-
sql/mysqld.cc | 21 +-
sql/mysqld.h | 11 +-
sql/privilege.h | 15 +
sql/rpl_gtid.cc | 431 +++--
sql/rpl_gtid.h | 39 +-
sql/rpl_rli.cc | 2 +-
sql/sql_repl.cc | 318 +++-
sql/sys_vars.cc | 54 +
25 files changed, 4074 insertions(+), 253 deletions(-)
create mode 100644 mysql-test/suite/binlog/r/binlog_gtid_index.result
create mode 100644 mysql-test/suite/binlog/t/binlog_gtid_index.test
create mode 100644 mysql-test/suite/rpl/include/rpl_gtid_index.inc
create mode 100644 mysql-test/suite/rpl/r/rpl_gtid_index.result
create mode 100644 mysql-test/suite/rpl/t/rpl_gtid_index.test
create mode 100644 sql/gtid_index.cc
create mode 100644 sql/gtid_index.h
diff --git a/libmysqld/CMakeLists.txt b/libmysqld/CMakeLists.txt
index 2042f7fe321..9fca2c82f8e 100644
--- a/libmysqld/CMakeLists.txt
+++ b/libmysqld/CMakeLists.txt
@@ -125,7 +125,7 @@ SET(SQL_EMBEDDED_SOURCES emb_qcache.cc libmysqld.c lib_sql.cc
../sql/sql_expression_cache.cc
../sql/my_apc.cc ../sql/my_apc.h
../sql/my_json_writer.cc ../sql/my_json_writer.h
- ../sql/rpl_gtid.cc
+ ../sql/rpl_gtid.cc ../sql/gtid_index.cc
../sql/sql_explain.cc ../sql/sql_explain.h
../sql/sql_analyze_stmt.cc ../sql/sql_analyze_stmt.h
../sql/compat56.cc
diff --git a/mysql-test/suite/binlog/r/binlog_gtid_index.result b/mysql-test/suite/binlog/r/binlog_gtid_index.result
new file mode 100644
index 00000000000..4f621142ee7
--- /dev/null
+++ b/mysql-test/suite/binlog/r/binlog_gtid_index.result
@@ -0,0 +1,138 @@
+SET GLOBAL binlog_gtid_index= 0;
+SET GLOBAL binlog_gtid_index= 1;
+SET @gtid1= @@gtid_binlog_pos;
+CREATE TABLE t1 (a INT PRIMARY KEY);
+SET @gtid2= @@gtid_binlog_pos;
+INSERT INTO t1 VALUES (1);
+SET @gtid3= @@gtid_binlog_pos;
+INSERT INTO t1 VALUES (2);
+INSERT INTO t1 VALUES (3);
+INSERT INTO t1 VALUES (4);
+SET @gtid4= @@gtid_binlog_pos;
+INSERT INTO t1 VALUES (5);
+SET @gtid5= @@gtid_binlog_pos;
+SET @gtid6= @@gtid_binlog_pos;
+INSERT INTO t1 VALUES (106);
+INSERT INTO t1 VALUES (107);
+Ok
+1
+Ok
+1
+Ok
+1
+Ok
+1
+Ok
+1
+Ok
+1
+FLUSH BINARY LOGS;
+Ok
+1
+Ok
+1
+Ok
+1
+Ok
+1
+Ok
+1
+Ok
+1
+*** Test that purge deletes the gtid index files. ***
+FLUSH BINARY LOGS;
+INSERT INTO t1 VALUES (200);
+FLUSH BINARY LOGS;
+INSERT INTO t1 VALUES (201);
+FLUSH BINARY LOGS;
+INSERT INTO t1 VALUES (202);
+PURGE BINARY LOGS TO 'FILE';
+*** Test missed index lookup due to missing or corrupt index file.
+FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
+INSERT INTO t1 VALUES (301);
+INSERT INTO t1 VALUES (302);
+INSERT INTO t1 VALUES (303);
+SET @gtid_pos= @@GLOBAL.gtid_binlog_pos;
+INSERT INTO t1 VALUES (304);
+INSERT INTO t1 VALUES (305);
+FLUSH NO_WRITE_TO_BINLOG STATUS;
++++ Initial status:
+SHOW STATUS LIKE 'binlog_gtid_index_%';
+Variable_name Value
+Binlog_gtid_index_hit 0
+Binlog_gtid_index_miss 0
++++ GTID Lookup in good index.
+Gtid_Lookup_Ok
+1
+SHOW STATUS LIKE 'binlog_gtid_index_%';
+Variable_name Value
+Binlog_gtid_index_hit 1
+Binlog_gtid_index_miss 0
++++ GTID Lookup, index file is missing.
+Gtid_Lookup_Ok
+1
+SHOW STATUS LIKE 'binlog_gtid_index_%';
+Variable_name Value
+Binlog_gtid_index_hit 1
+Binlog_gtid_index_miss 1
+FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
+INSERT INTO t1 VALUES (306);
+SET @gtid_pos= @@GLOBAL.gtid_binlog_pos;
+INSERT INTO t1 VALUES (307);
+INSERT INTO t1 VALUES (308);
+FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
++++ GTID Lookup, first page of index is corrupt.
+Gtid_Lookup_Ok
+1
+SHOW STATUS LIKE 'binlog_gtid_index_%';
+Variable_name Value
+Binlog_gtid_index_hit 1
+Binlog_gtid_index_miss 2
+SET @old_page_size= @@GLOBAL.binlog_gtid_index_page_size;
+SET @old_spars= @@GLOBAL.binlog_gtid_index_sparse;
+SET @old_span_min= @@GLOBAL.binlog_gtid_index_span_min;
+SET GLOBAL binlog_gtid_index_page_size= 64;
+SET GLOBAL binlog_gtid_index_sparse= 1;
+SET GLOBAL binlog_gtid_index_span_min= 1;
+FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
+INSERT INTO t1 VALUES (310);
+INSERT INTO t1 VALUES (311);
+INSERT INTO t1 VALUES (312);
+SET @gtid_pos= @@GLOBAL.gtid_binlog_pos;
+INSERT INTO t1 VALUES (313);
+INSERT INTO t1 VALUES (314);
+INSERT INTO t1 VALUES (315);
+INSERT INTO t1 VALUES (316);
+FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
+SET GLOBAL binlog_gtid_index_page_size= @old_page_size;
+SET GLOBAL binlog_gtid_index_sparse= @old_spars;
+SET GLOBAL binlog_gtid_index_span_min= @old_span_min;
++++ GTID Lookup, root page of index is corrupt.
+Gtid_Lookup_Ok
+1
+SHOW STATUS LIKE 'binlog_gtid_index_%';
+Variable_name Value
+Binlog_gtid_index_hit 1
+Binlog_gtid_index_miss 3
+*** Test BINLOG_GTID_POS() with too-large offset.
+FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
+INSERT INTO t1 VALUES (401);
+INSERT INTO t1 VALUES (402);
++++ Test the hot index.
+SELECT BINLOG_GTID_POS('FILE', 100000000);
+BINLOG_GTID_POS('FILE', 100000000)
+NULL
+SHOW STATUS LIKE 'binlog_gtid_index_%';
+Variable_name Value
+Binlog_gtid_index_hit 2
+Binlog_gtid_index_miss 3
+FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
++++ Test the cold index.
+SELECT BINLOG_GTID_POS('FILE', 100000000);
+BINLOG_GTID_POS('FILE', 100000000)
+NULL
+SHOW STATUS LIKE 'binlog_gtid_index_%';
+Variable_name Value
+Binlog_gtid_index_hit 3
+Binlog_gtid_index_miss 3
+DROP TABLE t1;
diff --git a/mysql-test/suite/binlog/t/binlog_gtid_index.test b/mysql-test/suite/binlog/t/binlog_gtid_index.test
new file mode 100644
index 00000000000..0346a89ea4b
--- /dev/null
+++ b/mysql-test/suite/binlog/t/binlog_gtid_index.test
@@ -0,0 +1,208 @@
+--source include/have_binlog_format_mixed.inc
+
+SET GLOBAL binlog_gtid_index= 0;
+SET GLOBAL binlog_gtid_index= 1;
+
+--let $file= query_get_value(SHOW MASTER STATUS, File, 1)
+--let $pos1= query_get_value(SHOW MASTER STATUS, Position, 1)
+SET @gtid1= @@gtid_binlog_pos;
+CREATE TABLE t1 (a INT PRIMARY KEY);
+--let $pos2= query_get_value(SHOW MASTER STATUS, Position, 1)
+SET @gtid2= @@gtid_binlog_pos;
+INSERT INTO t1 VALUES (1);
+--let $pos3= query_get_value(SHOW MASTER STATUS, Position, 1)
+SET @gtid3= @@gtid_binlog_pos;
+INSERT INTO t1 VALUES (2);
+INSERT INTO t1 VALUES (3);
+INSERT INTO t1 VALUES (4);
+--let $pos4= query_get_value(SHOW MASTER STATUS, Position, 1)
+SET @gtid4= @@gtid_binlog_pos;
+INSERT INTO t1 VALUES (5);
+--let $pos5= query_get_value(SHOW MASTER STATUS, Position, 1)
+SET @gtid5= @@gtid_binlog_pos;
+
+--disable_query_log
+--let $i=0
+while ($i < 100) {
+ eval INSERT INTO t1 VALUES (6 + $i);
+ inc $i;
+}
+--enable_query_log
+--let $pos6= query_get_value(SHOW MASTER STATUS, Position, 1)
+SET @gtid6= @@gtid_binlog_pos;
+
+INSERT INTO t1 VALUES (106);
+INSERT INTO t1 VALUES (107);
+
+# Test first the hot and then the cold index.
+--let $i= 0
+while ($i < 2) {
+ --disable_query_log
+ eval SELECT BINLOG_GTID_POS('$file', $pos1) = @gtid1 AS Ok;
+ eval SELECT BINLOG_GTID_POS('$file', $pos2) = @gtid2 AS Ok;
+ eval SELECT BINLOG_GTID_POS('$file', $pos3) = @gtid3 AS Ok;
+ eval SELECT BINLOG_GTID_POS('$file', $pos4) = @gtid4 AS Ok;
+ eval SELECT BINLOG_GTID_POS('$file', $pos5) = @gtid5 AS Ok;
+ eval SELECT BINLOG_GTID_POS('$file', $pos6) = @gtid6 AS Ok;
+ --enable_query_log
+
+ inc $i;
+ if ($i == 1) {
+ FLUSH BINARY LOGS;
+ }
+}
+
+--echo *** Test that purge deletes the gtid index files. ***
+FLUSH BINARY LOGS;
+INSERT INTO t1 VALUES (200);
+--let $file2= query_get_value(SHOW MASTER STATUS, File, 1)
+FLUSH BINARY LOGS;
+INSERT INTO t1 VALUES (201);
+--let $file3= query_get_value(SHOW MASTER STATUS, File, 1)
+FLUSH BINARY LOGS;
+INSERT INTO t1 VALUES (202);
+--let $file4= query_get_value(SHOW MASTER STATUS, File, 1)
+
+--replace_result $file3 FILE
+eval PURGE BINARY LOGS TO '$file3';
+
+--let $MYSQLD_DATADIR= `select @@datadir`
+--error 1
+--file_exists $MYSQLD_DATADIR/$file.idx
+--error 1
+--file_exists $MYSQLD_DATADIR/$file2.idx
+--file_exists $MYSQLD_DATADIR/$file3.idx
+--file_exists $MYSQLD_DATADIR/$file4.idx
+
+--echo *** Test missed index lookup due to missing or corrupt index file.
+FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
+--let $file= query_get_value(SHOW MASTER STATUS, File, 1)
+INSERT INTO t1 VALUES (301);
+INSERT INTO t1 VALUES (302);
+INSERT INTO t1 VALUES (303);
+--let $pos= query_get_value(SHOW MASTER STATUS, Position, 1)
+SET @gtid_pos= @@GLOBAL.gtid_binlog_pos;
+INSERT INTO t1 VALUES (304);
+INSERT INTO t1 VALUES (305);
+
+FLUSH NO_WRITE_TO_BINLOG STATUS;
+--echo +++ Initial status:
+SHOW STATUS LIKE 'binlog_gtid_index_%';
+--echo +++ GTID Lookup in good index.
+--disable_query_log
+eval SELECT BINLOG_GTID_POS('$file', $pos) = @gtid_pos AS Gtid_Lookup_Ok;
+--enable_query_log
+SHOW STATUS LIKE 'binlog_gtid_index_%';
+--remove_file $MYSQLD_DATADIR/$file.idx
+--echo +++ GTID Lookup, index file is missing.
+--disable_query_log
+eval SELECT BINLOG_GTID_POS('$file', $pos) = @gtid_pos AS Gtid_Lookup_Ok;
+--enable_query_log
+SHOW STATUS LIKE 'binlog_gtid_index_%';
+
+FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
+--let $file= query_get_value(SHOW MASTER STATUS, File, 1)
+INSERT INTO t1 VALUES (306);
+--let $pos= query_get_value(SHOW MASTER STATUS, Position, 1)
+SET @gtid_pos= @@GLOBAL.gtid_binlog_pos;
+INSERT INTO t1 VALUES (307);
+INSERT INTO t1 VALUES (308);
+# Rotate again so we hit an on-disk index file, not the "hot" index.
+FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
+
+# Corrupt the flag byte of the first page with an unused bit.
+--let FILE_TO_CORRUPT= $MYSQLD_DATADIR/$file.idx
+--perl
+use strict;
+use warnings;
+use Fcntl qw(:DEFAULT :seek);
+sysopen F, $ENV{FILE_TO_CORRUPT}, O_RDWR
+ or die "Cannot open file $ENV{FILE_TO_CORRUPT}: $!\n";
+# Corrupt the flag byte with an unused flag.
+sysseek(F, 16, SEEK_SET)
+ or die "Cannot seek file: $!\n";
+my $buf;
+sysread(F, $buf, 1)
+ or die "Cannot read file: $!\n";
+$buf= chr(ord($buf) | 0x80);
+sysseek(F, 16, SEEK_SET)
+ or die "Cannot seek file: $!\n";
+syswrite(F, $buf, 1) == 1
+ or die "Cannot write file: $!\n";
+close F;
+EOF
+
+--echo +++ GTID Lookup, first page of index is corrupt.
+--disable_query_log
+eval SELECT BINLOG_GTID_POS('$file', $pos) = @gtid_pos AS Gtid_Lookup_Ok;
+--enable_query_log
+SHOW STATUS LIKE 'binlog_gtid_index_%';
+
+# Corrupt the last byte of the root page.
+# Set a small page-size so we test corruption in something not the header page.
+SET @old_page_size= @@GLOBAL.binlog_gtid_index_page_size;
+SET @old_spars= @@GLOBAL.binlog_gtid_index_sparse;
+SET @old_span_min= @@GLOBAL.binlog_gtid_index_span_min;
+SET GLOBAL binlog_gtid_index_page_size= 64;
+SET GLOBAL binlog_gtid_index_sparse= 1;
+SET GLOBAL binlog_gtid_index_span_min= 1;
+FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
+--let $file= query_get_value(SHOW MASTER STATUS, File, 1)
+INSERT INTO t1 VALUES (310);
+INSERT INTO t1 VALUES (311);
+INSERT INTO t1 VALUES (312);
+--let $pos= query_get_value(SHOW MASTER STATUS, Position, 1)
+SET @gtid_pos= @@GLOBAL.gtid_binlog_pos;
+INSERT INTO t1 VALUES (313);
+INSERT INTO t1 VALUES (314);
+INSERT INTO t1 VALUES (315);
+INSERT INTO t1 VALUES (316);
+FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
+SET GLOBAL binlog_gtid_index_page_size= @old_page_size;
+SET GLOBAL binlog_gtid_index_sparse= @old_spars;
+SET GLOBAL binlog_gtid_index_span_min= @old_span_min;
+
+--let FILE_TO_CORRUPT= $MYSQLD_DATADIR/$file.idx
+--perl
+use strict;
+use warnings;
+use Fcntl qw(:DEFAULT :seek);
+sysopen F, $ENV{FILE_TO_CORRUPT}, O_RDWR
+ or die "Cannot open file $ENV{FILE_TO_CORRUPT}: $!\n";
+# Corrupt a byte near the end of the root page.
+sysseek(F, -2, SEEK_END)
+ or die "Cannot seek file: $!\n";
+my $buf;
+sysread(F, $buf, 1)
+ or die "Cannot read file: $!\n";
+$buf= chr(ord($buf) ^ 0x4);
+sysseek(F, -2, SEEK_END)
+ or die "Cannot seek file: $!\n";
+syswrite(F, $buf, 1) == 1
+ or die "Cannot write file: $!\n";
+close F;
+EOF
+
+--echo +++ GTID Lookup, root page of index is corrupt.
+--disable_query_log
+eval SELECT BINLOG_GTID_POS('$file', $pos) = @gtid_pos AS Gtid_Lookup_Ok;
+--enable_query_log
+SHOW STATUS LIKE 'binlog_gtid_index_%';
+
+--echo *** Test BINLOG_GTID_POS() with too-large offset.
+# New binlog to skip the now corrupted one.
+FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
+--let $file= query_get_value(SHOW MASTER STATUS, File, 1)
+INSERT INTO t1 VALUES (401);
+INSERT INTO t1 VALUES (402);
+--echo +++ Test the hot index.
+--replace_result $file FILE
+eval SELECT BINLOG_GTID_POS('$file', 100000000);
+SHOW STATUS LIKE 'binlog_gtid_index_%';
+FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
+--echo +++ Test the cold index.
+--replace_result $file FILE
+eval SELECT BINLOG_GTID_POS('$file', 100000000);
+SHOW STATUS LIKE 'binlog_gtid_index_%';
+
+DROP TABLE t1;
diff --git a/mysql-test/suite/binlog_encryption/encrypted_master_switch_to_unencrypted_gtid.result b/mysql-test/suite/binlog_encryption/encrypted_master_switch_to_unencrypted_gtid.result
index 16ea30557e7..8a26eaadbcf 100644
--- a/mysql-test/suite/binlog_encryption/encrypted_master_switch_to_unencrypted_gtid.result
+++ b/mysql-test/suite/binlog_encryption/encrypted_master_switch_to_unencrypted_gtid.result
@@ -6,6 +6,7 @@ connection server_2;
include/stop_slave.inc
CHANGE MASTER TO MASTER_USE_GTID=SLAVE_POS;
call mtr.add_suppression(" Got fatal error 1236 from master when reading data from binary log: 'Could not set up decryption for binlog.'");
+call mtr.add_suppression(" Got fatal error 1236 from master when reading data from binary log: 'Could not decrypt binlog: encryption key error");
#####################################################
# Part 1: unencrypted master
#####################################################
@@ -58,10 +59,11 @@ INSERT INTO table3_no_encryption SELECT NULL,NOW(),b FROM table3_no_encryption;
connection server_2;
start slave;
include/wait_for_slave_io_error.inc [errno=1236]
-# Ensuring slave was unable to replicate any transactions..
+# Ensuring slave was unable to replicate any encrypted transactions..
# ..success
SHOW TABLES;
Tables_in_test
+table1_no_encryption
include/stop_slave.inc
reset slave;
##########
@@ -80,5 +82,7 @@ COUNT(*)
4
DROP TABLE table1_no_encryption, table2_to_encrypt, table3_no_encryption;
connection server_2;
+RESET MASTER;
+SET GLOBAL gtid_slave_pos= '';
include/start_slave.inc
include/rpl_end.inc
diff --git a/mysql-test/suite/binlog_encryption/encrypted_master_switch_to_unencrypted_gtid.test b/mysql-test/suite/binlog_encryption/encrypted_master_switch_to_unencrypted_gtid.test
index f882e8f3440..30d0155ce36 100644
--- a/mysql-test/suite/binlog_encryption/encrypted_master_switch_to_unencrypted_gtid.test
+++ b/mysql-test/suite/binlog_encryption/encrypted_master_switch_to_unencrypted_gtid.test
@@ -36,6 +36,7 @@
CHANGE MASTER TO MASTER_USE_GTID=SLAVE_POS;
--enable_connect_log
call mtr.add_suppression(" Got fatal error 1236 from master when reading data from binary log: 'Could not set up decryption for binlog.'");
+call mtr.add_suppression(" Got fatal error 1236 from master when reading data from binary log: 'Could not decrypt binlog: encryption key error");
--echo #####################################################
--echo # Part 1: unencrypted master
@@ -55,6 +56,7 @@ FLUSH BINARY LOGS;
SET binlog_format=ROW;
INSERT INTO table1_no_encryption SELECT NULL,NOW(),b FROM table1_no_encryption;
INSERT INTO table1_no_encryption SELECT NULL,NOW(),b FROM table1_no_encryption;
+--let $last_unencrypted_gtid= `SELECT @@gtid_binlog_pos`
# Make sure that binary logs are not encrypted
@@ -120,11 +122,11 @@ start slave;
--let $slave_io_errno= 1236
--source include/wait_for_slave_io_error.inc
---echo # Ensuring slave was unable to replicate any transactions..
+--echo # Ensuring slave was unable to replicate any encrypted transactions..
--let $gsp= `SELECT @@global.gtid_slave_pos`
-if (`SELECT strcmp("$gsp","")`)
+if (`SELECT strcmp("$gsp","$last_unencrypted_gtid")`)
{
- die Slave without encryption configured should fail to read encrypted binlog;
+ die Slave without encryption configured should fail to read encrypted binlog (expected $last_unencrypted_gtid but got $gsp);
}
--echo # ..success
@@ -150,5 +152,7 @@ DROP TABLE table1_no_encryption, table2_to_encrypt, table3_no_encryption;
--connection server_2
--disable_connect_log
+RESET MASTER;
+SET GLOBAL gtid_slave_pos= '';
--source include/start_slave.inc
--source include/rpl_end.inc
diff --git a/mysql-test/suite/perfschema/r/dml_setup_instruments.result b/mysql-test/suite/perfschema/r/dml_setup_instruments.result
index cdc52da54dc..ff000a09312 100644
--- a/mysql-test/suite/perfschema/r/dml_setup_instruments.result
+++ b/mysql-test/suite/perfschema/r/dml_setup_instruments.result
@@ -8,12 +8,12 @@ wait/synch/mutex/sql/Ack_receiver::mutex YES YES
wait/synch/mutex/sql/Cversion_lock YES YES
wait/synch/mutex/sql/Delayed_insert::mutex YES YES
wait/synch/mutex/sql/Event_scheduler::LOCK_scheduler_state YES YES
+wait/synch/mutex/sql/Gtid_index_writer::gtid_index_mutex YES YES
wait/synch/mutex/sql/gtid_waiting::LOCK_gtid_waiting YES YES
wait/synch/mutex/sql/hash_filo::lock YES YES
wait/synch/mutex/sql/HA_DATA_PARTITION::LOCK_auto_inc YES YES
wait/synch/mutex/sql/LOCK_active_mi YES YES
wait/synch/mutex/sql/LOCK_after_binlog_sync YES YES
-wait/synch/mutex/sql/LOCK_audit_mask YES YES
select * from performance_schema.setup_instruments
where name like 'Wait/Synch/Rwlock/sql/%'
and name not in (
diff --git a/mysql-test/suite/perfschema/r/relaylog.result b/mysql-test/suite/perfschema/r/relaylog.result
index ce3e9c04a5e..7cc87530770 100644
--- a/mysql-test/suite/perfschema/r/relaylog.result
+++ b/mysql-test/suite/perfschema/r/relaylog.result
@@ -23,6 +23,7 @@ from performance_schema.file_summary_by_instance
where file_name like "%master-%" order by file_name;
FILE_NAME EVENT_NAME COUNT_READ COUNT_WRITE SUM_NUMBER_OF_BYTES_READ SUM_NUMBER_OF_BYTES_WRITE
master-bin.000001 wait/io/file/sql/binlog MANY MANY MANY MANY
+master-bin.000001.idx wait/io/file/sql/gtid_index NONE MANY NONE MANY
master-bin.index wait/io/file/sql/binlog_index MANY MANY MANY MANY
select * from performance_schema.file_summary_by_instance
where file_name like "%slave-%" order by file_name;
@@ -112,6 +113,7 @@ where file_name like "%slave-%"
order by file_name;
FILE_NAME EVENT_NAME COUNT_READ COUNT_WRITE SUM_NUMBER_OF_BYTES_READ SUM_NUMBER_OF_BYTES_WRITE
slave-bin.000001 wait/io/file/sql/binlog MANY MANY MANY MANY
+slave-bin.000001.idx wait/io/file/sql/gtid_index NONE MANY NONE MANY
slave-bin.index wait/io/file/sql/binlog_index MANY MANY MANY MANY
slave-relay-bin.000001 wait/io/file/sql/relaylog MANY MANY MANY MANY
slave-relay-bin.000002 wait/io/file/sql/relaylog MANY MANY MANY MANY
diff --git a/mysql-test/suite/rpl/include/rpl_gtid_index.inc b/mysql-test/suite/rpl/include/rpl_gtid_index.inc
new file mode 100644
index 00000000000..41f44c11dfb
--- /dev/null
+++ b/mysql-test/suite/rpl/include/rpl_gtid_index.inc
@@ -0,0 +1,196 @@
+# Include file for main test rpl.rpl_gtid_index.
+# Test GTID indexes with given parameters.
+#
+# Parameters:
+# $NUM_POS Number of GTIDs/binlog positions to create
+# $NUM_DOMAIN Number of different domains to use
+# $NUM_SERVER Number of different server_id to use
+# $NUM_SLAVE_CONNECTS How many GTID slave connect positions to test
+# $RND_SEED Random seed
+
+
+--echo *** Testing $NUM_POS GTIDs with $NUM_SLAVE_CONNECTS test connects
+
+--connection master
+DELETE FROM t1 WHERE a >= 1000;
+# Rotate binlogs to make new GTID index settings take effect.
+FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
+
+# Prepare some random values, but deterministic between test runs.
+CREATE TABLE rand_data(idx INT PRIMARY KEY, domain_id INT, server_id INT)
+ ENGINE=InnoDB;
+--disable_query_log
+INSERT INTO rand_data(idx, domain_id, server_id) VALUES (0, 0, 1);
+--let $incr= 1
+--let $done= 0
+while (!$done) {
+ eval INSERT INTO rand_data(idx)
+ SELECT idx+$incr FROM rand_data WHERE idx+$incr <= $NUM_POS;
+ --let $done= `SELECT MAX(idx) = $NUM_POS FROM rand_data`
+ --let $incr= `SELECT 2*$incr`
+}
+eval UPDATE rand_data
+ SET domain_id=floor($NUM_DOMAIN*POW(rand($RND_SEED),2)),
+ server_id=100 + $NUM_SERVER*domain_id + floor($NUM_SERVER*rand($RND_SEED))
+ WHERE idx > 0
+ ORDER BY idx;
+--enable_query_log
+# Let's check that the test data is deterministic.
+# If this changes due to some server changes, that's fine; the .result can just
+# be updated. But we want it to be identical between test runs on the same
+# code, to facilitate debugging test failures.
+SELECT COUNT(*), SUM(domain_id), SUM(server_id) FROM rand_data;
+
+# Create some data for the binlog (and GTID index), recording the correct
+# binlog positions and GTIDs.
+CREATE TABLE gtid_data(
+ idx INT PRIMARY KEY,
+ gtid VARCHAR(44),
+ gtid_pos VARCHAR(255),
+ file VARCHAR(100),
+ pos INT,
+ row_count INT,
+ KEY(file, pos)) ENGINE=InnoDB;
+--let $gtid= `SELECT @@last_gtid`
+
+--source include/save_master_gtid.inc
+
+--connection slave
+--source include/sync_with_master_gtid.inc
+--source include/stop_slave.inc
+
+--connection master
+SET @orig_domain_id= @@gtid_domain_id;
+SET @orig_server_id= @@server_id;
+--let $i= 0
+--let $rotate_point= `SELECT floor($NUM_POS/2)`
+--let $base_count= `SELECT COUNT(*) FROM t1`
+--disable_query_log
+while ($i < $NUM_POS) {
+ --let $file= query_get_value(SHOW MASTER STATUS, File, 1)
+ --let $pos= query_get_value(SHOW MASTER STATUS, Position, 1)
+ --let $gtid_pos= `SELECT @@gtid_binlog_pos`
+ --let $row_count= `SELECT $base_count + $i`
+ eval SET gtid_domain_id= (SELECT domain_id FROM rand_data WHERE idx=$i+1);
+ eval SET server_id= (SELECT server_id FROM rand_data WHERE idx=$i+1);
+ BEGIN;
+ eval INSERT INTO gtid_data(idx, gtid, gtid_pos, file, pos, row_count)
+ VALUES ($i, '$gtid', '$gtid_pos', '$file', $pos, $row_count);
+ eval INSERT INTO t1 VALUES ($i + 1000, 0);
+ COMMIT;
+--let $gtid= `SELECT @@last_gtid`
+ inc $i;
+ if ($i==$rotate_point) {
+ FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
+ }
+}
+--enable_query_log
+SET gtid_domain_id= @orig_domain_id;
+SET server_id= @orig_server_id;
+
+SELECT COUNT(*) FROM gtid_data;
+
+# Test that BINLOG_GTID_POS returns correct positions for every GTID position.
+--echo *** The result should be empty, otherwise some result is wrong:
+SELECT idx, gtid_pos, BINLOG_GTID_POS(file, pos)
+ FROM gtid_data
+ WHERE NOT gtid_eq(CONVERT(gtid_pos USING utf8),BINLOG_GTID_POS(file, pos))
+ ORDER BY idx;
+
+# Prepare to rewind the slave to this point to test again on same binlog.
+--connection slave
+SET @orig_pos= @@GLOBAL.gtid_slave_pos;
+SET @orig_t1_limit= (SELECT MAX(a) FROM t1);
+
+--echo *** Now connect the slave to each position in turn, and test that
+--echo *** the right amount of data is replicated at each point.
+--let $old_silent= $keep_include_silent
+--let $keep_include_silent= 1
+--let $i= 0
+--disable_query_log
+while ($i < $NUM_POS) {
+ --connection master
+ --let $gtid_pos= `SELECT gtid_pos FROM gtid_data WHERE idx=$i`
+ --let $master_count= `SELECT row_count FROM gtid_data WHERE idx=$i`
+ --connection slave
+ --disable_result_log
+ eval START SLAVE UNTIL master_gtid_pos='$gtid_pos';
+ --enable_result_log
+ --let $res= `SELECT MASTER_GTID_WAIT('$gtid_pos')`
+ if ($res != 0) {
+ --die "FAIL: MASTER_GTID_WAIT($gtid_pos) returned $res, should have been 0"
+ }
+ --source include/wait_for_slave_to_stop.inc
+ --let $slave_count = `SELECT COUNT(*) FROM t1`
+ if ($master_count != $slave_count) {
+ SELECT * FROM gtid_data ORDER BY file, pos;
+ SELECT * FROM t1 ORDER BY a;
+ --die "Not all rows replicated. $master_count on master but $slave_count on slave."
+ }
+ --let $i= `SELECT $i + ceil($NUM_POS / $NUM_SLAVE_CONNECTS)`
+}
+--enable_query_log
+
+--echo *** Test slave connecting to some GTID positions where the position in
+--echo *** the master's binlog is different between the different domains.
+--echo *** Rewind the slave and test on the same binlog data from the master as before.
+--connection slave
+SET sql_log_bin= 0;
+TRUNCATE gtid_data;
+DELETE FROM t1 WHERE a > @orig_t1_limit;
+SET sql_log_bin= 1;
+SET GLOBAL gtid_slave_pos= @orig_pos;
+
+--let $i= 0
+--disable_query_log
+while ($i <= $NUM_DOMAIN) {
+ # Build a GTID position from GTIDs that are picked at different locations
+ # in the gtid_data table for each domain.
+ --connection master
+ let $until_pos=`
+ SELECT GROUP_CONCAT(gtid SEPARATOR ',')
+ FROM gtid_data
+ WHERE idx IN (
+ SELECT MAX(gtid_data.idx) AS pick
+ FROM gtid_data
+ INNER JOIN rand_data ON (rand_data.idx = gtid_data.idx)
+ WHERE gtid_data.idx*$NUM_DOMAIN <= (domain_id + $i)*$NUM_POS
+ GROUP BY domain_id
+ )`;
+ --connection slave
+ --disable_result_log
+ eval START SLAVE UNTIL master_gtid_pos='$until_pos';
+ --enable_result_log
+ --let $res= `SELECT MASTER_GTID_WAIT('$until_pos')`
+ if ($res != 0) {
+ --die "FAIL: MASTER_GTID_WAIT($until_pos) returned $res, should have been 0"
+ }
+ --source include/wait_for_slave_to_stop.inc
+
+ inc $i;
+}
+--enable_query_log
+--let $keep_include_silent= $old_silent
+
+# Check that everything was replicated (nothing skipped).
+# We have one less row on the slave since the last UNTIL is the one before
+# the master inserted the last row.
+--connection master
+--let $master_count= `SELECT COUNT(*)-1 FROM t1`
+--connection slave
+--let $slave_count= `SELECT COUNT(*) FROM t1`
+if ($master_count != $slave_count) {
+ SELECT * FROM gtid_data ORDER BY file, pos;
+ SELECT * FROM t1 ORDER BY a;
+ --die "Not all rows replicated. $master_count on master but $slave_count on slave."
+}
+
+--connection master
+DROP TABLE gtid_data, rand_data;
+--source include/save_master_gtid.inc
+
+--connection slave
+--source include/start_slave.inc
+--source include/sync_with_master_gtid.inc
+
+--connection master
diff --git a/mysql-test/suite/rpl/r/rpl_gtid_glle_no_terminate.result b/mysql-test/suite/rpl/r/rpl_gtid_glle_no_terminate.result
index f4d257c2668..98daf309e8c 100644
--- a/mysql-test/suite/rpl/r/rpl_gtid_glle_no_terminate.result
+++ b/mysql-test/suite/rpl/r/rpl_gtid_glle_no_terminate.result
@@ -28,7 +28,6 @@ include/show_events.inc
Log_name Pos Event_type Server_id End_log_pos Info
slave-relay-bin.000002 # Rotate # # master-bin.000001;pos=POS
slave-relay-bin.000002 # Format_desc # # SERVER_VERSION, BINLOG_VERSION
-slave-relay-bin.000002 # Gtid_list # # []
slave-relay-bin.000002 # Binlog_checkpoint # # master-bin.000001
slave-relay-bin.000002 # Gtid # # GTID #-#-#
slave-relay-bin.000002 # Gtid_list # # [#-#-#]
diff --git a/mysql-test/suite/rpl/r/rpl_gtid_index.result b/mysql-test/suite/rpl/r/rpl_gtid_index.result
new file mode 100644
index 00000000000..6d8122ea76f
--- /dev/null
+++ b/mysql-test/suite/rpl/r/rpl_gtid_index.result
@@ -0,0 +1,346 @@
+include/master-slave.inc
+[connection master]
+connection slave;
+include/stop_slave.inc
+CHANGE MASTER TO master_use_gtid= slave_pos;
+include/start_slave.inc
+connection master;
+CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (0, 0);
+*** Test looking up a lot of different event positions and GTIDs.
+CREATE FUNCTION gtid_eq(a VARCHAR(255), b VARCHAR(255)) RETURNS BOOLEAN DETERMINISTIC
+BEGIN
+DECLARE g VARCHAR(255);
+IF LENGTH(a) != LENGTH(b) THEN
+RETURN FALSE;
+END IF;
+SET a= CONCAT(a, ',');
+SET b= CONCAT(',', b, ',');
+WHILE LENGTH(a) > 0 DO
+SET g= REGEXP_SUBSTR(a, '^[^,]+,');
+SET a= SUBSTRING(a, LENGTH(g)+1);
+SET b= REPLACE(b, CONCAT(',', g), ',');
+END WHILE;
+RETURN b = ',';
+END //
+SET @old_page_size= @@GLOBAL.binlog_gtid_index_page_size;
+SET @old_spars= @@GLOBAL.binlog_gtid_index_sparse;
+SET @old_span_min= @@GLOBAL.binlog_gtid_index_span_min;
+SET @old_span_max= @@GLOBAL.binlog_gtid_index_span_max;
+*** A fair amount of work with default GTID index settings.
+*** Testing 200 GTIDs with 50 test connects
+connection master;
+DELETE FROM t1 WHERE a >= 1000;
+FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
+CREATE TABLE rand_data(idx INT PRIMARY KEY, domain_id INT, server_id INT)
+ENGINE=InnoDB;
+SELECT COUNT(*), SUM(domain_id), SUM(server_id) FROM rand_data;
+COUNT(*) SUM(domain_id) SUM(server_id)
+201 285 21852
+CREATE TABLE gtid_data(
+idx INT PRIMARY KEY,
+gtid VARCHAR(44),
+gtid_pos VARCHAR(255),
+file VARCHAR(100),
+pos INT,
+row_count INT,
+KEY(file, pos)) ENGINE=InnoDB;
+include/save_master_gtid.inc
+connection slave;
+include/sync_with_master_gtid.inc
+include/stop_slave.inc
+connection master;
+SET @orig_domain_id= @@gtid_domain_id;
+SET @orig_server_id= @@server_id;
+SET gtid_domain_id= @orig_domain_id;
+SET server_id= @orig_server_id;
+SELECT COUNT(*) FROM gtid_data;
+COUNT(*)
+200
+*** The result should be empty, otherwise some result is wrong:
+SELECT idx, gtid_pos, BINLOG_GTID_POS(file, pos)
+FROM gtid_data
+WHERE NOT gtid_eq(CONVERT(gtid_pos USING utf8),BINLOG_GTID_POS(file, pos))
+ORDER BY idx;
+idx gtid_pos BINLOG_GTID_POS(file, pos)
+connection slave;
+SET @orig_pos= @@GLOBAL.gtid_slave_pos;
+SET @orig_t1_limit= (SELECT MAX(a) FROM t1);
+*** Now connect the slave to each position in turn, and test that
+*** the right amount of data is replicated at each point.
+*** Test slave connecting to some GTID positions where the position in
+*** the master's binlog is different between the different domains.
+*** Rewind the slave and test on the same binlog data from the master as before.
+connection slave;
+SET sql_log_bin= 0;
+TRUNCATE gtid_data;
+DELETE FROM t1 WHERE a > @orig_t1_limit;
+SET sql_log_bin= 1;
+SET GLOBAL gtid_slave_pos= @orig_pos;
+connection master;
+connection slave;
+connection master;
+DROP TABLE gtid_data, rand_data;
+include/save_master_gtid.inc
+connection slave;
+include/start_slave.inc
+include/sync_with_master_gtid.inc
+connection master;
+*** A lot of GTIDs with small btree pages to stress the Btree code.
+SET GLOBAL binlog_gtid_index_page_size= 64;
+SET GLOBAL binlog_gtid_index_sparse= 1;
+SET GLOBAL binlog_gtid_index_span_min= 1;
+SET GLOBAL binlog_gtid_index_span_max= 1;
+*** Testing 1000 GTIDs with 50 test connects
+connection master;
+DELETE FROM t1 WHERE a >= 1000;
+FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
+CREATE TABLE rand_data(idx INT PRIMARY KEY, domain_id INT, server_id INT)
+ENGINE=InnoDB;
+SELECT COUNT(*), SUM(domain_id), SUM(server_id) FROM rand_data;
+COUNT(*) SUM(domain_id) SUM(server_id)
+1001 2881 116394
+CREATE TABLE gtid_data(
+idx INT PRIMARY KEY,
+gtid VARCHAR(44),
+gtid_pos VARCHAR(255),
+file VARCHAR(100),
+pos INT,
+row_count INT,
+KEY(file, pos)) ENGINE=InnoDB;
+include/save_master_gtid.inc
+connection slave;
+include/sync_with_master_gtid.inc
+include/stop_slave.inc
+connection master;
+SET @orig_domain_id= @@gtid_domain_id;
+SET @orig_server_id= @@server_id;
+SET gtid_domain_id= @orig_domain_id;
+SET server_id= @orig_server_id;
+SELECT COUNT(*) FROM gtid_data;
+COUNT(*)
+1000
+*** The result should be empty, otherwise some result is wrong:
+SELECT idx, gtid_pos, BINLOG_GTID_POS(file, pos)
+FROM gtid_data
+WHERE NOT gtid_eq(CONVERT(gtid_pos USING utf8),BINLOG_GTID_POS(file, pos))
+ORDER BY idx;
+idx gtid_pos BINLOG_GTID_POS(file, pos)
+connection slave;
+SET @orig_pos= @@GLOBAL.gtid_slave_pos;
+SET @orig_t1_limit= (SELECT MAX(a) FROM t1);
+*** Now connect the slave to each position in turn, and test that
+*** the right amount of data is replicated at each point.
+*** Test slave connecting to some GTID positions where the position in
+*** the master's binlog is different between the different domains.
+*** Rewind the slave and test on the same binlog data from the master as before.
+connection slave;
+SET sql_log_bin= 0;
+TRUNCATE gtid_data;
+DELETE FROM t1 WHERE a > @orig_t1_limit;
+SET sql_log_bin= 1;
+SET GLOBAL gtid_slave_pos= @orig_pos;
+connection master;
+connection slave;
+connection master;
+DROP TABLE gtid_data, rand_data;
+include/save_master_gtid.inc
+connection slave;
+include/start_slave.inc
+include/sync_with_master_gtid.inc
+connection master;
+*** Small page size with sparse and span.
+SET GLOBAL binlog_gtid_index_page_size= 64;
+SET GLOBAL binlog_gtid_index_sparse= 10;
+SET GLOBAL binlog_gtid_index_span_min= 1024;
+SET GLOBAL binlog_gtid_index_span_max= 16384;
+*** Testing 200 GTIDs with 50 test connects
+connection master;
+DELETE FROM t1 WHERE a >= 1000;
+FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
+CREATE TABLE rand_data(idx INT PRIMARY KEY, domain_id INT, server_id INT)
+ENGINE=InnoDB;
+SELECT COUNT(*), SUM(domain_id), SUM(server_id) FROM rand_data;
+COUNT(*) SUM(domain_id) SUM(server_id)
+201 599 23410
+CREATE TABLE gtid_data(
+idx INT PRIMARY KEY,
+gtid VARCHAR(44),
+gtid_pos VARCHAR(255),
+file VARCHAR(100),
+pos INT,
+row_count INT,
+KEY(file, pos)) ENGINE=InnoDB;
+include/save_master_gtid.inc
+connection slave;
+include/sync_with_master_gtid.inc
+include/stop_slave.inc
+connection master;
+SET @orig_domain_id= @@gtid_domain_id;
+SET @orig_server_id= @@server_id;
+SET gtid_domain_id= @orig_domain_id;
+SET server_id= @orig_server_id;
+SELECT COUNT(*) FROM gtid_data;
+COUNT(*)
+200
+*** The result should be empty, otherwise some result is wrong:
+SELECT idx, gtid_pos, BINLOG_GTID_POS(file, pos)
+FROM gtid_data
+WHERE NOT gtid_eq(CONVERT(gtid_pos USING utf8),BINLOG_GTID_POS(file, pos))
+ORDER BY idx;
+idx gtid_pos BINLOG_GTID_POS(file, pos)
+connection slave;
+SET @orig_pos= @@GLOBAL.gtid_slave_pos;
+SET @orig_t1_limit= (SELECT MAX(a) FROM t1);
+*** Now connect the slave to each position in turn, and test that
+*** the right amount of data is replicated at each point.
+*** Test slave connecting to some GTID positions where the position in
+*** the master's binlog is different between the different domains.
+*** Rewind the slave and test on the same binlog data from the master as before.
+connection slave;
+SET sql_log_bin= 0;
+TRUNCATE gtid_data;
+DELETE FROM t1 WHERE a > @orig_t1_limit;
+SET sql_log_bin= 1;
+SET GLOBAL gtid_slave_pos= @orig_pos;
+connection master;
+connection slave;
+connection master;
+DROP TABLE gtid_data, rand_data;
+include/save_master_gtid.inc
+connection slave;
+include/start_slave.inc
+include/sync_with_master_gtid.inc
+connection master;
+*** Medium page size.
+SET GLOBAL binlog_gtid_index_page_size= 512;
+SET GLOBAL binlog_gtid_index_sparse= 3;
+SET GLOBAL binlog_gtid_index_span_min= 1;
+SET GLOBAL binlog_gtid_index_span_max= 65536;
+*** Testing 200 GTIDs with 50 test connects
+connection master;
+DELETE FROM t1 WHERE a >= 1000;
+FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
+CREATE TABLE rand_data(idx INT PRIMARY KEY, domain_id INT, server_id INT)
+ENGINE=InnoDB;
+SELECT COUNT(*), SUM(domain_id), SUM(server_id) FROM rand_data;
+COUNT(*) SUM(domain_id) SUM(server_id)
+201 555 23160
+CREATE TABLE gtid_data(
+idx INT PRIMARY KEY,
+gtid VARCHAR(44),
+gtid_pos VARCHAR(255),
+file VARCHAR(100),
+pos INT,
+row_count INT,
+KEY(file, pos)) ENGINE=InnoDB;
+include/save_master_gtid.inc
+connection slave;
+include/sync_with_master_gtid.inc
+include/stop_slave.inc
+connection master;
+SET @orig_domain_id= @@gtid_domain_id;
+SET @orig_server_id= @@server_id;
+SET gtid_domain_id= @orig_domain_id;
+SET server_id= @orig_server_id;
+SELECT COUNT(*) FROM gtid_data;
+COUNT(*)
+200
+*** The result should be empty, otherwise some result is wrong:
+SELECT idx, gtid_pos, BINLOG_GTID_POS(file, pos)
+FROM gtid_data
+WHERE NOT gtid_eq(CONVERT(gtid_pos USING utf8),BINLOG_GTID_POS(file, pos))
+ORDER BY idx;
+idx gtid_pos BINLOG_GTID_POS(file, pos)
+connection slave;
+SET @orig_pos= @@GLOBAL.gtid_slave_pos;
+SET @orig_t1_limit= (SELECT MAX(a) FROM t1);
+*** Now connect the slave to each position in turn, and test that
+*** the right amount of data is replicated at each point.
+*** Test slave connecting to some GTID positions where the position in
+*** the master's binlog is different between the different domains.
+*** Rewind the slave and test on the same binlog data from the master as before.
+connection slave;
+SET sql_log_bin= 0;
+TRUNCATE gtid_data;
+DELETE FROM t1 WHERE a > @orig_t1_limit;
+SET sql_log_bin= 1;
+SET GLOBAL gtid_slave_pos= @orig_pos;
+connection master;
+connection slave;
+connection master;
+DROP TABLE gtid_data, rand_data;
+include/save_master_gtid.inc
+connection slave;
+include/start_slave.inc
+include/sync_with_master_gtid.inc
+connection master;
+*** Large page size.
+SET GLOBAL binlog_gtid_index_page_size= 16384;
+SET GLOBAL binlog_gtid_index_sparse= 1;
+*** Testing 200 GTIDs with 50 test connects
+connection master;
+DELETE FROM t1 WHERE a >= 1000;
+FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
+CREATE TABLE rand_data(idx INT PRIMARY KEY, domain_id INT, server_id INT)
+ENGINE=InnoDB;
+SELECT COUNT(*), SUM(domain_id), SUM(server_id) FROM rand_data;
+COUNT(*) SUM(domain_id) SUM(server_id)
+201 571 23252
+CREATE TABLE gtid_data(
+idx INT PRIMARY KEY,
+gtid VARCHAR(44),
+gtid_pos VARCHAR(255),
+file VARCHAR(100),
+pos INT,
+row_count INT,
+KEY(file, pos)) ENGINE=InnoDB;
+include/save_master_gtid.inc
+connection slave;
+include/sync_with_master_gtid.inc
+include/stop_slave.inc
+connection master;
+SET @orig_domain_id= @@gtid_domain_id;
+SET @orig_server_id= @@server_id;
+SET gtid_domain_id= @orig_domain_id;
+SET server_id= @orig_server_id;
+SELECT COUNT(*) FROM gtid_data;
+COUNT(*)
+200
+*** The result should be empty, otherwise some result is wrong:
+SELECT idx, gtid_pos, BINLOG_GTID_POS(file, pos)
+FROM gtid_data
+WHERE NOT gtid_eq(CONVERT(gtid_pos USING utf8),BINLOG_GTID_POS(file, pos))
+ORDER BY idx;
+idx gtid_pos BINLOG_GTID_POS(file, pos)
+connection slave;
+SET @orig_pos= @@GLOBAL.gtid_slave_pos;
+SET @orig_t1_limit= (SELECT MAX(a) FROM t1);
+*** Now connect the slave to each position in turn, and test that
+*** the right amount of data is replicated at each point.
+*** Test slave connecting to some GTID positions where the position in
+*** the master's binlog is different between the different domains.
+*** Rewind the slave and test on the same binlog data from the master as before.
+connection slave;
+SET sql_log_bin= 0;
+TRUNCATE gtid_data;
+DELETE FROM t1 WHERE a > @orig_t1_limit;
+SET sql_log_bin= 1;
+SET GLOBAL gtid_slave_pos= @orig_pos;
+connection master;
+connection slave;
+connection master;
+DROP TABLE gtid_data, rand_data;
+include/save_master_gtid.inc
+connection slave;
+include/start_slave.inc
+include/sync_with_master_gtid.inc
+connection master;
+connection master;
+SET GLOBAL binlog_gtid_index_page_size= @old_page_size;
+SET GLOBAL binlog_gtid_index_sparse= @old_spars;
+SET GLOBAL binlog_gtid_index_span_min= @old_span_min;
+SET GLOBAL binlog_gtid_index_span_max= @old_span_max;
+DROP TABLE t1;
+DROP FUNCTION gtid_eq;
+include/rpl_end.inc
diff --git a/mysql-test/suite/rpl/t/rpl_gtid_glle_no_terminate.test b/mysql-test/suite/rpl/t/rpl_gtid_glle_no_terminate.test
index f0f38a31da6..8d8f22bb1e7 100644
--- a/mysql-test/suite/rpl/t/rpl_gtid_glle_no_terminate.test
+++ b/mysql-test/suite/rpl/t/rpl_gtid_glle_no_terminate.test
@@ -24,6 +24,7 @@ CHANGE MASTER TO MASTER_USE_GTID=slave_pos;
--echo #
--echo # Initialize test data
--connection master
+--source include/wait_for_binlog_checkpoint.inc
create table t1 (a int);
SET @@session.server_id= 3;
create table t2 (a int);
diff --git a/mysql-test/suite/rpl/t/rpl_gtid_index.test b/mysql-test/suite/rpl/t/rpl_gtid_index.test
new file mode 100644
index 00000000000..1acb58ba492
--- /dev/null
+++ b/mysql-test/suite/rpl/t/rpl_gtid_index.test
@@ -0,0 +1,98 @@
+--source include/have_innodb.inc
+--source include/master-slave.inc
+--source include/have_binlog_format_mixed.inc
+
+--connection slave
+--source include/stop_slave.inc
+CHANGE MASTER TO master_use_gtid= slave_pos;
+--source include/start_slave.inc
+
+--connection master
+CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (0, 0);
+
+
+--echo *** Test looking up a lot of different event positions and GTIDs.
+
+# A function for comparing GTID positions.
+# Handles that the domain_id order is different in the two strings.
+# Works by repeatedly removing one GTID from each string. If the strings have
+# the same length and nothing is left at the end, then they are identical.
+delimiter //;
+CREATE FUNCTION gtid_eq(a VARCHAR(255), b VARCHAR(255)) RETURNS BOOLEAN DETERMINISTIC
+BEGIN
+ DECLARE g VARCHAR(255);
+ IF LENGTH(a) != LENGTH(b) THEN
+ RETURN FALSE;
+ END IF;
+ SET a= CONCAT(a, ',');
+ SET b= CONCAT(',', b, ',');
+ WHILE LENGTH(a) > 0 DO
+ SET g= REGEXP_SUBSTR(a, '^[^,]+,');
+ SET a= SUBSTRING(a, LENGTH(g)+1);
+ SET b= REPLACE(b, CONCAT(',', g), ',');
+ END WHILE;
+ RETURN b = ',';
+END //
+delimiter ;//
+
+SET @old_page_size= @@GLOBAL.binlog_gtid_index_page_size;
+SET @old_spars= @@GLOBAL.binlog_gtid_index_sparse;
+SET @old_span_min= @@GLOBAL.binlog_gtid_index_span_min;
+SET @old_span_max= @@GLOBAL.binlog_gtid_index_span_max;
+
+--echo *** A fair amount of work with default GTID index settings.
+--let $NUM_POS= 200
+--let $NUM_DOMAIN= 5
+--let $NUM_SERVER= 5
+--let $NUM_SLAVE_CONNECTS= 50
+--let $RND_SEED= 42
+--source suite/rpl/include/rpl_gtid_index.inc
+
+--echo *** A lot of GTIDs with small btree pages to stress the Btree code.
+--let $NUM_POS= 1000
+--let $NUM_DOMAIN= 10
+--let $RND_SEED= 150
+SET GLOBAL binlog_gtid_index_page_size= 64;
+SET GLOBAL binlog_gtid_index_sparse= 1;
+SET GLOBAL binlog_gtid_index_span_min= 1;
+SET GLOBAL binlog_gtid_index_span_max= 1;
+--source suite/rpl/include/rpl_gtid_index.inc
+
+--echo *** Small page size with sparse and span.
+--let $NUM_POS= 200
+--let $RND_SEED= 666
+SET GLOBAL binlog_gtid_index_page_size= 64;
+SET GLOBAL binlog_gtid_index_sparse= 10;
+SET GLOBAL binlog_gtid_index_span_min= 1024;
+SET GLOBAL binlog_gtid_index_span_max= 16384;
+--source suite/rpl/include/rpl_gtid_index.inc
+
+--echo *** Medium page size.
+--let $NUM_POS= 200
+--let $RND_SEED= 1024
+SET GLOBAL binlog_gtid_index_page_size= 512;
+SET GLOBAL binlog_gtid_index_sparse= 3;
+SET GLOBAL binlog_gtid_index_span_min= 1;
+SET GLOBAL binlog_gtid_index_span_max= 65536;
+--source suite/rpl/include/rpl_gtid_index.inc
+
+--echo *** Large page size.
+--let $NUM_POS= 200
+--let $RND_SEED= 12345
+SET GLOBAL binlog_gtid_index_page_size= 16384;
+SET GLOBAL binlog_gtid_index_sparse= 1;
+--source suite/rpl/include/rpl_gtid_index.inc
+
+
+# Cleanup.
+--connection master
+SET GLOBAL binlog_gtid_index_page_size= @old_page_size;
+SET GLOBAL binlog_gtid_index_sparse= @old_spars;
+SET GLOBAL binlog_gtid_index_span_min= @old_span_min;
+SET GLOBAL binlog_gtid_index_span_max= @old_span_max;
+
+DROP TABLE t1;
+DROP FUNCTION gtid_eq;
+
+--source include/rpl_end.inc
diff --git a/sql/CMakeLists.txt b/sql/CMakeLists.txt
index ea4d6c9ae9a..fe57e82bf56 100644
--- a/sql/CMakeLists.txt
+++ b/sql/CMakeLists.txt
@@ -163,7 +163,7 @@ SET (SQL_SOURCE
gcalc_slicescan.cc gcalc_tools.cc
my_apc.cc mf_iocache_encr.cc item_jsonfunc.cc
my_json_writer.cc json_schema.cc json_schema_helper.cc
- rpl_gtid.cc rpl_parallel.cc
+ rpl_gtid.cc gtid_index.cc rpl_parallel.cc
semisync.cc semisync_master.cc semisync_slave.cc
semisync_master_ack_receiver.cc
sp_instr.cc
diff --git a/sql/gtid_index.cc b/sql/gtid_index.cc
new file mode 100644
index 00000000000..a58fadc5405
--- /dev/null
+++ b/sql/gtid_index.cc
@@ -0,0 +1,1440 @@
+/*
+ Copyright (c) 2023 Kristian Nielsen <knielsen@knielsen-hq.org>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+*/
+
+#include "gtid_index.h"
+#include "sql_const.h"
+#include "log.h"
+
+
+static const uchar GTID_INDEX_MAGIC[8]= {
+ 'M', 'D', 'B', 'B', 'L', 'I', 'D', 'X'
+};
+
+Gtid_index_writer *Gtid_index_writer::hot_index_list= nullptr;
+/* gtid_index_mutex is inited in MYSQL_LOG::init_pthread_objects(). */
+mysql_mutex_t Gtid_index_writer::gtid_index_mutex;
+
+
+Gtid_index_writer::Gtid_index_writer(const char *filename, uint32 offset,
+ rpl_binlog_state_base *binlog_state,
+ uint32 opt_page_size, uint32 opt_sparse,
+ my_off_t opt_span_min,
+ my_off_t opt_span_max)
+ : gtid_threshold(opt_sparse),
+ offset_min_threshold(opt_span_min), offset_max_threshold(opt_span_max),
+ nodes(nullptr), previous_offset(0),
+ max_level(0), pending_gtid_count(0), index_file(-1),
+ error_state(false), file_header_written(false), in_hot_index_list(false)
+{
+ uint32 count;
+ rpl_gtid *gtid_list;
+ page_size= opt_page_size;
+ pending_state.init();
+
+ if (alloc_level_if_missing(0))
+ {
+ give_error("Out of memory allocating node list");
+ return;
+ }
+
+ /*
+ Lock the index mutex at this point just before we create the new index
+ file on disk. From this point on, and until the index is fully written,
+ the reader will find us in the "hot index" list and will be able to read
+ from the index while it's still being constructed.
+ */
+ lock_gtid_index();
+
+ build_index_filename(filename);
+ int create_flags= O_WRONLY|O_TRUNC|O_BINARY|O_EXCL;
+ index_file= mysql_file_create(key_file_gtid_index, index_file_name,
+ CREATE_MODE, create_flags, MYF(0));
+ if (index_file < 0 && my_errno == EEXIST)
+ {
+ /*
+ It shouldn't happen that an old GTID index file remains, as we remove
+ them as part of RESET MASTER and PURGE BINARY LOGS. But if it happens
+ due to some external file copy of the user or something, delete any old
+ GTID index file first.
+ */
+ sql_print_information("Old GTID index file found '%s', deleting",
+ index_file_name);
+ my_errno= 0;
+ mysql_file_delete(key_file_gtid_index, index_file_name, MYF(0));
+ index_file= mysql_file_create(key_file_gtid_index, index_file_name,
+ CREATE_MODE, create_flags, MYF(0));
+ }
+ if (index_file < 0)
+ {
+ give_error("Failed to open new index file for writing");
+ goto err;
+ }
+
+ /*
+ Write out an initial index record, i.e. corresponding to the GTID_LIST
+ event / binlog state at the start of the binlog file.
+ */
+ count= binlog_state->count_nolock();
+ gtid_list= gtid_list_buffer(count);
+ if (count > 0)
+ {
+ if (!gtid_list)
+ goto err;
+ binlog_state->get_gtid_list_nolock(gtid_list, count);
+ }
+ write_record(offset, gtid_list, count);
+
+ insert_in_hot_index();
+
+err:
+ unlock_gtid_index();
+}
+
+
+Gtid_index_writer::~Gtid_index_writer()
+{
+ if (in_hot_index_list)
+ {
+ lock_gtid_index();
+ close();
+ unlock_gtid_index();
+ }
+
+ if (index_file > 0)
+ {
+ /*
+ Should have been closed by call to Gtid_index_writer::close().
+ We can at least avoid leaking file descriptor.
+ */
+ mysql_file_close(index_file, MYF(0));
+ }
+
+ if (nodes)
+ {
+ for (uint32 i= 0; i <= max_level; ++i)
+ delete nodes[i];
+ my_free(nodes);
+ }
+
+ /*
+ state.free() is not needed here, will be called from rpl_binlog_state_base
+ destructor.
+ */
+}
+
+
+void
+Gtid_index_writer::gtid_index_init()
+{
+ mysql_mutex_init(key_gtid_index_lock, &gtid_index_mutex, MY_MUTEX_INIT_SLOW);
+}
+
+void
+Gtid_index_writer::gtid_index_cleanup()
+{
+ mysql_mutex_destroy(&gtid_index_mutex);
+}
+
+
+const Gtid_index_writer *
+Gtid_index_writer::find_hot_index(const char *file_name)
+{
+ mysql_mutex_assert_owner(&gtid_index_mutex);
+
+ for (const Gtid_index_writer *p= hot_index_list; p; p= p->next_hot_index)
+ {
+ if (0 == strcmp(file_name, p->index_file_name))
+ return p;
+ }
+ return nullptr;
+}
+
+void
+Gtid_index_writer::insert_in_hot_index()
+{
+ mysql_mutex_assert_owner(&gtid_index_mutex);
+
+ next_hot_index= hot_index_list;
+ hot_index_list= this;
+ in_hot_index_list= true;
+}
+
+
+void
+Gtid_index_writer::remove_from_hot_index()
+{
+ mysql_mutex_assert_owner(&gtid_index_mutex);
+
+ Gtid_index_writer **next_ptr_ptr= &hot_index_list;
+ for (;;)
+ {
+ Gtid_index_writer *p= *next_ptr_ptr;
+ if (!p)
+ break;
+ if (p == this)
+ {
+ *next_ptr_ptr= p->next_hot_index;
+ break;
+ }
+ next_ptr_ptr= &p->next_hot_index;
+ }
+ next_hot_index= nullptr;
+ in_hot_index_list= false;
+}
+
+void
+Gtid_index_writer::process_gtid(uint32 offset, const rpl_gtid *gtid)
+{
+ rpl_gtid *gtid_list;
+ uint32 gtid_count;
+
+ if (process_gtid_check_batch(offset, gtid, &gtid_list, &gtid_count))
+ return; // Error
+
+ if (gtid_list)
+ async_update(offset, gtid_list, gtid_count);
+}
+
+
+int
+Gtid_index_writer::process_gtid_check_batch(uint32 offset, const rpl_gtid *gtid,
+ rpl_gtid **out_gtid_list,
+ uint32 *out_gtid_count)
+{
+ uint32 count;
+ rpl_gtid *gtid_list;
+
+ mysql_mutex_assert_not_owner(&gtid_index_mutex);
+
+ ++pending_gtid_count;
+ if (unlikely(pending_state.update_nolock(gtid)))
+ {
+ give_error("Out of memory processing GTID for binlog GTID index");
+ return 1;
+ }
+ /*
+ Sparse index; we record only selected GTIDs, and scan the binlog forward
+ from there to find the exact spot.
+ */
+ if (offset - previous_offset < offset_max_threshold &&
+ (offset - previous_offset < offset_min_threshold ||
+ pending_gtid_count < gtid_threshold))
+ {
+ *out_gtid_list= nullptr;
+ *out_gtid_count= 0;
+ return 0;
+ }
+
+ count= pending_state.count_nolock();
+ DBUG_ASSERT(count > 0 /* Since we just updated with a GTID. */);
+ gtid_list= (rpl_gtid *)
+ my_malloc(key_memory_binlog_gtid_index, count*sizeof(*gtid_list), MYF(0));
+ if (unlikely(!gtid_list))
+ {
+ give_error("Out of memory allocating GTID list for binlog GTID index");
+ return 1;
+ }
+ if (unlikely(pending_state.get_gtid_list_nolock(gtid_list, count)))
+ {
+ /* Shouldn't happen as we allocated the list with the correct length. */
+ DBUG_ASSERT(false);
+ give_error("Internal error allocating GTID list for binlog GTID index");
+ my_free(gtid_list);
+ return 1;
+ }
+ pending_state.reset_nolock();
+ previous_offset= offset;
+ pending_gtid_count= 0;
+ *out_gtid_list= gtid_list;
+ *out_gtid_count= count;
+ return 0;
+}
+
+
+int
+Gtid_index_writer::async_update(uint32 event_offset,
+ rpl_gtid *gtid_list,
+ uint32 gtid_count)
+{
+ lock_gtid_index();
+ int res= write_record(event_offset, gtid_list, gtid_count);
+ unlock_gtid_index();
+ my_free(gtid_list);
+ return res;
+}
+
+
+void
+Gtid_index_writer::close()
+{
+ lock_gtid_index();
+ if (!error_state)
+ {
+
+ /*
+ Write out the remaining pending pages, and insert the final child pointer
+ in interior nodes.
+ */
+ for (uint32 level= 0; ; ++level)
+ {
+ uint32 node_ptr= write_current_node(level, level==max_level);
+ nodes[level]->reset();
+ if (!node_ptr || level >= max_level)
+ break;
+ add_child_ptr(level+1, node_ptr);
+ }
+ }
+ remove_from_hot_index();
+ unlock_gtid_index();
+
+ if (!error_state)
+ {
+ if (mysql_file_sync(index_file, MYF(0)))
+ give_error("Error syncing index file to disk");
+ }
+
+ mysql_file_close(index_file, MYF(0));
+ index_file= (File)-1;
+}
+
+
+Gtid_index_base::Index_node_base::Index_node_base()
+ : first_page(nullptr), current_page(nullptr), current_ptr(nullptr)
+{
+}
+
+
+Gtid_index_base::Index_node_base::~Index_node_base()
+{
+ free_pages();
+}
+
+
+void
+Gtid_index_base::Index_node_base::free_pages()
+{
+ for (Node_page *p= first_page; p; )
+ {
+ Node_page *q= p->next;
+ my_free(p);
+ p= q;
+ }
+}
+
+
+void
+Gtid_index_base::Index_node_base::reset()
+{
+ free_pages();
+ first_page= current_page= nullptr;
+}
+
+
+Gtid_index_base::Gtid_index_base()
+ : gtid_buffer(nullptr), gtid_buffer_alloc(0)
+{
+}
+
+
+Gtid_index_base::~Gtid_index_base()
+{
+ if (gtid_buffer_alloc > 0)
+ my_free(gtid_buffer);
+}
+
+
+void
+Gtid_index_base::make_gtid_index_file_name(char *out_name, size_t bufsize,
+ const char *base_filename)
+{
+ char *p= strmake(out_name, base_filename, bufsize-1);
+ size_t remain= bufsize - (p - out_name);
+ strmake(p, ".idx", remain-1);
+}
+
+
+void
+Gtid_index_base::build_index_filename(const char *filename)
+{
+ make_gtid_index_file_name(index_file_name, sizeof(index_file_name), filename);
+}
+
+
+rpl_gtid *
+Gtid_index_base::gtid_list_buffer(uint32 count)
+{
+ if (gtid_buffer_alloc >= count)
+ return gtid_buffer;
+ rpl_gtid *new_buffer= (rpl_gtid *)
+ my_malloc(key_memory_binlog_gtid_index, count*sizeof(*new_buffer), MYF(0));
+ if (!new_buffer)
+ {
+ give_error("Out of memory allocating buffer for GTID list");
+ return NULL;
+ }
+ my_free(gtid_buffer);
+ gtid_buffer= new_buffer;
+ gtid_buffer_alloc= count;
+ return new_buffer;
+}
+
+
+Gtid_index_writer::Index_node::Index_node(uint32 level_)
+ : num_records(0), level(level_), force_spill_page(false)
+{
+ state.init();
+}
+
+
+Gtid_index_writer::Index_node::~Index_node()
+{
+ free_pages();
+}
+
+
+uint32
+Gtid_index_writer::write_current_node(uint32 level, bool is_root)
+{
+ Index_node *n= nodes[level];
+
+ uint32 node_pos= (uint32)mysql_file_tell(index_file, MYF(0));
+
+ for (Node_page *p= n->first_page; p ; p= p->next)
+ {
+ if (unlikely(is_root))
+ *(p->flag_ptr) |= PAGE_FLAG_ROOT;
+ if (likely(!p->next))
+ *(p->flag_ptr) |= PAGE_FLAG_LAST;
+ int4store(p->page + page_size - CHECKSUM_LEN,
+ my_checksum(0, p->page, page_size - CHECKSUM_LEN));
+ if (mysql_file_write(index_file, p->page, page_size, MYF(MY_NABP)))
+ {
+ give_error("Error writing index page");
+ return 0;
+ }
+ }
+
+ DBUG_ASSERT(node_pos % page_size == 0);
+ /* Page numbers are +1 just so that zero can denote invalid page pointer. */
+ return 1 + (node_pos / (uint32)page_size);
+}
+
+
+void
+Gtid_index_writer::Index_node::reset()
+{
+ Index_node_base::reset();
+ state.reset_nolock();
+ num_records= 0;
+ force_spill_page= false;
+}
+
+
+/*
+ Make sure there is requested space in the current page, by allocating a
+ new spill page if necessary.
+*/
+int
+Gtid_index_writer::reserve_space(Index_node *n, size_t bytes)
+{
+ DBUG_ASSERT(bytes <= page_size);
+ if (likely(n->current_page) &&
+ likely(n->current_ptr - n->current_page->page + bytes <=
+ (page_size - CHECKSUM_LEN)))
+ return 0;
+ /* Not enough room, allocate a spill page. */
+ Node_page *page= alloc_page();
+ n->force_spill_page= false;
+ if (!page)
+ return 1;
+ n->current_ptr=
+ init_header(page, n->level==0, !n->current_page);
+ if (n->current_page)
+ n->current_page->next= page;
+ else
+ n->first_page= page;
+ n->current_page= page;
+ return 0;
+}
+
+
+int
+Gtid_index_writer::do_write_record(uint32 level,
+ uint32 event_offset,
+ const rpl_gtid *gtid_list,
+ uint32 gtid_count)
+{
+ DBUG_ASSERT(level <= max_level);
+ Index_node *n= nodes[level];
+ if (reserve_space(n, 8))
+ return 1;
+ /* Store the count as +1, so that 0 can mean "no more records". */
+ int4store(n->current_ptr, gtid_count+1);
+ int4store(n->current_ptr+4, event_offset);
+ n->current_ptr+= 8;
+ for (uint32 i= 0; i < gtid_count; ++i)
+ {
+ if (reserve_space(n, 16))
+ return 1;
+ int4store(n->current_ptr, gtid_list[i].domain_id);
+ int4store(n->current_ptr+4, gtid_list[i].server_id);
+ int8store(n->current_ptr+8, gtid_list[i].seq_no);
+ n->current_ptr+= 16;
+ }
+
+ ++n->num_records;
+ return 0;
+}
+
+
+/*
+ Add a child pointer to the current node on LEVEL.
+ The first page has node_ptr=1 just so that a zero node_ptr can be used as
+ a no/invalid value (effectively node_ptr points to the end of the target
+ page, in units of pages).
+
+ Adding a child pointer should never spill to a new page; the code must make
+ sure that there is always room for the final child pointer in the current
+ non-leaf node.
+*/
+int
+Gtid_index_writer::add_child_ptr(uint32 level, my_off_t node_offset)
+{
+ DBUG_ASSERT(level <= max_level);
+ DBUG_ASSERT(node_offset > 0);
+ Index_node *n= nodes[level];
+ if (reserve_space(n, 4))
+ return 1;
+ DBUG_ASSERT(n->current_page);
+ DBUG_ASSERT((size_t)(n->current_ptr - n->current_page->page + 4) <=
+ page_size - CHECKSUM_LEN);
+
+ int4store(n->current_ptr, node_offset);
+ n->current_ptr+= 4;
+ return 0;
+}
+
+
+/*
+ Write one index record to the GTID index, flushing nodes and allocating
+ new nodes as necessary.
+*/
+int
+Gtid_index_writer::write_record(uint32 event_offset,
+ const rpl_gtid *gtid_list,
+ uint32 gtid_count)
+{
+ if (error_state)
+ return 1; /* Avoid continuing on a possibly corrupt state. */
+
+ uint32 level= 0;
+ /*
+ The most frequent case is when there is room in the current page for the
+ current position to be written, in which case we exit early in the first
+ iteration of the following loop.
+
+ In the general case, we move up through the path to the root, writing
+ lower-level node page to disk and adding child pointers in higher-level
+ nodes, until we reach a node that has room. This final node may be a
+ freshly allocated new root node in the few times when the height of the
+ tree increases.
+ */
+ for (;;)
+ {
+ Index_node *n= nodes[level];
+ if (update_gtid_state(&n->state, gtid_list, gtid_count))
+ return give_error("Out of memory updating the local GTID state");
+
+ if (check_room(level, gtid_count))
+ {
+ /* There is room in the node, just add the index record. */
+ return do_write_record(level, event_offset, gtid_list, gtid_count);
+ }
+
+ /*
+ This node is full:
+ - First, write out this node to disk.
+ - Add a child pointer in the parent node (allocating one if needed).
+ - On level 0, allocate a new leaf node and add the index record there.
+ - On levels >0, skip the last index record when the node gets full
+ (B+-Tree has (k-1) keys for k child pointers).
+ - Loop to the parent node to add an index record there.
+ */
+ uint32 node_ptr= write_current_node(level, false);
+ if (!node_ptr)
+ return 1;
+ if (alloc_level_if_missing(level+1) ||
+ add_child_ptr(level+1, node_ptr))
+ return 1;
+ uint32 new_count= n->state.count_nolock();
+ rpl_gtid *new_gtid_list= gtid_list_buffer(new_count);
+ if (new_count > 0 && !new_gtid_list)
+ return 1;
+ if (n->state.get_gtid_list_nolock(new_gtid_list, new_count))
+ return give_error("Internal error processing GTID state");
+ n->reset();
+ if (level == 0)
+ {
+ if (do_write_record(level, event_offset, new_gtid_list, new_count))
+ return 1;
+ }
+ else
+ {
+ /*
+ Allocate a page for the node. This is mostly to help the reader of a hot
+ index not see NULL pointers, and we will need the page later anyway to
+ hold at least one child pointer to the level below.
+ */
+ if (reserve_space(n, 4))
+ return 1;
+ }
+ gtid_list= new_gtid_list;
+ gtid_count= new_count;
+ ++level;
+ }
+ // NotReached.
+}
+
+
+bool
+Gtid_index_writer::check_room(uint32 level, uint32 gtid_count)
+{
+ Index_node *n= nodes[level];
+ /* There's always room in an empty (to-be-allocated) page. */
+ if (!n->current_page || n->num_records == 0)
+ return true;
+ /*
+ Make sure at least half a page of room is used for records after the
+ initial record; if the initial record left less than that, set a flag so
+ that a spill page is allocated later as needed instead of closing the node
+ early.
+ */
+ size_t avail= page_size - CHECKSUM_LEN - (n->current_ptr - n->current_page->page);
+ if (n->num_records==1 && avail < page_size/2)
+ {
+ n->force_spill_page= true;
+ return true;
+ }
+ if (n->force_spill_page)
+ return true;
+ size_t needed= 8 + 16*gtid_count;
+ /* Non-leaf pages need extra 4 bytes for a child pointer. */
+ if (level > 0)
+ needed+= 4;
+ return needed <= avail;
+}
+
+
+int
+Gtid_index_writer::alloc_level_if_missing(uint32 level)
+{
+ if (likely(nodes))
+ {
+ if (likely(max_level >= level))
+ return 0;
+ DBUG_ASSERT(level == max_level+1); // Alloc one at a time
+ }
+
+ Index_node *node= new Index_node(level);
+ if (!node)
+ return give_error("Out of memory allocating new node");
+ Index_node **new_nodes= (Index_node **)
+ my_realloc(key_memory_binlog_gtid_index, nodes, (level+1)*sizeof(*nodes),
+ MYF(MY_ALLOW_ZERO_PTR|MY_ZEROFILL));
+ if (!new_nodes)
+ {
+ delete node;
+ return give_error("Out of memory allocating larger node list");
+ }
+ new_nodes[level]= node;
+ nodes= new_nodes;
+ max_level= level;
+ return 0;
+}
+
+
+/*
+ Initialize the start of a data page.
+ This is at the start of a page, except for the very first page where it
+ comes after the global file header.
+ Format:
+ 0 flags.
+ 1-3 unused padding/reserved.
+
+ The argument FIRST denotes if this is the first page (if false it is a
+ continuation page).
+*/
+uchar *
+Gtid_index_writer::init_header(Node_page *page, bool is_leaf, bool is_first)
+{
+ uchar *p= page->page;
+ bool is_file_header= !file_header_written;
+
+ if (unlikely(is_file_header))
+ {
+ memcpy(p, GTID_INDEX_MAGIC, sizeof(GTID_INDEX_MAGIC));
+ p+= sizeof(GTID_INDEX_MAGIC);
+ *p++= GTID_INDEX_VERSION_MAJOR;
+ *p++= GTID_INDEX_VERSION_MINOR;
+ /* Flags/padding currently unused. */
+ *p++= 0;
+ *p++= 0;
+ int4store(p, page_size);
+ p+= 4;
+ DBUG_ASSERT(p == page->page + GTID_INDEX_FILE_HEADER_SIZE);
+ file_header_written= true;
+ }
+
+ uchar flags= 0;
+ if (is_leaf)
+ flags|= PAGE_FLAG_IS_LEAF;
+ if (unlikely(!is_first))
+ flags|= PAGE_FLAG_IS_CONT;
+ page->flag_ptr= p;
+ *p++= flags;
+ /* Padding/reserved. */
+ p+= 3;
+ DBUG_ASSERT(p == page->page +
+ (is_file_header ? GTID_INDEX_FILE_HEADER_SIZE : 0) +
+ GTID_INDEX_PAGE_HEADER_SIZE);
+ DBUG_ASSERT((size_t)(p - page->page) < page_size - CHECKSUM_LEN);
+ return p;
+}
+
+
+int
+Gtid_index_base::update_gtid_state(rpl_binlog_state_base *state,
+ const rpl_gtid *gtid_list, uint32 gtid_count)
+{
+ for (uint32 i= 0; i < gtid_count; ++i)
+ if (state->update_nolock(&gtid_list[i]))
+ return 1;
+ return 0;
+}
+
+
+Gtid_index_base::Node_page *Gtid_index_base::alloc_page()
+{
+ Node_page *new_node= (Node_page *)
+ my_malloc(key_memory_binlog_gtid_index,
+ sizeof(Node_page) + page_size,
+ MYF(MY_ZEROFILL));
+ if (!new_node)
+ give_error("Out of memory for allocating index page");
+ return new_node;
+}
+
+
+int Gtid_index_writer::give_error(const char *msg)
+{
+ if (!error_state)
+ {
+ sql_print_information("Error during binlog GTID index creation, will "
+ "fallback to slower sequential binlog scan. "
+ "Error is: %s", msg);
+ error_state= true;
+ }
+ return 1;
+}
+
+
+Gtid_index_reader::Gtid_index_reader()
+ : n(nullptr), index_file(-1),
+ file_open(false), index_valid(false), has_root_node(false),
+ version_major(0), version_minor(0)
+{
+ current_state.init();
+ compare_state.init();
+}
+
+
+Gtid_index_reader::~Gtid_index_reader()
+{
+ if (file_open)
+ mysql_file_close(index_file, MYF(0));
+}
+
+
+int
+Gtid_index_reader::search_offset(uint32 in_offset,
+ uint32 *out_offset, uint32 *out_gtid_count)
+{
+ in_search_offset= in_offset;
+ search_cmp_function= &Gtid_index_reader::search_cmp_offset;
+
+ return do_index_search(out_offset, out_gtid_count);
+}
+
+int
+Gtid_index_reader::search_gtid_pos(slave_connection_state *in_gtid_pos,
+ uint32 *out_offset, uint32 *out_gtid_count)
+{
+ in_search_gtid_pos= in_gtid_pos;
+ search_cmp_function= &Gtid_index_reader::search_cmp_gtid_pos;
+
+ int res= do_index_search(out_offset, out_gtid_count);
+ /* Let's not leave a dangling pointer to the caller's memory. */
+ in_search_gtid_pos= nullptr;
+
+ return res;
+}
+
+rpl_gtid *
+Gtid_index_reader::search_gtid_list()
+{
+ return gtid_buffer;
+}
+
+
+int
+Gtid_index_reader::search_cmp_offset(uint32 offset,
+ rpl_binlog_state_base *state)
+{
+ if (offset <= in_search_offset)
+ return 0;
+ else
+ return -1;
+}
+
+
+int
+Gtid_index_reader::search_cmp_gtid_pos(uint32 offset,
+ rpl_binlog_state_base *state)
+{
+ if (state->is_before_pos(in_search_gtid_pos))
+ return 0;
+ else
+ return -1;
+}
+
+
+int
+Gtid_index_reader::next_page()
+{
+ if (!read_page->next)
+ return 1;
+ read_page= read_page->next;
+ read_ptr= read_page->flag_ptr + 4;
+ return 0;
+}
+
+
+int
+Gtid_index_reader::find_bytes(uint32 num_bytes)
+{
+ if (read_ptr - read_page->page + num_bytes <=
+ (my_ptrdiff_t)(page_size - CHECKSUM_LEN))
+ return 0;
+ return next_page();
+}
+
+
+int
+Gtid_index_reader::get_child_ptr(uint32 *out_child_ptr)
+{
+ if (find_bytes(4))
+ return give_error("Corrupt index, short index node");
+ *out_child_ptr= (uint32)uint4korr(read_ptr);
+ read_ptr+= 4;
+ return 0;
+}
+
+
+/*
+ Read the start of an index record (count of GTIDs in the differential state
+ and offset).
+ Returns:
+ 0 ok
+ 1 EOF, no more data in this node
+*/
+int
+Gtid_index_reader::get_offset_count(uint32 *out_offset, uint32 *out_gtid_count)
+{
+ if (find_bytes(8))
+ return 1;
+ uint32 gtid_count= uint4korr(read_ptr);
+ if (gtid_count == 0)
+ {
+ /* 0 means invalid/no record (we store N+1 for N GTIDs in record). */
+ return 1;
+ }
+ *out_gtid_count= gtid_count - 1;
+ *out_offset= uint4korr(read_ptr + 4);
+ read_ptr+= 8;
+ return 0;
+}
+
+
+int
+Gtid_index_reader::get_gtid_list(rpl_gtid *out_gtid_list, uint32 count)
+{
+ for (uint32 i= 0; i < count; ++i)
+ {
+ if (find_bytes(16))
+ return give_error("Corrupt index, short index node");
+ out_gtid_list[i].domain_id= uint4korr(read_ptr);
+ out_gtid_list[i].server_id= uint4korr(read_ptr + 4);
+ out_gtid_list[i].seq_no= uint8korr(read_ptr + 8);
+ read_ptr+= 16;
+ }
+ return 0;
+}
+
+
+int
+Gtid_index_reader::open_index_file(const char *binlog_filename)
+{
+ close_index_file();
+ build_index_filename(binlog_filename);
+ if ((index_file= mysql_file_open(key_file_gtid_index, index_file_name,
+ O_RDONLY|O_BINARY, MYF(0))) < 0)
+ return 1; // No error for missing index (eg. upgrade)
+
+ file_open= true;
+ if (read_file_header())
+ return 1;
+
+ return 0;
+}
+
+void
+Gtid_index_reader::close_index_file()
+{
+ if (!file_open)
+ return;
+ mysql_file_close(index_file, MYF(0));
+ file_open= false;
+ index_valid= false;
+}
+
+
+int
+Gtid_index_reader::do_index_search(uint32 *out_offset, uint32 *out_gtid_count)
+{
+ /* In a cold index, we require a complete index with a valid root node. */
+ if (!has_root_node)
+ return -1;
+
+ return do_index_search_root(out_offset, out_gtid_count);
+}
+
+
+int
+Gtid_index_reader::do_index_search_root(uint32 *out_offset,
+ uint32 *out_gtid_count)
+{
+ current_state.reset_nolock();
+ compare_state.reset_nolock();
+ /*
+ These states will be initialized to the full state stored at the start of
+ the root node and then incrementally updated.
+ */
+ bool current_state_updated= false;
+
+ if (read_root_node())
+ return -1;
+ for (;;)
+ {
+ if (*n->first_page->flag_ptr & PAGE_FLAG_IS_LEAF)
+ break;
+
+ if (compare_state.load_nolock(&current_state))
+ {
+ give_error("Out of memory allocating GTID list");
+ return -1;
+ }
+ uint32 child_ptr;
+ if (get_child_ptr(&child_ptr))
+ return -1;
+
+ /* Scan over the keys in the node to find the child pointer to follow */
+ for (;;)
+ {
+ uint32 offset, gtid_count;
+ int res= get_offset_count(&offset, &gtid_count);
+ if (res == 1) // EOF?
+ {
+ /* Follow the right-most child pointer. */
+ if (read_node(child_ptr))
+ return -1;
+ break;
+ }
+ rpl_gtid *gtid_list= gtid_list_buffer(gtid_count);
+ uint32 child2_ptr;
+ if ((gtid_count > 0 && !gtid_list) ||
+ get_gtid_list(gtid_list, gtid_count) ||
+ get_child_ptr(&child2_ptr))
+ return -1;
+ if (update_gtid_state(&compare_state, gtid_list, gtid_count))
+ return -1;
+ int cmp= (this->*search_cmp_function)(offset, &compare_state);
+ if (cmp < 0)
+ {
+ /* Follow the left child of this key. */
+ if (read_node(child_ptr))
+ return -1;
+ break;
+ }
+ /* Continue to scan the next key. */
+ update_gtid_state(&current_state, gtid_list, gtid_count);
+ current_state_updated= true;
+ current_offset= offset;
+ child_ptr= child2_ptr;
+ }
+ }
+ return do_index_search_leaf(current_state_updated,
+ out_offset, out_gtid_count);
+}
+
+int Gtid_index_reader::do_index_search_leaf(bool current_state_updated,
+ uint32 *out_offset,
+ uint32 *out_gtid_count)
+{
+ uint32 offset, gtid_count;
+ int res= get_offset_count(&offset, &gtid_count);
+ if (res == 1)
+ {
+ DBUG_ASSERT(0);
+ give_error("Corrupt index; empty leaf node");
+ return -1;
+ }
+ rpl_gtid *gtid_list= gtid_list_buffer(gtid_count);
+ if ((gtid_count > 0 && !gtid_list) ||
+ get_gtid_list(gtid_list, gtid_count))
+ return -1;
+ /*
+ The first key is ignored (already included in the current state), unless
+ it is the very first state in the index.
+ */
+ if (!current_state_updated)
+ update_gtid_state(&current_state, gtid_list, gtid_count);
+ current_offset= offset;
+ if (compare_state.load_nolock(&current_state))
+ {
+ give_error("Out of memory allocating GTID state");
+ return -1;
+ }
+ int cmp= (this->*search_cmp_function)(offset, &compare_state);
+ if (cmp < 0)
+ return 0; // Search position is before start of index.
+
+ /* Scan over the keys in the leaf node. */
+ for (;;)
+ {
+ uint32 offset, gtid_count;
+ int res= get_offset_count(&offset, &gtid_count);
+ if (res == 1) // EOF?
+ {
+ /* Reached end of leaf, last key is the one searched for. */
+ break;
+ }
+ gtid_list= gtid_list_buffer(gtid_count);
+ if ((gtid_count > 0 && !gtid_list) ||
+ get_gtid_list(gtid_list, gtid_count))
+ return -1;
+ if (update_gtid_state(&compare_state, gtid_list, gtid_count))
+ return -1;
+ cmp= (this->*search_cmp_function)(offset, &compare_state);
+ if (cmp < 0)
+ {
+ /* Next key is larger, so current state is the one searched for. */
+ break;
+ }
+ update_gtid_state(&current_state, gtid_list, gtid_count);
+ current_offset= offset;
+ }
+
+ *out_offset= current_offset;
+ *out_gtid_count= current_state.count_nolock();
+ /* Save the result in the shared gtid list buffer. */
+ if ((!(gtid_list= gtid_list_buffer(*out_gtid_count)) && *out_gtid_count > 0) ||
+ current_state.get_gtid_list_nolock(gtid_list, *out_gtid_count))
+ return -1;
+
+ return 1;
+}
+
+
+/*
+ Read the file header and check that it is valid and that the format version
+ is not too new for us to read.
+*/
+int
+Gtid_index_reader::read_file_header()
+{
+ if (!file_open)
+ return 1;
+
+ uchar buf[GTID_INDEX_FILE_HEADER_SIZE + GTID_INDEX_PAGE_HEADER_SIZE];
+
+ if (MY_FILEPOS_ERROR == mysql_file_seek(index_file, 0, MY_SEEK_SET, MYF(0)) ||
+ mysql_file_read(index_file, buf,
+ GTID_INDEX_FILE_HEADER_SIZE + GTID_INDEX_PAGE_HEADER_SIZE,
+ MYF(MY_NABP)))
+ return give_error("Error reading page from index file");
+ if (memcmp(&buf[0], GTID_INDEX_MAGIC, sizeof(GTID_INDEX_MAGIC)))
+ return give_error("Corrupt index file, magic not found in header");
+ version_major= buf[8];
+ version_minor= buf[9];
+ /* We cannot safely read a major version we don't know about. */
+ if (version_major > GTID_INDEX_VERSION_MAJOR)
+ return give_error("Incompatible index file, version too high");
+ page_size= uint4korr(&buf[12]);
+
+ /* Verify checksum integrity of page_size and major/minor version. */
+ uint32 crc= my_checksum(0, buf, sizeof(buf));
+ uchar *buf3= (uchar *)
+ my_malloc(key_memory_binlog_gtid_index, page_size - sizeof(buf), MYF(0));
+ if (!buf3)
+ return give_error("Error allocating memory for index page");
+ int res= 0;
+ if (mysql_file_read(index_file, buf3, page_size - sizeof(buf), MYF(MY_NABP)))
+ res= give_error("Error reading page from index file");
+ else
+ {
+ crc= my_checksum(crc, buf3, page_size - sizeof(buf) - CHECKSUM_LEN);
+ if (crc != uint4korr(buf3 + page_size - sizeof(buf) - CHECKSUM_LEN))
+ res= give_error("Corrupt page, invalid checksum");
+ }
+ my_free(buf3);
+ if (res)
+ return res;
+
+ /*
+ Check that there is a valid root node at the end of the file.
+ If there is not, the index may be a "hot index" that is currently being
+ constructed. Or it was only partially written before a server crash and not
+ recovered for some reason.
+ */
+ uchar flags= buf[GTID_INDEX_FILE_HEADER_SIZE];
+ constexpr uchar needed_flags= PAGE_FLAG_ROOT|PAGE_FLAG_LAST;
+ if ((flags & needed_flags) == needed_flags)
+ {
+ /* Special case: the index is a single page, which is the root node. */
+ has_root_node= true;
+ }
+ else
+ {
+ uchar buf2[GTID_INDEX_PAGE_HEADER_SIZE];
+ if (MY_FILEPOS_ERROR == mysql_file_seek(index_file, -(int32)page_size,
+ MY_SEEK_END, MYF(0)) ||
+ mysql_file_read(index_file, buf2, GTID_INDEX_PAGE_HEADER_SIZE,
+ MYF(MY_NABP)))
+ return give_error("Error reading root page from index file");
+ flags= buf2[0];
+ has_root_node= ((flags & needed_flags) == needed_flags);
+ /* No need to verify checksum here, will be done by read_root_node(). */
+ }
+ index_valid= true;
+ return 0;
+}
+
+
+int
+Gtid_index_reader::verify_checksum(Gtid_index_base::Node_page *page)
+{
+ uint32 calc_checksum= my_checksum(0, page->page, page_size - CHECKSUM_LEN);
+ uint32 read_checksum= uint4korr(page->page + page_size - CHECKSUM_LEN);
+ if (calc_checksum != read_checksum)
+ return give_error("Corrupt page, invalid checksum");
+ return 0;
+}
+
+
+Gtid_index_base::Node_page *
+Gtid_index_reader::alloc_and_read_page()
+{
+ Node_page *page= alloc_page();
+ if (!page)
+ {
+ give_error("Error allocating memory for index page");
+ return nullptr;
+ }
+ if (mysql_file_read(index_file, page->page, page_size, MYF(MY_NABP)))
+ {
+ my_free(page);
+ give_error("Error reading page from index file");
+ return nullptr;
+ }
+ if (verify_checksum(page))
+ {
+ my_free(page);
+ return nullptr;
+ }
+ return page;
+}
+
+
+int
+Gtid_index_reader::read_root_node()
+{
+ if (!index_valid || !has_root_node)
+ return 1;
+
+ cold_node.reset();
+ n= &cold_node;
+ /*
+ Read pages one by one from the back of the file until we have a complete
+ root node.
+ */
+ if (MY_FILEPOS_ERROR == mysql_file_seek(index_file, -(int32)page_size,
+ MY_SEEK_END, MYF(0)))
+ return give_error("Error seeking index file");
+
+ for (;;)
+ {
+ Node_page *page= alloc_and_read_page();
+ if (!page)
+ return 1;
+ if (mysql_file_tell(index_file, MYF(0)) == page_size)
+ page->flag_ptr= &page->page[GTID_INDEX_FILE_HEADER_SIZE];
+ else
+ page->flag_ptr= &page->page[0];
+ page->next= n->first_page;
+ n->first_page= page;
+ uchar flags= page->page[0];
+ if (unlikely(!(flags & PAGE_FLAG_ROOT)))
+ return give_error("Corrupt or truncated index, no root node found");
+ if (!(flags & PAGE_FLAG_IS_CONT))
+ break; // Found start of root node
+ if (MY_FILEPOS_ERROR == mysql_file_seek(index_file, -(int32)(2*page_size),
+ MY_SEEK_CUR, MYF(0)))
+ return give_error("Error seeking index file for multi-page root node");
+ }
+
+ read_page= n->first_page;
+ read_ptr= read_page->flag_ptr + GTID_INDEX_PAGE_HEADER_SIZE;
+ return 0;
+}
+
+
+int
+Gtid_index_reader::read_node(uint32 page_ptr)
+{
+ DBUG_ASSERT(page_ptr != 0 /* No zero child pointers in on-disk pages. */);
+ if (!index_valid || !page_ptr)
+ return 1;
+ return read_node_cold(page_ptr);
+}
+
+
+int
+Gtid_index_reader::read_node_cold(uint32 page_ptr)
+{
+ if (MY_FILEPOS_ERROR == mysql_file_seek(index_file, (page_ptr-1)*page_size,
+ MY_SEEK_SET, MYF(0)))
+ return give_error("Error seeking index file");
+
+ bool file_header= (page_ptr == 1);
+ cold_node.reset();
+ n= &cold_node;
+ Node_page **next_ptr_ptr= &n->first_page;
+ for (;;)
+ {
+ Node_page *page= alloc_and_read_page();
+ if (!page)
+ return 1;
+ page->flag_ptr= &page->page[file_header ? GTID_INDEX_FILE_HEADER_SIZE : 0];
+ file_header= false;
+ /* Insert the page at the end of the list. */
+ page->next= nullptr;
+ *next_ptr_ptr= page;
+ next_ptr_ptr= &page->next;
+
+ uchar flags= *(page->flag_ptr);
+ if (flags & PAGE_FLAG_LAST)
+ break;
+ }
+
+ read_page= n->first_page;
+ read_ptr= read_page->flag_ptr + GTID_INDEX_PAGE_HEADER_SIZE;
+ return 0;
+}
+
+
+int Gtid_index_reader::give_error(const char *msg)
+{
+ sql_print_information("Error reading binlog GTID index, will "
+ "fallback to slower sequential binlog scan. "
+ "Error is: %s", msg);
+ return 1;
+}
+
+
+Gtid_index_reader_hot::Gtid_index_reader_hot()
+ : hot_writer(nullptr)
+{
+}
+
+
+int
+Gtid_index_reader_hot::get_child_ptr(uint32 *out_child_ptr)
+{
+ if (find_bytes(4))
+ {
+ /*
+ When reading a hot index, EOF or a zero child pointer means the child
+ pointer has not yet been written. A zero out_child_ptr makes read_node()
+ read the hot node for the child.
+ */
+ if (hot_writer)
+ {
+ *out_child_ptr= 0;
+ return 0;
+ }
+ return give_error("Corrupt index, short index node");
+ }
+ *out_child_ptr= (uint32)uint4korr(read_ptr);
+ read_ptr+= 4;
+ return 0;
+}
+
+
+int
+Gtid_index_reader_hot::do_index_search(uint32 *out_offset,
+ uint32 *out_gtid_count)
+{
+ /* Check for a "hot" index. */
+ Gtid_index_writer::lock_gtid_index();
+ hot_writer= Gtid_index_writer::find_hot_index(index_file_name);
+ if (!hot_writer)
+ {
+ Gtid_index_writer::unlock_gtid_index();
+ /*
+ Check the index file header (and index end) again, in case it was
+ hot when open_index_file() was called, but became cold in the meantime.
+ */
+ if (!has_root_node && Gtid_index_reader::read_file_header())
+ return -1;
+ }
+
+ int res= do_index_search_root(out_offset, out_gtid_count);
+
+ if (hot_writer)
+ {
+ hot_writer= nullptr;
+ Gtid_index_writer::unlock_gtid_index();
+ }
+ return res;
+}
+
+
+int
+Gtid_index_reader_hot::read_file_header()
+{
+ if (!file_open)
+ return 1;
+
+ Gtid_index_writer::lock_gtid_index();
+ hot_writer= Gtid_index_writer::find_hot_index(index_file_name);
+ if (!hot_writer)
+ Gtid_index_writer::unlock_gtid_index();
+
+ int res;
+ if (hot_writer && hot_writer->max_level == 0)
+ {
+ /*
+ No pages from the hot index have been written to disk yet; there is just
+ a single incomplete node at level 0.
+ We have to read the file header from the in-memory page.
+ */
+ uchar *p= hot_writer->nodes[0]->first_page->page;
+ page_size= uint4korr(p + 12);
+ has_root_node= false;
+ index_valid= true;
+ res= 0;
+ }
+ else
+ res= Gtid_index_reader::read_file_header();
+
+ if (hot_writer)
+ {
+ hot_writer= nullptr;
+ Gtid_index_writer::unlock_gtid_index();
+ }
+ return res;
+}
+
+
+int
+Gtid_index_reader_hot::read_root_node()
+{
+ if (!index_valid)
+ return 1;
+
+ if (hot_writer)
+ {
+ hot_level= hot_writer->max_level;
+ return read_node_hot();
+ }
+ if (has_root_node)
+ {
+ return Gtid_index_reader::read_root_node();
+ }
+ return 1;
+}
+
+
+int
+Gtid_index_reader_hot::read_node(uint32 page_ptr)
+{
+ if (!index_valid || (!page_ptr && !hot_writer))
+ return 1;
+
+ if (hot_writer)
+ {
+ if (!page_ptr)
+ {
+ /*
+ The "hot" index is only partially written. Not yet written child pages
+ are indicated by zero child pointers. Such child pages are found from
+ the list of active nodes in the writer.
+ */
+ if (hot_level <= 0)
+ {
+ DBUG_ASSERT(0 /* Should be no child pointer to follow on leaf page. */);
+ return give_error("Corrupt hot index (child pointer on leaf page");
+ }
+ DBUG_ASSERT(n == hot_writer->nodes[hot_level]);
+ --hot_level;
+ return read_node_hot();
+ }
+
+ /*
+ We started searching the "hot" index, but now we've reached a "cold"
+ part of the index that's already fully written. So leave the "hot index"
+ mode and continue reading pages from the on-disk index from here.
+ */
+ hot_writer= nullptr;
+ Gtid_index_writer::unlock_gtid_index();
+ }
+
+ return read_node_cold(page_ptr);
+}
+
+
+int
+Gtid_index_reader_hot::read_node_hot()
+{
+ if (hot_writer->error_state)
+ return give_error("Cannot access hot index");
+ n= hot_writer->nodes[hot_level];
+ read_page= n->first_page;
+ /* The writer should allocate pages for all nodes. */
+ DBUG_ASSERT(read_page != nullptr);
+ if (!read_page)
+ return give_error("Page not available in hot index");
+ read_ptr= read_page->flag_ptr + GTID_INDEX_PAGE_HEADER_SIZE;
+ return 0;
+}
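
For review purposes, here is a deliberately simplified, self-contained sketch (not part of the patch) of the bottom-up, append-only B+-tree build that write_record() and close() above implement: records stream into the level-0 node, a full node is written out and pushed as a key into its parent, and the root therefore always comes out last. Integer keys and a fanout of 3 stand in for GTID states and page-size checks; delta-compression, spill pages and the k-1 keys of interior nodes are ignored.

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    struct Toy_node { std::vector<int> keys; };

    static std::vector<Toy_node> toy_nodes;  // toy_nodes[level] = node being built
    static int toy_next_page= 1;             // page numbers start at 1; 0 = invalid

    static void toy_flush(size_t level)
    {
      std::printf("write level-%zu node as page %d:", level, toy_next_page++);
      for (int k : toy_nodes[level].keys)
        std::printf(" %d", k);
      std::printf("\n");
    }

    static void toy_insert(int key)
    {
      for (size_t level= 0; ; ++level)
      {
        if (level >= toy_nodes.size())
          toy_nodes.push_back(Toy_node());
        if (toy_nodes[level].keys.size() < 3)
        {
          toy_nodes[level].keys.push_back(key);  // room left: just add the record
          return;
        }
        toy_flush(level);                        // node full: write it out,
        toy_nodes[level].keys.clear();           // start a new node at this level
        toy_nodes[level].keys.push_back(key);    // with the key as its first record,
        // ... and loop to push the same key into the parent level.
      }
    }

    int main()
    {
      for (int key= 1; key <= 10; ++key)
        toy_insert(key);
      for (size_t level= 0; level < toy_nodes.size(); ++level)
        toy_flush(level);                        // final flush: the root is written last
      return 0;
    }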
diff --git a/sql/gtid_index.h b/sql/gtid_index.h
new file mode 100644
index 00000000000..3219995c88d
--- /dev/null
+++ b/sql/gtid_index.h
@@ -0,0 +1,430 @@
+/*
+ Copyright (c) 2023 Kristian Nielsen <knielsen(a)knielsen-hq.org>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+*/
+
+#ifndef GTID_INDEX_H
+#define GTID_INDEX_H
+
+#include "my_global.h"
+#include "mysqld.h"
+#include "mariadb.h"
+#include "rpl_gtid.h"
+
+/*
+ This implements an on-disk index for each binlog file to speed up access to
+ the binlog at a specific offset or GTID position. This is primarily used when
+ a slave connects to the master, but also by a user calling BINLOG_GTID_POS().
+
+ A connecting slave first scans the binlog files to find the last one with an
+ initial GTID_LIST event that lies before the starting GTID position. Then a
+ sequential scan of the binlog file is done until the requested GTID position
+ is found.
+
+ The binlog index conceptually extends this using index records corresponding
+ to different offsets within one binlog file. Each record functions as if it
+ were the initial GTID_LIST event of a new binlog file, allowing the
+ sequential scan to start from the corresponding position. By having
+ sufficiently many index records, the scan will be fast.
+
+ The code has a performance-critical "sync" path which is called while holding
+ LOCK_log whenever a new GTID is added to a binlog file. And a less critical
+ "async" path which runs in the binlog background thread and does most of the
+ processing. The "sync" and "async" paths each run single threaded, but can
+ execute in parallel with each other.
+
+ The index file is written incrementally together with the binlog file.
+ However, no fsync() of the index file is needed while writing. A
+ partially written index left by a crashing server will be re-written during
+ binlog recovery. A reader is allowed to use the index while it is being
+ written (for the "hot" binlog file); such access is protected by a mutex.
+
+ In case of a lost or corrupt index, fallback to a full sequential scan is
+ done (so performance is affected, but not correctness).
+
+ The index file is structured like a B+-tree. The index is append-only, so
+ also resembles a log-structured merge-tree, but with no merging of levels
+ needed as it covers a single fixed-size binlog file. This makes the building
+ of the tree relatively simple.
+
+ Keys in the tree consist of a GTID state (corresponding to a GTID_LIST
+ event) and the associated binlog file offset. All keys (except the first key
+ in each level of the tree) are delta-compressed to save space, holding only
+ the (domain_id, server_id) pairs that differ from the previous record.
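+ For example, if the previous key holds the state {0-1-100, 1-2-7} and the
+ next GTID written to the binlog is 0-1-101, then the following key needs to
+ store only 0-1-101; the unchanged 1-2-7 is implied by the earlier keys.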
+
+ The file is page-based. The first page contains the leftmost leaf node, and
+ the root node is at the end of the file. An incompletely written index file
+ can be detected by the last page in the file not being a root node page.
+ Nodes in the B+-tree usually fit in one page, but a node can be split across
+ multiple pages if GTID states are very large.
+
+ Page format:
+
+ The first page contains an extra file header:
+
+ Offset Size Description
+ 0 8 MAGIC header identifying the file as a binlog index
+ 8 1 Major version number. A new major version of the file format
+ is not readable by older server versions.
+ 9 1 Minor version number. Formats differing only in minor version
+ are backwards compatible and can be read by older servers.
+ 10 2 Padding/unused.
+ 12 4 Page size.
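+
+ For example, with the current format (major version 1, minor version 0) and
+ the default page size of 4096, the 8 bytes following the magic are
+ 01 00 00 00 00 10 00 00 (major, minor, 2 padding bytes, then the page size
+ stored little-endian by int4store).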
+
+ Each page additionally contains this header:
+
+ Offset Size Description
+ 0 1 Flags
+ 1 3 Padding/unused
+
+ The last 4 bytes of each page are a 32-bit CRC.
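+ (The checksum covers the first page_size - 4 bytes, computed with
+ my_checksum(0, page, page_size - 4) and stored little-endian by int4store,
+ as done in Gtid_index_writer::write_current_node().)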
+
+ An interior node is a sequence of
+ <child ptr> <key> <child ptr> <key> ... <key> <child ptr>
+ while a leaf node has only keys.
+
+ A child pointer is stored as a 4-byte integer. The first page is numbered 1,
+ so that 0 can be used to denote "not present".
+
+ Format of a key:
+
+ Offset Size Description
+ 0 4 Number of GTIDs in the key, plus 1. Or 0 for EOF.
+ 4 4 Binlog file offset
+ 8 4 Domain_id of first GTID
+ 12 4 Server_id of first GTID
+ 16 8 Seq_no of first GTID
+ ... and so on for each GTID in the key.
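+
+ For example, a key at binlog offset 0x1234 holding the two GTIDs 0-1-100
+ and 1-2-7 is stored as 40 bytes (all integers little-endian, as written by
+ int4store/int8store):
+ 03 00 00 00 34 12 00 00 (GTID count 2 stored as 3, offset 0x1234)
+ 00 00 00 00 01 00 00 00 64 00 00 00 00 00 00 00 (GTID 0-1-100)
+ 01 00 00 00 02 00 00 00 07 00 00 00 00 00 00 00 (GTID 1-2-7)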
+
+ A node typically fits in one page. But if the GTID state is very big (or
+ the page size very small), multiple pages may be used. When a node is split,
+ it can be split after a child pointer or before or after a GTID, but not
+ elsewhere.
+
+ Here is an example index file in schematic form:
+
+ S0 D1 D2 D3 D4 D5 D6 D7 D8 D9 D10 D11
+ A(S0 D1 D2) B(D3 D4 D5) C(D6 D7 D8) E(D9 D10) F(D11)
+ D(A <S3> B <D4+D5+D6> C) G(E <D10+D11> F)
+ H(D <S9> G)
+
+ S0 is the full initial GTID state at the start of the file.
+ D1-D11 are the differential GTID states in the binlog file; eg. they could
+ be the individual GTIDs in the binlog file if a record is written for
+ each GTID.
+ S3 is the full GTID state corresponding to D3, ie. S3=S0+D1+D2+D3.
+ A(), B(), ..., H() are the nodes in the binlog index. H is the root.
+ A(S0 D1 D2) is a leaf node containing records S0, D1, and D2.
+ G(E <D10+D11> F) is an interior node with key <D10+D11> and child pointers to
+ E and F.
+
+ To find eg. S4, we start from the root H. S4<S9, so we follow the left child
+ pointer to D. S3<S4<S6 (S6 being the state at key D4+D5+D6), so we follow
+ the child pointer between those two keys, to leaf node B.
+
+ Here are the operations that occur while writing the example index file:
+
+ S0 A(A) R(A,S0)
+ D1 R(A,D1)
+ D2 R(A,D2)
+ D3 W(A) I(D) P(D,A) A(B) R(B,D3) R(D,S3)
+ D4 R(A,D4)
+ D5 R(A,D5)
+ D6 W(B) P(D,B) A(C) R(C,D6) R(D,D4+D5+D6)
+ D7 R(C,D7)
+ D8 R(C,D8)
+ D9 W(C) P(D,C) A(E) R(E,D9) W(D) I(H) P(H,D) R(H,S9)
+ D10 R(E,D10)
+ D11 W(E) I(G) P(G,E) A(F) R(F,S10) R(G,D10+D11)
+ <EOF> W(F) P(G,F) W(G) P(H,G) W(H)
+
+ A(x) -> allocate leaf node x.
+ R(x,k) -> insert an index record containing key k in node x.
+ W(x) -> write node x to the index file.
+ I(y) -> allocate interior node y.
+ P(y,x) -> insert a child pointer to y in x.
+*/
+
+
+class Gtid_index_base
+{
+public:
+ /* +4 for ".idx" prefix. */
+ static constexpr size_t GTID_INDEX_FILENAME_MAX_SIZE= FN_REFLEN+4;
+
+protected:
+ enum enum_page_flags {
+ /* Set for a leaf node page, cleared for an interior node page. */
+ PAGE_FLAG_IS_LEAF= 1,
+ /* This is a continuation page. */
+ PAGE_FLAG_IS_CONT= 2,
+ /* No continuation page follows (the last page in a group). */
+ PAGE_FLAG_LAST= 4,
+ /*
+ Flag set to mark the root node. (The root node is normally the last page
+ in the index file, but having an explicit flag allows us to detect a
+ partially written index file with the root node missing.)
+ */
+ PAGE_FLAG_ROOT= 8,
+ };
+
+ /*
+ Minor version increment represents a backwards-compatible format (can be
+ read by any server version that knows the format of the major version).
+ Major version increment means a server should not attempt to read from the
+ index.
+ */
+ static constexpr uchar GTID_INDEX_VERSION_MAJOR= 1;
+ static constexpr uchar GTID_INDEX_VERSION_MINOR= 0;
+ static constexpr size_t GTID_INDEX_FILE_HEADER_SIZE= 16;
+ static constexpr size_t GTID_INDEX_PAGE_HEADER_SIZE= 4;
+ static constexpr size_t CHECKSUM_LEN= 4;
+
+#ifdef _MSC_VER
+/*
+ Flexible array member is part of C99, but it is not standard in C++.
+ All the compilers and platforms we support do support it, though.
+ We just need to disable a warning on Windows about using a non-standard
+ C++ extension.
+*/
+#pragma warning(disable : 4200)
+#endif
+ struct Node_page
+ {
+ Node_page *next;
+ /* Pointer to allow to update the "flags" byte at page writeout. */
+ uchar *flag_ptr;
+ /* Flexible array member; will be allocated to opt_gtid_index_page_size. */
+ uchar page[];
+ };
+
+ struct Index_node_base
+ {
+ Node_page *first_page;
+ Node_page *current_page;
+ /* The current_ptr is only valid if current_page != 0. */
+ uchar *current_ptr;
+
+ Index_node_base();
+ ~Index_node_base();
+ void free_pages();
+ void reset();
+ };
+
+public:
+ static void make_gtid_index_file_name(char *out_name, size_t bufsize,
+ const char *base_filename);
+
+protected:
+ int update_gtid_state(rpl_binlog_state_base *state,
+ const rpl_gtid *gtid_list, uint32 gtid_count);
+ Node_page *alloc_page();
+ rpl_gtid *gtid_list_buffer(uint32 count);
+ void build_index_filename(const char *filename);
+ virtual int give_error(const char *msg) = 0;
+
+ /*
+ A buffer to hold a gtid_list temporarily.
+ Increased as needed to hold the largest list needed.
+ */
+ rpl_gtid *gtid_buffer;
+ uint32 gtid_buffer_alloc;
+ size_t page_size;
+public:
+ char index_file_name[GTID_INDEX_FILENAME_MAX_SIZE];
+
+protected:
+ Gtid_index_base();
+ virtual ~Gtid_index_base();
+};
+
+
+class Gtid_index_writer : public Gtid_index_base
+{
+private:
+ const uint32 gtid_threshold;
+ const my_off_t offset_min_threshold;
+ const my_off_t offset_max_threshold;
+
+ struct Index_node : public Index_node_base
+ {
+ rpl_binlog_state_base state;
+ uint32 num_records;
+ uint32 level;
+ bool force_spill_page;
+
+ Index_node(uint32 level_);
+ ~Index_node();
+ void reset();
+ };
+
+public:
+ static void gtid_index_init();
+ static void gtid_index_cleanup();
+protected:
+ friend class Gtid_index_reader_hot;
+ static void lock_gtid_index() { mysql_mutex_lock(&gtid_index_mutex); }
+ static void unlock_gtid_index() { mysql_mutex_unlock(&gtid_index_mutex); }
+ static const Gtid_index_writer *find_hot_index(const char *file_name);
+
+public:
+ Gtid_index_writer(const char *filename, uint32 offset,
+ rpl_binlog_state_base *binlog_state,
+ uint32 opt_page_size, uint32 opt_sparse,
+ my_off_t opt_span_min, my_off_t opt_span_max);
+ virtual ~Gtid_index_writer();
+ void process_gtid(uint32 offset, const rpl_gtid *gtid);
+ int process_gtid_check_batch(uint32 offset, const rpl_gtid *gtid,
+ rpl_gtid **out_gtid_list,
+ uint32 *out_gtid_count);
+ int async_update(uint32 event_offset, rpl_gtid *gtid_list, uint32 gtid_count);
+ void close();
+
+private:
+ void insert_in_hot_index();
+ void remove_from_hot_index();
+ uint32 write_current_node(uint32 level, bool is_root);
+ int reserve_space(Index_node *n, size_t bytes);
+ int do_write_record(uint32 level, uint32 event_offset,
+ const rpl_gtid *gtid_list, uint32 gtid_count);
+ int add_child_ptr(uint32 level, my_off_t node_offset);
+ int write_record(uint32 event_offset, const rpl_gtid *gtid_list,
+ uint32 gtid_count);
+ bool check_room(uint32 level, uint32 gtid_count);
+ int alloc_level_if_missing(uint32 level);
+ uchar *init_header(Node_page *page, bool is_leaf, bool is_first);
+ int give_error(const char *msg) override;
+
+ static mysql_mutex_t gtid_index_mutex;
+ static Gtid_index_writer *hot_index_list;
+
+ rpl_binlog_state_base pending_state;
+ /* Next pointer for the hot_index_list linked list. */
+ Gtid_index_writer *next_hot_index;
+ /* The index nodes currently being built, from the leaf (nodes[0]) to the root (nodes[max_level]). */
+ Index_node **nodes;
+ my_off_t previous_offset;
+ uint32 max_level;
+ uint32 pending_gtid_count;
+
+ File index_file;
+
+ /*
+ This is set if we encounter an error (such as out-of-memory or I/O error).
+ Then we will no longer do any updates to the index, to prevent leaving a
+ corrupt index. This is not fatal; the partial index will work up to where
+ it got the error, and the code can fall back to sequential scan of the
+ binlog.
+ */
+ bool error_state;
+ /* Flag to help put the file header at the start of the very first page. */
+ bool file_header_written;
+ /* Flag set while this object is visible in the "hot index" list. */
+ bool in_hot_index_list;
+};
+
+
+class Gtid_index_reader : public Gtid_index_base
+{
+public:
+ Gtid_index_reader();
+ virtual ~Gtid_index_reader();
+
+ int open_index_file(const char *binlog_filename);
+ void close_index_file();
+ /*
+ The search functions take either a binlog offset or GTID position to search
+ for. They return:
+ 0 for "not found" (searched position is earlier than start of index).
+ 1 for "found"
+ -1 for error.
+ When found, the returned position is the last position in the index that
+ lies at or before the searched position. The offset of the returned
+ position is written to *out_offset. The number of GTIDs in the returned
+ GTID state is written to *out_gtid_count; the list of found GTIDs can be
+ accessed with search_gtid_list() and is valid only until next search or
+ freeing of the Gtid_index_reader object.
+ */
+ int search_offset(uint32 in_offset, uint32 *out_offset,
+ uint32 *out_gtid_count);
+ int search_gtid_pos(slave_connection_state *in_gtid_pos, uint32 *out_offset,
+ uint32 *out_gtid_count);
+ rpl_gtid *search_gtid_list();
+
+protected:
+ int search_cmp_offset(uint32 offset, rpl_binlog_state_base *state);
+ int search_cmp_gtid_pos(uint32 offset, rpl_binlog_state_base *state);
+ virtual int do_index_search(uint32 *out_offset, uint32 *out_gtid_count);
+ int do_index_search_root(uint32 *out_offset, uint32 *out_gtid_count);
+ int do_index_search_leaf(bool current_state_updated,
+ uint32 *out_offset, uint32 *out_gtid_count);
+ int next_page();
+ int find_bytes(uint32 num_bytes);
+ virtual int get_child_ptr(uint32 *out_child_ptr);
+ int get_offset_count(uint32 *out_offset, uint32 *out_gtid_count);
+ int get_gtid_list(rpl_gtid *out_gtid_list, uint32 count);
+ virtual int read_file_header();
+ int verify_checksum(Node_page *page);
+ Node_page *alloc_and_read_page();
+ virtual int read_root_node();
+ virtual int read_node(uint32 page_ptr);
+ int read_node_cold(uint32 page_ptr);
+ int give_error(const char *msg) override;
+
+ rpl_binlog_state_base current_state;
+ rpl_binlog_state_base compare_state;
+ Index_node_base cold_node;
+ /* n points either to the cold node, or to a hot node in the writer. */
+ Index_node_base *n;
+ int (Gtid_index_reader::* search_cmp_function)(uint32, rpl_binlog_state_base *);
+ slave_connection_state *in_search_gtid_pos;
+ Node_page *read_page;
+ uchar *read_ptr;
+ File index_file;
+ uint32 current_offset;
+ uint32 in_search_offset;
+ bool file_open;
+ bool index_valid;
+ bool has_root_node;
+ uchar version_major;
+ uchar version_minor;
+};
+
+
+/*
+ Sub-class of Gtid_index_reader that can additionally access in-memory "hot"
+ pages of the index, which are partially filled pages of the current binlog
+ file, not yet written to disk.
+*/
+class Gtid_index_reader_hot : public Gtid_index_reader
+{
+public:
+ Gtid_index_reader_hot();
+ virtual ~Gtid_index_reader_hot() { }
+
+private:
+ int do_index_search(uint32 *out_offset, uint32 *out_gtid_count) override;
+ int get_child_ptr(uint32 *out_child_ptr) override;
+ int read_file_header() override;
+ int read_root_node() override;
+ int read_node(uint32 page_ptr) override;
+ int read_node_hot();
+
+ /* Pointer to the writer object, if we're reading a hot index. */
+ const Gtid_index_writer *hot_writer;
+ /* The level we are currently reading in the hot writer. */
+ uint32 hot_level;
+};
+
+#endif /* GTID_INDEX_H */
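
As a usage illustration (not part of the patch), this is roughly how the reader side is meant to be driven when a slave connects, using the Gtid_index_reader_hot interface above. The binlog file name and the slave_pos variable (the connecting slave's slave_connection_state) are made up for the example, start_scan_from() is a hypothetical helper standing in for the existing sequential-scan code, and error handling is elided:

    Gtid_index_reader_hot reader;
    uint32 offset, gtid_count;
    if (reader.open_index_file("master-bin.000042") == 0 &&
        reader.search_gtid_pos(&slave_pos, &offset, &gtid_count) == 1)
    {
      /* Found: seed the scan state from the index and start reading the
         binlog file from 'offset' instead of from the beginning. The list
         is only valid until the next search on this reader. */
      rpl_gtid *gtid_list= reader.search_gtid_list();
      start_scan_from(offset, gtid_list, gtid_count);
    }
    else
    {
      /* Index missing, corrupt, or position before the start of the index:
         fall back to a sequential scan of the whole binlog file. */
    }
    reader.close_index_file();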
diff --git a/sql/log.cc b/sql/log.cc
index 227ce075c19..7ce32ad835d 100644
--- a/sql/log.cc
+++ b/sql/log.cc
@@ -40,6 +40,7 @@
#include "sql_audit.h"
#include "mysqld.h"
#include "ddl_log.h"
+#include "gtid_index.h"
#include <my_dir.h>
#include <m_ctype.h> // For test_if_number
@@ -158,12 +159,44 @@ static SHOW_VAR binlog_status_vars_detail[]=
Variables for the binlog background thread.
Protected by the MYSQL_BIN_LOG::LOCK_binlog_background_thread mutex.
*/
+struct Binlog_background_job
+{
+ union
+ {
+ MYSQL_BIN_LOG::xid_count_per_binlog *notify_entry;
+ struct {
+ Gtid_index_writer *gi;
+ rpl_gtid *gtid_list;
+ uint32 gtid_count;
+ uint32 offset;
+ } gtid_index_data;
+ };
+ Binlog_background_job *next;
+ enum enum_job_type {
+ CHECKPOINT_NOTIFY,
+ GTID_INDEX_UPDATE,
+ GTID_INDEX_CLOSE,
+ SENTINEL
+ } job_type;
+};
static bool binlog_background_thread_started= false;
static bool binlog_background_thread_stop= false;
-static MYSQL_BIN_LOG::xid_count_per_binlog *
- binlog_background_thread_queue= NULL;
+static bool binlog_background_thread_sentinel= false;
+static Binlog_background_job *binlog_background_thread_queue= NULL;
+static Binlog_background_job **binlog_background_thread_endptr=
+ &binlog_background_thread_queue;
+static Binlog_background_job *binlog_background_freelist= NULL;
static bool start_binlog_background_thread();
+static int queue_binlog_background_checkpoint_notify(
+ MYSQL_BIN_LOG::xid_count_per_binlog *entry);
+static int queue_binlog_background_gtid_index_update(Gtid_index_writer *gi,
+ uint32 offset,
+ rpl_gtid *gtid_list,
+ uint32 count);
+static int queue_binlog_background_gtid_index_close(Gtid_index_writer *gi);
+static int queue_binlog_background_sentinel();
+static void binlog_background_wait_for_sentinel();
static rpl_binlog_state rpl_global_gtid_binlog_state;
@@ -3667,7 +3700,7 @@ MYSQL_BIN_LOG::MYSQL_BIN_LOG(uint *sync_period)
group_commit_queue(0), group_commit_queue_busy(FALSE),
num_commits(0), num_group_commits(0),
group_commit_trigger_count(0), group_commit_trigger_timeout(0),
- group_commit_trigger_lock_wait(0),
+ group_commit_trigger_lock_wait(0), gtid_index(nullptr),
sync_period_ptr(sync_period), sync_counter(0),
state_file_deleted(false), binlog_state_recover_done(false),
is_relay_log(0), relay_signal_cnt(0),
@@ -4113,6 +4146,28 @@ bool MYSQL_BIN_LOG::open(const char *log_name,
if (write_event(&gl_ev))
goto err;
+ /* Open an index file for this binlog file. */
+ DBUG_ASSERT(!gtid_index); /* Binlog close should clear it. */
+ if (gtid_index)
+ delete gtid_index;
+ if (opt_binlog_gtid_index)
+ {
+ my_off_t offset= my_b_tell(&log_file);
+ gtid_index=
+ new Gtid_index_writer(log_file_name, (uint32)offset,
+ &rpl_global_gtid_binlog_state,
+ (uint32)opt_binlog_gtid_index_page_size,
+ (uint32)opt_binlog_gtid_index_sparse,
+ (my_off_t)opt_binlog_gtid_index_span_min,
+ (my_off_t)opt_binlog_gtid_index_span_max);
+ if (!gtid_index)
+ sql_print_information("Could not create GTID index for binlog "
+ "file '%s'. Accesses to this binlog file will "
+ "fallback to slower sequential scan.");
+ }
+ else
+ gtid_index= nullptr;
+
/* Output a binlog checkpoint event at the start of the binlog file. */
/*
@@ -4661,12 +4716,31 @@ bool MYSQL_BIN_LOG::reset_logs(THD *thd, bool create_new_log,
no new ones will be written. So we can proceed to delete the logs.
*/
mysql_mutex_unlock(&LOCK_xid_list);
+
+ /*
+ Push a sentinel through the binlog background thread and wait for it to
+ return. When it does, we know that no more GTID index operations are
+ pending as we are holding LOCK_log.
+ (This is normally already the case as we pushed a binlog checkpoint
+ request through. But if no XID-capable engines are enabled (eg. running
+ without InnoDB), then that is a no-op).
+ */
+ queue_binlog_background_sentinel();
+ binlog_background_wait_for_sentinel();
}
/* Save variables so that we can reopen the log */
save_name=name;
name=0; // Protect against free
- close(LOG_CLOSE_TO_BE_OPENED);
+
+ /*
+ Close the active log.
+ Close the active GTID index synchronously. We don't want the close
+ running in the background while we delete the gtid index file. And we just
+ pushed a sentinel through the binlog background thread while holding
+ LOCK_log, so no other GTID index operations can be pending.
+ */
+ close(LOG_CLOSE_TO_BE_OPENED|LOG_CLOSE_SYNC_GTID_INDEX);
last_used_log_number= 0; // Reset log number cache
@@ -4691,6 +4765,28 @@ bool MYSQL_BIN_LOG::reset_logs(THD *thd, bool create_new_log,
for (;;)
{
+ /* Delete any GTID index file. */
+ char buf[Gtid_index_base::GTID_INDEX_FILENAME_MAX_SIZE];
+ Gtid_index_base::make_gtid_index_file_name(buf, sizeof(buf),
+ linfo.log_file_name);
+ if (my_delete(buf, MYF(0)))
+ {
+ /* If ENOENT, the GTID index file is already deleted or never existed. */
+ if (my_errno != ENOENT)
+ {
+ if (thd)
+ {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_CANT_DELETE_FILE, ER_THD(thd, ER_CANT_DELETE_FILE),
+ buf, my_errno);
+ }
+ sql_print_information("Failed to delete file '%s' (errno=%d)",
+ buf, my_errno);
+ }
+ my_errno= 0;
+ }
+
+ /* Delete the binlog file. */
if (unlikely((error= my_delete(linfo.log_file_name, MYF(0)))))
{
if (my_errno == ENOENT)
@@ -5201,6 +5297,7 @@ int MYSQL_BIN_LOG::purge_index_entry(THD *thd, ulonglong *reclaimed_space,
int error= 0;
LOG_INFO log_info;
LOG_INFO check_log_info;
+ char buf[Gtid_index_base::GTID_INDEX_FILENAME_MAX_SIZE];
DBUG_ASSERT(my_b_inited(&purge_index_file));
@@ -5234,6 +5331,24 @@ int MYSQL_BIN_LOG::purge_index_entry(THD *thd, ulonglong *reclaimed_space,
/* Get rid of the trailing '\n' */
log_info.log_file_name[length-1]= 0;
+ Gtid_index_base::make_gtid_index_file_name(buf, sizeof(buf),
+ log_info.log_file_name);
+ if (my_delete(buf, MYF(0)))
+ {
+ /* If ENOENT, the GTID index file is already deleted or never existed. */
+ if (my_errno != ENOENT)
+ {
+ if (thd)
+ {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_CANT_DELETE_FILE, ER_THD(thd, ER_CANT_DELETE_FILE),
+ buf, my_errno);
+ }
+ sql_print_information("Failed to delete file '%s'", buf);
+ }
+ my_errno= 0;
+ }
+
if (unlikely(!mysql_file_stat(m_key_file_log, log_info.log_file_name, &s,
MYF(0))))
{
@@ -7220,6 +7335,8 @@ bool MYSQL_BIN_LOG::write(Log_event *event_info, my_bool *with_annotate)
{
bool synced;
+ update_gtid_index((uint32)offset, thd->get_last_commit_gtid());
+
if ((error= flush_and_sync(&synced)))
{
}
@@ -7297,6 +7414,30 @@ bool MYSQL_BIN_LOG::write(Log_event *event_info, my_bool *with_annotate)
}
+void
+MYSQL_BIN_LOG::update_gtid_index(uint32 offset, rpl_gtid gtid)
+{
+ if (unlikely(!gtid_index))
+ return;
+
+ rpl_gtid *gtid_list;
+ uint32 gtid_count;
+ int err= gtid_index->process_gtid_check_batch(offset, &gtid,
+ &gtid_list, &gtid_count);
+ if (err)
+ return;
+ if (gtid_list)
+ {
+ /*
+ Perform the GTID index update in the binlog background thread,
+ as we are running under the critical LOCK_log mutex.
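+ On success, the queued job takes ownership of gtid_list, which is then
+ freed by async_update() in the background thread; on failure we must free
+ it here ourselves.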
+ */
+ if (queue_binlog_background_gtid_index_update(gtid_index, offset,
+ gtid_list, gtid_count))
+ my_free(gtid_list);
+ }
+}
+
int error_log_print(enum loglevel level, const char *format,
va_list args)
{
@@ -8921,6 +9062,8 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader)
strmake_buf(cache_mngr->last_commit_pos_file, log_file_name);
commit_offset= my_b_write_tell(&log_file);
+ update_gtid_index((uint32)commit_offset,
+ current->thd->get_last_commit_gtid());
cache_mngr->last_commit_pos_offset= commit_offset;
if ((cache_mngr->using_xa && cache_mngr->xa_xid) || current->need_unlog)
{
@@ -9495,6 +9638,33 @@ void MYSQL_BIN_LOG::close(uint exiting)
}
#endif /* HAVE_REPLICATION */
+ if (!is_relay_log && likely(gtid_index))
+ {
+ if (exiting & (LOG_CLOSE_STOP_EVENT|LOG_CLOSE_SYNC_GTID_INDEX))
+ {
+ /*
+ The binlog background thread is already stopped, so just close the final
+ GTID index synchronously. Or the caller explicitly requested synchronous
+ close of the GTID index.
+ */
+ gtid_index->close();
+ delete gtid_index;
+ }
+ else
+ {
+ /*
+ Queue a close on the current GTID index.
+ Important that this is queued _before_ the checkpoint request is sent
+ (and thus before checkpoint notifications can be queued); this way, if
+ we crash before the GTID index is synced to disk, the checkpoint will
+ still be pending and the binlog file will be scanned during crash
+ recovery and the GTID index recovered.
+ */
+ queue_binlog_background_gtid_index_close(gtid_index);
+ }
+ gtid_index= nullptr;
+ }
+
/* don't pwrite in a file opened with O_APPEND - it doesn't work */
if (log_file.type == WRITE_CACHE && !(exiting & LOG_CLOSE_DELAYED_CLOSE))
{
@@ -11088,22 +11258,7 @@ void
TC_LOG_BINLOG::commit_checkpoint_notify(void *cookie)
{
xid_count_per_binlog *entry= static_cast<xid_count_per_binlog *>(cookie);
- bool found_entry= false;
- mysql_mutex_lock(&LOCK_binlog_background_thread);
- /* count the same notification kind from different engines */
- for (xid_count_per_binlog *link= binlog_background_thread_queue;
- link && !found_entry; link= link->next_in_queue)
- {
- if ((found_entry= (entry == link)))
- entry->notify_count++;
- }
- if (!found_entry)
- {
- entry->next_in_queue= binlog_background_thread_queue;
- binlog_background_thread_queue= entry;
- }
- mysql_cond_signal(&COND_binlog_background_thread);
- mysql_mutex_unlock(&LOCK_binlog_background_thread);
+ queue_binlog_background_checkpoint_notify(entry);
}
/*
@@ -11122,7 +11277,9 @@ pthread_handler_t
binlog_background_thread(void *arg __attribute__((unused)))
{
bool stop;
- MYSQL_BIN_LOG::xid_count_per_binlog *queue, *next;
+ Binlog_background_job *queue, *next;
+ Binlog_background_job *freelist= nullptr;
+ Binlog_background_job **freelist_endptr= &freelist;
THD *thd;
my_thread_init();
DBUG_ENTER("binlog_background_thread");
@@ -11166,6 +11323,18 @@ binlog_background_thread(void *arg __attribute__((unused)))
*/
THD_STAGE_INFO(thd, stage_binlog_waiting_background_tasks);
mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread);
+
+ /*
+ Put back our job objects in the freelist, now that we own the mutex again.
+ */
+ if (freelist)
+ {
+ *freelist_endptr= binlog_background_freelist;
+ binlog_background_freelist= freelist;
+ freelist= nullptr;
+ freelist_endptr= &freelist;
+ }
+
for (;;)
{
stop= binlog_background_thread_stop;
@@ -11184,6 +11353,7 @@ binlog_background_thread(void *arg __attribute__((unused)))
}
/* Grab the queue, if any. */
binlog_background_thread_queue= NULL;
+ binlog_background_thread_endptr= &binlog_background_thread_queue;
mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread);
/* Process any incoming commit_checkpoint_notify() calls. */
@@ -11199,17 +11369,40 @@ binlog_background_thread(void *arg __attribute__((unused)))
#endif
while (queue)
{
- long count= queue->notify_count;
- THD_STAGE_INFO(thd, stage_binlog_processing_checkpoint_notify);
- DEBUG_SYNC(thd, "binlog_background_thread_before_mark_xid_done");
- /* Set the thread start time */
- thd->set_time();
- /* Grab next pointer first, as mark_xid_done() may free the element. */
- next= queue->next_in_queue;
- queue->notify_count= 0;
- for (long i= 0; i <= count; i++)
- mysql_bin_log.mark_xid_done(queue->binlog_id, true);
- queue= next;
+ switch (queue->job_type)
+ {
+ case Binlog_background_job::CHECKPOINT_NOTIFY:
+ THD_STAGE_INFO(thd, stage_binlog_processing_checkpoint_notify);
+ DEBUG_SYNC(thd, "binlog_background_thread_before_mark_xid_done");
+ /* Set the thread start time */
+ thd->set_time();
+ mysql_bin_log.mark_xid_done(queue->notify_entry->binlog_id, true);
+ break;
+
+ case Binlog_background_job::GTID_INDEX_UPDATE:
+ queue->gtid_index_data.gi->
+ async_update(queue->gtid_index_data.offset,
+ queue->gtid_index_data.gtid_list,
+ queue->gtid_index_data.gtid_count);
+ break;
+
+ case Binlog_background_job::GTID_INDEX_CLOSE:
+ queue->gtid_index_data.gi->close();
+ delete queue->gtid_index_data.gi;
+ break;
+
+ case Binlog_background_job::SENTINEL:
+ /*
+ The sentinel is a way to signal to reset_logs() that all pending
+ background jobs prior to the sentinel have been processed.
+ */
+ mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread);
+ DBUG_ASSERT(binlog_background_thread_sentinel);
+ binlog_background_thread_sentinel= false;
+ mysql_cond_signal(&mysql_bin_log.COND_binlog_background_thread_end);
+ mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread);
+ break;
+ }
#ifdef ENABLED_DEBUG_SYNC
DBUG_EXECUTE_IF("binlog_background_checkpoint_processed",
@@ -11218,6 +11411,12 @@ binlog_background_thread(void *arg __attribute__((unused)))
STRING_WITH_LEN("now SIGNAL binlog_background_checkpoint_processed")));
);
#endif
+
+ next= queue->next;
+ queue->next= nullptr;
+ *freelist_endptr= queue;
+ freelist_endptr= &queue->next;
+ queue= next;
}
if (stop)
@@ -11226,6 +11425,13 @@ binlog_background_thread(void *arg __attribute__((unused)))
THD_STAGE_INFO(thd, stage_binlog_stopping_background_thread);
+ while (freelist)
+ {
+ next= freelist->next;
+ my_free(freelist);
+ freelist= next;
+ }
+
/* No need to use mutex as thd is not linked into other threads */
THD_count::count++;
delete thd;
@@ -11234,6 +11440,12 @@ binlog_background_thread(void *arg __attribute__((unused)))
/* Signal that we are (almost) stopped. */
mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread);
+ while (binlog_background_freelist)
+ {
+ next= binlog_background_freelist->next;
+ my_free(binlog_background_freelist);
+ binlog_background_freelist= next;
+ }
binlog_background_thread_stop= false;
mysql_cond_signal(&mysql_bin_log.COND_binlog_background_thread_end);
mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread);
@@ -11277,6 +11489,139 @@ start_binlog_background_thread()
return 0;
}
+
+
+static Binlog_background_job *
+get_binlog_background_job()
+{
+ Binlog_background_job *job;
+ mysql_mutex_assert_owner(&mysql_bin_log.LOCK_binlog_background_thread);
+
+ if ((job= binlog_background_freelist) != nullptr)
+ binlog_background_freelist= job->next;
+ else
+ job= (Binlog_background_job *)my_malloc(PSI_INSTRUMENT_ME, sizeof(*job),
+ MYF(MY_WME));
+
+ return job;
+}
+
+
+static void
+queue_binlog_background_job(Binlog_background_job *job)
+{
+ mysql_mutex_assert_owner(&mysql_bin_log.LOCK_binlog_background_thread);
+
+ job->next= nullptr;
+ *binlog_background_thread_endptr= job;
+ binlog_background_thread_endptr= &job->next;
+ mysql_cond_signal(&mysql_bin_log.COND_binlog_background_thread);
+}
+
+
+static int
+queue_binlog_background_checkpoint_notify(
+ MYSQL_BIN_LOG::xid_count_per_binlog *entry)
+{
+ int res;
+
+ mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread);
+ Binlog_background_job *job= get_binlog_background_job();
+ if (!job)
+ res= 1;
+ else
+ {
+ job->job_type= Binlog_background_job::CHECKPOINT_NOTIFY;
+ job->notify_entry= entry;
+ queue_binlog_background_job(job);
+ res= 0;
+ }
+ mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread);
+ return res;
+}
+
+
+static int
+queue_binlog_background_gtid_index_update(Gtid_index_writer *gi, uint32 offset,
+ rpl_gtid *gtid_list, uint32 count)
+{
+ int res;
+
+ mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread);
+ Binlog_background_job *job= get_binlog_background_job();
+ if (unlikely(!job))
+ res= 1;
+ else
+ {
+ job->job_type= Binlog_background_job::GTID_INDEX_UPDATE;
+ job->gtid_index_data.gi= gi;
+ job->gtid_index_data.gtid_list= gtid_list;
+ job->gtid_index_data.gtid_count= count;
+ job->gtid_index_data.offset= offset;
+ queue_binlog_background_job(job);
+ res= 0;
+ }
+ mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread);
+
+ return res;
+}
+
+
+static int
+queue_binlog_background_gtid_index_close(Gtid_index_writer *gi)
+{
+ int res;
+
+ mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread);
+ Binlog_background_job *job= get_binlog_background_job();
+  if (!job)
+    res= 1;
+ else
+ {
+ job->job_type= Binlog_background_job::GTID_INDEX_CLOSE;
+ job->gtid_index_data.gi= gi;
+ queue_binlog_background_job(job);
+ res= 0;
+ }
+ mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread);
+
+ return res;
+}
+
+
+static int
+queue_binlog_background_sentinel()
+{
+ int res;
+
+ mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread);
+ DBUG_ASSERT(!binlog_background_thread_sentinel);
+ Binlog_background_job *job= get_binlog_background_job();
+  if (!job)
+    res= 1;
+ else
+ {
+ binlog_background_thread_sentinel= true;
+ job->job_type= Binlog_background_job::SENTINEL;
+ queue_binlog_background_job(job);
+ res= 0;
+ }
+ mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread);
+
+ return res;
+}
+
+static void
+binlog_background_wait_for_sentinel()
+{
+ mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread);
+  while (binlog_background_thread_sentinel)
+ mysql_cond_wait(&mysql_bin_log.COND_binlog_background_thread_end,
+ &mysql_bin_log.LOCK_binlog_background_thread);
+ mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread);
+}
+
+
#ifdef HAVE_REPLICATION
class Recovery_context
{
@@ -11543,7 +11888,7 @@ bool Recovery_context::reset_truncate_coord(my_off_t pos)
for (uint i= 0; i < gtid_maybe_to_truncate->elements(); i++)
{
rpl_gtid gtid= gtid_maybe_to_truncate->at(i);
-    if (rpl_global_gtid_binlog_state.update_nolock(&gtid, false))
+    if (rpl_global_gtid_binlog_state.update_nolock(&gtid))
return true;
}
gtid_maybe_to_truncate->clear();
@@ -11808,11 +12153,13 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name,
Format_description_log_event *fdle, bool do_xa)
{
Log_event *ev= NULL;
+ Gtid_index_writer *gtid_index_recover= NULL;
HASH xids, ddl_log_ids;
MEM_ROOT mem_root;
char binlog_checkpoint_name[FN_REFLEN];
bool binlog_checkpoint_found;
IO_CACHE log;
+ IO_CACHE *cur_log;
File file= -1;
const char *errmsg;
#ifdef HAVE_REPLICATION
@@ -11859,12 +12206,16 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name,
*/
binlog_checkpoint_found= false;
+ cur_log= first_log;
for (round= 1;;)
{
- while ((ev= Log_event::read_log_event(round == 1 ? first_log : &log,
- fdle, opt_master_verify_checksum))
+ while ((ev= Log_event::read_log_event(cur_log, fdle,
+ opt_master_verify_checksum))
&& ev->is_valid())
{
+#ifdef HAVE_REPLICATION
+ my_off_t end_pos= my_b_tell(cur_log);
+#endif
enum Log_event_type typ= ev->get_type_code();
switch (typ)
{
@@ -11939,6 +12290,8 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name,
/* Initialise the binlog state from the Gtid_list event. */
if (rpl_global_gtid_binlog_state.load(glev->list, glev->count))
goto err2;
+ if (opt_binlog_gtid_index)
+ gtid_index_recover= recover_gtid_index_start(last_log_name, end_pos);
}
break;
@@ -11978,7 +12331,9 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name,
(((Query_log_event *)ev)->is_commit() ||
((Query_log_event *)ev)->is_rollback()))));
- if (rpl_global_gtid_binlog_state.update_nolock(&ctx.last_gtid, false))
+ recover_gtid_index_process(gtid_index_recover, end_pos,
+ (Gtid_log_event *)ev);
+ if (rpl_global_gtid_binlog_state.update_nolock(&ctx.last_gtid))
goto err2;
ctx.last_gtid_valid= false;
}
@@ -11987,6 +12342,9 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name,
delete ev;
ev= NULL;
} // end of while
+ recover_gtid_index_end(gtid_index_recover);
+ gtid_index_recover= NULL;
+ cur_log= &log;
/*
If the last binlog checkpoint event points to an older log, we have to
@@ -12071,6 +12429,7 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name,
err2:
delete ev;
+ recover_gtid_index_abort(gtid_index_recover);
if (file >= 0)
{
end_io_cache(&log);
@@ -12089,6 +12448,109 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name,
}
+/*
+ Start recovery of the GTID index for a binlog file.
+ The old index is deleted and a new index is rebuilt while scanning the
+ binlog file during binlog recovery.
+  Errors are not fatal, as the code can fall back to a slower full binlog
+  file scan when no GTID index is available.
+
+ @param base_name File name of the binlog file.
+ @param offset End log pos of the GTID_LIST log event of the binlog file.
+
+ @return Gtid_index_writer object or NULL.
+*/
+Gtid_index_writer *
+MYSQL_BIN_LOG::recover_gtid_index_start(const char *base_name, my_off_t offset)
+{
+ char buf[Gtid_index_base::GTID_INDEX_FILENAME_MAX_SIZE];
+
+ Gtid_index_base::make_gtid_index_file_name(buf, sizeof(buf), base_name);
+ if (my_delete(buf, MYF(0)))
+ {
+ /* If ENOENT, the GTID index file is already deleted or never existed. */
+ if (my_errno != ENOENT)
+ {
+ sql_print_information("Failed to delete file '%s' (errno=%d)", buf, my_errno);
+ }
+ my_errno= 0;
+ }
+ Gtid_index_writer *gi=
+ new Gtid_index_writer(base_name, (uint32)offset,
+ &rpl_global_gtid_binlog_state,
+ (uint32)opt_binlog_gtid_index_page_size,
+ (uint32)opt_binlog_gtid_index_sparse,
+ (my_off_t)opt_binlog_gtid_index_span_min,
+ (my_off_t)opt_binlog_gtid_index_span_max);
+ return gi;
+}
+
+
+/*
+ Process one GTID during GTID index recovery.
+
+ @param gi Gtid_index_writer object or NULL.
+ @param offset End log pos of the GTID event.
+ @param gev GTID log event to process.
+
+ @return nothing
+*/
+void
+MYSQL_BIN_LOG::recover_gtid_index_process(Gtid_index_writer *gi,
+ my_off_t offset, Gtid_log_event *gev)
+{
+ if (gi)
+ {
+ rpl_gtid gtid;
+ gtid.domain_id= gev->domain_id;
+ gtid.server_id= gev->server_id;
+ gtid.seq_no= gev->seq_no;
+    gi->process_gtid((uint32)offset, &gtid);
+ }
+}
+
+
+/*
+ Complete the recovery of one GTID index, syncing and closing it.
+
+ @param gi Gtid_index_writer object or NULL.
+
+ @return nothing
+*/
+void
+MYSQL_BIN_LOG::recover_gtid_index_end(Gtid_index_writer *gi)
+{
+ if (gi)
+ {
+ gi->close();
+ delete gi;
+ }
+}
+
+
+/*
+ Abort the recovery of one GTID index, deleting any partially recovered index.
+
+ @param gi Gtid_index_writer object or NULL.
+
+ @return nothing
+*/
+void
+MYSQL_BIN_LOG::recover_gtid_index_abort(Gtid_index_writer *gi)
+{
+ if (gi)
+ {
+ char buf[Gtid_index_base::GTID_INDEX_FILENAME_MAX_SIZE];
+ strmake(buf, gi->index_file_name, sizeof(buf)-1);
+ /*
+ Delete first the Gtid_index_writer object and then the partial index
+ (the writer still has the index file open and active until destructed).
+ */
+ delete(gi);
+ my_delete(buf, MYF(0));
+ }
+}
+
int
MYSQL_BIN_LOG::do_binlog_recovery(const char *opt_name, bool do_xa_recovery)
diff --git a/sql/log.h b/sql/log.h
index f79305c174a..e5bafa186bc 100644
--- a/sql/log.h
+++ b/sql/log.h
@@ -21,8 +21,10 @@
#include "rpl_constants.h"
class Relay_log_info;
+class Gtid_index_writer;
class Format_description_log_event;
+class Gtid_log_event;
bool reopen_fstreams(const char *filename, FILE *outstream, FILE *errstream);
void setup_log_handling();
@@ -240,6 +242,7 @@ extern TC_LOG_DUMMY tc_log_dummy;
#define LOG_CLOSE_TO_BE_OPENED 2
#define LOG_CLOSE_STOP_EVENT 4
#define LOG_CLOSE_DELAYED_CLOSE 8
+#define LOG_CLOSE_SYNC_GTID_INDEX 16
/*
Maximum unique log filename extension.
@@ -708,6 +711,9 @@ class MYSQL_BIN_LOG: public TC_LOG, private Event_log
ulonglong group_commit_trigger_count, group_commit_trigger_timeout;
ulonglong group_commit_trigger_lock_wait;
+ /* Binlog GTID index. */
+ Gtid_index_writer *gtid_index;
+
/* pointer to the sync period variable, for binlog this will be
sync_binlog_period, for relay log this will be
sync_relay_log_period
@@ -717,6 +723,13 @@ class MYSQL_BIN_LOG: public TC_LOG, private Event_log
bool state_file_deleted;
bool binlog_state_recover_done;
+ Gtid_index_writer *recover_gtid_index_start(const char *base_name,
+ my_off_t offset);
+ void recover_gtid_index_process(Gtid_index_writer *gi, my_off_t offset,
+ Gtid_log_event *gev);
+ void recover_gtid_index_end(Gtid_index_writer *gi);
+ void recover_gtid_index_abort(Gtid_index_writer *gi);
+
inline uint get_sync_period()
{
return *sync_period_ptr;
@@ -736,6 +749,8 @@ class MYSQL_BIN_LOG: public TC_LOG, private Event_log
bool write_transaction_to_binlog_events(group_commit_entry *entry);
void trx_group_commit_leader(group_commit_entry *leader);
bool is_xidlist_idle_nolock();
+ void update_gtid_index(uint32 offset, rpl_gtid gtid);
+
public:
int new_file_without_locking();
/*
@@ -756,11 +771,8 @@ class MYSQL_BIN_LOG: public TC_LOG, private Event_log
ulong binlog_id;
/* Total prepared XIDs and pending checkpoint requests in this binlog. */
long xid_count;
- long notify_count;
- /* For linking in requests to the binlog background thread. */
- xid_count_per_binlog *next_in_queue;
xid_count_per_binlog(char *log_file_name, uint log_file_name_len)
- :binlog_id(0), xid_count(0), notify_count(0)
+ :binlog_id(0), xid_count(0)
{
binlog_name_len= log_file_name_len;
binlog_name= (char *) my_malloc(PSI_INSTRUMENT_ME, binlog_name_len, MYF(MY_ZEROFILL));
diff --git a/sql/mysqld.cc b/sql/mysqld.cc
index 23e327c6802..b9652182969 100644
--- a/sql/mysqld.cc
+++ b/sql/mysqld.cc
@@ -83,6 +83,7 @@
#include "wsrep_server_state.h"
#endif /* WITH_WSREP */
#include "proxy_protocol.h"
+#include "gtid_index.h"
#include "sql_callback.h"
#include "threadpool.h"
@@ -441,6 +442,11 @@ my_bool sp_automatic_privileges= 1;
ulong opt_binlog_rows_event_max_size;
ulong binlog_row_metadata;
+my_bool opt_binlog_gtid_index= TRUE;
+ulong opt_binlog_gtid_index_page_size= 4096;
+ulong opt_binlog_gtid_index_sparse= 10;
+ulong opt_binlog_gtid_index_span_min= 4096;
+ulong opt_binlog_gtid_index_span_max= 65536;
my_bool opt_master_verify_checksum= 0;
my_bool opt_slave_sql_verify_checksum= 1;
const char *binlog_format_names[]= {"MIXED", "STATEMENT", "ROW", NullS};
@@ -489,6 +495,7 @@ ulong malloc_calls;
ulong specialflag=0;
ulong binlog_cache_use= 0, binlog_cache_disk_use= 0;
ulong binlog_stmt_cache_use= 0, binlog_stmt_cache_disk_use= 0;
+ulong binlog_gtid_index_hit= 0, binlog_gtid_index_miss= 0;
ulong max_connections, max_connect_errors;
uint max_password_errors;
ulong extra_max_connections;
@@ -893,7 +900,7 @@ PSI_file_key key_file_binlog, key_file_binlog_cache, key_file_binlog_index,
PSI_file_key key_file_query_log, key_file_slow_log;
PSI_file_key key_file_relaylog, key_file_relaylog_index,
key_file_relaylog_cache, key_file_relaylog_index_cache;
-PSI_file_key key_file_binlog_state;
+PSI_file_key key_file_binlog_state, key_file_gtid_index;
#ifdef HAVE_PSI_INTERFACE
#ifdef HAVE_MMAP
@@ -918,6 +925,7 @@ PSI_mutex_key key_BINLOG_LOCK_index, key_BINLOG_LOCK_xid_list,
key_LOCK_status, key_LOCK_temp_pool,
key_LOCK_system_variables_hash, key_LOCK_thd_data, key_LOCK_thd_kill,
key_LOCK_user_conn, key_LOCK_uuid_short_generator, key_LOG_LOCK_log,
+ key_gtid_index_lock,
key_master_info_data_lock, key_master_info_run_lock,
key_master_info_sleep_lock, key_master_info_start_stop_lock,
key_master_info_start_alter_lock,
@@ -1004,6 +1012,7 @@ static PSI_mutex_info all_server_mutexes[]=
{ &key_LOCK_user_conn, "LOCK_user_conn", PSI_FLAG_GLOBAL},
{ &key_LOCK_uuid_short_generator, "LOCK_uuid_short_generator", PSI_FLAG_GLOBAL},
{ &key_LOG_LOCK_log, "LOG::LOCK_log", 0},
+ { &key_gtid_index_lock, "Gtid_index_writer::gtid_index_mutex", 0},
{ &key_master_info_data_lock, "Master_info::data_lock", 0},
{ &key_master_info_start_stop_lock, "Master_info::start_stop_lock", 0},
{ &key_master_info_run_lock, "Master_info::run_lock", 0},
@@ -1992,6 +2001,7 @@ static void clean_up(bool print_message)
injector::free_instance();
mysql_bin_log.cleanup();
+ Gtid_index_writer::gtid_index_cleanup();
my_tz_free();
my_dboptions_cache_free();
@@ -4006,6 +4016,7 @@ static int init_common_variables()
inited before MY_INIT(). So we do it here.
*/
mysql_bin_log.init_pthread_objects();
+ Gtid_index_writer::gtid_index_init();
/* TODO: remove this when my_time_t is 64 bit compatible */
if (!IS_TIME_T_VALID_FOR_TIMESTAMP(server_start_time))
@@ -7429,6 +7440,8 @@ SHOW_VAR status_vars[]= {
{"Binlog_bytes_written", (char*) offsetof(STATUS_VAR, binlog_bytes_written), SHOW_LONGLONG_STATUS},
{"Binlog_cache_disk_use", (char*) &binlog_cache_disk_use, SHOW_LONG},
{"Binlog_cache_use", (char*) &binlog_cache_use, SHOW_LONG},
+ {"Binlog_gtid_index_hit", (char*) &binlog_gtid_index_hit, SHOW_LONG},
+ {"Binlog_gtid_index_miss", (char*) &binlog_gtid_index_miss, SHOW_LONG},
{"Binlog_stmt_cache_disk_use",(char*) &binlog_stmt_cache_disk_use, SHOW_LONG},
{"Binlog_stmt_cache_use", (char*) &binlog_stmt_cache_use, SHOW_LONG},
{"Busy_time", (char*) offsetof(STATUS_VAR, busy_time), SHOW_DOUBLE_STATUS},
@@ -7854,6 +7867,7 @@ static int mysql_init_variables(void)
delayed_insert_errors= thread_created= 0;
specialflag= 0;
binlog_cache_use= binlog_cache_disk_use= 0;
+ binlog_gtid_index_hit= binlog_gtid_index_miss= 0;
max_used_connections= slow_launch_threads = 0;
max_used_connections_time= 0;
mysqld_user= mysqld_chroot= opt_init_file= opt_bin_logname = 0;
@@ -9249,7 +9263,8 @@ static PSI_file_info all_server_files[]=
{ &key_file_trg, "trigger_name", 0},
{ &key_file_trn, "trigger", 0},
{ &key_file_init, "init", 0},
- { &key_file_binlog_state, "binlog_state", 0}
+ { &key_file_binlog_state, "binlog_state", 0},
+ { &key_file_gtid_index, "gtid_index", 0}
};
#endif /* HAVE_PSI_INTERFACE */
@@ -9443,6 +9458,7 @@ PSI_memory_key key_memory_acl_cache;
PSI_memory_key key_memory_acl_mem;
PSI_memory_key key_memory_acl_memex;
PSI_memory_key key_memory_binlog_cache_mngr;
+PSI_memory_key key_memory_binlog_gtid_index;
PSI_memory_key key_memory_binlog_pos;
PSI_memory_key key_memory_binlog_recover_exec;
PSI_memory_key key_memory_binlog_statement_buffer;
@@ -9682,6 +9698,7 @@ static PSI_memory_info all_server_memory[]=
// { &key_memory_Slave_job_group_group_relay_log_name, "Slave_job_group::group_relay_log_name", 0},
{ &key_memory_Relay_log_info_group_relay_log_name, "Relay_log_info::group_relay_log_name", 0},
{ &key_memory_binlog_cache_mngr, "binlog_cache_mngr", 0},
+ { &key_memory_binlog_gtid_index, "binlog_gtid_index", 0},
{ &key_memory_Row_data_memory_memory, "Row_data_memory::memory", 0},
// { &key_memory_Gtid_set_to_string, "Gtid_set::to_string", 0},
// { &key_memory_Gtid_state_to_string, "Gtid_state::to_string", 0},
diff --git a/sql/mysqld.h b/sql/mysqld.h
index 8a248ae3554..a9ad03c2438 100644
--- a/sql/mysqld.h
+++ b/sql/mysqld.h
@@ -216,6 +216,7 @@ extern ulonglong thd_startup_options;
extern my_thread_id global_thread_id;
extern ulong binlog_cache_use, binlog_cache_disk_use;
extern ulong binlog_stmt_cache_use, binlog_stmt_cache_disk_use;
+extern ulong binlog_gtid_index_hit, binlog_gtid_index_miss;
extern ulong aborted_threads, aborted_connects, aborted_connects_preauth;
extern ulong delayed_insert_timeout;
extern ulong delayed_insert_limit, delayed_queue_size;
@@ -248,6 +249,11 @@ extern ulonglong slave_max_statement_time;
extern double slave_max_statement_time_double;
extern ulong opt_binlog_rows_event_max_size;
extern ulong binlog_row_metadata;
+extern my_bool opt_binlog_gtid_index;
+extern ulong opt_binlog_gtid_index_page_size;
+extern ulong opt_binlog_gtid_index_sparse;
+extern ulong opt_binlog_gtid_index_span_min;
+extern ulong opt_binlog_gtid_index_span_max;
extern ulong thread_cache_size;
extern ulong stored_program_cache_size;
extern ulong opt_slave_parallel_threads;
@@ -332,7 +338,7 @@ extern PSI_mutex_key key_BINLOG_LOCK_index, key_BINLOG_LOCK_xid_list,
key_LOCK_rpl_status, key_LOCK_server_started,
key_LOCK_status, key_LOCK_optimizer_costs,
key_LOCK_thd_data, key_LOCK_thd_kill,
- key_LOCK_user_conn, key_LOG_LOCK_log,
+ key_LOCK_user_conn, key_LOG_LOCK_log, key_gtid_index_lock,
key_master_info_data_lock, key_master_info_run_lock,
key_master_info_sleep_lock, key_master_info_start_stop_lock,
key_master_info_start_alter_lock,
@@ -410,7 +416,7 @@ extern PSI_file_key key_file_relaylog, key_file_relaylog_index,
key_file_relaylog_cache, key_file_relaylog_index_cache;
extern PSI_socket_key key_socket_tcpip, key_socket_unix,
key_socket_client_connection;
-extern PSI_file_key key_file_binlog_state;
+extern PSI_file_key key_file_binlog_state, key_file_gtid_index;
#ifdef HAVE_PSI_INTERFACE
void init_server_psi_keys();
@@ -455,6 +461,7 @@ extern PSI_memory_key key_memory_user_var_entry_value;
extern PSI_memory_key key_memory_Slave_job_group_group_relay_log_name;
extern PSI_memory_key key_memory_Relay_log_info_group_relay_log_name;
extern PSI_memory_key key_memory_binlog_cache_mngr;
+extern PSI_memory_key key_memory_binlog_gtid_index;
extern PSI_memory_key key_memory_Row_data_memory_memory;
extern PSI_memory_key key_memory_errmsgs;
extern PSI_memory_key key_memory_Event_queue_element_for_exec_names;
diff --git a/sql/privilege.h b/sql/privilege.h
index d32c28b9e94..797c0280fef 100644
--- a/sql/privilege.h
+++ b/sql/privilege.h
@@ -368,6 +368,21 @@ constexpr privilege_t PRIV_SET_SYSTEM_GLOBAL_VAR_BINLOG_COMMIT_WAIT_USEC=
constexpr privilege_t PRIV_SET_SYSTEM_GLOBAL_VAR_BINLOG_ROW_METADATA=
BINLOG_ADMIN_ACL;
+constexpr privilege_t PRIV_SET_SYSTEM_GLOBAL_VAR_BINLOG_GTID_INDEX=
+ BINLOG_ADMIN_ACL;
+
+constexpr privilege_t PRIV_SET_SYSTEM_GLOBAL_VAR_BINLOG_GTID_INDEX_PAGE_SIZE=
+ BINLOG_ADMIN_ACL;
+
+constexpr privilege_t PRIV_SET_SYSTEM_GLOBAL_VAR_BINLOG_GTID_INDEX_SPARSE=
+ BINLOG_ADMIN_ACL;
+
+constexpr privilege_t PRIV_SET_SYSTEM_GLOBAL_VAR_BINLOG_GTID_INDEX_SPAN_MIN=
+ BINLOG_ADMIN_ACL;
+
+constexpr privilege_t PRIV_SET_SYSTEM_GLOBAL_VAR_BINLOG_GTID_INDEX_SPAN_MAX=
+ BINLOG_ADMIN_ACL;
+
constexpr privilege_t PRIV_SET_SYSTEM_GLOBAL_VAR_EXPIRE_LOGS_DAYS=
BINLOG_ADMIN_ACL;
diff --git a/sql/rpl_gtid.cc b/sql/rpl_gtid.cc
index 1162905925a..1557eac092d 100644
--- a/sql/rpl_gtid.cc
+++ b/sql/rpl_gtid.cc
@@ -1541,19 +1541,18 @@ rpl_slave_state::alloc_gtid_pos_table(LEX_CSTRING *table_name, void *hton,
}
-void rpl_binlog_state::init()
+void
+rpl_binlog_state_base::init()
{
my_hash_init(PSI_INSTRUMENT_ME, &hash, &my_charset_bin, 32,
offsetof(element, domain_id), sizeof(element::domain_id),
NULL, my_free, HASH_UNIQUE);
-  my_init_dynamic_array(PSI_INSTRUMENT_ME, &gtid_sort_array, sizeof(rpl_gtid), 8, 8, MYF(0));
- mysql_mutex_init(key_LOCK_binlog_state, &LOCK_binlog_state,
- MY_MUTEX_INIT_SLOW);
initialized= 1;
}
+
void
-rpl_binlog_state::reset_nolock()
+rpl_binlog_state_base::reset_nolock()
{
uint32 i;
@@ -1564,72 +1563,67 @@ rpl_binlog_state::reset_nolock()
void
-rpl_binlog_state::reset()
-{
- mysql_mutex_lock(&LOCK_binlog_state);
- reset_nolock();
- mysql_mutex_unlock(&LOCK_binlog_state);
-}
-
-
-void rpl_binlog_state::free()
+rpl_binlog_state_base::free()
{
if (initialized)
{
initialized= 0;
reset_nolock();
my_hash_free(&hash);
-    delete_dynamic(&gtid_sort_array);
- mysql_mutex_destroy(&LOCK_binlog_state);
}
}
+rpl_binlog_state_base::~rpl_binlog_state_base()
+{
+ free();
+}
+
+
bool
-rpl_binlog_state::load(struct rpl_gtid *list, uint32 count)
+rpl_binlog_state_base::load_nolock(struct rpl_gtid *list, uint32 count)
{
uint32 i;
bool res= false;
- mysql_mutex_lock(&LOCK_binlog_state);
reset_nolock();
for (i= 0; i < count; ++i)
{
- if (update_nolock(&(list[i]), false))
+ if (update_nolock(&(list[i])))
{
res= true;
break;
}
}
- mysql_mutex_unlock(&LOCK_binlog_state);
return res;
}
-static int rpl_binlog_state_load_cb(rpl_gtid *gtid, void *data)
-{
- rpl_binlog_state *self= (rpl_binlog_state *)data;
- return self->update_nolock(gtid, false);
-}
-
-
bool
-rpl_binlog_state::load(rpl_slave_state *slave_pos)
+rpl_binlog_state_base::load_nolock(rpl_binlog_state_base *orig_state)
{
- bool res= false;
+ ulong i, j;
+ HASH *h1= &orig_state->hash;
- mysql_mutex_lock(&LOCK_binlog_state);
reset_nolock();
- if (slave_pos->iterate(rpl_binlog_state_load_cb, this, NULL, 0, false))
- res= true;
- mysql_mutex_unlock(&LOCK_binlog_state);
- return res;
-}
-
+ for (i= 0; i < h1->records; ++i)
+ {
+ element *e= (element *)my_hash_element(h1, i);
+ HASH *h2= &e->hash;
+ const rpl_gtid *last_gtid= e->last_gtid;
+ for (j= 0; j < h2->records; ++j)
+ {
+ const rpl_gtid *gtid= (const rpl_gtid *)my_hash_element(h2, j);
+ if (gtid == last_gtid)
+ continue;
+ if (update_nolock(gtid))
+ return true;
+ }
+ if (likely(last_gtid) && update_nolock(last_gtid))
+ return true;
+ }
-rpl_binlog_state::~rpl_binlog_state()
-{
- free();
+ return false;
}
@@ -1639,10 +1633,13 @@ rpl_binlog_state::~rpl_binlog_state()
If the (domain_id, server_id) pair already exists, then the new GTID replaces
the old one for that domain id. Else a new entry is inserted.
+ Note that rpl_binlog_state_base::update_nolock() does not call my_error()
+ for out-of-memory, caller must do that if needed (eg. ER_OUT_OF_RESOURCES).
+
Returns 0 for ok, 1 for error.
*/
int
-rpl_binlog_state::update_nolock(const struct rpl_gtid *gtid, bool strict)
+rpl_binlog_state_base::update_nolock(const struct rpl_gtid *gtid)
{
element *elem;
@@ -1650,13 +1647,6 @@ rpl_binlog_state::update_nolock(const struct rpl_gtid *gtid, bool strict)
                                       (const uchar *)(&gtid->domain_id),
sizeof(gtid->domain_id))))
{
- if (strict && elem->last_gtid && elem->last_gtid->seq_no >= gtid->seq_no)
- {
- my_error(ER_GTID_STRICT_OUT_OF_ORDER, MYF(0), gtid->domain_id,
- gtid->server_id, gtid->seq_no, elem->last_gtid->domain_id,
- elem->last_gtid->server_id, elem->last_gtid->seq_no);
- return 1;
- }
if (elem->seq_no_counter < gtid->seq_no)
elem->seq_no_counter= gtid->seq_no;
if (!elem->update_element(gtid))
@@ -1665,17 +1655,267 @@ rpl_binlog_state::update_nolock(const struct rpl_gtid *gtid, bool strict)
else if (!alloc_element_nolock(gtid))
return 0;
- my_error(ER_OUT_OF_RESOURCES, MYF(0));
return 1;
}
+int
+rpl_binlog_state_base::alloc_element_nolock(const rpl_gtid *gtid)
+{
+ element *elem;
+ rpl_gtid *lookup_gtid;
+
+ /* First time we see this domain_id; allocate a new element. */
+ elem= (element *)my_malloc(PSI_INSTRUMENT_ME, sizeof(*elem), MYF(0));
+ lookup_gtid= (rpl_gtid *)my_malloc(PSI_INSTRUMENT_ME, sizeof(*lookup_gtid),
+ MYF(0));
+ if (elem && lookup_gtid)
+ {
+ elem->domain_id= gtid->domain_id;
+ my_hash_init(PSI_INSTRUMENT_ME, &elem->hash, &my_charset_bin, 32,
+ offsetof(rpl_gtid, server_id), sizeof(rpl_gtid::domain_id),
+ NULL, my_free, HASH_UNIQUE);
+ elem->last_gtid= lookup_gtid;
+ elem->seq_no_counter= gtid->seq_no;
+ memcpy(lookup_gtid, gtid, sizeof(*lookup_gtid));
+ if (0 == my_hash_insert(&elem->hash, (const uchar *)lookup_gtid))
+ {
+ lookup_gtid= NULL; /* Do not free. */
+ if (0 == my_hash_insert(&hash, (const uchar *)elem))
+ return 0;
+ }
+ my_hash_free(&elem->hash);
+ }
+
+ /* An error. */
+ if (elem)
+ my_free(elem);
+ if (lookup_gtid)
+ my_free(lookup_gtid);
+ return 1;
+}
+
+
+uint32
+rpl_binlog_state_base::count_nolock()
+{
+ uint32 c= 0;
+ uint32 i;
+
+ for (i= 0; i < hash.records; ++i)
+ c+= ((element *)my_hash_element(&hash, i))->hash.records;
+
+ return c;
+}
+
+
+int
+rpl_binlog_state_base::get_gtid_list_nolock(rpl_gtid *gtid_list, uint32 list_size)
+{
+ uint32 i, j, pos;
+
+ pos= 0;
+ for (i= 0; i < hash.records; ++i)
+ {
+ element *e= (element *)my_hash_element(&hash, i);
+ if (!e->last_gtid)
+ {
+ DBUG_ASSERT(e->hash.records==0);
+ continue;
+ }
+ for (j= 0; j <= e->hash.records; ++j)
+ {
+ const rpl_gtid *gtid;
+ if (j < e->hash.records)
+ {
+ gtid= (rpl_gtid *)my_hash_element(&e->hash, j);
+ if (gtid == e->last_gtid)
+ continue;
+ }
+ else
+ gtid= e->last_gtid;
+
+ if (pos >= list_size)
+ return 1;
+      memcpy(&gtid_list[pos++], gtid, sizeof(*gtid));
+ }
+ }
+
+ return 0;
+}
+
+
+rpl_gtid *
+rpl_binlog_state_base::find_nolock(uint32 domain_id, uint32 server_id)
+{
+ element *elem;
+ if (!(elem= (element *)my_hash_search(&hash, (const uchar *)&domain_id,
+ sizeof(domain_id))))
+ return NULL;
+ return (rpl_gtid *)my_hash_search(&elem->hash, (const uchar *)&server_id,
+ sizeof(server_id));
+}
+
+
+/*
+ Return true if this binlog state is before the position specified by the
+ passed-in slave_connection_state, false otherwise.
+ Note that if the GTID D-S-N is the last GTID added to the state in the
+ domain D, then the state is considered to come before the position D-S-N
+ within domain D.
+*/
+bool
+rpl_binlog_state_base::is_before_pos(slave_connection_state *pos)
+{
+ /*
+ First check each GTID in the slave position, if it comes after what is
+ in the state.
+ */
+ for (uint32 i= 0; i < pos->hash.records; ++i)
+ {
+ const slave_connection_state::entry *e=
+ (const slave_connection_state::entry *)my_hash_element(&pos->hash, i);
+ /*
+ IF we have an entry with the same (domain_id, server_id),
+ AND either
+ ( we are ahead in that server_id
+ OR we are identical, but there's some other server_id after)
+ THEN that position lies before our state.
+ */
+ element *elem;
+ if ((elem= (element *)my_hash_search(&hash,
+ (const uchar *)&e->gtid.domain_id,
+ sizeof(e->gtid.domain_id))))
+ {
+ const rpl_gtid *g= (rpl_gtid *)
+ my_hash_search(&elem->hash, (const uchar *)&e->gtid.server_id,
+ sizeof(e->gtid.server_id));
+ if (g != nullptr &&
+ ( g->seq_no > e->gtid.seq_no ||
+ ( g->seq_no == e->gtid.seq_no && g != elem->last_gtid) ))
+ return false;
+ }
+ }
+
+ /*
+ Then check the state, if there are any domains present that are missing
+ from the position.
+ */
+ for (uint32 i= 0; i < hash.records; ++i)
+ {
+ const element *elem= (const element *) my_hash_element(&hash, i);
+ if (likely(elem->hash.records > 0) &&
+ !pos->find(elem->domain_id))
+ return false;
+ }
+
+ /* Nothing in our state lies after anything in the position. */
+ return true;
+}
+
+
+void rpl_binlog_state::init()
+{
+ rpl_binlog_state_base::init();
+  my_init_dynamic_array(PSI_INSTRUMENT_ME, &gtid_sort_array, sizeof(rpl_gtid), 8, 8, MYF(0));
+ mysql_mutex_init(key_LOCK_binlog_state, &LOCK_binlog_state,
+ MY_MUTEX_INIT_SLOW);
+}
+
+
+void
+rpl_binlog_state::reset()
+{
+ mysql_mutex_lock(&LOCK_binlog_state);
+ reset_nolock();
+ mysql_mutex_unlock(&LOCK_binlog_state);
+}
+
+
+void rpl_binlog_state::free()
+{
+ if (initialized)
+ {
+ rpl_binlog_state_base::free();
+    delete_dynamic(&gtid_sort_array);
+ mysql_mutex_destroy(&LOCK_binlog_state);
+ }
+}
+
+
+rpl_binlog_state::~rpl_binlog_state()
+{
+ free();
+}
+
+
+bool
+rpl_binlog_state::load(struct rpl_gtid *list, uint32 count)
+{
+ mysql_mutex_lock(&LOCK_binlog_state);
+ bool res= load_nolock(list, count);
+ mysql_mutex_unlock(&LOCK_binlog_state);
+ if (res)
+ my_error(ER_OUT_OF_RESOURCES, MYF(0));
+ return res;
+}
+
+
+static int rpl_binlog_state_load_cb(rpl_gtid *gtid, void *data)
+{
+ rpl_binlog_state *self= (rpl_binlog_state *)data;
+ return self->update_nolock(gtid);
+}
+
+
+bool
+rpl_binlog_state::load(rpl_slave_state *slave_pos)
+{
+ bool res= false;
+
+ mysql_mutex_lock(&LOCK_binlog_state);
+ reset_nolock();
+ if (slave_pos->iterate(rpl_binlog_state_load_cb, this, NULL, 0, false))
+ {
+ my_error(ER_OUT_OF_RESOURCES, MYF(0));
+ res= true;
+ }
+ mysql_mutex_unlock(&LOCK_binlog_state);
+ return res;
+}
+
+
int
rpl_binlog_state::update(const struct rpl_gtid *gtid, bool strict)
{
- int res;
+ int res= 0;
+ element *elem;
+
mysql_mutex_lock(&LOCK_binlog_state);
- res= update_nolock(gtid, strict);
+ if ((elem= (element *)my_hash_search(&hash,
+                                       (const uchar *)(&gtid->domain_id),
+ sizeof(gtid->domain_id))))
+ {
+ if (strict && elem->last_gtid && elem->last_gtid->seq_no >= gtid->seq_no)
+ {
+ my_error(ER_GTID_STRICT_OUT_OF_ORDER, MYF(0), gtid->domain_id,
+ gtid->server_id, gtid->seq_no, elem->last_gtid->domain_id,
+ elem->last_gtid->server_id, elem->last_gtid->seq_no);
+ res= 1;
+ }
+ else
+ {
+ if (elem->seq_no_counter < gtid->seq_no)
+ elem->seq_no_counter= gtid->seq_no;
+ if (elem->update_element(gtid))
+ res= 1;
+ }
+ }
+ else if (alloc_element_nolock(gtid))
+ {
+ my_error(ER_OUT_OF_RESOURCES, MYF(0));
+ res= 1;
+ }
mysql_mutex_unlock(&LOCK_binlog_state);
return res;
}
@@ -1761,43 +2001,6 @@ rpl_binlog_state::element::update_element(const rpl_gtid *gtid)
}
-int
-rpl_binlog_state::alloc_element_nolock(const rpl_gtid *gtid)
-{
- element *elem;
- rpl_gtid *lookup_gtid;
-
- /* First time we see this domain_id; allocate a new element. */
- elem= (element *)my_malloc(PSI_INSTRUMENT_ME, sizeof(*elem), MYF(MY_WME));
- lookup_gtid= (rpl_gtid *)my_malloc(PSI_INSTRUMENT_ME, sizeof(*lookup_gtid),
- MYF(MY_WME));
- if (elem && lookup_gtid)
- {
- elem->domain_id= gtid->domain_id;
- my_hash_init(PSI_INSTRUMENT_ME, &elem->hash, &my_charset_bin, 32,
- offsetof(rpl_gtid, server_id), sizeof(rpl_gtid::domain_id),
- NULL, my_free, HASH_UNIQUE);
- elem->last_gtid= lookup_gtid;
- elem->seq_no_counter= gtid->seq_no;
- memcpy(lookup_gtid, gtid, sizeof(*lookup_gtid));
- if (0 == my_hash_insert(&elem->hash, (const uchar *)lookup_gtid))
- {
- lookup_gtid= NULL; /* Do not free. */
- if (0 == my_hash_insert(&hash, (const uchar *)elem))
- return 0;
- }
- my_hash_free(&elem->hash);
- }
-
- /* An error. */
- if (elem)
- my_free(elem);
- if (lookup_gtid)
- my_free(lookup_gtid);
- return 1;
-}
-
-
/*
Check that a new GTID can be logged without creating an out-of-order
sequence number with existing GTIDs.
@@ -1949,7 +2152,7 @@ rpl_binlog_state::read_from_iocache(IO_CACHE *src)
p= buf;
end= buf + len;
    if (gtid_parser_helper(&p, end, &gtid) ||
-        update_nolock(&gtid, false))
+        update_nolock(&gtid))
{
res= 1;
break;
@@ -1960,17 +2163,6 @@ rpl_binlog_state::read_from_iocache(IO_CACHE *src)
}
-rpl_gtid *
-rpl_binlog_state::find_nolock(uint32 domain_id, uint32 server_id)
-{
- element *elem;
- if (!(elem= (element *)my_hash_search(&hash, (const uchar *)&domain_id,
- sizeof(domain_id))))
- return NULL;
- return (rpl_gtid *)my_hash_search(&elem->hash, (const uchar *)&server_id,
- sizeof(server_id));
-}
-
rpl_gtid *
rpl_binlog_state::find(uint32 domain_id, uint32 server_id)
{
@@ -2001,12 +2193,8 @@ rpl_binlog_state::find_most_recent(uint32 domain_id)
uint32
rpl_binlog_state::count()
{
- uint32 c= 0;
- uint32 i;
-
mysql_mutex_lock(&LOCK_binlog_state);
- for (i= 0; i < hash.records; ++i)
- c+= ((element *)my_hash_element(&hash, i))->hash.records;
+ uint32 c= count_nolock();
mysql_mutex_unlock(&LOCK_binlog_state);
return c;
@@ -2016,41 +2204,8 @@ rpl_binlog_state::count()
int
rpl_binlog_state::get_gtid_list(rpl_gtid *gtid_list, uint32 list_size)
{
- uint32 i, j, pos;
- int res= 0;
-
mysql_mutex_lock(&LOCK_binlog_state);
- pos= 0;
- for (i= 0; i < hash.records; ++i)
- {
- element *e= (element *)my_hash_element(&hash, i);
- if (!e->last_gtid)
- {
- DBUG_ASSERT(e->hash.records==0);
- continue;
- }
- for (j= 0; j <= e->hash.records; ++j)
- {
- const rpl_gtid *gtid;
- if (j < e->hash.records)
- {
- gtid= (rpl_gtid *)my_hash_element(&e->hash, j);
- if (gtid == e->last_gtid)
- continue;
- }
- else
- gtid= e->last_gtid;
-
- if (pos >= list_size)
- {
- res= 1;
- goto end;
- }
-      memcpy(&gtid_list[pos++], gtid, sizeof(*gtid));
- }
- }
-
-end:
+ int res= get_gtid_list_nolock(gtid_list, list_size);
mysql_mutex_unlock(&LOCK_binlog_state);
return res;
}
diff --git a/sql/rpl_gtid.h b/sql/rpl_gtid.h
index 7d25ee6e75d..8b697c79515 100644
--- a/sql/rpl_gtid.h
+++ b/sql/rpl_gtid.h
@@ -26,6 +26,11 @@
extern const LEX_CSTRING rpl_gtid_slave_state_table_name;
class String;
+#ifdef MYSQL_SERVER
+struct TABLE;
+#endif
+struct slave_connection_state;
+
#define PARAM_GTID(G) G.domain_id, G.server_id, G.seq_no
#define GTID_MAX_STR_LENGTH (10+1+10+1+20)
@@ -296,8 +301,13 @@ struct rpl_slave_state
to know where to start when a master is changed to a slave. As a side
effect, it also allows to skip a hash lookup in the very common case of
logging a new GTID with same server id as last GTID.
+
+  The base class rpl_binlog_state_base contains just the basic data operations
+ to insert/update GTIDs, and is used eg. from Gtid_index_*. The main class
+ rpl_binlog_state builds server logic on top of that like mutex locking,
+ gtid_strict_mode handling, etc.
*/
-struct rpl_binlog_state
+struct rpl_binlog_state_base
{
struct element {
uint32 domain_id;
@@ -309,29 +319,45 @@ struct rpl_binlog_state
int update_element(const rpl_gtid *gtid);
};
+
/* Mapping from domain_id to collection of elements. */
HASH hash;
+ my_bool initialized;
+
+ rpl_binlog_state_base() : initialized(0) {}
+ ~rpl_binlog_state_base();
+ void init();
+ void reset_nolock();
+ void free();
+ bool load_nolock(struct rpl_gtid *list, uint32 count);
+ bool load_nolock(rpl_binlog_state_base *orig_state);
+ int update_nolock(const struct rpl_gtid *gtid);
+ int alloc_element_nolock(const rpl_gtid *gtid);
+ uint32 count_nolock();
+ int get_gtid_list_nolock(rpl_gtid *gtid_list, uint32 list_size);
+ rpl_gtid *find_nolock(uint32 domain_id, uint32 server_id);
+ bool is_before_pos(slave_connection_state *pos);
+};
+
+struct rpl_binlog_state : public rpl_binlog_state_base
+{
/* Mutex protecting access to the state. */
mysql_mutex_t LOCK_binlog_state;
- my_bool initialized;
/* Auxiliary buffer to sort gtid list. */
DYNAMIC_ARRAY gtid_sort_array;
- rpl_binlog_state() :initialized(0) {}
+ rpl_binlog_state() {}
~rpl_binlog_state();
void init();
- void reset_nolock();
void reset();
void free();
bool load(struct rpl_gtid *list, uint32 count);
bool load(rpl_slave_state *slave_pos);
- int update_nolock(const struct rpl_gtid *gtid, bool strict);
int update(const struct rpl_gtid *gtid, bool strict);
int update_with_next_gtid(uint32 domain_id, uint32 server_id,
rpl_gtid *gtid);
- int alloc_element_nolock(const rpl_gtid *gtid);
bool check_strict_sequence(uint32 domain_id, uint32 server_id, uint64 seq_no,
bool no_error= false);
int bump_seq_no_if_needed(uint32 domain_id, uint64 seq_no);
@@ -342,7 +368,6 @@ struct rpl_binlog_state
int get_most_recent_gtid_list(rpl_gtid **list, uint32 *size);
bool append_pos(String *str);
bool append_state(String *str);
- rpl_gtid *find_nolock(uint32 domain_id, uint32 server_id);
rpl_gtid *find(uint32 domain_id, uint32 server_id);
rpl_gtid *find_most_recent(uint32 domain_id);
const char* drop_domain(DYNAMIC_ARRAY *ids, Gtid_list_log_event *glev, char*);
diff --git a/sql/rpl_rli.cc b/sql/rpl_rli.cc
index bc1f0ebbff5..82d220fac98 100644
--- a/sql/rpl_rli.cc
+++ b/sql/rpl_rli.cc
@@ -1547,7 +1547,7 @@ Relay_log_info::update_relay_log_state(rpl_gtid *gtid_list, uint32 count)
int res= 0;
while (count)
{
- if (relay_log_state.update_nolock(gtid_list, false))
+ if (relay_log_state.update_nolock(gtid_list))
res= 1;
++gtid_list;
--count;
diff --git a/sql/sql_repl.cc b/sql/sql_repl.cc
index f3017c3d311..f54e07d3de4 100644
--- a/sql/sql_repl.cc
+++ b/sql/sql_repl.cc
@@ -31,6 +31,7 @@
#include "semisync_master.h"
#include "semisync_slave.h"
#include "mysys_err.h"
+#include "gtid_index.h"
enum enum_gtid_until_state {
@@ -1286,6 +1287,100 @@ check_slave_start_position(binlog_send_info *info, const char **errormsg,
return err;
}
+
+/*
+ Helper function for gtid_find_binlog_pos() below.
+ Check a binlog file against a slave position. Use a GTID index if present.
+ Returns:
+ 0 This is the binlog file that contains the position. If *out_start_seek
+ is non-zero, it is the offset found in the GTID index at which to start
+ scanning the binlog file for events to send to the slave.
+ 1 This binlog file is too new to contain the given slave position.
+ -1 Error, *out_errormsg contains error string.
+
+ The *out_glev event must be deleted by the caller if set non-null.
+ */
+static int
+gtid_check_binlog_file(slave_connection_state *state,
+ Gtid_index_reader_hot *reader,
+ const binlog_file_entry *list,
+ bool *found_in_index, uint32 *out_start_seek,
+ uint32 *found_count,
+ char *out_name, Gtid_list_log_event **out_glev,
+ const char **out_errormsg)
+{
+ Gtid_list_log_event *glev= nullptr;
+ char buf[FN_REFLEN];
+ File file;
+ IO_CACHE cache;
+ int res= -1;
+
+ *found_in_index= false;
+ *out_glev= nullptr;
+ *out_errormsg= nullptr;
+ /*
+ Try to lookup the GTID position in the gtid index.
+ If that doesn't work, read the Gtid_list_log_event at the start of the
+ binlog file to get the binlog state.
+ */
+ if (normalize_binlog_name(buf, list->name.str, false))
+ {
+ *out_errormsg= "Failed to determine binlog file name while looking for "
+ "GTID position in binlog";
+ goto end;
+ }
+
+ if (likely(reader && !reader->open_index_file(buf)))
+ {
+ int lookup= reader->search_gtid_pos(state, out_start_seek, found_count);
+ reader->close_index_file();
+ if (lookup >= 0)
+ {
+ statistic_increment(binlog_gtid_index_hit, &LOCK_status);
+ if (lookup == 0)
+ res= 1;
+ else
+ {
+ strmake(out_name, buf, FN_REFLEN);
+ *found_in_index= true;
+ res= 0;
+ }
+ goto end;
+ }
+ /*
+ Error in the index lookup; fall back to reading the GTID_LIST event from
+ the binlog file and scan it from the beginning.
+ */
+ }
+ statistic_increment(binlog_gtid_index_miss, &LOCK_status);
+
+ bzero((char*) &cache, sizeof(cache));
+ if (unlikely((file= open_binlog(&cache, buf, out_errormsg)) == (File)-1))
+ goto end;
+ *out_errormsg= get_gtid_list_event(&cache, &glev);
+ end_io_cache(&cache);
+ mysql_file_close(file, MYF(MY_WME));
+ if (unlikely(*out_errormsg))
+ goto end;
+
+ if (!glev || contains_all_slave_gtid(state, glev))
+ {
+ strmake(out_name, buf, FN_REFLEN);
+ *out_glev= glev;
+ *out_errormsg= nullptr;
+ res= 0;
+ }
+ else
+ {
+ delete glev;
+ res= 1;
+ }
+
+end:
+ return res;
+}
+
+
/*
Find the name of the binlog file to start reading for a slave that connects
using GTID state.
@@ -1314,14 +1409,17 @@ check_slave_start_position(binlog_send_info *info, const char **errormsg,
the requested GTID that was already purged.
*/
static const char *
-gtid_find_binlog_file(slave_connection_state *state, char *out_name,
- slave_connection_state *until_gtid_state)
+gtid_find_binlog_pos(slave_connection_state *state, char *out_name,
+ slave_connection_state *until_gtid_state,
+ rpl_binlog_state *until_binlog_state,
+ bool *found_in_index, uint32 *out_start_seek)
{
MEM_ROOT memroot;
binlog_file_entry *list;
Gtid_list_log_event *glev= NULL;
const char *errormsg= NULL;
- char buf[FN_REFLEN];
+ Gtid_index_reader_hot *reader= NULL;
+ *found_in_index= false;
init_alloc_root(PSI_INSTRUMENT_ME, &memroot,
10*(FN_REFLEN+sizeof(binlog_file_entry)), 0,
@@ -1332,48 +1430,41 @@ gtid_find_binlog_file(slave_connection_state *state, char *out_name,
goto end;
}
+ if (opt_binlog_gtid_index)
+ reader= new Gtid_index_reader_hot();
+
while (list)
{
- File file;
- IO_CACHE cache;
-
- if (!list->next)
- {
- /*
- It should be safe to read the currently used binlog, as we will only
- read the header part that is already written.
-
- But if that does not work on windows, then we will need to cache the
- event somewhere in memory I suppose - that could work too.
- */
- }
- /*
- Read the Gtid_list_log_event at the start of the binlog file to
- get the binlog state.
- */
- if (normalize_binlog_name(buf, list->name.str, false))
- {
- errormsg= "Failed to determine binlog file name while looking for "
- "GTID position in binlog";
- goto end;
- }
- bzero((char*) &cache, sizeof(cache));
- if (unlikely((file= open_binlog(&cache, buf, &errormsg)) == (File)-1))
- goto end;
- errormsg= get_gtid_list_event(&cache, &glev);
- end_io_cache(&cache);
- mysql_file_close(file, MYF(MY_WME));
- if (unlikely(errormsg))
+ uint32 found_count;
+ int res= gtid_check_binlog_file(state, reader, list, found_in_index,
+ out_start_seek, &found_count,
+ out_name, &glev, &errormsg);
+ if (res < 0)
goto end;
-
- if (!glev || contains_all_slave_gtid(state, glev))
+ if (res == 0)
{
- strmake(out_name, buf, FN_REFLEN);
-
- if (glev)
+ if (*found_in_index || glev)
{
uint32 i;
+ uint32 count;
+ rpl_gtid *gtids;
+ if (*found_in_index)
+ {
+ count= found_count;
+ gtids= reader->search_gtid_list();
+ /*
+ Load the initial GTID state corresponding to the position found in
+ the GTID index, as we will not have a GTID_LIST event to load it
+ from.
+ */
+ until_binlog_state->load(gtids, count);
+ }
+ else
+ {
+ count= glev->count;
+ gtids= glev->list;
+ }
/*
As a special case, we allow to start from binlog file N if the
requested GTID is the last event (in the corresponding domain) in
@@ -1385,9 +1476,9 @@ gtid_find_binlog_file(slave_connection_state *state, char *out_name,
from the UNTIL hash, to mark that such domains have already reached
their UNTIL condition.
*/
- for (i= 0; i < glev->count; ++i)
+ for (i= 0; i < count; ++i)
{
- const rpl_gtid *gtid= state->find(glev->list[i].domain_id);
+ const rpl_gtid *gtid= state->find(gtids[i].domain_id);
if (!gtid)
{
/*
@@ -1400,8 +1491,8 @@ gtid_find_binlog_file(slave_connection_state *state, char *out_name,
further GTIDs in the Gtid_list.
*/
DBUG_ASSERT(0);
- } else if (gtid->server_id == glev->list[i].server_id &&
- gtid->seq_no == glev->list[i].seq_no)
+ } else if (gtid->server_id == gtids[i].server_id &&
+ gtid->seq_no == gtids[i].seq_no)
{
/*
The slave requested to start from the very beginning of this
@@ -1412,9 +1503,9 @@ gtid_find_binlog_file(slave_connection_state *state, char *out_name,
}
if (until_gtid_state &&
- (gtid= until_gtid_state->find(glev->list[i].domain_id)) &&
- gtid->server_id == glev->list[i].server_id &&
- gtid->seq_no <= glev->list[i].seq_no)
+ (gtid= until_gtid_state->find(gtids[i].domain_id)) &&
+ gtid->server_id == gtids[i].server_id &&
+ gtid->seq_no <= gtids[i].seq_no)
{
/*
We've already reached the stop position in UNTIL for this domain,
@@ -1427,8 +1518,6 @@ gtid_find_binlog_file(slave_connection_state *state, char *out_name,
goto end;
}
- delete glev;
- glev= NULL;
list= list->next;
}
@@ -1441,11 +1530,56 @@ gtid_find_binlog_file(slave_connection_state *state, char *out_name,
if (glev)
delete glev;
+ if (reader)
+ delete reader;
+
free_root(&memroot, MYF(0));
return errormsg;
}
+static bool
+gtid_index_lookup_pos(const char *name, uint32 offset, uint32 *out_start_seek,
+ slave_connection_state *out_gtid_state)
+{
+ Gtid_index_reader_hot *reader= nullptr;
+ bool opened= false;
+ bool found= false;
+ uint32 found_offset, found_gtid_count;
+ rpl_gtid *found_gtids;
+ int res;
+
+ if (!(reader= new Gtid_index_reader_hot()) ||
+ reader->open_index_file(name))
+ {
+ statistic_increment(binlog_gtid_index_miss, &LOCK_status);
+ goto err;
+ }
+ opened= true;
+ res= reader->search_offset(offset, &found_offset, &found_gtid_count);
+ if (res <= 0)
+ {
+ statistic_increment(binlog_gtid_index_miss, &LOCK_status);
+ goto err;
+ }
+ statistic_increment(binlog_gtid_index_hit, &LOCK_status);
+
+ /* We found the position, initialize the state from the index. */
+ found_gtids= reader->search_gtid_list();
+ if (out_gtid_state->load(found_gtids, found_gtid_count))
+ goto err;
+ *out_start_seek= found_offset;
+ found= true;
+
+err:
+ if (opened)
+ reader->close_index_file();
+ if (reader)
+ delete reader;
+ return found;
+}
+
+
/*
Given an old-style binlog position with file name and file offset, find the
corresponding gtid position. If the offset is not at an event boundary, give
@@ -1469,8 +1603,22 @@ gtid_state_from_pos(const char *name, uint32 offset,
int err;
String packet;
Format_description_log_event *fdev= NULL;
+ bool found_in_index;
+ uint32 UNINIT_VAR(start_seek);
+ bool seek_done= false;
- if (unlikely(gtid_state->load((const rpl_gtid *)NULL, 0)))
+ /*
+ Try to lookup the position in the binlog gtid index. If found (as it will
+ usually be unless the index is corrupted somehow), we can seek directly to
+ a point at or just before the desired location, saving an expensive scan
+ of the binlog file from the start.
+ */
+ found_in_index= opt_binlog_gtid_index ?
+ gtid_index_lookup_pos(name, offset, &start_seek, gtid_state) :
+ false;
+ if (found_in_index)
+ found_gtid_list_event= true;
+ else if (unlikely(gtid_state->load((const rpl_gtid *)NULL, 0)))
{
errormsg= "Internal error (out of memory?) initializing slave state "
"while scanning binlog to find start position";
@@ -1559,6 +1707,25 @@ gtid_state_from_pos(const char *name, uint32 offset,
errormsg= "Could not start decryption of binlog.";
goto end;
}
+ if (found_in_index && !seek_done)
+ {
+ /*
+ Just to avoid a redundant event read before hitting the next branch.
+ ToDo: share this code with the below somehow.
+ */
+ my_b_seek(&cache, start_seek);
+ seek_done= true;
+ }
+ }
+ else if (found_in_index && !seek_done)
+ {
+ /*
+ After reading the format_description event and possibly
+ start_encryption, we can seek forward to avoid most or all of the scan
+ (depending on the sparseness of the index).
+ */
+ my_b_seek(&cache, start_seek);
+ seek_done= true;
}
else if (unlikely(typ != FORMAT_DESCRIPTION_EVENT &&
!found_format_description_event))
@@ -1570,7 +1737,7 @@ gtid_state_from_pos(const char *name, uint32 offset,
else if (typ == ROTATE_EVENT || typ == STOP_EVENT ||
typ == BINLOG_CHECKPOINT_EVENT)
continue; /* Continue looking */
- else if (typ == GTID_LIST_EVENT)
+ else if (typ == GTID_LIST_EVENT && !found_in_index)
{
rpl_gtid *gtid_list;
bool status;
@@ -1798,7 +1965,7 @@ send_event_to_slave(binlog_send_info *info, Log_event_type event_type,
}
});
- if (info->until_binlog_state.update_nolock(&event_gtid, false))
+ if (info->until_binlog_state.update_nolock(&event_gtid))
{
info->error= ER_MASTER_FATAL_ERROR_READING_BINLOG;
return "Failed in internal GTID book-keeping: Out of memory";
@@ -2198,6 +2365,8 @@ static int init_binlog_sender(binlog_send_info *info,
char search_file_name[FN_REFLEN];
const char *name=search_file_name;
+ bool found_in_index= false;
+ uint32 start_seek= 0;
if (info->using_gtid_state)
{
if (info->gtid_state.load(connect_gtid_state.ptr(),
@@ -2223,16 +2392,26 @@ static int init_binlog_sender(binlog_send_info *info,
info->error= error;
return 1;
}
- if ((info->errmsg= gtid_find_binlog_file(&info->gtid_state,
- search_file_name,
- info->until_gtid_state)))
+ if ((info->errmsg= gtid_find_binlog_pos(&info->gtid_state,
+ search_file_name,
+ info->until_gtid_state,
+ &info->until_binlog_state,
+ &found_in_index, &start_seek)))
{
info->error= ER_MASTER_FATAL_ERROR_READING_BINLOG;
return 1;
}
- /* start from beginning of binlog file */
- *pos = 4;
+ if (found_in_index)
+ {
+ /* Start from a position looked up in the binlog gtid index. */
+ *pos = start_seek;
+ }
+ else
+ {
+ /* start from beginning of binlog file */
+ *pos = 4;
+ }
}
else
{
@@ -2865,6 +3044,7 @@ void mysql_binlog_send(THD* thd, char* log_ident, my_off_t pos,
ushort flags)
{
LOG_INFO linfo;
+ ulong ev_offset;
IO_CACHE log;
File file = -1;
@@ -2990,6 +3170,34 @@ void mysql_binlog_send(THD* thd, char* log_ident, my_off_t pos,
if (info->until_gtid_state && info->until_gtid_state->count() == 0)
info->gtid_until_group= GTID_UNTIL_STOP_AFTER_STANDALONE;
+ if (info->using_gtid_state && pos > BIN_LOG_HEADER_SIZE &&
+ ( info->gtid_state.is_pos_reached() ||
+ info->gtid_until_group == GTID_UNTIL_STOP_AFTER_STANDALONE ) )
+ {
+ /*
+ We are starting a GTID connect from a point not at the start of the
+ binlog file (from a GTID index lookup). Send a fake GTID_LIST event
+ in place of the real GTID_LIST that would normally be sent from the
+ start of the binlog file.
+
+ If we already reached the gtid UNTIL position, then set the
+ FLAG_UNTIL_REACHED in the GTID_LIST event and stop immediately.
+ */
+ uint32 flag= 0;
+ if (info->gtid_until_group == GTID_UNTIL_STOP_AFTER_STANDALONE)
+ {
+ flag= Gtid_list_log_event::FLAG_UNTIL_REACHED;
+ info->should_stop= true;
+ }
+ Gtid_list_log_event glev(&info->until_binlog_state, flag);
+ if (reset_transmit_packet(info, info->flags, &ev_offset, &info->errmsg) ||
+ fake_gtid_list_event(info, &glev, &info->errmsg, (int32)pos))
+ {
+ info->error= ER_MASTER_FATAL_ERROR_READING_BINLOG;
+ goto err;
+ }
+ }
+
THD_STAGE_INFO(thd, stage_sending_binlog_event_to_slave);
if (send_one_binlog_file(info, &log, &linfo, pos))
break;
diff --git a/sql/sys_vars.cc b/sql/sys_vars.cc
index 7572ddd04c4..7adc29d8e9f 100644
--- a/sql/sys_vars.cc
+++ b/sql/sys_vars.cc
@@ -6829,6 +6829,60 @@ Sys_binlog_row_metadata(
ON_UPDATE(NULL));
+static Sys_var_on_access_global<Sys_var_mybool,
+ PRIV_SET_SYSTEM_GLOBAL_VAR_BINLOG_GTID_INDEX>
+Sys_binlog_gtid_index(
+ "binlog_gtid_index",
+ "Enable the creation of a GTID index for every binlog file, and the use "
+ "of such index for speeding up GTID lookup in the binlog.",
+ GLOBAL_VAR(opt_binlog_gtid_index), CMD_LINE(OPT_ARG),
+ DEFAULT(TRUE));
+
+
+static Sys_var_on_access_global<Sys_var_ulong,
+ PRIV_SET_SYSTEM_GLOBAL_VAR_BINLOG_GTID_INDEX_PAGE_SIZE>
+Sys_binlog_gtid_index_page_size(
+ "binlog_gtid_index_page_size",
+ "Page size to use for the binlog GTID index.",
+ GLOBAL_VAR(opt_binlog_gtid_index_page_size), CMD_LINE(REQUIRED_ARG),
+ VALID_RANGE(64, 1<<24), DEFAULT(4096), BLOCK_SIZE(1));
+
+
+static Sys_var_on_access_global<Sys_var_ulong,
+ PRIV_SET_SYSTEM_GLOBAL_VAR_BINLOG_GTID_INDEX_SPARSE>
+Sys_binlog_gtid_index_sparse(
+ "binlog_gtid_index_sparse",
+ "Control sparseness of the binlog GTID index. If set to N, only every "
+ "Nth GTID will be recorded in the index, to reduce the size of the "
+ "index. Normally does not need tuning.",
+ GLOBAL_VAR(opt_binlog_gtid_index_sparse), CMD_LINE(REQUIRED_ARG),
+ VALID_RANGE(1, 1024*1024L*1024L), DEFAULT(10), BLOCK_SIZE(1));
+
+
+static Sys_var_on_access_global<Sys_var_ulong,
+ PRIV_SET_SYSTEM_GLOBAL_VAR_BINLOG_GTID_INDEX_SPAN_MIN>
+Sys_binlog_gtid_index_span_min(
+ "binlog_gtid_index_span_min",
+ "Control sparseness of the binlog GTID index. If set to N, at most one "
+ "index record will be added for every N bytes of binlog file written. "
+ "Normally does not need tuning.",
+ GLOBAL_VAR(opt_binlog_gtid_index_span_min), CMD_LINE(REQUIRED_ARG),
+ VALID_RANGE(1, 1024*1024L*1024L), DEFAULT(4096), BLOCK_SIZE(1));
+
+
+static Sys_var_on_access_global<Sys_var_ulong,
+ PRIV_SET_SYSTEM_GLOBAL_VAR_BINLOG_GTID_INDEX_SPAN_MAX>
+Sys_binlog_gtid_index_span_max(
+ "binlog_gtid_index_span_max",
+ "Control sparseness of the binlog GTID index. If set to N, an index "
+ "record will be added after N bytes has been written to the binlog "
+ "file, even if this would normally be skipped due to the setting of "
+ "--binlog-gtid-index-sparse."
+ "Normally does not need tuning.",
+ GLOBAL_VAR(opt_binlog_gtid_index_span_max), CMD_LINE(REQUIRED_ARG),
+ VALID_RANGE(1, 1024*1024L*1024L), DEFAULT(65536), BLOCK_SIZE(1));
+
+
static bool check_pseudo_slave_mode(sys_var *self, THD *thd, set_var *var)
{
longlong previous_val= thd->variables.pseudo_slave_mode;
--
2.30.2
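A quick way to sanity-check the feature after applying the patch is to look at
the counters and variables added above from SQL. This is only a sketch; the
names are those defined in the sys_vars.cc and mysqld.cc hunks, and the span
value set at the end is just an example:

  -- Confirm the index is enabled and watch lookups as slaves connect
  -- with master_use_gtid=slave_pos.
  SHOW GLOBAL VARIABLES LIKE 'binlog_gtid_index%';
  SHOW GLOBAL STATUS LIKE 'Binlog_gtid_index%';

  -- Tuning the sparseness at runtime requires BINLOG ADMIN (per the
  -- privilege.h additions) and should only affect binlog files created
  -- after the change.
  SET GLOBAL binlog_gtid_index_span_max= 1048576;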
[PATCH] MDEV-26632: GTID master switch when slave position is filtered on new master
by Kristian Nielsen 26 Oct '23
If an intermediate slave S1 has replication filters enabled, its
@@gtid_slave_pos may contain a GTID that is filtered and doesn't propagate
to lower-level slaves with S1 as master.
If S1 is later demoted under one of its former slaves, it may attempt to connect at the
filtered position. This is normally disallowed in --gtid-strict-mode. But if
--gtid-ignore-duplicates is enabled, we should allow it, as in this case we
can trust the GTID sequence numbers between different server ids. So we can
know that the next GTID is the right one for the filtered slave GTID
position.
This allows advanced users to use replication filtering in topologies like
this and still run with --gtid-strict-mode enabled.
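As an illustration of the intended setup (the host name is a placeholder; the
full scenario is in the test case below):

  -- On S1, after its former slave has been promoted to master:
  SET GLOBAL gtid_strict_mode= 1;
  SET GLOBAL gtid_ignore_duplicates= 1;
  CHANGE MASTER TO master_host='new-master', master_use_gtid=slave_pos;
  START SLAVE;
  -- Previously the connect could fail with ER_GTID_START_FROM_BINLOG_HOLE
  -- when @@gtid_slave_pos pointed at a filtered GTID; with this patch the
  -- "hole" is accepted and replication resumes from the next GTID in the
  -- domain.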
Signed-off-by: Kristian Nielsen <knielsen(a)knielsen-hq.org>
---
mysql-test/suite/rpl/r/rpl_mdev26632.result | 78 ++++++++++++++
mysql-test/suite/rpl/t/rpl_mdev26632.cnf | 28 +++++
mysql-test/suite/rpl/t/rpl_mdev26632.test | 109 ++++++++++++++++++++
sql/sql_repl.cc | 8 +-
4 files changed, 222 insertions(+), 1 deletion(-)
create mode 100644 mysql-test/suite/rpl/r/rpl_mdev26632.result
create mode 100644 mysql-test/suite/rpl/t/rpl_mdev26632.cnf
create mode 100644 mysql-test/suite/rpl/t/rpl_mdev26632.test
diff --git a/mysql-test/suite/rpl/r/rpl_mdev26632.result b/mysql-test/suite/rpl/r/rpl_mdev26632.result
new file mode 100644
index 00000000000..84080b94de8
--- /dev/null
+++ b/mysql-test/suite/rpl/r/rpl_mdev26632.result
@@ -0,0 +1,78 @@
+include/rpl_init.inc [topology=1->2->3]
+*** Test GTID master switch in a topology with filtered events.
+*** With --gtid-ignore-duplicate and --gtid-strict-mode, should allow
+*** GTID connect at a GTID position that is filtered on the new master.
+connection server_1;
+ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
+CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (1,1);
+CREATE TABLE t3 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
+INSERT INTO t3 VALUES (1,1);
+INSERT INTO t1 VALUES (2,1);
+INSERT INTO t3 VALUES (2,1);
+include/save_master_gtid.inc
+connection server_2;
+CREATE TABLE t2 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
+INSERT INTO t2 VALUES (1,2);
+include/sync_with_master_gtid.inc
+include/save_master_gtid.inc
+connection server_3;
+include/sync_with_master_gtid.inc
+*** Promote 3 as new master, demote 2 as slave of 3.
+*** GTID position of 2 in domain 0 is filtered on 3.
+connection server_2;
+include/stop_slave.inc
+connection server_3;
+include/stop_slave.inc
+CHANGE MASTER TO master_host = '127.0.0.1', master_port = SERVER_MYPORT_1,
+MASTER_USE_GTID=SLAVE_POS;
+connection server_2;
+CHANGE MASTER TO master_host = '127.0.0.1', master_port = SERVER_MYPORT_3,
+MASTER_USE_GTID=SLAVE_POS;
+include/start_slave.inc
+connection server_3;
+include/start_slave.inc
+connection server_1;
+INSERT INTO t1 VALUES (3,1);
+INSERT INTO t3 VALUES (3,1);
+include/save_master_gtid.inc
+connection server_3;
+INSERT INTO t2 VALUES (2,2);
+include/sync_with_master_gtid.inc
+include/save_master_gtid.inc
+connection server_2;
+include/sync_with_master_gtid.inc
+SELECT * FROM t1 ORDER BY a;
+a b
+1 1
+2 1
+3 1
+SELECT * FROM t3 ORDER BY a;
+ERROR 42S02: Table 'test.t3' doesn't exist
+SELECT * FROM t2 ORDER BY a;
+a b
+1 2
+2 2
+*** Restore original topology.
+connection server_3;
+include/stop_slave.inc
+connection server_2;
+include/stop_slave.inc
+CHANGE MASTER TO master_host = '127.0.0.1', master_port = SERVER_MYPORT_1,
+MASTER_USE_GTID=SLAVE_POS;
+include/start_slave.inc
+connection server_3;
+CHANGE MASTER TO master_host = '127.0.0.1', master_port = SERVER_MYPORT_2,
+MASTER_USE_GTID=SLAVE_POS;
+include/start_slave.inc
+connection server_1;
+DROP TABLE t1;
+DROP TABLE t3;
+include/save_master_gtid.inc
+connection server_2;
+DROP TABLE t2;
+include/sync_with_master_gtid.inc
+include/save_master_gtid.inc
+connection server_3;
+include/sync_with_master_gtid.inc
+include/rpl_end.inc
diff --git a/mysql-test/suite/rpl/t/rpl_mdev26632.cnf b/mysql-test/suite/rpl/t/rpl_mdev26632.cnf
new file mode 100644
index 00000000000..5eda3ad0725
--- /dev/null
+++ b/mysql-test/suite/rpl/t/rpl_mdev26632.cnf
@@ -0,0 +1,28 @@
+!include ../my.cnf
+
+[mysqld.1]
+log-slave-updates
+loose-innodb
+gtid-domain-id=1
+gtid-strict-mode=1
+gtid-ignore-duplicates=1
+
+[mysqld.2]
+log-slave-updates
+loose-innodb
+gtid-domain-id=0
+replicate-ignore-table=test.t3
+gtid-strict-mode=1
+gtid-ignore-duplicates=1
+
+[mysqld.3]
+log-slave-updates
+loose-innodb
+gtid-domain-id=0
+replicate-ignore-table=test.t3
+gtid-strict-mode=1
+gtid-ignore-duplicates=1
+
+[ENV]
+SERVER_MYPORT_3= @mysqld.3.port
+SERVER_MYSOCK_3= @mysqld.3.socket
diff --git a/mysql-test/suite/rpl/t/rpl_mdev26632.test b/mysql-test/suite/rpl/t/rpl_mdev26632.test
new file mode 100644
index 00000000000..842bae8234c
--- /dev/null
+++ b/mysql-test/suite/rpl/t/rpl_mdev26632.test
@@ -0,0 +1,109 @@
+--source include/have_innodb.inc
+--source include/have_binlog_format_mixed.inc
+
+--let $rpl_topology=1->2->3
+--source include/rpl_init.inc
+
+--echo *** Test GTID master switch in a topology with filtered events.
+--echo *** With --gtid-ignore-duplicate and --gtid-strict-mode, should allow
+--echo *** GTID connect at a GTID position that is filtered on the new master.
+
+--connection server_1
+
+ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
+CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (1,1);
+CREATE TABLE t3 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
+INSERT INTO t3 VALUES (1,1);
+INSERT INTO t1 VALUES (2,1);
+INSERT INTO t3 VALUES (2,1);
+--source include/save_master_gtid.inc
+
+--connection server_2
+CREATE TABLE t2 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
+INSERT INTO t2 VALUES (1,2);
+
+--let $slave_timeout= 10
+--source include/sync_with_master_gtid.inc
+--source include/save_master_gtid.inc
+
+--connection server_3
+--source include/sync_with_master_gtid.inc
+
+--echo *** Promote 3 as new master, demote 2 as slave of 3.
+--echo *** GTID position of 2 in domain 0 is filtered on 3.
+
+--connection server_2
+--source include/stop_slave.inc
+
+--connection server_3
+--source include/stop_slave.inc
+--replace_result $SERVER_MYPORT_1 SERVER_MYPORT_1
+eval CHANGE MASTER TO master_host = '127.0.0.1', master_port = $SERVER_MYPORT_1,
+ MASTER_USE_GTID=SLAVE_POS;
+
+--connection server_2
+--replace_result $SERVER_MYPORT_3 SERVER_MYPORT_3
+eval CHANGE MASTER TO master_host = '127.0.0.1', master_port = $SERVER_MYPORT_3,
+ MASTER_USE_GTID=SLAVE_POS;
+--source include/start_slave.inc
+
+--connection server_3
+--source include/start_slave.inc
+
+--connection server_1
+INSERT INTO t1 VALUES (3,1);
+INSERT INTO t3 VALUES (3,1);
+--source include/save_master_gtid.inc
+
+--connection server_3
+INSERT INTO t2 VALUES (2,2);
+
+--source include/sync_with_master_gtid.inc
+--source include/save_master_gtid.inc
+
+--connection server_2
+--source include/sync_with_master_gtid.inc
+
+SELECT * FROM t1 ORDER BY a;
+# Verify that table t3 is being filtered.
+--error 1146
+SELECT * FROM t3 ORDER BY a;
+SELECT * FROM t2 ORDER BY a;
+
+
+--echo *** Restore original topology.
+
+--connection server_3
+--source include/stop_slave.inc
+
+--connection server_2
+--source include/stop_slave.inc
+--replace_result $SERVER_MYPORT_1 SERVER_MYPORT_1
+eval CHANGE MASTER TO master_host = '127.0.0.1', master_port = $SERVER_MYPORT_1,
+ MASTER_USE_GTID=SLAVE_POS;
+--source include/start_slave.inc
+
+--connection server_3
+--replace_result $SERVER_MYPORT_2 SERVER_MYPORT_2
+eval CHANGE MASTER TO master_host = '127.0.0.1', master_port = $SERVER_MYPORT_2,
+ MASTER_USE_GTID=SLAVE_POS;
+--source include/start_slave.inc
+
+
+# Cleanup
+
+--connection server_1
+DROP TABLE t1;
+DROP TABLE t3;
+--source include/save_master_gtid.inc
+
+--connection server_2
+DROP TABLE t2;
+--source include/sync_with_master_gtid.inc
+--source include/save_master_gtid.inc
+
+--connection server_3
+--source include/sync_with_master_gtid.inc
+
+--source include/rpl_end.inc
diff --git a/sql/sql_repl.cc b/sql/sql_repl.cc
index 0d2e61f7f59..e3b6d5fb7f3 100644
--- a/sql/sql_repl.cc
+++ b/sql/sql_repl.cc
@@ -1823,13 +1823,19 @@ send_event_to_slave(binlog_send_info *info, Log_event_type event_type,
{
if (info->slave_gtid_strict_mode &&
event_gtid.seq_no > gtid->seq_no &&
- !(gtid_entry->flags & slave_connection_state::START_OWN_SLAVE_POS))
+ !(gtid_entry->flags & slave_connection_state::START_OWN_SLAVE_POS) &&
+ !info->slave_gtid_ignore_duplicates)
{
/*
In strict mode, it is an error if the slave requests to start
in a "hole" in the master's binlog: a GTID that does not
exist, even though both the prior and subsequent seq_no exists
for same domain_id and server_id.
+
+      But with --gtid-ignore-duplicates this is relaxed, as it implies
+      that we trust the sequence numbers across different server_ids.
+      Thus, we want to allow the slave to connect at the "hole", which
+      could eg. be a filtered event.
*/
info->error= ER_GTID_START_FROM_BINLOG_HOLE;
*error_gtid= *gtid;
--
2.30.2
[PATCH] MDEV-27436: binlog corruption (/tmp no space left on device at the same moment)
by Kristian Nielsen 26 Oct '23
This commit fixes several bugs in the error handling of disk-full errors
when writing the statement/transaction binlog caches:
1. If the error occurs during a non-transactional statement, the code
attempts to binlog the partially executed statement (as it cannot roll
back). The stmt_cache->error was still set from the disk full error. This
caused MYSQL_BIN_LOG::write_cache() to get an error while trying to read the
cache to copy it to the binlog. This was then wrongly interpreted as a disk
full error writing to the binlog file. As a result, a partial event group
containing just a GTID event (no query or commit) was binlogged. Fixed by
checking whether an error is set in the statement cache, and if so binlogging
an INCIDENT event instead of a corrupt event group (see the sketch below).
2. For LOAD DATA LOCAL INFILE, if a disk full error occurred while writing to
the statement cache, the code would attempt to abort and read-and-discard
any remaining data sent by the client. The discard code would however
continue trying to write data to the statement cache, and wrongly interpret
another disk full error as end-of-file from the client. This left the client
connection with extra pending data, which corrupted the communication for
the next command, and again caused a corrupt/incomplete event to be
binlogged. Fixed by restoring the default read function before reading any
remaining data from the client connection.
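To make the two fixes easier to follow, here is a deliberately simplified
sketch of the idea in plain C++ (the types and names below are illustrative
stand-ins, not the actual server code; the real changes are in the diff
further down):

  #include <cstddef>

  /* Fix (1): never copy a cache that already failed; binlog an incident. */
  struct BinlogCache
  {
    bool error;        // set when an earlier write (e.g. disk full) failed
  };

  enum GroupWriteResult { WROTE_GROUP, WROTE_INCIDENT };

  GroupWriteResult write_group_or_incident(BinlogCache *stmt_cache,
                                           BinlogCache *trx_cache,
                                           bool use_stmt, bool use_trx)
  {
    bool damaged= (use_stmt && stmt_cache->error) ||
                  (use_trx && trx_cache->error);
    if (!damaged)
      return WROTE_GROUP;          // normal path: GTID + caches + end event
    /*
      Clear the error so it is not later mistaken for an error _reading_ the
      cache, and write an INCIDENT event so the slave stops with a clear
      error instead of applying a truncated event group.
    */
    stmt_cache->error= trx_cache->error= false;
    return WROTE_INCIDENT;
  }

  /* Fix (2): unhook the logging read callback before draining the client. */
  struct ClientStream
  {
    int (*read_function)(ClientStream *, unsigned char *, std::size_t);
    int (*real_read_function)(ClientStream *, unsigned char *, std::size_t);
  };

  void skip_data_till_eof(ClientStream *s, bool binlog_open)
  {
    if (binlog_open)
      s->read_function= s->real_read_function; // stop feeding the failed cache
    unsigned char buf[512];
    while (s->read_function(s, buf, sizeof(buf)) > 0)
      ;                                        // drain and discard
  }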
Signed-off-by: Kristian Nielsen <knielsen(a)knielsen-hq.org>
---
.../rpl_binlog_cache_disk_full_loaddata.test | 45 +++++++
.../rpl/t/rpl_binlog_cache_disk_full_row.test | 59 +++++++++
sql/log.cc | 121 ++++++++++++------
sql/sql_load.cc | 4 +
sql/sql_repl.cc | 34 ++++-
5 files changed, 224 insertions(+), 39 deletions(-)
create mode 100644 mysql-test/suite/rpl/t/rpl_binlog_cache_disk_full_loaddata.test
create mode 100644 mysql-test/suite/rpl/t/rpl_binlog_cache_disk_full_row.test
diff --git a/mysql-test/suite/rpl/t/rpl_binlog_cache_disk_full_loaddata.test b/mysql-test/suite/rpl/t/rpl_binlog_cache_disk_full_loaddata.test
new file mode 100644
index 00000000000..be4399a52ac
--- /dev/null
+++ b/mysql-test/suite/rpl/t/rpl_binlog_cache_disk_full_loaddata.test
@@ -0,0 +1,45 @@
+--source include/have_binlog_format_statement.inc
+--source include/have_debug.inc
+--source include/master-slave.inc
+
+--connection master
+# Set a minimal cache size so a small transaction can trigger a spill to disk.
+SET @save_binlog_stmt_cache_size= @@GLOBAL.binlog_stmt_cache_size;
+SET GLOBAL binlog_stmt_cache_size= 4096;
+
+CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=MyISAM;
+
+FLUSH STATUS;
+SHOW STATUS LIKE "binlog_stmt_cache%";
+SET @old_dbug= @@SESSION.debug_dbug;
+SET SESSION debug_dbug="+d,load_data_binlog_cache_error";
+--error 3
+LOAD DATA CONCURRENT LOCAL INFILE 'std_data/bug30435_5k.txt'
+ REPLACE INTO TABLE t1 (a);
+SET SESSION debug_dbug= @old_dbug;
+SHOW STATUS LIKE "binlog_stmt_cache%";
+# The actual number of rows left after the disk full error may change as
+# binlog event sizes are modified. So here we just test that we get a partial
+# update from the LOAD DATA statement that gets the disk full error.
+SELECT IF(COUNT(*) > 0 AND COUNT(*) < 5000,
+ "ok",
+ CONCAT("ERROR! Row count ", COUNT(*), " not as expected for partially executed query"))
+ AS check_result
+ FROM t1;
+
+--save_master_pos
+
+--connection slave
+--let $slave_sql_errno= 1590
+--source include/wait_for_slave_sql_error_and_skip.inc
+
+--sync_with_master
+SELECT COUNT(*) FROM t1;
+
+# Cleanup
+
+--connection master
+SET GLOBAL binlog_stmt_cache_size= @save_binlog_stmt_cache_size;
+DROP TABLE t1;
+
+--source include/rpl_end.inc
diff --git a/mysql-test/suite/rpl/t/rpl_binlog_cache_disk_full_row.test b/mysql-test/suite/rpl/t/rpl_binlog_cache_disk_full_row.test
new file mode 100644
index 00000000000..eb67eca5071
--- /dev/null
+++ b/mysql-test/suite/rpl/t/rpl_binlog_cache_disk_full_row.test
@@ -0,0 +1,59 @@
+--source include/have_binlog_format_row.inc
+--source include/have_debug.inc
+--source include/master-slave.inc
+
+--connection master
+# Set a minimal cache size so a small transaction can trigger a spill to disk.
+SET @save_binlog_stmt_cache_size= @@GLOBAL.binlog_stmt_cache_size;
+SET GLOBAL binlog_stmt_cache_size= 4096;
+
+CREATE TABLE t1 (a INT PRIMARY KEY, b VARCHAR(255)) ENGINE=MyISAM;
+
+FLUSH STATUS;
+SHOW STATUS LIKE "binlog_stmt_cache%";
+INSERT INTO t1 VALUES (0, CONCAT("?", "-", REPEAT("x", 200)));
+INSERT INTO t1 SELECT a+1, CONCAT(a, "-", REPEAT("x", 200)) FROM t1;
+INSERT INTO t1 SELECT a+2, CONCAT(a, "-", REPEAT("x", 200)) FROM t1;
+INSERT INTO t1 SELECT a+4, CONCAT(a, "-", REPEAT("x", 200)) FROM t1;
+INSERT INTO t1 SELECT a+8, CONCAT(a, "-", REPEAT("x", 200)) FROM t1;
+INSERT INTO t1 SELECT a+16, CONCAT(a, "-", REPEAT("x", 200)) FROM t1;
+INSERT INTO t1 SELECT a+32, CONCAT(a, "-", REPEAT("x", 200)) FROM t1;
+INSERT INTO t1 SELECT a+64, CONCAT(a, "-", REPEAT("x", 200)) FROM t1;
+INSERT INTO t1 SELECT a+128, CONCAT(a, "-", REPEAT("x", 200)) FROM t1;
+SHOW STATUS LIKE "binlog_stmt_cache%";
+
+SET @old_dbug= @@SESSION.debug_dbug;
+SET SESSION debug_dbug="+d,simulate_disk_full_at_flush_pending";
+--error 3
+INSERT INTO t1 SELECT a+256, CONCAT(a, "-", REPEAT("x", 200)) FROM t1;
+SET SESSION debug_dbug= @old_dbug;
+SHOW STATUS LIKE "binlog_stmt_cache%";
+# The actual number of rows left after the disk full error may change as
+# binlog event sizes are modified. So here we just test that we get a partial
+# update from the last INSERT..SELECT that gets the disk full error.
+SELECT IF(COUNT(*) > 256 AND COUNT(*) < 512,
+ "ok",
+ CONCAT("ERROR! Row count ", COUNT(*), " not as expected for partially executed query"))
+ AS check_result
+ FROM t1;
+
+# An arbitrary extra event that helped expose the bug where a partial event
+# group was binlogged.
+ALTER TABLE t1 COMMENT '<mumble>';
+
+--save_master_pos
+
+--connection slave
+--let $slave_sql_errno= 1590
+--source include/wait_for_slave_sql_error_and_skip.inc
+
+--sync_with_master
+SELECT COUNT(*) FROM t1;
+
+# Cleanup
+
+--connection master
+SET GLOBAL binlog_stmt_cache_size= @save_binlog_stmt_cache_size;
+DROP TABLE t1;
+
+--source include/rpl_end.inc
diff --git a/sql/log.cc b/sql/log.cc
index e7292064747..8c93f9adf41 100644
--- a/sql/log.cc
+++ b/sql/log.cc
@@ -6005,8 +6005,17 @@ MYSQL_BIN_LOG::flush_and_set_pending_rows_event(THD *thd,
/*
Write pending event to the cache.
*/
+#ifndef DBUG_OFF
+ bool clear_dbug= false;
+#endif
DBUG_EXECUTE_IF("simulate_disk_full_at_flush_pending",
- {DBUG_SET("+d,simulate_file_write_error");});
+ {
+ if (my_b_tell(&cache_data->cache_log) > 10000)
+ {
+ DBUG_SET("+d,simulate_file_write_error");
+ clear_dbug= true;
+ }
+ });
if (writer.write(pending))
{
set_write_error(thd, is_transactional);
@@ -6016,9 +6025,17 @@ MYSQL_BIN_LOG::flush_and_set_pending_rows_event(THD *thd,
delete pending;
cache_data->set_pending(NULL);
DBUG_EXECUTE_IF("simulate_disk_full_at_flush_pending",
- {DBUG_SET("-d,simulate_file_write_error");});
+ {
+ if (clear_dbug)
+ DBUG_SET("-d,simulate_file_write_error");
+ });
DBUG_RETURN(1);
}
+ DBUG_EXECUTE_IF("simulate_disk_full_at_flush_pending",
+ {
+ if (clear_dbug)
+ DBUG_SET("-d,simulate_file_write_error");
+ });
delete pending;
}
@@ -8337,51 +8354,83 @@ MYSQL_BIN_LOG::write_transaction_or_stmt(group_commit_entry *entry,
binlog_cache_mngr *mngr= entry->cache_mngr;
DBUG_ENTER("MYSQL_BIN_LOG::write_transaction_or_stmt");
- if (write_gtid_event(entry->thd, false, entry->using_trx_cache, commit_id))
- DBUG_RETURN(ER_ERROR_ON_WRITE);
+ bool do_stmt= entry->using_stmt_cache && !mngr->stmt_cache.empty();
+ bool do_trx= entry->using_trx_cache && !mngr->trx_cache.empty();
+ IO_CACHE *stmt_cache= mngr->get_binlog_cache_log(FALSE);
+ IO_CACHE *trx_cache= mngr->get_binlog_cache_log(TRUE);
- if (entry->using_stmt_cache && !mngr->stmt_cache.empty() &&
- write_cache(entry->thd, mngr->get_binlog_cache_log(FALSE)))
+ if (likely(!( (do_stmt && stmt_cache->error) ||
+ (do_trx && trx_cache->error) )))
{
- entry->error_cache= &mngr->stmt_cache.cache_log;
- DBUG_RETURN(ER_ERROR_ON_WRITE);
- }
+ if (write_gtid_event(entry->thd, false, entry->using_trx_cache, commit_id))
+ DBUG_RETURN(ER_ERROR_ON_WRITE);
- if (entry->using_trx_cache && !mngr->trx_cache.empty())
- {
- DBUG_EXECUTE_IF("crash_before_writing_xid",
+ if (do_stmt &&
+ write_cache(entry->thd, mngr->get_binlog_cache_log(FALSE)))
+ {
+ entry->error_cache= &mngr->stmt_cache.cache_log;
+ DBUG_RETURN(ER_ERROR_ON_WRITE);
+ }
+
+ if (do_trx)
+ {
+ DBUG_EXECUTE_IF("crash_before_writing_xid",
+ {
+ if ((write_cache(entry->thd,
+ mngr->get_binlog_cache_log(TRUE))))
+ DBUG_PRINT("info", ("error writing binlog cache"));
+ else
+ flush_and_sync(0);
+
+ DBUG_PRINT("info", ("crashing before writing xid"));
+ DBUG_SUICIDE();
+ });
+
+ if (write_cache(entry->thd, mngr->get_binlog_cache_log(TRUE)))
+ {
+ entry->error_cache= &mngr->trx_cache.cache_log;
+ DBUG_RETURN(ER_ERROR_ON_WRITE);
+ }
+ }
+
+ DBUG_EXECUTE_IF("inject_error_writing_xid",
{
- if ((write_cache(entry->thd,
- mngr->get_binlog_cache_log(TRUE))))
- DBUG_PRINT("info", ("error writing binlog cache"));
- else
- flush_and_sync(0);
-
- DBUG_PRINT("info", ("crashing before writing xid"));
- DBUG_SUICIDE();
+ entry->error_cache= NULL;
+ errno= 28;
+ DBUG_RETURN(ER_ERROR_ON_WRITE);
});
- if (write_cache(entry->thd, mngr->get_binlog_cache_log(TRUE)))
+ if (write_event(entry->end_event))
{
- entry->error_cache= &mngr->trx_cache.cache_log;
+ entry->error_cache= NULL;
DBUG_RETURN(ER_ERROR_ON_WRITE);
}
+ status_var_add(entry->thd->status_var.binlog_bytes_written,
+ entry->end_event->data_written);
}
+ else
+ {
+ /*
+      If writing to the IO_CACHE caused an error, we mustn't flush it to
+      the main binlog, as it is probably corrupt/truncated.
- DBUG_EXECUTE_IF("inject_error_writing_xid",
- {
- entry->error_cache= NULL;
- errno= 28;
- DBUG_RETURN(ER_ERROR_ON_WRITE);
- });
+ We clear the error (otherwise it would be interpreted as an error
+ _reading_ the IO_CACHE).
- if (write_event(entry->end_event))
- {
- entry->error_cache= NULL;
- DBUG_RETURN(ER_ERROR_ON_WRITE);
+ And generate an incident event, if one wasn't set already.
+ */
+ stmt_cache->error= trx_cache->error= 0;
+ if (!entry->incident_event)
+ {
+ Incident_log_event inc_ev(entry->thd, INCIDENT_LOST_EVENTS,
+ &write_error_msg);
+ if (write_event(&inc_ev))
+ {
+ entry->error_cache= NULL;
+ DBUG_RETURN(ER_ERROR_ON_WRITE);
+ }
+ }
}
- status_var_add(entry->thd->status_var.binlog_bytes_written,
- entry->end_event->data_written);
if (entry->incident_event)
{
@@ -8392,12 +8441,12 @@ MYSQL_BIN_LOG::write_transaction_or_stmt(group_commit_entry *entry,
}
}
- if (unlikely(mngr->get_binlog_cache_log(FALSE)->error))
+ if (unlikely(do_stmt && stmt_cache->error))
{
entry->error_cache= &mngr->stmt_cache.cache_log;
DBUG_RETURN(ER_ERROR_ON_WRITE);
}
- if (unlikely(mngr->get_binlog_cache_log(TRUE)->error)) // Error on read
+ if (unlikely(do_trx && trx_cache->error)) // Error on read
{
entry->error_cache= &mngr->trx_cache.cache_log;
DBUG_RETURN(ER_ERROR_ON_WRITE);
diff --git a/sql/sql_load.cc b/sql/sql_load.cc
index 8264286a022..cc4361b0472 100644
--- a/sql/sql_load.cc
+++ b/sql/sql_load.cc
@@ -253,6 +253,10 @@ class READ_INFO: public Load_data_param
*/
void skip_data_till_eof()
{
+#ifndef EMBEDDED_LIBRARY
+ if (mysql_bin_log.is_open())
+ cache.read_function= cache.real_read_function;
+#endif
while (GET != my_b_EOF)
;
}
diff --git a/sql/sql_repl.cc b/sql/sql_repl.cc
index d9b93742195..0d2e61f7f59 100644
--- a/sql/sql_repl.cc
+++ b/sql/sql_repl.cc
@@ -4499,6 +4499,10 @@ int log_loaded_block(IO_CACHE* file, uchar *Buffer, size_t Count)
/* buffer contains position where we started last read */
uchar* buffer= (uchar*) my_b_get_buffer_start(file);
uint max_event_size= lf_info->thd->variables.max_allowed_packet;
+ int res;
+#ifndef DBUG_OFF
+ bool did_dbug_inject= false;
+#endif
if (lf_info->thd->is_current_stmt_binlog_format_row())
goto ret;
@@ -4506,6 +4510,19 @@ int log_loaded_block(IO_CACHE* file, uchar *Buffer, size_t Count)
lf_info->last_pos_in_file >= my_b_get_pos_in_file(file))
goto ret;
+ DBUG_EXECUTE_IF("load_data_binlog_cache_error",
+ {
+ /*
+ Simulate "disk full" error in the middle of writing to
+ the binlog cache.
+ */
+ if (lf_info->last_pos_in_file >= 2*4096)
+ {
+ DBUG_SET("+d,simulate_file_write_error");
+ did_dbug_inject= true;
+ }
+ };);
+
for (block_len= (uint) (my_b_get_bytes_in_buffer(file)); block_len > 0;
buffer += MY_MIN(block_len, max_event_size),
block_len -= MY_MIN(block_len, max_event_size))
@@ -4517,7 +4534,10 @@ int log_loaded_block(IO_CACHE* file, uchar *Buffer, size_t Count)
MY_MIN(block_len, max_event_size),
lf_info->log_delayed);
if (mysql_bin_log.write(&a))
- DBUG_RETURN(1);
+ {
+ res= 1;
+ goto err;
+ }
}
else
{
@@ -4526,12 +4546,20 @@ int log_loaded_block(IO_CACHE* file, uchar *Buffer, size_t Count)
MY_MIN(block_len, max_event_size),
lf_info->log_delayed);
if (mysql_bin_log.write(&b))
- DBUG_RETURN(1);
+ {
+ res= 1;
+ goto err;
+ }
lf_info->wrote_create_file= 1;
}
}
ret:
- int res= Buffer ? lf_info->real_read_function(file, Buffer, Count) : 0;
+ res= Buffer ? lf_info->real_read_function(file, Buffer, Count) : 0;
+err:
+#ifndef DBUG_OFF
+ if (did_dbug_inject)
+ DBUG_SET("-d,simulate_file_write_error");
+#endif
DBUG_RETURN(res);
}
--
2.30.2
22 Oct '23
From: Andrei <andrei.elkin(a)mariadb.com>
An XA-Prepare group of events
XA START xid
...
XA END xid
XA PREPARE xid
and its XA-"complete" terminator
XA COMMIT or
XA ROLLBACK
are now distributed Round-Robin across the slave parallel workers.
The former hash-based distribution policy was shown to contribute to
execution latency by creating a large queue - many times larger than
the size of the worker pool - of binlog-ordered transactions waiting
to commit.
Acronyms and notations used below:
XAP := XA-Prepare event or the whole prepared XA group of events
XAC := XA-"complete", which is a solitary group of events
|W| := the size of the slave worker pool
Subscripts like `_k' denote order in a corresponding sequence
(e.g. the binlog file).
KEY CHANGES:
The parallel slave
------------------
driver thread now maintains a list of the XAP:s currently being
processed. Its purpose is to avoid "wild" parallel execution of XA:s
with duplicate xids (unlikely, but within the user's rights).
The list is arranged as a sliding window of size 2*|W| to account for
the possibility of XAP_k -> XAP_k+2|W|-1 being the largest dependency
(in the group-of-events count sense).
Say k=1 and the number of workers |W| is 4. As transactions are distributed
Round-Robin, it is possible at runtime to have T^*_1 -> T^*_8 as the largest
dependency ('*' marks the dependents).
This can be seen from the worker queues, as in the picture below,
where the worker queues Q_i grow downward:
Q1 ... Q4
1^* 2 3 4
5 6 7 8^*
Worker #1 is assigned T_1 and T_5.
Worker #4 can take on its T_8 while T_1 is still at the
beginning of its processing, so even before the XA START of that XAP.
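To make the 2*|W| window arithmetic concrete, here is a small
illustrative sketch (not the patch's actual data structures, just the
invariant the sliding window maintains):

  #include <cstddef>
  #include <deque>
  #include <string>

  struct XidWindow
  {
    std::size_t workers;            // |W|
    std::deque<std::string> xids;   // XAP:s currently tracked, oldest first

    explicit XidWindow(std::size_t w) : workers(w) {}

    // True if an XAP with the same xid is still inside the window, i.e.
    // the new one must wait for the earlier one to release the xid.
    bool must_wait(const std::string &xid) const
    {
      for (const std::string &x : xids)
        if (x == xid)
          return true;
      return false;
    }

    // Register a newly scheduled XAP; the window slides forward so that
    // at most 2*|W| XAP:s are tracked at any time, which covers the
    // largest possible in-flight dependency XAP_k -> XAP_k+2|W|-1.
    void add(const std::string &xid)
    {
      xids.push_back(xid);
      if (xids.size() > 2 * workers)
        xids.pop_front();
    }
  };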
XA related
----------
XID_cache_element is extended with two pointers to resolve
two types of dependencies: the duplicate xid XAP_k -> XAP_k+i
and the ordinary completion on the prepare XAP_k -> XAC_k+j.
The former is handled by a wait-for-xid protocol conducted by
xid_cache_delete() and xid_cache_insert_maybe_wait().
The latter is handled analogously by xid_cache_search_maybe_wait() and
slave_applier_reset_xa_trans().
An XA-"complete" is allowed to go forward before its XAP parent
has released the xid (all recovery concerns are covered in MDEV-21496,
MDEV-21777).
Yet the XAC will wait for it at a critical point of execution,
namely when "completing" the work in the engine.
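Both wait protocols boil down to "block until the owning XAP releases
the xid". A minimal condition-variable sketch of that idea (the real
implementation lives in XID_cache_element and the functions named
above; these names are simplified stand-ins):

  #include <condition_variable>
  #include <mutex>

  struct XidSlot
  {
    std::mutex m;
    std::condition_variable released;
    bool owned= false;   // an earlier XAP still holds this xid

    // Called by a later XAP with a duplicate xid, or by the completing
    // XAC at its critical point, before touching the engine state.
    void wait_until_released()
    {
      std::unique_lock<std::mutex> lk(m);
      released.wait(lk, [this] { return !owned; });
    }

    // Called by the owning XAP once it has prepared in all engines and
    // released the xid.
    void release()
    {
      {
        std::lock_guard<std::mutex> lk(m);
        owned= false;
      }
      released.notify_all();
    }
  };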
CAVEAT: the storage/innobase/trx/trx0undo.cc changes are due to
MDEV-32144, which is possibly already fixed;
TODO: to be verified.
Thanks to Brandon Nesterenko at mariadb.com for the initial review and
a lot of creative effort to advance this work!
---
mysql-test/include/show_binlog_events2.inc | 11 +
.../binlog/r/binlog_xa_prepared_bugs.result | 53 +
.../binlog/t/binlog_xa_prepared_bugs.test | 30 +
.../rpl/include/rpl_xa_concurrent_2pc.inc | 442 ++++++++
.../suite/rpl/r/rpl_xa_concurrent_2pc.result | 953 ++++++++++++++++++
.../rpl/r/rpl_xa_empty_transaction.result | 51 +
.../rpl/r/rpl_xa_prepare_gtid_fail.result | 2 +-
.../suite/rpl/t/rpl_xa_concurrent_2pc.test | 111 ++
.../suite/rpl/t/rpl_xa_empty_transaction.test | 89 ++
.../suite/rpl/t/rpl_xa_prepare_gtid_fail.test | 2 +-
sql/handler.cc | 23 +-
sql/log.cc | 219 +++-
sql/log_event_server.cc | 17 +-
sql/mysqld.cc | 4 +-
sql/mysqld.h | 4 +-
sql/rpl_parallel.cc | 128 ++-
sql/rpl_parallel.h | 16 +
sql/rpl_rli.cc | 6 +-
sql/rpl_rli.h | 8 +
sql/sql_array.h | 4 +-
sql/sql_class.cc | 2 +
sql/sql_class.h | 9 +
sql/xa.cc | 333 +++++-
storage/innobase/trx/trx0undo.cc | 3 +-
24 files changed, 2419 insertions(+), 101 deletions(-)
create mode 100644 mysql-test/suite/binlog/r/binlog_xa_prepared_bugs.result
create mode 100644 mysql-test/suite/binlog/t/binlog_xa_prepared_bugs.test
create mode 100644 mysql-test/suite/rpl/include/rpl_xa_concurrent_2pc.inc
create mode 100644 mysql-test/suite/rpl/r/rpl_xa_concurrent_2pc.result
create mode 100644 mysql-test/suite/rpl/t/rpl_xa_concurrent_2pc.test
diff --git a/mysql-test/include/show_binlog_events2.inc b/mysql-test/include/show_binlog_events2.inc
index 84c62cced66..416514faea4 100644
--- a/mysql-test/include/show_binlog_events2.inc
+++ b/mysql-test/include/show_binlog_events2.inc
@@ -1,3 +1,9 @@
+# ==== Usage ====
+#
+# [--let $binlog_file= [<FILENAME> | LAST]]
+# [--let $binlog_start= <POSITION> ]
+# [--let $filter_cid= [0 | 1]]
+
if ($binlog_start)
{
--let $_binlog_start=$binlog_start
@@ -14,4 +20,9 @@ if ($binlog_file)
--replace_result "$_from_binlog_start" "from <binlog_start>" $MYSQLTEST_VARDIR MYSQLTEST_VARDIR
--replace_column 2 # 5 #
--replace_regex /\/\* xid=.* \*\//\/* XID *\// /table_id: [0-9]+/table_id: #/ /file_id=[0-9]+/file_id=#/ /GTID [0-9]+-[0-9]+-[0-9]+/GTID #-#-#/
+if ($filter_cid)
+{
+--replace_regex /\/\* xid=.* \*\//\/* XID *\// /table_id: [0-9]+/table_id: #/ /file_id=[0-9]+/file_id=#/ /GTID [0-9]+-[0-9]+-[0-9]+/GTID #-#-#/ / cid=[0-9]+//
+
+}
--eval show binlog events $_in_binlog_file from $_binlog_start
diff --git a/mysql-test/suite/binlog/r/binlog_xa_prepared_bugs.result b/mysql-test/suite/binlog/r/binlog_xa_prepared_bugs.result
new file mode 100644
index 00000000000..35b7accfb24
--- /dev/null
+++ b/mysql-test/suite/binlog/r/binlog_xa_prepared_bugs.result
@@ -0,0 +1,53 @@
+CREATE TABLE ta (c INT KEY) engine=Aria;
+XA START 'xid_a';
+INSERT INTO ta VALUES (1);
+XA END 'xid_a';
+XA PREPARE 'xid_a';
+Warnings:
+Warning 1030 Got error 131 "Command not supported by the engine" from storage engine Aria
+LOAD INDEX INTO CACHE c KEY(PRIMARY);
+Table Op Msg_type Msg_text
+test.c preload_keys Error XAER_RMFAIL: The command cannot be executed when global transaction is in the PREPARED state
+test.c preload_keys Error XAER_RMFAIL: The command cannot be executed when global transaction is in the PREPARED state
+test.c preload_keys error Corrupt
+Warnings:
+Warning 1196 Some non-transactional changed tables couldn't be rolled back
+XA ROLLBACK 'xid_a';
+CREATE TABLE ti (c INT KEY) engine=Innodb;
+XA START 'xid_i';
+INSERT INTO ti VALUES (1);
+XA END 'xid_i';
+XA PREPARE 'xid_i';
+LOAD INDEX INTO CACHE c KEY(PRIMARY);
+Table Op Msg_type Msg_text
+test.c preload_keys Error XAER_RMFAIL: The command cannot be executed when global transaction is in the PREPARED state
+test.c preload_keys Error XAER_RMFAIL: The command cannot be executed when global transaction is in the PREPARED state
+test.c preload_keys error Corrupt
+XA COMMIT 'xid_i';
+SELECT * FROM ti;
+c
+include/show_binlog_events.inc
+Log_name Pos Event_type Server_id End_log_pos Info
+master-bin.000001 # Gtid # # GTID #-#-#
+master-bin.000001 # Query # # use `test`; CREATE TABLE ta (c INT KEY) engine=Aria
+master-bin.000001 # Gtid # # BEGIN GTID #-#-#
+master-bin.000001 # Annotate_rows # # INSERT INTO ta VALUES (1)
+master-bin.000001 # Table_map # # table_id: # (test.ta)
+master-bin.000001 # Write_rows_v1 # # table_id: # flags: STMT_END_F
+master-bin.000001 # Query # # COMMIT
+master-bin.000001 # Gtid # # XA START X'7869645f61',X'',1 GTID #-#-#
+master-bin.000001 # Query # # XA END X'7869645f61',X'',1
+master-bin.000001 # XA_prepare # # XA PREPARE X'7869645f61',X'',1
+master-bin.000001 # Gtid # # GTID #-#-#
+master-bin.000001 # Query # # XA ROLLBACK X'7869645f61',X'',1
+master-bin.000001 # Gtid # # GTID #-#-#
+master-bin.000001 # Query # # use `test`; CREATE TABLE ti (c INT KEY) engine=Innodb
+master-bin.000001 # Gtid # # XA START X'7869645f69',X'',1 GTID #-#-#
+master-bin.000001 # Annotate_rows # # INSERT INTO ti VALUES (1)
+master-bin.000001 # Table_map # # table_id: # (test.ti)
+master-bin.000001 # Write_rows_v1 # # table_id: # flags: STMT_END_F
+master-bin.000001 # Query # # XA END X'7869645f69',X'',1
+master-bin.000001 # XA_prepare # # XA PREPARE X'7869645f69',X'',1
+master-bin.000001 # Gtid # # GTID #-#-#
+master-bin.000001 # Query # # XA ROLLBACK X'7869645f69',X'',1
+drop table ta,ti;
diff --git a/mysql-test/suite/binlog/t/binlog_xa_prepared_bugs.test b/mysql-test/suite/binlog/t/binlog_xa_prepared_bugs.test
new file mode 100644
index 00000000000..3a5bb15968e
--- /dev/null
+++ b/mysql-test/suite/binlog/t/binlog_xa_prepared_bugs.test
@@ -0,0 +1,30 @@
+--source include/have_binlog_format_row.inc
+--source include/have_innodb.inc
+
+CREATE TABLE ta (c INT KEY) engine=Aria;
+XA START 'xid_a';
+INSERT INTO ta VALUES (1);
+XA END 'xid_a';
+XA PREPARE 'xid_a';
+
+#--error ER_XAER_RMFAIL
+LOAD INDEX INTO CACHE c KEY(PRIMARY);
+
+XA ROLLBACK 'xid_a';
+
+CREATE TABLE ti (c INT KEY) engine=Innodb;
+XA START 'xid_i';
+INSERT INTO ti VALUES (1);
+XA END 'xid_i';
+XA PREPARE 'xid_i';
+
+# --error ER_XAER_RMFAIL
+LOAD INDEX INTO CACHE c KEY(PRIMARY);
+
+XA COMMIT 'xid_i';
+SELECT * FROM ti;
+
+#
+--source include/show_binlog_events.inc
+
+drop table ta,ti;
diff --git a/mysql-test/suite/rpl/include/rpl_xa_concurrent_2pc.inc b/mysql-test/suite/rpl/include/rpl_xa_concurrent_2pc.inc
new file mode 100644
index 00000000000..d1f1868d2c9
--- /dev/null
+++ b/mysql-test/suite/rpl/include/rpl_xa_concurrent_2pc.inc
@@ -0,0 +1,442 @@
+#
+# Helper file to run the 1-4(a,b) test cases for rpl_xa_concurrent_2pc,
+# with either XA COMMIT or XA ROLLBACK used to complete XA transactions.
+#
+# Parameters
+# $xa_complete_sym (string) : COMMIT or ROLLBACK, the action used to complete
+# a prepared XA transaction
+#
+
+if (!$xa_complete_sym)
+{
+ die MTR variable xa_complete_sym not specified, must be either COMMIT or ROLLBACK;
+}
+
+--let $is_xac= 0
+--let $is_xar= 0
+
+if (`SELECT strcmp("COMMIT", "$xa_complete_sym") = 0`)
+{
+ --let $is_xac= 1
+}
+
+if (`SELECT strcmp("ROLLBACK", "$xa_complete_sym") = 0`)
+{
+ --let $is_xar= 1
+}
+
+if (`SELECT !$is_xar && !$is_xac`)
+{
+ die MTR variable xa_complete_sym invalid, must be either COMMIT or ROLLBACK;
+}
+
+
+--echo #
+--echo # Initialize test data
+--connection slave
+--source include/stop_slave.inc
+RESET SLAVE;
+set @@global.gtid_slave_pos= "";
+
+if ($is_xac)
+{
+--connection slave
+RESET MASTER;
+
+--connection master
+RESET MASTER;
+}
+
+--connection master
+create table t1 (a int primary key, b int) engine=innodb;
+
+# Slave locks this row before updates to pause transaction progress
+--let $hold_row= -1
+--let $t1_ctr= 0
+--eval insert into t1 values ($hold_row, 0)
+
+--source include/save_master_gtid.inc
+
+--connection slave
+--source include/start_slave.inc
+--source include/sync_with_master_gtid.inc
+--source include/stop_slave.inc
+set @save_debug= @@GLOBAL.debug_dbug;
+set @save_par_thds= @@GLOBAL.slave_parallel_threads;
+set @save_par_mode= @@GLOBAL.slave_parallel_mode;
+set @@GLOBAL.slave_parallel_threads= 4;
+set @@GLOBAL.slave_parallel_mode= optimistic;
+
+set statement sql_log_bin=0 for call mtr.add_suppression("Commit failed due to failure of an earlier commit on which this one depends");
+
+
+--echo #
+--echo # Test Case 1: Ensure that a 2-phase XA transaction has its XA PREPARE
+--echo # and XA $xa_complete_sym run concurrently. That is, the
+--echo # XA $xa_complete_sym will wait at group commit until the XA PREPARE
+--echo # binlogs, and then it will wait again until the XA PREPARE finishes
+--echo # preparing in all engines. At this point, the XA $xa_complete_sym will
+--echo # run to completion.
+--connection master
+# For worker thread to hold XAP at dequeue time via debug_sync through
+# `hold_worker_on_schedule`.
+set @@session.gtid_seq_no= 100;
+XA START 'x';
+--eval insert into t1 values ($t1_ctr, 0)
+--inc $t1_ctr
+XA END 'x';
+XA PREPARE 'x';
+--eval XA $xa_complete_sym 'x'
+--source include/save_master_gtid.inc
+
+--connection slave
+# For worker to stop at dequeue event time and after binlogging XA PREPARE
+set @@global.debug_dbug= "+d,hold_worker_on_schedule,stop_after_binlog_prepare";
+--source include/start_slave.inc
+
+--echo # Waiting for XAP to pause when it is pulled from the queue
+set debug_sync= "now wait_for reached_pause";
+
+--echo # Before the XA PREPARE executes, the XA $xa_complete_sym should wait in group commit..
+--let $wait_condition=SELECT count(*) = 1 FROM information_schema.processlist WHERE command = 'Slave_worker' AND state LIKE "Waiting for prior transaction to commit"
+--source include/wait_condition.inc
+--echo # ..done
+
+
+--echo # Execute the XA PREPARE
+set debug_sync= "now signal continue_worker";
+
+--echo # Wait for XA PREPARE to have binlogged, but hold it before it prepares in engines
+set debug_sync= "now wait_for xa_prepare_binlogged";
+
+--echo # The XA $xa_complete_sym should move on from binlog to wait for the XA PREPARE to complete in engines
+--let $wait_condition=SELECT count(*) = 1 FROM information_schema.processlist WHERE command = 'Slave_worker' AND state LIKE "Waiting for prior xa transaction"
+--source include/wait_condition.inc
+--echo # ..done
+
+--echo # Signal the XAP to complete in engines (which will automatically signal XAC)
+set debug_sync= "now signal continue_xap";
+
+--source include/sync_with_master_gtid.inc
+
+--let $diff_tables=master:test.t1, slave:test.t1
+--source include/diff_tables.inc
+
+--connection slave
+--source include/stop_slave.inc
+set @@global.debug_dbug= @save_debug;
+
+
+--echo #
+--echo # Test Case 2: If two XA $xa_complete_sym transactions have different
+--echo # XIDs, ensure both phases of both transactions all execute concurrently.
+--echo #
+
+--echo # Ensure slave is stopped
+--connection slave
+--source include/wait_for_slave_to_stop.inc
+
+# Stop both XAP after their binlogging and before their engine changing
+set @@global.debug_dbug= "+d,stop_before_binlog_prepare,stop_after_binlog_prepare";
+
+--connection master
+XA START 'x1';
+--eval insert into t1 values ($t1_ctr, 0)
+--inc $t1_ctr
+XA END 'x1';
+XA PREPARE 'x1';
+--eval XA $xa_complete_sym 'x1'
+
+XA START 'x2';
+--eval insert into t1 values ($t1_ctr, 0)
+--inc $t1_ctr
+XA END 'x2';
+XA PREPARE 'x2';
+--eval XA $xa_complete_sym 'x2'
+--source include/save_master_gtid.inc
+
+--connection slave
+--source include/start_slave.inc
+
+# This stage is necessary to avoid an XAP_1 <-register-> XAC_2 race, in which
+# XAC_2 may get stuck in the below WFPT2C state the whole time until XAP_1 has finished.
+# Prove that the workers' states are as follows:
+--let $count_wait= 2
+--let $wait_condition=SELECT count(*) = $count_wait FROM information_schema.processlist WHERE command = 'Slave_worker' AND state LIKE "Waiting for prior transaction to commit"
+--source include/wait_condition.inc
+
+--let $count_wait= 2
+--let $wait_condition=SELECT count(*) = $count_wait FROM information_schema.processlist WHERE command = 'Slave_worker' AND state LIKE "debug sync point: now"
+--source include/wait_condition.inc
+set debug_sync= "now signal binlog_xap";
+
+# Wait for the two XAP:s to arrive at their stations (XAP_1 at the next one after binlog is done)
+--let $wait_condition=SELECT count(*) >= 1 FROM information_schema.processlist WHERE command = 'Slave_worker' AND state LIKE "debug sync point: now"
+--source include/wait_condition.inc
+set debug_sync= "now signal binlog_xap";
+
+--echo # Ensuring both phases of both transactions all execute concurrently
+# Waiting for both XA "COMPLETE"s to binlog proves this, as they would not pass
+# group commit if their preceding XA PREPAREs had not also binlogged
+--let $count_xa_wait_workers= 2
+
+--let $wait_condition=SELECT count(*) = $count_xa_wait_workers FROM information_schema.processlist WHERE command = 'Slave_worker' AND state LIKE "Waiting for prior xa transaction"
+--source include/wait_condition.inc
+--echo # ..done
+
+--echo # Verify XA PREPARE has binlogged
+set debug_sync= "now wait_for xa_prepare_binlogged";
+
+--echo # Signal the XAPs to complete in engines (which will automatically signal XACs)
+set debug_sync= "now signal continue_xap";
+
+--let $wait_condition=SELECT count(*) = 1 FROM information_schema.processlist WHERE command = 'Slave_worker' AND state LIKE "debug sync point: now"
+--source include/wait_condition.inc
+set debug_sync= "now signal continue_xap";
+
+--source include/sync_with_master_gtid.inc
+
+--let $diff_tables=master:test.t1, slave:test.t1
+--source include/diff_tables.inc
+
+--connection slave
+--source include/stop_slave.inc
+set @@global.debug_dbug= @save_debug;
+
+
+--echo #
+--echo # Test Case 3: Two concurrent 2-phase XA transactions with matching XIDs
+--echo # should run one after the other, while each transaction still allows
+--echo # its XA PREPARE and XA $xa_complete_sym to run concurrently
+
+--echo # Ensure slave is stopped
+--connection slave
+--source include/wait_for_slave_to_stop.inc
+
+# Stop both XAP after their binlogging and before their engine changing
+set @@global.debug_dbug= "+d,stop_before_binlog_prepare,stop_after_binlog_prepare,stop_after_binlog_cor_by_xid";
+
+--connection master
+XA START 'x';
+--eval insert into t1 values ($t1_ctr, 0)
+--inc $t1_ctr
+XA END 'x';
+XA PREPARE 'x';
+--eval XA $xa_complete_sym 'x'
+
+XA START 'x';
+--eval insert into t1 values ($t1_ctr, 0)
+--inc $t1_ctr
+XA END 'x';
+XA PREPARE 'x';
+--eval XA $xa_complete_sym 'x'
+--source include/save_master_gtid.inc
+
+--connection slave
+--source include/start_slave.inc
+
+# This stage is necessary to avoid an XAP_1 <-register-> XAC_2 race, in which
+# XAC_2 may get stuck in the below WFPT2C state the whole time until XAP_1 has finished.
+--let $count_wait= 3
+--let $wait_condition=SELECT count(*) = $count_wait FROM information_schema.processlist WHERE command = 'Slave_worker' AND state LIKE "Waiting for prior transaction to commit"
+
+--source include/wait_condition.inc
+--let $wait_condition=SELECT count(*) = 1 FROM information_schema.processlist WHERE command = 'Slave_worker' AND state LIKE "debug sync point: now"
+--source include/wait_condition.inc
+set debug_sync= "now signal binlog_xap";
+
+--echo # Verify first XA PREPARE has binlogged
+set debug_sync= "now wait_for xa_prepare_binlogged";
+
+--echo # Ensure first XA transaction is running concurrently
+--let $wait_condition=SELECT count(*) = 1 FROM information_schema.processlist WHERE command = 'Slave_worker' AND state LIKE "Waiting for prior xa transaction" AND info LIKE "XA $xa_complete_sym%"
+--source include/wait_condition.inc
+
+--echo # Ensure second XA transaction's XAP waits for the first transaction
+--let $wait_condition=SELECT count(*) = 1 FROM information_schema.processlist WHERE command = 'Slave_worker' AND state LIKE "Waiting for prior xa transaction" AND info LIKE "XA START%"
+--source include/wait_condition.inc
+
+--echo # Signal first XA PREPARE to complete
+set debug_sync= "now signal continue_xap";
+
+--echo # Wait for first XA $xa_complete_sym to binlog
+set debug_sync= "now wait_for xa_cor_binlogged";
+
+--echo # Ensure second XA PREPARE doesn't begin yet because the XAC hadn't released its XID
+--let $wait_condition=SELECT count(*) = 1 FROM information_schema.processlist WHERE command = 'Slave_worker' AND state LIKE "Waiting for prior xa transaction"
+--source include/wait_condition.inc
+
+--echo # Signal first XA $xa_complete_sym to complete
+set debug_sync= "now signal continue_xa_cor";
+
+--echo # Wait for second XA PREPARE to be binlogged
+--echo # First pass through binlog_xap
+--let $wait_condition=SELECT count(*) = 1 FROM information_schema.processlist WHERE command = 'Slave_worker' AND state LIKE "debug sync point: now"
+--source include/wait_condition.inc
+set debug_sync= "now signal binlog_xap";
+set debug_sync= "now wait_for xa_prepare_binlogged";
+
+--echo # Ensure second XA $xa_complete_sym is concurrent with XAP
+--let $wait_condition=SELECT count(*) = 1 FROM information_schema.processlist WHERE command = 'Slave_worker' AND state LIKE "Waiting for prior xa transaction"
+--source include/wait_condition.inc
+
+--echo # Signal second XA transaction to complete
+set debug_sync= "now signal continue_xap";
+set debug_sync= "now wait_for xa_cor_binlogged";
+set debug_sync= "now signal continue_xa_cor";
+
+--source include/sync_with_master_gtid.inc
+
+--let $diff_tables=master:test.t1, slave:test.t1
+--source include/diff_tables.inc
+
+--connection slave
+--source include/stop_slave.inc
+set @@global.debug_dbug= @save_debug;
+
+
+--echo #
+--echo # Test Case 4 (Error Case): If an XA PREPARE errors while its
+--echo # XA $xa_complete_sym is waiting on it, both phases should rollback
+--echo # successfully. Note this tests both:
+--echo # a) XA $xa_complete_sym is waiting in group commit (first phase
+--echo # times out in DMLs)
+--echo # b) XA $xa_complete_sym is waiting in group commit, with another XAP
+--echo # with a duplicate XID waiting on it.
+
+--echo # Case a)
+--echo # Ensure slave is stopped
+--connection slave
+--source include/wait_for_slave_to_stop.inc
+set @save_lock_wait_timeout= @@GLOBAL.innodb_lock_wait_timeout;
+set @save_trans_retries= @@GLOBAL.slave_transaction_retries;
+set @@global.innodb_lock_wait_timeout= 1;
+set @@global.slave_transaction_retries= 0;
+
+--connection master
+XA START 'x';
+--eval update t1 set b=b+1 where a=$hold_row
+XA END 'x';
+XA PREPARE 'x';
+--eval XA $xa_complete_sym 'x'
+--source include/save_master_gtid.inc
+
+--connection slave1
+BEGIN;
+--eval select * from t1 where a=$hold_row for update;
+
+--connection slave
+--source include/start_slave.inc
+
+--let $slave_sql_errno= 1205
+--source include/wait_for_slave_sql_error.inc
+
+--connection slave1
+ROLLBACK;
+
+--connection slave
+# Stop the IO thread too
+--source include/stop_slave_io.inc
+set @@global.innodb_lock_wait_timeout= @save_lock_wait_timeout;
+set @@global.slave_transaction_retries= @save_trans_retries;
+
+--echo # Ensure on slave restart, we can re-execute the XA transaction
+--source include/start_slave.inc
+--source include/save_master_gtid.inc
+--source include/stop_slave.inc
+set @@global.debug_dbug= @save_debug;
+
+
+--echo # Case b)
+--echo # Ensure slave is stopped
+--connection slave
+--source include/wait_for_slave_to_stop.inc
+set @save_lock_wait_timeout= @@GLOBAL.innodb_lock_wait_timeout;
+set @save_trans_retries= @@GLOBAL.slave_transaction_retries;
+set @@global.innodb_lock_wait_timeout= 1;
+set @@global.slave_transaction_retries= 0;
+
+--connection master
+XA START 'x';
+--eval update t1 set b=b+1 where a=$hold_row
+XA END 'x';
+XA PREPARE 'x';
+--eval XA $xa_complete_sym 'x'
+
+XA START 'x';
+--eval insert into t1 values ($t1_ctr, 0)
+--let $new_row_idx= $t1_ctr
+--inc $t1_ctr
+XA END 'x';
+XA PREPARE 'x';
+--source include/save_master_gtid.inc
+--eval XA $xa_complete_sym 'x'
+
+--connection slave1
+BEGIN;
+--eval select * from t1 where a=$hold_row for update;
+
+--connection slave
+--source include/start_slave.inc
+
+--let $slave_sql_errno= 1205
+--source include/wait_for_slave_sql_error.inc
+
+--connection slave1
+ROLLBACK;
+
+--echo # There should not be any prepared rows seen by XA RECOVER
+XA RECOVER;
+
+--echo # Ensuring data from second XAP isn't visible..
+if (`select count(*) from t1 where a=$new_row_idx`)
+{
+ --die Failed, row exists
+}
+--echo # ..done
+
+--connection slave
+--source include/stop_slave_io.inc
+set @@global.innodb_lock_wait_timeout= @save_lock_wait_timeout;
+set @@global.slave_transaction_retries= @save_trans_retries;
+
+--echo # Ensure on slave restart, we can re-execute the XA transaction
+--source include/start_slave.inc
+--source include/save_master_gtid.inc
+--source include/stop_slave.inc
+set @@global.debug_dbug= @save_debug;
+--source include/start_slave.inc
+
+--echo # Ensuring data from second XAP is visible..
+if ($is_xac)
+{
+ --let $expected_row_count= 1
+}
+if ($is_xar)
+{
+ --let $expected_row_count= 0
+}
+if (`select count(*) != $expected_row_count from t1 where a=$new_row_idx`)
+{
+ --die Failed, XA $xa_complete_sym was not observed
+}
+--echo # ..done
+
+--echo #
+--echo # Cleanup
+--connection master
+DROP TABLE t1;
+--source include/save_master_gtid.inc
+--let $binlog_file=query_get_value(SHOW MASTER STATUS, File, 1)
+--source include/show_binlog_events.inc
+
+--connection slave
+--source include/sync_with_master_gtid.inc
+--let $binlog_file=query_get_value(SHOW MASTER STATUS, File, 1)
+--let $filter_cid=1
+--source include/show_binlog_events2.inc
+
+--source include/stop_slave.inc
+set @@GLOBAL.slave_parallel_threads= @save_par_thds;
+set @@GLOBAL.slave_parallel_mode= @save_par_mode;
+--source include/start_slave.inc
diff --git a/mysql-test/suite/rpl/r/rpl_xa_concurrent_2pc.result b/mysql-test/suite/rpl/r/rpl_xa_concurrent_2pc.result
new file mode 100644
index 00000000000..d30943b23da
--- /dev/null
+++ b/mysql-test/suite/rpl/r/rpl_xa_concurrent_2pc.result
@@ -0,0 +1,953 @@
+include/master-slave.inc
+[connection master]
+#
+# Initialize test data
+connection slave;
+include/stop_slave.inc
+RESET SLAVE;
+set @@global.gtid_slave_pos= "";
+connection slave;
+RESET MASTER;
+connection master;
+RESET MASTER;
+connection master;
+create table t1 (a int primary key, b int) engine=innodb;
+insert into t1 values (-1, 0);
+include/save_master_gtid.inc
+connection slave;
+include/start_slave.inc
+include/sync_with_master_gtid.inc
+include/stop_slave.inc
+set @save_debug= @@GLOBAL.debug_dbug;
+set @save_par_thds= @@GLOBAL.slave_parallel_threads;
+set @save_par_mode= @@GLOBAL.slave_parallel_mode;
+set @@GLOBAL.slave_parallel_threads= 4;
+set @@GLOBAL.slave_parallel_mode= optimistic;
+set statement sql_log_bin=0 for call mtr.add_suppression("Commit failed due to failure of an earlier commit on which this one depends");
+#
+# Test Case 1: Ensure that a 2-phase XA transaction has its XA PREPARE
+# and XA COMMIT run concurrently. That is, the
+# XA COMMIT will wait at group commit until the XA PREPARE
+# binlogs, and then it will wait again until the XA PREPARE finishes
+# preparing in all engines. At this point, the XA COMMIT will
+# run to completion.
+connection master;
+set @@session.gtid_seq_no= 100;
+XA START 'x';
+insert into t1 values (0, 0);
+XA END 'x';
+XA PREPARE 'x';
+XA COMMIT 'x';
+include/save_master_gtid.inc
+connection slave;
+set @@global.debug_dbug= "+d,hold_worker_on_schedule,stop_after_binlog_prepare";
+include/start_slave.inc
+# Waiting for XAP to pause when it is pulled from the queue
+set debug_sync= "now wait_for reached_pause";
+# Before the XA PREPARE executes, the XA COMMIT should wait in group commit..
+# ..done
+# Execute the XA PREPARE
+set debug_sync= "now signal continue_worker";
+# Wait for XA PREPARE to have binlogged, but hold it before it prepares in engines
+set debug_sync= "now wait_for xa_prepare_binlogged";
+# The XA COMMIT should move on from binlog to wait for the XA PREPARE to complete in engines
+# ..done
+# Signal the XAP to complete in engines (which will automatically signal XAC)
+set debug_sync= "now signal continue_xap";
+include/sync_with_master_gtid.inc
+include/diff_tables.inc [master:test.t1, slave:test.t1]
+connection slave;
+include/stop_slave.inc
+set @@global.debug_dbug= @save_debug;
+#
+# Test Case 2: If two XA COMMIT transactions have different
+# XIDs, ensure both phases of both transactions all execute concurrently.
+#
+# Ensure slave is stopped
+connection slave;
+include/wait_for_slave_to_stop.inc
+set @@global.debug_dbug= "+d,stop_before_binlog_prepare,stop_after_binlog_prepare";
+connection master;
+XA START 'x1';
+insert into t1 values (1, 0);
+XA END 'x1';
+XA PREPARE 'x1';
+XA COMMIT 'x1';
+XA START 'x2';
+insert into t1 values (2, 0);
+XA END 'x2';
+XA PREPARE 'x2';
+XA COMMIT 'x2';
+include/save_master_gtid.inc
+connection slave;
+include/start_slave.inc
+set debug_sync= "now signal binlog_xap";
+set debug_sync= "now signal binlog_xap";
+# Ensuring both phases of both transactions all execute concurrently
+# ..done
+# Verify XA PREPARE has binlogged
+set debug_sync= "now wait_for xa_prepare_binlogged";
+# Signal the XAPs to complete in engines (which will automatically signal XACs)
+set debug_sync= "now signal continue_xap";
+set debug_sync= "now signal continue_xap";
+include/sync_with_master_gtid.inc
+include/diff_tables.inc [master:test.t1, slave:test.t1]
+connection slave;
+include/stop_slave.inc
+set @@global.debug_dbug= @save_debug;
+#
+# Test Case 3: Two concurrent 2-phase XA transactions with matching XIDs
+# should run one after the other, while each transaction still allows
+# its XA PREPARE and XA COMMIT to run concurrently
+# Ensure slave is stopped
+connection slave;
+include/wait_for_slave_to_stop.inc
+set @@global.debug_dbug= "+d,stop_before_binlog_prepare,stop_after_binlog_prepare,stop_after_binlog_cor_by_xid";
+connection master;
+XA START 'x';
+insert into t1 values (3, 0);
+XA END 'x';
+XA PREPARE 'x';
+XA COMMIT 'x';
+XA START 'x';
+insert into t1 values (4, 0);
+XA END 'x';
+XA PREPARE 'x';
+XA COMMIT 'x';
+include/save_master_gtid.inc
+connection slave;
+include/start_slave.inc
+set debug_sync= "now signal binlog_xap";
+# Verify first XA PREPARE has binlogged
+set debug_sync= "now wait_for xa_prepare_binlogged";
+# Ensure first XA transaction is running concurrently
+# Ensure second XA transaction's XAP waits for the first transaction
+# Signal first XA PREPARE to complete
+set debug_sync= "now signal continue_xap";
+# Wait for first XA COMMIT to binlog
+set debug_sync= "now wait_for xa_cor_binlogged";
+# Ensure second XA PREPARE doesn't begin yet because the XAC hadn't released its XID
+# Signal first XA COMMIT to complete
+set debug_sync= "now signal continue_xa_cor";
+# Wait for second XA PREPARE to be binlogged
+# First pass through binlog_xap
+set debug_sync= "now signal binlog_xap";
+set debug_sync= "now wait_for xa_prepare_binlogged";
+# Ensure second XA COMMIT is concurrent with XAP
+# Signal second XA transaction to complete
+set debug_sync= "now signal continue_xap";
+set debug_sync= "now wait_for xa_cor_binlogged";
+set debug_sync= "now signal continue_xa_cor";
+include/sync_with_master_gtid.inc
+include/diff_tables.inc [master:test.t1, slave:test.t1]
+connection slave;
+include/stop_slave.inc
+set @@global.debug_dbug= @save_debug;
+#
+# Test Case 4 (Error Case): If an XA PREPARE errors while its
+# XA COMMIT is waiting on it, both phases should rollback
+# successfully. Note this tests both:
+# a) XA COMMIT is waiting in group commit (first phase
+# times out in DMLs)
+# b) XA COMMIT is waiting in group commit, with another XAP
+# with a duplicate XID waiting on it.
+# Case a)
+# Ensure slave is stopped
+connection slave;
+include/wait_for_slave_to_stop.inc
+set @save_lock_wait_timeout= @@GLOBAL.innodb_lock_wait_timeout;
+set @save_trans_retries= @@GLOBAL.slave_transaction_retries;
+set @@global.innodb_lock_wait_timeout= 1;
+set @@global.slave_transaction_retries= 0;
+connection master;
+XA START 'x';
+update t1 set b=b+1 where a=-1;
+XA END 'x';
+XA PREPARE 'x';
+XA COMMIT 'x';
+include/save_master_gtid.inc
+connection slave1;
+BEGIN;
+select * from t1 where a=-1 for update;;
+a b
+-1 0
+connection slave;
+include/start_slave.inc
+include/wait_for_slave_sql_error.inc [errno=1205]
+connection slave1;
+ROLLBACK;
+connection slave;
+include/stop_slave_io.inc
+set @@global.innodb_lock_wait_timeout= @save_lock_wait_timeout;
+set @@global.slave_transaction_retries= @save_trans_retries;
+# Ensure on slave restart, we can re-execute the XA transaction
+include/start_slave.inc
+include/save_master_gtid.inc
+include/stop_slave.inc
+set @@global.debug_dbug= @save_debug;
+# Case b)
+# Ensure slave is stopped
+connection slave;
+include/wait_for_slave_to_stop.inc
+set @save_lock_wait_timeout= @@GLOBAL.innodb_lock_wait_timeout;
+set @save_trans_retries= @@GLOBAL.slave_transaction_retries;
+set @@global.innodb_lock_wait_timeout= 1;
+set @@global.slave_transaction_retries= 0;
+connection master;
+XA START 'x';
+update t1 set b=b+1 where a=-1;
+XA END 'x';
+XA PREPARE 'x';
+XA COMMIT 'x';
+XA START 'x';
+insert into t1 values (5, 0);
+XA END 'x';
+XA PREPARE 'x';
+include/save_master_gtid.inc
+XA COMMIT 'x';
+connection slave1;
+BEGIN;
+select * from t1 where a=-1 for update;;
+a b
+-1 1
+connection slave;
+include/start_slave.inc
+include/wait_for_slave_sql_error.inc [errno=1205]
+connection slave1;
+ROLLBACK;
+# There should not be any prepared rows seen by XA RECOVER
+XA RECOVER;
+formatID gtrid_length bqual_length data
+# Ensuring data from second XAP isn't visible..
+# ..done
+connection slave;
+include/stop_slave_io.inc
+set @@global.innodb_lock_wait_timeout= @save_lock_wait_timeout;
+set @@global.slave_transaction_retries= @save_trans_retries;
+# Ensure on slave restart, we can re-execute the XA transaction
+include/start_slave.inc
+include/save_master_gtid.inc
+include/stop_slave.inc
+set @@global.debug_dbug= @save_debug;
+include/start_slave.inc
+# Ensuring data from second XAP is visible..
+# ..done
+#
+# Cleanup
+connection master;
+DROP TABLE t1;
+include/save_master_gtid.inc
+include/show_binlog_events.inc
+Log_name Pos Event_type Server_id End_log_pos Info
+master-bin.000001 # Gtid # # GTID #-#-#
+master-bin.000001 # Query # # use `test`; create table t1 (a int primary key, b int) engine=innodb
+master-bin.000001 # Gtid # # BEGIN GTID #-#-#
+master-bin.000001 # Query # # use `test`; insert into t1 values (-1, 0)
+master-bin.000001 # Xid # # COMMIT /* XID */
+master-bin.000001 # Gtid # # XA START X'78',X'',1 GTID #-#-#
+master-bin.000001 # Query # # use `test`; insert into t1 values (0, 0)
+master-bin.000001 # Query # # XA END X'78',X'',1
+master-bin.000001 # XA_prepare # # XA PREPARE X'78',X'',1
+master-bin.000001 # Gtid # # GTID #-#-#
+master-bin.000001 # Query # # XA COMMIT X'78',X'',1
+master-bin.000001 # Gtid # # XA START X'7831',X'',1 GTID #-#-#
+master-bin.000001 # Query # # use `test`; insert into t1 values (1, 0)
+master-bin.000001 # Query # # XA END X'7831',X'',1
+master-bin.000001 # XA_prepare # # XA PREPARE X'7831',X'',1
+master-bin.000001 # Gtid # # GTID #-#-#
+master-bin.000001 # Query # # XA COMMIT X'7831',X'',1
+master-bin.000001 # Gtid # # XA START X'7832',X'',1 GTID #-#-#
+master-bin.000001 # Query # # use `test`; insert into t1 values (2, 0)
+master-bin.000001 # Query # # XA END X'7832',X'',1
+master-bin.000001 # XA_prepare # # XA PREPARE X'7832',X'',1
+master-bin.000001 # Gtid # # GTID #-#-#
+master-bin.000001 # Query # # XA COMMIT X'7832',X'',1
+master-bin.000001 # Gtid # # XA START X'78',X'',1 GTID #-#-#
+master-bin.000001 # Query # # use `test`; insert into t1 values (3, 0)
+master-bin.000001 # Query # # XA END X'78',X'',1
+master-bin.000001 # XA_prepare # # XA PREPARE X'78',X'',1
+master-bin.000001 # Gtid # # GTID #-#-#
+master-bin.000001 # Query # # XA COMMIT X'78',X'',1
+master-bin.000001 # Gtid # # XA START X'78',X'',1 GTID #-#-#
+master-bin.000001 # Query # # use `test`; insert into t1 values (4, 0)
+master-bin.000001 # Query # # XA END X'78',X'',1
+master-bin.000001 # XA_prepare # # XA PREPARE X'78',X'',1
+master-bin.000001 # Gtid # # GTID #-#-#
+master-bin.000001 # Query # # XA COMMIT X'78',X'',1
+master-bin.000001 # Gtid # # XA START X'78',X'',1 GTID #-#-#
+master-bin.000001 # Query # # use `test`; update t1 set b=b+1 where a=-1
+master-bin.000001 # Query # # XA END X'78',X'',1
+master-bin.000001 # XA_prepare # # XA PREPARE X'78',X'',1
+master-bin.000001 # Gtid # # GTID #-#-#
+master-bin.000001 # Query # # XA COMMIT X'78',X'',1
+master-bin.000001 # Gtid # # XA START X'78',X'',1 GTID #-#-#
+master-bin.000001 # Query # # use `test`; update t1 set b=b+1 where a=-1
+master-bin.000001 # Query # # XA END X'78',X'',1
+master-bin.000001 # XA_prepare # # XA PREPARE X'78',X'',1
+master-bin.000001 # Gtid # # GTID #-#-#
+master-bin.000001 # Query # # XA COMMIT X'78',X'',1
+master-bin.000001 # Gtid # # XA START X'78',X'',1 GTID #-#-#
+master-bin.000001 # Query # # use `test`; insert into t1 values (5, 0)
+master-bin.000001 # Query # # XA END X'78',X'',1
+master-bin.000001 # XA_prepare # # XA PREPARE X'78',X'',1
+master-bin.000001 # Gtid # # GTID #-#-#
+master-bin.000001 # Query # # XA COMMIT X'78',X'',1
+master-bin.000001 # Gtid # # GTID #-#-#
+master-bin.000001 # Query # # use `test`; DROP TABLE `t1` /* generated by server */
+connection slave;
+include/sync_with_master_gtid.inc
+show binlog events in 'slave-bin.000001' from <binlog_start>;
+Log_name Pos Event_type Server_id End_log_pos Info
+slave-bin.000001 # Gtid_list 2 # []
+slave-bin.000001 # Binlog_checkpoint 2 # slave-bin.000001
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; create table t1 (a int primary key, b int) engine=innodb
+slave-bin.000001 # Gtid 1 # BEGIN GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; insert into t1 values (-1, 0)
+slave-bin.000001 # Xid 1 # COMMIT /* XID */
+slave-bin.000001 # Gtid 1 # XA START X'78',X'',1 GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; insert into t1 values (0, 0)
+slave-bin.000001 # Query 1 # XA END X'78',X'',1
+slave-bin.000001 # XA_prepare 1 # XA PREPARE X'78',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # XA COMMIT X'78',X'',1
+slave-bin.000001 # Gtid 1 # XA START X'7831',X'',1 GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; insert into t1 values (1, 0)
+slave-bin.000001 # Query 1 # XA END X'7831',X'',1
+slave-bin.000001 # XA_prepare 1 # XA PREPARE X'7831',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # XA COMMIT X'7831',X'',1
+slave-bin.000001 # Gtid 1 # XA START X'7832',X'',1 GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; insert into t1 values (2, 0)
+slave-bin.000001 # Query 1 # XA END X'7832',X'',1
+slave-bin.000001 # XA_prepare 1 # XA PREPARE X'7832',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # XA COMMIT X'7832',X'',1
+slave-bin.000001 # Gtid 1 # XA START X'78',X'',1 GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; insert into t1 values (3, 0)
+slave-bin.000001 # Query 1 # XA END X'78',X'',1
+slave-bin.000001 # XA_prepare 1 # XA PREPARE X'78',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # XA COMMIT X'78',X'',1
+slave-bin.000001 # Gtid 1 # XA START X'78',X'',1 GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; insert into t1 values (4, 0)
+slave-bin.000001 # Query 1 # XA END X'78',X'',1
+slave-bin.000001 # XA_prepare 1 # XA PREPARE X'78',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # XA COMMIT X'78',X'',1
+slave-bin.000001 # Gtid 1 # XA START X'78',X'',1 GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; update t1 set b=b+1 where a=-1
+slave-bin.000001 # Query 1 # XA END X'78',X'',1
+slave-bin.000001 # XA_prepare 1 # XA PREPARE X'78',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # XA COMMIT X'78',X'',1
+slave-bin.000001 # Gtid 1 # XA START X'78',X'',1 GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; update t1 set b=b+1 where a=-1
+slave-bin.000001 # Query 1 # XA END X'78',X'',1
+slave-bin.000001 # XA_prepare 1 # XA PREPARE X'78',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # XA COMMIT X'78',X'',1
+slave-bin.000001 # Gtid 1 # XA START X'78',X'',1 GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; insert into t1 values (5, 0)
+slave-bin.000001 # Query 1 # XA END X'78',X'',1
+slave-bin.000001 # XA_prepare 1 # XA PREPARE X'78',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # XA COMMIT X'78',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; DROP TABLE IF EXISTS `t1` /* generated by server */
+include/stop_slave.inc
+set @@GLOBAL.slave_parallel_threads= @save_par_thds;
+set @@GLOBAL.slave_parallel_mode= @save_par_mode;
+include/start_slave.inc
+#
+# Initialize test data
+connection slave;
+include/stop_slave.inc
+RESET SLAVE;
+set @@global.gtid_slave_pos= "";
+connection master;
+create table t1 (a int primary key, b int) engine=innodb;
+insert into t1 values (-1, 0);
+include/save_master_gtid.inc
+connection slave;
+include/start_slave.inc
+include/sync_with_master_gtid.inc
+include/stop_slave.inc
+set @save_debug= @@GLOBAL.debug_dbug;
+set @save_par_thds= @@GLOBAL.slave_parallel_threads;
+set @save_par_mode= @@GLOBAL.slave_parallel_mode;
+set @@GLOBAL.slave_parallel_threads= 4;
+set @@GLOBAL.slave_parallel_mode= optimistic;
+set statement sql_log_bin=0 for call mtr.add_suppression("Commit failed due to failure of an earlier commit on which this one depends");
+#
+# Test Case 1: Ensure that a 2-phase XA transaction has its XA PREPARE
+# and XA ROLLBACK run concurrently. That is, the
+# XA ROLLBACK will wait at group commit until the XA PREPARE
+# binlogs, and then it will wait again until the XA PREPARE finishes
+# preparing in all engines. At this point, the XA ROLLBACK will
+# run to completion.
+connection master;
+set @@session.gtid_seq_no= 100;
+XA START 'x';
+insert into t1 values (0, 0);
+XA END 'x';
+XA PREPARE 'x';
+XA ROLLBACK 'x';
+include/save_master_gtid.inc
+connection slave;
+set @@global.debug_dbug= "+d,hold_worker_on_schedule,stop_after_binlog_prepare";
+include/start_slave.inc
+# Waiting for XAP to pause when it is pulled from the queue
+set debug_sync= "now wait_for reached_pause";
+# Before the XA PREPARE executes, the XA ROLLBACK should wait in group commit..
+# ..done
+# Execute the XA PREPARE
+set debug_sync= "now signal continue_worker";
+# Wait for XA PREPARE to have binlogged, but hold it before it prepares in engines
+set debug_sync= "now wait_for xa_prepare_binlogged";
+# The XA ROLLBACK should move on from binlog to wait for the XA PREPARE to complete in engines
+# ..done
+# Signal the XAP to complete in engines (which will automatically signal XAC)
+set debug_sync= "now signal continue_xap";
+include/sync_with_master_gtid.inc
+include/diff_tables.inc [master:test.t1, slave:test.t1]
+connection slave;
+include/stop_slave.inc
+set @@global.debug_dbug= @save_debug;
+#
+# Test Case 2: If two XA ROLLBACK transactions have different
+# XIDs, ensure both phases of both transactions all execute concurrently.
+#
+# Ensure slave is stopped
+connection slave;
+include/wait_for_slave_to_stop.inc
+set @@global.debug_dbug= "+d,stop_before_binlog_prepare,stop_after_binlog_prepare";
+connection master;
+XA START 'x1';
+insert into t1 values (1, 0);
+XA END 'x1';
+XA PREPARE 'x1';
+XA ROLLBACK 'x1';
+XA START 'x2';
+insert into t1 values (2, 0);
+XA END 'x2';
+XA PREPARE 'x2';
+XA ROLLBACK 'x2';
+include/save_master_gtid.inc
+connection slave;
+include/start_slave.inc
+set debug_sync= "now signal binlog_xap";
+set debug_sync= "now signal binlog_xap";
+# Ensuring both phases of both transactions all execute concurrently
+# ..done
+# Verify XA PREPARE has binlogged
+set debug_sync= "now wait_for xa_prepare_binlogged";
+# Signal the XAPs to complete in engines (which will automatically signal XACs)
+set debug_sync= "now signal continue_xap";
+set debug_sync= "now signal continue_xap";
+include/sync_with_master_gtid.inc
+include/diff_tables.inc [master:test.t1, slave:test.t1]
+connection slave;
+include/stop_slave.inc
+set @@global.debug_dbug= @save_debug;
+#
+# Test Case 3: Two current 2-phase XA transactions with matching XIDs
+# should run one after the other, while each transaction still allows
+# its XA PREPARE and XA ROLLBACK to run concurrently
+# Ensure slave is stopped
+connection slave;
+include/wait_for_slave_to_stop.inc
+set @@global.debug_dbug= "+d,stop_before_binlog_prepare,stop_after_binlog_prepare,stop_after_binlog_cor_by_xid";
+connection master;
+XA START 'x';
+insert into t1 values (3, 0);
+XA END 'x';
+XA PREPARE 'x';
+XA ROLLBACK 'x';
+XA START 'x';
+insert into t1 values (4, 0);
+XA END 'x';
+XA PREPARE 'x';
+XA ROLLBACK 'x';
+include/save_master_gtid.inc
+connection slave;
+include/start_slave.inc
+set debug_sync= "now signal binlog_xap";
+# Verify first XA PREPARE has binlogged
+set debug_sync= "now wait_for xa_prepare_binlogged";
+# Ensure first XA transaction is running concurrently
+# Ensure second XA transaction's XAP waits for the first transaction
+# Signal first XA PREPARE to complete
+set debug_sync= "now signal continue_xap";
+# Wait for first XA ROLLBACK to binlog
+set debug_sync= "now wait_for xa_cor_binlogged";
+# Ensure second XA PREPARE doesn't begin yet because the XAC hadn't released its XID
+# Signal first XA ROLLBACK to complete
+set debug_sync= "now signal continue_xa_cor";
+# Wait for second XA PREPARE to binlogged
+# First pass through binlog_xap
+set debug_sync= "now signal binlog_xap";
+set debug_sync= "now wait_for xa_prepare_binlogged";
+# Ensure second XA ROLLBACK is concurrent with XAP
+# Signal second XA transaction to complete
+set debug_sync= "now signal continue_xap";
+set debug_sync= "now wait_for xa_cor_binlogged";
+set debug_sync= "now signal continue_xa_cor";
+include/sync_with_master_gtid.inc
+include/diff_tables.inc [master:test.t1, slave:test.t1]
+connection slave;
+include/stop_slave.inc
+set @@global.debug_dbug= @save_debug;
+#
+# Test Case 4 (Error Case): If an XA PREPARE errors while its
+# XA ROLLBACK is waiting on it, both phases should rollback
+# successfully. Note this tests both:
+# a) XA ROLLBACK is waiting in group commit (first phase
+# times out in DMLs)
+# b) XA ROLLBACK is waiting in group commit, with another XAP
+# with a duplicate XID waiting on it.
+# Case a)
+# Ensure slave is stopped
+connection slave;
+include/wait_for_slave_to_stop.inc
+set @save_lock_wait_timeout= @@GLOBAL.innodb_lock_wait_timeout;
+set @save_trans_retries= @@GLOBAL.slave_transaction_retries;
+set @@global.innodb_lock_wait_timeout= 1;
+set @@global.slave_transaction_retries= 0;
+connection master;
+XA START 'x';
+update t1 set b=b+1 where a=-1;
+XA END 'x';
+XA PREPARE 'x';
+XA ROLLBACK 'x';
+include/save_master_gtid.inc
+connection slave1;
+BEGIN;
+select * from t1 where a=-1 for update;;
+a b
+-1 0
+connection slave;
+include/start_slave.inc
+include/wait_for_slave_sql_error.inc [errno=1205]
+connection slave1;
+ROLLBACK;
+connection slave;
+include/stop_slave_io.inc
+set @@global.innodb_lock_wait_timeout= @save_lock_wait_timeout;
+set @@global.slave_transaction_retries= @save_trans_retries;
+# Ensure on slave restart, we can re-execute the XA transaction
+include/start_slave.inc
+include/save_master_gtid.inc
+include/stop_slave.inc
+set @@global.debug_dbug= @save_debug;
+# Case b)
+# Ensure slave is stopped
+connection slave;
+include/wait_for_slave_to_stop.inc
+set @save_lock_wait_timeout= @@GLOBAL.innodb_lock_wait_timeout;
+set @save_trans_retries= @@GLOBAL.slave_transaction_retries;
+set @@global.innodb_lock_wait_timeout= 1;
+set @@global.slave_transaction_retries= 0;
+connection master;
+XA START 'x';
+update t1 set b=b+1 where a=-1;
+XA END 'x';
+XA PREPARE 'x';
+XA ROLLBACK 'x';
+XA START 'x';
+insert into t1 values (5, 0);
+XA END 'x';
+XA PREPARE 'x';
+include/save_master_gtid.inc
+XA ROLLBACK 'x';
+connection slave1;
+BEGIN;
+select * from t1 where a=-1 for update;;
+a b
+-1 0
+connection slave;
+include/start_slave.inc
+include/wait_for_slave_sql_error.inc [errno=1205]
+connection slave1;
+ROLLBACK;
+# There should not be any prepared rows seen by XA RECOVER
+XA RECOVER;
+formatID gtrid_length bqual_length data
+# Ensuring data from second XAP isn't visible..
+# ..done
+connection slave;
+include/stop_slave_io.inc
+set @@global.innodb_lock_wait_timeout= @save_lock_wait_timeout;
+set @@global.slave_transaction_retries= @save_trans_retries;
+# Ensure on slave restart, we can re-execute the XA transaction
+include/start_slave.inc
+include/save_master_gtid.inc
+include/stop_slave.inc
+set @@global.debug_dbug= @save_debug;
+include/start_slave.inc
+# Ensuring data from second XAP is visible..
+# ..done
+#
+# Cleanup
+connection master;
+DROP TABLE t1;
+include/save_master_gtid.inc
+include/show_binlog_events.inc
+Log_name Pos Event_type Server_id End_log_pos Info
+master-bin.000001 # Gtid # # GTID #-#-#
+master-bin.000001 # Query # # use `test`; create table t1 (a int primary key, b int) engine=innodb
+master-bin.000001 # Gtid # # BEGIN GTID #-#-#
+master-bin.000001 # Query # # use `test`; insert into t1 values (-1, 0)
+master-bin.000001 # Xid # # COMMIT /* XID */
+master-bin.000001 # Gtid # # XA START X'78',X'',1 GTID #-#-#
+master-bin.000001 # Query # # use `test`; insert into t1 values (0, 0)
+master-bin.000001 # Query # # XA END X'78',X'',1
+master-bin.000001 # XA_prepare # # XA PREPARE X'78',X'',1
+master-bin.000001 # Gtid # # GTID #-#-#
+master-bin.000001 # Query # # XA COMMIT X'78',X'',1
+master-bin.000001 # Gtid # # XA START X'7831',X'',1 GTID #-#-#
+master-bin.000001 # Query # # use `test`; insert into t1 values (1, 0)
+master-bin.000001 # Query # # XA END X'7831',X'',1
+master-bin.000001 # XA_prepare # # XA PREPARE X'7831',X'',1
+master-bin.000001 # Gtid # # GTID #-#-#
+master-bin.000001 # Query # # XA COMMIT X'7831',X'',1
+master-bin.000001 # Gtid # # XA START X'7832',X'',1 GTID #-#-#
+master-bin.000001 # Query # # use `test`; insert into t1 values (2, 0)
+master-bin.000001 # Query # # XA END X'7832',X'',1
+master-bin.000001 # XA_prepare # # XA PREPARE X'7832',X'',1
+master-bin.000001 # Gtid # # GTID #-#-#
+master-bin.000001 # Query # # XA COMMIT X'7832',X'',1
+master-bin.000001 # Gtid # # XA START X'78',X'',1 GTID #-#-#
+master-bin.000001 # Query # # use `test`; insert into t1 values (3, 0)
+master-bin.000001 # Query # # XA END X'78',X'',1
+master-bin.000001 # XA_prepare # # XA PREPARE X'78',X'',1
+master-bin.000001 # Gtid # # GTID #-#-#
+master-bin.000001 # Query # # XA COMMIT X'78',X'',1
+master-bin.000001 # Gtid # # XA START X'78',X'',1 GTID #-#-#
+master-bin.000001 # Query # # use `test`; insert into t1 values (4, 0)
+master-bin.000001 # Query # # XA END X'78',X'',1
+master-bin.000001 # XA_prepare # # XA PREPARE X'78',X'',1
+master-bin.000001 # Gtid # # GTID #-#-#
+master-bin.000001 # Query # # XA COMMIT X'78',X'',1
+master-bin.000001 # Gtid # # XA START X'78',X'',1 GTID #-#-#
+master-bin.000001 # Query # # use `test`; update t1 set b=b+1 where a=-1
+master-bin.000001 # Query # # XA END X'78',X'',1
+master-bin.000001 # XA_prepare # # XA PREPARE X'78',X'',1
+master-bin.000001 # Gtid # # GTID #-#-#
+master-bin.000001 # Query # # XA COMMIT X'78',X'',1
+master-bin.000001 # Gtid # # XA START X'78',X'',1 GTID #-#-#
+master-bin.000001 # Query # # use `test`; update t1 set b=b+1 where a=-1
+master-bin.000001 # Query # # XA END X'78',X'',1
+master-bin.000001 # XA_prepare # # XA PREPARE X'78',X'',1
+master-bin.000001 # Gtid # # GTID #-#-#
+master-bin.000001 # Query # # XA COMMIT X'78',X'',1
+master-bin.000001 # Gtid # # XA START X'78',X'',1 GTID #-#-#
+master-bin.000001 # Query # # use `test`; insert into t1 values (5, 0)
+master-bin.000001 # Query # # XA END X'78',X'',1
+master-bin.000001 # XA_prepare # # XA PREPARE X'78',X'',1
+master-bin.000001 # Gtid # # GTID #-#-#
+master-bin.000001 # Query # # XA COMMIT X'78',X'',1
+master-bin.000001 # Gtid # # GTID #-#-#
+master-bin.000001 # Query # # use `test`; DROP TABLE `t1` /* generated by server */
+master-bin.000001 # Gtid # # GTID #-#-#
+master-bin.000001 # Query # # use `test`; create table t1 (a int primary key, b int) engine=innodb
+master-bin.000001 # Gtid # # BEGIN GTID #-#-#
+master-bin.000001 # Query # # use `test`; insert into t1 values (-1, 0)
+master-bin.000001 # Xid # # COMMIT /* XID */
+master-bin.000001 # Gtid # # XA START X'78',X'',1 GTID #-#-#
+master-bin.000001 # Query # # use `test`; insert into t1 values (0, 0)
+master-bin.000001 # Query # # XA END X'78',X'',1
+master-bin.000001 # XA_prepare # # XA PREPARE X'78',X'',1
+master-bin.000001 # Gtid # # GTID #-#-#
+master-bin.000001 # Query # # XA ROLLBACK X'78',X'',1
+master-bin.000001 # Gtid # # XA START X'7831',X'',1 GTID #-#-#
+master-bin.000001 # Query # # use `test`; insert into t1 values (1, 0)
+master-bin.000001 # Query # # XA END X'7831',X'',1
+master-bin.000001 # XA_prepare # # XA PREPARE X'7831',X'',1
+master-bin.000001 # Gtid # # GTID #-#-#
+master-bin.000001 # Query # # XA ROLLBACK X'7831',X'',1
+master-bin.000001 # Gtid # # XA START X'7832',X'',1 GTID #-#-#
+master-bin.000001 # Query # # use `test`; insert into t1 values (2, 0)
+master-bin.000001 # Query # # XA END X'7832',X'',1
+master-bin.000001 # XA_prepare # # XA PREPARE X'7832',X'',1
+master-bin.000001 # Gtid # # GTID #-#-#
+master-bin.000001 # Query # # XA ROLLBACK X'7832',X'',1
+master-bin.000001 # Gtid # # XA START X'78',X'',1 GTID #-#-#
+master-bin.000001 # Query # # use `test`; insert into t1 values (3, 0)
+master-bin.000001 # Query # # XA END X'78',X'',1
+master-bin.000001 # XA_prepare # # XA PREPARE X'78',X'',1
+master-bin.000001 # Gtid # # GTID #-#-#
+master-bin.000001 # Query # # XA ROLLBACK X'78',X'',1
+master-bin.000001 # Gtid # # XA START X'78',X'',1 GTID #-#-#
+master-bin.000001 # Query # # use `test`; insert into t1 values (4, 0)
+master-bin.000001 # Query # # XA END X'78',X'',1
+master-bin.000001 # XA_prepare # # XA PREPARE X'78',X'',1
+master-bin.000001 # Gtid # # GTID #-#-#
+master-bin.000001 # Query # # XA ROLLBACK X'78',X'',1
+master-bin.000001 # Gtid # # XA START X'78',X'',1 GTID #-#-#
+master-bin.000001 # Query # # use `test`; update t1 set b=b+1 where a=-1
+master-bin.000001 # Query # # XA END X'78',X'',1
+master-bin.000001 # XA_prepare # # XA PREPARE X'78',X'',1
+master-bin.000001 # Gtid # # GTID #-#-#
+master-bin.000001 # Query # # XA ROLLBACK X'78',X'',1
+master-bin.000001 # Gtid # # XA START X'78',X'',1 GTID #-#-#
+master-bin.000001 # Query # # use `test`; update t1 set b=b+1 where a=-1
+master-bin.000001 # Query # # XA END X'78',X'',1
+master-bin.000001 # XA_prepare # # XA PREPARE X'78',X'',1
+master-bin.000001 # Gtid # # GTID #-#-#
+master-bin.000001 # Query # # XA ROLLBACK X'78',X'',1
+master-bin.000001 # Gtid # # XA START X'78',X'',1 GTID #-#-#
+master-bin.000001 # Query # # use `test`; insert into t1 values (5, 0)
+master-bin.000001 # Query # # XA END X'78',X'',1
+master-bin.000001 # XA_prepare # # XA PREPARE X'78',X'',1
+master-bin.000001 # Gtid # # GTID #-#-#
+master-bin.000001 # Query # # XA ROLLBACK X'78',X'',1
+master-bin.000001 # Gtid # # GTID #-#-#
+master-bin.000001 # Query # # use `test`; DROP TABLE `t1` /* generated by server */
+connection slave;
+include/sync_with_master_gtid.inc
+show binlog events in 'slave-bin.000001' from <binlog_start>;
+Log_name Pos Event_type Server_id End_log_pos Info
+slave-bin.000001 # Gtid_list 2 # []
+slave-bin.000001 # Binlog_checkpoint 2 # slave-bin.000001
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; create table t1 (a int primary key, b int) engine=innodb
+slave-bin.000001 # Gtid 1 # BEGIN GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; insert into t1 values (-1, 0)
+slave-bin.000001 # Xid 1 # COMMIT /* XID */
+slave-bin.000001 # Gtid 1 # XA START X'78',X'',1 GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; insert into t1 values (0, 0)
+slave-bin.000001 # Query 1 # XA END X'78',X'',1
+slave-bin.000001 # XA_prepare 1 # XA PREPARE X'78',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # XA COMMIT X'78',X'',1
+slave-bin.000001 # Gtid 1 # XA START X'7831',X'',1 GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; insert into t1 values (1, 0)
+slave-bin.000001 # Query 1 # XA END X'7831',X'',1
+slave-bin.000001 # XA_prepare 1 # XA PREPARE X'7831',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # XA COMMIT X'7831',X'',1
+slave-bin.000001 # Gtid 1 # XA START X'7832',X'',1 GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; insert into t1 values (2, 0)
+slave-bin.000001 # Query 1 # XA END X'7832',X'',1
+slave-bin.000001 # XA_prepare 1 # XA PREPARE X'7832',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # XA COMMIT X'7832',X'',1
+slave-bin.000001 # Gtid 1 # XA START X'78',X'',1 GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; insert into t1 values (3, 0)
+slave-bin.000001 # Query 1 # XA END X'78',X'',1
+slave-bin.000001 # XA_prepare 1 # XA PREPARE X'78',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # XA COMMIT X'78',X'',1
+slave-bin.000001 # Gtid 1 # XA START X'78',X'',1 GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; insert into t1 values (4, 0)
+slave-bin.000001 # Query 1 # XA END X'78',X'',1
+slave-bin.000001 # XA_prepare 1 # XA PREPARE X'78',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # XA COMMIT X'78',X'',1
+slave-bin.000001 # Gtid 1 # XA START X'78',X'',1 GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; update t1 set b=b+1 where a=-1
+slave-bin.000001 # Query 1 # XA END X'78',X'',1
+slave-bin.000001 # XA_prepare 1 # XA PREPARE X'78',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # XA COMMIT X'78',X'',1
+slave-bin.000001 # Gtid 1 # XA START X'78',X'',1 GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; update t1 set b=b+1 where a=-1
+slave-bin.000001 # Query 1 # XA END X'78',X'',1
+slave-bin.000001 # XA_prepare 1 # XA PREPARE X'78',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # XA COMMIT X'78',X'',1
+slave-bin.000001 # Gtid 1 # XA START X'78',X'',1 GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; insert into t1 values (5, 0)
+slave-bin.000001 # Query 1 # XA END X'78',X'',1
+slave-bin.000001 # XA_prepare 1 # XA PREPARE X'78',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # XA COMMIT X'78',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; DROP TABLE IF EXISTS `t1` /* generated by server */
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; create table t1 (a int primary key, b int) engine=innodb
+slave-bin.000001 # Gtid 1 # BEGIN GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; insert into t1 values (-1, 0)
+slave-bin.000001 # Xid 1 # COMMIT /* XID */
+slave-bin.000001 # Gtid 1 # XA START X'78',X'',1 GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; insert into t1 values (0, 0)
+slave-bin.000001 # Query 1 # XA END X'78',X'',1
+slave-bin.000001 # XA_prepare 1 # XA PREPARE X'78',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # XA COMMIT X'78',X'',1
+slave-bin.000001 # Gtid 1 # XA START X'7831',X'',1 GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; insert into t1 values (1, 0)
+slave-bin.000001 # Query 1 # XA END X'7831',X'',1
+slave-bin.000001 # XA_prepare 1 # XA PREPARE X'7831',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # XA COMMIT X'7831',X'',1
+slave-bin.000001 # Gtid 1 # XA START X'7832',X'',1 GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; insert into t1 values (2, 0)
+slave-bin.000001 # Query 1 # XA END X'7832',X'',1
+slave-bin.000001 # XA_prepare 1 # XA PREPARE X'7832',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # XA COMMIT X'7832',X'',1
+slave-bin.000001 # Gtid 1 # XA START X'78',X'',1 GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; insert into t1 values (3, 0)
+slave-bin.000001 # Query 1 # XA END X'78',X'',1
+slave-bin.000001 # XA_prepare 1 # XA PREPARE X'78',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # XA COMMIT X'78',X'',1
+slave-bin.000001 # Gtid 1 # XA START X'78',X'',1 GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; insert into t1 values (4, 0)
+slave-bin.000001 # Query 1 # XA END X'78',X'',1
+slave-bin.000001 # XA_prepare 1 # XA PREPARE X'78',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # XA COMMIT X'78',X'',1
+slave-bin.000001 # Gtid 1 # XA START X'78',X'',1 GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; update t1 set b=b+1 where a=-1
+slave-bin.000001 # Query 1 # XA END X'78',X'',1
+slave-bin.000001 # XA_prepare 1 # XA PREPARE X'78',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # XA COMMIT X'78',X'',1
+slave-bin.000001 # Gtid 1 # XA START X'78',X'',1 GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; update t1 set b=b+1 where a=-1
+slave-bin.000001 # Query 1 # XA END X'78',X'',1
+slave-bin.000001 # XA_prepare 1 # XA PREPARE X'78',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # XA COMMIT X'78',X'',1
+slave-bin.000001 # Gtid 1 # XA START X'78',X'',1 GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; insert into t1 values (5, 0)
+slave-bin.000001 # Query 1 # XA END X'78',X'',1
+slave-bin.000001 # XA_prepare 1 # XA PREPARE X'78',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # XA COMMIT X'78',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; DROP TABLE IF EXISTS `t1` /* generated by server */
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; create table t1 (a int primary key, b int) engine=innodb
+slave-bin.000001 # Gtid 1 # BEGIN GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; insert into t1 values (-1, 0)
+slave-bin.000001 # Xid 1 # COMMIT /* XID */
+slave-bin.000001 # Gtid 1 # XA START X'78',X'',1 GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; insert into t1 values (0, 0)
+slave-bin.000001 # Query 1 # XA END X'78',X'',1
+slave-bin.000001 # XA_prepare 1 # XA PREPARE X'78',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # XA ROLLBACK X'78',X'',1
+slave-bin.000001 # Gtid 1 # XA START X'7831',X'',1 GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; insert into t1 values (1, 0)
+slave-bin.000001 # Query 1 # XA END X'7831',X'',1
+slave-bin.000001 # XA_prepare 1 # XA PREPARE X'7831',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # XA ROLLBACK X'7831',X'',1
+slave-bin.000001 # Gtid 1 # XA START X'7832',X'',1 GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; insert into t1 values (2, 0)
+slave-bin.000001 # Query 1 # XA END X'7832',X'',1
+slave-bin.000001 # XA_prepare 1 # XA PREPARE X'7832',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # XA ROLLBACK X'7832',X'',1
+slave-bin.000001 # Gtid 1 # XA START X'78',X'',1 GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; insert into t1 values (3, 0)
+slave-bin.000001 # Query 1 # XA END X'78',X'',1
+slave-bin.000001 # XA_prepare 1 # XA PREPARE X'78',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # XA ROLLBACK X'78',X'',1
+slave-bin.000001 # Gtid 1 # XA START X'78',X'',1 GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; insert into t1 values (4, 0)
+slave-bin.000001 # Query 1 # XA END X'78',X'',1
+slave-bin.000001 # XA_prepare 1 # XA PREPARE X'78',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # XA ROLLBACK X'78',X'',1
+slave-bin.000001 # Gtid 1 # XA START X'78',X'',1 GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; update t1 set b=b+1 where a=-1
+slave-bin.000001 # Query 1 # XA END X'78',X'',1
+slave-bin.000001 # XA_prepare 1 # XA PREPARE X'78',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # XA ROLLBACK X'78',X'',1
+slave-bin.000001 # Gtid 1 # XA START X'78',X'',1 GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; update t1 set b=b+1 where a=-1
+slave-bin.000001 # Query 1 # XA END X'78',X'',1
+slave-bin.000001 # XA_prepare 1 # XA PREPARE X'78',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # XA ROLLBACK X'78',X'',1
+slave-bin.000001 # Gtid 1 # XA START X'78',X'',1 GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; insert into t1 values (5, 0)
+slave-bin.000001 # Query 1 # XA END X'78',X'',1
+slave-bin.000001 # XA_prepare 1 # XA PREPARE X'78',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # XA ROLLBACK X'78',X'',1
+slave-bin.000001 # Gtid 1 # GTID #-#-#
+slave-bin.000001 # Query 1 # use `test`; DROP TABLE IF EXISTS `t1` /* generated by server */
+include/stop_slave.inc
+set @@GLOBAL.slave_parallel_threads= @save_par_thds;
+set @@GLOBAL.slave_parallel_mode= @save_par_mode;
+include/start_slave.inc
+#
+# Test Case 5: If an XAP is skipped by the replica (e.g. by incorrectly
+# setting gtid_slave_pos), and only its XAC/XAR is tried to execute, the
+# replica should report ER_XAER_NOTA.
+connection master;
+create table t1 (a int) engine=innodb;
+include/save_master_gtid.inc
+connection slave;
+include/sync_with_master_gtid.inc
+call mtr.add_suppression("XAER_NOTA: Unknown XID");
+include/stop_slave.inc
+change master to master_use_gtid = slave_pos;
+connection master;
+xa start '1';
+insert into t1 set a=1;
+xa end '1';
+xa prepare '1';
+xa rollback '1';
+insert into t1 set a=2;
+include/save_master_gtid.inc
+connection slave;
+set @save_gtid_slave_pos= @@global.gtid_slave_pos;
+SELECT CONCAT(domain_id,"-",server_id,"-", seq_no + 1)
+into @gtid_skip
+FROM mysql.gtid_slave_pos
+WHERE seq_no = (SELECT DISTINCT max(seq_no) FROM mysql.gtid_slave_pos) limit 1;
+set @@global.gtid_slave_pos = @gtid_skip;
+start slave;
+include/wait_for_slave_sql_error.inc [errno=1397]
+select count(*) = 2 % 2 as 'must be true' from t1;;
+must be true
+1
+include/stop_slave.inc
+set @@global.gtid_slave_pos = @save_gtid_slave_pos;
+show warnings;
+Level Code Message
+Warning 1947 Specified GTID <value> conflicts with the binary log which contains a more recent GTID <value>. If MASTER_GTID_POS=CURRENT_POS is used, the binlog position will override the new value of @@gtid_slave_pos
+include/start_slave.inc
+include/sync_with_master_gtid.inc
+include/stop_slave.inc
+change master to master_use_gtid = slave_pos;
+connection master;
+xa start '1';
+insert into t1 set a=1;
+xa end '1';
+xa prepare '1';
+xa commit '1';
+insert into t1 set a=2;
+include/save_master_gtid.inc
+connection slave;
+set @save_gtid_slave_pos= @@global.gtid_slave_pos;
+SELECT CONCAT(domain_id,"-",server_id,"-", seq_no + 1)
+into @gtid_skip
+FROM mysql.gtid_slave_pos
+WHERE seq_no = (SELECT DISTINCT max(seq_no) FROM mysql.gtid_slave_pos) limit 1;
+set @@global.gtid_slave_pos = @gtid_skip;
+start slave;
+include/wait_for_slave_sql_error.inc [errno=1397]
+select count(*) = 1 % 2 as 'must be true' from t1;;
+must be true
+1
+include/stop_slave.inc
+set @@global.gtid_slave_pos = @save_gtid_slave_pos;
+show warnings;
+Level Code Message
+include/start_slave.inc
+include/sync_with_master_gtid.inc
+connection master;
+drop table t1;
+connection slave;
+include/rpl_end.inc
+# End of rpl_xa_concurrent_2pc.test
diff --git a/mysql-test/suite/rpl/r/rpl_xa_empty_transaction.result b/mysql-test/suite/rpl/r/rpl_xa_empty_transaction.result
index f3ea53c219a..92a820d6753 100644
--- a/mysql-test/suite/rpl/r/rpl_xa_empty_transaction.result
+++ b/mysql-test/suite/rpl/r/rpl_xa_empty_transaction.result
@@ -1165,5 +1165,56 @@ connection server_1;
set @@binlog_format = @sav_binlog_format;
set @@global.binlog_format = @sav_binlog_format;
connection server_1;
+create table t_not_in_binlog (a int) engine=innodb;
+flush logs;
+include/save_master_gtid.inc
+connect con1,localhost,root,,;
+call mtr.add_suppression("XAER_NOTA: Unknown XID");
+SET sql_log_bin=0;
+XA START 'a';
+insert into t_not_in_binlog set a=1;
+XA END 'a';
+XA PREPARE 'a';
+disconnect con1;
+connection server_1;
+xa recover;
+formatID gtrid_length bqual_length data
+1 1 0 a
+XA ROLLBACK 'a';
+drop table t_not_in_binlog;
+include/save_master_gtid.inc
+connection server_2;
+XAER_NOTA: Unknown XID
+include/wait_for_slave_sql_error.inc [errno=1397]
+connect con2,127.0.0.1,root,,test,$SERVER_MYPORT_2,;
+SET sql_log_bin=0;
+XA START 'a';
+insert into t_not_in_binlog set a=1;
+XA END 'a';
+XA PREPARE 'a';
+disconnect con2;
+connection server_2;
+xa recover;
+formatID gtrid_length bqual_length data
+1 1 0 a
+include/start_slave.inc
+include/sync_with_master_gtid.inc
+connection server_3;
+XAER_NOTA: Unknown XID
+include/wait_for_slave_sql_error.inc [errno=1397]
+connect con3,127.0.0.1,root,,test,$SERVER_MYPORT_3,;
+SET sql_log_bin=0;
+XA START 'a';
+insert into t_not_in_binlog set a=1;
+XA END 'a';
+XA PREPARE 'a';
+disconnect con3;
+connection server_3;
+xa recover;
+formatID gtrid_length bqual_length data
+1 1 0 a
+include/start_slave.inc
+include/sync_with_master_gtid.inc
+connection server_1;
include/rpl_end.inc
# End of rpl_xa_empty_transaction.test
diff --git a/mysql-test/suite/rpl/r/rpl_xa_prepare_gtid_fail.result b/mysql-test/suite/rpl/r/rpl_xa_prepare_gtid_fail.result
index dd0d132471e..a3f5414b9da 100644
--- a/mysql-test/suite/rpl/r/rpl_xa_prepare_gtid_fail.result
+++ b/mysql-test/suite/rpl/r/rpl_xa_prepare_gtid_fail.result
@@ -42,7 +42,7 @@ connection master;
drop table t1;
connection slave;
# TODO: Remove after fixing MDEV-21777
-set @@global.gtid_slave_pos= "0-1-100";
+set @@global.gtid_slave_pos= "0-1-101";
set @@global.slave_parallel_threads= @save_par_thds;
set @@global.gtid_strict_mode= @save_strict_mode;
set @@global.innodb_lock_wait_timeout= @save_innodb_lock_wait_timeout;
diff --git a/mysql-test/suite/rpl/t/rpl_xa_concurrent_2pc.test b/mysql-test/suite/rpl/t/rpl_xa_concurrent_2pc.test
new file mode 100644
index 00000000000..d762114ff26
--- /dev/null
+++ b/mysql-test/suite/rpl/t/rpl_xa_concurrent_2pc.test
@@ -0,0 +1,111 @@
+#
+# This test ensures that two-phase XA transactions have their first and
+# second phases parallelized for both XA COMMIT and XA ROLLBACK. It ensures the
+# following behaviors:
+#
+# Test Case 1: Ensure that a 2-phase XA transaction has its XA PREPARE and
+# XA COMMIT/ROLLBACK run concurrently. That is, the XA COMMIT/ROLLBACK will
+# wait at group commit until the XA PREPARE binlogs, and then it will wait
+# again until the XA PREPARE finishes preparing in all engines. At this point,
+# the XA COMMIT/ROLLBACK will run to completion.
+#
+# Test Case 2: If two XA transactions have different XIDs, ensure both phases
+# of both transactions can all execute concurrently, regardless of whether
+# XA COMMIT or XA ROLLBACK ends each transaction.
+#
+# Test Case 3: Two concurrent 2-phase XA transactions with matching XIDs should
+# run one after the other, while each transaction still allows its own two
+# phases to run concurrently.
+#
+# Test Case 4: Error Case. If an XAP errors while its XAC/R is waiting on it,
+# both the XAP and XAC/R should rollback successfully. Note this tests both:
+# a) XAC/R is waiting in group commit (first phase times out in DMLs)
+# b) XAC/R is waiting in group commit, with another XAP with a duplicate XID
+# waiting on it.
+#
+# Test Case 5: If an XAP is skipped by the replica (e.g. by incorrectly
+# setting gtid_slave_pos), and only its XAC/XAR is tried to execute, the
+# replica should report ER_XAER_NOTA.
+#
+#
+# References:
+# MDEV-31949: slow parallel replication of user xa
+#
+--source include/have_debug.inc
+--source include/have_innodb.inc
+--source include/have_binlog_format_mixed.inc
+--source include/master-slave.inc
+
+--let $xa_complete_sym= COMMIT
+--source include/rpl_xa_concurrent_2pc.inc
+
+--let $xa_complete_sym= ROLLBACK
+--source include/rpl_xa_concurrent_2pc.inc
+
+
+--echo #
+--echo # Test Case 5: If an XAP is skipped by the replica (e.g. by incorrectly
+--echo # setting gtid_slave_pos), and only its XAC/XAR is tried to execute, the
+--echo # replica should report ER_XAER_NOTA.
+
+--connection master
+create table t1 (a int) engine=innodb;
+--source include/save_master_gtid.inc
+
+--connection slave
+--source include/sync_with_master_gtid.inc
+call mtr.add_suppression("XAER_NOTA: Unknown XID");
+
+--let $i=2
+while ($i)
+{
+ --source include/stop_slave.inc
+ --replace_regex /[0-9]*-[0-9]*-[0-9]*/<value>/
+ change master to master_use_gtid = slave_pos;
+
+ --connection master
+ --let $complete=rollback
+ if ($i == 1)
+ {
+ --let $complete=commit
+ }
+ xa start '1'; insert into t1 set a=1; xa end '1'; xa prepare '1';
+ --eval xa $complete '1'
+ insert into t1 set a=2;
+ --source include/save_master_gtid.inc
+
+ --connection slave
+
+ # reposition the slave to skip one transaction from master
+ set @save_gtid_slave_pos= @@global.gtid_slave_pos;
+ SELECT CONCAT(domain_id,"-",server_id,"-", seq_no + 1)
+ into @gtid_skip
+ FROM mysql.gtid_slave_pos
+ WHERE seq_no = (SELECT DISTINCT max(seq_no) FROM mysql.gtid_slave_pos) limit 1;
+ set @@global.gtid_slave_pos = @gtid_skip;
+
+ start slave;
+ let $slave_sql_errno= 1397; # ER_XAER_NOTA
+ source include/wait_for_slave_sql_error.inc;
+ --eval select count(*) = $i % 2 as 'must be true' from t1;
+ --source include/stop_slave.inc
+
+ --disable_warnings
+ set @@global.gtid_slave_pos = @save_gtid_slave_pos;
+ --enable_warnings
+ --replace_regex /[0-9]*-[0-9]*-[0-9]*/<value>/
+ show warnings;
+ --source include/start_slave.inc
+ --source include/sync_with_master_gtid.inc
+
+ --dec $i
+}
+
+# MDEV-31949 cleanup
+--connection master
+drop table t1;
+
+--sync_slave_with_master
+
+--source include/rpl_end.inc
+--echo # End of rpl_xa_concurrent_2pc.test
diff --git a/mysql-test/suite/rpl/t/rpl_xa_empty_transaction.test b/mysql-test/suite/rpl/t/rpl_xa_empty_transaction.test
index 61cc0621d5a..f43af653ace 100644
--- a/mysql-test/suite/rpl/t/rpl_xa_empty_transaction.test
+++ b/mysql-test/suite/rpl/t/rpl_xa_empty_transaction.test
@@ -167,6 +167,95 @@ set @@global.binlog_format = row;
set @@binlog_format = @sav_binlog_format;
set @@global.binlog_format = @sav_binlog_format;
+
+# MDEV-32257 dangling XA-rollback in binlog from empty XA
+# Create a case where an XA ROLLBACK gets into the binlog while its XAP did
+# not, and try to replicate it.
+# The expected result is that both slaves error out.
+--connection server_1
+--let $binlog_start = query_get_value(SHOW MASTER STATUS, Position, 1)
+--let $binlog_file = query_get_value(SHOW MASTER STATUS, File, 1)
+create table t_not_in_binlog (a int) engine=innodb;
+flush logs;
+--source include/save_master_gtid.inc
+--let $binlog_file=query_get_value(SHOW MASTER STATUS, File, 1)
+
+# External connection XID access after disconnect is subject to race.
+# "(" open parenthesis to remember # of connection before ...
+--source include/count_sessions.inc
+
+--connect(con1,localhost,root,,)
+call mtr.add_suppression("XAER_NOTA: Unknown XID");
+
+SET sql_log_bin=0;
+XA START 'a';
+insert into t_not_in_binlog set a=1;
+XA END 'a';
+XA PREPARE 'a';
+--disconnect con1
+
+--connection server_1
+# .. ")" close parenthesis, to wait until con1 fully releases access to xid.
+--source include/wait_until_count_sessions.inc
+xa recover;
+#
+# replicate orphan XAR to server 2,3 and expect the error first
+# after that compensate it.
+
+--error 0
+XA ROLLBACK 'a';
+# cleanup at once
+drop table t_not_in_binlog;
+--source include/save_master_gtid.inc
+
+--connection server_2
+--echo XAER_NOTA: Unknown XID
+--let $slave_sql_errno= 1397
+--source include/wait_for_slave_sql_error.inc
+
+# "("
+--source include/count_sessions.inc
+
+--connect (con2,127.0.0.1,root,,test,$SERVER_MYPORT_2,)
+SET sql_log_bin=0;
+XA START 'a';
+insert into t_not_in_binlog set a=1;
+XA END 'a';
+XA PREPARE 'a';
+--disconnect con2
+
+--connection server_2
+# ")"
+--source include/wait_until_count_sessions.inc
+
+xa recover;
+--source include/start_slave.inc
+--source include/sync_with_master_gtid.inc
+
+--connection server_3
+--echo XAER_NOTA: Unknown XID
+--let $slave_sql_errno= 1397
+--source include/wait_for_slave_sql_error.inc
+
+# "("
+--source include/count_sessions.inc
+
+--connect (con3,127.0.0.1,root,,test,$SERVER_MYPORT_3,)
+SET sql_log_bin=0;
+XA START 'a';
+insert into t_not_in_binlog set a=1;
+XA END 'a';
+XA PREPARE 'a';
+--disconnect con3
+
+--connection server_3
+# ")"
+--source include/wait_until_count_sessions.inc
+
+xa recover;
+--source include/start_slave.inc
+--source include/sync_with_master_gtid.inc
+
#
# Cleanup
--connection server_1
diff --git a/mysql-test/suite/rpl/t/rpl_xa_prepare_gtid_fail.test b/mysql-test/suite/rpl/t/rpl_xa_prepare_gtid_fail.test
index aa1b088ed23..72589953ac0 100644
--- a/mysql-test/suite/rpl/t/rpl_xa_prepare_gtid_fail.test
+++ b/mysql-test/suite/rpl/t/rpl_xa_prepare_gtid_fail.test
@@ -56,8 +56,8 @@ xa start '1';
update t1 set b=b+10 where a=1;
xa end '1';
xa prepare '1';
---let $new_gtid= `SELECT @@global.gtid_binlog_pos`
xa commit '1';
+--let $new_gtid= `SELECT @@global.gtid_binlog_pos`
--source include/save_master_gtid.inc
--connection slave1
diff --git a/sql/handler.cc b/sql/handler.cc
index 1ea1818749c..7fa78e4d9b2 100644
--- a/sql/handler.cc
+++ b/sql/handler.cc
@@ -2180,13 +2180,13 @@ int ha_rollback_trans(THD *thd, bool all)
rollback without signalling following transactions. And in release
builds, we explicitly do the signalling before rolling back.
*/
- DBUG_ASSERT(
- !(thd->rgi_slave &&
- !thd->rgi_slave->worker_error &&
- thd->rgi_slave->did_mark_start_commit) ||
- (thd->transaction->xid_state.is_explicit_XA() ||
- (thd->rgi_slave->gtid_ev_flags2 & Gtid_log_event::FL_PREPARED_XA)));
-
+ DBUG_ASSERT(!(thd->rgi_slave &&
+ !thd->rgi_slave->worker_error &&
+ thd->rgi_slave->did_mark_start_commit) ||
+ (thd->transaction->xid_state.is_explicit_XA() ||
+ (thd->rgi_slave->gtid_ev_flags2 &
+ (Gtid_log_event::FL_PREPARED_XA |
+ Gtid_log_event::FL_COMPLETED_XA))));
if (thd->rgi_slave &&
!thd->rgi_slave->worker_error &&
thd->rgi_slave->did_mark_start_commit)
@@ -2343,6 +2343,15 @@ int ha_commit_or_rollback_by_xid(XID *xid, bool commit)
else
binlog_rollback_by_xid(binlog_hton, xid);
+#ifdef ENABLED_DEBUG_SYNC
+ DBUG_EXECUTE_IF(
+ "stop_after_binlog_cor_by_xid",
+ DBUG_ASSERT(!debug_sync_set_action(
+ current_thd,
+ STRING_WITH_LEN(
+ "now SIGNAL xa_cor_binlogged WAIT_FOR continue_xa_cor"))););
+#endif
+
plugin_foreach(NULL, commit ? xacommit_handlerton : xarollback_handlerton,
MYSQL_STORAGE_ENGINE_PLUGIN, &xaop);
diff --git a/sql/log.cc b/sql/log.cc
index eef8d86e4da..78a9419fb15 100644
--- a/sql/log.cc
+++ b/sql/log.cc
@@ -101,6 +101,8 @@ static int binlog_flush_cache(THD *thd, binlog_cache_mngr *cache_mngr,
Log_event *end_ev, bool all, bool using_stmt,
bool using_trx, bool is_ro_1pc);
+XID_cache_element *xid_cache_search_maybe_wait(THD *thd);
+
static const LEX_CSTRING write_error_msg=
{ STRING_WITH_LEN("error writing to the binary log") };
@@ -1751,7 +1753,8 @@ binlog_flush_cache(THD *thd, binlog_cache_mngr *cache_mngr,
if ((using_stmt && !cache_mngr->stmt_cache.empty()) ||
(using_trx && !cache_mngr->trx_cache.empty()) ||
- thd->transaction->xid_state.is_explicit_XA())
+ (thd->transaction->xid_state.is_explicit_XA() ||
+ (thd->rgi_slave && thd->rgi_slave->is_async_xac)))
{
if (using_stmt && thd->binlog_flush_pending_rows_event(TRUE, FALSE))
DBUG_RETURN(1);
@@ -1858,11 +1861,19 @@ binlog_commit_flush_trx_cache(THD *thd, bool all, binlog_cache_mngr *cache_mngr,
if (thd->lex->sql_command == SQLCOM_XA_COMMIT &&
thd->lex->xa_opt != XA_ONE_PHASE)
{
- DBUG_ASSERT(thd->transaction->xid_state.is_explicit_XA());
+ bool is_async_xac= (thd->rgi_slave && thd->rgi_slave->is_async_xac);
+ DBUG_ASSERT(thd->transaction->xid_state.is_explicit_XA() || is_async_xac);
DBUG_ASSERT(thd->transaction->xid_state.get_state_code() ==
- XA_PREPARED);
-
- buflen= serialize_with_xid(thd->transaction->xid_state.get_xid(),
+ XA_PREPARED || is_async_xac);
+ DBUG_ASSERT(is_async_xac ||
+ thd->lex->xid->eq(thd->transaction->xid_state.get_xid()));
+ /*
+ While xid_state.get_xid() is a robust method to access `xid`
+ it can't be used on slave by the asynchronously running XA-"complete".
+ In the latter case thd->lex->xid is safely accessible.
+ */
+ buflen= serialize_with_xid(is_async_xac? thd->lex->xid :
+ thd->transaction->xid_state.get_xid(),
buf, query, q_len);
}
Query_log_event end_evt(thd, buf, buflen, TRUE, TRUE, TRUE, 0);
@@ -1888,13 +1899,19 @@ binlog_rollback_flush_trx_cache(THD *thd, bool all,
const size_t q_len= sizeof(query) - 1; // do not count trailing 0
char buf[q_len + ser_buf_size]= "ROLLBACK";
size_t buflen= sizeof("ROLLBACK") - 1;
+ bool is_async_xac= false;
- if (thd->transaction->xid_state.is_explicit_XA())
+ if (thd->transaction->xid_state.is_explicit_XA() ||
+ (is_async_xac= (thd->rgi_slave && thd->rgi_slave->is_async_xac)))
{
/* for not prepared use plain ROLLBACK */
- if (thd->transaction->xid_state.get_state_code() == XA_PREPARED)
- buflen= serialize_with_xid(thd->transaction->xid_state.get_xid(),
+ if (thd->transaction->xid_state.get_state_code() == XA_PREPARED ||
+ is_async_xac)
+ {
+ buflen= serialize_with_xid(is_async_xac? thd->lex->xid :
+ thd->transaction->xid_state.get_xid(),
buf, query, q_len);
+ }
}
Query_log_event end_evt(thd, buf, buflen, TRUE, TRUE, TRUE, 0);
@@ -1985,11 +2002,71 @@ inline bool is_preparing_xa(THD *thd)
static int binlog_prepare(handlerton *hton, THD *thd, bool all)
{
+ int rc;
+
/* Do nothing unless the transaction is a user XA. */
- return is_preparing_xa(thd) ? binlog_commit(thd, all, FALSE) : 0;
+ if (is_preparing_xa(thd))
+ {
+ DBUG_EXECUTE_IF(
+ "stop_before_binlog_prepare",
+ DBUG_ASSERT(!debug_sync_set_action(
+ thd, STRING_WITH_LEN("now WAIT_FOR binlog_xap"))););
+
+ rc= binlog_commit(thd, all, FALSE);
+
+#ifdef ENABLED_DEBUG_SYNC
+ DBUG_EXECUTE_IF(
+ "stop_after_binlog_prepare",
+ DBUG_ASSERT(!debug_sync_set_action(
+ thd,
+ STRING_WITH_LEN(
+ "now SIGNAL xa_prepare_binlogged WAIT_FOR continue_xap"))););
+#endif
+ }
+ else
+ {
+ rc= 0;
+ }
+
+ return rc;
}
+/**
+  @c acquire_xid takes control, on behalf of the slave worker's THD, over an
+  xid record in the system xid cache. The implicitly provided @c xid
+  corresponds to an XA-"complete" that is being handled asynchronously.
+
+  @param  thd  the thread handler
+  @return false on success, true otherwise
+*/
+static bool acquire_xid(THD *thd)
+{
+ bool rc= false;
+
+ if (thd->rgi_slave && thd->rgi_slave->is_async_xac &&
+ thd->rgi_slave->gtid_ev_flags2 & Gtid_log_event::FL_COMPLETED_XA)
+ {
+ XID_STATE &xid_state= thd->transaction->xid_state;
+
+ auto xs= xid_cache_search_maybe_wait(thd);
+ xid_state.xid_cache_element= xs;
+ if (!xs)
+ {
+ DBUG_ASSERT(thd->is_killed());
+
+ rpl_gtid *gtid= &thd->rgi_slave->current_gtid;
+ my_error(ER_XAER_RMERR, MYF(0));
+ sql_print_error("XA COMMIT of GTID %u-%u-%ll could not complete "
+ "after having been logged into binary log",
+ gtid->domain_id, gtid->server_id, gtid->seq_no);
+ rc= true;
+ }
+ }
+
+ return rc;
+}
+
int binlog_commit_by_xid(handlerton *hton, XID *xid)
{
int rc= 0;
@@ -1997,28 +2074,38 @@ int binlog_commit_by_xid(handlerton *hton, XID *xid)
if (thd->is_current_stmt_binlog_disabled())
{
- return thd->wait_for_prior_commit();
+ rc= thd->wait_for_prior_commit();
}
+ else
+ {
+ /* the asserted state can't be reachable with xa commit */
+ DBUG_ASSERT(!thd->get_stmt_da()->is_error() ||
+ thd->get_stmt_da()->sql_errno() != ER_XA_RBROLLBACK);
+ /*
+ This is a recovered user xa transaction commit.
+ Create a "temporary" binlog transaction to write the commit record
+ into binlog.
+ */
+ THD_TRANS trans;
+ trans.ha_list= NULL;
- /* the asserted state can't be reachable with xa commit */
- DBUG_ASSERT(!thd->get_stmt_da()->is_error() ||
- thd->get_stmt_da()->sql_errno() != ER_XA_RBROLLBACK);
- /*
- This is a recovered user xa transaction commit.
- Create a "temporary" binlog transaction to write the commit record
- into binlog.
- */
- THD_TRANS trans;
- trans.ha_list= NULL;
-
- thd->ha_data[hton->slot].ha_info[1].register_ha(&trans, hton);
- thd->ha_data[binlog_hton->slot].ha_info[1].set_trx_read_write();
- (void) thd->binlog_setup_trx_data();
+ thd->ha_data[hton->slot].ha_info[1].register_ha(&trans, hton);
+ thd->ha_data[binlog_hton->slot].ha_info[1].set_trx_read_write();
+ (void) thd->binlog_setup_trx_data();
- DBUG_ASSERT(thd->lex->sql_command == SQLCOM_XA_COMMIT);
+ DBUG_ASSERT(thd->lex->sql_command == SQLCOM_XA_COMMIT);
- rc= binlog_commit(thd, TRUE, FALSE);
- thd->ha_data[binlog_hton->slot].ha_info[1].reset();
+ rc= binlog_commit(thd, TRUE, FALSE);
+ thd->ha_data[binlog_hton->slot].ha_info[1].reset();
+ }
+ if (!rc)
+ {
+ rc= acquire_xid(thd);
+ }
+ if (thd->is_current_stmt_binlog_disabled())
+ {
+ thd->wakeup_subsequent_commits(rc);
+ }
return rc;
}
@@ -2031,33 +2118,54 @@ int binlog_rollback_by_xid(handlerton *hton, XID *xid)
if (thd->is_current_stmt_binlog_disabled())
{
- return thd->wait_for_prior_commit();
+ rc= thd->wait_for_prior_commit();
}
+ else if (thd->get_stmt_da()->is_error() &&
+ thd->get_stmt_da()->sql_errno() == ER_XA_RBROLLBACK)
+ rc= true;
+ else
+ {
+ THD_TRANS trans;
+ trans.ha_list= NULL;
- if (thd->get_stmt_da()->is_error() &&
- thd->get_stmt_da()->sql_errno() == ER_XA_RBROLLBACK)
- return rc;
-
- THD_TRANS trans;
- trans.ha_list= NULL;
-
- thd->ha_data[hton->slot].ha_info[1].register_ha(&trans, hton);
- thd->ha_data[hton->slot].ha_info[1].set_trx_read_write();
- (void) thd->binlog_setup_trx_data();
+ thd->ha_data[hton->slot].ha_info[1].register_ha(&trans, hton);
+ thd->ha_data[hton->slot].ha_info[1].set_trx_read_write();
+ (void) thd->binlog_setup_trx_data();
- DBUG_ASSERT(thd->lex->sql_command == SQLCOM_XA_ROLLBACK ||
- (thd->transaction->xid_state.get_state_code() == XA_ROLLBACK_ONLY));
+ DBUG_ASSERT(thd->lex->sql_command == SQLCOM_XA_ROLLBACK ||
+ (thd->transaction->xid_state.get_state_code() == XA_ROLLBACK_ONLY));
- rc= binlog_rollback(hton, thd, TRUE);
- thd->ha_data[hton->slot].ha_info[1].reset();
+ rc= binlog_rollback(hton, thd, TRUE);
+ thd->ha_data[hton->slot].ha_info[1].reset();
+ }
+ if (!rc)
+ {
+ rc= acquire_xid(thd);
+ }
+ if (thd->is_current_stmt_binlog_disabled())
+ {
+ thd->wakeup_subsequent_commits(rc);
+ }
return rc;
}
-
+/**
+ @param thd thread handler
+   @return true  when thd carries an XA transaction in prepared state,
+                 or the XA transaction is being completed asynchronously
+                 by a slave parallel worker thread running its "COMPLETE";
+           false otherwise
+*/
inline bool is_prepared_xa(THD *thd)
{
- return thd->transaction->xid_state.is_explicit_XA() &&
+ bool is_async_xac= (thd->rgi_slave && thd->rgi_slave->is_async_xac);
+ DBUG_ASSERT(!is_async_xac ||
+ thd->lex->sql_command == SQLCOM_XA_ROLLBACK ||
+ thd->lex->sql_command == SQLCOM_XA_COMMIT);
+
+ return is_async_xac ? true :
+ thd->transaction->xid_state.is_explicit_XA() &&
thd->transaction->xid_state.get_state_code() == XA_PREPARED;
}
@@ -2185,7 +2293,9 @@ int binlog_commit(THD *thd, bool all, bool ro_1pc)
}
if (cache_mngr->trx_cache.empty() &&
- (thd->transaction->xid_state.get_state_code() != XA_PREPARED ||
+ ((thd->transaction->xid_state.get_state_code() != XA_PREPARED &&
+ !(thd->rgi_slave && thd->rgi_slave->is_parallel_exec &&
+ thd->lex->sql_command == SQLCOM_XA_COMMIT)) ||
!(thd->ha_data[binlog_hton->slot].ha_info[1].is_started() &&
thd->ha_data[binlog_hton->slot].ha_info[1].is_trx_read_write())))
{
@@ -2279,7 +2389,9 @@ static int binlog_rollback(handlerton *hton, THD *thd, bool all)
}
if (!cache_mngr->trx_cache.has_incident() && cache_mngr->trx_cache.empty() &&
- (thd->transaction->xid_state.get_state_code() != XA_PREPARED ||
+ ((thd->transaction->xid_state.get_state_code() != XA_PREPARED &&
+ !(thd->rgi_slave && thd->rgi_slave->is_parallel_exec &&
+ thd->lex->sql_command == SQLCOM_XA_ROLLBACK)) ||
!(thd->ha_data[binlog_hton->slot].ha_info[1].is_started() &&
thd->ha_data[binlog_hton->slot].ha_info[1].is_trx_read_write())))
{
@@ -8382,7 +8494,9 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader)
*/
DBUG_ASSERT(!cache_mngr->stmt_cache.empty() ||
!cache_mngr->trx_cache.empty() ||
- current->thd->transaction->xid_state.is_explicit_XA());
+ (current->thd->transaction->xid_state.is_explicit_XA() ||
+ (current->thd->rgi_slave &&
+ current->thd->rgi_slave->is_async_xac)));
if (unlikely((current->error= write_transaction_or_stmt(current,
commit_id))))
@@ -10510,13 +10624,20 @@ int TC_LOG_BINLOG::unlog_xa_prepare(THD *thd, bool all)
binlog_cache_mngr *cache_mngr= thd->binlog_setup_trx_data();
int cookie= 0;
+ int rc= 0;
+
+ if (thd->rgi_slave && thd->is_current_stmt_binlog_disabled())
+ {
+ rc= thd->wait_for_prior_commit();
+ if (rc == 0)
+ thd->wakeup_subsequent_commits(rc);
+ return rc;
+ }
if (!cache_mngr->need_unlog)
{
Ha_trx_info *ha_info;
uint rw_count= ha_count_rw_all(thd, &ha_info);
- bool rc= false;
-
/*
This transaction has not been binlogged as indicated by need_unlog.
Such exceptional cases include transactions with no effect to engines,
diff --git a/sql/log_event_server.cc b/sql/log_event_server.cc
index 92ff401a260..230a7a4667f 100644
--- a/sql/log_event_server.cc
+++ b/sql/log_event_server.cc
@@ -3314,16 +3314,22 @@ Gtid_log_event::Gtid_log_event(THD *thd_arg, uint64 seq_no_arg,
XID_STATE &xid_state= thd->transaction->xid_state;
if (is_transactional)
{
- if (xid_state.is_explicit_XA() &&
- (thd->lex->sql_command == SQLCOM_XA_PREPARE ||
- xid_state.get_state_code() == XA_PREPARED))
+ bool is_async_xac= false;
+ if ((xid_state.is_explicit_XA() &&
+ (thd->lex->sql_command == SQLCOM_XA_PREPARE ||
+ xid_state.get_state_code() == XA_PREPARED)) ||
+ (is_async_xac= (thd->rgi_slave && thd->rgi_slave->is_async_xac)))
{
DBUG_ASSERT(!(thd->lex->sql_command == SQLCOM_XA_COMMIT &&
thd->lex->xa_opt == XA_ONE_PHASE));
+ DBUG_ASSERT(!is_async_xac ||
+ thd->lex->sql_command == SQLCOM_XA_ROLLBACK ||
+ thd->lex->sql_command == SQLCOM_XA_COMMIT);
flags2|= thd->lex->sql_command == SQLCOM_XA_PREPARE ?
FL_PREPARED_XA : FL_COMPLETED_XA;
- xid.set(xid_state.get_xid());
+ xid.set(is_async_xac? thd->lex->xid :
+ thd->transaction->xid_state.get_xid());
}
/* count non-zero extra recoverable engines; total = extra + 1 */
if (has_xid)
@@ -4172,9 +4178,6 @@ int XA_prepare_log_event::do_commit()
thd->lex->xid= &xid;
if (!one_phase)
{
- if ((res= thd->wait_for_prior_commit()))
- return res;
-
thd->lex->sql_command= SQLCOM_XA_PREPARE;
res= trans_xa_prepare(thd);
}
diff --git a/sql/mysqld.cc b/sql/mysqld.cc
index c0dd56ab3d5..0f76dcc2351 100644
--- a/sql/mysqld.cc
+++ b/sql/mysqld.cc
@@ -1055,7 +1055,7 @@ PSI_cond_key key_BINLOG_COND_xid_list,
key_BINLOG_COND_queue_busy;
PSI_cond_key key_RELAYLOG_COND_relay_log_updated,
key_RELAYLOG_COND_bin_log_updated, key_COND_wakeup_ready,
- key_COND_wait_commit;
+ key_COND_wait_commit, key_COND_wait_commit_dep;
PSI_cond_key key_RELAYLOG_COND_queue_busy;
PSI_cond_key key_TC_LOG_MMAP_COND_queue_busy;
PSI_cond_key key_COND_rpl_thread_queue, key_COND_rpl_thread,
@@ -1083,6 +1083,7 @@ static PSI_cond_info all_server_conds[]=
{ &key_RELAYLOG_COND_queue_busy, "MYSQL_RELAY_LOG::COND_queue_busy", 0},
{ &key_COND_wakeup_ready, "THD::COND_wakeup_ready", 0},
{ &key_COND_wait_commit, "wait_for_commit::COND_wait_commit", 0},
+  { &key_COND_wait_commit_dep, "wait_for_commit::COND_wait_commit_dep", 0},
{ &key_COND_cache_status_changed, "Query_cache::COND_cache_status_changed", 0},
{ &key_COND_manager, "COND_manager", PSI_FLAG_GLOBAL},
{ &key_COND_server_started, "COND_server_started", PSI_FLAG_GLOBAL},
@@ -9224,6 +9225,7 @@ PSI_stage_info stage_waiting_for_deadlock_kill= { 0, "Waiting for parallel repli
PSI_stage_info stage_starting= { 0, "starting", 0};
PSI_stage_info stage_waiting_for_flush= { 0, "Waiting for non trans tables to be flushed", 0};
PSI_stage_info stage_waiting_for_ddl= { 0, "Waiting for DDLs", 0};
+PSI_stage_info stage_waiting_for_prior_xa_transaction= { 0, "Waiting for prior xa transaction", 0};
PSI_memory_key key_memory_DATE_TIME_FORMAT;
PSI_memory_key key_memory_DDL_LOG_MEMORY_ENTRY;
diff --git a/sql/mysqld.h b/sql/mysqld.h
index fc8afa06638..4245a1a5e10 100644
--- a/sql/mysqld.h
+++ b/sql/mysqld.h
@@ -376,7 +376,7 @@ extern PSI_cond_key key_BINLOG_COND_xid_list, key_BINLOG_update_cond,
key_COND_start_thread;
extern PSI_cond_key key_RELAYLOG_COND_relay_log_updated,
key_RELAYLOG_COND_bin_log_updated, key_COND_wakeup_ready,
- key_COND_wait_commit;
+ key_COND_wait_commit, key_COND_wait_commit_dep;
extern PSI_cond_key key_RELAYLOG_COND_queue_busy;
extern PSI_cond_key key_TC_LOG_MMAP_COND_queue_busy;
extern PSI_cond_key key_COND_rpl_thread, key_COND_rpl_thread_queue,
@@ -679,7 +679,7 @@ extern PSI_stage_info stage_slave_background_process_request;
extern PSI_stage_info stage_slave_background_wait_request;
extern PSI_stage_info stage_waiting_for_deadlock_kill;
extern PSI_stage_info stage_starting;
-
+extern PSI_stage_info stage_waiting_for_prior_xa_transaction;
#ifdef HAVE_PSI_STATEMENT_INTERFACE
/**
Statement instrumentation keys (sql).
diff --git a/sql/rpl_parallel.cc b/sql/rpl_parallel.cc
index c26263401b8..2653e294590 100644
--- a/sql/rpl_parallel.cc
+++ b/sql/rpl_parallel.cc
@@ -9,6 +9,8 @@
#ifdef WITH_WSREP
#include "wsrep_trans_observer.h"
#endif
+#include <algorithm>
+using std::max;
/*
Code for optional parallel execution of replicated events on the slave.
@@ -760,7 +762,8 @@ convert_kill_to_deadlock_error(rpl_group_info *rgi)
return;
err_code= thd->get_stmt_da()->sql_errno();
if ((rgi->speculation == rpl_group_info::SPECULATE_OPTIMISTIC &&
- err_code != ER_PRIOR_COMMIT_FAILED) ||
+ (err_code != ER_PRIOR_COMMIT_FAILED &&
+ err_code != ER_XAER_NOTA)) ||
((err_code == ER_QUERY_INTERRUPTED || err_code == ER_CONNECTION_KILLED) &&
rgi->killed_for_retry))
{
@@ -2364,16 +2367,9 @@ rpl_parallel_entry::choose_thread(rpl_group_info *rgi, bool *did_enter_cond,
idx= rpl_thread_idx;
if (gtid_ev)
{
- if (gtid_ev->flags2 &
- (Gtid_log_event::FL_COMPLETED_XA | Gtid_log_event::FL_PREPARED_XA))
- idx= my_hash_sort(&my_charset_bin, gtid_ev->xid.key(),
- gtid_ev->xid.key_length()) % rpl_thread_max;
- else
- {
- ++idx;
- if (idx >= rpl_thread_max)
- idx= 0;
- }
+ ++idx;
+ if (idx >= rpl_thread_max)
+ idx= 0;
rpl_thread_idx= idx;
}
thr= rpl_threads[idx];
@@ -2467,6 +2463,7 @@ free_rpl_parallel_entry(void *element)
}
mysql_cond_destroy(&e->COND_parallel_entry);
mysql_mutex_destroy(&e->LOCK_parallel_entry);
+ e->concurrent_xaps_window.~Dynamic_array();
my_free(e);
}
@@ -2521,6 +2518,19 @@ rpl_parallel::find(uint32 domain_id)
e->domain_id= domain_id;
e->stop_on_error_sub_id= (uint64)ULONGLONG_MAX;
e->pause_sub_id= (uint64)ULONGLONG_MAX;
+
+ e->concurrent_xaps_window.init((PSI_memory_key) PSI_INSTRUMENT_ME,
+ max((decltype(e->rpl_thread_max)) 2,
+ 2*e->rpl_thread_max));
+ e->cxap_lhs= e->cxap_rhs= 0;
+
+ /*
+    Zero-initialize each element.
+ */
+ for (size_t i= 0; i < e->concurrent_xaps_window.max_size(); i++)
+ {
+ e->concurrent_xaps_window.at(i)= {0, 0};
+ }
mysql_mutex_init(key_LOCK_parallel_entry, &e->LOCK_parallel_entry,
MY_MUTEX_INIT_FAST);
mysql_cond_init(key_COND_parallel_entry, &e->COND_parallel_entry, NULL);
@@ -2798,6 +2808,90 @@ abandon_worker_thread(THD *thd, rpl_parallel_thread *cur_thread,
mysql_cond_signal(&cur_thread->COND_rpl_thread);
}
+/**
+  Check the concurrency status of @c xid against those currently in progress.
+  Any new @c xid of an XA-prepare (@c is_xap is true then) is appended to a
+  sliding window implemented as a circular buffer. The return value is
+  computed by searching the window.
+
+  @param  e    parallel entry pointer
+  @param  xid  a pointer to the xid of either an XA-prepare or an XA-"complete"
+ @param is_xap
+ true when xid belongs to XA-prepare
+ @return true when there exists a duplicate xid hash value,
+ false otherwise.
+*/
+static bool
+handle_xa_prepera_duplicate_xid(rpl_parallel_entry *e, XID *xid, bool is_xap)
+{
+ DBUG_ASSERT(e->current_group_info ||
+ (e->count_queued_event_groups == 0 &&
+ e->cxap_lhs == e->cxap_rhs && e->cxap_lhs == 0));
+ DBUG_ASSERT(xid);
+ DBUG_ASSERT(!xid->is_null());
+ DBUG_ASSERT(xid->key());
+ DBUG_ASSERT(xid->key_length());
+
+ uint64 curr_event_count= e->count_queued_event_groups;
+ uint32 i;
+ bool rc= false;
+ /*
+ We've seen XAP's before, so move the LHS up to a relevant spot.
+ LHS = RHS indicates the empty buffer (which implies RHS is exclusive "edge"
+ of the window.
+ Otherwise RHS always points to a free cell of which one at least must
+ exist at this point.
+ While transaction disribution is Round-robin, potential conflicts with
+ the current input xid can come only from
+ the preceeding 2*|W| - 1 xids, the 2*|W|th in the past is safe.
+ */
+ for (i= e->cxap_lhs; i != e->cxap_rhs;
+ i= (i+1) % (e->concurrent_xaps_window.max_size()))
+ {
+ uint64 old_event_count= e->concurrent_xaps_window.at(i).second;
+ uint64 queued_event_diff= curr_event_count - old_event_count;
+ if (queued_event_diff >= e->rpl_thread_max)
+ {
+ /*
+ Squeeze the window from the left
+ as this XAP can't run in parallel with us.
+ */
+ e->cxap_lhs= (i+1) % (e->concurrent_xaps_window.max_size());
+ }
+ else
+ {
+ // new LHS is determined
+ DBUG_ASSERT(e->cxap_lhs != e->cxap_rhs);
+ break;
+ }
+ }
+
+ std::size_t xid_hash= std::hash<XID>{}(*xid);
+ for (; i != e->cxap_rhs; i= (i+1) % (e->concurrent_xaps_window.max_size()))
+ {
+ std::size_t old_xid_hash= e->concurrent_xaps_window.at(i).first;
+ if (old_xid_hash == xid_hash)
+ {
+ rc= true;
+ break;
+ }
+ }
+
+ // Add the XAP to the sliding window
+ if (is_xap)
+ {
+ e->concurrent_xaps_window.at(e->cxap_rhs).first= xid_hash;
+ e->concurrent_xaps_window.at(e->cxap_rhs).second= curr_event_count;
+ e->cxap_rhs= (e->cxap_rhs + 1) % (e->concurrent_xaps_window.max_size());
+ if (e->cxap_rhs == e->cxap_lhs)
+ {
+ // the entire array is full therefore the lhs has become stale
+ e->cxap_lhs= (e->cxap_lhs + 1) % (e->concurrent_xaps_window.max_size());
+ }
+ }
+
+ return rc;
+}
/*
do_event() is executed by the sql_driver_thd thread.
@@ -3046,6 +3140,18 @@ rpl_parallel::do_event(rpl_group_info *serial_rgi, Log_event *ev,
new_gco= true;
force_switch_flag= 0;
gco= e->current_gco;
+ /*
+      Take care of duplicate xids in XA-prepare; an XA-"complete" should not
+      race its XA-prepare parent either. When the current transaction's xid
+      has been seen and that transaction may still be in progress, this event
+      group gets flagged to wait for prior commits at the start of execution.
+ */
+ if ((gtid_flags & (Gtid_log_event::FL_PREPARED_XA |
+ Gtid_log_event::FL_COMPLETED_XA)) &&
+        handle_xa_prepera_duplicate_xid(e, &gtid_ev->xid,
+ gtid_flags &
+ Gtid_log_event::FL_PREPARED_XA))
+ gtid_flags &= ~Gtid_log_event::FL_ALLOW_PARALLEL;
if (likely(gco))
{
uint8 flags= gco->flags;
diff --git a/sql/rpl_parallel.h b/sql/rpl_parallel.h
index 0b1d3cf9d80..b76d6749f6d 100644
--- a/sql/rpl_parallel.h
+++ b/sql/rpl_parallel.h
@@ -324,6 +324,14 @@ struct rpl_parallel_thread_pool {
void release_thread(rpl_parallel_thread *rpt);
};
+template <>
+struct std::hash<XID>
+{
+ std::size_t operator()(const XID& xid) const
+ {
+ return my_hash_sort(&my_charset_bin, xid.key(), xid.key_length());
+ }
+};
struct rpl_parallel_entry {
mysql_mutex_t LOCK_parallel_entry;
@@ -419,6 +427,14 @@ struct rpl_parallel_entry {
/* The group_commit_orderer object for the events currently being queued. */
group_commit_orderer *current_gco;
+ /*
+    Circular buffer, sized to twice the number of parallel worker threads, to
+    hold hashes of XIDs of XA-prepare event groups that may be processed
+    concurrently. See how handle_xa_prepera_duplicate_xid operates on it.
+ */
+ Dynamic_array<std::pair<std::size_t, uint32>> concurrent_xaps_window;
+ uint32 cxap_lhs, cxap_rhs;
+
rpl_parallel_thread * choose_thread(rpl_group_info *rgi, bool *did_enter_cond,
PSI_stage_info *old_stage,
Gtid_log_event *gtid_ev);
diff --git a/sql/rpl_rli.cc b/sql/rpl_rli.cc
index 375fb1d1c58..fdeb0d67872 100644
--- a/sql/rpl_rli.cc
+++ b/sql/rpl_rli.cc
@@ -2155,12 +2155,14 @@ rpl_group_info::reinit(Relay_log_info *rli)
gtid_ignore_duplicate_state= GTID_DUPLICATE_NULL;
speculation= SPECULATE_NO;
commit_orderer.reinit();
+ is_async_xac= false;
}
rpl_group_info::rpl_group_info(Relay_log_info *rli)
: thd(0), wait_commit_sub_id(0),
wait_commit_group_info(0), parallel_entry(0),
- deferred_events(NULL), m_annotate_event(0), is_parallel_exec(false)
+ deferred_events(NULL), m_annotate_event(0), is_parallel_exec(false),
+ is_async_xac(false)
{
reinit(rli);
bzero(&current_gtid, sizeof(current_gtid));
@@ -2291,7 +2293,7 @@ void rpl_group_info::cleanup_context(THD *thd, bool error)
if (thd->transaction->xid_state.is_explicit_XA() &&
thd->transaction->xid_state.get_state_code() != XA_PREPARED)
xa_trans_force_rollback(thd);
-
+ is_async_xac= false;
thd->release_transactional_locks();
if (thd == rli->sql_driver_thd)
diff --git a/sql/rpl_rli.h b/sql/rpl_rli.h
index cc807852bf2..06eef910de4 100644
--- a/sql/rpl_rli.h
+++ b/sql/rpl_rli.h
@@ -835,6 +835,14 @@ struct rpl_group_info
};
uchar killed_for_retry;
+ /*
+ When true, indicates that the user XA transaction is going to be
+ completed (with COMMIT or ROLLBACK) by this worker thread *while*
+ another worker is still preparing it. Once the latter is done,
+ the xid will be acquired and the flag gets reset.
+ */
+ bool is_async_xac;
+
rpl_group_info(Relay_log_info *rli_);
~rpl_group_info();
void reinit(Relay_log_info *rli);
diff --git a/sql/sql_array.h b/sql/sql_array.h
index 8610e971016..c79c0c257a0 100644
--- a/sql/sql_array.h
+++ b/sql/sql_array.h
@@ -137,7 +137,7 @@ template <class Elem> class Dynamic_array
*/
Elem& at(size_t idx)
{
- DBUG_ASSERT(idx < array.elements);
+ DBUG_ASSERT(idx < max_size());
return *(((Elem*)array.buffer) + idx);
}
/// Const variant of at(), which cannot change data
@@ -172,6 +172,8 @@ template <class Elem> class Dynamic_array
size_t size() const { return array.elements; }
+ size_t max_size() const { return array.max_element; }
+
const Elem *end() const
{
return back() + 1;
diff --git a/sql/sql_class.cc b/sql/sql_class.cc
index 1b78f88bd3c..04338831cd7 100644
--- a/sql/sql_class.cc
+++ b/sql/sql_class.cc
@@ -7960,6 +7960,7 @@ wait_for_commit::wait_for_commit()
{
mysql_mutex_init(key_LOCK_wait_commit, &LOCK_wait_commit, MY_MUTEX_INIT_FAST);
mysql_cond_init(key_COND_wait_commit, &COND_wait_commit, 0);
+ mysql_cond_init(key_COND_wait_commit_dep, &COND_wait_commit_dep, 0);
reinit();
}
@@ -7989,6 +7990,7 @@ wait_for_commit::~wait_for_commit()
mysql_mutex_destroy(&LOCK_wait_commit);
mysql_cond_destroy(&COND_wait_commit);
+ mysql_cond_destroy(&COND_wait_commit_dep);
}
diff --git a/sql/sql_class.h b/sql/sql_class.h
index e6580d2432c..72ca39c7710 100644
--- a/sql/sql_class.h
+++ b/sql/sql_class.h
@@ -2367,6 +2367,15 @@ struct wait_for_commit
event group is fully done.
*/
bool wakeup_blocked;
+ /*
+ The condition variable serves as part of the facilities for handling
+ additional commit-time dependencies between groups of replication events,
+ e.g. XA-Prepare -> XA-Commit, or XA-Prepare -> XA-Prepare with the same xid.
+ */
+ mysql_cond_t COND_wait_commit_dep;
+#ifndef DBUG_OFF
+ bool debug_done;
+#endif
void register_wait_for_prior_commit(wait_for_commit *waitee);
int wait_for_prior_commit(THD *thd, bool allow_kill=true)
diff --git a/sql/xa.cc b/sql/xa.cc
index 9df9da7acf1..3d1a7a2360a 100644
--- a/sql/xa.cc
+++ b/sql/xa.cc
@@ -22,7 +22,7 @@
#include "my_cpu.h"
#include <pfs_transaction_provider.h>
#include <mysql/psi/mysql_transaction.h>
-
+#include "rpl_rli.h" // rpl_group_info
static bool slave_applier_reset_xa_trans(THD *thd);
/***************************************************************************
@@ -79,6 +79,10 @@ class XID_cache_element
uint rm_error;
enum xa_states xa_state;
XID xid;
+ /* parallel slave worker waiters. `c` stands for complete, `p` prepare */
+ std::atomic<wait_for_commit *> c_waiter; // set by asynch run xa-"complete"
+ std::atomic<wait_for_commit *> p_waiter; // set by duplicate xid xa-start
+
bool is_set(int32_t flag)
{ return m_state.load(std::memory_order_relaxed) & flag; }
void set(int32_t flag)
@@ -134,6 +138,7 @@ class XID_cache_element
element->rm_error= 0;
element->xa_state= new_element->xa_state;
element->xid.set(new_element->xid);
+ element->c_waiter= element->p_waiter= NULL;
new_element->xid_cache_element= element;
}
static void lf_alloc_constructor(uchar *ptr)
@@ -243,7 +248,7 @@ void xid_cache_free()
Find recovered XA transaction by XID.
*/
-static XID_cache_element *xid_cache_search(THD *thd, XID *xid)
+XID_cache_element *xid_cache_search(THD *thd, XID *xid)
{
DBUG_ASSERT(thd->xid_hash_pins);
XID_cache_element *element=
@@ -254,16 +259,221 @@ static XID_cache_element *xid_cache_search(THD *thd, XID *xid)
/* The element can be removed from lf_hash by other thread, but
element->acquire_recovered() will return false in this case. */
if (!element->acquire_recovered())
+ {
element= 0;
+ if (thd->rgi_slave && thd->rgi_slave->is_parallel_exec)
+ {
+ DBUG_ASSERT(thd->lex->sql_command == SQLCOM_XA_COMMIT ||
+ thd->lex->sql_command == SQLCOM_XA_ROLLBACK);
+ thd->rgi_slave->is_async_xac= true;
+ }
+ }
lf_hash_search_unpin(thd->xid_hash_pins);
/* Once the element is acquired (i.e. got the ACQUIRED bit) by this thread,
only this thread can delete it. The deletion happens in xid_cache_delete().
See also the XID_cache_element documentation. */
DEBUG_SYNC(thd, "xa_after_search");
}
+ else if (thd->rgi_slave && thd->rgi_slave->is_parallel_exec)
+ {
+ DBUG_ASSERT(thd->lex->sql_command == SQLCOM_XA_COMMIT ||
+ thd->lex->sql_command == SQLCOM_XA_ROLLBACK);
+ }
+
return element;
}
+const int SPIN_MAX= 20;
+/**
+ The function retries inserting an xid into the system hash until it succeeds.
+ Retrying can be caused solely by the existence of an earlier transaction with
+ a duplicate xid.
+ Analogously to @c xid_cache_search_maybe_wait, it expects that the duplicate
+ xid in the way will eventually be deleted from the hash.
+
+ @param thd thread handler
+ @return false on success,
+ true otherwise.
+*/
+bool xid_cache_insert_maybe_wait(THD* thd)
+{
+ int i= 0;
+ bool rc;
+
+ do
+ {
+ if ((rc= xid_cache_insert(thd, &thd->transaction->xid_state, thd->lex->xid)))
+ ut_delay(1 + i++);
+ }
+ while (rc && i < SPIN_MAX);
+
+ if (rc)
+ {
+ wait_for_commit *waiter= NULL;
+ XID *xid= thd->lex->xid;
+ XID_cache_element *element=
+ (XID_cache_element*) lf_hash_search(&xid_cache, thd->xid_hash_pins,
+ xid->key(), xid->key_length());
+ if (element)
+ {
+ PSI_stage_info old_stage;
+ wait_for_commit *exp= NULL, *waiter= thd->wait_for_commit_ptr;
+#ifndef DBUG_OFF
+ waiter->debug_done= false;
+#endif
+
+ while (unlikely(!element->
+ p_waiter.compare_exchange_weak(exp, waiter,
+ std::memory_order_acq_rel)))
+ {
+ if (exp)
+ {
+ DBUG_ASSERT(exp != waiter);
+ waiter= NULL; // notifier is seen
+
+ break;
+ }
+ else
+ {
+ (void) LF_BACKOFF();
+ }
+ }
+ lf_hash_search_unpin(thd->xid_hash_pins);
+
+ if (waiter) // notifier was not seen
+ {
+ mysql_mutex_lock(&waiter->LOCK_wait_commit);
+ thd->ENTER_COND(&waiter->COND_wait_commit_dep, &waiter->LOCK_wait_commit,
+ &stage_waiting_for_prior_xa_transaction,
+ &old_stage);
+ if ((element=
+ (XID_cache_element*) lf_hash_search(&xid_cache, thd->xid_hash_pins,
+ xid->key(), xid->key_length())))
+ {
+ lf_hash_search_unpin(thd->xid_hash_pins);
+ mysql_cond_wait(&waiter->COND_wait_commit_dep,
+ &waiter->LOCK_wait_commit);
+
+ DBUG_ASSERT(waiter->debug_done || thd->check_killed(1));
+ }
+ thd->EXIT_COND(&old_stage);
+ }
+ }
+ if (!(rc= thd->check_killed(1)))
+ {
+ // (element && waiter == NULL) indicates the duplicate xid is coming
+ do
+ rc= xid_cache_insert(thd, &thd->transaction->xid_state, thd->lex->xid);
+ while (rc && element && !waiter && (ut_delay(1), true));
+ }
+ }
+
+ return rc;
+}
+
+/**
+ XA-"complete" run by parallel slave gets access to its xid.
+ Analogously to @c xid_cache_insert_maybe_wait, it is expecting, here its, xid
+ supplied through the THD argument, will be soon (the parent XAP has already
+ waken up transactions before the current one) released for acquisition.
+
+ @param thd thread handler
+ @return XID_cache_element pointer or NULL when the search is interruped
+ by kill.
+*/
+XID_cache_element * xid_cache_search_maybe_wait(THD* thd)
+{
+ if (thd->fix_xid_hash_pins())
+ {
+ my_error(ER_OUT_OF_RESOURCES, MYF(0));
+ return NULL;
+ }
+
+ XID_cache_element *xs;
+ XID *xid= thd->lex->xid;
+ int i= 0;
+ do
+ {
+ if (!(xs= xid_cache_search(thd, thd->lex->xid)))
+ ut_delay(1 + i++);
+ }
+ while (!xs && i < SPIN_MAX);
+
+ if (!xs)
+ {
+ XID_cache_element *element=
+ (XID_cache_element*) lf_hash_search(&xid_cache, thd->xid_hash_pins,
+ xid->key(), xid->key_length());
+ if (element)
+ {
+ lf_hash_search_unpin(thd->xid_hash_pins);
+ if (!element->acquire_recovered())
+ {
+ wait_for_commit *exp= NULL, *waiter= thd->wait_for_commit_ptr;
+ bool waiter_done= true; // assumption
+
+
+ /*
+ Set itself to wait for the xid owner while taking care of a race with it
+ on marking the xid element. When the element is found to be marked not
+ by us, that indicates the xid has been released.
+ */
+ while (unlikely(!element->
+ c_waiter.
+ compare_exchange_weak(exp, waiter,
+ std::memory_order_acq_rel)))
+ {
+ if (exp)
+ {
+ DBUG_ASSERT(exp != waiter);
+ waiter= NULL; // notifier is seen
+
+ break;
+ }
+ else
+ {
+ (void) LF_BACKOFF();
+ }
+ }
+
+ if (waiter) // notifier was not seen
+ {
+ PSI_stage_info old_stage;
+ mysql_mutex_lock(&waiter->LOCK_wait_commit);
+ thd->ENTER_COND(&waiter->COND_wait_commit_dep, &waiter->LOCK_wait_commit,
+ &stage_waiting_for_prior_xa_transaction,
+ &old_stage);
+ if (element->c_waiter.load(std::memory_order_relaxed) &&
+ likely(!thd->check_killed(1)))
+ mysql_cond_wait(&waiter->COND_wait_commit_dep,
+ &waiter->LOCK_wait_commit);
+
+ if (element->c_waiter.load(std::memory_order_relaxed))
+ {
+ waiter_done= false;
+ DBUG_ASSERT(thd->check_killed(1));
+ }
+ thd->EXIT_COND(&old_stage);
+ }
+
+ if (waiter_done &&
+ likely(element->is_set(XID_cache_element::RECOVERED |
+ XID_cache_element::ACQUIRED)))
+ xs= element;
+ else
+ goto end;
+ }
+ else
+ {
+ xs= element;
+ }
+ }
+ }
+
+end:
+ return xs;
+}
+
bool xid_cache_insert(XID *xid)
{
@@ -302,7 +512,8 @@ bool xid_cache_insert(THD *thd, XID_STATE *xid_state, XID *xid)
xid_state->xid_cache_element->set(XID_cache_element::ACQUIRED);
break;
case 1:
- my_error(ER_XAER_DUPID, MYF(0));
+ if (!(thd->rgi_slave && thd->rgi_slave->is_parallel_exec))
+ my_error(ER_XAER_DUPID, MYF(0));
}
return res;
}
@@ -311,9 +522,39 @@ bool xid_cache_insert(THD *thd, XID_STATE *xid_state, XID *xid)
static void xid_cache_delete(THD *thd, XID_cache_element *&element)
{
DBUG_ASSERT(thd->xid_hash_pins);
+
element->mark_uninitialized();
+ wait_for_commit *waiter= NULL;
+ if (thd->rgi_slave && thd->rgi_slave->is_parallel_exec)
+ {
+ wait_for_commit *notifier= &thd->rgi_slave->commit_orderer;
+ while (unlikely(!element->
+ p_waiter.compare_exchange_weak(waiter, notifier,
+ std::memory_order_acq_rel)))
+ {
+ if (waiter)
+ {
+ DBUG_ASSERT(notifier != waiter);
+
+ break;
+ }
+ else
+ {
+ (void) LF_BACKOFF();
+ }
+ }
+ }
lf_hash_delete(&xid_cache, thd->xid_hash_pins,
element->xid.key(), element->xid.key_length());
+ if (waiter)
+ {
+ mysql_mutex_lock(&waiter->LOCK_wait_commit);
+#ifndef DBUG_OFF
+ waiter->debug_done= true;
+#endif
+ mysql_cond_signal(&waiter->COND_wait_commit_dep);
+ mysql_mutex_unlock(&waiter->LOCK_wait_commit);
+ }
}
@@ -456,7 +697,23 @@ bool trans_xa_start(THD *thd)
else if (!trans_begin(thd))
{
MYSQL_SET_TRANSACTION_XID(thd->m_transaction_psi, thd->lex->xid, XA_ACTIVE);
- if (xid_cache_insert(thd, &thd->transaction->xid_state, thd->lex->xid))
+
+ bool parallel_slave_xap_status= true; // `true` presumes ordinary XA START.
+ if (thd->rgi_slave &&
+ thd->rgi_slave->is_parallel_exec)
+ {
+ DBUG_ASSERT(thd->rgi_slave->gtid_ev_flags2 &
+ Gtid_log_event::FL_PREPARED_XA);
+ /*
+ The status is normally refined below to flip to `false`, which
+ designates that the xid insert is done.
+ A wait may be incurred when the xid is a duplicate.
+ */
+ parallel_slave_xap_status= xid_cache_insert_maybe_wait(thd);
+ }
+
+ if (parallel_slave_xap_status &&
+ xid_cache_insert(thd, &thd->transaction->xid_state, thd->lex->xid))
{
trans_rollback(thd);
DBUG_RETURN(true);
@@ -602,12 +859,19 @@ bool trans_xa_commit(THD *thd)
my_error(ER_OUT_OF_RESOURCES, MYF(0));
DBUG_RETURN(TRUE);
}
+ DBUG_ASSERT(!thd->rgi_slave || !thd->rgi_slave->is_async_xac);
- if (auto xs= xid_cache_search(thd, thd->lex->xid))
+ /*
+ A parallel slave may not succeed in acquiring the xid, in which case
+ @c is_async_xac is @c true and the acquisition is done later.
+ */
+ XID_cache_element *xs;
+ if ((xs= xid_cache_search(thd, thd->lex->xid)) ||
+ (thd->rgi_slave && thd->rgi_slave->is_async_xac))
{
bool xid_deleted= false;
MDL_request mdl_request;
- bool rw_trans= (xs->rm_error != ER_XA_RBROLLBACK);
+ bool rw_trans= (xs && xs->rm_error != ER_XA_RBROLLBACK);
if (rw_trans && thd->is_read_only_ctx())
{
@@ -615,8 +879,7 @@ bool trans_xa_commit(THD *thd)
res= 1;
goto _end_external_xid;
}
-
- res= xa_trans_rolled_back(xs);
+ res= xs ? xa_trans_rolled_back(xs) : 0;
/*
Acquire metadata lock which will ensure that COMMIT is blocked
by active FLUSH TABLES WITH READ LOCK (and vice versa COMMIT in
@@ -645,7 +908,7 @@ bool trans_xa_commit(THD *thd)
}
DBUG_ASSERT(!xid_state.xid_cache_element);
- xid_state.xid_cache_element= xs;
+ xid_state.xid_cache_element= xs; // may be NULL on parallel slave
ha_commit_or_rollback_by_xid(thd->lex->xid, !res);
if (!res && thd->is_error())
{
@@ -654,13 +917,16 @@ bool trans_xa_commit(THD *thd)
res= true;
goto _end_external_xid;
}
- xid_cache_delete(thd, xs);
+ DBUG_ASSERT(xs || (thd->rgi_slave && thd->rgi_slave->is_async_xac &&
+ xid_state.xid_cache_element));
+
+ xid_cache_delete(thd, xid_state.xid_cache_element);
xid_deleted= true;
_end_external_xid:
xid_state.xid_cache_element= 0;
res= res || thd->is_error();
- if (!xid_deleted)
+ if (!xid_deleted && xs)
xs->acquired_to_recovered();
if (mdl_request.ticket)
{
@@ -790,12 +1056,14 @@ bool trans_xa_rollback(THD *thd)
DBUG_RETURN(TRUE);
}
- if (auto xs= xid_cache_search(thd, thd->lex->xid))
+ XID_cache_element *xs;
+ if ((xs= xid_cache_search(thd, thd->lex->xid)) ||
+ (thd->rgi_slave && thd->rgi_slave->is_async_xac))
{
bool res;
bool xid_deleted= false;
MDL_request mdl_request;
- bool rw_trans= (xs->rm_error != ER_XA_RBROLLBACK);
+ bool rw_trans= (xs && xs->rm_error != ER_XA_RBROLLBACK);
if (rw_trans && thd->is_read_only_ctx())
{
@@ -822,7 +1090,7 @@ bool trans_xa_rollback(THD *thd)
{
thd->backup_commit_lock= &mdl_request;
}
- res= xa_trans_rolled_back(xs);
+ res= xs ? xa_trans_rolled_back(xs) : 0;
DBUG_ASSERT(!xid_state.xid_cache_element);
xid_state.xid_cache_element= xs;
@@ -831,12 +1099,15 @@ bool trans_xa_rollback(THD *thd)
{
goto _end_external_xid;
}
- xid_cache_delete(thd, xs);
+ DBUG_ASSERT(xs || (thd->rgi_slave && thd->rgi_slave->is_async_xac &&
+ xid_state.xid_cache_element));
+
+ xid_cache_delete(thd, xid_state.xid_cache_element);
xid_deleted= true;
_end_external_xid:
xid_state.xid_cache_element= 0;
- if (!xid_deleted)
+ if (!xid_deleted && xs)
xs->acquired_to_recovered();
if (mdl_request.ticket)
{
@@ -1146,7 +1417,7 @@ static bool slave_applier_reset_xa_trans(THD *thd)
{
thd->transaction->xid_state.set_error(ER_XA_RBROLLBACK);
}
- thd->transaction->xid_state.xid_cache_element->acquired_to_recovered();
+ auto element= thd->transaction->xid_state.xid_cache_element;
thd->transaction->xid_state.xid_cache_element= 0;
for (Ha_trx_info *ha_info= thd->transaction->all.ha_list, *ha_info_next;
@@ -1158,6 +1429,34 @@ static bool slave_applier_reset_xa_trans(THD *thd)
thd->transaction->all.ha_list= 0;
ha_close_connection(thd);
+ element->acquired_to_recovered();
+ if (thd->rgi_slave && thd->rgi_slave->is_parallel_exec)
+ {
+ /* make the xid available to a possible (xa-"complete") waiter */
+ wait_for_commit *xac_waiter= NULL,
+ *notifier= &thd->rgi_slave->commit_orderer;
+ while (unlikely(!element->
+ c_waiter.compare_exchange_weak(xac_waiter, notifier,
+ std::memory_order_acq_rel)))
+ {
+ if (xac_waiter)
+ {
+ break;
+ }
+ else
+ {
+ (void) LF_BACKOFF();
+ }
+ }
+ if (xac_waiter)
+ {
+ // unmark and signal
+ mysql_mutex_lock(&xac_waiter->LOCK_wait_commit);
+ element->c_waiter.store(NULL, std::memory_order_relaxed);
+ mysql_cond_signal(&xac_waiter->COND_wait_commit_dep);
+ mysql_mutex_unlock(&xac_waiter->LOCK_wait_commit);
+ }
+ }
thd->transaction->cleanup();
thd->transaction->all.reset();
diff --git a/storage/innobase/trx/trx0undo.cc b/storage/innobase/trx/trx0undo.cc
index daf27822085..53ce685fa35 100644
--- a/storage/innobase/trx/trx0undo.cc
+++ b/storage/innobase/trx/trx0undo.cc
@@ -639,8 +639,7 @@ static void trx_undo_write_xid(buf_block_t *block, uint16_t offset,
static_cast<uint32_t>(xid.bqual_length));
const ulint xid_length= static_cast<ulint>(xid.gtrid_length
+ xid.bqual_length);
- mtr->memcpy(*block, &block->page.frame[offset + TRX_UNDO_XA_XID],
- xid.data, xid_length);
+ mtr->memcpy<mtr_t::MAYBE_NOP>(*block, &block->page.frame[offset + TRX_UNDO_XA_XID], xid.data, xid_length);
if (UNIV_LIKELY(xid_length < XIDDATASIZE))
mtr->memset(block, offset + TRX_UNDO_XA_XID + xid_length,
XIDDATASIZE - xid_length, 0);
--
2.30.2
[PATCH 0/4] MDEV-31273, Pre-compute binlog checksums outside of LOCK_log
by Kristian Nielsen 18 Oct '23
Hi Monty,
Here's the implementation of MDEV-31273, pre-compute binlog checksums.
The main patch to review is the last one, number 4. This is the actual
implementation of binlog checksum pre-computation, and the only patch
that changes the behaviour of the code.
Most of the work (and most of the changes) are cleanups of the old checksum
code that don't change the functionality but remove a lot of complex and
hard-to-modify logic (and I think actually fix a bug or two). I have kept
this cleanup separate in the first 3 patches to make it easier to review and
not get mixed up with the actual implementation of the new functionality.
With this patch series, calculation of binlog checksum will happen when
writing events into the stmt/trx caches. Later, when writing the binlog file
under LOCK_log, only a direct copy of the bytes is done, which should
improve binlog scalability with checksums enabled.
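As a minimal standalone sketch of the idea (using zlib's crc32(); the names
and layout are illustrative assumptions, not the patch's actual interfaces):
the checksum is accumulated while the event bytes go into the cache, so the
write under LOCK_log is reduced to a plain byte copy.

// Sketch only: precompute the event CRC outside the LOCK_log critical section.
#include <zlib.h>      // crc32(); link with -lz
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

struct CachedEvent
{
  std::vector<unsigned char> bytes;   // event body as buffered in the cache
  uint32_t crc= 0;                    // checksum precomputed outside LOCK_log
};

// Runs while filling the stmt/trx cache, i.e. outside LOCK_log.
static void cache_write(CachedEvent &ev, const unsigned char *buf, size_t len)
{
  ev.crc= (uint32_t) crc32(ev.crc, buf, (uInt) len);
  ev.bytes.insert(ev.bytes.end(), buf, buf + len);
}

// Runs under LOCK_log: only copies bytes plus the already-computed CRC.
static void binlog_write(std::vector<unsigned char> &file, const CachedEvent &ev)
{
  file.insert(file.end(), ev.bytes.begin(), ev.bytes.end());
  for (int i= 0; i < 4; i++)
    file.push_back((unsigned char) (ev.crc >> (8 * i)));  // CRC, little-endian
}

int main()
{
  CachedEvent ev;
  const unsigned char payload[]= "some event bytes";
  cache_write(ev, payload, sizeof(payload) - 1);
  std::vector<unsigned char> binlog_file;
  binlog_write(binlog_file, ev);
  printf("event=%zu bytes, crc=%08x\n", ev.bytes.size(), (unsigned) ev.crc);
}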
The patch series is also available on github:
https://github.com/MariaDB/server/commits/knielsen_mdev31273
- Kristian.
Kristian Nielsen (4):
MDEV-31273: Replace Log_event::writer with function parameter
MDEV-31273: Eliminate Log_event::checksum_alg
MDEV-31273: Refactor MYSQL_BIN_LOG::write_cache()
MDEV-31273: Precompute binlog checksums
include/my_atomic.h | 41 +-
include/my_sys.h | 2 +
.../main/mysqlbinlog_row_compressed.result | 48 +-
.../main/mysqlbinlog_row_minimal.result | 48 +-
.../main/mysqlbinlog_stmt_compressed.result | 16 +-
mysql-test/main/mysqld--help.result | 7 +
.../suite/binlog/include/binlog_ioerr.inc | 3 +
mysql-test/suite/binlog/r/binlog_ioerr.result | 2 +
.../r/binlog_mysqlbinlog_raw_flush.result | 1 +
mysql-test/suite/binlog/t/binlog_killed.test | 2 +-
.../t/binlog_mysqlbinlog_raw_flush.test | 2 +
.../t/binlog_table_map_optional_metadata.test | 4 +-
.../binlog_encryption/binlog_ioerr.result | 2 +
.../suite/rpl/r/rpl_checksum_cache.result | 43 +-
.../suite/rpl/t/rpl_checksum_cache.test | 98 +++-
.../r/sysvars_server_notembedded.result | 10 +
mysys/mf_iocache2.c | 34 ++
sql/log.cc | 468 ++++++++++--------
sql/log.h | 14 +-
sql/log_event.cc | 27 +-
sql/log_event.h | 158 +++---
sql/log_event_client.cc | 22 +-
sql/log_event_old.cc | 14 +-
sql/log_event_old.h | 4 +-
sql/log_event_server.cc | 395 ++++++---------
sql/mysqld.cc | 1 +
sql/mysqld.h | 1 +
sql/privilege.h | 3 +
sql/slave.cc | 60 +--
sql/sql_repl.cc | 2 +-
sql/sys_vars.cc | 13 +
sql/wsrep_binlog.cc | 6 +-
sql/wsrep_mysqld.cc | 12 +-
33 files changed, 922 insertions(+), 641 deletions(-)
--
2.30.2
This mode can be used to disable warnings for functions whose use is
strongly discouraged because of inherent limitations in the functionality
they provide, but which can be needed for backwards compatibility with
legacy applications and which cannot easily be replaced by the user.
It is currently used for the weak encryption functions ENCODE()/DECODE(), to
help users who need these functions to access existing data encoded with this
non-standard algorithm.
Signed-off-by: Kristian Nielsen <knielsen(a)knielsen-hq.org>
---
mysql-test/main/func_str.result | 4 ++++
mysql-test/main/func_str.test | 2 ++
mysql-test/main/mysqld--help.result | 2 +-
mysql-test/suite/sys_vars/r/old_mode_basic.result | 4 ++--
.../sys_vars/r/sysvars_server_notembedded.result | 2 +-
mysql-test/suite/sys_vars/t/old_mode_basic.test | 2 +-
sql/item_strfunc.cc | 11 ++++++++++-
sql/sql_class.h | 1 +
sql/sys_vars.cc | 3 ++-
9 files changed, 24 insertions(+), 7 deletions(-)
diff --git a/mysql-test/main/func_str.result b/mysql-test/main/func_str.result
index 8c09d19cc44..725ce2b7c46 100644
--- a/mysql-test/main/func_str.result
+++ b/mysql-test/main/func_str.result
@@ -323,6 +323,10 @@ decode(encode("abcdef","monty"),"monty")="abcdef"
Warnings:
Warning 1287 'encode' is deprecated and will be removed in a future release
Warning 1287 'decode' is deprecated and will be removed in a future release
+SET STATEMENT old_mode= 'COMPAT_DISCOURAGED'
+ FOR select decode(encode("abcdef","monty"),"monty")="abcdef";
+decode(encode("abcdef","monty"),"monty")="abcdef"
+1
select quote('\'\"\\test');
quote('\'\"\\test')
'\'"\\test'
diff --git a/mysql-test/main/func_str.test b/mysql-test/main/func_str.test
index feb3057ec41..e4b572a7f3f 100644
--- a/mysql-test/main/func_str.test
+++ b/mysql-test/main/func_str.test
@@ -142,6 +142,8 @@ select least(1,2,3) | greatest(16,32,8), least(5,4)*1,greatest(-1.0,1.0)*1,least
select decode(encode(repeat("a",100000),"monty"),"monty")=repeat("a",100000);
--enable_view_protocol
select decode(encode("abcdef","monty"),"monty")="abcdef";
+SET STATEMENT old_mode= 'COMPAT_DISCOURAGED'
+ FOR select decode(encode("abcdef","monty"),"monty")="abcdef";
select quote('\'\"\\test');
select quote(concat('abc\'', '\\cba'));
diff --git a/mysql-test/main/mysqld--help.result b/mysql-test/main/mysqld--help.result
index 6dc8543f225..0d5d2daade5 100644
--- a/mysql-test/main/mysqld--help.result
+++ b/mysql-test/main/mysqld--help.result
@@ -706,7 +706,7 @@ The following specify which files/extra groups are read (specified before remain
NO_DUP_KEY_WARNINGS_WITH_IGNORE, NO_PROGRESS_INFO,
ZERO_DATE_TIME_CAST, UTF8_IS_UTF8MB3,
IGNORE_INDEX_ONLY_FOR_JOIN, COMPAT_5_1_CHECKSUM,
- LOCK_ALTER_TABLE_COPY
+ LOCK_ALTER_TABLE_COPY, COMPAT_DISCOURAGED
--old-passwords Use old password encryption method (needed for 4.0 and
older clients)
--old-style-user-limits
diff --git a/mysql-test/suite/sys_vars/r/old_mode_basic.result b/mysql-test/suite/sys_vars/r/old_mode_basic.result
index 7701bc6bbfa..3032a82581b 100644
--- a/mysql-test/suite/sys_vars/r/old_mode_basic.result
+++ b/mysql-test/suite/sys_vars/r/old_mode_basic.result
@@ -132,8 +132,8 @@ Warning 1287 'ZERO_DATE_TIME_CAST' is deprecated and will be removed in a future
SELECT @@global.old_mode;
@@global.old_mode
ZERO_DATE_TIME_CAST
-SET @@global.old_mode = 128;
-ERROR 42000: Variable 'old_mode' can't be set to the value of '128'
+SET @@global.old_mode = 256;
+ERROR 42000: Variable 'old_mode' can't be set to the value of '256'
SELECT @@global.old_mode;
@@global.old_mode
ZERO_DATE_TIME_CAST
diff --git a/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result b/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result
index 0f4c54bb217..a48dd2dafe1 100644
--- a/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result
+++ b/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result
@@ -2459,7 +2459,7 @@ VARIABLE_COMMENT Used to emulate old behavior from earlier MariaDB or MySQL vers
NUMERIC_MIN_VALUE NULL
NUMERIC_MAX_VALUE NULL
NUMERIC_BLOCK_SIZE NULL
-ENUM_VALUE_LIST NO_DUP_KEY_WARNINGS_WITH_IGNORE,NO_PROGRESS_INFO,ZERO_DATE_TIME_CAST,UTF8_IS_UTF8MB3,IGNORE_INDEX_ONLY_FOR_JOIN,COMPAT_5_1_CHECKSUM,LOCK_ALTER_TABLE_COPY
+ENUM_VALUE_LIST NO_DUP_KEY_WARNINGS_WITH_IGNORE,NO_PROGRESS_INFO,ZERO_DATE_TIME_CAST,UTF8_IS_UTF8MB3,IGNORE_INDEX_ONLY_FOR_JOIN,COMPAT_5_1_CHECKSUM,LOCK_ALTER_TABLE_COPY,COMPAT_DISCOURAGED
READ_ONLY NO
COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME OLD_PASSWORDS
diff --git a/mysql-test/suite/sys_vars/t/old_mode_basic.test b/mysql-test/suite/sys_vars/t/old_mode_basic.test
index cb18796729e..c3fc4c57189 100644
--- a/mysql-test/suite/sys_vars/t/old_mode_basic.test
+++ b/mysql-test/suite/sys_vars/t/old_mode_basic.test
@@ -172,7 +172,7 @@ SET @@global.old_mode = 4;
SELECT @@global.old_mode;
--Error ER_WRONG_VALUE_FOR_VAR
-SET @@global.old_mode = 128;
+SET @@global.old_mode = 256;
SELECT @@global.old_mode;
# use of decimal values
diff --git a/sql/item_strfunc.cc b/sql/item_strfunc.cc
index 4b96271c331..339430a2794 100644
--- a/sql/item_strfunc.cc
+++ b/sql/item_strfunc.cc
@@ -2720,7 +2720,16 @@ bool Item_func_encode::seed()
hash_password(rand_nr, key->ptr(), key->length());
sql_crypt.init(rand_nr);
- warn_deprecated<1103>(current_thd, func_name_cstring().str);
+ if (!(current_thd->variables.old_behavior & OLD_MODE_COMPAT_DISCOURAGED))
+ {
+ /*
+ This function should not be removed despite the deprecation warning.
+ That would cause problems for users who can then no longer access
+ columns in their database that might have been inserted as
+ ENCODE(str,pass_str).
+ */
+ warn_deprecated<1103>(current_thd, func_name_cstring().str);
+ }
return FALSE;
}
diff --git a/sql/sql_class.h b/sql/sql_class.h
index 28c2831cfee..ff730ec2297 100644
--- a/sql/sql_class.h
+++ b/sql/sql_class.h
@@ -203,6 +203,7 @@ enum enum_binlog_row_image {
#define OLD_MODE_IGNORE_INDEX_ONLY_FOR_JOIN (1 << 4)
#define OLD_MODE_COMPAT_5_1_CHECKSUM (1 << 5)
#define OLD_MODE_LOCK_ALTER_TABLE_COPY (1 << 6)
+#define OLD_MODE_COMPAT_DISCOURAGED (1 << 7)
#define OLD_MODE_DEFAULT_VALUE OLD_MODE_UTF8_IS_UTF8MB3
diff --git a/sql/sys_vars.cc b/sql/sys_vars.cc
index e25d293bbb7..18d08996aa8 100644
--- a/sql/sys_vars.cc
+++ b/sql/sys_vars.cc
@@ -3863,12 +3863,13 @@ static const char *old_mode_names[]=
"IGNORE_INDEX_ONLY_FOR_JOIN", // deprecated since 11.3
"COMPAT_5_1_CHECKSUM", // deprecated since 11.3
"LOCK_ALTER_TABLE_COPY", // deprecated since 11.3
+ "COMPAT_DISCOURAGED",
0
};
void old_mode_deprecated_warnings(THD *thd, ulonglong v)
{
- v &= ~OLD_MODE_DEFAULT_VALUE;
+ v &= ~(OLD_MODE_DEFAULT_VALUE | OLD_MODE_COMPAT_DISCOURAGED);
for (uint i=0; old_mode_names[i]; i++)
if ((1ULL<<i) & v)
{
--
2.30.2
[PATCH] MDEV-10356: rpl.rpl_parallel_temptable failure due to incorrect commit optimization of temptables
by Kristian Nielsen 07 Sep '23
The problem was that parallel replication of temporary tables using
statement-based binlogging could overlap the COMMIT in one thread with a DML
or DROP TEMPORARY TABLE in another thread using the same temporary table.
Temporary tables are not safe for concurrent access, so this caused
references to freed memory and possibly other nastiness.
The fix is to disable the optimisation of overlapping the commit of one
transaction with the start of a later transaction when temporary tables are
in use. Then the following event groups will be blocked from starting until
the one using temporary tables is completed.
This also fixes occasional test failures of rpl.rpl_parallel_temptable seen
in Buildbot.
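A toy standalone model of the mechanism added below (the member names mirror
the patch, but this is not server code and it omits all locking): waking up
the queued-up commits is a no-op while the flag is set, and only happens once
the event group is fully finished.

// Sketch only: semantics of the wakeup_blocked flag, no real synchronisation.
#include <iostream>
#include <string>
#include <vector>

struct WaitForCommit
{
  bool wakeup_blocked= false;
  std::vector<std::string> subsequent_commits;

  void wakeup_subsequent_commits()
  {
    if (wakeup_blocked)
      return;                          // temp tables in use: defer the wakeup
    for (const auto &w : subsequent_commits)
      std::cout << "wake up " << w << "\n";
    subsequent_commits.clear();
  }

  void finish_event_group()
  {
    wakeup_blocked= false;             // event group fully done
    wakeup_subsequent_commits();       // now the deferred wakeup happens
  }
};

int main()
{
  WaitForCommit t1;
  t1.subsequent_commits= {"T2", "T3"};
  t1.wakeup_blocked= true;             // T1's event group used a temporary table
  t1.wakeup_subsequent_commits();      // no-op: T2 and T3 stay blocked
  t1.finish_event_group();             // prints "wake up T2", "wake up T3"
}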
Signed-off-by: Kristian Nielsen <knielsen(a)knielsen-hq.org>
---
sql/rpl_parallel.cc | 18 +++++++++++++++++-
sql/sql_class.cc | 4 ++++
sql/sql_class.h | 13 +++++++++++++
3 files changed, 34 insertions(+), 1 deletion(-)
diff --git a/sql/rpl_parallel.cc b/sql/rpl_parallel.cc
index 3bd27c73932..fc4434b75de 100644
--- a/sql/rpl_parallel.cc
+++ b/sql/rpl_parallel.cc
@@ -218,6 +218,7 @@ finish_event_group(rpl_parallel_thread *rpt, uint64 sub_id,
waiting for this). In most cases (normal DML), it will be a no-op.
*/
rgi->mark_start_commit_no_lock();
+ rgi->commit_orderer.wakeup_blocked= false;
if (entry->last_committed_sub_id < sub_id)
{
@@ -1425,7 +1426,22 @@ handle_rpl_parallel_thread(void *arg)
if (!thd->killed)
{
DEBUG_SYNC(thd, "rpl_parallel_before_mark_start_commit");
- rgi->mark_start_commit();
+ if (thd->lex->stmt_accessed_temp_table())
+ {
+ /*
+ Temporary tables are special, they require strict
+ single-threaded use as they have no locks protecting concurrent
+ access. Therefore, we cannot safely use the optimization of
+ overlapping the commit of this transaction with the start of the
+ following.
+ So we skip the early mark_start_commit() and also block any
+ wakeup_subsequent_commits() until this event group is fully
+ done, inside finish_event_group().
+ */
+ rgi->commit_orderer.wakeup_blocked= true;
+ }
+ else
+ rgi->mark_start_commit();
DEBUG_SYNC(thd, "rpl_parallel_after_mark_start_commit");
}
}
diff --git a/sql/sql_class.cc b/sql/sql_class.cc
index e7e27401d61..17feb006e21 100644
--- a/sql/sql_class.cc
+++ b/sql/sql_class.cc
@@ -7536,6 +7536,7 @@ wait_for_commit::reinit()
wakeup_error= 0;
wakeup_subsequent_commits_running= false;
commit_started= false;
+ wakeup_blocked= false;
#ifdef SAFE_MUTEX
/*
When using SAFE_MUTEX, the ordering between taking the LOCK_wait_commit
@@ -7808,6 +7809,9 @@ wait_for_commit::wakeup_subsequent_commits2(int wakeup_error)
{
wait_for_commit *waiter;
+ if (unlikely(wakeup_blocked))
+ return;
+
mysql_mutex_lock(&LOCK_wait_commit);
wakeup_subsequent_commits_running= true;
waiter= subsequent_commits_list;
diff --git a/sql/sql_class.h b/sql/sql_class.h
index 4487a67c76d..4c172ba8e2a 100644
--- a/sql/sql_class.h
+++ b/sql/sql_class.h
@@ -2142,6 +2142,19 @@ struct wait_for_commit
group commit as T1.
*/
bool commit_started;
+ /*
+ Set to temporarily ignore calls to wakeup_subsequent_commits(). The
+ caller must arrange that another wakeup_subsequent_commits() gets called
+ later after wakeup_blocked has been set back to false.
+
+ This is used for parallel replication with temporary tables.
+ Temporary tables require strict single-threaded operation. The normal
+ optimization, of doing wakeup_subsequent_commits early and overlapping
+ part of the commit with the following transaction, is not safe. Thus
+ when temporary tables are replicated, wakeup is blocked until the
+ event group is fully done.
+ */
+ bool wakeup_blocked;
void register_wait_for_prior_commit(wait_for_commit *waitee);
int wait_for_prior_commit(THD *thd, bool allow_kill=true)
--
2.30.2
[Commits] cb06612a9da: MDEV-31655: Parallel replication deadlock victim preference code erroneously removed
by Kristian Nielsen 10 Jul '23
revision-id: cb06612a9da09a7981ada84768793f2ff3ef842c (mariadb-10.4.30-3-gcb06612a9da)
parent(s): dbe5c20b755b87f67d87990c3baabc866667e41b
author: Kristian Nielsen
committer: Kristian Nielsen
timestamp: 2023-07-11 00:31:29 +0200
message:
MDEV-31655: Parallel replication deadlock victim preference code erroneously removed
Restore the code that makes InnoDB choose the second transaction as the
deadlock victim when two transactions that need to commit in order for
parallel replication deadlock with each other. This code was erroneously
removed when VATS was implemented in InnoDB.
Also add a test case for InnoDB choosing the right deadlock victim.
Signed-off-by: Kristian Nielsen <knielsen(a)knielsen-hq.org>
---
.../suite/binlog_encryption/rpl_parallel.result | 42 ++++++++++++-
mysql-test/suite/rpl/r/rpl_parallel.result | 42 ++++++++++++-
mysql-test/suite/rpl/t/rpl_parallel.test | 71 +++++++++++++++++++++-
sql/rpl_parallel.cc | 7 ++-
sql/sql_class.cc | 43 +++++++++++++
storage/innobase/lock/lock0lock.cc | 12 ++++
storage/innobase/trx/trx0trx.cc | 12 ++++
7 files changed, 225 insertions(+), 4 deletions(-)
diff --git a/mysql-test/suite/binlog_encryption/rpl_parallel.result b/mysql-test/suite/binlog_encryption/rpl_parallel.result
index b75a66a634a..b24ff7ba53d 100644
--- a/mysql-test/suite/binlog_encryption/rpl_parallel.result
+++ b/mysql-test/suite/binlog_encryption/rpl_parallel.result
@@ -2,6 +2,7 @@ include/master-slave.inc
[connection master]
connection server_2;
SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
+SET @old_parallel_mode=@@GLOBAL.slave_parallel_mode;
SET GLOBAL slave_parallel_threads=10;
ERROR HY000: This operation cannot be performed as you have a running slave ''; run STOP SLAVE '' first
include/stop_slave.inc
@@ -1680,13 +1681,52 @@ a
2000
SELECT * FROM t2 WHERE a>=2000 ORDER BY a;
a
+MDEV-31655: Parallel replication deadlock victim preference code erroneously removed
+connection server_1;
+CREATE TABLE t7 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
+BEGIN;
+COMMIT;
+include/save_master_gtid.inc
+connection server_2;
+include/sync_with_master_gtid.inc
+include/stop_slave.inc
+set @@global.slave_parallel_threads= 5;
+set @@global.slave_parallel_mode= conservative;
+SET @old_dbug= @@GLOBAL.debug_dbug;
+SET GLOBAL debug_dbug= "+d,rpl_mdev31655_zero_retries";
+connection master;
+SET @old_dbug= @@SESSION.debug_dbug;
+SET SESSION debug_dbug="+d,binlog_force_commit_id";
+SET @commit_id= 1+1000;
+SET @commit_id= 2+1000;
+SET @commit_id= 3+1000;
+SET @commit_id= 4+1000;
+SET @commit_id= 5+1000;
+SET @commit_id= 6+1000;
+SET @commit_id= 7+1000;
+SET @commit_id= 8+1000;
+SET @commit_id= 9+1000;
+SET @commit_id= 10+1000;
+SET SESSION debug_dbug= @old_dbug;
+SELECT COUNT(*), SUM(a*100*b) FROM t7;
+COUNT(*) SUM(a*100*b)
+10 225000
+include/save_master_gtid.inc
+connection server_2;
+include/start_slave.inc
+include/sync_with_master_gtid.inc
+SET GLOBAL debug_dbug= @old_dbug;
+SELECT COUNT(*), SUM(a*100*b) FROM t7;
+COUNT(*) SUM(a*100*b)
+10 225000
connection server_2;
include/stop_slave.inc
SET GLOBAL slave_parallel_threads=@old_parallel_threads;
+SET GLOBAL slave_parallel_mode=@old_parallel_mode;
include/start_slave.inc
SET DEBUG_SYNC= 'RESET';
connection server_1;
DROP function foo;
-DROP TABLE t1,t2,t3,t4,t5,t6;
+DROP TABLE t1,t2,t3,t4,t5,t6,t7;
SET DEBUG_SYNC= 'RESET';
include/rpl_end.inc
diff --git a/mysql-test/suite/rpl/r/rpl_parallel.result b/mysql-test/suite/rpl/r/rpl_parallel.result
index 9b2e68d366e..ef89d954faa 100644
--- a/mysql-test/suite/rpl/r/rpl_parallel.result
+++ b/mysql-test/suite/rpl/r/rpl_parallel.result
@@ -2,6 +2,7 @@ include/master-slave.inc
[connection master]
connection server_2;
SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
+SET @old_parallel_mode=@@GLOBAL.slave_parallel_mode;
SET GLOBAL slave_parallel_threads=10;
ERROR HY000: This operation cannot be performed as you have a running slave ''; run STOP SLAVE '' first
include/stop_slave.inc
@@ -1679,13 +1680,52 @@ a
2000
SELECT * FROM t2 WHERE a>=2000 ORDER BY a;
a
+MDEV-31655: Parallel replication deadlock victim preference code erroneously removed
+connection server_1;
+CREATE TABLE t7 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
+BEGIN;
+COMMIT;
+include/save_master_gtid.inc
+connection server_2;
+include/sync_with_master_gtid.inc
+include/stop_slave.inc
+set @@global.slave_parallel_threads= 5;
+set @@global.slave_parallel_mode= conservative;
+SET @old_dbug= @@GLOBAL.debug_dbug;
+SET GLOBAL debug_dbug= "+d,rpl_mdev31655_zero_retries";
+connection master;
+SET @old_dbug= @@SESSION.debug_dbug;
+SET SESSION debug_dbug="+d,binlog_force_commit_id";
+SET @commit_id= 1+1000;
+SET @commit_id= 2+1000;
+SET @commit_id= 3+1000;
+SET @commit_id= 4+1000;
+SET @commit_id= 5+1000;
+SET @commit_id= 6+1000;
+SET @commit_id= 7+1000;
+SET @commit_id= 8+1000;
+SET @commit_id= 9+1000;
+SET @commit_id= 10+1000;
+SET SESSION debug_dbug= @old_dbug;
+SELECT COUNT(*), SUM(a*100*b) FROM t7;
+COUNT(*) SUM(a*100*b)
+10 225000
+include/save_master_gtid.inc
+connection server_2;
+include/start_slave.inc
+include/sync_with_master_gtid.inc
+SET GLOBAL debug_dbug= @old_dbug;
+SELECT COUNT(*), SUM(a*100*b) FROM t7;
+COUNT(*) SUM(a*100*b)
+10 225000
connection server_2;
include/stop_slave.inc
SET GLOBAL slave_parallel_threads=@old_parallel_threads;
+SET GLOBAL slave_parallel_mode=@old_parallel_mode;
include/start_slave.inc
SET DEBUG_SYNC= 'RESET';
connection server_1;
DROP function foo;
-DROP TABLE t1,t2,t3,t4,t5,t6;
+DROP TABLE t1,t2,t3,t4,t5,t6,t7;
SET DEBUG_SYNC= 'RESET';
include/rpl_end.inc
diff --git a/mysql-test/suite/rpl/t/rpl_parallel.test b/mysql-test/suite/rpl/t/rpl_parallel.test
index 9ba7a30f2eb..d43cec4df34 100644
--- a/mysql-test/suite/rpl/t/rpl_parallel.test
+++ b/mysql-test/suite/rpl/t/rpl_parallel.test
@@ -13,6 +13,7 @@
--connection server_2
SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
+SET @old_parallel_mode=@@GLOBAL.slave_parallel_mode;
--error ER_SLAVE_MUST_STOP
SET GLOBAL slave_parallel_threads=10;
--source include/stop_slave.inc
@@ -2203,16 +2204,84 @@ SELECT * FROM t1 WHERE a>=2000 ORDER BY a;
SELECT * FROM t2 WHERE a>=2000 ORDER BY a;
+--echo MDEV-31655: Parallel replication deadlock victim preference code erroneously removed
+# The problem was that InnoDB would choose the wrong deadlock victim.
+# Create a lot of transactions that can cause deadlocks, and use error
+# injection to check that the first transactions in each group is never
+# selected as deadlock victim.
+--let $rows= 10
+--let $transactions= 5
+--let $gcos= 10
+
+--connection server_1
+CREATE TABLE t7 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
+BEGIN;
+--disable_query_log
+--let $i= 0
+while ($i < 10) {
+ eval INSERT INTO t7 VALUES ($i, 0);
+ inc $i;
+}
+--enable_query_log
+COMMIT;
+--source include/save_master_gtid.inc
+
+--connection server_2
+--source include/sync_with_master_gtid.inc
+--source include/stop_slave.inc
+eval set @@global.slave_parallel_threads= $transactions;
+set @@global.slave_parallel_mode= conservative;
+SET @old_dbug= @@GLOBAL.debug_dbug;
+# This error injection will allow no retries for GTIDs divisible by 1000.
+SET GLOBAL debug_dbug= "+d,rpl_mdev31655_zero_retries";
+
+--connection master
+SET @old_dbug= @@SESSION.debug_dbug;
+SET SESSION debug_dbug="+d,binlog_force_commit_id";
+
+--let $j= 1
+while ($j <= $gcos) {
+ eval SET @commit_id= $j+1000;
+ --let $i= 0
+ while ($i < $transactions) {
+ --disable_query_log
+ eval SET SESSION gtid_seq_no= 1000 + 1000*$j + $i;
+ BEGIN;
+ --let $k= 0
+ while ($k < $rows) {
+ eval UPDATE t7 SET b=b+1 WHERE a=(($i+$k) MOD $rows);
+ inc $k;
+ }
+ COMMIT;
+ --enable_query_log
+ inc $i;
+ }
+ inc $j;
+}
+
+SET SESSION debug_dbug= @old_dbug;
+SELECT COUNT(*), SUM(a*100*b) FROM t7;
+
+--source include/save_master_gtid.inc
+
+--connection server_2
+--source include/start_slave.inc
+--source include/sync_with_master_gtid.inc
+SET GLOBAL debug_dbug= @old_dbug;
+SELECT COUNT(*), SUM(a*100*b) FROM t7;
+
+
# Clean up.
--connection server_2
--source include/stop_slave.inc
SET GLOBAL slave_parallel_threads=@old_parallel_threads;
+SET GLOBAL slave_parallel_mode=@old_parallel_mode;
--source include/start_slave.inc
SET DEBUG_SYNC= 'RESET';
--connection server_1
DROP function foo;
-DROP TABLE t1,t2,t3,t4,t5,t6;
+DROP TABLE t1,t2,t3,t4,t5,t6,t7;
SET DEBUG_SYNC= 'RESET';
--source include/rpl_end.inc
diff --git a/sql/rpl_parallel.cc b/sql/rpl_parallel.cc
index b550315d69f..1aeb1257c4a 100644
--- a/sql/rpl_parallel.cc
+++ b/sql/rpl_parallel.cc
@@ -1385,8 +1385,13 @@ handle_rpl_parallel_thread(void *arg)
err= dbug_simulate_tmp_error(rgi, thd););
if (unlikely(err))
{
+ ulong max_retries= slave_trans_retries;
convert_kill_to_deadlock_error(rgi);
- if (has_temporary_error(thd) && slave_trans_retries > 0)
+ DBUG_EXECUTE_IF("rpl_mdev31655_zero_retries",
+ if ((rgi->current_gtid.seq_no % 1000) == 0)
+ max_retries= 0;
+ );
+ if (has_temporary_error(thd) && max_retries > 0)
err= retry_event_group(rgi, rpt, qev);
}
}
diff --git a/sql/sql_class.cc b/sql/sql_class.cc
index 8ed3f8a9c5e..e6ed7ca1cc4 100644
--- a/sql/sql_class.cc
+++ b/sql/sql_class.cc
@@ -5247,6 +5247,49 @@ thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd)
return 0;
}
+
+/*
+ If the storage engine detects a deadlock, and needs to choose a victim
+ transaction to roll back, it can call this function to ask the upper
+ server layer for which of two possible transactions is preferred to be
+ aborted and rolled back.
+
+ In parallel replication, if two transactions are running in parallel and
+ one is fixed to commit before the other, then the one that commits later
+ will be preferred as the victim - choosing the early transaction as a victim
+ will not resolve the deadlock anyway, as the later transaction still needs
+ to wait for the earlier to commit.
+
+ The return value is -1 if the first transaction is preferred as a deadlock
+ victim, 1 if the second transaction is preferred, or 0 for no preference (in
+ which case the storage engine can make the choice as it prefers).
+*/
+extern "C" int
+thd_deadlock_victim_preference(const MYSQL_THD thd1, const MYSQL_THD thd2)
+{
+ rpl_group_info *rgi1, *rgi2;
+
+ if (!thd1 || !thd2)
+ return 0;
+
+ /*
+ If the transactions are participating in the same replication domain in
+ parallel replication, then request to select the one that will commit
+ later (in the fixed commit order from the master) as the deadlock victim.
+ */
+ rgi1= thd1->rgi_slave;
+ rgi2= thd2->rgi_slave;
+ if (rgi1 && rgi2 &&
+ rgi1->is_parallel_exec &&
+ rgi1->rli == rgi2->rli &&
+ rgi1->current_gtid.domain_id == rgi2->current_gtid.domain_id)
+ return rgi1->gtid_sub_id < rgi2->gtid_sub_id ? 1 : -1;
+
+ /* No preferences, let the storage engine decide. */
+ return 0;
+}
+
+
extern "C" int thd_non_transactional_update(const MYSQL_THD thd)
{
return(thd->transaction.all.modified_non_trans_table);
diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc
index b1cf2152cd6..469d03eaa06 100644
--- a/storage/innobase/lock/lock0lock.cc
+++ b/storage/innobase/lock/lock0lock.cc
@@ -49,6 +49,8 @@ Created 5/7/1996 Heikki Tuuri
#include <mysql/service_wsrep.h>
#endif /* WITH_WSREP */
+extern "C" int thd_deadlock_victim_preference(const MYSQL_THD thd1, const MYSQL_THD thd2);
+
/** Lock scheduling algorithm */
ulong innodb_lock_schedule_algorithm;
@@ -1538,6 +1540,16 @@ static bool has_higher_priority(lock_t *lock1, lock_t *lock2)
} else if (!lock_get_wait(lock2)) {
return false;
}
+ // Ask the upper server layer if either of the two trx should be preferred.
+ int preference = thd_deadlock_victim_preference(lock1->trx->mysql_thd,
+ lock2->trx->mysql_thd);
+ if (preference == -1) {
+ // lock1 is preferred as a victim, so lock2 has higher priority
+ return false;
+ } else if (preference == 1) {
+ // lock2 is preferred as a victim, so lock1 has higher priority
+ return true;
+ }
return lock1->trx->start_time_micro <= lock2->trx->start_time_micro;
}
diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc
index 7cd95878b0c..0771d764fb6 100644
--- a/storage/innobase/trx/trx0trx.cc
+++ b/storage/innobase/trx/trx0trx.cc
@@ -52,6 +52,9 @@ Created 3/26/1996 Heikki Tuuri
#include <set>
#include <new>
+extern "C"
+int thd_deadlock_victim_preference(const MYSQL_THD thd1, const MYSQL_THD thd2);
+
/** The bit pattern corresponding to TRX_ID_MAX */
const byte trx_id_max_bytes[8] = {
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
@@ -1906,6 +1909,15 @@ trx_weight_ge(
{
ibool a_notrans_edit;
ibool b_notrans_edit;
+ int pref;
+
+ /* First ask the upper server layer if it has any preference for which
+ transaction to choose as a deadlock victim. */
+ pref= thd_deadlock_victim_preference(a->mysql_thd, b->mysql_thd);
+ if (pref < 0)
+ return FALSE;
+ else if (pref > 0)
+ return TRUE;
/* If mysql_thd is NULL for a transaction we assume that it has
not edited non-transactional tables. */
[Commits] dbe5c20b755: MDEV-31482: Lock wait timeout with INSERT-SELECT, autoinc, and statement-based replication
by Kristian Nielsen 10 Jul '23
revision-id: dbe5c20b755b87f67d87990c3baabc866667e41b (mariadb-10.4.30-2-gdbe5c20b755)
parent(s): a6114df595eeb7aeed4b050c9a3f4640c4320b5f
author: Kristian Nielsen
committer: Kristian Nielsen
timestamp: 2023-07-09 16:45:47 +0200
message:
MDEV-31482: Lock wait timeout with INSERT-SELECT, autoinc, and statement-based replication
Remove the exception that InnoDB does not report auto-increment lock waits
to parallel replication.
There was an assumption that these waits could not cause conflicts with
in-order parallel replication and thus need not be reported. However, this
assumption is wrong and it is possible to get conflicts that lead to hangs
for the duration of --innodb-lock-wait-timeout. This can be seen with three
transactions:
1. T1 is waiting for T3 on an autoinc lock
2. T2 is waiting for T1 to commit
3. T3 is waiting on a normal row lock held by T2
Here, T3 needs to be deadlock killed on the wait by T1.
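To see why the unreported autoinc wait hides the deadlock, here is a toy
standalone sketch (assumed names, nothing from InnoDB): the replication layer
knows that T2 waits for T1 (commit order) and is told about reported row-lock
waits such as T3 waiting for T2, but the cycle only becomes visible once the
autoinc wait T1 -> T3 is reported as well.

// Sketch only: a combined wait-for graph across InnoDB and replication.
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

using Graph= std::map<std::string, std::vector<std::string>>;

// True if `to` is reachable from `from` by following wait-for edges.
static bool reachable(const Graph &g, const std::string &from,
                      const std::string &to, std::set<std::string> &seen)
{
  if (from == to)
    return true;
  if (!seen.insert(from).second)
    return false;
  auto it= g.find(from);
  if (it == g.end())
    return false;
  for (const auto &next : it->second)
    if (reachable(g, next, to, seen))
      return true;
  return false;
}

// True if some successor of `node` can reach `node` again, i.e. a deadlock.
static bool has_cycle_through(const Graph &g, const std::string &node)
{
  std::set<std::string> seen;
  auto it= g.find(node);
  if (it == g.end())
    return false;
  for (const auto &next : it->second)
    if (reachable(g, next, node, seen))
      return true;
  return false;
}

int main()
{
  // T2 -> T1 is the replication commit order; T3 -> T2 is a reported row lock.
  Graph waits{{"T2", {"T1"}}, {"T3", {"T2"}}};

  // Autoinc wait T1 -> T3 not reported: no cycle visible, hang until timeout.
  std::cout << has_cycle_through(waits, "T1") << "\n";   // prints 0

  // With the autoinc wait reported too, the cycle T1->T3->T2->T1 is visible
  // and T3 (latest in commit order) can be deadlock killed and retried.
  waits["T1"].push_back("T3");
  std::cout << has_cycle_through(waits, "T1") << "\n";   // prints 1
}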
Signed-off-by: Kristian Nielsen <knielsen(a)knielsen-hq.org>
---
mysql-test/suite/rpl/r/rpl_parallel_autoinc.result | 95 ++++++++++++++
mysql-test/suite/rpl/t/rpl_parallel_autoinc.test | 140 +++++++++++++++++++++
sql/sql_class.cc | 6 -
storage/innobase/lock/lock0lock.cc | 8 +-
4 files changed, 236 insertions(+), 13 deletions(-)
diff --git a/mysql-test/suite/rpl/r/rpl_parallel_autoinc.result b/mysql-test/suite/rpl/r/rpl_parallel_autoinc.result
new file mode 100644
index 00000000000..c1829bafa1a
--- /dev/null
+++ b/mysql-test/suite/rpl/r/rpl_parallel_autoinc.result
@@ -0,0 +1,95 @@
+include/master-slave.inc
+[connection master]
+MDEV-31482: Lock wait timeout with INSERT-SELECT, autoinc, and statement-based replication
+include/rpl_connect.inc [creating slave2]
+include/rpl_connect.inc [creating slave3]
+connection master;
+ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
+CREATE TABLE t1 (a INT PRIMARY KEY AUTO_INCREMENT, b INT, c INT, INDEX (c)) ENGINE=InnoDB;
+INSERT INTO t1 (b,c) VALUES (0, 1), (0, 1), (0, 2), (0,3), (0, 5), (0, 7), (0, 8);
+CREATE TABLE t2 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
+INSERT INTO t2 VALUES (10,1), (20,2), (30,3), (40,4), (50,5);
+CREATE TABLE t3 (a VARCHAR(20) PRIMARY KEY, b INT) ENGINE=InnoDB;
+INSERT INTO t3 VALUES ('row for T1', 0), ('row for T2', 0), ('row for T3', 0);
+include/save_master_gtid.inc
+connection slave;
+include/sync_with_master_gtid.inc
+include/stop_slave.inc
+set @@global.slave_parallel_threads= 3;
+set @@global.slave_parallel_mode= OPTIMISTIC;
+set @@global.innodb_lock_wait_timeout= 20;
+connection master;
+BEGIN;
+UPDATE t3 SET b=b+1 where a="row for T1";
+INSERT INTO t1(b, c) SELECT 1, t2.b FROM t2 WHERE a=10;
+Warnings:
+Note 1592 Unsafe statement written to the binary log using statement format since BINLOG_FORMAT = STATEMENT. Statements writing to a table with an auto-increment column after selecting from another table are unsafe because the order in which rows are retrieved determines what (if any) rows will be written. This order cannot be predicted and may differ on master and the slave
+COMMIT;
+DELETE FROM t1 WHERE c >= 4 and c < 6;
+BEGIN;
+UPDATE t3 SET b=b+1 where a="row for T3";
+INSERT INTO t1(b, c) SELECT 3, t2.b FROM t2 WHERE a >= 20 AND a <= 40;
+Warnings:
+Note 1592 Unsafe statement written to the binary log using statement format since BINLOG_FORMAT = STATEMENT. Statements writing to a table with an auto-increment column after selecting from another table are unsafe because the order in which rows are retrieved determines what (if any) rows will be written. This order cannot be predicted and may differ on master and the slave
+COMMIT;
+include/save_master_gtid.inc
+connection slave1;
+BEGIN;
+SELECT * FROM t3 WHERE a="row for T1" FOR UPDATE;
+a b
+row for T1 0
+connection slave2;
+BEGIN;
+SELECT * FROM t3 WHERE a="row for T3" FOR UPDATE;
+a b
+row for T3 0
+connection slave3;
+BEGIN;
+DELETE FROM t2 WHERE a=30;
+connection slave;
+include/start_slave.inc
+connection slave2;
+ROLLBACK;
+connection slave1;
+ROLLBACK;
+connection slave3;
+ROLLBACK;
+connection slave;
+include/sync_with_master_gtid.inc
+SELECT * FROM t1 ORDER BY a;
+a b c
+1 0 1
+2 0 1
+3 0 2
+4 0 3
+6 0 7
+7 0 8
+8 1 1
+9 3 2
+10 3 3
+11 3 4
+SELECT * FROM t2 ORDER BY a;
+a b
+10 1
+20 2
+30 3
+40 4
+50 5
+SELECT * FROM t3 ORDER BY a;
+a b
+row for T1 1
+row for T2 0
+row for T3 1
+connection master;
+CALL mtr.add_suppression("Unsafe statement written to the binary log using statement format");
+DROP TABLE t1, t2, t3;
+connection slave;
+include/stop_slave.inc
+SET @@global.slave_parallel_threads= 0;
+SET @@global.slave_parallel_mode= conservative;
+SET @@global.innodb_lock_wait_timeout= 50;
+include/start_slave.inc
+SELECT @@GLOBAL.innodb_autoinc_lock_mode;
+@@GLOBAL.innodb_autoinc_lock_mode
+1
+include/rpl_end.inc
diff --git a/mysql-test/suite/rpl/t/rpl_parallel_autoinc.test b/mysql-test/suite/rpl/t/rpl_parallel_autoinc.test
new file mode 100644
index 00000000000..0e96b4dfb80
--- /dev/null
+++ b/mysql-test/suite/rpl/t/rpl_parallel_autoinc.test
@@ -0,0 +1,140 @@
+--source include/have_binlog_format_statement.inc
+--source include/have_innodb.inc
+--source include/master-slave.inc
+
+--echo MDEV-31482: Lock wait timeout with INSERT-SELECT, autoinc, and statement-based replication
+
+# The scenario is transactions T1, T2, T3:
+#
+# T1 is waiting for T3 on an autoinc lock
+# T2 is waiting for T1 to commit
+# T3 is waiting on a normal row lock held by T2
+#
+# This caused a hang until innodb_lock_wait_timeout, because autoinc
+# locks were not reported to the in-order parallel replication, so T3
+# was not deadlock killed.
+
+--let $lock_wait_timeout=20
+
+--let $rpl_connection_name= slave2
+--let $rpl_server_number= 2
+--source include/rpl_connect.inc
+
+--let $rpl_connection_name= slave3
+--let $rpl_server_number= 2
+--source include/rpl_connect.inc
+
+--connection master
+ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
+
+# A table as destination for INSERT-SELECT
+CREATE TABLE t1 (a INT PRIMARY KEY AUTO_INCREMENT, b INT, c INT, INDEX (c)) ENGINE=InnoDB;
+INSERT INTO t1 (b,c) VALUES (0, 1), (0, 1), (0, 2), (0,3), (0, 5), (0, 7), (0, 8);
+
+# A table as source for INSERT-SELECT.
+CREATE TABLE t2 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
+INSERT INTO t2 VALUES (10,1), (20,2), (30,3), (40,4), (50,5);
+
+# A table to help order slave worker threads to setup the desired scenario.
+CREATE TABLE t3 (a VARCHAR(20) PRIMARY KEY, b INT) ENGINE=InnoDB;
+INSERT INTO t3 VALUES ('row for T1', 0), ('row for T2', 0), ('row for T3', 0);
+--source include/save_master_gtid.inc
+
+--connection slave
+--source include/sync_with_master_gtid.inc
+--source include/stop_slave.inc
+--let $save_innodb_lock_wait_timeout= `SELECT @@global.innodb_lock_wait_timeout`
+--let $save_slave_parallel_threads= `SELECT @@global.slave_parallel_threads`
+--let $save_slave_parallel_mode= `SELECT @@global.slave_parallel_mode`
+set @@global.slave_parallel_threads= 3;
+set @@global.slave_parallel_mode= OPTIMISTIC;
+eval set @@global.innodb_lock_wait_timeout= $lock_wait_timeout;
+
+--connection master
+# Transaction T1.
+BEGIN;
+UPDATE t3 SET b=b+1 where a="row for T1";
+INSERT INTO t1(b, c) SELECT 1, t2.b FROM t2 WHERE a=10;
+COMMIT;
+
+# Transaction T2.
+DELETE FROM t1 WHERE c >= 4 and c < 6;
+
+# Transaction T3.
+BEGIN;
+UPDATE t3 SET b=b+1 where a="row for T3";
+INSERT INTO t1(b, c) SELECT 3, t2.b FROM t2 WHERE a >= 20 AND a <= 40;
+COMMIT;
+
+--source include/save_master_gtid.inc
+
+--connection slave1
+# Temporarily block T1 to create the scheduling that triggers the bug.
+BEGIN;
+SELECT * FROM t3 WHERE a="row for T1" FOR UPDATE;
+
+--connection slave2
+# Temporarily block T3 from starting (so T2 can reach commit).
+BEGIN;
+SELECT * FROM t3 WHERE a="row for T3" FOR UPDATE;
+
+--connection slave3
+# This critical step blocks T3 after it has inserted its first row,
+# and thus taken the auto-increment lock, but before it has reached
+# the point where it gets a row lock wait on T2. Even though
+# auto-increment lock waits were not reported due to the bug,
+# transitive lock waits (T1 waits on autoinc of T3 which waits on row
+# on T2) _were_ reported as T1 waiting on T2, and thus a deadlock kill
+# happened and the bug was not triggered.
+BEGIN;
+DELETE FROM t2 WHERE a=30;
+
+--connection slave
+--source include/start_slave.inc
+
+# First let T2 complete until it is waiting for T1 to commit.
+--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state='Waiting for prior transaction to commit' and command LIKE 'Slave_worker';
+--source include/wait_condition.inc
+
+# Then let T3 reach the point where it has obtained the autoinc lock,
+# but it is not yet waiting for a row lock held by T2.
+--connection slave2
+ROLLBACK;
+--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state='Sending data' and info LIKE 'INSERT INTO t1(b, c) SELECT 3, t2.b%' and time_ms > 500 and command LIKE 'Slave_worker';
+--source include/wait_condition.inc
+
+# Now let T1 continue, while T3 is holding the autoinc lock but before
+# it is waiting for T2. Wait a short while to give the hang a chance to
+# happen; T1 needs to get to request the autoinc lock before we let T3
+# continue. (There's a small chance the sleep will be too small, which will
+# let the test occasionally pass on a non-fixed server).
+--connection slave1
+ROLLBACK;
+--sleep 0.5
+
+# Now let T3 continue; the bug was that this lead to an undetected
+# deadlock that remained until innodb lock wait timeout.
+--connection slave3
+ROLLBACK;
+
+--connection slave
+--let $slave_timeout= `SELECT $lock_wait_timeout/2`
+--source include/sync_with_master_gtid.inc
+--let $slave_timeout=
+SELECT * FROM t1 ORDER BY a;
+SELECT * FROM t2 ORDER BY a;
+SELECT * FROM t3 ORDER BY a;
+
+# Cleanup.
+--connection master
+CALL mtr.add_suppression("Unsafe statement written to the binary log using statement format");
+DROP TABLE t1, t2, t3;
+
+--connection slave
+--source include/stop_slave.inc
+eval SET @@global.slave_parallel_threads= $save_slave_parallel_threads;
+eval SET @@global.slave_parallel_mode= $save_slave_parallel_mode;
+eval SET @@global.innodb_lock_wait_timeout= $save_innodb_lock_wait_timeout;
+--source include/start_slave.inc
+SELECT @@GLOBAL.innodb_autoinc_lock_mode;
+--source include/rpl_end.inc
diff --git a/sql/sql_class.cc b/sql/sql_class.cc
index 73bb654080a..8ed3f8a9c5e 100644
--- a/sql/sql_class.cc
+++ b/sql/sql_class.cc
@@ -5119,12 +5119,6 @@ thd_need_wait_reports(const MYSQL_THD thd)
deadlock with the pre-determined commit order, we kill the later
transaction, and later re-try it, to resolve the deadlock.
- This call need only receive reports about waits for locks that will remain
- until the holding transaction commits. InnoDB auto-increment locks,
- for example, are released earlier, and so need not be reported. (Such false
- positives are not harmful, but could lead to unnecessary kill and retry, so
- best avoided).
-
Returns 1 if the OTHER_THD will be killed to resolve deadlock, 0 if not. The
actual kill will happen later, asynchronously from another thread. The
caller does not need to take any actions on the return value if the
diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc
index 26388ad95e2..b1cf2152cd6 100644
--- a/storage/innobase/lock/lock0lock.cc
+++ b/storage/innobase/lock/lock0lock.cc
@@ -6872,13 +6872,7 @@ DeadlockChecker::search()
return m_start;
}
- /* We do not need to report autoinc locks to the upper
- layer. These locks are released before commit, so they
- can not cause deadlocks with binlog-fixed commit
- order. */
- if (m_report_waiters
- && (lock_get_type_low(lock) != LOCK_TABLE
- || lock_get_mode(lock) != LOCK_AUTO_INC)) {
+ if (m_report_waiters) {
thd_rpl_deadlock_check(m_start->mysql_thd,
lock->trx->mysql_thd);
}
[Commits] 60c1b15: MDEV-31102 Crash when pushing condition into view defined as union
by IgorBabaev 21 Apr '23
revision-id: 60c1b15328165ce215071b4f307fd20bc8a0b545 (mariadb-10.4.28-105-g60c1b15)
parent(s): fc6e8a3d3264078bed28632a289130b1dc24daea
author: Igor Babaev
committer: Igor Babaev
timestamp: 2023-04-21 13:46:14 -0700
message:
MDEV-31102 Crash when pushing condition into view defined as union
This bug could manifest itself at the first execution of a prepared statement
created for queries using a materialized view defined as a union. A crash was
guaranteed if the query contained a condition pushable into the view and this
condition was over a column defined via a complex string expression requiring
implicit conversion from one charset to another for some of its
sub-expressions. The bug could also cause crashes when executing PS for some
other queries whose optimization needed building clones of such expressions.
This bug was introduced in the patch for MDEV-29988 where the class
Item_direct_ref_to_item was added. The implementations of the virtual
methods get_copy() and build_clone() were invalid for the class and this
could cause crashes after the method build_clone() was called for
expressions containing objects of the Item_direct_ref_to_item type.
Approved by Sergei Golubchik <serg(a)mariadb.com>
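
In outline, the fix makes cloning a two-step operation: get_copy() now
duplicates the wrapper object itself instead of bypassing it, and
build_clone() deep-clones the referenced item and re-points a fresh copy of
the wrapper at that clone (see the sql/item.h hunk below). The following is
a minimal sketch of the same pattern on a hypothetical, simplified wrapper
class -- not the server classes.

  /*
    Sketch only. "Node" stands in for Item, "Wrapper" for
    Item_direct_ref_to_item, clone() for build_clone().
  */
  struct Node
  {
    virtual ~Node() {}
    virtual Node *clone() const = 0;
  };

  struct Wrapper : Node
  {
    Node *m_item;                         /* the wrapped expression */
    Wrapper(Node *item) : m_item(item) {}

    Node *clone() const override
    {
      Node *inner= m_item->clone();       /* deep-clone the wrapped item */
      if (!inner)
        return nullptr;                   /* propagate failure, as the patch does */
      Wrapper *copy= new Wrapper(*this);  /* copy the wrapper itself (get_copy()) */
      copy->m_item= inner;                /* re-point the copy at the clone (set_item()) */
      return copy;
    }
  };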
---
mysql-test/main/derived_cond_pushdown.result | 83 ++++++++++++++++++++++++++++
mysql-test/main/derived_cond_pushdown.test | 30 ++++++++++
sql/item.h | 17 +++++-
3 files changed, 128 insertions(+), 2 deletions(-)
diff --git a/mysql-test/main/derived_cond_pushdown.result b/mysql-test/main/derived_cond_pushdown.result
index 41f9ac6..4b202ea 100644
--- a/mysql-test/main/derived_cond_pushdown.result
+++ b/mysql-test/main/derived_cond_pushdown.result
@@ -18275,4 +18275,87 @@ id select_type table type possible_keys key key_len ref rows Extra
3 DERIVED t1 ALL NULL NULL NULL NULL 3 Using temporary
drop view v1;
drop table t1;
+#
+# MDEV-31102: execution of PS for query where pushdown of condition
+# into view defined as union is applied
+#
+create table t1 (
+n int,
+lv varchar(31) charset latin1,
+mv varchar(31) charset utf8mb3
+) engine=myisam;
+insert into t1 values (1,'aa','xxx'), ('2','bb','yyy'), (3,'cc','zzz');
+create view v1 as
+select case when n=1 then lv when n=2 then mv else NULL end as r from t1
+union
+select 'a';
+select * from v1 where r < 'x';
+r
+aa
+a
+explain extended select * from v1 where r < 'x';
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 PRIMARY <derived2> ALL NULL NULL NULL NULL 3 100.00 Using where
+2 DERIVED t1 ALL NULL NULL NULL NULL 3 100.00 Using where
+3 UNION NULL NULL NULL NULL NULL NULL NULL NULL No tables used
+NULL UNION RESULT <union2,3> ALL NULL NULL NULL NULL NULL NULL
+Warnings:
+Note 1003 /* select#1 */ select `v1`.`r` AS `r` from `test`.`v1` where `v1`.`r` < 'x'
+explain format=json select * from v1 where r < 'x';
+EXPLAIN
+{
+ "query_block": {
+ "select_id": 1,
+ "table": {
+ "table_name": "<derived2>",
+ "access_type": "ALL",
+ "rows": 3,
+ "filtered": 100,
+ "attached_condition": "v1.r < 'x'",
+ "materialized": {
+ "query_block": {
+ "union_result": {
+ "table_name": "<union2,3>",
+ "access_type": "ALL",
+ "query_specifications": [
+ {
+ "query_block": {
+ "select_id": 2,
+ "table": {
+ "table_name": "t1",
+ "access_type": "ALL",
+ "rows": 3,
+ "filtered": 100,
+ "attached_condition": "case when t1.n = 1 then convert(t1.lv using utf8) when t1.n = 2 then t1.mv else NULL end < 'x'"
+ }
+ }
+ },
+ {
+ "query_block": {
+ "select_id": 3,
+ "operation": "UNION",
+ "table": {
+ "message": "No tables used"
+ }
+ }
+ }
+ ]
+ }
+ }
+ }
+ }
+ }
+}
+prepare stmt from "select * from v1 where r < 'x'";
+execute stmt;
+r
+aa
+a
+execute stmt;
+r
+aa
+a
+deallocate prepare stmt;
+drop view v1;
+drop table t1;
# End of 10.4 tests
diff --git a/mysql-test/main/derived_cond_pushdown.test b/mysql-test/main/derived_cond_pushdown.test
index 6cfe23b..b4e131d 100644
--- a/mysql-test/main/derived_cond_pushdown.test
+++ b/mysql-test/main/derived_cond_pushdown.test
@@ -3942,4 +3942,34 @@ explain select * from v1;
drop view v1;
drop table t1;
+--echo #
+--echo # MDEV-31102: execution of PS for query where pushdown of condition
+--echo # into view defined as union is applied
+--echo #
+
+create table t1 (
+ n int,
+ lv varchar(31) charset latin1,
+ mv varchar(31) charset utf8mb3
+) engine=myisam;
+insert into t1 values (1,'aa','xxx'), ('2','bb','yyy'), (3,'cc','zzz');
+create view v1 as
+select case when n=1 then lv when n=2 then mv else NULL end as r from t1
+union
+select 'a';
+
+let $q=
+select * from v1 where r < 'x';
+
+eval $q;
+eval explain extended $q;
+eval explain format=json $q;
+eval prepare stmt from "$q";
+execute stmt;
+execute stmt;
+deallocate prepare stmt;
+
+drop view v1;
+drop table t1;
+
--echo # End of 10.4 tests
diff --git a/sql/item.h b/sql/item.h
index 31568aa..1e0caaa 100644
--- a/sql/item.h
+++ b/sql/item.h
@@ -7647,7 +7647,7 @@ class Item_direct_ref_to_item : public Item_direct_ref
Item *get_tmp_table_item(THD *thd)
{ return m_item->get_tmp_table_item(thd); }
Item *get_copy(THD *thd)
- { return m_item->get_copy(thd); }
+ { return get_item_copy<Item_direct_ref_to_item>(thd, this); }
COND *build_equal_items(THD *thd, COND_EQUAL *inherited,
bool link_item_fields,
COND_EQUAL **cond_equal_ref)
@@ -7715,7 +7715,20 @@ class Item_direct_ref_to_item : public Item_direct_ref
bool excl_dep_on_grouping_fields(st_select_lex *sel)
{ return m_item->excl_dep_on_grouping_fields(sel); }
bool is_expensive() { return m_item->is_expensive(); }
- Item* build_clone(THD *thd) { return get_copy(thd); }
+ void set_item(Item *item) { m_item= item; }
+ Item *build_clone(THD *thd)
+ {
+ Item *clone_item= m_item->build_clone(thd);
+ if (clone_item)
+ {
+ Item_direct_ref_to_item *copy= (Item_direct_ref_to_item *) get_copy(thd);
+ if (!copy)
+ return 0;
+ copy->set_item(clone_item);
+ return copy;
+ }
+ return 0;
+ }
void split_sum_func(THD *thd, Ref_ptr_array ref_pointer_array,
List<Item> &fields, uint flags)